biblicit 2.0.3 → 2.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,1427 +0,0 @@
1
- #!/usr/bin/perl -wT
2
-
3
- # Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
4
- # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
5
-
6
- require 5.0;
7
- use strict;
8
- use Getopt::Long;
9
- use HTML::Entities;
10
-
11
- # I do not know a better solution to find a lib path in -T mode.
12
- # So if you know a better solution, I'd be glad to hear.
13
- # See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
14
- use FindBin;
15
- FindBin::again(); # to get correct path in case 2 scripts in different directories use FindBin
16
-
17
- my $path;
18
- BEGIN
19
- {
20
- if ($FindBin::Bin =~ /(.*)/)
21
- {
22
- $path = $1;
23
- }
24
- }
25
- use lib "$path/../../lib";
26
-
27
- use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
28
- use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";
29
-
30
- use SectLabel::PreProcess;
31
-
32
- # USER customizable section
33
- $0 =~ /([^\/]+)$/; my $progname = $1;
34
- my $outputVersion = "1.0";
35
- # END user customizable section
36
-
37
- sub License
38
- {
39
- print STDERR "# Copyright 2009 \251 by Luong Minh Thang\n";
40
- }
41
-
42
- # HELP sub-procedure
43
- sub Help
44
- {
45
- print STDERR "Process Omnipage XML output (concatenated results fromm all pages of a PDF file), and extract text lines together with other XML infos\n";
46
- print STDERR "usage: $progname -h\t[invokes help]\n";
47
- print STDERR " $progname -in xmlFile -out outFile [-xmlFeature -decode -markup -para] [-tag tagFile -allowEmptyLine -log]\n";
48
- print STDERR "Options:\n";
49
- print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
50
- print STDERR "\t-xmlFeature: append XML feature together with text extracted\n";
51
- print STDERR "\t-decode: decode HTML entities and then output, to avoid double entity encoding later\n";
52
- print STDERR "\t-para: marking in the output each paragraph with # Para lineId numLines\n";
53
- print STDERR "\t-markup: marking in the output detailed word-level info ### Page w h\\n## Para l t r b\\n# Line l t r b\\nword l t r b\n";
54
- print STDERR "\t-tag tagFile: count XML tags/values for statistics purpose\n";
55
- }
56
-
57
- my $QUIET = 0;
58
- my $HELP = 0;
59
- my $outFile = undef;
60
- my $inFile = undef;
61
-
62
- my $isXmlFeature = 0;
63
- my $isDecode = 0;
64
-
65
- my $isMarkup = 0;
66
- my $isParaDelimiter = 0;
67
-
68
- my $tagFile = "";
69
- my $isAllowEmpty = 0;
70
- my $isDebug = 1;
71
-
72
- $HELP = 1 unless GetOptions('in=s' => \$inFile,
73
- 'out=s' => \$outFile,
74
- 'decode' => \$isDecode,
75
- 'xmlFeature' => \$isXmlFeature,
76
-
77
- 'tag=s' => \$tagFile,
78
- 'allowEmptyLine' => \$isAllowEmpty,
79
- 'markup' => \$isMarkup,
80
-
81
- 'para' => \$isParaDelimiter,
82
- 'log' => \$isDebug,
83
- 'h' => \$HELP,
84
- 'q' => \$QUIET);
85
-
86
- if ($HELP || !defined $inFile || !defined $outFile)
87
- {
88
- Help();
89
- exit(0);
90
- }
91
-
92
- if (!$QUIET)
93
- {
94
- License();
95
- }
96
-
97
- # Untaint
98
- $inFile = untaintPath($inFile);
99
- $outFile = untaintPath($outFile);
100
- $tagFile = untaintPath($tagFile);
101
- $ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
102
- # End untaint
103
-
104
- # Mark page, para, line, word
105
- my %gPageHash = ();
106
-
107
- # Mark paragraph
108
- my @gPara = ();
109
-
110
- # XML features
111
- # locFeature
112
- my @gPosHash = ();
113
- my $gMinPos = 1000000;
114
- my $gMaxPos = 0;
115
-
116
- # alignFeature
117
- my @gAlign = ();
118
-
119
- # boldFeature
120
- my @gBold = ();
121
-
122
- # italicFeature
123
- my @gItalic = ();
124
-
125
- # fontSizeFeature
126
- my %gFontSizeHash = ();
127
- my @gFontSize = ();
128
-
129
- # fontFaceFeature
130
- my %gFontFaceHash = ();
131
- my @gFontFace = ();
132
-
133
- # picFeature
134
- my @gPic = ();
135
-
136
- # tableFeature
137
- my @gTable = ();
138
-
139
- # bulletFeature
140
- my @gBullet = ();
141
-
142
- # spaceFeature
143
- #my %gSpaceHash = ();
144
- #my @gSpace = ();
145
- # End XML features
146
-
147
- my %tags = ();
148
-
149
- if ($isDebug)
150
- {
151
- print STDERR "\n# Processing file $inFile & output to $outFile\n";
152
- }
153
-
154
- my $markupOutput = "";
155
- my $allText = processFile($inFile, $outFile, \%tags);
156
-
157
- # Find header part
158
- my @lines = split(/\n/, $allText);
159
- my $numLines = scalar(@lines);
160
- my ($headerLength, $bodyLength, $bodyStartId) = SectLabel::PreProcess::FindHeaderText(\@lines, 0, $numLines);
161
-
162
- # Output
163
- if ($isMarkup)
164
- {
165
- open(OF, ">:utf8", "$outFile") || die"#Can't open file \"$outFile\"\n";
166
- print OF "$markupOutput";
167
- close OF;
168
- }
169
- else
170
- {
171
- output(\@lines, $outFile);
172
- }
173
-
174
- if ($tagFile ne "")
175
- {
176
- printTagInfo(\%tags, $tagFile);
177
- }
178
-
179
- sub processFile
180
- {
181
- my ($inFile, $tags) = @_;
182
-
183
- if (!(-e $inFile)) { die "# $progname crash\t\tFile \"$inFile\" doesn't exist"; }
184
- open (IF, "<:utf8", $inFile) || die "# $progname crash\t\tCan't open \"$inFile\"";
185
-
186
- my $isPic = 0;
187
- my $isPara = 0;
188
- my $isTable = 0;
189
- my $isSpace = 0;
190
-
191
- my $text = undef;
192
- my $allText = undef;
193
-
194
- my $lineId = 0;
195
-
196
- # Each line contains a header
197
- while (<IF>)
198
- {
199
- if (/^\#/) { next; } # skip comments
200
-
201
- chomp;
202
- s/\cM$//; # remove ^M character at the end of the file if any
203
-
204
- my $line = $_;
205
- if($tagFile ne "")
206
- {
207
- processTagInfo($line, $tags);
208
- }
209
-
210
- # if ($line =~ /<\?xml version.+>/) { } ### Xml ###
211
- # if ($line =~ /^<\/column>$/) { } ### Column ###
212
- if ($isMarkup && $line =~ /<theoreticalPage (.*)\/>/ && $isMarkup)
213
- {
214
- $markupOutput .= "### Page $1\n";
215
- }
216
-
217
- ### pic ###
218
- if ($line =~ /^<dd (.*)>$/)
219
- {
220
- $isPic = 1;
221
- if($isMarkup)
222
- {
223
- $markupOutput .= "### Dd $1\n";
224
- }
225
- }
226
- elsif ($line =~ /^<\/dd>$/)
227
- {
228
- $isPic = 0;
229
- }
230
-
231
- ### Table ###
232
- if ($line =~ /^<table .*>$/)
233
- {
234
- $text .= $line."\n"; # we need the header
235
- $isTable = 1;
236
- }
237
- elsif ($line =~ /^<\/table>$/)
238
- {
239
- my $tableText = processTable($text, $isPic);
240
- $allText .= $tableText;
241
-
242
- my @tmpLines = split(/\n/, $tableText);
243
- $lineId += scalar(@tmpLines);
244
-
245
- $isTable = 0;
246
- $text = "";
247
- }
248
- elsif($isTable)
249
- {
250
- $text .= $line."\n";
251
- next;
252
- }
253
- ### Paragraph ###
254
- # Note: table processing should have higher priority than paragraph, i.e. the priority does matter
255
- elsif ($line =~ /^<para (.*)>$/)
256
- {
257
- $text .= $line."\n"; # we need the header
258
- $isPara = 1;
259
-
260
- if($isMarkup)
261
- {
262
- $markupOutput .= "## Para $1\n";
263
- }
264
- }
265
- elsif ($line =~ /^<\/para>$/)
266
- {
267
- my ($paraText, $l, $t, $r, $b);
268
- ($paraText, $l, $t, $r, $b, $isSpace) = processPara($text, 0, $isPic);
269
- $allText .= $paraText;
270
-
271
- my @tmpLines = split(/\n/, $paraText);
272
- $lineId += scalar(@tmpLines);
273
- $isPara = 0;
274
- $text = "";
275
- }
276
- elsif($isPara)
277
- {
278
- $text .= $line."\n";
279
- next;
280
- }
281
- }
282
-
283
- close IF;
284
- return $allText;
285
- }
286
-
287
- sub output
288
- {
289
- my ($lines, $outFile) = @_;
290
-
291
- open(OF, ">:utf8", "$outFile") || die"#Can't open file \"$outFile\"\n";
292
-
293
- ####### Final output ############
294
- # xml feature label
295
- my %gFontSizeLabels = ();
296
- # my %gSpaceLabels = (); # yes, no
297
-
298
- if($isXmlFeature)
299
- {
300
- getFontSizeLabels(\%gFontSizeHash, \%gFontSizeLabels);
301
- # getSpaceLabels(\%gSpaceHash, \%gSpaceLabels);
302
- }
303
-
304
- my $id = -1;
305
- my $output = "";
306
- my $paraLineId = -1;
307
- my $paraLineCount = 0;
308
- foreach my $line (@{$lines})
309
- {
310
- $id++;
311
-
312
- $line =~ s/\cM$//; # remove ^M character at the end of each line if any
313
-
314
- # empty lines
315
- if ($line =~ /^\s*$/)
316
- {
317
- if (!$isAllowEmpty)
318
- {
319
- next;
320
- }
321
- else
322
- {
323
- if ($isDebug)
324
- {
325
- print STDERR "#! Line $id empty!\n";
326
- }
327
- }
328
- }
329
-
330
- if ($gPara[$id] eq "yes")
331
- {
332
- # mark para
333
- if ($output ne "")
334
- {
335
- if ($isParaDelimiter)
336
- {
337
- print OF "# Para $paraLineId $paraLineCount\n$output";
338
- $paraLineCount = 0;
339
- }
340
- else
341
- {
342
- if($isDecode)
343
- {
344
- $output = decode_entities($output);
345
- }
346
- print OF $output;
347
- }
348
-
349
- $output = "";
350
- }
351
- $paraLineId = $id;
352
- }
353
-
354
- $output .= $line;
355
- $paraLineCount++;
356
-
357
- ## Output XML features
358
- if ($isXmlFeature)
359
- {
360
- # loc feature
361
- my $locFeature;
362
-
363
- if($gPosHash[$id] != -1)
364
- {
365
- $locFeature = "xmlLoc_".int(($gPosHash[$id] - $gMinPos) * 8.0 / ($gMaxPos - $gMinPos + 1));
366
- }
367
-
368
- # align feature
369
- my $alignFeature = "xmlAlign_".$gAlign[$id];
370
-
371
- # fontSize feature
372
- my $fontSizeFeature;
373
- if($gFontSize[$id] == -1)
374
- {
375
- $fontSizeFeature = "xmlFontSize_none";
376
- }
377
- else
378
- {
379
- $fontSizeFeature = "xmlFontSize_".$gFontSizeLabels{$gFontSize[$id]};
380
- }
381
-
382
- my $boldFeature = "xmlBold_".$gBold[$id]; # bold feature
383
- my $italicFeature = "xmlItalic_".$gItalic[$id]; # italic feature
384
- my $picFeature = "xmlPic_".$gPic[$id]; # pic feature
385
- my $tableFeature = "xmlTable_".$gTable[$id]; # table feature
386
- my $bulletFeature = "xmlBullet_".$gBullet[$id]; # bullet feature
387
-
388
- # space feature
389
- #my $spaceFeature;
390
- #if($gSpace[$id] eq "none")
391
- #{
392
- # $spaceFeature = "xmlSpace_none";
393
- #}
394
- #else
395
- #{
396
- # $spaceFeature = "xmlSpace_".$gSpaceLabels{$gSpace[$id]};
397
- #}
398
-
399
- # differential features
400
- my ($alignDiff, $fontSizeDiff, $fontFaceDiff, $fontSFDiff, $fontSFBIDiff, $fontSFBIADiff, $paraDiff) = getDifferentialFeatures($id);
401
-
402
- $output .= " |XML| $locFeature $boldFeature $italicFeature $fontSizeFeature $picFeature $tableFeature $bulletFeature $fontSFBIADiff $paraDiff\n";
403
- # $alignFeature $alignDiff $fontSizeDiff $fontFaceDiff $fontSFDiff $fontSFBIDiff
404
- }
405
- else
406
- {
407
- $output .= "\n";
408
- }
409
- }
410
-
411
- # mark para
412
- if ($output ne "")
413
- {
414
- if ($isParaDelimiter)
415
- {
416
- print OF "# Para $paraLineId $paraLineCount\n$output";
417
- $paraLineCount = 0;
418
- }
419
- else
420
- {
421
- if ($isDecode)
422
- {
423
- $output = decode_entities($output);
424
- }
425
-
426
- print OF $output;
427
- }
428
- $output = ""
429
- }
430
- close OF;
431
- }
432
-
433
- sub getDifferentialFeatures
434
- {
435
- my ($id) = @_;
436
-
437
- # alignChange feature
438
- my $alignDiff = "bi_xmlA_";
439
- if ($id == 0)
440
- {
441
- $alignDiff .= $gAlign[$id];
442
- }
443
- elsif ($gAlign[$id] eq $gAlign[$id-1])
444
- {
445
- $alignDiff .= "continue";
446
- }
447
- else
448
- {
449
- $alignDiff .= $gAlign[$id];
450
- }
451
-
452
- # fontFaceChange feature
453
- my $fontFaceDiff = "bi_xmlF_";
454
- if ($id == 0)
455
- {
456
- $fontFaceDiff .= "new";
457
- }
458
- elsif ($gFontFace[$id] eq $gFontFace[$id-1])
459
- {
460
- $fontFaceDiff .= "continue";
461
- }
462
- else
463
- {
464
- $fontFaceDiff .= "new";
465
- }
466
-
467
- # fontSizeChange feature
468
- my $fontSizeDiff = "bi_xmlS_";
469
- if ($id == 0)
470
- {
471
- $fontSizeDiff .= "new";
472
- }
473
- elsif ($gFontSize[$id] == $gFontSize[$id-1])
474
- {
475
- $fontSizeDiff .= "continue";
476
- }
477
- else
478
- {
479
- $fontSizeDiff .= "new";
480
- }
481
-
482
- # fontSFChange feature
483
- my $fontSFDiff = "bi_xmlSF_";
484
- if($id == 0){
485
- $fontSFDiff .= "new";
486
- } elsif($gFontSize[$id] == $gFontSize[$id-1] && $gFontFace[$id] eq $gFontFace[$id-1]){
487
- $fontSFDiff .= "continue";
488
- } else {
489
- $fontSFDiff .= "new";
490
- }
491
-
492
- # fontSFBIChange feature
493
- my $fontSFBIDiff = "bi_xmlSFBI_";
494
- if($id == 0){
495
- $fontSFBIDiff .= "new";
496
- } elsif($gFontSize[$id] == $gFontSize[$id-1] && $gFontFace[$id] eq $gFontFace[$id-1] && $gBold[$id] eq $gBold[$id-1] && $gItalic[$id] eq $gItalic[$id-1]){
497
- $fontSFBIDiff .= "continue";
498
- } else {
499
- $fontSFBIDiff .= "new";
500
- }
501
-
502
- # fontSFBIAChange feature
503
- my $fontSFBIADiff = "bi_xmlSFBIA_";
504
- if($id == 0){
505
- $fontSFBIADiff .= "new";
506
- } elsif($gFontSize[$id] == $gFontSize[$id-1] && $gFontFace[$id] eq $gFontFace[$id-1] && $gBold[$id] eq $gBold[$id-1] && $gItalic[$id] eq $gItalic[$id-1] && $gAlign[$id] eq $gAlign[$id-1]){
507
- $fontSFBIADiff .= "continue";
508
- } else {
509
- $fontSFBIADiff .= "new";
510
- }
511
-
512
- # para change feature
513
- my $paraDiff = "bi_xmlPara_";
514
- if($id < $bodyStartId){ # header part, consider each line as a separate paragraph
515
- $paraDiff .= "header";
516
- } else {
517
- if($gPara[$id] eq "yes"){
518
- $paraDiff .= "new";
519
- } else {
520
- $paraDiff .= "continue";
521
- }
522
- }
523
-
524
- return ($alignDiff, $fontSizeDiff, $fontFaceDiff, $fontSFDiff, $fontSFBIDiff, $fontSFBIADiff, $paraDiff);
525
- }
526
-
527
- sub getFontSizeLabels {
528
- my ($gFontSizeHash, $gFontSizeLabels) = @_;
529
-
530
- if($isDebug){ print STDERR "# Map fonts\n"; }
531
- my @sortedFonts = sort { $gFontSizeHash->{$b} <=> $gFontSizeHash->{$a} } keys %{$gFontSizeHash}; # sort by values, obtain keys
532
-
533
- my $commonSize = $sortedFonts[0];
534
- @sortedFonts = sort { $a <=> $b } keys %{$gFontSizeHash}; # sort by keys, obtain keys
535
- my $commonIndex = 0; # index of common font size
536
- foreach(@sortedFonts){
537
- if($commonSize == $_) { # found
538
- last;
539
- }
540
- $commonIndex++;
541
- }
542
-
543
- # small fonts
544
- for(my $i = 0; $i<$commonIndex; $i++){ # smallIndex $largeIndex
545
- $gFontSizeLabels->{$sortedFonts[$i]} = "smaller";
546
-
547
- if($isDebug){
548
- print STDERR "$sortedFonts[$i] --> $gFontSizeLabels->{$sortedFonts[$i]}, freq = $gFontSizeHash->{$sortedFonts[$i]}\n";
549
- }
550
- }
551
-
552
- # common fonts
553
- $gFontSizeLabels->{$commonSize} = "common";
554
- if($isDebug){
555
- print STDERR "$sortedFonts[$commonIndex] --> $gFontSizeLabels->{$sortedFonts[$commonIndex]}, freq = $gFontSizeHash->{$sortedFonts[$commonIndex]}\n";
556
- }
557
-
558
- # large fonts
559
- for(my $i = ($commonIndex+1); $i<scalar(@sortedFonts); $i++){ # ($largeIndex+1) (scalar(@sortedFonts)-1)
560
- if((scalar(@sortedFonts)-$i) <= 3){
561
- $gFontSizeLabels->{$sortedFonts[$i]} = "largest".($i+1-scalar(@sortedFonts));
562
- } else {
563
- $gFontSizeLabels->{$sortedFonts[$i]} = "larger";
564
- }
565
-
566
- if($isDebug){
567
- print STDERR "$sortedFonts[$i] --> $gFontSizeLabels->{$sortedFonts[$i]}, freq = $gFontSizeHash->{$sortedFonts[$i]}\n";
568
- }
569
- }
570
- }
571
-
572
- sub getSpaceLabels {
573
- my ($gSpaceHash, $gSpaceLabels) = @_;
574
-
575
- if($isDebug){
576
- print STDERR "\n# Map space\n";
577
- }
578
- my @sortedSpaces = sort { $gSpaceHash->{$b} <=> $gSpaceHash->{$a} } keys %{$gSpaceHash}; # sort by freqs, obtain space faces
579
-
580
- my $commonSpace = $sortedSpaces[0];
581
- my $commonFreq = $gSpaceHash->{$commonSpace};
582
- # find similar common freq with larger spaces
583
- for(my $i = 0; $i<scalar(@sortedSpaces); $i++){ # 0 ($smallIndex-1)
584
- my $freq = $gSpaceHash->{$sortedSpaces[$i]};
585
- if($freq/$commonFreq > 0.8){
586
- if($sortedSpaces[$i] > $commonSpace){
587
- $commonSpace = $sortedSpaces[$i];
588
- }
589
- } else {
590
- last;
591
- }
592
- }
593
-
594
- for(my $i = 0; $i<scalar(@sortedSpaces); $i++){ # 0 ($smallIndex-1)
595
- if($sortedSpaces[$i] > $commonSpace){
596
- $gSpaceLabels->{$sortedSpaces[$i]} = "yes";
597
- } else {
598
- $gSpaceLabels->{$sortedSpaces[$i]} = "no";
599
- }
600
-
601
- if($isDebug){
602
- print STDERR "$sortedSpaces[$i] --> $gSpaceLabels->{$sortedSpaces[$i]}, freq = $gSpaceHash->{$sortedSpaces[$i]}\n";
603
- }
604
- }
605
- }
606
-
607
- sub processTable
608
- {
609
- my ($inputText, $isPic) = @_;
610
-
611
- my $isCell = 0; # for table cell object
612
-
613
- my $allText = "";
614
- my $text = "";
615
-
616
- my @lines = split(/\n/, $inputText);
617
- my %tablePos = (); # $tablePos{$cellText} = "$l-$t-$r-$bottom"
618
- my %table = (); # $table{$row}->{$col} = \@paraTexts
619
- my $rowFrom;
620
- my $colFrom;
621
- my $rowTill;
622
- my $colTill;
623
-
624
- # xml feature
625
- my $align = "none";
626
- my $pos = -1;
627
- foreach my $line (@lines)
628
- {
629
- if ($line =~ /^<table (.+?)>$/)
630
- {
631
- my $attr = $1;
632
-
633
- if($isMarkup) { $markupOutput .= "### Table $attr\n"; }
634
-
635
- # Fix: wrong regex sequence, huydhn
636
- #if ($attr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*alignment=\"(.+?)\".*$/)
637
- #{
638
- # my ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
639
- # $align = $5;
640
-
641
- # # pos feature
642
- # $pos = ($t+$bottom)/2.0;
643
-
644
- # if($pos < $gMinPos) { $gMinPos = $pos; }
645
- # if($pos > $gMaxPos) { $gMaxPos = $pos; }
646
- #}
647
- #else
648
- #{
649
- # print STDERR "# no table alignment or location \"$line\"\n";
650
- # $align = "";
651
- #}
652
-
653
- my ($l, $t, $r, $bottom) = undef;
654
- if ($attr =~ /^.*l=\"(\d+)\".*$/) { $l = $1; }
655
- if ($attr =~ /^.*t=\"(\d+)\".*$/) { $t = $1; }
656
- if ($attr =~ /^.*r=\"(\d+)\".*$/) { $r = $1; }
657
- if ($attr =~ /^.*b=\"(\d+)\".*$/) { $bottom = $1; }
658
-
659
- if ($t && $bottom)
660
- {
661
- # pos feature
662
- $pos = ($t + $bottom) / 2.0;
663
-
664
- if($pos < $gMinPos) { $gMinPos = $pos; }
665
- if($pos > $gMaxPos) { $gMaxPos = $pos; }
666
- }
667
- else
668
- {
669
- die "# Undefined table location \"$line\"\n";
670
- }
671
-
672
- if ($attr =~ /^.*alignment=\"(\d+)\".*$/)
673
- {
674
- $align = $1;
675
- }
676
- else
677
- {
678
- print STDERR "# no table alignment \"$line\"\n";
679
- $align = "";
680
- }
681
- # End.
682
- }
683
- elsif ($line =~ /^<cell .*gridColFrom=\"(\d+)\" gridColTill=\"(\d+)\" gridRowFrom=\"(\d+)\" gridRowTill=\"(\d+)\".*>$/) # new cell
684
- {
685
- $colFrom = $1;
686
- $colTill = $2;
687
- $rowFrom = $3;
688
- $rowTill = $4;
689
- #print STDERR "$rowFrom $rowTill $colFrom $colTill\n";
690
- $isCell = 1;
691
- }
692
- elsif ($line =~ /^<\/cell>$/) # end cell
693
- {
694
- my @paraTexts = ();
695
- processCell($text, \@paraTexts, \%tablePos, $isPic);
696
-
697
- for(my $i = $rowFrom; $i<=$rowTill; $i++)
698
- {
699
- for(my $j = $colFrom; $j<=$colTill; $j++)
700
- {
701
- if(!$table{$i}) { $table{$i} = (); }
702
- if(!$table{$i}->{$j}) { $table{$i}->{$j} = (); }
703
-
704
- if($i == $rowFrom && $j == $colFrom)
705
- {
706
- push(@{$table{$i}->{$j}}, @paraTexts);
707
- if(scalar(@paraTexts) > 1) { last; }
708
- }
709
- else
710
- {
711
- push(@{$table{$i}->{$j}}, ""); #add stub "" for spanning rows or cols
712
- }
713
- }
714
- }
715
-
716
- $isCell = 0;
717
- $text = "";
718
- }
719
- elsif($isCell)
720
- {
721
- $text .= $line."\n";
722
- next;
723
- }
724
- }
725
-
726
- # note: such a complicated code is because in the normal node, Omnipage doesn't seem to strictly print column by column given a row is fixed.
727
- # E.g if col1: paraText1, col2: paraText21\n$paraText22, and col3: paraText31\n$paraText32
728
- # It will print paraText1\tparaText21\tparaText31\n\t$paraText22\t$paraText32
729
- my @sortedRows = sort {$a <=> $b} keys %table;
730
- my $isFirstLinePara = 1;
731
- foreach my $row (@sortedRows)
732
- {
733
- my %tableR = %{$table{$row}};
734
- my @sortedCols = sort {$a <=> $b} keys %tableR;
735
- while(1)
736
- {
737
- my $isStop = 1;
738
- my $rowText = "";
739
-
740
- foreach my $col (@sortedCols)
741
- {
742
- # there's still some thing to process
743
- if(scalar(@{$tableR{$col}}) > 0)
744
- {
745
- $isStop = 0;
746
- $rowText .= shift(@{$tableR{$col}});
747
- }
748
- $rowText .= "\t";
749
- }
750
-
751
- if ((!$isAllowEmpty && $rowText =~ /^\s*$/) || ($isAllowEmpty && $rowText eq ""))
752
- {
753
- $isStop = 1;
754
- }
755
-
756
- if($isStop)
757
- {
758
- last;
759
- }
760
- else
761
- {
762
- $rowText =~ s/\t$/\n/;
763
- $allText .= $rowText;
764
- # print STDERR "$rowText";
765
-
766
- # para
767
- if($isFirstLinePara)
768
- {
769
- push(@gPara, "yes");
770
- $isFirstLinePara = 0;
771
- }
772
- else
773
- {
774
- push(@gPara, "no");
775
- }
776
-
777
- if($isXmlFeature)
778
- {
779
- # table feature
780
- push(@gTable, "yes");
781
-
782
- # pic feature
783
- if($isPic)
784
- {
785
- push(@gPic, "yes");
786
- }
787
- else
788
- {
789
- push(@gPic, "no");
790
- }
791
-
792
- push(@gPosHash, $pos); # update xml pos value
793
- push(@gAlign, $align); # update xml alignment value
794
-
795
- ### Not assign value ###
796
- push(@gFontSize, -1); # fontSize feature
797
- push(@gFontFace, "none"); # fontFace feature
798
- push(@gBold, "no"); # bold feature
799
- push(@gItalic, "no"); # italic feature
800
- push(@gBullet, "no"); # bullet feature
801
- # push(@gSpace, "none"); # space feature
802
- } # end if xml feature
803
- }
804
- }
805
- }
806
-
807
- return $allText;
808
- }
809
-
810
- sub processCell
811
- {
812
- my ($inputText, $paraTexts, $tablePos, $isPic) = @_;
813
-
814
- my $text = "";
815
- my @lines = split(/\n/, $inputText);
816
- my $isPara = 0;
817
- my $flag = 0;
818
- foreach my $line (@lines)
819
- {
820
- if ($line =~ /^<para (.*)>$/)
821
- {
822
- $text .= $line."\n"; # we need the header
823
- $isPara = 1;
824
-
825
- if($isMarkup)
826
- {
827
- $markupOutput .= "## ParaTable $1\n";
828
- }
829
- }
830
- elsif ($line =~ /^<\/para>$/)
831
- {
832
- my ($paraText, $l, $t, $r, $b) = processPara($text, 1, $isPic);
833
- my @tokens = split(/\n/, $paraText);
834
-
835
- foreach my $token (@tokens)
836
- {
837
- if($token ne "")
838
- {
839
- push(@{$paraTexts}, $token);
840
- $flag = 1;
841
- }
842
- }
843
-
844
- if(!$tablePos->{$paraText})
845
- {
846
- $tablePos->{$paraText} = "$l-$t-$r-$b";
847
- }
848
- else
849
- {
850
- #print STDERR "#! Warning: in method processCell, encounter the same paraText $paraText\n";
851
- }
852
-
853
- $isPara = 0;
854
- $text = "";
855
- }
856
- elsif ($isPara)
857
- {
858
- $text .= $line."\n";
859
- next;
860
- }
861
- }
862
-
863
- # at least one value should be added for cell which is ""
864
- if($flag == 0)
865
- {
866
- push(@{$paraTexts}, "");
867
- }
868
- }
869
-
870
- sub getAttrValue {
871
- my ($attrText, $attr) = @_;
872
-
873
- my $value = "none";
874
- if($attrText =~ /^.*$attr=\"(.+?)\".*$/){
875
- $value = $1;
876
- }
877
-
878
- return $value;
879
- }
880
-
881
- sub checkFontAttr {
882
- my ($attrText, $attr, $attrHash, $count) = @_;
883
-
884
- if($attrText =~ /^.*$attr=\"(.+?)\".*$/){
885
- my $attrValue = $1;
886
-
887
- $attrHash->{$attrValue} = $attrHash->{$attrValue} ? ($attrHash->{$attrValue}+$count) : $count;
888
- }
889
- }
890
-
891
- sub processPara
892
- {
893
- my ($inputText, $isCell, $isPic) = @_;
894
-
895
- my $isSpace = 0;
896
- my $isSpecialSpace = 0;
897
- my $isTab = 0;
898
- my $isBullet = 0;
899
-
900
- my $isForcedEOF = "none"; # 3 signals for end of L: forcedEOF=\"true\" in attribute of <ln> or || <nl orig=\"true\"\/> || end of </para> without encountering any of the above signal in the para plus $isSpace = 0
901
- # xml feature
902
- my $align = "none";
903
- my ($l, $t, $r, $bottom);
904
- my %fontSizeHash = ();
905
- my %fontFaceHash = ();
906
- my @boldArray = ();
907
- my @italicArray = ();
908
- my $space = "none";
909
-
910
- my $lnAttr; my $isLn = 0; my $lnBold = "none"; my $lnItalic = "none";
911
- my $runAttr; my $runText = ""; my $isRun = 0; my $runBold = "none"; my $runItalic = "none";
912
- my $wdAttr; my $wdText = ""; my $isWd = 0;
913
-
914
- my $wdIndex = 0; # word index in a line. When encountering </ln>, this parameter indicates the number of words in a line
915
- my $lnBoldCount = 0;
916
- my $lnItalicCount = 0;
917
-
918
- my $allText = "";
919
- my $text = ""; #invariant: when never enter a new line, $text will be copied into $allText, and $text is cleared
920
-
921
- binmode(STDERR, ":utf8");
922
-
923
- my $isFirstLinePara = 1;
924
- my @lines = split(/\n/, $inputText);
925
- for (my $i=0; $i<scalar(@lines); $i++)
926
- {
927
- my $line = $lines[$i];
928
-
929
- ## new para
930
- if ($line =~ /^<para (.+?)>$/)
931
- {
932
- my $attr = $1;
933
- $align = getAttrValue($attr, "alignment");
934
- # $indent = getAttrValue($attr, "li");
935
- $space = getAttrValue($attr, "spaceBefore");
936
- }
937
- ## new ln
938
- elsif ($line =~ /^<ln (.+)>$/)
939
- {
940
- $lnAttr = $1;
941
- $isLn = 1;
942
-
943
- if ($isMarkup)
944
- {
945
- $markupOutput .= "# Line $lnAttr\n";
946
- }
947
-
948
- # Fix: wrong regex sequence, huydhn
949
- #if ($lnAttr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*$/)
950
- #{
951
- # ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
952
- #}
953
-
954
- if ($lnAttr =~ /^.*l=\"(\d+)\".*$/) { $l = $1; } else { $l = undef; }
955
- if ($lnAttr =~ /^.*t=\"(\d+)\".*$/) { $t = $1; } else { $t = undef; }
956
- if ($lnAttr =~ /^.*r=\"(\d+)\".*$/) { $r = $1; } else { $r = undef; }
957
- if ($lnAttr =~ /^.*b=\"(\d+)\".*$/) { $bottom = $1; } else { $bottom = undef; }
958
- # End.
959
-
960
- $isForcedEOF = getAttrValue($lnAttr, "forcedEOF");
961
-
962
- # Bold & Italic
963
- if($isXmlFeature)
964
- {
965
- $lnBold = getAttrValue($lnAttr, "bold");
966
- $lnItalic = getAttrValue($lnAttr, "italic");
967
- }
968
- }
969
- ## new run
970
- elsif ($line =~ /<run (.*)>$/)
971
- {
972
- $runAttr = $1;
973
-
974
- $isSpace = 0;
975
- $isTab = 0;
976
- $isRun = 1;
977
-
978
- # new wd, that consists of many runs
979
- if($line =~ /^<wd (.*?)>/)
980
- {
981
- $isWd = 1;
982
- $wdAttr = $1;
983
- }
984
-
985
- # Bold & Italic
986
- if ($isXmlFeature)
987
- {
988
- $runBold = getAttrValue($runAttr, "bold");
989
- $runItalic = getAttrValue($runAttr, "italic");
990
- }
991
- }
992
- ## wd
993
- elsif ($line =~ /^<wd (.+)?>(.+)<\/wd>$/)
994
- {
995
- $wdAttr = $1;
996
- my $word = $2;
997
- $isSpace = 0;
998
- $isTab = 0;
999
-
1000
- if ($isMarkup)
1001
- {
1002
- $markupOutput .= "$word $wdAttr\n";
1003
- }
1004
-
1005
- # FontSize & FontFace
1006
- if($isXmlFeature)
1007
- {
1008
- checkFontAttr($wdAttr, "fontSize", \%fontSizeHash, 1);
1009
- checkFontAttr($wdAttr, "fontFace", \%fontFaceHash, 1);
1010
- }
1011
-
1012
- # Bold & Italic
1013
- if($isXmlFeature)
1014
- {
1015
- my $wdBold = getAttrValue($wdAttr, "bold");
1016
- my $wdItalic = getAttrValue($wdAttr, "italic");
1017
-
1018
- if($wdBold eq "true" || $runBold eq "true" || $lnBold eq "true")
1019
- {
1020
- $boldArray[$wdIndex] = 1;
1021
- $lnBoldCount++;
1022
- }
1023
-
1024
- if($wdItalic eq "true" || $runItalic eq "true" || $lnItalic eq "true")
1025
- {
1026
- $italicArray[$wdIndex] = 1;
1027
- $lnItalicCount++;
1028
- }
1029
- } # if($isXmlFeature)
1030
-
1031
- ## add text
1032
- $text .= "$word";
1033
-
1034
- if($isRun) { $runText .= "$word "; }
1035
- $wdIndex++;
1036
- }
1037
- ## end wd
1038
- elsif ($line =~ /^<\/wd>$/)
1039
- {
1040
- $isWd = 0;
1041
-
1042
- if($isMarkup)
1043
- {
1044
- $markupOutput .= "$wdText $wdAttr\n";
1045
- $wdAttr = "";
1046
- }
1047
- }
1048
- ## end run
1049
- elsif ($line =~ /^(.*)<\/run>$/)
1050
- {
1051
- my $word = $1;
1052
-
1053
- ## add text
1054
- if($word ne "")
1055
- {
1056
- # Bold & Italic
1057
- if($isXmlFeature)
1058
- {
1059
- if($runBold eq "true" || $lnBold eq "true")
1060
- {
1061
- $boldArray[$wdIndex] = 1;
1062
- $lnBoldCount++;
1063
- }
1064
-
1065
- if($runItalic eq "true" || $lnItalic eq "true")
1066
- {
1067
- $italicArray[$wdIndex] = 1;
1068
- $lnItalicCount++;
1069
- }
1070
- }
1071
-
1072
- # appear in the final result
1073
- if($isLn) { $text .= "$word"; }
1074
-
1075
- # for internal record
1076
- if($isRun) { $runText .= "$word "; }
1077
- if($isWd) { $wdText .= "$word"; }
1078
-
1079
- $wdIndex++;
1080
- }
1081
-
1082
- # xml feature
1083
- # not a space, tab or new-line run
1084
- if($isXmlFeature && $runText ne "")
1085
- {
1086
- my @words = split(/\s+/, $runText);
1087
- my $numWords = scalar(@words);
1088
- checkFontAttr($runAttr, "fontSize", \%fontSizeHash, $numWords);
1089
- checkFontAttr($runAttr, "fontFace", \%fontFaceHash, $numWords);
1090
- }
1091
-
1092
- # reset run
1093
- # <run> not enclosed within <ln>
1094
- if (!$isLn) { $wdIndex = 0; }
1095
- $runText = "";
1096
- $isRun = 0;
1097
- $isSpecialSpace = 0;
1098
-
1099
- # Bold & Italic
1100
- if($isXmlFeature)
1101
- {
1102
- $runBold = "none";
1103
- $runItalic = "none";
1104
-
1105
- # <run> not enclosed within <ln>
1106
- if (!$isLn)
1107
- {
1108
- $lnBoldCount = 0;
1109
- $lnItalicCount = 0;
1110
- }
1111
- }
1112
- }
1113
- ## end ln
1114
- elsif ($line =~ /^<\/ln>$/)
1115
- {
1116
- if((!$isAllowEmpty && $text !~ /^\s*$/) || ($isAllowEmpty && $text ne ""))
1117
- {
1118
- # there's a forced EOL?
1119
- # not an emply line with space character
1120
- if($isForcedEOF eq "true" || (!$isCell && !$isSpecialSpace) )
1121
- {
1122
- $text .= "\n";
1123
-
1124
- # update allText
1125
- $allText .= $text;
1126
- $text = "";
1127
- }
1128
-
1129
- my $numWords = $wdIndex;
1130
-
1131
- if (!$isCell)
1132
- {
1133
- if ($isFirstLinePara)
1134
- {
1135
- push(@gPara, "yes");
1136
- $isFirstLinePara = 0;
1137
- }
1138
- else
1139
- {
1140
- push(@gPara, "no");
1141
- }
1142
- }
1143
-
1144
- if ($isXmlFeature && $numWords >= 1)
1145
- {
1146
- # xml feature
1147
- # assumtion that: fontSize is either occur in <ln>, or within multiple <run> under <ln>, but not both
1148
- checkFontAttr($lnAttr, "fontSize", \%fontSizeHash, $numWords);
1149
- checkFontAttr($lnAttr, "fontFace", \%fontFaceHash, $numWords);
1150
- }
1151
-
1152
- if ($isXmlFeature && !$isCell && !$isSpecialSpace)
1153
- {
1154
- my $pos = ($t+$bottom)/2.0;
1155
- if ($pos < $gMinPos) { $gMinPos = $pos; }
1156
- if ($pos > $gMaxPos) { $gMaxPos = $pos; }
1157
-
1158
- push(@gPosHash, $pos); # pos feature
1159
- push(@gAlign, $align); # alignment feature
1160
- push(@gTable, "no"); # table feature
1161
-
1162
- # pic feature
1163
- if($isPic)
1164
- {
1165
- push(@gPic, "yes");
1166
-
1167
- ### Not assign value ###
1168
- push(@gFontSize, -1); # bold feature
1169
- push(@gFontFace, "none"); # bold feature
1170
- push(@gBold, "no"); # bold feature
1171
- push(@gItalic, "no"); # italic feature
1172
- push(@gBullet, "no"); # bullet feature
1173
- }
1174
- else
1175
- {
1176
- push(@gPic, "no");
1177
-
1178
- updateXMLFontFeature(\%fontSizeHash, \%fontFaceHash);
1179
- %fontSizeHash = (); %fontFaceHash = ();
1180
-
1181
- updateXMLFeatures($lnBoldCount, $lnItalicCount, $numWords, $isBullet, $space);
1182
- } # end if pic
1183
- } # end if($isXmlFeature && !$isCell && !$isSpecialSpace)
1184
- }
1185
-
1186
- ## reset ln
1187
- $isLn = 0;
1188
- $isForcedEOF = "none";
1189
- $isSpecialSpace = 0;
1190
- $wdIndex = 0;
1191
-
1192
- # Bold & Italic
1193
- if ($isXmlFeature)
1194
- {
1195
- $lnBold = "none";
1196
- $lnItalic = "none";
1197
-
1198
- $lnBoldCount = 0;
1199
- $lnItalicCount = 0;
1200
- }
1201
- } # end else </ln>
1202
- ## nl newline signal
1203
- elsif ($line =~ /^<nl orig=\"true\"\/>$/)
1204
- {
1205
- if($isLn)
1206
- {
1207
- $isSpace = 0;
1208
- }
1209
- else
1210
- {
1211
- if($isDebug)
1212
- {
1213
- print STDERR "#!!! Warning: found <nl orig=\"true\"\/> while not in tag <ln>: $line\n";
1214
- }
1215
- }
1216
- }
1217
- ## space
1218
- elsif ($line =~ /^<space\/>$/)
1219
- {
1220
- my $startTag = "";
1221
- my $endTag = "";
1222
- if($i>0 && $lines[$i-1] =~ /^<(.+?)\b.*/)
1223
- {
1224
- $startTag = $1;
1225
- }
1226
-
1227
- if($i < (scalar(@lines) -1) && $lines[$i+1] =~ /^<\/(.+)>/)
1228
- {
1229
- $endTag = $1;
1230
- }
1231
-
1232
- if($startTag eq $endTag && $startTag ne "")
1233
- {
1234
- # print STDERR "# Special space after \"$text\"\n";
1235
- $isSpecialSpace = 1;
1236
- }
1237
-
1238
- ## addText
1239
- $text .= " ";
1240
- $isSpace = 1;
1241
- }
1242
- ## tab
1243
- elsif ($line =~ /^<tab .*\/>$/)
1244
- {
1245
- ## add Text
1246
- $text .= "\t";
1247
- $isTab = 1;
1248
- }
1249
- ## bullet
1250
- elsif ($line =~ /^<bullet .*>$/)
1251
- {
1252
- $isBullet = 1;
1253
- }
1254
- }
1255
-
1256
- $allText .= $text;
1257
- return ($allText, $l, $t, $r, $bottom, $isSpace);
1258
- }
1259
-
1260
- sub updateXMLFontFeature {
1261
- my ($fontSizeHash, $fontFaceHash) = @_;
1262
-
1263
- # font size feature
1264
- if(scalar(keys %{$fontSizeHash}) == 0){
1265
- push(@gFontSize, -1);
1266
- } else {
1267
- my @sortedFonts = sort { $fontSizeHash->{$b} <=> $fontSizeHash->{$a} } keys %{$fontSizeHash};
1268
-
1269
- my $fontSize = $sortedFonts[0];
1270
- push(@gFontSize, $fontSize);
1271
-
1272
- $gFontSizeHash{$fontSize} = $gFontSizeHash{$fontSize} ? ($gFontSizeHash{$fontSize}+1) : 1;
1273
- }
1274
-
1275
- # font face feature
1276
- if(scalar(keys %{$fontFaceHash}) == 0){
1277
- push(@gFontFace, "none");
1278
- } else {
1279
- my @sortedFonts = sort { $fontFaceHash->{$b} <=> $fontFaceHash->{$a} } keys %{$fontFaceHash};
1280
- my $fontFace = $sortedFonts[0];
1281
- push(@gFontFace, $fontFace);
1282
-
1283
- $gFontFaceHash{$fontFace} = $gFontFaceHash{$fontFace} ? ($gFontFaceHash{$fontFace}+1) : 1;
1284
- }
1285
- }
1286
-
1287
- sub updateXMLFeatures {
1288
- my ($lnBoldCount, $lnItalicCount, $numWords, $isBullet, $space) = @_;
1289
- # bold feature
1290
- my $boldFeature;
1291
- if ($lnBoldCount/$numWords >= 0.667){
1292
- $boldFeature = "yes";
1293
- } else {
1294
- $boldFeature = "no";
1295
- }
1296
- push(@gBold, $boldFeature);
1297
-
1298
- # italic feature
1299
- my $italicFeature;
1300
- if ($lnItalicCount/$numWords >= 0.667){
1301
- $italicFeature = "yes";
1302
- } else {
1303
- $italicFeature = "no";
1304
- }
1305
- push(@gItalic, $italicFeature);
1306
-
1307
- # bullet feature
1308
- if($isBullet){
1309
- push(@gBullet, "yes");
1310
- } else {
1311
- push(@gBullet, "no");
1312
- }
1313
-
1314
- # space feature
1315
- # push(@gSpace, $space);
1316
- }
1317
-
1318
- ## Find the positions of header, body, and citation
1319
- sub getStructureInfo {
1320
- my ($lines, $numLines) = @_;
1321
-
1322
- my ($bodyLength, $citationLength, $bodyEndId) =
1323
- SectLabel::PreProcess::findCitationText($lines, 0, $numLines);
1324
-
1325
- my ($headerLength, $bodyStartId);
1326
- ($headerLength, $bodyLength, $bodyStartId) =
1327
- SectLabel::PreProcess::findHeaderText($lines, 0, $bodyLength);
1328
-
1329
- # sanity check
1330
- my $totalLength = $headerLength + $bodyLength + $citationLength;
1331
- if($numLines != $totalLength){
1332
- print STDOUT "Die in getStructureInfo(): different num lines $numLines != $totalLength\n"; # to display in Web
1333
- die "Die in getStructureInfo(): different num lines $numLines != $totalLength\n";
1334
- }
1335
- return ($headerLength, $bodyLength, $citationLength, $bodyStartId, $bodyEndId);
1336
- }
1337
-
1338
- ## Count XML tags/values for statistics purpose
1339
- sub processTagInfo {
1340
- my ($line, $tags) = @_;
1341
-
1342
- my $tag;
1343
- my $attr;
1344
- if($line =~ /^<(.+?)\b(.*)/){
1345
- $tag = $1;
1346
- $attr = $2;
1347
- if(!$tags->{$tag}){
1348
- $tags->{$tag} = ();
1349
- }
1350
- if($attr =~ /^\s*(.+?)\s*\/?>/){
1351
- $attr = $1;
1352
- }
1353
-
1354
- my @tokens = split(/\s+/, $attr);
1355
- foreach my $token (@tokens){
1356
- if($token =~ /^(.+)=(.+)$/){
1357
- my $attrName = $1;
1358
- my $value = $2;
1359
- if(!$tags->{$tag}->{$attrName}){
1360
- $tags->{$tag}->{$attrName} = ();
1361
- }
1362
- if(!$tags->{$tag}->{$attrName}->{$value}){
1363
- $tags->{$tag}->{$attrName}->{$value} = 0;
1364
- }
1365
- $tags->{$tag}->{$attrName}->{$value}++;
1366
- }
1367
- }
1368
- }
1369
- }
1370
-
1371
- ## Print tag info to file
1372
- sub printTagInfo {
1373
- my ($tags, $tagFile) = @_;
1374
-
1375
- open(TAG, ">:utf8", "$tagFile") || die"#Can't open file \"$tagFile\"\n";
1376
- my @sortedTags = sort {$a cmp $b} keys %{$tags};
1377
- foreach(@sortedTags){
1378
- my @attrs = sort {$a cmp $b} keys %{$tags->{$_}};
1379
- print TAG "# Tag = $_\n";
1380
- foreach my $attr (@attrs) {
1381
- print TAG "$attr:";
1382
- my @values = sort {$a cmp $b} keys %{$tags->{$_}->{$attr}};
1383
- foreach my $value (@values){
1384
- print TAG " $value-$tags->{$_}->{$attr}->{$value}";
1385
- }
1386
- print TAG "\n";
1387
- }
1388
- }
1389
- close TAG;
1390
- }
1391
-
1392
- sub untaintPath {
1393
- my ($path) = @_;
1394
-
1395
- if ( $path =~ /^([-_\/\w\.]*)$/ ) {
1396
- $path = $1;
1397
- } else {
1398
- die "Bad path \"$path\"\n";
1399
- }
1400
-
1401
- return $path;
1402
- }
1403
-
1404
- sub untaint {
1405
- my ($s) = @_;
1406
- if ($s =~ /^([\w \-\@\(\),\.\/]+)$/) {
1407
- $s = $1; # $data now untainted
1408
- } else {
1409
- die "Bad data in $s"; # log this somewhere
1410
- }
1411
- return $s;
1412
- }
1413
-
1414
- sub execute {
1415
- my ($cmd) = @_;
1416
- if($isDebug){
1417
- print STDERR "Executing: $cmd\n";
1418
- }
1419
- $cmd = untaint($cmd);
1420
- system($cmd);
1421
- }
1422
-
1423
- sub newTmpFile {
1424
- my $tmpFile = `date '+%Y%m%d-%H%M%S-$$'`;
1425
- chomp($tmpFile);
1426
- return $tmpFile;
1427
- }