biblicit 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1427 +0,0 @@
1
- #!/usr/bin/perl -wT
2
-
3
- # Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
4
- # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
5
-
6
- require 5.0;
7
- use strict;
8
- use Getopt::Long;
9
- use HTML::Entities;
10
-
11
- # I do not know a better solution to find a lib path in -T mode.
12
- # So if you know a better solution, I'd be glad to hear.
13
- # See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
14
- use FindBin;
15
- FindBin::again(); # to get correct path in case 2 scripts in different directories use FindBin
16
-
17
- my $path;
18
- BEGIN
19
- {
20
- if ($FindBin::Bin =~ /(.*)/)
21
- {
22
- $path = $1;
23
- }
24
- }
25
- use lib "$path/../../lib";
26
-
27
- use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
28
- use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";
29
-
30
- use SectLabel::PreProcess;
31
-
32
- # USER customizable section
33
- $0 =~ /([^\/]+)$/; my $progname = $1;
34
- my $outputVersion = "1.0";
35
- # END user customizable section
36
-
37
- sub License
38
- {
39
- print STDERR "# Copyright 2009 \251 by Luong Minh Thang\n";
40
- }
41
-
42
- # HELP sub-procedure
43
- sub Help
44
- {
45
- print STDERR "Process Omnipage XML output (concatenated results fromm all pages of a PDF file), and extract text lines together with other XML infos\n";
46
- print STDERR "usage: $progname -h\t[invokes help]\n";
47
- print STDERR " $progname -in xmlFile -out outFile [-xmlFeature -decode -markup -para] [-tag tagFile -allowEmptyLine -log]\n";
48
- print STDERR "Options:\n";
49
- print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
50
- print STDERR "\t-xmlFeature: append XML feature together with text extracted\n";
51
- print STDERR "\t-decode: decode HTML entities and then output, to avoid double entity encoding later\n";
52
- print STDERR "\t-para: marking in the output each paragraph with # Para lineId numLines\n";
53
- print STDERR "\t-markup: marking in the output detailed word-level info ### Page w h\\n## Para l t r b\\n# Line l t r b\\nword l t r b\n";
54
- print STDERR "\t-tag tagFile: count XML tags/values for statistics purpose\n";
55
- }
56
-
57
- my $QUIET = 0;
58
- my $HELP = 0;
59
- my $outFile = undef;
60
- my $inFile = undef;
61
-
62
- my $isXmlFeature = 0;
63
- my $isDecode = 0;
64
-
65
- my $isMarkup = 0;
66
- my $isParaDelimiter = 0;
67
-
68
- my $tagFile = "";
69
- my $isAllowEmpty = 0;
70
- my $isDebug = 1;
71
-
72
- $HELP = 1 unless GetOptions('in=s' => \$inFile,
73
- 'out=s' => \$outFile,
74
- 'decode' => \$isDecode,
75
- 'xmlFeature' => \$isXmlFeature,
76
-
77
- 'tag=s' => \$tagFile,
78
- 'allowEmptyLine' => \$isAllowEmpty,
79
- 'markup' => \$isMarkup,
80
-
81
- 'para' => \$isParaDelimiter,
82
- 'log' => \$isDebug,
83
- 'h' => \$HELP,
84
- 'q' => \$QUIET);
85
-
86
- if ($HELP || !defined $inFile || !defined $outFile)
87
- {
88
- Help();
89
- exit(0);
90
- }
91
-
92
- if (!$QUIET)
93
- {
94
- License();
95
- }
96
-
97
- # Untaint
98
- $inFile = untaintPath($inFile);
99
- $outFile = untaintPath($outFile);
100
- $tagFile = untaintPath($tagFile);
101
- $ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
102
- # End untaint
103
-
104
- # Mark page, para, line, word
105
- my %gPageHash = ();
106
-
107
- # Mark paragraph
108
- my @gPara = ();
109
-
110
- # XML features
111
- # locFeature
112
- my @gPosHash = ();
113
- my $gMinPos = 1000000;
114
- my $gMaxPos = 0;
115
-
116
- # alignFeature
117
- my @gAlign = ();
118
-
119
- # boldFeature
120
- my @gBold = ();
121
-
122
- # italicFeature
123
- my @gItalic = ();
124
-
125
- # fontSizeFeature
126
- my %gFontSizeHash = ();
127
- my @gFontSize = ();
128
-
129
- # fontFaceFeature
130
- my %gFontFaceHash = ();
131
- my @gFontFace = ();
132
-
133
- # picFeature
134
- my @gPic = ();
135
-
136
- # tableFeature
137
- my @gTable = ();
138
-
139
- # bulletFeature
140
- my @gBullet = ();
141
-
142
- # spaceFeature
143
- #my %gSpaceHash = ();
144
- #my @gSpace = ();
145
- # End XML features
146
-
147
- my %tags = ();
148
-
149
- if ($isDebug)
150
- {
151
- print STDERR "\n# Processing file $inFile & output to $outFile\n";
152
- }
153
-
154
- my $markupOutput = "";
155
- my $allText = processFile($inFile, $outFile, \%tags);
156
-
157
- # Find header part
158
- my @lines = split(/\n/, $allText);
159
- my $numLines = scalar(@lines);
160
- my ($headerLength, $bodyLength, $bodyStartId) = SectLabel::PreProcess::FindHeaderText(\@lines, 0, $numLines);
161
-
162
- # Output
163
- if ($isMarkup)
164
- {
165
- open(OF, ">:utf8", "$outFile") || die"#Can't open file \"$outFile\"\n";
166
- print OF "$markupOutput";
167
- close OF;
168
- }
169
- else
170
- {
171
- output(\@lines, $outFile);
172
- }
173
-
174
- if ($tagFile ne "")
175
- {
176
- printTagInfo(\%tags, $tagFile);
177
- }
178
-
179
- sub processFile
180
- {
181
- my ($inFile, $tags) = @_;
182
-
183
- if (!(-e $inFile)) { die "# $progname crash\t\tFile \"$inFile\" doesn't exist"; }
184
- open (IF, "<:utf8", $inFile) || die "# $progname crash\t\tCan't open \"$inFile\"";
185
-
186
- my $isPic = 0;
187
- my $isPara = 0;
188
- my $isTable = 0;
189
- my $isSpace = 0;
190
-
191
- my $text = undef;
192
- my $allText = undef;
193
-
194
- my $lineId = 0;
195
-
196
- # Each line contains a header
197
- while (<IF>)
198
- {
199
- if (/^\#/) { next; } # skip comments
200
-
201
- chomp;
202
- s/\cM$//; # remove ^M character at the end of the file if any
203
-
204
- my $line = $_;
205
- if($tagFile ne "")
206
- {
207
- processTagInfo($line, $tags);
208
- }
209
-
210
- # if ($line =~ /<\?xml version.+>/) { } ### Xml ###
211
- # if ($line =~ /^<\/column>$/) { } ### Column ###
212
- if ($isMarkup && $line =~ /<theoreticalPage (.*)\/>/ && $isMarkup)
213
- {
214
- $markupOutput .= "### Page $1\n";
215
- }
216
-
217
- ### pic ###
218
- if ($line =~ /^<dd (.*)>$/)
219
- {
220
- $isPic = 1;
221
- if($isMarkup)
222
- {
223
- $markupOutput .= "### Dd $1\n";
224
- }
225
- }
226
- elsif ($line =~ /^<\/dd>$/)
227
- {
228
- $isPic = 0;
229
- }
230
-
231
- ### Table ###
232
- if ($line =~ /^<table .*>$/)
233
- {
234
- $text .= $line."\n"; # we need the header
235
- $isTable = 1;
236
- }
237
- elsif ($line =~ /^<\/table>$/)
238
- {
239
- my $tableText = processTable($text, $isPic);
240
- $allText .= $tableText;
241
-
242
- my @tmpLines = split(/\n/, $tableText);
243
- $lineId += scalar(@tmpLines);
244
-
245
- $isTable = 0;
246
- $text = "";
247
- }
248
- elsif($isTable)
249
- {
250
- $text .= $line."\n";
251
- next;
252
- }
253
- ### Paragraph ###
254
- # Note: table processing should have higher priority than paragraph, i.e. the priority does matter
255
- elsif ($line =~ /^<para (.*)>$/)
256
- {
257
- $text .= $line."\n"; # we need the header
258
- $isPara = 1;
259
-
260
- if($isMarkup)
261
- {
262
- $markupOutput .= "## Para $1\n";
263
- }
264
- }
265
- elsif ($line =~ /^<\/para>$/)
266
- {
267
- my ($paraText, $l, $t, $r, $b);
268
- ($paraText, $l, $t, $r, $b, $isSpace) = processPara($text, 0, $isPic);
269
- $allText .= $paraText;
270
-
271
- my @tmpLines = split(/\n/, $paraText);
272
- $lineId += scalar(@tmpLines);
273
- $isPara = 0;
274
- $text = "";
275
- }
276
- elsif($isPara)
277
- {
278
- $text .= $line."\n";
279
- next;
280
- }
281
- }
282
-
283
- close IF;
284
- return $allText;
285
- }
286
-
287
- sub output
288
- {
289
- my ($lines, $outFile) = @_;
290
-
291
- open(OF, ">:utf8", "$outFile") || die"#Can't open file \"$outFile\"\n";
292
-
293
- ####### Final output ############
294
- # xml feature label
295
- my %gFontSizeLabels = ();
296
- # my %gSpaceLabels = (); # yes, no
297
-
298
- if($isXmlFeature)
299
- {
300
- getFontSizeLabels(\%gFontSizeHash, \%gFontSizeLabels);
301
- # getSpaceLabels(\%gSpaceHash, \%gSpaceLabels);
302
- }
303
-
304
- my $id = -1;
305
- my $output = "";
306
- my $paraLineId = -1;
307
- my $paraLineCount = 0;
308
- foreach my $line (@{$lines})
309
- {
310
- $id++;
311
-
312
- $line =~ s/\cM$//; # remove ^M character at the end of each line if any
313
-
314
- # empty lines
315
- if ($line =~ /^\s*$/)
316
- {
317
- if (!$isAllowEmpty)
318
- {
319
- next;
320
- }
321
- else
322
- {
323
- if ($isDebug)
324
- {
325
- print STDERR "#! Line $id empty!\n";
326
- }
327
- }
328
- }
329
-
330
- if ($gPara[$id] eq "yes")
331
- {
332
- # mark para
333
- if ($output ne "")
334
- {
335
- if ($isParaDelimiter)
336
- {
337
- print OF "# Para $paraLineId $paraLineCount\n$output";
338
- $paraLineCount = 0;
339
- }
340
- else
341
- {
342
- if($isDecode)
343
- {
344
- $output = decode_entities($output);
345
- }
346
- print OF $output;
347
- }
348
-
349
- $output = "";
350
- }
351
- $paraLineId = $id;
352
- }
353
-
354
- $output .= $line;
355
- $paraLineCount++;
356
-
357
- ## Output XML features
358
- if ($isXmlFeature)
359
- {
360
- # loc feature
361
- my $locFeature;
362
-
363
- if($gPosHash[$id] != -1)
364
- {
365
- $locFeature = "xmlLoc_".int(($gPosHash[$id] - $gMinPos) * 8.0 / ($gMaxPos - $gMinPos + 1));
366
- }
367
-
368
- # align feature
369
- my $alignFeature = "xmlAlign_".$gAlign[$id];
370
-
371
- # fontSize feature
372
- my $fontSizeFeature;
373
- if($gFontSize[$id] == -1)
374
- {
375
- $fontSizeFeature = "xmlFontSize_none";
376
- }
377
- else
378
- {
379
- $fontSizeFeature = "xmlFontSize_".$gFontSizeLabels{$gFontSize[$id]};
380
- }
381
-
382
- my $boldFeature = "xmlBold_".$gBold[$id]; # bold feature
383
- my $italicFeature = "xmlItalic_".$gItalic[$id]; # italic feature
384
- my $picFeature = "xmlPic_".$gPic[$id]; # pic feature
385
- my $tableFeature = "xmlTable_".$gTable[$id]; # table feature
386
- my $bulletFeature = "xmlBullet_".$gBullet[$id]; # bullet feature
387
-
388
- # space feature
389
- #my $spaceFeature;
390
- #if($gSpace[$id] eq "none")
391
- #{
392
- # $spaceFeature = "xmlSpace_none";
393
- #}
394
- #else
395
- #{
396
- # $spaceFeature = "xmlSpace_".$gSpaceLabels{$gSpace[$id]};
397
- #}
398
-
399
- # differential features
400
- my ($alignDiff, $fontSizeDiff, $fontFaceDiff, $fontSFDiff, $fontSFBIDiff, $fontSFBIADiff, $paraDiff) = getDifferentialFeatures($id);
401
-
402
- $output .= " |XML| $locFeature $boldFeature $italicFeature $fontSizeFeature $picFeature $tableFeature $bulletFeature $fontSFBIADiff $paraDiff\n";
403
- # $alignFeature $alignDiff $fontSizeDiff $fontFaceDiff $fontSFDiff $fontSFBIDiff
404
- }
405
- else
406
- {
407
- $output .= "\n";
408
- }
409
- }
410
-
411
- # mark para
412
- if ($output ne "")
413
- {
414
- if ($isParaDelimiter)
415
- {
416
- print OF "# Para $paraLineId $paraLineCount\n$output";
417
- $paraLineCount = 0;
418
- }
419
- else
420
- {
421
- if ($isDecode)
422
- {
423
- $output = decode_entities($output);
424
- }
425
-
426
- print OF $output;
427
- }
428
- $output = ""
429
- }
430
- close OF;
431
- }
432
-
433
- sub getDifferentialFeatures
434
- {
435
- my ($id) = @_;
436
-
437
- # alignChange feature
438
- my $alignDiff = "bi_xmlA_";
439
- if ($id == 0)
440
- {
441
- $alignDiff .= $gAlign[$id];
442
- }
443
- elsif ($gAlign[$id] eq $gAlign[$id-1])
444
- {
445
- $alignDiff .= "continue";
446
- }
447
- else
448
- {
449
- $alignDiff .= $gAlign[$id];
450
- }
451
-
452
- # fontFaceChange feature
453
- my $fontFaceDiff = "bi_xmlF_";
454
- if ($id == 0)
455
- {
456
- $fontFaceDiff .= "new";
457
- }
458
- elsif ($gFontFace[$id] eq $gFontFace[$id-1])
459
- {
460
- $fontFaceDiff .= "continue";
461
- }
462
- else
463
- {
464
- $fontFaceDiff .= "new";
465
- }
466
-
467
- # fontSizeChange feature
468
- my $fontSizeDiff = "bi_xmlS_";
469
- if ($id == 0)
470
- {
471
- $fontSizeDiff .= "new";
472
- }
473
- elsif ($gFontSize[$id] == $gFontSize[$id-1])
474
- {
475
- $fontSizeDiff .= "continue";
476
- }
477
- else
478
- {
479
- $fontSizeDiff .= "new";
480
- }
481
-
482
- # fontSFChange feature
483
- my $fontSFDiff = "bi_xmlSF_";
484
- if($id == 0){
485
- $fontSFDiff .= "new";
486
- } elsif($gFontSize[$id] == $gFontSize[$id-1] && $gFontFace[$id] eq $gFontFace[$id-1]){
487
- $fontSFDiff .= "continue";
488
- } else {
489
- $fontSFDiff .= "new";
490
- }
491
-
492
- # fontSFBIChange feature
493
- my $fontSFBIDiff = "bi_xmlSFBI_";
494
- if($id == 0){
495
- $fontSFBIDiff .= "new";
496
- } elsif($gFontSize[$id] == $gFontSize[$id-1] && $gFontFace[$id] eq $gFontFace[$id-1] && $gBold[$id] eq $gBold[$id-1] && $gItalic[$id] eq $gItalic[$id-1]){
497
- $fontSFBIDiff .= "continue";
498
- } else {
499
- $fontSFBIDiff .= "new";
500
- }
501
-
502
- # fontSFBIAChange feature
503
- my $fontSFBIADiff = "bi_xmlSFBIA_";
504
- if($id == 0){
505
- $fontSFBIADiff .= "new";
506
- } elsif($gFontSize[$id] == $gFontSize[$id-1] && $gFontFace[$id] eq $gFontFace[$id-1] && $gBold[$id] eq $gBold[$id-1] && $gItalic[$id] eq $gItalic[$id-1] && $gAlign[$id] eq $gAlign[$id-1]){
507
- $fontSFBIADiff .= "continue";
508
- } else {
509
- $fontSFBIADiff .= "new";
510
- }
511
-
512
- # para change feature
513
- my $paraDiff = "bi_xmlPara_";
514
- if($id < $bodyStartId){ # header part, consider each line as a separate paragraph
515
- $paraDiff .= "header";
516
- } else {
517
- if($gPara[$id] eq "yes"){
518
- $paraDiff .= "new";
519
- } else {
520
- $paraDiff .= "continue";
521
- }
522
- }
523
-
524
- return ($alignDiff, $fontSizeDiff, $fontFaceDiff, $fontSFDiff, $fontSFBIDiff, $fontSFBIADiff, $paraDiff);
525
- }
526
-
527
- sub getFontSizeLabels {
528
- my ($gFontSizeHash, $gFontSizeLabels) = @_;
529
-
530
- if($isDebug){ print STDERR "# Map fonts\n"; }
531
- my @sortedFonts = sort { $gFontSizeHash->{$b} <=> $gFontSizeHash->{$a} } keys %{$gFontSizeHash}; # sort by values, obtain keys
532
-
533
- my $commonSize = $sortedFonts[0];
534
- @sortedFonts = sort { $a <=> $b } keys %{$gFontSizeHash}; # sort by keys, obtain keys
535
- my $commonIndex = 0; # index of common font size
536
- foreach(@sortedFonts){
537
- if($commonSize == $_) { # found
538
- last;
539
- }
540
- $commonIndex++;
541
- }
542
-
543
- # small fonts
544
- for(my $i = 0; $i<$commonIndex; $i++){ # smallIndex $largeIndex
545
- $gFontSizeLabels->{$sortedFonts[$i]} = "smaller";
546
-
547
- if($isDebug){
548
- print STDERR "$sortedFonts[$i] --> $gFontSizeLabels->{$sortedFonts[$i]}, freq = $gFontSizeHash->{$sortedFonts[$i]}\n";
549
- }
550
- }
551
-
552
- # common fonts
553
- $gFontSizeLabels->{$commonSize} = "common";
554
- if($isDebug){
555
- print STDERR "$sortedFonts[$commonIndex] --> $gFontSizeLabels->{$sortedFonts[$commonIndex]}, freq = $gFontSizeHash->{$sortedFonts[$commonIndex]}\n";
556
- }
557
-
558
- # large fonts
559
- for(my $i = ($commonIndex+1); $i<scalar(@sortedFonts); $i++){ # ($largeIndex+1) (scalar(@sortedFonts)-1)
560
- if((scalar(@sortedFonts)-$i) <= 3){
561
- $gFontSizeLabels->{$sortedFonts[$i]} = "largest".($i+1-scalar(@sortedFonts));
562
- } else {
563
- $gFontSizeLabels->{$sortedFonts[$i]} = "larger";
564
- }
565
-
566
- if($isDebug){
567
- print STDERR "$sortedFonts[$i] --> $gFontSizeLabels->{$sortedFonts[$i]}, freq = $gFontSizeHash->{$sortedFonts[$i]}\n";
568
- }
569
- }
570
- }
571
-
572
- sub getSpaceLabels {
573
- my ($gSpaceHash, $gSpaceLabels) = @_;
574
-
575
- if($isDebug){
576
- print STDERR "\n# Map space\n";
577
- }
578
- my @sortedSpaces = sort { $gSpaceHash->{$b} <=> $gSpaceHash->{$a} } keys %{$gSpaceHash}; # sort by freqs, obtain space faces
579
-
580
- my $commonSpace = $sortedSpaces[0];
581
- my $commonFreq = $gSpaceHash->{$commonSpace};
582
- # find similar common freq with larger spaces
583
- for(my $i = 0; $i<scalar(@sortedSpaces); $i++){ # 0 ($smallIndex-1)
584
- my $freq = $gSpaceHash->{$sortedSpaces[$i]};
585
- if($freq/$commonFreq > 0.8){
586
- if($sortedSpaces[$i] > $commonSpace){
587
- $commonSpace = $sortedSpaces[$i];
588
- }
589
- } else {
590
- last;
591
- }
592
- }
593
-
594
- for(my $i = 0; $i<scalar(@sortedSpaces); $i++){ # 0 ($smallIndex-1)
595
- if($sortedSpaces[$i] > $commonSpace){
596
- $gSpaceLabels->{$sortedSpaces[$i]} = "yes";
597
- } else {
598
- $gSpaceLabels->{$sortedSpaces[$i]} = "no";
599
- }
600
-
601
- if($isDebug){
602
- print STDERR "$sortedSpaces[$i] --> $gSpaceLabels->{$sortedSpaces[$i]}, freq = $gSpaceHash->{$sortedSpaces[$i]}\n";
603
- }
604
- }
605
- }
606
-
607
- sub processTable
608
- {
609
- my ($inputText, $isPic) = @_;
610
-
611
- my $isCell = 0; # for table cell object
612
-
613
- my $allText = "";
614
- my $text = "";
615
-
616
- my @lines = split(/\n/, $inputText);
617
- my %tablePos = (); # $tablePos{$cellText} = "$l-$t-$r-$bottom"
618
- my %table = (); # $table{$row}->{$col} = \@paraTexts
619
- my $rowFrom;
620
- my $colFrom;
621
- my $rowTill;
622
- my $colTill;
623
-
624
- # xml feature
625
- my $align = "none";
626
- my $pos = -1;
627
- foreach my $line (@lines)
628
- {
629
- if ($line =~ /^<table (.+?)>$/)
630
- {
631
- my $attr = $1;
632
-
633
- if($isMarkup) { $markupOutput .= "### Table $attr\n"; }
634
-
635
- # Fix: wrong regex sequence, huydhn
636
- #if ($attr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*alignment=\"(.+?)\".*$/)
637
- #{
638
- # my ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
639
- # $align = $5;
640
-
641
- # # pos feature
642
- # $pos = ($t+$bottom)/2.0;
643
-
644
- # if($pos < $gMinPos) { $gMinPos = $pos; }
645
- # if($pos > $gMaxPos) { $gMaxPos = $pos; }
646
- #}
647
- #else
648
- #{
649
- # print STDERR "# no table alignment or location \"$line\"\n";
650
- # $align = "";
651
- #}
652
-
653
- my ($l, $t, $r, $bottom) = undef;
654
- if ($attr =~ /^.*l=\"(\d+)\".*$/) { $l = $1; }
655
- if ($attr =~ /^.*t=\"(\d+)\".*$/) { $t = $1; }
656
- if ($attr =~ /^.*r=\"(\d+)\".*$/) { $r = $1; }
657
- if ($attr =~ /^.*b=\"(\d+)\".*$/) { $bottom = $1; }
658
-
659
- if ($t && $bottom)
660
- {
661
- # pos feature
662
- $pos = ($t + $bottom) / 2.0;
663
-
664
- if($pos < $gMinPos) { $gMinPos = $pos; }
665
- if($pos > $gMaxPos) { $gMaxPos = $pos; }
666
- }
667
- else
668
- {
669
- die "# Undefined table location \"$line\"\n";
670
- }
671
-
672
- if ($attr =~ /^.*alignment=\"(\d+)\".*$/)
673
- {
674
- $align = $1;
675
- }
676
- else
677
- {
678
- print STDERR "# no table alignment \"$line\"\n";
679
- $align = "";
680
- }
681
- # End.
682
- }
683
- elsif ($line =~ /^<cell .*gridColFrom=\"(\d+)\" gridColTill=\"(\d+)\" gridRowFrom=\"(\d+)\" gridRowTill=\"(\d+)\".*>$/) # new cell
684
- {
685
- $colFrom = $1;
686
- $colTill = $2;
687
- $rowFrom = $3;
688
- $rowTill = $4;
689
- #print STDERR "$rowFrom $rowTill $colFrom $colTill\n";
690
- $isCell = 1;
691
- }
692
- elsif ($line =~ /^<\/cell>$/) # end cell
693
- {
694
- my @paraTexts = ();
695
- processCell($text, \@paraTexts, \%tablePos, $isPic);
696
-
697
- for(my $i = $rowFrom; $i<=$rowTill; $i++)
698
- {
699
- for(my $j = $colFrom; $j<=$colTill; $j++)
700
- {
701
- if(!$table{$i}) { $table{$i} = (); }
702
- if(!$table{$i}->{$j}) { $table{$i}->{$j} = (); }
703
-
704
- if($i == $rowFrom && $j == $colFrom)
705
- {
706
- push(@{$table{$i}->{$j}}, @paraTexts);
707
- if(scalar(@paraTexts) > 1) { last; }
708
- }
709
- else
710
- {
711
- push(@{$table{$i}->{$j}}, ""); #add stub "" for spanning rows or cols
712
- }
713
- }
714
- }
715
-
716
- $isCell = 0;
717
- $text = "";
718
- }
719
- elsif($isCell)
720
- {
721
- $text .= $line."\n";
722
- next;
723
- }
724
- }
725
-
726
- # note: such a complicated code is because in the normal node, Omnipage doesn't seem to strictly print column by column given a row is fixed.
727
- # E.g if col1: paraText1, col2: paraText21\n$paraText22, and col3: paraText31\n$paraText32
728
- # It will print paraText1\tparaText21\tparaText31\n\t$paraText22\t$paraText32
729
- my @sortedRows = sort {$a <=> $b} keys %table;
730
- my $isFirstLinePara = 1;
731
- foreach my $row (@sortedRows)
732
- {
733
- my %tableR = %{$table{$row}};
734
- my @sortedCols = sort {$a <=> $b} keys %tableR;
735
- while(1)
736
- {
737
- my $isStop = 1;
738
- my $rowText = "";
739
-
740
- foreach my $col (@sortedCols)
741
- {
742
- # there's still some thing to process
743
- if(scalar(@{$tableR{$col}}) > 0)
744
- {
745
- $isStop = 0;
746
- $rowText .= shift(@{$tableR{$col}});
747
- }
748
- $rowText .= "\t";
749
- }
750
-
751
- if ((!$isAllowEmpty && $rowText =~ /^\s*$/) || ($isAllowEmpty && $rowText eq ""))
752
- {
753
- $isStop = 1;
754
- }
755
-
756
- if($isStop)
757
- {
758
- last;
759
- }
760
- else
761
- {
762
- $rowText =~ s/\t$/\n/;
763
- $allText .= $rowText;
764
- # print STDERR "$rowText";
765
-
766
- # para
767
- if($isFirstLinePara)
768
- {
769
- push(@gPara, "yes");
770
- $isFirstLinePara = 0;
771
- }
772
- else
773
- {
774
- push(@gPara, "no");
775
- }
776
-
777
- if($isXmlFeature)
778
- {
779
- # table feature
780
- push(@gTable, "yes");
781
-
782
- # pic feature
783
- if($isPic)
784
- {
785
- push(@gPic, "yes");
786
- }
787
- else
788
- {
789
- push(@gPic, "no");
790
- }
791
-
792
- push(@gPosHash, $pos); # update xml pos value
793
- push(@gAlign, $align); # update xml alignment value
794
-
795
- ### Not assign value ###
796
- push(@gFontSize, -1); # fontSize feature
797
- push(@gFontFace, "none"); # fontFace feature
798
- push(@gBold, "no"); # bold feature
799
- push(@gItalic, "no"); # italic feature
800
- push(@gBullet, "no"); # bullet feature
801
- # push(@gSpace, "none"); # space feature
802
- } # end if xml feature
803
- }
804
- }
805
- }
806
-
807
- return $allText;
808
- }
809
-
810
- sub processCell
811
- {
812
- my ($inputText, $paraTexts, $tablePos, $isPic) = @_;
813
-
814
- my $text = "";
815
- my @lines = split(/\n/, $inputText);
816
- my $isPara = 0;
817
- my $flag = 0;
818
- foreach my $line (@lines)
819
- {
820
- if ($line =~ /^<para (.*)>$/)
821
- {
822
- $text .= $line."\n"; # we need the header
823
- $isPara = 1;
824
-
825
- if($isMarkup)
826
- {
827
- $markupOutput .= "## ParaTable $1\n";
828
- }
829
- }
830
- elsif ($line =~ /^<\/para>$/)
831
- {
832
- my ($paraText, $l, $t, $r, $b) = processPara($text, 1, $isPic);
833
- my @tokens = split(/\n/, $paraText);
834
-
835
- foreach my $token (@tokens)
836
- {
837
- if($token ne "")
838
- {
839
- push(@{$paraTexts}, $token);
840
- $flag = 1;
841
- }
842
- }
843
-
844
- if(!$tablePos->{$paraText})
845
- {
846
- $tablePos->{$paraText} = "$l-$t-$r-$b";
847
- }
848
- else
849
- {
850
- #print STDERR "#! Warning: in method processCell, encounter the same paraText $paraText\n";
851
- }
852
-
853
- $isPara = 0;
854
- $text = "";
855
- }
856
- elsif ($isPara)
857
- {
858
- $text .= $line."\n";
859
- next;
860
- }
861
- }
862
-
863
- # at least one value should be added for cell which is ""
864
- if($flag == 0)
865
- {
866
- push(@{$paraTexts}, "");
867
- }
868
- }
869
-
870
- sub getAttrValue {
871
- my ($attrText, $attr) = @_;
872
-
873
- my $value = "none";
874
- if($attrText =~ /^.*$attr=\"(.+?)\".*$/){
875
- $value = $1;
876
- }
877
-
878
- return $value;
879
- }
880
-
881
- sub checkFontAttr {
882
- my ($attrText, $attr, $attrHash, $count) = @_;
883
-
884
- if($attrText =~ /^.*$attr=\"(.+?)\".*$/){
885
- my $attrValue = $1;
886
-
887
- $attrHash->{$attrValue} = $attrHash->{$attrValue} ? ($attrHash->{$attrValue}+$count) : $count;
888
- }
889
- }
890
-
891
- sub processPara
892
- {
893
- my ($inputText, $isCell, $isPic) = @_;
894
-
895
- my $isSpace = 0;
896
- my $isSpecialSpace = 0;
897
- my $isTab = 0;
898
- my $isBullet = 0;
899
-
900
- my $isForcedEOF = "none"; # 3 signals for end of L: forcedEOF=\"true\" in attribute of <ln> or || <nl orig=\"true\"\/> || end of </para> without encountering any of the above signal in the para plus $isSpace = 0
901
- # xml feature
902
- my $align = "none";
903
- my ($l, $t, $r, $bottom);
904
- my %fontSizeHash = ();
905
- my %fontFaceHash = ();
906
- my @boldArray = ();
907
- my @italicArray = ();
908
- my $space = "none";
909
-
910
- my $lnAttr; my $isLn = 0; my $lnBold = "none"; my $lnItalic = "none";
911
- my $runAttr; my $runText = ""; my $isRun = 0; my $runBold = "none"; my $runItalic = "none";
912
- my $wdAttr; my $wdText = ""; my $isWd = 0;
913
-
914
- my $wdIndex = 0; # word index in a line. When encountering </ln>, this parameter indicates the number of words in a line
915
- my $lnBoldCount = 0;
916
- my $lnItalicCount = 0;
917
-
918
- my $allText = "";
919
- my $text = ""; #invariant: when never enter a new line, $text will be copied into $allText, and $text is cleared
920
-
921
- binmode(STDERR, ":utf8");
922
-
923
- my $isFirstLinePara = 1;
924
- my @lines = split(/\n/, $inputText);
925
- for (my $i=0; $i<scalar(@lines); $i++)
926
- {
927
- my $line = $lines[$i];
928
-
929
- ## new para
930
- if ($line =~ /^<para (.+?)>$/)
931
- {
932
- my $attr = $1;
933
- $align = getAttrValue($attr, "alignment");
934
- # $indent = getAttrValue($attr, "li");
935
- $space = getAttrValue($attr, "spaceBefore");
936
- }
937
- ## new ln
938
- elsif ($line =~ /^<ln (.+)>$/)
939
- {
940
- $lnAttr = $1;
941
- $isLn = 1;
942
-
943
- if ($isMarkup)
944
- {
945
- $markupOutput .= "# Line $lnAttr\n";
946
- }
947
-
948
- # Fix: wrong regex sequence, huydhn
949
- #if ($lnAttr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*$/)
950
- #{
951
- # ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
952
- #}
953
-
954
- if ($lnAttr =~ /^.*l=\"(\d+)\".*$/) { $l = $1; } else { $l = undef; }
955
- if ($lnAttr =~ /^.*t=\"(\d+)\".*$/) { $t = $1; } else { $t = undef; }
956
- if ($lnAttr =~ /^.*r=\"(\d+)\".*$/) { $r = $1; } else { $r = undef; }
957
- if ($lnAttr =~ /^.*b=\"(\d+)\".*$/) { $bottom = $1; } else { $bottom = undef; }
958
- # End.
959
-
960
- $isForcedEOF = getAttrValue($lnAttr, "forcedEOF");
961
-
962
- # Bold & Italic
963
- if($isXmlFeature)
964
- {
965
- $lnBold = getAttrValue($lnAttr, "bold");
966
- $lnItalic = getAttrValue($lnAttr, "italic");
967
- }
968
- }
969
- ## new run
970
- elsif ($line =~ /<run (.*)>$/)
971
- {
972
- $runAttr = $1;
973
-
974
- $isSpace = 0;
975
- $isTab = 0;
976
- $isRun = 1;
977
-
978
- # new wd, that consists of many runs
979
- if($line =~ /^<wd (.*?)>/)
980
- {
981
- $isWd = 1;
982
- $wdAttr = $1;
983
- }
984
-
985
- # Bold & Italic
986
- if ($isXmlFeature)
987
- {
988
- $runBold = getAttrValue($runAttr, "bold");
989
- $runItalic = getAttrValue($runAttr, "italic");
990
- }
991
- }
992
- ## wd
993
- elsif ($line =~ /^<wd (.+)?>(.+)<\/wd>$/)
994
- {
995
- $wdAttr = $1;
996
- my $word = $2;
997
- $isSpace = 0;
998
- $isTab = 0;
999
-
1000
- if ($isMarkup)
1001
- {
1002
- $markupOutput .= "$word $wdAttr\n";
1003
- }
1004
-
1005
- # FontSize & FontFace
1006
- if($isXmlFeature)
1007
- {
1008
- checkFontAttr($wdAttr, "fontSize", \%fontSizeHash, 1);
1009
- checkFontAttr($wdAttr, "fontFace", \%fontFaceHash, 1);
1010
- }
1011
-
1012
- # Bold & Italic
1013
- if($isXmlFeature)
1014
- {
1015
- my $wdBold = getAttrValue($wdAttr, "bold");
1016
- my $wdItalic = getAttrValue($wdAttr, "italic");
1017
-
1018
- if($wdBold eq "true" || $runBold eq "true" || $lnBold eq "true")
1019
- {
1020
- $boldArray[$wdIndex] = 1;
1021
- $lnBoldCount++;
1022
- }
1023
-
1024
- if($wdItalic eq "true" || $runItalic eq "true" || $lnItalic eq "true")
1025
- {
1026
- $italicArray[$wdIndex] = 1;
1027
- $lnItalicCount++;
1028
- }
1029
- } # if($isXmlFeature)
1030
-
1031
- ## add text
1032
- $text .= "$word";
1033
-
1034
- if($isRun) { $runText .= "$word "; }
1035
- $wdIndex++;
1036
- }
1037
- ## end wd
1038
- elsif ($line =~ /^<\/wd>$/)
1039
- {
1040
- $isWd = 0;
1041
-
1042
- if($isMarkup)
1043
- {
1044
- $markupOutput .= "$wdText $wdAttr\n";
1045
- $wdAttr = "";
1046
- }
1047
- }
1048
- ## end run
1049
- elsif ($line =~ /^(.*)<\/run>$/)
1050
- {
1051
- my $word = $1;
1052
-
1053
- ## add text
1054
- if($word ne "")
1055
- {
1056
- # Bold & Italic
1057
- if($isXmlFeature)
1058
- {
1059
- if($runBold eq "true" || $lnBold eq "true")
1060
- {
1061
- $boldArray[$wdIndex] = 1;
1062
- $lnBoldCount++;
1063
- }
1064
-
1065
- if($runItalic eq "true" || $lnItalic eq "true")
1066
- {
1067
- $italicArray[$wdIndex] = 1;
1068
- $lnItalicCount++;
1069
- }
1070
- }
1071
-
1072
- # appear in the final result
1073
- if($isLn) { $text .= "$word"; }
1074
-
1075
- # for internal record
1076
- if($isRun) { $runText .= "$word "; }
1077
- if($isWd) { $wdText .= "$word"; }
1078
-
1079
- $wdIndex++;
1080
- }
1081
-
1082
- # xml feature
1083
- # not a space, tab or new-line run
1084
- if($isXmlFeature && $runText ne "")
1085
- {
1086
- my @words = split(/\s+/, $runText);
1087
- my $numWords = scalar(@words);
1088
- checkFontAttr($runAttr, "fontSize", \%fontSizeHash, $numWords);
1089
- checkFontAttr($runAttr, "fontFace", \%fontFaceHash, $numWords);
1090
- }
1091
-
1092
- # reset run
1093
- # <run> not enclosed within <ln>
1094
- if (!$isLn) { $wdIndex = 0; }
1095
- $runText = "";
1096
- $isRun = 0;
1097
- $isSpecialSpace = 0;
1098
-
1099
- # Bold & Italic
1100
- if($isXmlFeature)
1101
- {
1102
- $runBold = "none";
1103
- $runItalic = "none";
1104
-
1105
- # <run> not enclosed within <ln>
1106
- if (!$isLn)
1107
- {
1108
- $lnBoldCount = 0;
1109
- $lnItalicCount = 0;
1110
- }
1111
- }
1112
- }
1113
- ## end ln
1114
- elsif ($line =~ /^<\/ln>$/)
1115
- {
1116
- if((!$isAllowEmpty && $text !~ /^\s*$/) || ($isAllowEmpty && $text ne ""))
1117
- {
1118
- # there's a forced EOL?
1119
- # not an emply line with space character
1120
- if($isForcedEOF eq "true" || (!$isCell && !$isSpecialSpace) )
1121
- {
1122
- $text .= "\n";
1123
-
1124
- # update allText
1125
- $allText .= $text;
1126
- $text = "";
1127
- }
1128
-
1129
- my $numWords = $wdIndex;
1130
-
1131
- if (!$isCell)
1132
- {
1133
- if ($isFirstLinePara)
1134
- {
1135
- push(@gPara, "yes");
1136
- $isFirstLinePara = 0;
1137
- }
1138
- else
1139
- {
1140
- push(@gPara, "no");
1141
- }
1142
- }
1143
-
1144
- if ($isXmlFeature && $numWords >= 1)
1145
- {
1146
- # xml feature
1147
- # assumtion that: fontSize is either occur in <ln>, or within multiple <run> under <ln>, but not both
1148
- checkFontAttr($lnAttr, "fontSize", \%fontSizeHash, $numWords);
1149
- checkFontAttr($lnAttr, "fontFace", \%fontFaceHash, $numWords);
1150
- }
1151
-
1152
- if ($isXmlFeature && !$isCell && !$isSpecialSpace)
1153
- {
1154
- my $pos = ($t+$bottom)/2.0;
1155
- if ($pos < $gMinPos) { $gMinPos = $pos; }
1156
- if ($pos > $gMaxPos) { $gMaxPos = $pos; }
1157
-
1158
- push(@gPosHash, $pos); # pos feature
1159
- push(@gAlign, $align); # alignment feature
1160
- push(@gTable, "no"); # table feature
1161
-
1162
- # pic feature
1163
- if($isPic)
1164
- {
1165
- push(@gPic, "yes");
1166
-
1167
- ### Not assign value ###
1168
- push(@gFontSize, -1); # bold feature
1169
- push(@gFontFace, "none"); # bold feature
1170
- push(@gBold, "no"); # bold feature
1171
- push(@gItalic, "no"); # italic feature
1172
- push(@gBullet, "no"); # bullet feature
1173
- }
1174
- else
1175
- {
1176
- push(@gPic, "no");
1177
-
1178
- updateXMLFontFeature(\%fontSizeHash, \%fontFaceHash);
1179
- %fontSizeHash = (); %fontFaceHash = ();
1180
-
1181
- updateXMLFeatures($lnBoldCount, $lnItalicCount, $numWords, $isBullet, $space);
1182
- } # end if pic
1183
- } # end if($isXmlFeature && !$isCell && !$isSpecialSpace)
1184
- }
1185
-
1186
- ## reset ln
1187
- $isLn = 0;
1188
- $isForcedEOF = "none";
1189
- $isSpecialSpace = 0;
1190
- $wdIndex = 0;
1191
-
1192
- # Bold & Italic
1193
- if ($isXmlFeature)
1194
- {
1195
- $lnBold = "none";
1196
- $lnItalic = "none";
1197
-
1198
- $lnBoldCount = 0;
1199
- $lnItalicCount = 0;
1200
- }
1201
- } # end else </ln>
1202
- ## nl newline signal
1203
- elsif ($line =~ /^<nl orig=\"true\"\/>$/)
1204
- {
1205
- if($isLn)
1206
- {
1207
- $isSpace = 0;
1208
- }
1209
- else
1210
- {
1211
- if($isDebug)
1212
- {
1213
- print STDERR "#!!! Warning: found <nl orig=\"true\"\/> while not in tag <ln>: $line\n";
1214
- }
1215
- }
1216
- }
1217
- ## space
1218
- elsif ($line =~ /^<space\/>$/)
1219
- {
1220
- my $startTag = "";
1221
- my $endTag = "";
1222
- if($i>0 && $lines[$i-1] =~ /^<(.+?)\b.*/)
1223
- {
1224
- $startTag = $1;
1225
- }
1226
-
1227
- if($i < (scalar(@lines) -1) && $lines[$i+1] =~ /^<\/(.+)>/)
1228
- {
1229
- $endTag = $1;
1230
- }
1231
-
1232
- if($startTag eq $endTag && $startTag ne "")
1233
- {
1234
- # print STDERR "# Special space after \"$text\"\n";
1235
- $isSpecialSpace = 1;
1236
- }
1237
-
1238
- ## addText
1239
- $text .= " ";
1240
- $isSpace = 1;
1241
- }
1242
- ## tab
1243
- elsif ($line =~ /^<tab .*\/>$/)
1244
- {
1245
- ## add Text
1246
- $text .= "\t";
1247
- $isTab = 1;
1248
- }
1249
- ## bullet
1250
- elsif ($line =~ /^<bullet .*>$/)
1251
- {
1252
- $isBullet = 1;
1253
- }
1254
- }
1255
-
1256
- $allText .= $text;
1257
- return ($allText, $l, $t, $r, $bottom, $isSpace);
1258
- }
1259
-
1260
- sub updateXMLFontFeature {
1261
- my ($fontSizeHash, $fontFaceHash) = @_;
1262
-
1263
- # font size feature
1264
- if(scalar(keys %{$fontSizeHash}) == 0){
1265
- push(@gFontSize, -1);
1266
- } else {
1267
- my @sortedFonts = sort { $fontSizeHash->{$b} <=> $fontSizeHash->{$a} } keys %{$fontSizeHash};
1268
-
1269
- my $fontSize = $sortedFonts[0];
1270
- push(@gFontSize, $fontSize);
1271
-
1272
- $gFontSizeHash{$fontSize} = $gFontSizeHash{$fontSize} ? ($gFontSizeHash{$fontSize}+1) : 1;
1273
- }
1274
-
1275
- # font face feature
1276
- if(scalar(keys %{$fontFaceHash}) == 0){
1277
- push(@gFontFace, "none");
1278
- } else {
1279
- my @sortedFonts = sort { $fontFaceHash->{$b} <=> $fontFaceHash->{$a} } keys %{$fontFaceHash};
1280
- my $fontFace = $sortedFonts[0];
1281
- push(@gFontFace, $fontFace);
1282
-
1283
- $gFontFaceHash{$fontFace} = $gFontFaceHash{$fontFace} ? ($gFontFaceHash{$fontFace}+1) : 1;
1284
- }
1285
- }
1286
-
1287
- sub updateXMLFeatures {
1288
- my ($lnBoldCount, $lnItalicCount, $numWords, $isBullet, $space) = @_;
1289
- # bold feature
1290
- my $boldFeature;
1291
- if ($lnBoldCount/$numWords >= 0.667){
1292
- $boldFeature = "yes";
1293
- } else {
1294
- $boldFeature = "no";
1295
- }
1296
- push(@gBold, $boldFeature);
1297
-
1298
- # italic feature
1299
- my $italicFeature;
1300
- if ($lnItalicCount/$numWords >= 0.667){
1301
- $italicFeature = "yes";
1302
- } else {
1303
- $italicFeature = "no";
1304
- }
1305
- push(@gItalic, $italicFeature);
1306
-
1307
- # bullet feature
1308
- if($isBullet){
1309
- push(@gBullet, "yes");
1310
- } else {
1311
- push(@gBullet, "no");
1312
- }
1313
-
1314
- # space feature
1315
- # push(@gSpace, $space);
1316
- }
1317
-
1318
- ## Find the positions of header, body, and citation
1319
- sub getStructureInfo {
1320
- my ($lines, $numLines) = @_;
1321
-
1322
- my ($bodyLength, $citationLength, $bodyEndId) =
1323
- SectLabel::PreProcess::findCitationText($lines, 0, $numLines);
1324
-
1325
- my ($headerLength, $bodyStartId);
1326
- ($headerLength, $bodyLength, $bodyStartId) =
1327
- SectLabel::PreProcess::findHeaderText($lines, 0, $bodyLength);
1328
-
1329
- # sanity check
1330
- my $totalLength = $headerLength + $bodyLength + $citationLength;
1331
- if($numLines != $totalLength){
1332
- print STDOUT "Die in getStructureInfo(): different num lines $numLines != $totalLength\n"; # to display in Web
1333
- die "Die in getStructureInfo(): different num lines $numLines != $totalLength\n";
1334
- }
1335
- return ($headerLength, $bodyLength, $citationLength, $bodyStartId, $bodyEndId);
1336
- }
1337
-
1338
- ## Count XML tags/values for statistics purpose
1339
- sub processTagInfo {
1340
- my ($line, $tags) = @_;
1341
-
1342
- my $tag;
1343
- my $attr;
1344
- if($line =~ /^<(.+?)\b(.*)/){
1345
- $tag = $1;
1346
- $attr = $2;
1347
- if(!$tags->{$tag}){
1348
- $tags->{$tag} = ();
1349
- }
1350
- if($attr =~ /^\s*(.+?)\s*\/?>/){
1351
- $attr = $1;
1352
- }
1353
-
1354
- my @tokens = split(/\s+/, $attr);
1355
- foreach my $token (@tokens){
1356
- if($token =~ /^(.+)=(.+)$/){
1357
- my $attrName = $1;
1358
- my $value = $2;
1359
- if(!$tags->{$tag}->{$attrName}){
1360
- $tags->{$tag}->{$attrName} = ();
1361
- }
1362
- if(!$tags->{$tag}->{$attrName}->{$value}){
1363
- $tags->{$tag}->{$attrName}->{$value} = 0;
1364
- }
1365
- $tags->{$tag}->{$attrName}->{$value}++;
1366
- }
1367
- }
1368
- }
1369
- }
1370
-
1371
- ## Print tag info to file
1372
- sub printTagInfo {
1373
- my ($tags, $tagFile) = @_;
1374
-
1375
- open(TAG, ">:utf8", "$tagFile") || die"#Can't open file \"$tagFile\"\n";
1376
- my @sortedTags = sort {$a cmp $b} keys %{$tags};
1377
- foreach(@sortedTags){
1378
- my @attrs = sort {$a cmp $b} keys %{$tags->{$_}};
1379
- print TAG "# Tag = $_\n";
1380
- foreach my $attr (@attrs) {
1381
- print TAG "$attr:";
1382
- my @values = sort {$a cmp $b} keys %{$tags->{$_}->{$attr}};
1383
- foreach my $value (@values){
1384
- print TAG " $value-$tags->{$_}->{$attr}->{$value}";
1385
- }
1386
- print TAG "\n";
1387
- }
1388
- }
1389
- close TAG;
1390
- }
1391
-
1392
- sub untaintPath {
1393
- my ($path) = @_;
1394
-
1395
- if ( $path =~ /^([-_\/\w\.]*)$/ ) {
1396
- $path = $1;
1397
- } else {
1398
- die "Bad path \"$path\"\n";
1399
- }
1400
-
1401
- return $path;
1402
- }
1403
-
1404
- sub untaint {
1405
- my ($s) = @_;
1406
- if ($s =~ /^([\w \-\@\(\),\.\/]+)$/) {
1407
- $s = $1; # $data now untainted
1408
- } else {
1409
- die "Bad data in $s"; # log this somewhere
1410
- }
1411
- return $s;
1412
- }
1413
-
1414
- sub execute {
1415
- my ($cmd) = @_;
1416
- if($isDebug){
1417
- print STDERR "Executing: $cmd\n";
1418
- }
1419
- $cmd = untaint($cmd);
1420
- system($cmd);
1421
- }
1422
-
1423
- sub newTmpFile {
1424
- my $tmpFile = `date '+%Y%m%d-%H%M%S-$$'`;
1425
- chomp($tmpFile);
1426
- return $tmpFile;
1427
- }