biblicit 2.0.3 → 2.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,1025 +0,0 @@
1
- #!/usr/bin/perl -wT
2
- # Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
3
-
4
- # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
5
-
6
- require 5.0;
7
- use strict;
8
- use Getopt::Long;
9
- use HTML::Entities;
10
-
11
- # I do not know a better solution to find a lib path in -T mode.
12
- # So if you know a better solution, I'd be glad to hear.
13
- # See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
14
- use FindBin;
15
- FindBin::again(); # to get correct path in case 2 scripts in different directories use FindBin
16
- my $path;
17
- BEGIN {
18
- if ($FindBin::Bin =~ /(.*)/) {
19
- $path = $1;
20
- }
21
- }
22
- use lib "$path/../../lib";
23
- use SectLabel::PreProcess;
24
-
25
### USER customizable section
# Base name of the running script, used in help and error messages.
# Capture in list context so a failed match yields undef instead of a stale
# $1 left over from an earlier match; fall back to $0 itself in that case.
my ($progname) = $0 =~ m{([^/]+)$};
$progname = $0 unless defined $progname;
my $outputVersion = "1.0";
### END user customizable section
29
-
30
# Write the one-line copyright banner to STDERR.
sub License {
    my $banner = "# Copyright 2009 \251 by Luong Minh Thang\n";
    print STDERR $banner;
}
33
-
34
- ### HELP Sub-procedure
35
# Print the usage message to STDERR.
# Every option accepted by the GetOptions call in this script is described
# here, including -in, -out, -allowEmptyLine and -log, which the usage line
# mentions.
sub Help {
    my @usage = (
        "Process Omnipage XML output (concatenated results from all pages of a PDF file), and extract text lines together with other XML infos\n",
        "usage: $progname -h\t[invokes help]\n",
        " $progname -in xmlFile -out outFile [-xmlFeature -decode -markup -para] [-tag tagFile -allowEmptyLine -log]\n",
        "Options:\n",
        "\t-q\tQuiet Mode (don't echo license)\n",
        "\t-in xmlFile: Omnipage XML input file (required)\n",
        "\t-out outFile: output file (required)\n",
        "\t-xmlFeature: append XML feature together with text extracted\n",
        "\t-decode: decode HTML entities and then output, to avoid double entity encoding later\n",
        "\t-para: marking in the output each paragraph with # Para lineId numLines\n",
        "\t-markup: marking in the output detailed word-level info ### Page w h\\n## Para l t r b\\n# Line l t r b\\nword l t r b\n",
        "\n",
        "\t-tag tagFile: count XML tags/values for statistics purpose\n",
        "\t-allowEmptyLine: keep empty lines in the output instead of dropping them\n",
        "\t-log: print debugging information to STDERR\n",
    );
    print STDERR @usage;
}
48
# ---- Command-line state: one variable per option ----
my ($QUIET, $HELP) = (0, 0);    # -q, -h
my $outFile;                    # -out
my $inFile;                     # -in

my ($isXmlFeature, $isDecode) = (0, 0);       # -xmlFeature, -decode
my ($isMarkup, $isParaDelimiter) = (0, 0);    # -markup, -para

my $tagFile      = "";          # -tag; empty string means tag statistics are off
my $isAllowEmpty = 0;           # -allowEmptyLine
my $isDebug      = 0;           # -log

# An unparsable command line is treated the same as an explicit -h.
GetOptions(
    'in=s'           => \$inFile,
    'out=s'          => \$outFile,
    'decode'         => \$isDecode,
    'xmlFeature'     => \$isXmlFeature,
    'tag=s'          => \$tagFile,
    'allowEmptyLine' => \$isAllowEmpty,
    'markup'         => \$isMarkup,
    'para'           => \$isParaDelimiter,
    'log'            => \$isDebug,
    'h'              => \$HELP,
    'q'              => \$QUIET,
) or $HELP = 1;

# Both -in and -out are mandatory; show the usage text otherwise.
if ($HELP || !(defined $inFile && defined $outFile)) {
    Help();
    exit(0);
}

License() unless $QUIET;

### Untaint ###
# The script runs under -T, so paths from the command line must be validated
# before they are used, and PATH is pinned to a fixed value.
($inFile, $outFile, $tagFile) =
    map { untaintPath($_) } ($inFile, $outFile, $tagFile);
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
### End untaint ###
91
-
92
### Mark page, para, line, word
my %gPageHash = ();

### Mark paragraph
# One "yes"/"no" entry is pushed per processed <ln>; "yes" marks the first
# line of a paragraph (or of a table).
my @gPara = ();

### XML features ###
# Each array below receives one entry per line when -xmlFeature is on.

# locFeature: vertical midpoint of each line, plus the range seen so far
my @gPosHash = ();
my $gMinPos  = 1000000;
my $gMaxPos  = 0;

my @gAlign  = ();    # alignFeature
my @gBold   = ();    # bold feature
my @gItalic = ();    # italic feature

# font size feature: histogram over lines, and per-line value
my %gFontSizeHash = ();
my @gFontSize     = ();

# font face feature: histogram over lines, and per-line value
my %gFontFaceHash = ();
my @gFontFace     = ();

my @gPic    = ();    # pic feature
my @gTable  = ();    # table feature
my @gBullet = ();    # bullet feature

# space feature
#my %gSpaceHash = (); my @gSpace = ();
### End XML features ###

# Tag/attribute/value counters, filled only when -tag is given.
my %tags = ();
119
-
120
if ($isDebug) {
    print STDERR "\n# Processing file $inFile & output to $outFile\n";
}

# Filled by processFile()/processPara() when -markup is on.
my $markupOutput = "";

# processFile() takes (input file, tag hash). The output file is not one of
# its parameters, so it must not be passed here: doing so would put the file
# name in the tag-hash slot.
my $allText = processFile($inFile, \%tags);

# Find header part
my @lines = split(/\n/, $allText);
my $numLines = scalar(@lines);
my ($headerLength, $bodyLength, $bodyStartId) =
    SectLabel::PreProcess::findHeaderText(\@lines, 0, $numLines);

# Output
if ($isMarkup) {
    # Markup mode writes the accumulated word-level markup verbatim.
    open(my $markupFh, ">:utf8", $outFile) or die "#Can't open file \"$outFile\"\n";
    print {$markupFh} $markupOutput;
    # Buffered write errors only surface at close time.
    close($markupFh) or die "#Can't close file \"$outFile\": $!\n";
} else {
    output(\@lines, $outFile);
}

if ($tagFile ne "") {
    printTagInfo(\%tags, $tagFile);
}
145
-
146
# Read the Omnipage XML file line by line and return the extracted text.
#
# Arguments:
#   $inFile - path of the XML input file
#   $tags   - hash ref receiving tag statistics (only used when -tag is set)
#
# The script's own call site passes ($inFile, $outFile, \%tags), so the tag
# hash is taken from the LAST argument. This accepts both the two-argument
# and the three-argument form and never mistakes the output file name for
# the hash ref.
#
# Side effects: appends to the file-level $markupOutput when -markup is on,
# and processPara() fills the file-level feature arrays.
# Dies if the input file does not exist or cannot be opened.
sub processFile {
    my ($inFile, @rest) = @_;
    my $tags = @rest ? $rest[-1] : undef;
    $tags = {} unless ref($tags) eq 'HASH';    # scratch hash if none was supplied

    if (!(-e $inFile)) { die "# $progname crash\t\tFile \"$inFile\" doesn't exist"; }
    open(my $in, "<:utf8", $inFile) or die "# $progname crash\t\tCan't open \"$inFile\"";

    my $isPara = 0;              # currently between <para ...> and </para>
    my $isTable = 0;             # currently between <table ...> and </table>
    my $isPic = 0;               # currently between <dd ...> and </dd>
    my $isFirstTableCell = 0;    # cleared by processPara() after the first table line
    my $allText = "";
    my $text = "";               # raw XML lines of the paragraph being collected

    while (my $line = <$in>) {
        next if $line =~ /^\#/;    # skip comments
        chomp $line;
        $line =~ s/\cM$//;         # remove ^M character at the end of the line if any

        if ($tagFile ne "") {
            processTagInfo($line, $tags);
        }

        if ($isMarkup && $line =~ /<theoreticalPage (.*)\/>/) {
            $markupOutput .= "### Page $1\n";
        }

        ### pic ###
        if ($line =~ /^<dd (.*)>$/) {
            $isPic = 1;
            if ($isMarkup) {
                $markupOutput .= "### Figure $1\n";
            }
        }
        elsif ($line =~ /^<\/dd>$/) {
            $isPic = 0;
        }

        ### Table ###
        elsif ($line =~ /^<table (.*)>$/) {
            $isTable = 1;
            $isFirstTableCell = 1;
            if ($isMarkup) {
                $markupOutput .= "### Table $1\n";
            }
        }
        elsif ($line =~ /^<\/table>$/) {
            $isTable = 0;
        }

        ### Paragraph ###
        # Note: table processing should have higher priority than paragraph,
        # i.e. the order of these branches matters.
        elsif ($line =~ /^<para (.*)>$/) {
            $text .= $line."\n";    # processPara() needs the <para> header too
            $isPara = 1;
            if ($isMarkup) {
                $markupOutput .= "## Para $1\n";
            }
        }
        elsif ($line =~ /^<\/para>$/) {
            # Only the extracted text is needed here; the other values that
            # processPara() returns are ignored.
            my ($paraText) = processPara($text, $isTable, $isPic, \$isFirstTableCell);
            $allText .= $paraText;
            $isPara = 0;
            $text = "";
        }
        elsif ($isPara) {
            $text .= $line."\n";
        }
    }
    close($in);

    return $allText;
}
230
-
231
# Write the extracted lines to $outFile, one paragraph buffer at a time.
#
# Arguments:
#   $lines   - array ref of text lines (trailing ^M is stripped in place)
#   $outFile - path of the file to create
#
# Reads the file-level option flags and the per-line feature arrays filled
# by processPara(). With -xmlFeature each line gets a " |XML| ..." suffix;
# with -para each paragraph is preceded by "# Para <firstLineId> <numLines>".
# Dies if the output file cannot be opened or closed.
sub output {
    my ($lines, $outFile) = @_;

    open(my $out, ">:utf8", $outFile) or die "#Can't open file \"$outFile\"\n";

    ####### Final output ############
    # xml feature label
    my %gFontSizeLabels = ();
    if ($isXmlFeature) {
        getFontSizeLabels(\%gFontSizeHash, \%gFontSizeLabels);
    }

    my $id = -1;
    my $output = "";          # text of the paragraph collected so far
    my $paraLineId = -1;      # id of the first line of that paragraph
    my $paraLineCount = 0;

    # Emit the pending paragraph buffer, if any, and reset it.
    # NOTE(review): with -para the buffer is written without entity decoding
    # even when -decode is set; preserved from the original -- confirm intent.
    my $flush = sub {
        return if $output eq "";
        if ($isParaDelimiter) {
            print {$out} "# Para $paraLineId $paraLineCount\n$output";
            $paraLineCount = 0;
        } else {
            if ($isDecode) {
                $output = decode_entities($output);
            }
            print {$out} $output;
        }
        $output = "";
    };

    foreach my $line (@{$lines}) {
        $id++;

        $line =~ s/\cM$//;    # remove ^M character at the end of each line if any

        if ($line =~ /^\s*$/) {    # empty lines
            next if !$isAllowEmpty;
            if ($isDebug) {
                print STDERR "#! Line $id empty!\n";
            }
        }

        # A line flagged "yes" starts a new paragraph: flush the previous one.
        if (defined $gPara[$id] && $gPara[$id] eq "yes") {
            $flush->();
            $paraLineId = $id;
        }

        $output .= $line;
        $paraLineCount++;

        ## Output XML features ###
        if ($isXmlFeature) {
            # loc feature: vertical position scaled by the observed range;
            # left empty when the stored position is -1
            my $locFeature = "";
            if ($gPosHash[$id] != -1) {
                $locFeature = "xmlLoc_".int(($gPosHash[$id] - $gMinPos)*8.0/($gMaxPos - $gMinPos + 1));
            }

            # fontSize feature
            my $fontSizeFeature;
            if ($gFontSize[$id] == -1) {
                $fontSizeFeature = "xmlFontSize_none";
            } else {
                $fontSizeFeature = "xmlFontSize_".$gFontSizeLabels{$gFontSize[$id]};
            }

            my $boldFeature   = "xmlBold_".$gBold[$id];
            my $italicFeature = "xmlItalic_".$gItalic[$id];
            my $picFeature    = "xmlPic_".$gPic[$id];
            my $tableFeature  = "xmlTable_".$gTable[$id];
            my $bulletFeature = "xmlBullet_".$gBullet[$id];

            ## Differential features: only the last two returned values are emitted
            my ($fontSFBIADiff, $paraDiff) = (getDifferentialFeatures($id))[5, 6];

            $output .= " |XML| $locFeature $boldFeature $italicFeature $fontSizeFeature $picFeature $tableFeature $bulletFeature $fontSFBIADiff $paraDiff\n";
        } else {
            $output .= "\n";
        }
    }

    $flush->();    # last paragraph

    # Buffered write errors only surface at close time.
    close($out) or die "#Can't close file \"$outFile\": $!\n";
}
342
-
343
# Build the "differential" feature strings for line $id by comparing its
# per-line feature values with those of line $id-1.
#
# Returns the list
#   ($alignDiff, $fontSizeDiff, $fontFaceDiff, $fontSFDiff,
#    $fontSFBIDiff, $fontSFBIADiff, $paraDiff)
# Each font feature ends in "continue" when the compared attributes all equal
# the previous line's, and in "new" otherwise (line 0 is always "new").
# The alignment feature carries the alignment value itself instead of "new".
sub getDifferentialFeatures {
    my ($id) = @_;

    my $hasPrev = ($id != 0);
    my $prev = $id - 1;

    # Pairwise comparisons with the previous line; always false for line 0.
    my $sameAlign = $hasPrev && $gAlign[$id] eq $gAlign[$prev];
    my $sameFace  = $hasPrev && $gFontFace[$id] eq $gFontFace[$prev];
    my $sameSize  = $hasPrev && $gFontSize[$id] == $gFontSize[$prev];

    # Cumulative comparisons: S+F, then +Bold+Italic, then +Align.
    my $sameSF    = $sameSize && $sameFace;
    my $sameSFBI  = $sameSF
        && $gBold[$id] eq $gBold[$prev]
        && $gItalic[$id] eq $gItalic[$prev];
    my $sameSFBIA = $sameSFBI && $sameAlign;

    my $label = sub { return $_[0] ? "continue" : "new"; };

    my $alignDiff     = "bi_xmlA_" . ($sameAlign ? "continue" : $gAlign[$id]);
    my $fontFaceDiff  = "bi_xmlF_" . $label->($sameFace);
    my $fontSizeDiff  = "bi_xmlS_" . $label->($sameSize);
    my $fontSFDiff    = "bi_xmlSF_" . $label->($sameSF);
    my $fontSFBIDiff  = "bi_xmlSFBI_" . $label->($sameSFBI);
    my $fontSFBIADiff = "bi_xmlSFBIA_" . $label->($sameSFBIA);

    # para change feature: lines before the body start are all "header"
    my $paraState;
    if ($id < $bodyStartId) {
        $paraState = "header";
    } elsif ($gPara[$id] eq "yes") {
        $paraState = "new";
    } else {
        $paraState = "continue";
    }
    my $paraDiff = "bi_xmlPara_" . $paraState;

    return ($alignDiff, $fontSizeDiff, $fontFaceDiff, $fontSFDiff, $fontSFBIDiff, $fontSFBIADiff, $paraDiff);
}
420
-
421
# Map every font size seen in the document to a relative label.
#
# Arguments:
#   $gFontSizeHash   - hash ref: font size => number of lines using it
#   $gFontSizeLabels - hash ref filled in place: font size => label
#
# The most frequent size is labelled "common", every smaller size "smaller".
# Larger sizes are labelled "larger", except the three largest, which get
# "largest-2", "largest-1" and "largest0" (largest size last).
#
# An empty histogram leaves $gFontSizeLabels untouched. When several sizes
# share the top frequency, the smallest of them is chosen, so the result
# does not depend on hash ordering.
sub getFontSizeLabels {
    my ($gFontSizeHash, $gFontSizeLabels) = @_;

    if ($isDebug) { print STDERR "# Map fonts\n"; }

    my @sortedFonts = sort { $a <=> $b } keys %{$gFontSizeHash};    # ascending sizes
    return if !@sortedFonts;    # nothing to label

    # Index of the most frequent size; strict ">" keeps the smallest on ties.
    my $commonIndex = 0;
    for my $i (1 .. $#sortedFonts) {
        if ($gFontSizeHash->{$sortedFonts[$i]} > $gFontSizeHash->{$sortedFonts[$commonIndex]}) {
            $commonIndex = $i;
        }
    }
    my $commonSize = $sortedFonts[$commonIndex];

    # small fonts
    for (my $i = 0; $i < $commonIndex; $i++) {
        $gFontSizeLabels->{$sortedFonts[$i]} = "smaller";

        if ($isDebug) {
            print STDERR "$sortedFonts[$i] --> $gFontSizeLabels->{$sortedFonts[$i]}, freq = $gFontSizeHash->{$sortedFonts[$i]}\n";
        }
    }

    # common fonts
    $gFontSizeLabels->{$commonSize} = "common";
    if ($isDebug) {
        print STDERR "$sortedFonts[$commonIndex] --> $gFontSizeLabels->{$sortedFonts[$commonIndex]}, freq = $gFontSizeHash->{$sortedFonts[$commonIndex]}\n";
    }

    # large fonts
    my $numFonts = scalar(@sortedFonts);
    for (my $i = $commonIndex + 1; $i < $numFonts; $i++) {
        if (($numFonts - $i) <= 3) {
            # the suffix is 0 for the largest size, -1 and -2 for the next two
            $gFontSizeLabels->{$sortedFonts[$i]} = "largest".($i + 1 - $numFonts);
        } else {
            $gFontSizeLabels->{$sortedFonts[$i]} = "larger";
        }

        if ($isDebug) {
            print STDERR "$sortedFonts[$i] --> $gFontSizeLabels->{$sortedFonts[$i]}, freq = $gFontSizeHash->{$sortedFonts[$i]}\n";
        }
    }
    return;
}
465
-
466
# Label every spacing value "yes" or "no".
#
# Arguments:
#   $gSpaceHash   - hash ref: spacing value => frequency
#   $gSpaceLabels - hash ref filled in place: spacing value => "yes"/"no"
#
# The reference spacing starts as the most frequent value and is raised to
# the largest value among the leading run of entries whose frequency exceeds
# 80% of the top frequency. Values above the reference get "yes".
sub getSpaceLabels {
    my ($gSpaceHash, $gSpaceLabels) = @_;

    print STDERR "\n# Map space\n" if $isDebug;

    # spacing values, most frequent first
    my @byFreq = sort { $gSpaceHash->{$b} <=> $gSpaceHash->{$a} } keys %{$gSpaceHash};

    my $commonSpace = $byFreq[0];
    my $commonFreq = $gSpaceHash->{$commonSpace};

    # find similar common freq with larger spaces
    foreach my $candidate (@byFreq) {
        last unless $gSpaceHash->{$candidate}/$commonFreq > 0.8;
        $commonSpace = $candidate if $candidate > $commonSpace;
    }

    foreach my $value (@byFreq) {
        $gSpaceLabels->{$value} = ($value > $commonSpace) ? "yes" : "no";

        if ($isDebug) {
            print STDERR "$value --> $gSpaceLabels->{$value}, freq = $gSpaceHash->{$value}\n";
        }
    }
}
500
-
501
# Return the value of attribute $attr inside the attribute string $attrText,
# or "none" when the attribute is absent or has an empty value.
#
# The attribute name is matched literally (\Q..\E) and only at the start of
# the string or after whitespace, so "bold" does not match inside "semibold".
# The value may not span a double quote, so an empty value cannot swallow the
# following attribute. If the name occurs more than once, the last occurrence
# wins, because of the greedy leading ".*".
sub getAttrValue {
    my ($attrText, $attr) = @_;

    my $value = "none";
    if ($attrText =~ /^.*(?<!\S)\Q$attr\E="([^"]+)"/) {
        $value = $1;
    }

    return $value;
}
511
-
512
# If attribute $attr occurs in $attrText, add $count to the counter kept for
# its value in %$attrHash (the key is created on first use).
#
# The attribute name is matched literally and only at the start of the string
# or after whitespace, and the value may not span a double quote; see
# getAttrValue(), which uses the same pattern.
sub checkFontAttr {
    my ($attrText, $attr, $attrHash, $count) = @_;

    if ($attrText =~ /^.*(?<!\S)\Q$attr\E="([^"]+)"/) {
        $attrHash->{$1} += $count;
    }
}
521
-
522
# Extract the text lines of one paragraph and record per-line XML features.
#
# Arguments:
#   $inputText        - the raw XML lines of the paragraph, "\n"-joined,
#                       starting with its <para ...> header
#   $isTable, $isPic  - true when the paragraph is inside <table> / <dd>
#   $isFirstTableCell - scalar ref; cleared once the first table line is seen
#
# Returns ($allText, $l, $t, $r, $bottom, $isSpace): the extracted text (one
# output line per emitted <ln>), the box of the last <ln> whose l/t/r/b
# attributes were parsed, and whether the last token seen was a <space/>.
#
# Side effects: pushes onto @gPara for every non-empty <ln>; with -xmlFeature
# also fills the per-line feature arrays; with -markup appends to
# $markupOutput. Sets STDERR to :utf8.
#
# $wdText is cleared at every </wd>, so the markup line for a multi-run word
# contains only that word's own text.
sub processPara {
    my ($inputText, $isTable, $isPic, $isFirstTableCell) = @_;

    my $isSpace = 0;
    my $isSpecialSpace = 0;
    my $isBullet = 0;

    # 3 signals for end of line: forcedEOF="true" in the attributes of <ln>,
    # <nl orig="true"/>, or reaching </para> without either of them.
    my $isForcedEOF = "none";

    # xml feature
    my $align = "none";
    my ($l, $t, $r, $bottom);
    my %fontSizeHash = ();
    my %fontFaceHash = ();
    my $space = "none";

    my $lnAttr;  my $isLn = 0;   my $lnBold = "none";  my $lnItalic = "none";
    my $runAttr; my $runText = ""; my $isRun = 0; my $runBold = "none"; my $runItalic = "none";
    my $wdAttr;  my $wdText = "";  my $isWd = 0;

    # Word index in a line; at </ln> it equals the number of words in the line.
    my $wdIndex = 0;
    my $lnBoldCount = 0;
    my $lnItalicCount = 0;

    my $allText = "";
    my $text = "";    # text of the current line; moved into $allText when the line ends

    binmode(STDERR, ":utf8");

    my $isFirstLinePara = 1;
    my @lines = split(/\n/, $inputText);
    for my $i (0 .. $#lines) {
        my $line = $lines[$i];

        ## new para
        if ($line =~ /^<para (.+?)>$/) {
            my $attr = $1;
            $align = getAttrValue($attr, "alignment");
            $space = getAttrValue($attr, "spaceBefore");
        }

        ## new ln
        elsif ($line =~ /^<ln (.+)>$/) {
            $lnAttr = $1;
            $isLn = 1;

            if ($isMarkup) {
                $markupOutput .= "# Line $lnAttr\n";
            }

            if ($lnAttr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*$/) {
                ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
            }
            $isForcedEOF = getAttrValue($lnAttr, "forcedEOF");

            if ($isXmlFeature) {    # Bold & Italic
                $lnBold = getAttrValue($lnAttr, "bold");
                $lnItalic = getAttrValue($lnAttr, "italic");
            }
        }

        ## new run
        elsif ($line =~ /<run (.*)>$/) {
            $runAttr = $1;

            $isSpace = 0;
            $isRun = 1;

            if ($line =~ /^<wd (.*?)>/) {    # new wd, that consists of many runs
                $isWd = 1;
                $wdAttr = $1;
            }

            if ($isXmlFeature) {    # Bold & Italic
                $runBold = getAttrValue($runAttr, "bold");
                $runItalic = getAttrValue($runAttr, "italic");
            }
        }

        ## wd: a complete word on one line
        elsif ($line =~ /^<wd (.+)?>(.+)<\/wd>$/) {
            $wdAttr = $1;
            my $word = $2;
            $isSpace = 0;

            if ($isMarkup) {
                $markupOutput .= "$word $wdAttr";
                if ($isRun && $runAttr =~ /(bold|italic)=\"true\"/) {    # if both bold and italic, then just use one
                    $markupOutput .= " $1=\"true\"";
                }
                $markupOutput .= "\n";
            }

            if ($isXmlFeature) {
                # FontSize & FontFace
                checkFontAttr($wdAttr, "fontSize", \%fontSizeHash, 1);
                checkFontAttr($wdAttr, "fontFace", \%fontFaceHash, 1);

                # Bold & Italic: a word inherits the flag from its run or line
                my $wdBold = getAttrValue($wdAttr, "bold");
                my $wdItalic = getAttrValue($wdAttr, "italic");

                if ($wdBold eq "true" || $runBold eq "true" || $lnBold eq "true") {
                    $lnBoldCount++;
                }
                if ($wdItalic eq "true" || $runItalic eq "true" || $lnItalic eq "true") {
                    $lnItalicCount++;
                }
            }

            ## add text
            $text .= "$word";
            if ($isRun) {
                $runText .= "$word ";
            }
            $wdIndex++;
        }

        ## end wd (closing a word that was split over several runs)
        elsif ($line =~ /^<\/wd>$/) {
            $isWd = 0;

            if ($isMarkup) {
                $markupOutput .= "$wdText $wdAttr";
                if ($isRun && $runAttr =~ /(bold|italic)=\"true\"/) {    # if both bold and italic, then just use one
                    $markupOutput .= " $1=\"true\"";
                }
                $markupOutput .= "\n";

                $wdAttr = "";
            }

            # Start the next multi-run word with an empty buffer.
            $wdText = "";
        }

        ## end run
        elsif ($line =~ /^(.*)<\/run>$/) {
            my $word = $1;

            ## add text
            if ($word ne "") {
                if ($isXmlFeature) {    # Bold & Italic
                    if ($runBold eq "true" || $lnBold eq "true") {
                        $lnBoldCount++;
                    }
                    if ($runItalic eq "true" || $lnItalic eq "true") {
                        $lnItalicCount++;
                    }
                }

                # appear in the final result
                if ($isLn) { $text .= "$word"; }

                # for internal record
                if ($isRun) { $runText .= "$word "; }
                if ($isWd)  { $wdText .= "$word"; }

                $wdIndex++;
            }

            # xml feature
            if ($isXmlFeature && $runText ne "") {    # not a space, tab or new-line run
                my @words = split(/\s+/, $runText);
                my $numWords = scalar(@words);
                checkFontAttr($runAttr, "fontSize", \%fontSizeHash, $numWords);
                checkFontAttr($runAttr, "fontFace", \%fontFaceHash, $numWords);
            }

            ## reset run
            if (!$isLn) {    # <run> not enclosed within <ln>
                $wdIndex = 0;
            }
            $runText = "";
            $isRun = 0;
            $isSpecialSpace = 0;

            if ($isXmlFeature) {    # Bold & Italic
                $runBold = "none";
                $runItalic = "none";

                if (!$isLn) {    # <run> not enclosed within <ln>
                    $lnBoldCount = 0;
                    $lnItalicCount = 0;
                }
            }
        }

        ## end ln
        elsif ($line =~ /^<\/ln>$/) {
            if ((!$isAllowEmpty && $text !~ /^\s*$/)
                || ($isAllowEmpty && $text ne "")) {
                if ($isForcedEOF eq "true"    # there's a forced EOL?
                    || !$isSpecialSpace       # not an empty line with space character
                ) {
                    $text .= "\n";

                    # update allText
                    $allText .= $text;
                    $text = "";
                }

                my $numWords = $wdIndex;

                # Paragraph marker: the first line of a normal paragraph, or
                # the first line seen since the enclosing <table> opened.
                if (!$isTable) {
                    if ($isFirstLinePara) {
                        push(@gPara, "yes");
                        $isFirstLinePara = 0;
                    } else {
                        push(@gPara, "no");
                    }
                } else {
                    if ($$isFirstTableCell) {
                        push(@gPara, "yes");
                        $$isFirstTableCell = 0;
                    } else {
                        push(@gPara, "no");
                    }
                }

                if ($isXmlFeature && $numWords >= 1) {
                    # assumption: fontSize occurs either in <ln>, or within
                    # multiple <run> under <ln>, but not both
                    checkFontAttr($lnAttr, "fontSize", \%fontSizeHash, $numWords);
                    checkFontAttr($lnAttr, "fontFace", \%fontFaceHash, $numWords);
                }

                if ($isXmlFeature && !$isSpecialSpace) {
                    my $pos = ($t+$bottom)/2.0;
                    if ($pos < $gMinPos) { $gMinPos = $pos; }
                    if ($pos > $gMaxPos) { $gMaxPos = $pos; }
                    push(@gPosHash, $pos);    # pos feature
                    push(@gAlign, $align);    # alignment feature

                    push(@gPic, $isPic ? "yes" : "no");
                    push(@gTable, $isTable ? "yes" : "no");

                    if ($isPic || $isTable) {
                        ### Not assign value ###
                        push(@gFontSize, -1);        # font size feature
                        push(@gFontFace, "none");    # font face feature
                        push(@gBold, "no");          # bold feature
                        push(@gItalic, "no");        # italic feature
                        push(@gBullet, "no");        # bullet feature
                    } else {
                        updateXMLFontFeature(\%fontSizeHash, \%fontFaceHash);
                        %fontSizeHash = (); %fontFaceHash = ();

                        updateXMLFeatures($lnBoldCount, $lnItalicCount, $numWords, $isBullet, $space);
                    }
                }
            }

            ## reset ln
            $isLn = 0;
            $isForcedEOF = "none";
            $isSpecialSpace = 0;
            $wdIndex = 0;

            if ($isXmlFeature) {    # Bold & Italic
                $lnBold = "none";
                $lnItalic = "none";

                $lnBoldCount = 0;
                $lnItalicCount = 0;
            }
        }

        ## nl newline signal
        elsif ($line =~ /^<nl orig=\"true\"\/>$/) {
            if ($isLn) {
                $isSpace = 0;
            } else {
                if ($isDebug) {
                    print STDERR "#!!! Warning: found <nl orig=\"true\"\/> while not in tag <ln>: $line\n";
                }
            }
        }

        ## space
        elsif ($line =~ /^<space\/>$/) {
            # A <space/> that is the only content between an opening tag and
            # its matching closing tag is flagged as a "special" space.
            my $startTag = "";
            my $endTag = "";
            if ($i > 0 && $lines[$i-1] =~ /^<(.+?)\b.*/) {
                $startTag = $1;
            }

            if ($i < (scalar(@lines) - 1) && $lines[$i+1] =~ /^<\/(.+)>/) {
                $endTag = $1;
            }

            if ($startTag eq $endTag && $startTag ne "") {
                $isSpecialSpace = 1;
            }

            ## addText
            $text .= " ";
            $isSpace = 1;
        }

        ## tab
        elsif ($line =~ /^<tab .*\/>$/) {
            ## add Text
            $text .= "\t";
        }

        ## bullet
        elsif ($line =~ /^<bullet .*>$/) {
            $isBullet = 1;
        }
    }

    $allText .= $text;
    return ($allText, $l, $t, $r, $bottom, $isSpace);
}
857
-
858
# Record the dominant font size and font face of the line just finished.
#
# Arguments:
#   $fontSizeHash - hash ref: font size => word count within the line
#   $fontFaceHash - hash ref: font face => word count within the line
#
# Pushes the most frequent size onto @gFontSize (or -1 when none was seen)
# and the most frequent face onto @gFontFace (or "none"), and bumps the
# document-wide counters %gFontSizeHash / %gFontFaceHash for the chosen value.
sub updateXMLFontFeature {
    my ($fontSizeHash, $fontFaceHash) = @_;

    # font size feature
    my @sizesByFreq = sort { $fontSizeHash->{$b} <=> $fontSizeHash->{$a} } keys %{$fontSizeHash};
    if (@sizesByFreq) {
        my $topSize = $sizesByFreq[0];
        push(@gFontSize, $topSize);
        $gFontSizeHash{$topSize}++;
    } else {
        push(@gFontSize, -1);
    }

    # font face feature
    my @facesByFreq = sort { $fontFaceHash->{$b} <=> $fontFaceHash->{$a} } keys %{$fontFaceHash};
    if (@facesByFreq) {
        my $topFace = $facesByFreq[0];
        push(@gFontFace, $topFace);
        $gFontFaceHash{$topFace}++;
    } else {
        push(@gFontFace, "none");
    }
}
884
-
885
# Push the bold, italic and bullet features of the line just finished onto
# @gBold, @gItalic and @gBullet.
#
# Arguments:
#   $lnBoldCount, $lnItalicCount - number of bold / italic words in the line
#   $numWords                    - number of words in the line
#   $isBullet                    - true when a <bullet> tag was seen
#   $space                       - unused (the space feature is disabled)
#
# A line counts as bold (italic) when at least 0.667 of its words are. A line
# with no words is neither; this guard also avoids dividing by zero, which
# can happen for a line that holds only spaces or tabs.
sub updateXMLFeatures {
    my ($lnBoldCount, $lnItalicCount, $numWords, $isBullet, $space) = @_;

    my $hasWords = ($numWords && $numWords > 0);

    # bold feature
    my $boldRatio = $hasWords ? $lnBoldCount/$numWords : 0;
    push(@gBold, $boldRatio >= 0.667 ? "yes" : "no");

    # italic feature
    my $italicRatio = $hasWords ? $lnItalicCount/$numWords : 0;
    push(@gItalic, $italicRatio >= 0.667 ? "yes" : "no");

    # bullet feature
    push(@gBullet, $isBullet ? "yes" : "no");

    # space feature
    # push(@gSpace, $space);
}
915
-
916
- ## Find the positions of header, body, and citation
917
# Split the document into header, body and citation parts.
#
# Arguments:
#   $lines    - array ref of text lines
#   $numLines - number of lines to consider
#
# Returns ($headerLength, $bodyLength, $citationLength, $bodyStartId,
# $bodyEndId), using SectLabel::PreProcess::findCitationText first and then
# findHeaderText on what it reported as the body.
# NOTE(review): the exact meaning of the values returned by those two helpers
# is defined outside this file -- confirm against SectLabel::PreProcess.
#
# Dies (after echoing the message on STDOUT) when the three lengths do not
# add up to $numLines.
sub getStructureInfo {
    my ($lines, $numLines) = @_;

    # Citations are located first; the header search is limited to the rest.
    my ($bodyLength, $citationLength, $bodyEndId) =
        SectLabel::PreProcess::findCitationText($lines, 0, $numLines);

    my ($headerLength, $bodyStartId);
    ($headerLength, $bodyLength, $bodyStartId) =
        SectLabel::PreProcess::findHeaderText($lines, 0, $bodyLength);

    # sanity check
    my $totalLength = $headerLength + $bodyLength + $citationLength;
    unless ($numLines == $totalLength) {
        my $message = "Die in getStructureInfo(): different num lines $numLines != $totalLength\n";
        print STDOUT $message;    # to display in Web
        die $message;
    }

    return ($headerLength, $bodyLength, $citationLength, $bodyStartId, $bodyEndId);
}
935
-
936
- ## Count XML tags/values for statistics purpose
937
# Count XML tags, attributes and attribute values for statistics.
#
# Arguments:
#   $line - one line of the XML input
#   $tags - hash ref updated in place:
#           $tags->{tag}{attribute}{value} = number of occurrences
#
# Only a tag at the very start of the line is considered. Attribute values
# are stored with their surrounding double quotes. A quoted value is taken as
# a whole, so values containing spaces (e.g. fontFace="Times New Roman") are
# counted as one value instead of being split at the spaces.
sub processTagInfo {
    my ($line, $tags) = @_;

    if ($line =~ /^<(.+?)\b(.*)/) {
        my $tag = $1;
        my $attr = $2;

        # Register the tag even when it carries no attributes.
        $tags->{$tag} = {} if !$tags->{$tag};

        # Keep only what precedes the end of the opening tag.
        if ($attr =~ /^\s*(.+?)\s*\/?>/) {
            $attr = $1;
        }

        # name="quoted value" pairs, falling back to an unquoted value
        while ($attr =~ /([^\s=]+)=("[^"]*"|\S+)/g) {
            my ($attrName, $value) = ($1, $2);
            $tags->{$tag}->{$attrName}->{$value}++;
        }
    }
}
968
-
969
- ## Print tag info to file
970
# Write the collected tag statistics to $tagFile.
#
# Arguments:
#   $tags    - hash ref: tag => attribute => value => count
#   $tagFile - path of the file to create
#
# For each tag (sorted) a "# Tag = <name>" line is written, followed by one
# line per attribute (sorted): "<attr>: <value>-<count> ..." with the values
# in sorted order. Dies if the file cannot be opened.
sub printTagInfo {
    my ($tags, $tagFile) = @_;

    open(my $tagFh, ">:utf8", "$tagFile") || die"#Can't open file \"$tagFile\"\n";

    foreach my $tagName (sort { $a cmp $b } keys %{$tags}) {
        print {$tagFh} "# Tag = $tagName\n";

        foreach my $attr (sort { $a cmp $b } keys %{$tags->{$tagName}}) {
            my $countOf = $tags->{$tagName}->{$attr};
            my $row = "$attr:";
            foreach my $value (sort { $a cmp $b } keys %{$countOf}) {
                $row .= " $value-$countOf->{$value}";
            }
            print {$tagFh} "$row\n";
        }
    }

    close $tagFh;
}
989
-
990
# Validate a file path for use under taint mode.
# Only word characters, "-", "_", "/" and "." are accepted (an empty path is
# accepted too). Returns the untainted path; dies with "Bad path" otherwise.
sub untaintPath {
    my ($path) = @_;

    die "Bad path \"$path\"\n" unless $path =~ /^([-_\/\w\.]*)$/;

    # The captured group, not the original scalar, is what is untainted.
    return $1;
}
1001
-
1002
# Validate an arbitrary string for use under taint mode.
# Accepts a non-empty string made of word characters, spaces and the
# characters - @ ( ) , . /  Returns the untainted copy; dies with
# "Bad data in ..." otherwise.
sub untaint {
    my ($s) = @_;

    # List-context match: $clean stays undef when the pattern fails.
    my ($clean) = $s =~ /^([\w \-\@\(\),\.\/]+)$/;
    die "Bad data in $s" unless defined $clean;    # log this somewhere

    return $clean;
}
1011
-
1012
# Run a command line after validating it with untaint().
# Echoes the command on STDERR when -log is on. Returns what system()
# returns; dies (inside untaint) if the command contains characters that
# untaint() does not accept.
sub execute {
    my ($cmd) = @_;

    print STDERR "Executing: $cmd\n" if $isDebug;

    my $safeCmd = untaint($cmd);
    return system($safeCmd);
}
1020
-
1021
# Return a name for a temporary file: "YYYYMMDD-HHMMSS-<pid>" in local time.
# The name is only built, no file is created.
#
# The timestamp is formatted in-process with POSIX::strftime rather than by
# running the external `date` command, so the result does not depend on PATH
# or on a shell being available.
sub newTmpFile {
    require POSIX;
    my $stamp = POSIX::strftime('%Y%m%d-%H%M%S', localtime());
    return "$stamp-$$";
}