biblicit 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1025 +0,0 @@
1
- #!/usr/bin/perl -wT
2
- # Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
3
-
4
- # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
5
-
6
- require 5.0;
7
- use strict;
8
- use Getopt::Long;
9
- use HTML::Entities;
10
-
11
- # I do not know a better solution to find a lib path in -T mode.
12
- # So if you know a better solution, I'd be glad to hear.
13
- # See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
14
- use FindBin;
15
- FindBin::again(); # to get correct path in case 2 scripts in different directories use FindBin
16
- my $path;
17
- BEGIN {
18
- if ($FindBin::Bin =~ /(.*)/) {
19
- $path = $1;
20
- }
21
- }
22
- use lib "$path/../../lib";
23
- use SectLabel::PreProcess;
24
-
25
- ### USER customizable section
26
- $0 =~ /([^\/]+)$/; my $progname = $1;
27
- my $outputVersion = "1.0";
28
- ### END user customizable section
29
-
30
# Echo the copyright notice on STDERR (suppressed by the caller when -q is given).
sub License {
    my $notice = "# Copyright 2009 \251 by Luong Minh Thang\n";
    print STDERR $notice;
}
33
-
34
### HELP Sub-procedure
# Print the usage message on STDERR.
# Every option accepted by GetOptions() in this script is described here,
# including -allowEmptyLine and -log, which the usage line mentions.
sub Help {
    print STDERR "Process Omnipage XML output (concatenated results from all pages of a PDF file), and extract text lines together with other XML infos\n";
    print STDERR "usage: $progname -h\t[invokes help]\n";
    print STDERR " $progname -in xmlFile -out outFile [-xmlFeature -decode -markup -para] [-tag tagFile -allowEmptyLine -log]\n";
    print STDERR "Options:\n";
    print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
    print STDERR "\t-xmlFeature: append XML feature together with text extracted\n";
    print STDERR "\t-decode: decode HTML entities and then output, to avoid double entity encoding later\n";
    print STDERR "\t-para: marking in the output each paragraph with # Para lineId numLines\n";
    print STDERR "\t-markup: marking in the output detailed word-level info ### Page w h\\n## Para l t r b\\n# Line l t r b\\nword l t r b\n";

    print STDERR "\t-tag tagFile: count XML tags/values for statistics purpose\n";
    print STDERR "\t-allowEmptyLine: keep empty lines in the output instead of skipping them\n";
    print STDERR "\t-log: print debugging messages to STDERR\n";
}
48
### Command-line state (file-scoped: the subs below read these directly)
my $QUIET = 0;
my $HELP = 0;
my $outFile = undef;
my $inFile = undef;

my $isXmlFeature = 0;
my $isDecode = 0;

my $isMarkup = 0;
my $isParaDelimiter = 0;

my $tagFile = "";
my $isAllowEmpty = 0;
my $isDebug = 0;
$HELP = 1 unless GetOptions('in=s' => \$inFile,
                            'out=s' => \$outFile,
                            'decode' => \$isDecode,
                            'xmlFeature' => \$isXmlFeature,

                            'tag=s' => \$tagFile,
                            'allowEmptyLine' => \$isAllowEmpty,
                            'markup' => \$isMarkup,

                            'para' => \$isParaDelimiter,
                            'log' => \$isDebug,
                            'h' => \$HELP,
                            'q' => \$QUIET);

# -in and -out are mandatory; anything else falls back to the usage text.
if ($HELP || !defined $inFile || !defined $outFile) {
    Help();
    exit(0);
}

if (!$QUIET) {
    License();
}

### Untaint ###
$inFile = untaintPath($inFile);
$outFile = untaintPath($outFile);
$tagFile = untaintPath($tagFile);
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
# perlsec: a safe PATH alone is not enough under -T; these variables can also
# change how a sub-shell behaves, so drop them before anything is executed.
delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
### End untaint ###

### Mark page, para, line, word
my %gPageHash = ();

### Mark paragraph
my @gPara = ();   # "yes"/"no" per line: does the line start a paragraph?

### XML features ###
# locFeature
my @gPosHash = (); my $gMinPos = 1000000; my $gMaxPos = 0;
my @gAlign = (); # alignFeature
my @gBold = (); # bold feature
my @gItalic = (); # italic feature

# font size feature
my %gFontSizeHash = (); my @gFontSize = ();
# font face feature
my %gFontFaceHash = (); my @gFontFace = ();

my @gPic = (); # pic feature
my @gTable = (); # table feature
my @gBullet = (); # bullet feature

# space feature
#my %gSpaceHash = (); my @gSpace = ();
### End XML features ###

my %tags = ();

if ($isDebug) {
    print STDERR "\n# Processing file $inFile & output to $outFile\n";
}

# processFile() and processPara() append to $markupOutput when -markup is set.
my $markupOutput = "";
my $allText = processFile($inFile, $outFile, \%tags);

# Find header part ($bodyStartId is read by getDifferentialFeatures)
my @lines = split(/\n/, $allText);
my $numLines = scalar(@lines);
my ($headerLength, $bodyLength, $bodyStartId) =
    SectLabel::PreProcess::findHeaderText(\@lines, 0, $numLines);

# Output
if ($isMarkup) {
    open(my $markupFh, ">:utf8", "$outFile") || die"#Can't open file \"$outFile\"\n";
    print {$markupFh} "$markupOutput";
    # Buffered write errors only surface at close time, so check it.
    close($markupFh) || die "#Can't close file \"$outFile\": $!\n";
} else {
    output(\@lines, $outFile);
}

if ($tagFile ne "") {
    printTagInfo(\%tags, $tagFile);
}
145
-
146
# Read the Omnipage XML file line by line and return the extracted text.
#
# Arguments: the input file name first and the tag-statistics hash ref last.
# The script calls processFile($inFile, $outFile, \%tags); the previous
# two-element unpacking bound the hash-ref slot to the output file name,
# which made -tag die under "strict refs". Taking the first and the last
# argument accepts both the two- and the three-argument call forms.
#
# Side effects: appends to the file-level $markupOutput when -markup is set,
# and fills the tag hash through processTagInfo() when -tag is set.
# Each complete <para>...</para> is handed to processPara().
sub processFile {
    my $inFile = $_[0];
    my $tags   = @_ > 1 ? $_[-1] : undef;

    if (!(-e $inFile)) { die "# $progname crash\t\tFile \"$inFile\" doesn't exist"; }
    open(my $in, "<:utf8", $inFile) || die "# $progname crash\t\tCan't open \"$inFile\"";

    my $isPara = 0;             # inside <para>...</para>
    my $isTable = 0;            # inside <table>...</table>
    my $isPic = 0;              # inside <dd>...</dd>
    my $isFirstTableCell = 0;   # cleared by processPara() via the reference
    my $allText = "";
    my $text = "";              # raw XML lines of the paragraph being collected

    while (my $line = <$in>) {
        next if $line =~ /^\#/;     # skip comments
        chomp($line);
        $line =~ s/\cM$//;          # remove a trailing ^M if any

        if ($tagFile ne "") {
            processTagInfo($line, $tags);
        }

        if ($isMarkup && $line =~ /<theoreticalPage (.*)\/>/) {
            $markupOutput .= "### Page $1\n";
        }

        ### pic ###
        if ($line =~ /^<dd (.*)>$/) {
            $isPic = 1;
            if ($isMarkup) {
                $markupOutput .= "### Figure $1\n";
            }
        }
        elsif ($line =~ /^<\/dd>$/) {
            $isPic = 0;
        }

        ### Table ###
        elsif ($line =~ /^<table (.*)>$/) {
            $isTable = 1;
            $isFirstTableCell = 1;
            if ($isMarkup) {
                $markupOutput .= "### Table $1\n";
            }
        }
        elsif ($line =~ /^<\/table>$/) {
            $isTable = 0;
        }

        ### Paragraph ###
        # Note: table processing must be tested before paragraphs; the order matters.
        elsif ($line =~ /^<para (.*)>$/) {
            $text .= $line."\n";    # processPara() needs the <para> header
            $isPara = 1;

            if ($isMarkup) {
                $markupOutput .= "## Para $1\n";
            }
        }
        elsif ($line =~ /^<\/para>$/) {
            # Only the text is used here; the remaining return values are ignored.
            my ($paraText) = processPara($text, $isTable, $isPic, \$isFirstTableCell);
            $allText .= $paraText;

            $isPara = 0;
            $text = "";
        }
        elsif ($isPara) {
            $text .= $line."\n";
        }
    }
    close($in);

    return $allText;
}
230
-
231
# Write the extracted lines to $outFile, one output line per input line.
#   $lines   - array ref of text lines (a trailing ^M is stripped in place)
#   $outFile - path of the file to create
# Lines are buffered per paragraph (per the file-level @gPara) and flushed
# when the next paragraph starts. With -para each flush is preceded by a
# "# Para lineId numLines" header; otherwise -decode is honoured.
# With -xmlFeature every line gets a " |XML| ..." feature suffix.
sub output {
    my ($lines, $outFile) = @_;

    open(my $out, ">:utf8", "$outFile") || die"#Can't open file \"$outFile\"\n";

    # xml feature label
    my %gFontSizeLabels = ();
    getFontSizeLabels(\%gFontSizeHash, \%gFontSizeLabels) if $isXmlFeature;

    my $buffer = "";        # pending text of the current paragraph
    my $paraLineId = -1;
    my $paraLineCount = 0;

    # Emit the buffered paragraph, if any, and clear the buffer.
    my $flushPara = sub {
        return if $buffer eq "";
        if ($isParaDelimiter) {
            print {$out} "# Para $paraLineId $paraLineCount\n$buffer";
            $paraLineCount = 0;
        } else {
            $buffer = decode_entities($buffer) if $isDecode;
            print {$out} $buffer;
        }
        $buffer = "";
    };

    for my $id (0 .. $#{$lines}) {
        $lines->[$id] =~ s/\cM$//;  # remove ^M character at the end of each line if any
        my $line = $lines->[$id];

        if ($line =~ /^\s*$/) {     # empty lines
            next if !$isAllowEmpty;
            print STDERR "#! Line $id empty!\n" if $isDebug;
        }

        if ($gPara[$id] eq "yes") { # a new paragraph starts here
            $flushPara->();
            $paraLineId = $id;
        }

        $buffer .= $line;
        $paraLineCount++;

        if (!$isXmlFeature) {
            $buffer .= "\n";
            next;
        }

        ## Output XML features ###
        # loc feature: vertical position bucketed over the observed range
        my $locFeature;
        if ($gPosHash[$id] != -1) {
            $locFeature = "xmlLoc_".int(($gPosHash[$id] - $gMinPos)*8.0/($gMaxPos - $gMinPos + 1));
        }

        # align feature (computed as before, currently not part of the emitted line)
        my $alignFeature = "xmlAlign_".$gAlign[$id];

        # fontSize feature
        my $fontSizeFeature = ($gFontSize[$id] == -1)
            ? "xmlFontSize_none"
            : "xmlFontSize_".$gFontSizeLabels{$gFontSize[$id]};

        my $boldFeature   = "xmlBold_".$gBold[$id];
        my $italicFeature = "xmlItalic_".$gItalic[$id];
        my $picFeature    = "xmlPic_".$gPic[$id];
        my $tableFeature  = "xmlTable_".$gTable[$id];
        my $bulletFeature = "xmlBullet_".$gBullet[$id];

        ## Differential features: only the last two are emitted
        my ($fontSFBIADiff, $paraDiff) = (getDifferentialFeatures($id))[5, 6];

        $buffer .= " |XML| $locFeature $boldFeature $italicFeature $fontSizeFeature $picFeature $tableFeature $bulletFeature $fontSFBIADiff $paraDiff\n";
    }

    $flushPara->();     # last paragraph
    close($out);
}
342
-
343
# Build the "bi_xml*" features that compare line $id with line $id-1.
# Reads the file-level per-line arrays (@gAlign, @gFontFace, @gFontSize,
# @gBold, @gItalic, @gPara) and $bodyStartId.
# Returns ($alignDiff, $fontSizeDiff, $fontFaceDiff, $fontSFDiff,
#          $fontSFBIDiff, $fontSFBIADiff, $paraDiff).
sub getDifferentialFeatures {
    my ($id) = @_;

    # The very first line has no predecessor, so nothing can "continue".
    my $hasPrev = ($id != 0);
    my $prev    = $id - 1;

    my $sameAlign = $hasPrev && ($gAlign[$id] eq $gAlign[$prev]);
    my $sameFace  = $hasPrev && ($gFontFace[$id] eq $gFontFace[$prev]);
    my $sameSize  = $hasPrev && ($gFontSize[$id] == $gFontSize[$prev]);
    my $sameSF    = $sameSize && $sameFace;
    my $sameSFBI  = $sameSF
        && ($gBold[$id] eq $gBold[$prev])
        && ($gItalic[$id] eq $gItalic[$prev]);
    my $sameSFBIA = $sameSFBI && $sameAlign;

    # alignChange feature: a change is reported as the new alignment value
    my $alignDiff = "bi_xmlA_" . ($sameAlign ? "continue" : $gAlign[$id]);

    # font change features, from the weakest to the strictest comparison
    my $fontFaceDiff  = "bi_xmlF_"     . ($sameFace  ? "continue" : "new");
    my $fontSizeDiff  = "bi_xmlS_"     . ($sameSize  ? "continue" : "new");
    my $fontSFDiff    = "bi_xmlSF_"    . ($sameSF    ? "continue" : "new");
    my $fontSFBIDiff  = "bi_xmlSFBI_"  . ($sameSFBI  ? "continue" : "new");
    my $fontSFBIADiff = "bi_xmlSFBIA_" . ($sameSFBIA ? "continue" : "new");

    # para change feature: header lines are each their own paragraph
    my $paraState = ($id < $bodyStartId)   ? "header"
                  : ($gPara[$id] eq "yes") ? "new"
                  :                          "continue";
    my $paraDiff = "bi_xmlPara_" . $paraState;

    return ($alignDiff, $fontSizeDiff, $fontFaceDiff, $fontSFDiff, $fontSFBIDiff, $fontSFBIADiff, $paraDiff);
}
420
-
421
# Map every font size to a coarse label relative to the most frequent size.
#   $gFontSizeHash   - hash ref: font size => number of lines using it
#   $gFontSizeLabels - hash ref filled in place: font size => label
# Sizes below the most frequent one are "smaller", that size is "common",
# larger ones are "larger", except that the three largest sizes get
# "largest-2", "largest-1" and "largest0" (the label set is left unchanged
# because it is emitted as a feature value).
# An empty frequency hash leaves the label hash untouched, and frequency
# ties are resolved towards the smaller size so the labels are reproducible.
sub getFontSizeLabels {
    my ($gFontSizeHash, $gFontSizeLabels) = @_;

    if ($isDebug) { print STDERR "# Map fonts\n"; }

    my @sortedFonts = sort { $a <=> $b } keys %{$gFontSizeHash};   # ascending sizes
    return if !@sortedFonts;    # nothing to label

    # most frequent size first; the numeric tie-break makes the choice deterministic
    my ($commonSize) = sort {
        $gFontSizeHash->{$b} <=> $gFontSizeHash->{$a} || $a <=> $b
    } @sortedFonts;

    # index of the common font size among the ascending sizes
    my $commonIndex = 0;
    foreach my $size (@sortedFonts) {
        last if $commonSize == $size;
        $commonIndex++;
    }

    # small fonts
    for my $i (0 .. $commonIndex - 1) {
        $gFontSizeLabels->{$sortedFonts[$i]} = "smaller";

        if ($isDebug) {
            print STDERR "$sortedFonts[$i] --> $gFontSizeLabels->{$sortedFonts[$i]}, freq = $gFontSizeHash->{$sortedFonts[$i]}\n";
        }
    }

    # common fonts
    $gFontSizeLabels->{$commonSize} = "common";
    if ($isDebug) {
        print STDERR "$sortedFonts[$commonIndex] --> $gFontSizeLabels->{$sortedFonts[$commonIndex]}, freq = $gFontSizeHash->{$sortedFonts[$commonIndex]}\n";
    }

    # large fonts
    my $numFonts = scalar(@sortedFonts);
    for my $i ($commonIndex + 1 .. $numFonts - 1) {
        if (($numFonts - $i) <= 3) {
            # one of the three largest sizes: suffix is 0 for the largest, then -1, -2
            $gFontSizeLabels->{$sortedFonts[$i]} = "largest".($i + 1 - $numFonts);
        } else {
            $gFontSizeLabels->{$sortedFonts[$i]} = "larger";
        }

        if ($isDebug) {
            print STDERR "$sortedFonts[$i] --> $gFontSizeLabels->{$sortedFonts[$i]}, freq = $gFontSizeHash->{$sortedFonts[$i]}\n";
        }
    }

    return;
}
465
-
466
# Label every spacing value "yes" (larger than the common spacing) or "no".
#   $gSpaceHash   - hash ref: spacing value => frequency
#   $gSpaceLabels - hash ref filled in place: spacing value => "yes"/"no"
# The common spacing starts as the most frequent value and is raised to the
# largest value whose frequency is still above 80% of that top frequency.
sub getSpaceLabels {
    my ($gSpaceHash, $gSpaceLabels) = @_;

    print STDERR "\n# Map space\n" if $isDebug;

    # spacing values ordered by decreasing frequency
    my @byFreq = sort { $gSpaceHash->{$b} <=> $gSpaceHash->{$a} } keys %{$gSpaceHash};

    my $commonSpace = $byFreq[0];
    my $commonFreq  = $gSpaceHash->{$commonSpace};

    # find similar common freq with larger spaces
    foreach my $candidate (@byFreq) {
        last unless $gSpaceHash->{$candidate} / $commonFreq > 0.8;
        $commonSpace = $candidate if $candidate > $commonSpace;
    }

    foreach my $spaceValue (@byFreq) {
        $gSpaceLabels->{$spaceValue} = ($spaceValue > $commonSpace) ? "yes" : "no";

        if ($isDebug) {
            print STDERR "$spaceValue --> $gSpaceLabels->{$spaceValue}, freq = $gSpaceHash->{$spaceValue}\n";
        }
    }
}
500
-
501
# Return the value of attribute $attr inside the attribute text $attrText,
# or "none" when the attribute is absent, empty, or $attrText is undefined.
# The name is matched literally (\Q..\E) and only as a whole attribute name,
# so asking for "bold" no longer matches inside a longer name. The value is
# everything up to the closing quote, so an empty value such as bold=""
# cannot swallow the following attribute. As before, the last occurrence
# wins when a name appears more than once (greedy leading .*).
sub getAttrValue {
    my ($attrText, $attr) = @_;

    return "none" if !defined $attrText;

    if ($attrText =~ /^.*(?<![\w:.\-])\Q$attr\E="([^"]+)"/) {
        return $1;
    }

    return "none";
}
511
-
512
# Add $count to $attrHash->{value} when attribute $attr occurs in $attrText.
#   $attrText - attribute text of an XML element (may be undef: no-op)
#   $attr     - attribute name, matched literally and as a whole name
#   $attrHash - hash ref of value => accumulated count, updated in place
#   $count    - number of words the value applies to
# An empty value (attr="") is ignored instead of being mis-parsed into the
# text of the following attribute.
sub checkFontAttr {
    my ($attrText, $attr, $attrHash, $count) = @_;

    return if !defined $attrText;

    if ($attrText =~ /^.*(?<![\w:.\-])\Q$attr\E="([^"]+)"/) {
        my $attrValue = $1;

        # a missing or zero entry simply starts at $count
        $attrHash->{$attrValue} = $attrHash->{$attrValue} ? ($attrHash->{$attrValue} + $count) : $count;
    }

    return;
}
521
-
522
# Turn the XML lines of one <para>...</para> into plain text lines.
#   $inputText        - the paragraph's XML, one element per line, starting
#                       with the <para ...> header
#   $isTable, $isPic  - true when the paragraph sits inside <table> / <dd>
#   $isFirstTableCell - scalar ref; cleared once the first table line is seen
# Returns ($allText, $l, $t, $r, $bottom, $isSpace): the extracted text, the
# coordinates of the last <ln> that carried them, and the final space state.
# Side effects: pushes one entry per emitted line onto the file-level
# feature arrays (@gPara, @gPosHash, @gAlign, @gPic, @gTable, @gFontSize,
# @gFontFace, @gBold, @gItalic, @gBullet), updates $gMinPos/$gMaxPos and
# appends to $markupOutput when -markup is set.
# Fix over the previous version: $wdText is cleared at </wd>, so a word made
# of several runs no longer carries the text of earlier such words into its
# markup record. Write-only locals were dropped.
sub processPara {
    my ($inputText, $isTable, $isPic, $isFirstTableCell) = @_;

    my $isSpace = 0;
    my $isSpecialSpace = 0;     # a <space/> that is the only child of its element
    my $isBullet = 0;

    # 3 signals for end of line: forcedEOF="true" on <ln>, <nl orig="true"/>,
    # or reaching </para> without either of them while $isSpace is 0
    my $isForcedEOF = "none";

    # xml feature
    my $align = "none";
    my ($l, $t, $r, $bottom);
    my %fontSizeHash = ();
    my %fontFaceHash = ();
    my $space = "none";

    my $lnAttr; my $isLn = 0; my $lnBold = "none"; my $lnItalic = "none";
    my $runAttr; my $runText = ""; my $isRun = 0; my $runBold = "none"; my $runItalic = "none";
    my $wdAttr; my $wdText = ""; my $isWd = 0;

    my $wdIndex = 0;            # word index in a line; at </ln> it is the word count
    my $lnBoldCount = 0;
    my $lnItalicCount = 0;

    my $allText = "";
    my $text = "";              # text of the line being built; moved to $allText at </ln>

    binmode(STDERR, ":utf8");

    my $isFirstLinePara = 1;
    my @lines = split(/\n/, $inputText);
    for (my $i = 0; $i < scalar(@lines); $i++) {
        my $line = $lines[$i];

        ## new para
        if ($line =~ /^<para (.+?)>$/) {
            my $attr = $1;
            $align = getAttrValue($attr, "alignment");
            $space = getAttrValue($attr, "spaceBefore");
        }

        ## new ln
        elsif ($line =~ /^<ln (.+)>$/) {
            $lnAttr = $1;
            $isLn = 1;

            if ($isMarkup) {
                $markupOutput .= "# Line $lnAttr\n";
            }

            if ($lnAttr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*$/) {
                ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
            }
            $isForcedEOF = getAttrValue($lnAttr, "forcedEOF");

            if ($isXmlFeature) { # Bold & Italic
                $lnBold = getAttrValue($lnAttr, "bold");
                $lnItalic = getAttrValue($lnAttr, "italic");
            }
        }

        ## new run
        elsif ($line =~ /<run (.*)>$/) {
            $runAttr = $1;

            $isSpace = 0;
            $isRun = 1;

            if ($line =~ /^<wd (.*?)>/) { # new wd, that consists of many runs
                $isWd = 1;
                $wdAttr = $1;
            }

            if ($isXmlFeature) { # Bold & Italic
                $runBold = getAttrValue($runAttr, "bold");
                $runItalic = getAttrValue($runAttr, "italic");
            }
        }

        ## wd: a complete word on a single line
        elsif ($line =~ /^<wd (.+)?>(.+)<\/wd>$/) {
            $wdAttr = $1;
            my $word = $2;
            $isSpace = 0;

            if ($isMarkup) {
                $markupOutput .= "$word $wdAttr";
                if ($isRun && $runAttr =~ /(bold|italic)=\"true\"/) { # if both bold and italic, then just use one
                    $markupOutput .= " $1=\"true\"";
                }
                $markupOutput .= "\n";
            }

            if ($isXmlFeature) {
                # FontSize & FontFace
                checkFontAttr($wdAttr, "fontSize", \%fontSizeHash, 1);
                checkFontAttr($wdAttr, "fontFace", \%fontFaceHash, 1);

                # Bold & Italic: a word inherits the style of its run and line
                my $wdBold = getAttrValue($wdAttr, "bold");
                my $wdItalic = getAttrValue($wdAttr, "italic");

                if ($wdBold eq "true" || $runBold eq "true" || $lnBold eq "true") {
                    $lnBoldCount++;
                }

                if ($wdItalic eq "true" || $runItalic eq "true" || $lnItalic eq "true") {
                    $lnItalicCount++;
                }
            }

            ## add text
            $text .= "$word";

            if ($isRun) {
                $runText .= "$word ";
            }
            $wdIndex++;
        }

        ## end wd (closes a word that was split over several runs)
        elsif ($line =~ /^<\/wd>$/) {
            $isWd = 0;

            if ($isMarkup) {
                $markupOutput .= "$wdText $wdAttr";
                if ($isRun && $runAttr =~ /(bold|italic)=\"true\"/) { # if both bold and italic, then just use one
                    $markupOutput .= " $1=\"true\"";
                }
                $markupOutput .= "\n";

                $wdAttr = "";
            }

            # start the next multi-run word with an empty buffer
            $wdText = "";
        }

        ## end run
        elsif ($line =~ /^(.*)<\/run>$/) {
            my $word = $1;

            ## add text
            if ($word ne "") {
                if ($isXmlFeature) { # Bold & Italic
                    if ($runBold eq "true" || $lnBold eq "true") {
                        $lnBoldCount++;
                    }

                    if ($runItalic eq "true" || $lnItalic eq "true") {
                        $lnItalicCount++;
                    }
                }

                # appear in the final result
                if ($isLn) { $text .= "$word"; }

                # for internal record
                if ($isRun) { $runText .= "$word "; }
                if ($isWd) { $wdText .= "$word"; }

                $wdIndex++;
            }

            # xml feature
            if ($isXmlFeature && $runText ne "") { # not a space, tab or new-line run
                my @words = split(/\s+/, $runText);
                my $numWords = scalar(@words);
                checkFontAttr($runAttr, "fontSize", \%fontSizeHash, $numWords);
                checkFontAttr($runAttr, "fontFace", \%fontFaceHash, $numWords);
            }

            ## reset run
            if (!$isLn) { # <run> not enclosed within <ln>
                $wdIndex = 0;
            }
            $runText = "";
            $isRun = 0;
            $isSpecialSpace = 0;

            if ($isXmlFeature) { # Bold & Italic
                $runBold = "none";
                $runItalic = "none";

                if (!$isLn) { # <run> not enclosed within <ln>
                    $lnBoldCount = 0;
                    $lnItalicCount = 0;
                }
            }
        }

        ## end ln
        elsif ($line =~ /^<\/ln>$/) {
            if ((!$isAllowEmpty && $text !~ /^\s*$/)
                || ($isAllowEmpty && $text ne "")) {
                if ($isForcedEOF eq "true" ||   # there's a forced EOL?
                    !$isSpecialSpace            # not an empty line with a space character
                    ) {
                    $text .= "\n";

                    # update allText
                    $allText .= $text;
                    $text = "";
                }

                my $numWords = $wdIndex;

                # NOTE(review): @gPara gets an entry even when the text above
                # was not flushed and no XML features are pushed below
                # ($isSpecialSpace without forcedEOF) - confirm this is intended.
                if (!$isTable) {
                    if ($isFirstLinePara) {
                        push(@gPara, "yes");
                        $isFirstLinePara = 0;
                    } else {
                        push(@gPara, "no");
                    }
                } else {
                    if ($$isFirstTableCell) {
                        push(@gPara, "yes");
                        $$isFirstTableCell = 0;
                    } else {
                        push(@gPara, "no");
                    }
                }

                if ($isXmlFeature && $numWords >= 1) {
                    # assumption: fontSize occurs either in <ln> or within the <run>s under <ln>, but not both
                    checkFontAttr($lnAttr, "fontSize", \%fontSizeHash, $numWords);
                    checkFontAttr($lnAttr, "fontFace", \%fontFaceHash, $numWords);
                }

                if ($isXmlFeature && !$isSpecialSpace) {
                    my $pos = ($t + $bottom)/2.0;   # vertical centre of the line
                    if ($pos < $gMinPos) { $gMinPos = $pos; }
                    if ($pos > $gMaxPos) { $gMaxPos = $pos; }
                    push(@gPosHash, $pos);  # pos feature
                    push(@gAlign, $align);  # alignment feature

                    push(@gPic, $isPic ? "yes" : "no");
                    push(@gTable, $isTable ? "yes" : "no");

                    if ($isPic || $isTable) {
                        ### Not assign value ###
                        push(@gFontSize, -1);       # font size feature
                        push(@gFontFace, "none");   # font face feature
                        push(@gBold, "no");         # bold feature
                        push(@gItalic, "no");       # italic feature
                        push(@gBullet, "no");       # bullet feature
                    } else {
                        updateXMLFontFeature(\%fontSizeHash, \%fontFaceHash);
                        %fontSizeHash = (); %fontFaceHash = ();

                        updateXMLFeatures($lnBoldCount, $lnItalicCount, $numWords, $isBullet, $space);
                    }
                }
            }

            ## reset ln
            $isLn = 0;
            $isForcedEOF = "none";
            $isSpecialSpace = 0;
            $wdIndex = 0;

            if ($isXmlFeature) { # Bold & Italic
                $lnBold = "none";
                $lnItalic = "none";

                $lnBoldCount = 0;
                $lnItalicCount = 0;
            }
        }

        ## nl newline signal
        elsif ($line =~ /^<nl orig=\"true\"\/>$/) {
            if ($isLn) {
                $isSpace = 0;
            } else {
                if ($isDebug) {
                    print STDERR "#!!! Warning: found <nl orig=\"true\"\/> while not in tag <ln>: $line\n";
                }
            }
        }

        ## space
        elsif ($line =~ /^<space\/>$/) {
            # A space whose previous line opens, and next line closes, the same
            # element is the only content of that element.
            my $startTag = "";
            my $endTag = "";
            if ($i > 0 && $lines[$i-1] =~ /^<(.+?)\b.*/) {
                $startTag = $1;
            }

            if ($i < (scalar(@lines) - 1) && $lines[$i+1] =~ /^<\/(.+)>/) {
                $endTag = $1;
            }

            if ($startTag eq $endTag && $startTag ne "") {
                $isSpecialSpace = 1;
            }

            ## addText
            $text .= " ";
            $isSpace = 1;
        }

        ## tab
        elsif ($line =~ /^<tab .*\/>$/) {
            ## add Text
            $text .= "\t";
        }

        ## bullet
        elsif ($line =~ /^<bullet .*>$/) {
            $isBullet = 1;
        }
    }

    # whatever was not terminated by </ln> still belongs to the paragraph
    $allText .= $text;
    return ($allText, $l, $t, $r, $bottom, $isSpace);
}
857
-
858
# Record the dominant font size and font face of the line just finished.
#   $fontSizeHash - hash ref: font size => word count within the line
#   $fontFaceHash - hash ref: font face => word count within the line
# Pushes the most frequent key of each hash onto @gFontSize / @gFontFace
# (-1 / "none" when the hash is empty) and bumps the file-level per-document
# counters %gFontSizeHash / %gFontFaceHash for the chosen value.
sub updateXMLFontFeature {
    my ($fontSizeHash, $fontFaceHash) = @_;

    # font size feature
    if (%{$fontSizeHash}) {
        my ($topSize) = sort { $fontSizeHash->{$b} <=> $fontSizeHash->{$a} } keys %{$fontSizeHash};
        push(@gFontSize, $topSize);
        $gFontSizeHash{$topSize}++;
    } else {
        push(@gFontSize, -1);
    }

    # font face feature
    if (%{$fontFaceHash}) {
        my ($topFace) = sort { $fontFaceHash->{$b} <=> $fontFaceHash->{$a} } keys %{$fontFaceHash};
        push(@gFontFace, $topFace);
        $gFontFaceHash{$topFace}++;
    } else {
        push(@gFontFace, "none");
    }
}
884
-
885
# Push the bold, italic and bullet features of the line just finished onto
# the file-level arrays @gBold, @gItalic and @gBullet.
#   $lnBoldCount, $lnItalicCount - number of bold / italic words in the line
#   $numWords                    - number of words in the line
#   $isBullet                    - true when a <bullet> was seen
#   $space                       - accepted for the disabled space feature; unused
# A line counts as bold (italic) when at least 0.667 of its words are.
# A line without words (for example whitespace only, kept because of
# -allowEmptyLine) is reported as "no" instead of dividing by zero.
sub updateXMLFeatures {
    my ($lnBoldCount, $lnItalicCount, $numWords, $isBullet, $space) = @_;

    my $hasWords = (defined $numWords && $numWords > 0);

    # bold feature
    my $boldFeature = ($hasWords && $lnBoldCount/$numWords >= 0.667) ? "yes" : "no";
    push(@gBold, $boldFeature);

    # italic feature
    my $italicFeature = ($hasWords && $lnItalicCount/$numWords >= 0.667) ? "yes" : "no";
    push(@gItalic, $italicFeature);

    # bullet feature
    push(@gBullet, $isBullet ? "yes" : "no");

    # space feature
    # push(@gSpace, $space);
}
915
-
916
## Locate the header, body and citation parts of the document.
#   $lines    - array ref of text lines
#   $numLines - number of lines to consider
# The citation part is found first, then the header is searched within what
# remains. Dies (after echoing the message on STDOUT) when the three lengths
# do not add up to $numLines.
# Returns ($headerLength, $bodyLength, $citationLength, $bodyStartId, $bodyEndId).
sub getStructureInfo {
    my ($lines, $numLines) = @_;

    my ($bodyLength, $citationLength, $bodyEndId) =
        SectLabel::PreProcess::findCitationText($lines, 0, $numLines);

    # the header search narrows $bodyLength further
    (my $headerLength, $bodyLength, my $bodyStartId) =
        SectLabel::PreProcess::findHeaderText($lines, 0, $bodyLength);

    # sanity check
    my $totalLength = $headerLength + $bodyLength + $citationLength;
    unless ($numLines == $totalLength) {
        my $complaint = "Die in getStructureInfo(): different num lines $numLines != $totalLength\n";
        print STDOUT $complaint;    # to display in Web
        die $complaint;
    }

    return ($headerLength, $bodyLength, $citationLength, $bodyStartId, $bodyEndId);
}
935
-
936
## Count XML tags/values for statistics purpose
#   $line - one line of the XML input
#   $tags - hash ref updated in place: tag => attribute => value => count
# Only lines starting with "<" are considered. The text after the tag name
# is split on whitespace and every name=value token is counted (the value
# keeps its quotes).
sub processTagInfo {
    my ($line, $tags) = @_;

    return unless $line =~ /^<(.+?)\b(.*)/;
    my ($tag, $attr) = ($1, $2);

    # make sure the tag is listed even when it carries no attributes
    $tags->{$tag} = undef unless $tags->{$tag};

    # keep only what precedes the end of the opening tag
    if ($attr =~ /^\s*(.+?)\s*\/?>/) {
        $attr = $1;
    }

    foreach my $token (split(/\s+/, $attr)) {
        next unless $token =~ /^(.+)=(.+)$/;
        my ($attrName, $value) = ($1, $2);
        $tags->{$tag}->{$attrName}->{$value}++;
    }
}
968
-
969
## Print tag info to file
#   $tags    - hash ref as built by processTagInfo: tag => attr => value => count
#   $tagFile - path of the file to create (UTF-8)
# Tags, attributes and values are each written in string-sorted order: a
# "# Tag = NAME" line per tag, then one "attr: value-count ..." line per attribute.
sub printTagInfo {
    my ($tags, $tagFile) = @_;

    open(my $tagFh, ">:utf8", "$tagFile") || die"#Can't open file \"$tagFile\"\n";

    foreach my $tag (sort { $a cmp $b } keys %{$tags}) {
        my $attrsOf = $tags->{$tag};
        my @attrNames = sort { $a cmp $b } keys %{$tags->{$tag}};
        print {$tagFh} "# Tag = $tag\n";

        foreach my $attr (@attrNames) {
            my $countOf = $tags->{$tag}->{$attr};
            my @pairs = map { " $_-$countOf->{$_}" } sort { $a cmp $b } keys %{$tags->{$tag}->{$attr}};
            print {$tagFh} "$attr:", @pairs, "\n";
        }
    }

    close($tagFh);
}
989
-
990
# Untaint a file path for use under -T.
# Accepts only word characters, '-', '_', '/' and '.' (the empty string is
# allowed) and returns the captured, untainted copy; dies with
# 'Bad path "..."' otherwise.
sub untaintPath {
    my ($path) = @_;

    # list assignment in scalar context counts the captures: 0 means no match
    my ($clean) = $path =~ /^([-_\/\w\.]*)$/
        or die "Bad path \"$path\"\n";

    return $clean;
}
1001
-
1002
# Untaint a general string (used for shell commands).
# Accepts word characters, space and the characters - @ ( ) , . / only, and
# needs at least one character; returns the captured, untainted copy or dies
# with "Bad data in ...".
sub untaint {
    my ($s) = @_;

    my ($clean) = $s =~ /^([\w \-\@\(\),\.\/]+)$/
        or die "Bad data in $s"; # log this somewhere

    return $clean;  # now untainted
}
1011
-
1012
# Run a shell command after untainting it; with -log the command is echoed
# on STDERR first. Returns whatever system() returns.
sub execute {
    my ($cmd) = @_;

    print STDERR "Executing: $cmd\n" if $isDebug;

    my $safeCmd = untaint($cmd);    # dies on characters outside the whitelist
    return system($safeCmd);
}
1020
-
1021
# Build a temporary file name of the form YYYYMMDD-HHMMSS-PID from the local
# time and the current process id.
# The name is formatted in-process with POSIX::strftime instead of shelling
# out to `date`, so it does not depend on the sub-shell environment being
# acceptable to taint mode.
sub newTmpFile {
    require POSIX;

    my $stamp = POSIX::strftime('%Y%m%d-%H%M%S', localtime());
    return "$stamp-$$";
}