biblicit 2.0.3 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +0 -2
- data/biblicit.gemspec +1 -1
- data/parscit/bin/citeExtract.pl +9 -161
- data/parscit/bin/sectExtract.pl +0 -14
- data/parscit/lib/ParsCit/Controller.pm +0 -59
- data/parscit/lib/ParsCit/PreProcess.pm +0 -4
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1 -7
- metadata +4 -24
- data/parscit/bin/sectLabel/processOmniXML.pl +0 -1427
- data/parscit/bin/sectLabel/processOmniXML_new.pl +0 -1025
- data/parscit/bin/sectLabel/processOmniXMLv2.pl +0 -1529
- data/parscit/bin/sectLabel/processOmniXMLv3.pl +0 -964
- data/parscit/bin/sectLabel/simplifyOmniXML.pl +0 -382
- data/parscit/bin/xml2train.pl +0 -193
- data/parscit/lib/Omni/Config.pm +0 -93
- data/parscit/lib/Omni/Omnicell.pm +0 -263
- data/parscit/lib/Omni/Omnicol.pm +0 -292
- data/parscit/lib/Omni/Omnidd.pm +0 -328
- data/parscit/lib/Omni/Omnidoc.pm +0 -153
- data/parscit/lib/Omni/Omniframe.pm +0 -223
- data/parscit/lib/Omni/Omniline.pm +0 -423
- data/parscit/lib/Omni/Omnipage.pm +0 -282
- data/parscit/lib/Omni/Omnipara.pm +0 -232
- data/parscit/lib/Omni/Omnirun.pm +0 -303
- data/parscit/lib/Omni/Omnitable.pm +0 -336
- data/parscit/lib/Omni/Omniword.pm +0 -162
- data/parscit/lib/Omni/Traversal.pm +0 -313
- data/parscit/lib/SectLabel/AAMatching.pm +0 -1949
|
@@ -1,1025 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/perl -wT
|
|
2
|
-
# Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
|
|
3
|
-
|
|
4
|
-
# Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
|
|
5
|
-
|
|
6
|
-
require 5.0;
|
|
7
|
-
use strict;
|
|
8
|
-
use Getopt::Long;
|
|
9
|
-
use HTML::Entities;
|
|
10
|
-
|
|
11
|
-
# I do not know a better solution to find a lib path in -T mode.
|
|
12
|
-
# So if you know a better solution, I'd be glad to hear.
|
|
13
|
-
# See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
|
|
14
|
-
use FindBin;
|
|
15
|
-
FindBin::again(); # to get correct path in case 2 scripts in different directories use FindBin
|
|
16
|
-
my $path;
|
|
17
|
-
BEGIN {
|
|
18
|
-
if ($FindBin::Bin =~ /(.*)/) {
|
|
19
|
-
$path = $1;
|
|
20
|
-
}
|
|
21
|
-
}
|
|
22
|
-
use lib "$path/../../lib";
|
|
23
|
-
use SectLabel::PreProcess;
|
|
24
|
-
|
|
25
|
-
### USER customizable section
|
|
26
|
-
$0 =~ /([^\/]+)$/; my $progname = $1;
|
|
27
|
-
my $outputVersion = "1.0";
|
|
28
|
-
### END user customizable section
|
|
29
|
-
|
|
30
|
-
# Write the one-line copyright banner to STDERR.
sub License {
    my $banner = "# Copyright 2009 \251 by Luong Minh Thang\n";
    print STDERR $banner;
}
|
|
33
|
-
|
|
34
|
-
### HELP Sub-procedure
|
|
35
|
-
# Print the usage summary and the description of every command-line option
# to STDERR.  Reads the file-level $progname for the program name.
sub Help {
    print STDERR "Process Omnipage XML output (concatenated results from all pages of a PDF file), and extract text lines together with other XML infos\n";
    print STDERR "usage: $progname -h\t[invokes help]\n";
    print STDERR " $progname -in xmlFile -out outFile [-xmlFeature -decode -markup -para] [-tag tagFile -allowEmptyLine -log]\n";
    print STDERR "Options:\n";
    print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
    print STDERR "\t-xmlFeature: append XML feature together with text extracted\n";
    print STDERR "\t-decode: decode HTML entities and then output, to avoid double entity encoding later\n";
    print STDERR "\t-para: marking in the output each paragraph with # Para lineId numLines\n";
    print STDERR "\t-markup: marking in the output detailed word-level info ### Page w h\\n## Para l t r b\\n# Line l t r b\\nword l t r b\n";

    print STDERR "\t-tag tagFile: count XML tags/values for statistics purpose\n";
    # The two options below appear in the usage line, so describe them too.
    print STDERR "\t-allowEmptyLine: keep whitespace-only lines instead of dropping them\n";
    print STDERR "\t-log: print progress and debugging messages to STDERR\n";
}
|
|
48
|
-
### Command-line state (filled in by GetOptions below)
my $QUIET = 0;
my $HELP = 0;
my $outFile = undef;
my $inFile = undef;

my $isXmlFeature = 0;
my $isDecode = 0;

my $isMarkup = 0;
my $isParaDelimiter = 0;

my $tagFile = "";
my $isAllowEmpty = 0;
my $isDebug = 0;
$HELP = 1 unless GetOptions('in=s' => \$inFile,
                            'out=s' => \$outFile,
                            'decode' => \$isDecode,
                            'xmlFeature' => \$isXmlFeature,

                            'tag=s' => \$tagFile,
                            'allowEmptyLine' => \$isAllowEmpty,
                            'markup' => \$isMarkup,

                            'para' => \$isParaDelimiter,
                            'log' => \$isDebug,
                            'h' => \$HELP,
                            'q' => \$QUIET);

# Both -in and -out are mandatory; anything else shows the usage text.
if ($HELP || !defined $inFile || !defined $outFile) {
    Help();
    exit(0);
}

if (!$QUIET) {
    License();
}

### Untaint ###
$inFile = untaintPath($inFile);
$outFile = untaintPath($outFile);
$tagFile = untaintPath($tagFile);
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
### End untaint ###

### Mark page, para, line, word
my %gPageHash = ();

### Mark paragraph
my @gPara = ();

### XML features ###
# locFeature
my @gPosHash = (); my $gMinPos = 1000000; my $gMaxPos = 0;
my @gAlign = (); # alignFeature
my @gBold = (); # bold feature
my @gItalic = (); # italic feature

# font size feature
my %gFontSizeHash = (); my @gFontSize = ();
# font face feature
my %gFontFaceHash = (); my @gFontFace = ();

my @gPic = (); # pic feature
my @gTable = (); # table feature
my @gBullet = (); # bullet feature

# space feature
#my %gSpaceHash = (); my @gSpace = ();
### End XML features ###

my %tags = ();

if ($isDebug) {
    print STDERR "\n# Processing file $inFile & output to $outFile\n";
}

my $markupOutput = "";
# processFile unpacks (input file, tag hash); passing the output file name in
# between made the tag hash reference arrive in a slot that is never read.
my $allText = processFile($inFile, \%tags);

# Find header part
my @lines = split(/\n/, $allText);
my $numLines = scalar(@lines);
my ($headerLength, $bodyLength, $bodyStartId) =
    SectLabel::PreProcess::findHeaderText(\@lines, 0, $numLines);

# Output
if ($isMarkup) {
    open(my $markupFh, ">:utf8", "$outFile") or die "#Can't open file \"$outFile\"\n";
    print $markupFh "$markupOutput";
    # Buffered write errors (e.g. a full disk) only surface at close time.
    close($markupFh) or die "#Can't close file \"$outFile\": $!\n";
} else {
    output(\@lines, $outFile);
}

if ($tagFile ne "") {
    printTagInfo(\%tags, $tagFile);
}
|
|
145
|
-
|
|
146
|
-
# Read the Omnipage XML file line by line and return all extracted text.
#
# Arguments:
#   $inFile - path of the XML file to read
#   $tags   - hash reference handed to processTagInfo() when tag counting is
#             enabled.  It is taken from the LAST argument, so both the
#             two-argument form (inFile, tags) and the three-argument form
#             used by the driver (inFile, outFile, tags) work.
#
# Returns the concatenated text of every paragraph, as produced by
# processPara().  In markup mode it also appends page/figure/table/paragraph
# markers to the file-level $markupOutput.
#
# Dies if the input file does not exist or cannot be opened.
sub processFile {
    my ($inFile, @rest) = @_;
    my $tags = @rest ? $rest[-1] : undef;

    if (!(-e $inFile)) { die "# $progname crash\t\tFile \"$inFile\" doesn't exist"; }
    open(my $inFh, "<:utf8", $inFile) or die "# $progname crash\t\tCan't open \"$inFile\"";

    my $isPara = 0;            # currently between <para ...> and </para>
    my $isTable = 0;           # currently between <table ...> and </table>
    my $isPic = 0;             # currently between <dd ...> and </dd>
    my $allText = "";          # text accumulated over all paragraphs
    my $text = "";             # raw XML lines of the paragraph being collected
    my $isFirstTableCell = 0;  # set on <table ...>; processPara receives a reference to it

    while (<$inFh>) {
        if (/^\#/) { next; }   # skip comments
        chomp;
        s/\cM$//;              # remove a trailing ^M character if any
        my $line = $_;

        if ($tagFile ne "") {
            processTagInfo($line, $tags);
        }

        if ($isMarkup && $line =~ /<theoreticalPage (.*)\/>/) {
            $markupOutput .= "### Page $1\n";
        }

        ### pic ###
        if ($line =~ /^<dd (.*)>$/) {
            $isPic = 1;
            if ($isMarkup) {
                $markupOutput .= "### Figure $1\n";
            }
        }
        elsif ($line =~ /^<\/dd>$/) {
            $isPic = 0;
        }

        ### Table ###
        elsif ($line =~ /^<table (.*)>$/) {
            $isTable = 1;
            $isFirstTableCell = 1;
            if ($isMarkup) {
                $markupOutput .= "### Table $1\n";
            }
        }
        elsif ($line =~ /^<\/table>$/) {
            $isTable = 0;
        }

        ### Paragraph ###
        # Note: table processing should have higher priority than paragraph,
        # i.e. the order of these branches matters.
        elsif ($line =~ /^<para (.*)>$/) {
            $text .= $line."\n";   # processPara needs the <para> header too
            $isPara = 1;

            if ($isMarkup) {
                $markupOutput .= "## Para $1\n";
            }
        }
        elsif ($line =~ /^<\/para>$/) {
            # Only the text is needed here; the remaining return values
            # (bounding box and space flag) were never used by this sub.
            my ($paraText) = processPara($text, $isTable, $isPic, \$isFirstTableCell);
            $allText .= $paraText;

            $isPara = 0;
            $text = "";
        }
        elsif ($isPara) {
            $text .= $line."\n";
        }
    }
    close $inFh;

    return $allText;
}
|
|
230
|
-
|
|
231
|
-
# Write the extracted text lines to $outFile, one line per input line,
# optionally followed by the XML-derived features of that line.
#
# Arguments:
#   $lines   - array reference of text lines; a trailing ^M is stripped from
#              each element in place
#   $outFile - path of the file to create (UTF-8)
#
# Lines are buffered per paragraph (a new paragraph starts where @gPara holds
# "yes").  With -para each paragraph is preceded by "# Para lineId numLines";
# otherwise the buffered text is entity-decoded when -decode is set.
# NOTE(review): -decode has no effect when -para is given; this mirrors the
# previous behaviour - confirm it is intended.
#
# Dies if the output file cannot be opened or closed.
sub output {
    my ($lines, $outFile) = @_;

    open(my $outFh, ">:utf8", "$outFile") or die "#Can't open file \"$outFile\"\n";

    ####### Final output ############
    # xml feature label
    my %gFontSizeLabels = ();

    if ($isXmlFeature) {
        getFontSizeLabels(\%gFontSizeHash, \%gFontSizeLabels);
    }

    my $id = -1;
    my $output = "";
    my $paraLineId = -1;
    my $paraLineCount = 0;

    # Emit the buffered paragraph (if any) and clear the buffer.
    my $flushPara = sub {
        return if $output eq "";

        if ($isParaDelimiter) {
            print $outFh "# Para $paraLineId $paraLineCount\n$output";
            $paraLineCount = 0;
        } else {
            if ($isDecode) {
                $output = decode_entities($output);
            }
            print $outFh $output;
        }
        $output = "";
    };

    foreach my $line (@{$lines}) {
        $id++;

        $line =~ s/\cM$//;   # remove ^M character at the end of each line if any

        if ($line =~ /^\s*$/) {   # empty lines
            if (!$isAllowEmpty) {
                next;
            } elsif ($isDebug) {
                print STDERR "#! Line $id empty!\n";
            }
        }

        if ($gPara[$id] eq "yes") {   # first line of a new paragraph
            $flushPara->();
            $paraLineId = $id;
        }

        $output .= $line;
        $paraLineCount++;

        ## Output XML features ###
        if ($isXmlFeature) {
            # loc feature: vertical position bucketed over the observed range
            my $locFeature = "";
            if ($gPosHash[$id] != -1) {
                $locFeature = "xmlLoc_".int(($gPosHash[$id] - $gMinPos)*8.0/($gMaxPos - $gMinPos + 1));
            }

            # fontSize feature
            my $fontSizeFeature;
            if ($gFontSize[$id] == -1) {
                $fontSizeFeature = "xmlFontSize_none";
            } else {
                $fontSizeFeature = "xmlFontSize_".$gFontSizeLabels{$gFontSize[$id]};
            }

            my $boldFeature = "xmlBold_".$gBold[$id];       # bold feature
            my $italicFeature = "xmlItalic_".$gItalic[$id]; # italic feature
            my $picFeature = "xmlPic_".$gPic[$id];          # pic feature
            my $tableFeature = "xmlTable_".$gTable[$id];    # table feature
            my $bulletFeature = "xmlBullet_".$gBullet[$id]; # bullet feature

            ## Differential features ##
            # Only the last two of the seven returned values are emitted.
            my ($fontSFBIADiff, $paraDiff) = (getDifferentialFeatures($id))[5, 6];

            $output .= " |XML| $locFeature $boldFeature $italicFeature $fontSizeFeature $picFeature $tableFeature $bulletFeature $fontSFBIADiff $paraDiff\n";
        } else {
            $output .= "\n";
        }
    }

    # Emit the last paragraph.
    $flushPara->();

    # Buffered write errors (e.g. a full disk) only surface at close time.
    close($outFh) or die "#Can't close file \"$outFile\": $!\n";
}
|
|
342
|
-
|
|
343
|
-
# Build the "difference from the previous line" features for line $id.
#
# Each feature compares line $id with line $id-1 using the file-level
# per-line arrays (@gAlign, @gFontFace, @gFontSize, @gBold, @gItalic).
# The first line (id 0) has no predecessor and is never compared.
#
# Returns the list
#   ($alignDiff, $fontSizeDiff, $fontFaceDiff, $fontSFDiff,
#    $fontSFBIDiff, $fontSFBIADiff, $paraDiff)
sub getDifferentialFeatures {
    my ($id) = @_;

    my $prev = $id - 1;

    # "continue" when the comparison with the previous line held, else "new".
    my $changeLabel = sub {
        my ($unchanged) = @_;
        return $unchanged ? "continue" : "new";
    };

    # alignChange feature: reports the alignment itself when it changed
    my $alignDiff = "bi_xmlA_"
        . (($id != 0 && $gAlign[$id] eq $gAlign[$prev]) ? "continue" : $gAlign[$id]);

    # fontFaceChange feature
    my $fontFaceDiff = "bi_xmlF_"
        . $changeLabel->($id != 0 && $gFontFace[$id] eq $gFontFace[$prev]);

    # fontSizeChange feature
    my $fontSizeDiff = "bi_xmlS_"
        . $changeLabel->($id != 0 && $gFontSize[$id] == $gFontSize[$prev]);

    # fontSFChange feature: size and face
    my $fontSFDiff = "bi_xmlSF_"
        . $changeLabel->($id != 0
                         && $gFontSize[$id] == $gFontSize[$prev]
                         && $gFontFace[$id] eq $gFontFace[$prev]);

    # fontSFBIChange feature: size, face, bold and italic
    my $fontSFBIDiff = "bi_xmlSFBI_"
        . $changeLabel->($id != 0
                         && $gFontSize[$id] == $gFontSize[$prev]
                         && $gFontFace[$id] eq $gFontFace[$prev]
                         && $gBold[$id] eq $gBold[$prev]
                         && $gItalic[$id] eq $gItalic[$prev]);

    # fontSFBIAChange feature: size, face, bold, italic and alignment
    my $fontSFBIADiff = "bi_xmlSFBIA_"
        . $changeLabel->($id != 0
                         && $gFontSize[$id] == $gFontSize[$prev]
                         && $gFontFace[$id] eq $gFontFace[$prev]
                         && $gBold[$id] eq $gBold[$prev]
                         && $gItalic[$id] eq $gItalic[$prev]
                         && $gAlign[$id] eq $gAlign[$prev]);

    # para change feature
    my $paraDiff = "bi_xmlPara_";
    if ($id < $bodyStartId) {
        # header part, consider each line as a separate paragraph
        $paraDiff .= "header";
    } else {
        $paraDiff .= ($gPara[$id] eq "yes") ? "new" : "continue";
    }

    return ($alignDiff, $fontSizeDiff, $fontFaceDiff, $fontSFDiff, $fontSFBIDiff, $fontSFBIADiff, $paraDiff);
}
|
|
420
|
-
|
|
421
|
-
# Map every observed font size to a relative label.
#
# Arguments:
#   $gFontSizeHash   - hash reference: font size => number of lines using it
#   $gFontSizeLabels - hash reference filled in by this sub: font size => label
#
# The most frequent size is labelled "common"; sizes below it "smaller";
# sizes above it "larger", except that the three largest sizes get
# "largest-2", "largest-1" and "largest0" (largest last).  Ties for the most
# frequent size are resolved in favour of the smaller size so that the result
# does not depend on hash ordering.
#
# An empty frequency table leaves $gFontSizeLabels untouched.
sub getFontSizeLabels {
    my ($gFontSizeHash, $gFontSizeLabels) = @_;

    if ($isDebug) { print STDERR "# Map fonts\n"; }

    # Nothing observed: avoid labelling an undefined "common" size.
    return unless %{$gFontSizeHash};

    # most frequent size first; numeric size breaks ties deterministically
    my ($commonSize) = sort {
        $gFontSizeHash->{$b} <=> $gFontSizeHash->{$a} || $a <=> $b
    } keys %{$gFontSizeHash};

    my @sortedFonts = sort { $a <=> $b } keys %{$gFontSizeHash};   # ascending sizes

    my $commonIndex = 0;   # index of common font size in @sortedFonts
    foreach my $size (@sortedFonts) {
        last if $commonSize == $size;
        $commonIndex++;
    }

    # small fonts
    for my $i (0 .. $commonIndex - 1) {
        $gFontSizeLabels->{$sortedFonts[$i]} = "smaller";

        if ($isDebug) {
            print STDERR "$sortedFonts[$i] --> $gFontSizeLabels->{$sortedFonts[$i]}, freq = $gFontSizeHash->{$sortedFonts[$i]}\n";
        }
    }

    # common fonts
    $gFontSizeLabels->{$commonSize} = "common";
    if ($isDebug) {
        print STDERR "$sortedFonts[$commonIndex] --> $gFontSizeLabels->{$sortedFonts[$commonIndex]}, freq = $gFontSizeHash->{$sortedFonts[$commonIndex]}\n";
    }

    # large fonts
    my $numFonts = scalar(@sortedFonts);
    for my $i ($commonIndex + 1 .. $numFonts - 1) {
        if (($numFonts - $i) <= 3) {
            # yields "largest-2", "largest-1", "largest0"
            $gFontSizeLabels->{$sortedFonts[$i]} = "largest".($i + 1 - $numFonts);
        } else {
            $gFontSizeLabels->{$sortedFonts[$i]} = "larger";
        }

        if ($isDebug) {
            print STDERR "$sortedFonts[$i] --> $gFontSizeLabels->{$sortedFonts[$i]}, freq = $gFontSizeHash->{$sortedFonts[$i]}\n";
        }
    }

    return;
}
|
|
465
|
-
|
|
466
|
-
# Label every observed "space before" value as "yes" or "no".
#
# Arguments:
#   $gSpaceHash   - hash reference: space value => frequency
#   $gSpaceLabels - hash reference filled in by this sub: space value => label
#
# Starting from the most frequent value, the "common" space is raised to the
# largest value among the leading run of entries whose frequency is more than
# 80% of the top frequency.  Values above the common space are labelled "yes",
# all others "no".
sub getSpaceLabels {
    my ($gSpaceHash, $gSpaceLabels) = @_;

    print STDERR "\n# Map space\n" if $isDebug;

    # space values ordered by descending frequency
    my @spacesByFreq = sort { $gSpaceHash->{$b} <=> $gSpaceHash->{$a} } keys %{$gSpaceHash};

    my $commonSpace = $spacesByFreq[0];
    my $commonFreq = $gSpaceHash->{$commonSpace};

    # find similar common freq with larger spaces
    foreach my $candidate (@spacesByFreq) {
        last unless ($gSpaceHash->{$candidate} / $commonFreq > 0.8);
        $commonSpace = $candidate if $candidate > $commonSpace;
    }

    foreach my $spaceValue (@spacesByFreq) {
        $gSpaceLabels->{$spaceValue} = ($spaceValue > $commonSpace) ? "yes" : "no";

        next unless $isDebug;
        print STDERR "$spaceValue --> $gSpaceLabels->{$spaceValue}, freq = $gSpaceHash->{$spaceValue}\n";
    }
}
|
|
500
|
-
|
|
501
|
-
# Extract the value of one attribute from the attribute text of an XML tag.
#
# Arguments:
#   $attrText - attribute part of a tag, e.g. 'l="10" bold="true"'
#   $attr     - attribute name to look up
#
# Returns the (non-empty) attribute value, or "none" when the attribute is
# absent or empty.  If the name occurs several times the last occurrence wins.
sub getAttrValue {
    my ($attrText, $attr) = @_;

    my $value = "none";
    # \Q..\E keeps regex metacharacters in the name literal; the lookbehind
    # stops "bold" from matching inside a longer name such as "xbold"; and
    # [^"]+ keeps an empty value from swallowing the following attribute.
    if ($attrText =~ /^.*(?<![\w:-])\Q$attr\E="([^"]+)"/) {
        $value = $1;
    }

    return $value;
}
|
|
511
|
-
|
|
512
|
-
# Add $count to the tally of the value that attribute $attr has in $attrText.
#
# Arguments:
#   $attrText - attribute part of a tag, e.g. 'fontSize="1000" bold="true"'
#   $attr     - attribute name to look up
#   $attrHash - hash reference: attribute value => accumulated count
#   $count    - amount to add for the value found
#
# Nothing is recorded when the attribute is absent or has an empty value.
sub checkFontAttr {
    my ($attrText, $attr, $attrHash, $count) = @_;

    # \Q..\E keeps regex metacharacters in the name literal; the lookbehind
    # prevents matching the tail of a longer attribute name; and [^"]+ keeps
    # an empty value from swallowing the following attribute.
    if ($attrText =~ /^.*(?<![\w:-])\Q$attr\E="([^"]+)"/) {
        $attrHash->{$1} += $count;
    }
}
|
|
521
|
-
|
|
522
|
-
# Convert the raw XML lines of one paragraph into plain text lines and record
# the per-line features in the file-level arrays.
#
# Arguments:
#   $inputText        - XML lines of the paragraph (including the <para ...>
#                       header), separated by "\n"
#   $isTable          - true when the paragraph lies inside a <table>
#   $isPic            - true when the paragraph lies inside a <dd>
#   $isFirstTableCell - reference to a flag; while it is true the next table
#                       line is marked as paragraph start and the flag is
#                       cleared through the reference
#
# Returns ($allText, $l, $t, $r, $bottom, $isSpace): the extracted text, the
# l/t/r/b box of the last <ln> whose attributes carried one, and whether the
# last token seen was a <space/>.
#
# Side effects: pushes one entry per emitted line onto @gPara and, with
# -xmlFeature, onto the position/alignment/pic/table/font/bold/italic/bullet
# arrays; updates $gMinPos/$gMaxPos; appends to $markupOutput with -markup;
# sets STDERR to UTF-8.
sub processPara {
    my ($inputText, $isTable, $isPic, $isFirstTableCell) = @_;

    my $isSpace = 0;
    my $isSpecialSpace = 0;
    my $isTab = 0;
    my $isBullet = 0;

    # 3 signals for end of line: forcedEOF="true" in attribute of <ln>, or
    # <nl orig="true"/>, or end of </para> without encountering any of the
    # above signals in the para plus $isSpace = 0
    my $isForcedEOF = "none";

    # xml feature
    my $align = "none";
    my ($l, $t, $r, $bottom);
    my %fontSizeHash = ();
    my %fontFaceHash = ();
    my @boldArray = ();
    my @italicArray = ();
    my $space = "none";

    my $lnAttr; my $isLn = 0; my $lnBold = "none"; my $lnItalic = "none";
    my $runAttr; my $runText = ""; my $isRun = 0; my $runBold = "none"; my $runItalic = "none";
    my $wdAttr; my $wdText = ""; my $isWd = 0;

    # word index in a line. When encountering </ln>, this parameter indicates
    # the number of words in a line
    my $wdIndex = 0;
    my $lnBoldCount = 0;
    my $lnItalicCount = 0;

    my $allText = "";
    # invariant: whenever a line is completed, $text is copied into $allText
    # and cleared
    my $text = "";

    binmode(STDERR, ":utf8");

    my $isFirstLinePara = 1;
    my @lines = split(/\n/, $inputText);
    for (my $i = 0; $i < scalar(@lines); $i++) {
        my $line = $lines[$i];

        ## new para
        if ($line =~ /^<para (.+?)>$/) {
            my $attr = $1;
            $align = getAttrValue($attr, "alignment");
            $space = getAttrValue($attr, "spaceBefore");
        }

        ## new ln
        elsif ($line =~ /^<ln (.+)>$/) {
            $lnAttr = $1;
            $isLn = 1;

            if ($isMarkup) {
                $markupOutput .= "# Line $lnAttr\n";
            }

            if ($lnAttr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*$/) {
                ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
            }
            $isForcedEOF = getAttrValue($lnAttr, "forcedEOF");

            if ($isXmlFeature) {   # Bold & Italic
                $lnBold = getAttrValue($lnAttr, "bold");
                $lnItalic = getAttrValue($lnAttr, "italic");
            }
        }

        ## new run
        elsif ($line =~ /<run (.*)>$/) {
            $runAttr = $1;

            $isSpace = 0;
            $isTab = 0;
            $isRun = 1;

            if ($line =~ /^<wd (.*?)>/) {   # new wd, that consists of many runs
                $isWd = 1;
                $wdAttr = $1;
            }

            if ($isXmlFeature) {   # Bold & Italic
                $runBold = getAttrValue($runAttr, "bold");
                $runItalic = getAttrValue($runAttr, "italic");
            }
        }

        ## wd
        elsif ($line =~ /^<wd (.+)?>(.+)<\/wd>$/) {
            $wdAttr = $1;
            my $word = $2;
            $isSpace = 0;
            $isTab = 0;

            if ($isMarkup) {
                $markupOutput .= "$word $wdAttr";
                if ($isRun && $runAttr =~ /(bold|italic)=\"true\"/) {   # if both bold and italic, then just use one
                    $markupOutput .= " $1=\"true\"";
                }
                $markupOutput .= "\n";
            }

            if ($isXmlFeature) {   # FontSize & FontFace
                checkFontAttr($wdAttr, "fontSize", \%fontSizeHash, 1);
                checkFontAttr($wdAttr, "fontFace", \%fontFaceHash, 1);
            }

            if ($isXmlFeature) {   # Bold & Italic
                my $wdBold = getAttrValue($wdAttr, "bold");
                my $wdItalic = getAttrValue($wdAttr, "italic");

                if ($wdBold eq "true" || $runBold eq "true" || $lnBold eq "true") {
                    $boldArray[$wdIndex] = 1;
                    $lnBoldCount++;
                }

                if ($wdItalic eq "true" || $runItalic eq "true" || $lnItalic eq "true") {
                    $italicArray[$wdIndex] = 1;
                    $lnItalicCount++;
                }
            }

            ## add text
            $text .= "$word";

            if ($isRun) {
                $runText .= "$word ";
            }
            $wdIndex++;
        }

        ## end wd
        elsif ($line =~ /^<\/wd>$/) {
            $isWd = 0;

            if ($isMarkup) {
                $markupOutput .= "$wdText $wdAttr";
                if ($isRun && $runAttr =~ /(bold|italic)=\"true\"/) {   # if both bold and italic, then just use one
                    $markupOutput .= " $1=\"true\"";
                }
                $markupOutput .= "\n";

                $wdAttr = "";
            }

            # The word is complete: start the next multi-run word from
            # scratch, otherwise its markup would repeat all earlier words.
            $wdText = "";
        }

        ## end run
        elsif ($line =~ /^(.*)<\/run>$/) {
            my $word = $1;

            ## add text
            if ($word ne "") {
                if ($isXmlFeature) {   # Bold & Italic
                    if ($runBold eq "true" || $lnBold eq "true") {
                        $boldArray[$wdIndex] = 1;
                        $lnBoldCount++;
                    }

                    if ($runItalic eq "true" || $lnItalic eq "true") {
                        $italicArray[$wdIndex] = 1;
                        $lnItalicCount++;
                    }
                }

                # appear in the final result
                if ($isLn) { $text .= "$word"; }

                # for internal record
                if ($isRun) { $runText .= "$word "; }
                if ($isWd) { $wdText .= "$word"; }

                $wdIndex++;
            }

            # xml feature
            if ($isXmlFeature && $runText ne "") {   # not a space, tab or new-line run
                my @words = split(/\s+/, $runText);
                my $numWords = scalar(@words);
                checkFontAttr($runAttr, "fontSize", \%fontSizeHash, $numWords);
                checkFontAttr($runAttr, "fontFace", \%fontFaceHash, $numWords);
            }

            ## reset run
            if (!$isLn) {   # <run> not enclosed within <ln>
                $wdIndex = 0;
            }
            $runText = "";
            $isRun = 0;
            $isSpecialSpace = 0;

            if ($isXmlFeature) {   # Bold & Italic
                $runBold = "none";
                $runItalic = "none";

                if (!$isLn) {   # <run> not enclosed within <ln>
                    $lnBoldCount = 0;
                    $lnItalicCount = 0;
                }
            }
        }

        ## end ln
        elsif ($line =~ /^<\/ln>$/) {
            if ((!$isAllowEmpty && $text !~ /^\s*$/)
                || ($isAllowEmpty && $text ne "")) {
                if ($isForcedEOF eq "true" ||   # there's a forced EOL?
                    !$isSpecialSpace            # not an empty line with space character
                    ) {
                    $text .= "\n";

                    # update allText
                    $allText .= $text;
                    $text = "";
                }

                my $numWords = $wdIndex;

                if (!$isTable) {
                    if ($isFirstLinePara) {
                        push(@gPara, "yes");
                        $isFirstLinePara = 0;
                    } else {
                        push(@gPara, "no");
                    }
                } else {
                    if ($$isFirstTableCell) {
                        push(@gPara, "yes");
                        $$isFirstTableCell = 0;
                    } else {
                        push(@gPara, "no");
                    }
                }

                if ($isXmlFeature && $numWords >= 1) {
                    # xml feature
                    # assumption: fontSize either occurs in <ln>, or within
                    # multiple <run> under <ln>, but not both
                    checkFontAttr($lnAttr, "fontSize", \%fontSizeHash, $numWords);
                    checkFontAttr($lnAttr, "fontFace", \%fontFaceHash, $numWords);
                }

                if ($isXmlFeature && !$isSpecialSpace) {
                    my $pos = ($t + $bottom) / 2.0;   # vertical midpoint of the line box
                    if ($pos < $gMinPos) { $gMinPos = $pos; }
                    if ($pos > $gMaxPos) { $gMaxPos = $pos; }
                    push(@gPosHash, $pos);   # pos feature
                    push(@gAlign, $align);   # alignment feature

                    if ($isPic) {
                        push(@gPic, "yes");
                    } else {
                        push(@gPic, "no");
                    }
                    if ($isTable) {
                        push(@gTable, "yes");
                    } else {
                        push(@gTable, "no");
                    }

                    if ($isPic || $isTable) {
                        ### Not assign value ###
                        push(@gFontSize, -1);       # font size feature
                        push(@gFontFace, "none");   # font face feature
                        push(@gBold, "no");         # bold feature
                        push(@gItalic, "no");       # italic feature
                        push(@gBullet, "no");       # bullet feature
                    } else {
                        updateXMLFontFeature(\%fontSizeHash, \%fontFaceHash);
                        %fontSizeHash = (); %fontFaceHash = ();

                        updateXMLFeatures($lnBoldCount, $lnItalicCount, $numWords, $isBullet, $space);
                    }   # end if pic
                }   # end if($isXmlFeature && !$isSpecialSpace)
            }

            ## reset ln
            $isLn = 0;
            $isForcedEOF = "none";
            $isSpecialSpace = 0;
            $wdIndex = 0;

            if ($isXmlFeature) {   # Bold & Italic
                $lnBold = "none";
                $lnItalic = "none";

                $lnBoldCount = 0;
                $lnItalicCount = 0;
            }
        }   # end else </ln>

        ## nl newline signal
        elsif ($line =~ /^<nl orig=\"true\"\/>$/) {
            if ($isLn) {
                $isSpace = 0;
            } else {
                if ($isDebug) {
                    print STDERR "#!!! Warning: found <nl orig=\"true\"\/> while not in tag <ln>: $line\n";
                }
            }
        }

        ## space
        elsif ($line =~ /^<space\/>$/) {
            my $startTag = "";
            my $endTag = "";
            if ($i > 0 && $lines[$i-1] =~ /^<(.+?)\b.*/) {
                $startTag = $1;
            }

            if ($i < (scalar(@lines) - 1) && $lines[$i+1] =~ /^<\/(.+)>/) {
                $endTag = $1;
            }

            # a <space/> that is the only content between an opening tag and
            # its matching closing tag
            if ($startTag eq $endTag && $startTag ne "") {
                $isSpecialSpace = 1;
            }

            ## addText
            $text .= " ";
            $isSpace = 1;
        }

        ## tab
        elsif ($line =~ /^<tab .*\/>$/) {
            ## add Text
            $text .= "\t";

            $isTab = 1;
        }

        ## bullet
        elsif ($line =~ /^<bullet .*>$/) {
            $isBullet = 1;
        }
    }

    $allText .= $text;
    return ($allText, $l, $t, $r, $bottom, $isSpace);
}
|
|
857
|
-
|
|
858
|
-
# Record the dominant font size and font face of the line just finished.
#
# Arguments:
#   $fontSizeHash - hash reference: font size => word count within the line
#   $fontFaceHash - hash reference: font face => word count within the line
#
# Pushes the most frequent size onto @gFontSize (or -1 when none was seen) and
# the most frequent face onto @gFontFace (or "none"), and increments the
# file-level frequency tables %gFontSizeHash / %gFontFaceHash for the chosen
# values.  Ties are broken by the smaller size / alphabetically first face so
# that the result does not depend on hash ordering.
sub updateXMLFontFeature {
    my ($fontSizeHash, $fontFaceHash) = @_;

    # font size feature
    if (!%{$fontSizeHash}) {
        push(@gFontSize, -1);
    } else {
        my ($fontSize) = sort {
            $fontSizeHash->{$b} <=> $fontSizeHash->{$a} || $a <=> $b || $a cmp $b
        } keys %{$fontSizeHash};

        push(@gFontSize, $fontSize);
        $gFontSizeHash{$fontSize}++;
    }

    # font face feature
    if (!%{$fontFaceHash}) {
        push(@gFontFace, "none");
    } else {
        my ($fontFace) = sort {
            $fontFaceHash->{$b} <=> $fontFaceHash->{$a} || $a cmp $b
        } keys %{$fontFaceHash};

        push(@gFontFace, $fontFace);
        $gFontFaceHash{$fontFace}++;
    }

    return;
}
|
|
884
|
-
|
|
885
|
-
## Append the bold, italic and bullet features for one line to the global
## feature arrays.
##
## Arguments:
##   $lnBoldCount   - number of bold words counted for the line
##   $lnItalicCount - number of italic words counted for the line
##   $numWords      - total number of words; may be 0 or undef
##   $isBullet      - true when the line carries a bullet
##   $space         - currently unused (the space feature is disabled below)
##
## Side effects:
##   Pushes "yes" or "no" onto each of @gBold, @gItalic and @gBullet.
sub updateXMLFeatures {
    my ($lnBoldCount, $lnItalicCount, $numWords, $isBullet, $space) = @_;

    # A line without words cannot be "mostly bold/italic". Checking this first
    # also avoids an "Illegal division by zero" death on empty lines.
    my $hasWords = $numWords ? 1 : 0;

    # bold feature: at least two thirds of the words are bold
    my $boldFeature =
        ($hasWords && $lnBoldCount / $numWords >= 0.667) ? "yes" : "no";
    push(@gBold, $boldFeature);

    # italic feature: same threshold as bold
    my $italicFeature =
        ($hasWords && $lnItalicCount / $numWords >= 0.667) ? "yes" : "no";
    push(@gItalic, $italicFeature);

    # space feature (disabled)
    # push(@gSpace, $space);

    # bullet feature
    push(@gBullet, $isBullet ? "yes" : "no");
}
|
|
915
|
-
|
|
916
|
-
## Find the positions of header, body, and citation
|
|
917
|
-
## Split a document into header, body and citation parts.
##
## Arguments:
##   $lines    - passed through unchanged to the SectLabel::PreProcess helpers
##               (presumably an array ref of text lines - confirm)
##   $numLines - total number of lines expected across the three parts
##
## Returns ($headerLength, $bodyLength, $citationLength, $bodyStartId,
## $bodyEndId). Dies, after echoing the same message on STDOUT, when the
## three lengths do not add up to $numLines.
sub getStructureInfo {
    my ($lines, $numLines) = @_;

    # First call: the first returned length still includes the header part.
    my ($nonCitationLength, $citationLength, $bodyEndId) =
        SectLabel::PreProcess::findCitationText($lines, 0, $numLines);

    # Second call: divide that leading part into header and body.
    my ($headerLength, $bodyLength, $bodyStartId) =
        SectLabel::PreProcess::findHeaderText($lines, 0, $nonCitationLength);

    # sanity check
    my $totalLength = $headerLength + $bodyLength + $citationLength;
    if ($totalLength != $numLines) {
        my $message = "Die in getStructureInfo(): different num lines $numLines != $totalLength\n";
        print STDOUT $message; # to display in Web
        die $message;
    }

    return ($headerLength, $bodyLength, $citationLength, $bodyStartId, $bodyEndId);
}
|
|
935
|
-
|
|
936
|
-
## Count XML tags and attribute values for statistical purposes
|
|
937
|
-
## Tally the tag and the attribute name/value pairs found on one XML line.
##
## Arguments:
##   $line - a line of XML text; ignored unless it starts with "<"
##   $tags - hash ref updated in place as
##           $tags->{tag}->{attribute name}->{attribute value} = count
##
## Attribute values are stored exactly as written, including their quotes.
## A quoted value is kept whole even when it contains spaces or "=".
sub processTagInfo {
    my ($line, $tags) = @_;

    # Only lines that begin with a tag are counted.
    return unless $line =~ /^<(.+?)\b(.*)/;
    my ($tag, $attr) = ($1, $2);

    # Remember the tag even when it carries no attributes.
    $tags->{$tag} ||= {};

    # Keep only the text up to the first ">" or "/>" that closes the tag.
    if ($attr =~ /^\s*(.+?)\s*\/?>/) {
        $attr = $1;
    }

    # Scan name=value pairs. Splitting on whitespace would cut a value such
    # as "Times New Roman" after its first word, so quoted values are matched
    # as a unit; unquoted values run to the next whitespace.
    while ($attr =~ /([^\s=]+)=("[^"]*"|'[^']*'|\S+)/g) {
        my ($attrName, $value) = ($1, $2);
        $tags->{$tag}->{$attrName}->{$value}++;
    }

    return;
}
|
|
968
|
-
|
|
969
|
-
## Print tag info to file
|
|
970
|
-
## Write the tag statistics to a UTF-8 text file.
##
## Arguments:
##   $tags    - hash ref of the form
##              $tags->{tag}->{attribute name}->{attribute value} = count
##   $tagFile - path of the file to create or overwrite
##
## Output, with tags, attributes and values each sorted as strings:
##   # Tag = <tag>
##   <attribute>: <value>-<count> <value>-<count> ...
##
## Dies when the file cannot be opened or cannot be closed cleanly.
sub printTagInfo {
    my ($tags, $tagFile) = @_;

    open(my $out, ">:utf8", $tagFile) or die "#Can't open file \"$tagFile\"\n";

    foreach my $tag (sort keys %{$tags}) {
        print {$out} "# Tag = $tag\n";

        # A tag recorded without attributes may have no hash behind it.
        my $attrs = $tags->{$tag} || {};
        foreach my $attr (sort keys %{$attrs}) {
            my $valueCounts = $attrs->{$attr} || {};
            print {$out} "$attr:";
            foreach my $value (sort keys %{$valueCounts}) {
                print {$out} " $value-$valueCounts->{$value}";
            }
            print {$out} "\n";
        }
    }

    # Buffered write errors only surface at close, so check it.
    close($out) or die "#Can't close file \"$tagFile\": $!\n";
}
|
|
989
|
-
|
|
990
|
-
## Validate a file-system path and return an untainted copy of it.
##
## Only word characters, "-", "_", "/" and "." are accepted (the empty string
## passes as well). Any other content is fatal: dies with
## 'Bad path "<path>"'.
sub untaintPath {
    my ($path) = @_;

    $path =~ /^([-_\/\w\.]*)$/
        or die "Bad path \"$path\"\n";

    # Return the capture, which Perl treats as untainted, not $path itself.
    my $cleanPath = $1;
    return $cleanPath;
}
|
|
1001
|
-
|
|
1002
|
-
## Validate a general string (e.g. a command line) and return an untainted
## copy of it.
##
## Accepts one or more of: word characters, space, "-", "@", "(", ")", ",",
## "." and "/". Anything else, including the empty string, is fatal: dies
## with "Bad data in <string>".
sub untaint {
    my ($s) = @_;

    $s =~ /^([\w \-\@\(\),\.\/]+)$/
        or die "Bad data in $s"; # log this somewhere

    # Return the capture, which Perl treats as untainted, not $s itself.
    my $clean = $1;
    return $clean;
}
|
|
1011
|
-
|
|
1012
|
-
## Run an external command after validating it with untaint().
##
## Arguments:
##   $cmd - the complete command line as a single string
##
## When $isDebug is set the command is echoed to STDERR first. Dies (inside
## untaint) if the command contains characters untaint() does not accept.
## Returns whatever system() returns.
sub execute {
    my ($cmd) = @_;

    print STDERR "Executing: $cmd\n" if $isDebug;

    my $safeCmd = untaint($cmd);
    return system($safeCmd);
}
|
|
1020
|
-
|
|
1021
|
-
## Build a name for a temporary file from the current local time and the
## process id, in the form YYYYMMDD-HHMMSS-PID.
##
## The name is formatted in-process rather than by running the external
## "date" program, so it needs no shell or PATH lookup and cannot silently
## come back empty. Two calls in the same second from the same process
## return the same name.
sub newTmpFile {
    require POSIX;

    my $timestamp = POSIX::strftime('%Y%m%d-%H%M%S', localtime);
    return join('-', $timestamp, $$);
}
|