biblicit 2.0.3 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +0 -2
- data/biblicit.gemspec +1 -1
- data/parscit/bin/citeExtract.pl +9 -161
- data/parscit/bin/sectExtract.pl +0 -14
- data/parscit/lib/ParsCit/Controller.pm +0 -59
- data/parscit/lib/ParsCit/PreProcess.pm +0 -4
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1 -7
- metadata +4 -24
- data/parscit/bin/sectLabel/processOmniXML.pl +0 -1427
- data/parscit/bin/sectLabel/processOmniXML_new.pl +0 -1025
- data/parscit/bin/sectLabel/processOmniXMLv2.pl +0 -1529
- data/parscit/bin/sectLabel/processOmniXMLv3.pl +0 -964
- data/parscit/bin/sectLabel/simplifyOmniXML.pl +0 -382
- data/parscit/bin/xml2train.pl +0 -193
- data/parscit/lib/Omni/Config.pm +0 -93
- data/parscit/lib/Omni/Omnicell.pm +0 -263
- data/parscit/lib/Omni/Omnicol.pm +0 -292
- data/parscit/lib/Omni/Omnidd.pm +0 -328
- data/parscit/lib/Omni/Omnidoc.pm +0 -153
- data/parscit/lib/Omni/Omniframe.pm +0 -223
- data/parscit/lib/Omni/Omniline.pm +0 -423
- data/parscit/lib/Omni/Omnipage.pm +0 -282
- data/parscit/lib/Omni/Omnipara.pm +0 -232
- data/parscit/lib/Omni/Omnirun.pm +0 -303
- data/parscit/lib/Omni/Omnitable.pm +0 -336
- data/parscit/lib/Omni/Omniword.pm +0 -162
- data/parscit/lib/Omni/Traversal.pm +0 -313
- data/parscit/lib/SectLabel/AAMatching.pm +0 -1949
|
@@ -1,1427 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/perl -wT
|
|
2
|
-
|
|
3
|
-
# Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
|
|
4
|
-
# Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
|
|
5
|
-
|
|
6
|
-
require 5.0;
|
|
7
|
-
use strict;
|
|
8
|
-
use Getopt::Long;
|
|
9
|
-
use HTML::Entities;
|
|
10
|
-
|
|
11
|
-
# I do not know a better solution to find a lib path in -T mode.
|
|
12
|
-
# So if you know a better solution, I'd be glad to hear.
|
|
13
|
-
# See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
|
|
14
|
-
use FindBin;
|
|
15
|
-
FindBin::again(); # to get correct path in case 2 scripts in different directories use FindBin
|
|
16
|
-
|
|
17
|
-
my $path;
|
|
18
|
-
BEGIN
|
|
19
|
-
{
|
|
20
|
-
if ($FindBin::Bin =~ /(.*)/)
|
|
21
|
-
{
|
|
22
|
-
$path = $1;
|
|
23
|
-
}
|
|
24
|
-
}
|
|
25
|
-
use lib "$path/../../lib";
|
|
26
|
-
|
|
27
|
-
use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
|
|
28
|
-
use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";
|
|
29
|
-
|
|
30
|
-
use SectLabel::PreProcess;
|
|
31
|
-
|
|
32
|
-
# USER customizable section
|
|
33
|
-
$0 =~ /([^\/]+)$/; my $progname = $1;
|
|
34
|
-
my $outputVersion = "1.0";
|
|
35
|
-
# END user customizable section
|
|
36
|
-
|
|
37
|
-
sub License
|
|
38
|
-
{
|
|
39
|
-
print STDERR "# Copyright 2009 \251 by Luong Minh Thang\n";
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
# HELP sub-procedure
|
|
43
|
-
sub Help
|
|
44
|
-
{
|
|
45
|
-
print STDERR "Process Omnipage XML output (concatenated results fromm all pages of a PDF file), and extract text lines together with other XML infos\n";
|
|
46
|
-
print STDERR "usage: $progname -h\t[invokes help]\n";
|
|
47
|
-
print STDERR " $progname -in xmlFile -out outFile [-xmlFeature -decode -markup -para] [-tag tagFile -allowEmptyLine -log]\n";
|
|
48
|
-
print STDERR "Options:\n";
|
|
49
|
-
print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
|
|
50
|
-
print STDERR "\t-xmlFeature: append XML feature together with text extracted\n";
|
|
51
|
-
print STDERR "\t-decode: decode HTML entities and then output, to avoid double entity encoding later\n";
|
|
52
|
-
print STDERR "\t-para: marking in the output each paragraph with # Para lineId numLines\n";
|
|
53
|
-
print STDERR "\t-markup: marking in the output detailed word-level info ### Page w h\\n## Para l t r b\\n# Line l t r b\\nword l t r b\n";
|
|
54
|
-
print STDERR "\t-tag tagFile: count XML tags/values for statistics purpose\n";
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
my $QUIET = 0;
|
|
58
|
-
my $HELP = 0;
|
|
59
|
-
my $outFile = undef;
|
|
60
|
-
my $inFile = undef;
|
|
61
|
-
|
|
62
|
-
my $isXmlFeature = 0;
|
|
63
|
-
my $isDecode = 0;
|
|
64
|
-
|
|
65
|
-
my $isMarkup = 0;
|
|
66
|
-
my $isParaDelimiter = 0;
|
|
67
|
-
|
|
68
|
-
my $tagFile = "";
|
|
69
|
-
my $isAllowEmpty = 0;
|
|
70
|
-
my $isDebug = 1;
|
|
71
|
-
|
|
72
|
-
$HELP = 1 unless GetOptions('in=s' => \$inFile,
|
|
73
|
-
'out=s' => \$outFile,
|
|
74
|
-
'decode' => \$isDecode,
|
|
75
|
-
'xmlFeature' => \$isXmlFeature,
|
|
76
|
-
|
|
77
|
-
'tag=s' => \$tagFile,
|
|
78
|
-
'allowEmptyLine' => \$isAllowEmpty,
|
|
79
|
-
'markup' => \$isMarkup,
|
|
80
|
-
|
|
81
|
-
'para' => \$isParaDelimiter,
|
|
82
|
-
'log' => \$isDebug,
|
|
83
|
-
'h' => \$HELP,
|
|
84
|
-
'q' => \$QUIET);
|
|
85
|
-
|
|
86
|
-
if ($HELP || !defined $inFile || !defined $outFile)
|
|
87
|
-
{
|
|
88
|
-
Help();
|
|
89
|
-
exit(0);
|
|
90
|
-
}
|
|
91
|
-
|
|
92
|
-
if (!$QUIET)
|
|
93
|
-
{
|
|
94
|
-
License();
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
# Untaint
|
|
98
|
-
$inFile = untaintPath($inFile);
|
|
99
|
-
$outFile = untaintPath($outFile);
|
|
100
|
-
$tagFile = untaintPath($tagFile);
|
|
101
|
-
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
|
|
102
|
-
# End untaint
|
|
103
|
-
|
|
104
|
-
# Mark page, para, line, word
|
|
105
|
-
my %gPageHash = ();
|
|
106
|
-
|
|
107
|
-
# Mark paragraph
|
|
108
|
-
my @gPara = ();
|
|
109
|
-
|
|
110
|
-
# XML features
|
|
111
|
-
# locFeature
|
|
112
|
-
my @gPosHash = ();
|
|
113
|
-
my $gMinPos = 1000000;
|
|
114
|
-
my $gMaxPos = 0;
|
|
115
|
-
|
|
116
|
-
# alignFeature
|
|
117
|
-
my @gAlign = ();
|
|
118
|
-
|
|
119
|
-
# boldFeature
|
|
120
|
-
my @gBold = ();
|
|
121
|
-
|
|
122
|
-
# italicFeature
|
|
123
|
-
my @gItalic = ();
|
|
124
|
-
|
|
125
|
-
# fontSizeFeature
|
|
126
|
-
my %gFontSizeHash = ();
|
|
127
|
-
my @gFontSize = ();
|
|
128
|
-
|
|
129
|
-
# fontFaceFeature
|
|
130
|
-
my %gFontFaceHash = ();
|
|
131
|
-
my @gFontFace = ();
|
|
132
|
-
|
|
133
|
-
# picFeature
|
|
134
|
-
my @gPic = ();
|
|
135
|
-
|
|
136
|
-
# tableFeature
|
|
137
|
-
my @gTable = ();
|
|
138
|
-
|
|
139
|
-
# bulletFeature
|
|
140
|
-
my @gBullet = ();
|
|
141
|
-
|
|
142
|
-
# spaceFeature
|
|
143
|
-
#my %gSpaceHash = ();
|
|
144
|
-
#my @gSpace = ();
|
|
145
|
-
# End XML features
|
|
146
|
-
|
|
147
|
-
my %tags = ();
|
|
148
|
-
|
|
149
|
-
if ($isDebug)
|
|
150
|
-
{
|
|
151
|
-
print STDERR "\n# Processing file $inFile & output to $outFile\n";
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
my $markupOutput = "";
|
|
155
|
-
my $allText = processFile($inFile, $outFile, \%tags);
|
|
156
|
-
|
|
157
|
-
# Find header part
|
|
158
|
-
my @lines = split(/\n/, $allText);
|
|
159
|
-
my $numLines = scalar(@lines);
|
|
160
|
-
my ($headerLength, $bodyLength, $bodyStartId) = SectLabel::PreProcess::FindHeaderText(\@lines, 0, $numLines);
|
|
161
|
-
|
|
162
|
-
# Output
|
|
163
|
-
if ($isMarkup)
|
|
164
|
-
{
|
|
165
|
-
open(OF, ">:utf8", "$outFile") || die"#Can't open file \"$outFile\"\n";
|
|
166
|
-
print OF "$markupOutput";
|
|
167
|
-
close OF;
|
|
168
|
-
}
|
|
169
|
-
else
|
|
170
|
-
{
|
|
171
|
-
output(\@lines, $outFile);
|
|
172
|
-
}
|
|
173
|
-
|
|
174
|
-
if ($tagFile ne "")
|
|
175
|
-
{
|
|
176
|
-
printTagInfo(\%tags, $tagFile);
|
|
177
|
-
}
|
|
178
|
-
|
|
179
|
-
sub processFile
|
|
180
|
-
{
|
|
181
|
-
my ($inFile, $tags) = @_;
|
|
182
|
-
|
|
183
|
-
if (!(-e $inFile)) { die "# $progname crash\t\tFile \"$inFile\" doesn't exist"; }
|
|
184
|
-
open (IF, "<:utf8", $inFile) || die "# $progname crash\t\tCan't open \"$inFile\"";
|
|
185
|
-
|
|
186
|
-
my $isPic = 0;
|
|
187
|
-
my $isPara = 0;
|
|
188
|
-
my $isTable = 0;
|
|
189
|
-
my $isSpace = 0;
|
|
190
|
-
|
|
191
|
-
my $text = undef;
|
|
192
|
-
my $allText = undef;
|
|
193
|
-
|
|
194
|
-
my $lineId = 0;
|
|
195
|
-
|
|
196
|
-
# Each line contains a header
|
|
197
|
-
while (<IF>)
|
|
198
|
-
{
|
|
199
|
-
if (/^\#/) { next; } # skip comments
|
|
200
|
-
|
|
201
|
-
chomp;
|
|
202
|
-
s/\cM$//; # remove ^M character at the end of the file if any
|
|
203
|
-
|
|
204
|
-
my $line = $_;
|
|
205
|
-
if($tagFile ne "")
|
|
206
|
-
{
|
|
207
|
-
processTagInfo($line, $tags);
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
# if ($line =~ /<\?xml version.+>/) { } ### Xml ###
|
|
211
|
-
# if ($line =~ /^<\/column>$/) { } ### Column ###
|
|
212
|
-
if ($isMarkup && $line =~ /<theoreticalPage (.*)\/>/ && $isMarkup)
|
|
213
|
-
{
|
|
214
|
-
$markupOutput .= "### Page $1\n";
|
|
215
|
-
}
|
|
216
|
-
|
|
217
|
-
### pic ###
|
|
218
|
-
if ($line =~ /^<dd (.*)>$/)
|
|
219
|
-
{
|
|
220
|
-
$isPic = 1;
|
|
221
|
-
if($isMarkup)
|
|
222
|
-
{
|
|
223
|
-
$markupOutput .= "### Dd $1\n";
|
|
224
|
-
}
|
|
225
|
-
}
|
|
226
|
-
elsif ($line =~ /^<\/dd>$/)
|
|
227
|
-
{
|
|
228
|
-
$isPic = 0;
|
|
229
|
-
}
|
|
230
|
-
|
|
231
|
-
### Table ###
|
|
232
|
-
if ($line =~ /^<table .*>$/)
|
|
233
|
-
{
|
|
234
|
-
$text .= $line."\n"; # we need the header
|
|
235
|
-
$isTable = 1;
|
|
236
|
-
}
|
|
237
|
-
elsif ($line =~ /^<\/table>$/)
|
|
238
|
-
{
|
|
239
|
-
my $tableText = processTable($text, $isPic);
|
|
240
|
-
$allText .= $tableText;
|
|
241
|
-
|
|
242
|
-
my @tmpLines = split(/\n/, $tableText);
|
|
243
|
-
$lineId += scalar(@tmpLines);
|
|
244
|
-
|
|
245
|
-
$isTable = 0;
|
|
246
|
-
$text = "";
|
|
247
|
-
}
|
|
248
|
-
elsif($isTable)
|
|
249
|
-
{
|
|
250
|
-
$text .= $line."\n";
|
|
251
|
-
next;
|
|
252
|
-
}
|
|
253
|
-
### Paragraph ###
|
|
254
|
-
# Note: table processing should have higher priority than paragraph, i.e. the priority does matter
|
|
255
|
-
elsif ($line =~ /^<para (.*)>$/)
|
|
256
|
-
{
|
|
257
|
-
$text .= $line."\n"; # we need the header
|
|
258
|
-
$isPara = 1;
|
|
259
|
-
|
|
260
|
-
if($isMarkup)
|
|
261
|
-
{
|
|
262
|
-
$markupOutput .= "## Para $1\n";
|
|
263
|
-
}
|
|
264
|
-
}
|
|
265
|
-
elsif ($line =~ /^<\/para>$/)
|
|
266
|
-
{
|
|
267
|
-
my ($paraText, $l, $t, $r, $b);
|
|
268
|
-
($paraText, $l, $t, $r, $b, $isSpace) = processPara($text, 0, $isPic);
|
|
269
|
-
$allText .= $paraText;
|
|
270
|
-
|
|
271
|
-
my @tmpLines = split(/\n/, $paraText);
|
|
272
|
-
$lineId += scalar(@tmpLines);
|
|
273
|
-
$isPara = 0;
|
|
274
|
-
$text = "";
|
|
275
|
-
}
|
|
276
|
-
elsif($isPara)
|
|
277
|
-
{
|
|
278
|
-
$text .= $line."\n";
|
|
279
|
-
next;
|
|
280
|
-
}
|
|
281
|
-
}
|
|
282
|
-
|
|
283
|
-
close IF;
|
|
284
|
-
return $allText;
|
|
285
|
-
}
|
|
286
|
-
|
|
287
|
-
sub output
|
|
288
|
-
{
|
|
289
|
-
my ($lines, $outFile) = @_;
|
|
290
|
-
|
|
291
|
-
open(OF, ">:utf8", "$outFile") || die"#Can't open file \"$outFile\"\n";
|
|
292
|
-
|
|
293
|
-
####### Final output ############
|
|
294
|
-
# xml feature label
|
|
295
|
-
my %gFontSizeLabels = ();
|
|
296
|
-
# my %gSpaceLabels = (); # yes, no
|
|
297
|
-
|
|
298
|
-
if($isXmlFeature)
|
|
299
|
-
{
|
|
300
|
-
getFontSizeLabels(\%gFontSizeHash, \%gFontSizeLabels);
|
|
301
|
-
# getSpaceLabels(\%gSpaceHash, \%gSpaceLabels);
|
|
302
|
-
}
|
|
303
|
-
|
|
304
|
-
my $id = -1;
|
|
305
|
-
my $output = "";
|
|
306
|
-
my $paraLineId = -1;
|
|
307
|
-
my $paraLineCount = 0;
|
|
308
|
-
foreach my $line (@{$lines})
|
|
309
|
-
{
|
|
310
|
-
$id++;
|
|
311
|
-
|
|
312
|
-
$line =~ s/\cM$//; # remove ^M character at the end of each line if any
|
|
313
|
-
|
|
314
|
-
# empty lines
|
|
315
|
-
if ($line =~ /^\s*$/)
|
|
316
|
-
{
|
|
317
|
-
if (!$isAllowEmpty)
|
|
318
|
-
{
|
|
319
|
-
next;
|
|
320
|
-
}
|
|
321
|
-
else
|
|
322
|
-
{
|
|
323
|
-
if ($isDebug)
|
|
324
|
-
{
|
|
325
|
-
print STDERR "#! Line $id empty!\n";
|
|
326
|
-
}
|
|
327
|
-
}
|
|
328
|
-
}
|
|
329
|
-
|
|
330
|
-
if ($gPara[$id] eq "yes")
|
|
331
|
-
{
|
|
332
|
-
# mark para
|
|
333
|
-
if ($output ne "")
|
|
334
|
-
{
|
|
335
|
-
if ($isParaDelimiter)
|
|
336
|
-
{
|
|
337
|
-
print OF "# Para $paraLineId $paraLineCount\n$output";
|
|
338
|
-
$paraLineCount = 0;
|
|
339
|
-
}
|
|
340
|
-
else
|
|
341
|
-
{
|
|
342
|
-
if($isDecode)
|
|
343
|
-
{
|
|
344
|
-
$output = decode_entities($output);
|
|
345
|
-
}
|
|
346
|
-
print OF $output;
|
|
347
|
-
}
|
|
348
|
-
|
|
349
|
-
$output = "";
|
|
350
|
-
}
|
|
351
|
-
$paraLineId = $id;
|
|
352
|
-
}
|
|
353
|
-
|
|
354
|
-
$output .= $line;
|
|
355
|
-
$paraLineCount++;
|
|
356
|
-
|
|
357
|
-
## Output XML features
|
|
358
|
-
if ($isXmlFeature)
|
|
359
|
-
{
|
|
360
|
-
# loc feature
|
|
361
|
-
my $locFeature;
|
|
362
|
-
|
|
363
|
-
if($gPosHash[$id] != -1)
|
|
364
|
-
{
|
|
365
|
-
$locFeature = "xmlLoc_".int(($gPosHash[$id] - $gMinPos) * 8.0 / ($gMaxPos - $gMinPos + 1));
|
|
366
|
-
}
|
|
367
|
-
|
|
368
|
-
# align feature
|
|
369
|
-
my $alignFeature = "xmlAlign_".$gAlign[$id];
|
|
370
|
-
|
|
371
|
-
# fontSize feature
|
|
372
|
-
my $fontSizeFeature;
|
|
373
|
-
if($gFontSize[$id] == -1)
|
|
374
|
-
{
|
|
375
|
-
$fontSizeFeature = "xmlFontSize_none";
|
|
376
|
-
}
|
|
377
|
-
else
|
|
378
|
-
{
|
|
379
|
-
$fontSizeFeature = "xmlFontSize_".$gFontSizeLabels{$gFontSize[$id]};
|
|
380
|
-
}
|
|
381
|
-
|
|
382
|
-
my $boldFeature = "xmlBold_".$gBold[$id]; # bold feature
|
|
383
|
-
my $italicFeature = "xmlItalic_".$gItalic[$id]; # italic feature
|
|
384
|
-
my $picFeature = "xmlPic_".$gPic[$id]; # pic feature
|
|
385
|
-
my $tableFeature = "xmlTable_".$gTable[$id]; # table feature
|
|
386
|
-
my $bulletFeature = "xmlBullet_".$gBullet[$id]; # bullet feature
|
|
387
|
-
|
|
388
|
-
# space feature
|
|
389
|
-
#my $spaceFeature;
|
|
390
|
-
#if($gSpace[$id] eq "none")
|
|
391
|
-
#{
|
|
392
|
-
# $spaceFeature = "xmlSpace_none";
|
|
393
|
-
#}
|
|
394
|
-
#else
|
|
395
|
-
#{
|
|
396
|
-
# $spaceFeature = "xmlSpace_".$gSpaceLabels{$gSpace[$id]};
|
|
397
|
-
#}
|
|
398
|
-
|
|
399
|
-
# differential features
|
|
400
|
-
my ($alignDiff, $fontSizeDiff, $fontFaceDiff, $fontSFDiff, $fontSFBIDiff, $fontSFBIADiff, $paraDiff) = getDifferentialFeatures($id);
|
|
401
|
-
|
|
402
|
-
$output .= " |XML| $locFeature $boldFeature $italicFeature $fontSizeFeature $picFeature $tableFeature $bulletFeature $fontSFBIADiff $paraDiff\n";
|
|
403
|
-
# $alignFeature $alignDiff $fontSizeDiff $fontFaceDiff $fontSFDiff $fontSFBIDiff
|
|
404
|
-
}
|
|
405
|
-
else
|
|
406
|
-
{
|
|
407
|
-
$output .= "\n";
|
|
408
|
-
}
|
|
409
|
-
}
|
|
410
|
-
|
|
411
|
-
# mark para
|
|
412
|
-
if ($output ne "")
|
|
413
|
-
{
|
|
414
|
-
if ($isParaDelimiter)
|
|
415
|
-
{
|
|
416
|
-
print OF "# Para $paraLineId $paraLineCount\n$output";
|
|
417
|
-
$paraLineCount = 0;
|
|
418
|
-
}
|
|
419
|
-
else
|
|
420
|
-
{
|
|
421
|
-
if ($isDecode)
|
|
422
|
-
{
|
|
423
|
-
$output = decode_entities($output);
|
|
424
|
-
}
|
|
425
|
-
|
|
426
|
-
print OF $output;
|
|
427
|
-
}
|
|
428
|
-
$output = ""
|
|
429
|
-
}
|
|
430
|
-
close OF;
|
|
431
|
-
}
|
|
432
|
-
|
|
433
|
-
sub getDifferentialFeatures
|
|
434
|
-
{
|
|
435
|
-
my ($id) = @_;
|
|
436
|
-
|
|
437
|
-
# alignChange feature
|
|
438
|
-
my $alignDiff = "bi_xmlA_";
|
|
439
|
-
if ($id == 0)
|
|
440
|
-
{
|
|
441
|
-
$alignDiff .= $gAlign[$id];
|
|
442
|
-
}
|
|
443
|
-
elsif ($gAlign[$id] eq $gAlign[$id-1])
|
|
444
|
-
{
|
|
445
|
-
$alignDiff .= "continue";
|
|
446
|
-
}
|
|
447
|
-
else
|
|
448
|
-
{
|
|
449
|
-
$alignDiff .= $gAlign[$id];
|
|
450
|
-
}
|
|
451
|
-
|
|
452
|
-
# fontFaceChange feature
|
|
453
|
-
my $fontFaceDiff = "bi_xmlF_";
|
|
454
|
-
if ($id == 0)
|
|
455
|
-
{
|
|
456
|
-
$fontFaceDiff .= "new";
|
|
457
|
-
}
|
|
458
|
-
elsif ($gFontFace[$id] eq $gFontFace[$id-1])
|
|
459
|
-
{
|
|
460
|
-
$fontFaceDiff .= "continue";
|
|
461
|
-
}
|
|
462
|
-
else
|
|
463
|
-
{
|
|
464
|
-
$fontFaceDiff .= "new";
|
|
465
|
-
}
|
|
466
|
-
|
|
467
|
-
# fontSizeChange feature
|
|
468
|
-
my $fontSizeDiff = "bi_xmlS_";
|
|
469
|
-
if ($id == 0)
|
|
470
|
-
{
|
|
471
|
-
$fontSizeDiff .= "new";
|
|
472
|
-
}
|
|
473
|
-
elsif ($gFontSize[$id] == $gFontSize[$id-1])
|
|
474
|
-
{
|
|
475
|
-
$fontSizeDiff .= "continue";
|
|
476
|
-
}
|
|
477
|
-
else
|
|
478
|
-
{
|
|
479
|
-
$fontSizeDiff .= "new";
|
|
480
|
-
}
|
|
481
|
-
|
|
482
|
-
# fontSFChange feature
|
|
483
|
-
my $fontSFDiff = "bi_xmlSF_";
|
|
484
|
-
if($id == 0){
|
|
485
|
-
$fontSFDiff .= "new";
|
|
486
|
-
} elsif($gFontSize[$id] == $gFontSize[$id-1] && $gFontFace[$id] eq $gFontFace[$id-1]){
|
|
487
|
-
$fontSFDiff .= "continue";
|
|
488
|
-
} else {
|
|
489
|
-
$fontSFDiff .= "new";
|
|
490
|
-
}
|
|
491
|
-
|
|
492
|
-
# fontSFBIChange feature
|
|
493
|
-
my $fontSFBIDiff = "bi_xmlSFBI_";
|
|
494
|
-
if($id == 0){
|
|
495
|
-
$fontSFBIDiff .= "new";
|
|
496
|
-
} elsif($gFontSize[$id] == $gFontSize[$id-1] && $gFontFace[$id] eq $gFontFace[$id-1] && $gBold[$id] eq $gBold[$id-1] && $gItalic[$id] eq $gItalic[$id-1]){
|
|
497
|
-
$fontSFBIDiff .= "continue";
|
|
498
|
-
} else {
|
|
499
|
-
$fontSFBIDiff .= "new";
|
|
500
|
-
}
|
|
501
|
-
|
|
502
|
-
# fontSFBIAChange feature
|
|
503
|
-
my $fontSFBIADiff = "bi_xmlSFBIA_";
|
|
504
|
-
if($id == 0){
|
|
505
|
-
$fontSFBIADiff .= "new";
|
|
506
|
-
} elsif($gFontSize[$id] == $gFontSize[$id-1] && $gFontFace[$id] eq $gFontFace[$id-1] && $gBold[$id] eq $gBold[$id-1] && $gItalic[$id] eq $gItalic[$id-1] && $gAlign[$id] eq $gAlign[$id-1]){
|
|
507
|
-
$fontSFBIADiff .= "continue";
|
|
508
|
-
} else {
|
|
509
|
-
$fontSFBIADiff .= "new";
|
|
510
|
-
}
|
|
511
|
-
|
|
512
|
-
# para change feature
|
|
513
|
-
my $paraDiff = "bi_xmlPara_";
|
|
514
|
-
if($id < $bodyStartId){ # header part, consider each line as a separate paragraph
|
|
515
|
-
$paraDiff .= "header";
|
|
516
|
-
} else {
|
|
517
|
-
if($gPara[$id] eq "yes"){
|
|
518
|
-
$paraDiff .= "new";
|
|
519
|
-
} else {
|
|
520
|
-
$paraDiff .= "continue";
|
|
521
|
-
}
|
|
522
|
-
}
|
|
523
|
-
|
|
524
|
-
return ($alignDiff, $fontSizeDiff, $fontFaceDiff, $fontSFDiff, $fontSFBIDiff, $fontSFBIADiff, $paraDiff);
|
|
525
|
-
}
|
|
526
|
-
|
|
527
|
-
sub getFontSizeLabels {
|
|
528
|
-
my ($gFontSizeHash, $gFontSizeLabels) = @_;
|
|
529
|
-
|
|
530
|
-
if($isDebug){ print STDERR "# Map fonts\n"; }
|
|
531
|
-
my @sortedFonts = sort { $gFontSizeHash->{$b} <=> $gFontSizeHash->{$a} } keys %{$gFontSizeHash}; # sort by values, obtain keys
|
|
532
|
-
|
|
533
|
-
my $commonSize = $sortedFonts[0];
|
|
534
|
-
@sortedFonts = sort { $a <=> $b } keys %{$gFontSizeHash}; # sort by keys, obtain keys
|
|
535
|
-
my $commonIndex = 0; # index of common font size
|
|
536
|
-
foreach(@sortedFonts){
|
|
537
|
-
if($commonSize == $_) { # found
|
|
538
|
-
last;
|
|
539
|
-
}
|
|
540
|
-
$commonIndex++;
|
|
541
|
-
}
|
|
542
|
-
|
|
543
|
-
# small fonts
|
|
544
|
-
for(my $i = 0; $i<$commonIndex; $i++){ # smallIndex $largeIndex
|
|
545
|
-
$gFontSizeLabels->{$sortedFonts[$i]} = "smaller";
|
|
546
|
-
|
|
547
|
-
if($isDebug){
|
|
548
|
-
print STDERR "$sortedFonts[$i] --> $gFontSizeLabels->{$sortedFonts[$i]}, freq = $gFontSizeHash->{$sortedFonts[$i]}\n";
|
|
549
|
-
}
|
|
550
|
-
}
|
|
551
|
-
|
|
552
|
-
# common fonts
|
|
553
|
-
$gFontSizeLabels->{$commonSize} = "common";
|
|
554
|
-
if($isDebug){
|
|
555
|
-
print STDERR "$sortedFonts[$commonIndex] --> $gFontSizeLabels->{$sortedFonts[$commonIndex]}, freq = $gFontSizeHash->{$sortedFonts[$commonIndex]}\n";
|
|
556
|
-
}
|
|
557
|
-
|
|
558
|
-
# large fonts
|
|
559
|
-
for(my $i = ($commonIndex+1); $i<scalar(@sortedFonts); $i++){ # ($largeIndex+1) (scalar(@sortedFonts)-1)
|
|
560
|
-
if((scalar(@sortedFonts)-$i) <= 3){
|
|
561
|
-
$gFontSizeLabels->{$sortedFonts[$i]} = "largest".($i+1-scalar(@sortedFonts));
|
|
562
|
-
} else {
|
|
563
|
-
$gFontSizeLabels->{$sortedFonts[$i]} = "larger";
|
|
564
|
-
}
|
|
565
|
-
|
|
566
|
-
if($isDebug){
|
|
567
|
-
print STDERR "$sortedFonts[$i] --> $gFontSizeLabels->{$sortedFonts[$i]}, freq = $gFontSizeHash->{$sortedFonts[$i]}\n";
|
|
568
|
-
}
|
|
569
|
-
}
|
|
570
|
-
}
|
|
571
|
-
|
|
572
|
-
sub getSpaceLabels {
|
|
573
|
-
my ($gSpaceHash, $gSpaceLabels) = @_;
|
|
574
|
-
|
|
575
|
-
if($isDebug){
|
|
576
|
-
print STDERR "\n# Map space\n";
|
|
577
|
-
}
|
|
578
|
-
my @sortedSpaces = sort { $gSpaceHash->{$b} <=> $gSpaceHash->{$a} } keys %{$gSpaceHash}; # sort by freqs, obtain space faces
|
|
579
|
-
|
|
580
|
-
my $commonSpace = $sortedSpaces[0];
|
|
581
|
-
my $commonFreq = $gSpaceHash->{$commonSpace};
|
|
582
|
-
# find similar common freq with larger spaces
|
|
583
|
-
for(my $i = 0; $i<scalar(@sortedSpaces); $i++){ # 0 ($smallIndex-1)
|
|
584
|
-
my $freq = $gSpaceHash->{$sortedSpaces[$i]};
|
|
585
|
-
if($freq/$commonFreq > 0.8){
|
|
586
|
-
if($sortedSpaces[$i] > $commonSpace){
|
|
587
|
-
$commonSpace = $sortedSpaces[$i];
|
|
588
|
-
}
|
|
589
|
-
} else {
|
|
590
|
-
last;
|
|
591
|
-
}
|
|
592
|
-
}
|
|
593
|
-
|
|
594
|
-
for(my $i = 0; $i<scalar(@sortedSpaces); $i++){ # 0 ($smallIndex-1)
|
|
595
|
-
if($sortedSpaces[$i] > $commonSpace){
|
|
596
|
-
$gSpaceLabels->{$sortedSpaces[$i]} = "yes";
|
|
597
|
-
} else {
|
|
598
|
-
$gSpaceLabels->{$sortedSpaces[$i]} = "no";
|
|
599
|
-
}
|
|
600
|
-
|
|
601
|
-
if($isDebug){
|
|
602
|
-
print STDERR "$sortedSpaces[$i] --> $gSpaceLabels->{$sortedSpaces[$i]}, freq = $gSpaceHash->{$sortedSpaces[$i]}\n";
|
|
603
|
-
}
|
|
604
|
-
}
|
|
605
|
-
}
|
|
606
|
-
|
|
607
|
-
sub processTable
|
|
608
|
-
{
|
|
609
|
-
my ($inputText, $isPic) = @_;
|
|
610
|
-
|
|
611
|
-
my $isCell = 0; # for table cell object
|
|
612
|
-
|
|
613
|
-
my $allText = "";
|
|
614
|
-
my $text = "";
|
|
615
|
-
|
|
616
|
-
my @lines = split(/\n/, $inputText);
|
|
617
|
-
my %tablePos = (); # $tablePos{$cellText} = "$l-$t-$r-$bottom"
|
|
618
|
-
my %table = (); # $table{$row}->{$col} = \@paraTexts
|
|
619
|
-
my $rowFrom;
|
|
620
|
-
my $colFrom;
|
|
621
|
-
my $rowTill;
|
|
622
|
-
my $colTill;
|
|
623
|
-
|
|
624
|
-
# xml feature
|
|
625
|
-
my $align = "none";
|
|
626
|
-
my $pos = -1;
|
|
627
|
-
foreach my $line (@lines)
|
|
628
|
-
{
|
|
629
|
-
if ($line =~ /^<table (.+?)>$/)
|
|
630
|
-
{
|
|
631
|
-
my $attr = $1;
|
|
632
|
-
|
|
633
|
-
if($isMarkup) { $markupOutput .= "### Table $attr\n"; }
|
|
634
|
-
|
|
635
|
-
# Fix: wrong regex sequence, huydhn
|
|
636
|
-
#if ($attr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*alignment=\"(.+?)\".*$/)
|
|
637
|
-
#{
|
|
638
|
-
# my ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
|
|
639
|
-
# $align = $5;
|
|
640
|
-
|
|
641
|
-
# # pos feature
|
|
642
|
-
# $pos = ($t+$bottom)/2.0;
|
|
643
|
-
|
|
644
|
-
# if($pos < $gMinPos) { $gMinPos = $pos; }
|
|
645
|
-
# if($pos > $gMaxPos) { $gMaxPos = $pos; }
|
|
646
|
-
#}
|
|
647
|
-
#else
|
|
648
|
-
#{
|
|
649
|
-
# print STDERR "# no table alignment or location \"$line\"\n";
|
|
650
|
-
# $align = "";
|
|
651
|
-
#}
|
|
652
|
-
|
|
653
|
-
my ($l, $t, $r, $bottom) = undef;
|
|
654
|
-
if ($attr =~ /^.*l=\"(\d+)\".*$/) { $l = $1; }
|
|
655
|
-
if ($attr =~ /^.*t=\"(\d+)\".*$/) { $t = $1; }
|
|
656
|
-
if ($attr =~ /^.*r=\"(\d+)\".*$/) { $r = $1; }
|
|
657
|
-
if ($attr =~ /^.*b=\"(\d+)\".*$/) { $bottom = $1; }
|
|
658
|
-
|
|
659
|
-
if ($t && $bottom)
|
|
660
|
-
{
|
|
661
|
-
# pos feature
|
|
662
|
-
$pos = ($t + $bottom) / 2.0;
|
|
663
|
-
|
|
664
|
-
if($pos < $gMinPos) { $gMinPos = $pos; }
|
|
665
|
-
if($pos > $gMaxPos) { $gMaxPos = $pos; }
|
|
666
|
-
}
|
|
667
|
-
else
|
|
668
|
-
{
|
|
669
|
-
die "# Undefined table location \"$line\"\n";
|
|
670
|
-
}
|
|
671
|
-
|
|
672
|
-
if ($attr =~ /^.*alignment=\"(\d+)\".*$/)
|
|
673
|
-
{
|
|
674
|
-
$align = $1;
|
|
675
|
-
}
|
|
676
|
-
else
|
|
677
|
-
{
|
|
678
|
-
print STDERR "# no table alignment \"$line\"\n";
|
|
679
|
-
$align = "";
|
|
680
|
-
}
|
|
681
|
-
# End.
|
|
682
|
-
}
|
|
683
|
-
elsif ($line =~ /^<cell .*gridColFrom=\"(\d+)\" gridColTill=\"(\d+)\" gridRowFrom=\"(\d+)\" gridRowTill=\"(\d+)\".*>$/) # new cell
|
|
684
|
-
{
|
|
685
|
-
$colFrom = $1;
|
|
686
|
-
$colTill = $2;
|
|
687
|
-
$rowFrom = $3;
|
|
688
|
-
$rowTill = $4;
|
|
689
|
-
#print STDERR "$rowFrom $rowTill $colFrom $colTill\n";
|
|
690
|
-
$isCell = 1;
|
|
691
|
-
}
|
|
692
|
-
elsif ($line =~ /^<\/cell>$/) # end cell
|
|
693
|
-
{
|
|
694
|
-
my @paraTexts = ();
|
|
695
|
-
processCell($text, \@paraTexts, \%tablePos, $isPic);
|
|
696
|
-
|
|
697
|
-
for(my $i = $rowFrom; $i<=$rowTill; $i++)
|
|
698
|
-
{
|
|
699
|
-
for(my $j = $colFrom; $j<=$colTill; $j++)
|
|
700
|
-
{
|
|
701
|
-
if(!$table{$i}) { $table{$i} = (); }
|
|
702
|
-
if(!$table{$i}->{$j}) { $table{$i}->{$j} = (); }
|
|
703
|
-
|
|
704
|
-
if($i == $rowFrom && $j == $colFrom)
|
|
705
|
-
{
|
|
706
|
-
push(@{$table{$i}->{$j}}, @paraTexts);
|
|
707
|
-
if(scalar(@paraTexts) > 1) { last; }
|
|
708
|
-
}
|
|
709
|
-
else
|
|
710
|
-
{
|
|
711
|
-
push(@{$table{$i}->{$j}}, ""); #add stub "" for spanning rows or cols
|
|
712
|
-
}
|
|
713
|
-
}
|
|
714
|
-
}
|
|
715
|
-
|
|
716
|
-
$isCell = 0;
|
|
717
|
-
$text = "";
|
|
718
|
-
}
|
|
719
|
-
elsif($isCell)
|
|
720
|
-
{
|
|
721
|
-
$text .= $line."\n";
|
|
722
|
-
next;
|
|
723
|
-
}
|
|
724
|
-
}
|
|
725
|
-
|
|
726
|
-
# note: such a complicated code is because in the normal node, Omnipage doesn't seem to strictly print column by column given a row is fixed.
|
|
727
|
-
# E.g if col1: paraText1, col2: paraText21\n$paraText22, and col3: paraText31\n$paraText32
|
|
728
|
-
# It will print paraText1\tparaText21\tparaText31\n\t$paraText22\t$paraText32
|
|
729
|
-
my @sortedRows = sort {$a <=> $b} keys %table;
|
|
730
|
-
my $isFirstLinePara = 1;
|
|
731
|
-
foreach my $row (@sortedRows)
|
|
732
|
-
{
|
|
733
|
-
my %tableR = %{$table{$row}};
|
|
734
|
-
my @sortedCols = sort {$a <=> $b} keys %tableR;
|
|
735
|
-
while(1)
|
|
736
|
-
{
|
|
737
|
-
my $isStop = 1;
|
|
738
|
-
my $rowText = "";
|
|
739
|
-
|
|
740
|
-
foreach my $col (@sortedCols)
|
|
741
|
-
{
|
|
742
|
-
# there's still some thing to process
|
|
743
|
-
if(scalar(@{$tableR{$col}}) > 0)
|
|
744
|
-
{
|
|
745
|
-
$isStop = 0;
|
|
746
|
-
$rowText .= shift(@{$tableR{$col}});
|
|
747
|
-
}
|
|
748
|
-
$rowText .= "\t";
|
|
749
|
-
}
|
|
750
|
-
|
|
751
|
-
if ((!$isAllowEmpty && $rowText =~ /^\s*$/) || ($isAllowEmpty && $rowText eq ""))
|
|
752
|
-
{
|
|
753
|
-
$isStop = 1;
|
|
754
|
-
}
|
|
755
|
-
|
|
756
|
-
if($isStop)
|
|
757
|
-
{
|
|
758
|
-
last;
|
|
759
|
-
}
|
|
760
|
-
else
|
|
761
|
-
{
|
|
762
|
-
$rowText =~ s/\t$/\n/;
|
|
763
|
-
$allText .= $rowText;
|
|
764
|
-
# print STDERR "$rowText";
|
|
765
|
-
|
|
766
|
-
# para
|
|
767
|
-
if($isFirstLinePara)
|
|
768
|
-
{
|
|
769
|
-
push(@gPara, "yes");
|
|
770
|
-
$isFirstLinePara = 0;
|
|
771
|
-
}
|
|
772
|
-
else
|
|
773
|
-
{
|
|
774
|
-
push(@gPara, "no");
|
|
775
|
-
}
|
|
776
|
-
|
|
777
|
-
if($isXmlFeature)
|
|
778
|
-
{
|
|
779
|
-
# table feature
|
|
780
|
-
push(@gTable, "yes");
|
|
781
|
-
|
|
782
|
-
# pic feature
|
|
783
|
-
if($isPic)
|
|
784
|
-
{
|
|
785
|
-
push(@gPic, "yes");
|
|
786
|
-
}
|
|
787
|
-
else
|
|
788
|
-
{
|
|
789
|
-
push(@gPic, "no");
|
|
790
|
-
}
|
|
791
|
-
|
|
792
|
-
push(@gPosHash, $pos); # update xml pos value
|
|
793
|
-
push(@gAlign, $align); # update xml alignment value
|
|
794
|
-
|
|
795
|
-
### Not assign value ###
|
|
796
|
-
push(@gFontSize, -1); # fontSize feature
|
|
797
|
-
push(@gFontFace, "none"); # fontFace feature
|
|
798
|
-
push(@gBold, "no"); # bold feature
|
|
799
|
-
push(@gItalic, "no"); # italic feature
|
|
800
|
-
push(@gBullet, "no"); # bullet feature
|
|
801
|
-
# push(@gSpace, "none"); # space feature
|
|
802
|
-
} # end if xml feature
|
|
803
|
-
}
|
|
804
|
-
}
|
|
805
|
-
}
|
|
806
|
-
|
|
807
|
-
return $allText;
|
|
808
|
-
}
|
|
809
|
-
|
|
810
|
-
sub processCell
|
|
811
|
-
{
|
|
812
|
-
my ($inputText, $paraTexts, $tablePos, $isPic) = @_;
|
|
813
|
-
|
|
814
|
-
my $text = "";
|
|
815
|
-
my @lines = split(/\n/, $inputText);
|
|
816
|
-
my $isPara = 0;
|
|
817
|
-
my $flag = 0;
|
|
818
|
-
foreach my $line (@lines)
|
|
819
|
-
{
|
|
820
|
-
if ($line =~ /^<para (.*)>$/)
|
|
821
|
-
{
|
|
822
|
-
$text .= $line."\n"; # we need the header
|
|
823
|
-
$isPara = 1;
|
|
824
|
-
|
|
825
|
-
if($isMarkup)
|
|
826
|
-
{
|
|
827
|
-
$markupOutput .= "## ParaTable $1\n";
|
|
828
|
-
}
|
|
829
|
-
}
|
|
830
|
-
elsif ($line =~ /^<\/para>$/)
|
|
831
|
-
{
|
|
832
|
-
my ($paraText, $l, $t, $r, $b) = processPara($text, 1, $isPic);
|
|
833
|
-
my @tokens = split(/\n/, $paraText);
|
|
834
|
-
|
|
835
|
-
foreach my $token (@tokens)
|
|
836
|
-
{
|
|
837
|
-
if($token ne "")
|
|
838
|
-
{
|
|
839
|
-
push(@{$paraTexts}, $token);
|
|
840
|
-
$flag = 1;
|
|
841
|
-
}
|
|
842
|
-
}
|
|
843
|
-
|
|
844
|
-
if(!$tablePos->{$paraText})
|
|
845
|
-
{
|
|
846
|
-
$tablePos->{$paraText} = "$l-$t-$r-$b";
|
|
847
|
-
}
|
|
848
|
-
else
|
|
849
|
-
{
|
|
850
|
-
#print STDERR "#! Warning: in method processCell, encounter the same paraText $paraText\n";
|
|
851
|
-
}
|
|
852
|
-
|
|
853
|
-
$isPara = 0;
|
|
854
|
-
$text = "";
|
|
855
|
-
}
|
|
856
|
-
elsif ($isPara)
|
|
857
|
-
{
|
|
858
|
-
$text .= $line."\n";
|
|
859
|
-
next;
|
|
860
|
-
}
|
|
861
|
-
}
|
|
862
|
-
|
|
863
|
-
# at least one value should be added for cell which is ""
|
|
864
|
-
if($flag == 0)
|
|
865
|
-
{
|
|
866
|
-
push(@{$paraTexts}, "");
|
|
867
|
-
}
|
|
868
|
-
}
|
|
869
|
-
|
|
870
|
-
sub getAttrValue {
|
|
871
|
-
my ($attrText, $attr) = @_;
|
|
872
|
-
|
|
873
|
-
my $value = "none";
|
|
874
|
-
if($attrText =~ /^.*$attr=\"(.+?)\".*$/){
|
|
875
|
-
$value = $1;
|
|
876
|
-
}
|
|
877
|
-
|
|
878
|
-
return $value;
|
|
879
|
-
}
|
|
880
|
-
|
|
881
|
-
sub checkFontAttr {
|
|
882
|
-
my ($attrText, $attr, $attrHash, $count) = @_;
|
|
883
|
-
|
|
884
|
-
if($attrText =~ /^.*$attr=\"(.+?)\".*$/){
|
|
885
|
-
my $attrValue = $1;
|
|
886
|
-
|
|
887
|
-
$attrHash->{$attrValue} = $attrHash->{$attrValue} ? ($attrHash->{$attrValue}+$count) : $count;
|
|
888
|
-
}
|
|
889
|
-
}
|
|
890
|
-
|
|
891
|
-
sub processPara
|
|
892
|
-
{
|
|
893
|
-
my ($inputText, $isCell, $isPic) = @_;
|
|
894
|
-
|
|
895
|
-
my $isSpace = 0;
|
|
896
|
-
my $isSpecialSpace = 0;
|
|
897
|
-
my $isTab = 0;
|
|
898
|
-
my $isBullet = 0;
|
|
899
|
-
|
|
900
|
-
my $isForcedEOF = "none"; # 3 signals for end of L: forcedEOF=\"true\" in attribute of <ln> or || <nl orig=\"true\"\/> || end of </para> without encountering any of the above signal in the para plus $isSpace = 0
|
|
901
|
-
# xml feature
|
|
902
|
-
my $align = "none";
|
|
903
|
-
my ($l, $t, $r, $bottom);
|
|
904
|
-
my %fontSizeHash = ();
|
|
905
|
-
my %fontFaceHash = ();
|
|
906
|
-
my @boldArray = ();
|
|
907
|
-
my @italicArray = ();
|
|
908
|
-
my $space = "none";
|
|
909
|
-
|
|
910
|
-
my $lnAttr; my $isLn = 0; my $lnBold = "none"; my $lnItalic = "none";
|
|
911
|
-
my $runAttr; my $runText = ""; my $isRun = 0; my $runBold = "none"; my $runItalic = "none";
|
|
912
|
-
my $wdAttr; my $wdText = ""; my $isWd = 0;
|
|
913
|
-
|
|
914
|
-
my $wdIndex = 0; # word index in a line. When encountering </ln>, this parameter indicates the number of words in a line
|
|
915
|
-
my $lnBoldCount = 0;
|
|
916
|
-
my $lnItalicCount = 0;
|
|
917
|
-
|
|
918
|
-
my $allText = "";
|
|
919
|
-
my $text = ""; #invariant: when never enter a new line, $text will be copied into $allText, and $text is cleared
|
|
920
|
-
|
|
921
|
-
binmode(STDERR, ":utf8");
|
|
922
|
-
|
|
923
|
-
my $isFirstLinePara = 1;
|
|
924
|
-
my @lines = split(/\n/, $inputText);
|
|
925
|
-
for (my $i=0; $i<scalar(@lines); $i++)
|
|
926
|
-
{
|
|
927
|
-
my $line = $lines[$i];
|
|
928
|
-
|
|
929
|
-
## new para
|
|
930
|
-
if ($line =~ /^<para (.+?)>$/)
|
|
931
|
-
{
|
|
932
|
-
my $attr = $1;
|
|
933
|
-
$align = getAttrValue($attr, "alignment");
|
|
934
|
-
# $indent = getAttrValue($attr, "li");
|
|
935
|
-
$space = getAttrValue($attr, "spaceBefore");
|
|
936
|
-
}
|
|
937
|
-
## new ln
|
|
938
|
-
elsif ($line =~ /^<ln (.+)>$/)
|
|
939
|
-
{
|
|
940
|
-
$lnAttr = $1;
|
|
941
|
-
$isLn = 1;
|
|
942
|
-
|
|
943
|
-
if ($isMarkup)
|
|
944
|
-
{
|
|
945
|
-
$markupOutput .= "# Line $lnAttr\n";
|
|
946
|
-
}
|
|
947
|
-
|
|
948
|
-
# Fix: wrong regex sequence, huydhn
|
|
949
|
-
#if ($lnAttr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*$/)
|
|
950
|
-
#{
|
|
951
|
-
# ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
|
|
952
|
-
#}
|
|
953
|
-
|
|
954
|
-
if ($lnAttr =~ /^.*l=\"(\d+)\".*$/) { $l = $1; } else { $l = undef; }
|
|
955
|
-
if ($lnAttr =~ /^.*t=\"(\d+)\".*$/) { $t = $1; } else { $t = undef; }
|
|
956
|
-
if ($lnAttr =~ /^.*r=\"(\d+)\".*$/) { $r = $1; } else { $r = undef; }
|
|
957
|
-
if ($lnAttr =~ /^.*b=\"(\d+)\".*$/) { $bottom = $1; } else { $bottom = undef; }
|
|
958
|
-
# End.
|
|
959
|
-
|
|
960
|
-
$isForcedEOF = getAttrValue($lnAttr, "forcedEOF");
|
|
961
|
-
|
|
962
|
-
# Bold & Italic
|
|
963
|
-
if($isXmlFeature)
|
|
964
|
-
{
|
|
965
|
-
$lnBold = getAttrValue($lnAttr, "bold");
|
|
966
|
-
$lnItalic = getAttrValue($lnAttr, "italic");
|
|
967
|
-
}
|
|
968
|
-
}
|
|
969
|
-
## new run
|
|
970
|
-
elsif ($line =~ /<run (.*)>$/)
|
|
971
|
-
{
|
|
972
|
-
$runAttr = $1;
|
|
973
|
-
|
|
974
|
-
$isSpace = 0;
|
|
975
|
-
$isTab = 0;
|
|
976
|
-
$isRun = 1;
|
|
977
|
-
|
|
978
|
-
# new wd, that consists of many runs
|
|
979
|
-
if($line =~ /^<wd (.*?)>/)
|
|
980
|
-
{
|
|
981
|
-
$isWd = 1;
|
|
982
|
-
$wdAttr = $1;
|
|
983
|
-
}
|
|
984
|
-
|
|
985
|
-
# Bold & Italic
|
|
986
|
-
if ($isXmlFeature)
|
|
987
|
-
{
|
|
988
|
-
$runBold = getAttrValue($runAttr, "bold");
|
|
989
|
-
$runItalic = getAttrValue($runAttr, "italic");
|
|
990
|
-
}
|
|
991
|
-
}
|
|
992
|
-
## wd
|
|
993
|
-
elsif ($line =~ /^<wd (.+)?>(.+)<\/wd>$/)
|
|
994
|
-
{
|
|
995
|
-
$wdAttr = $1;
|
|
996
|
-
my $word = $2;
|
|
997
|
-
$isSpace = 0;
|
|
998
|
-
$isTab = 0;
|
|
999
|
-
|
|
1000
|
-
if ($isMarkup)
|
|
1001
|
-
{
|
|
1002
|
-
$markupOutput .= "$word $wdAttr\n";
|
|
1003
|
-
}
|
|
1004
|
-
|
|
1005
|
-
# FontSize & FontFace
|
|
1006
|
-
if($isXmlFeature)
|
|
1007
|
-
{
|
|
1008
|
-
checkFontAttr($wdAttr, "fontSize", \%fontSizeHash, 1);
|
|
1009
|
-
checkFontAttr($wdAttr, "fontFace", \%fontFaceHash, 1);
|
|
1010
|
-
}
|
|
1011
|
-
|
|
1012
|
-
# Bold & Italic
|
|
1013
|
-
if($isXmlFeature)
|
|
1014
|
-
{
|
|
1015
|
-
my $wdBold = getAttrValue($wdAttr, "bold");
|
|
1016
|
-
my $wdItalic = getAttrValue($wdAttr, "italic");
|
|
1017
|
-
|
|
1018
|
-
if($wdBold eq "true" || $runBold eq "true" || $lnBold eq "true")
|
|
1019
|
-
{
|
|
1020
|
-
$boldArray[$wdIndex] = 1;
|
|
1021
|
-
$lnBoldCount++;
|
|
1022
|
-
}
|
|
1023
|
-
|
|
1024
|
-
if($wdItalic eq "true" || $runItalic eq "true" || $lnItalic eq "true")
|
|
1025
|
-
{
|
|
1026
|
-
$italicArray[$wdIndex] = 1;
|
|
1027
|
-
$lnItalicCount++;
|
|
1028
|
-
}
|
|
1029
|
-
} # if($isXmlFeature)
|
|
1030
|
-
|
|
1031
|
-
## add text
|
|
1032
|
-
$text .= "$word";
|
|
1033
|
-
|
|
1034
|
-
if($isRun) { $runText .= "$word "; }
|
|
1035
|
-
$wdIndex++;
|
|
1036
|
-
}
|
|
1037
|
-
## end wd
|
|
1038
|
-
elsif ($line =~ /^<\/wd>$/)
|
|
1039
|
-
{
|
|
1040
|
-
$isWd = 0;
|
|
1041
|
-
|
|
1042
|
-
if($isMarkup)
|
|
1043
|
-
{
|
|
1044
|
-
$markupOutput .= "$wdText $wdAttr\n";
|
|
1045
|
-
$wdAttr = "";
|
|
1046
|
-
}
|
|
1047
|
-
}
|
|
1048
|
-
## end run
|
|
1049
|
-
elsif ($line =~ /^(.*)<\/run>$/)
|
|
1050
|
-
{
|
|
1051
|
-
my $word = $1;
|
|
1052
|
-
|
|
1053
|
-
## add text
|
|
1054
|
-
if($word ne "")
|
|
1055
|
-
{
|
|
1056
|
-
# Bold & Italic
|
|
1057
|
-
if($isXmlFeature)
|
|
1058
|
-
{
|
|
1059
|
-
if($runBold eq "true" || $lnBold eq "true")
|
|
1060
|
-
{
|
|
1061
|
-
$boldArray[$wdIndex] = 1;
|
|
1062
|
-
$lnBoldCount++;
|
|
1063
|
-
}
|
|
1064
|
-
|
|
1065
|
-
if($runItalic eq "true" || $lnItalic eq "true")
|
|
1066
|
-
{
|
|
1067
|
-
$italicArray[$wdIndex] = 1;
|
|
1068
|
-
$lnItalicCount++;
|
|
1069
|
-
}
|
|
1070
|
-
}
|
|
1071
|
-
|
|
1072
|
-
# appear in the final result
|
|
1073
|
-
if($isLn) { $text .= "$word"; }
|
|
1074
|
-
|
|
1075
|
-
# for internal record
|
|
1076
|
-
if($isRun) { $runText .= "$word "; }
|
|
1077
|
-
if($isWd) { $wdText .= "$word"; }
|
|
1078
|
-
|
|
1079
|
-
$wdIndex++;
|
|
1080
|
-
}
|
|
1081
|
-
|
|
1082
|
-
# xml feature
|
|
1083
|
-
# not a space, tab or new-line run
|
|
1084
|
-
if($isXmlFeature && $runText ne "")
|
|
1085
|
-
{
|
|
1086
|
-
my @words = split(/\s+/, $runText);
|
|
1087
|
-
my $numWords = scalar(@words);
|
|
1088
|
-
checkFontAttr($runAttr, "fontSize", \%fontSizeHash, $numWords);
|
|
1089
|
-
checkFontAttr($runAttr, "fontFace", \%fontFaceHash, $numWords);
|
|
1090
|
-
}
|
|
1091
|
-
|
|
1092
|
-
# reset run
|
|
1093
|
-
# <run> not enclosed within <ln>
|
|
1094
|
-
if (!$isLn) { $wdIndex = 0; }
|
|
1095
|
-
$runText = "";
|
|
1096
|
-
$isRun = 0;
|
|
1097
|
-
$isSpecialSpace = 0;
|
|
1098
|
-
|
|
1099
|
-
# Bold & Italic
|
|
1100
|
-
if($isXmlFeature)
|
|
1101
|
-
{
|
|
1102
|
-
$runBold = "none";
|
|
1103
|
-
$runItalic = "none";
|
|
1104
|
-
|
|
1105
|
-
# <run> not enclosed within <ln>
|
|
1106
|
-
if (!$isLn)
|
|
1107
|
-
{
|
|
1108
|
-
$lnBoldCount = 0;
|
|
1109
|
-
$lnItalicCount = 0;
|
|
1110
|
-
}
|
|
1111
|
-
}
|
|
1112
|
-
}
|
|
1113
|
-
## end ln
|
|
1114
|
-
elsif ($line =~ /^<\/ln>$/)
|
|
1115
|
-
{
|
|
1116
|
-
if((!$isAllowEmpty && $text !~ /^\s*$/) || ($isAllowEmpty && $text ne ""))
|
|
1117
|
-
{
|
|
1118
|
-
# there's a forced EOL?
|
|
1119
|
-
# not an emply line with space character
|
|
1120
|
-
if($isForcedEOF eq "true" || (!$isCell && !$isSpecialSpace) )
|
|
1121
|
-
{
|
|
1122
|
-
$text .= "\n";
|
|
1123
|
-
|
|
1124
|
-
# update allText
|
|
1125
|
-
$allText .= $text;
|
|
1126
|
-
$text = "";
|
|
1127
|
-
}
|
|
1128
|
-
|
|
1129
|
-
my $numWords = $wdIndex;
|
|
1130
|
-
|
|
1131
|
-
if (!$isCell)
|
|
1132
|
-
{
|
|
1133
|
-
if ($isFirstLinePara)
|
|
1134
|
-
{
|
|
1135
|
-
push(@gPara, "yes");
|
|
1136
|
-
$isFirstLinePara = 0;
|
|
1137
|
-
}
|
|
1138
|
-
else
|
|
1139
|
-
{
|
|
1140
|
-
push(@gPara, "no");
|
|
1141
|
-
}
|
|
1142
|
-
}
|
|
1143
|
-
|
|
1144
|
-
if ($isXmlFeature && $numWords >= 1)
|
|
1145
|
-
{
|
|
1146
|
-
# xml feature
|
|
1147
|
-
# assumtion that: fontSize is either occur in <ln>, or within multiple <run> under <ln>, but not both
|
|
1148
|
-
checkFontAttr($lnAttr, "fontSize", \%fontSizeHash, $numWords);
|
|
1149
|
-
checkFontAttr($lnAttr, "fontFace", \%fontFaceHash, $numWords);
|
|
1150
|
-
}
|
|
1151
|
-
|
|
1152
|
-
if ($isXmlFeature && !$isCell && !$isSpecialSpace)
|
|
1153
|
-
{
|
|
1154
|
-
my $pos = ($t+$bottom)/2.0;
|
|
1155
|
-
if ($pos < $gMinPos) { $gMinPos = $pos; }
|
|
1156
|
-
if ($pos > $gMaxPos) { $gMaxPos = $pos; }
|
|
1157
|
-
|
|
1158
|
-
push(@gPosHash, $pos); # pos feature
|
|
1159
|
-
push(@gAlign, $align); # alignment feature
|
|
1160
|
-
push(@gTable, "no"); # table feature
|
|
1161
|
-
|
|
1162
|
-
# pic feature
|
|
1163
|
-
if($isPic)
|
|
1164
|
-
{
|
|
1165
|
-
push(@gPic, "yes");
|
|
1166
|
-
|
|
1167
|
-
### Not assign value ###
|
|
1168
|
-
push(@gFontSize, -1); # bold feature
|
|
1169
|
-
push(@gFontFace, "none"); # bold feature
|
|
1170
|
-
push(@gBold, "no"); # bold feature
|
|
1171
|
-
push(@gItalic, "no"); # italic feature
|
|
1172
|
-
push(@gBullet, "no"); # bullet feature
|
|
1173
|
-
}
|
|
1174
|
-
else
|
|
1175
|
-
{
|
|
1176
|
-
push(@gPic, "no");
|
|
1177
|
-
|
|
1178
|
-
updateXMLFontFeature(\%fontSizeHash, \%fontFaceHash);
|
|
1179
|
-
%fontSizeHash = (); %fontFaceHash = ();
|
|
1180
|
-
|
|
1181
|
-
updateXMLFeatures($lnBoldCount, $lnItalicCount, $numWords, $isBullet, $space);
|
|
1182
|
-
} # end if pic
|
|
1183
|
-
} # end if($isXmlFeature && !$isCell && !$isSpecialSpace)
|
|
1184
|
-
}
|
|
1185
|
-
|
|
1186
|
-
## reset ln
|
|
1187
|
-
$isLn = 0;
|
|
1188
|
-
$isForcedEOF = "none";
|
|
1189
|
-
$isSpecialSpace = 0;
|
|
1190
|
-
$wdIndex = 0;
|
|
1191
|
-
|
|
1192
|
-
# Bold & Italic
|
|
1193
|
-
if ($isXmlFeature)
|
|
1194
|
-
{
|
|
1195
|
-
$lnBold = "none";
|
|
1196
|
-
$lnItalic = "none";
|
|
1197
|
-
|
|
1198
|
-
$lnBoldCount = 0;
|
|
1199
|
-
$lnItalicCount = 0;
|
|
1200
|
-
}
|
|
1201
|
-
} # end else </ln>
|
|
1202
|
-
## nl newline signal
|
|
1203
|
-
elsif ($line =~ /^<nl orig=\"true\"\/>$/)
|
|
1204
|
-
{
|
|
1205
|
-
if($isLn)
|
|
1206
|
-
{
|
|
1207
|
-
$isSpace = 0;
|
|
1208
|
-
}
|
|
1209
|
-
else
|
|
1210
|
-
{
|
|
1211
|
-
if($isDebug)
|
|
1212
|
-
{
|
|
1213
|
-
print STDERR "#!!! Warning: found <nl orig=\"true\"\/> while not in tag <ln>: $line\n";
|
|
1214
|
-
}
|
|
1215
|
-
}
|
|
1216
|
-
}
|
|
1217
|
-
## space
|
|
1218
|
-
elsif ($line =~ /^<space\/>$/)
|
|
1219
|
-
{
|
|
1220
|
-
my $startTag = "";
|
|
1221
|
-
my $endTag = "";
|
|
1222
|
-
if($i>0 && $lines[$i-1] =~ /^<(.+?)\b.*/)
|
|
1223
|
-
{
|
|
1224
|
-
$startTag = $1;
|
|
1225
|
-
}
|
|
1226
|
-
|
|
1227
|
-
if($i < (scalar(@lines) -1) && $lines[$i+1] =~ /^<\/(.+)>/)
|
|
1228
|
-
{
|
|
1229
|
-
$endTag = $1;
|
|
1230
|
-
}
|
|
1231
|
-
|
|
1232
|
-
if($startTag eq $endTag && $startTag ne "")
|
|
1233
|
-
{
|
|
1234
|
-
# print STDERR "# Special space after \"$text\"\n";
|
|
1235
|
-
$isSpecialSpace = 1;
|
|
1236
|
-
}
|
|
1237
|
-
|
|
1238
|
-
## addText
|
|
1239
|
-
$text .= " ";
|
|
1240
|
-
$isSpace = 1;
|
|
1241
|
-
}
|
|
1242
|
-
## tab
|
|
1243
|
-
elsif ($line =~ /^<tab .*\/>$/)
|
|
1244
|
-
{
|
|
1245
|
-
## add Text
|
|
1246
|
-
$text .= "\t";
|
|
1247
|
-
$isTab = 1;
|
|
1248
|
-
}
|
|
1249
|
-
## bullet
|
|
1250
|
-
elsif ($line =~ /^<bullet .*>$/)
|
|
1251
|
-
{
|
|
1252
|
-
$isBullet = 1;
|
|
1253
|
-
}
|
|
1254
|
-
}
|
|
1255
|
-
|
|
1256
|
-
$allText .= $text;
|
|
1257
|
-
return ($allText, $l, $t, $r, $bottom, $isSpace);
|
|
1258
|
-
}
|
|
1259
|
-
|
|
1260
|
-
sub updateXMLFontFeature {
|
|
1261
|
-
my ($fontSizeHash, $fontFaceHash) = @_;
|
|
1262
|
-
|
|
1263
|
-
# font size feature
|
|
1264
|
-
if(scalar(keys %{$fontSizeHash}) == 0){
|
|
1265
|
-
push(@gFontSize, -1);
|
|
1266
|
-
} else {
|
|
1267
|
-
my @sortedFonts = sort { $fontSizeHash->{$b} <=> $fontSizeHash->{$a} } keys %{$fontSizeHash};
|
|
1268
|
-
|
|
1269
|
-
my $fontSize = $sortedFonts[0];
|
|
1270
|
-
push(@gFontSize, $fontSize);
|
|
1271
|
-
|
|
1272
|
-
$gFontSizeHash{$fontSize} = $gFontSizeHash{$fontSize} ? ($gFontSizeHash{$fontSize}+1) : 1;
|
|
1273
|
-
}
|
|
1274
|
-
|
|
1275
|
-
# font face feature
|
|
1276
|
-
if(scalar(keys %{$fontFaceHash}) == 0){
|
|
1277
|
-
push(@gFontFace, "none");
|
|
1278
|
-
} else {
|
|
1279
|
-
my @sortedFonts = sort { $fontFaceHash->{$b} <=> $fontFaceHash->{$a} } keys %{$fontFaceHash};
|
|
1280
|
-
my $fontFace = $sortedFonts[0];
|
|
1281
|
-
push(@gFontFace, $fontFace);
|
|
1282
|
-
|
|
1283
|
-
$gFontFaceHash{$fontFace} = $gFontFaceHash{$fontFace} ? ($gFontFaceHash{$fontFace}+1) : 1;
|
|
1284
|
-
}
|
|
1285
|
-
}
|
|
1286
|
-
|
|
1287
|
-
sub updateXMLFeatures {
|
|
1288
|
-
my ($lnBoldCount, $lnItalicCount, $numWords, $isBullet, $space) = @_;
|
|
1289
|
-
# bold feature
|
|
1290
|
-
my $boldFeature;
|
|
1291
|
-
if ($lnBoldCount/$numWords >= 0.667){
|
|
1292
|
-
$boldFeature = "yes";
|
|
1293
|
-
} else {
|
|
1294
|
-
$boldFeature = "no";
|
|
1295
|
-
}
|
|
1296
|
-
push(@gBold, $boldFeature);
|
|
1297
|
-
|
|
1298
|
-
# italic feature
|
|
1299
|
-
my $italicFeature;
|
|
1300
|
-
if ($lnItalicCount/$numWords >= 0.667){
|
|
1301
|
-
$italicFeature = "yes";
|
|
1302
|
-
} else {
|
|
1303
|
-
$italicFeature = "no";
|
|
1304
|
-
}
|
|
1305
|
-
push(@gItalic, $italicFeature);
|
|
1306
|
-
|
|
1307
|
-
# bullet feature
|
|
1308
|
-
if($isBullet){
|
|
1309
|
-
push(@gBullet, "yes");
|
|
1310
|
-
} else {
|
|
1311
|
-
push(@gBullet, "no");
|
|
1312
|
-
}
|
|
1313
|
-
|
|
1314
|
-
# space feature
|
|
1315
|
-
# push(@gSpace, $space);
|
|
1316
|
-
}
|
|
1317
|
-
|
|
1318
|
-
## Find the positions of header, body, and citation
|
|
1319
|
-
sub getStructureInfo {
|
|
1320
|
-
my ($lines, $numLines) = @_;
|
|
1321
|
-
|
|
1322
|
-
my ($bodyLength, $citationLength, $bodyEndId) =
|
|
1323
|
-
SectLabel::PreProcess::findCitationText($lines, 0, $numLines);
|
|
1324
|
-
|
|
1325
|
-
my ($headerLength, $bodyStartId);
|
|
1326
|
-
($headerLength, $bodyLength, $bodyStartId) =
|
|
1327
|
-
SectLabel::PreProcess::findHeaderText($lines, 0, $bodyLength);
|
|
1328
|
-
|
|
1329
|
-
# sanity check
|
|
1330
|
-
my $totalLength = $headerLength + $bodyLength + $citationLength;
|
|
1331
|
-
if($numLines != $totalLength){
|
|
1332
|
-
print STDOUT "Die in getStructureInfo(): different num lines $numLines != $totalLength\n"; # to display in Web
|
|
1333
|
-
die "Die in getStructureInfo(): different num lines $numLines != $totalLength\n";
|
|
1334
|
-
}
|
|
1335
|
-
return ($headerLength, $bodyLength, $citationLength, $bodyStartId, $bodyEndId);
|
|
1336
|
-
}
|
|
1337
|
-
|
|
1338
|
-
## Count XML tags/values for statistics purpose
|
|
1339
|
-
sub processTagInfo {
|
|
1340
|
-
my ($line, $tags) = @_;
|
|
1341
|
-
|
|
1342
|
-
my $tag;
|
|
1343
|
-
my $attr;
|
|
1344
|
-
if($line =~ /^<(.+?)\b(.*)/){
|
|
1345
|
-
$tag = $1;
|
|
1346
|
-
$attr = $2;
|
|
1347
|
-
if(!$tags->{$tag}){
|
|
1348
|
-
$tags->{$tag} = ();
|
|
1349
|
-
}
|
|
1350
|
-
if($attr =~ /^\s*(.+?)\s*\/?>/){
|
|
1351
|
-
$attr = $1;
|
|
1352
|
-
}
|
|
1353
|
-
|
|
1354
|
-
my @tokens = split(/\s+/, $attr);
|
|
1355
|
-
foreach my $token (@tokens){
|
|
1356
|
-
if($token =~ /^(.+)=(.+)$/){
|
|
1357
|
-
my $attrName = $1;
|
|
1358
|
-
my $value = $2;
|
|
1359
|
-
if(!$tags->{$tag}->{$attrName}){
|
|
1360
|
-
$tags->{$tag}->{$attrName} = ();
|
|
1361
|
-
}
|
|
1362
|
-
if(!$tags->{$tag}->{$attrName}->{$value}){
|
|
1363
|
-
$tags->{$tag}->{$attrName}->{$value} = 0;
|
|
1364
|
-
}
|
|
1365
|
-
$tags->{$tag}->{$attrName}->{$value}++;
|
|
1366
|
-
}
|
|
1367
|
-
}
|
|
1368
|
-
}
|
|
1369
|
-
}
|
|
1370
|
-
|
|
1371
|
-
## Print tag info to file
|
|
1372
|
-
sub printTagInfo {
|
|
1373
|
-
my ($tags, $tagFile) = @_;
|
|
1374
|
-
|
|
1375
|
-
open(TAG, ">:utf8", "$tagFile") || die"#Can't open file \"$tagFile\"\n";
|
|
1376
|
-
my @sortedTags = sort {$a cmp $b} keys %{$tags};
|
|
1377
|
-
foreach(@sortedTags){
|
|
1378
|
-
my @attrs = sort {$a cmp $b} keys %{$tags->{$_}};
|
|
1379
|
-
print TAG "# Tag = $_\n";
|
|
1380
|
-
foreach my $attr (@attrs) {
|
|
1381
|
-
print TAG "$attr:";
|
|
1382
|
-
my @values = sort {$a cmp $b} keys %{$tags->{$_}->{$attr}};
|
|
1383
|
-
foreach my $value (@values){
|
|
1384
|
-
print TAG " $value-$tags->{$_}->{$attr}->{$value}";
|
|
1385
|
-
}
|
|
1386
|
-
print TAG "\n";
|
|
1387
|
-
}
|
|
1388
|
-
}
|
|
1389
|
-
close TAG;
|
|
1390
|
-
}
|
|
1391
|
-
|
|
1392
|
-
sub untaintPath {
|
|
1393
|
-
my ($path) = @_;
|
|
1394
|
-
|
|
1395
|
-
if ( $path =~ /^([-_\/\w\.]*)$/ ) {
|
|
1396
|
-
$path = $1;
|
|
1397
|
-
} else {
|
|
1398
|
-
die "Bad path \"$path\"\n";
|
|
1399
|
-
}
|
|
1400
|
-
|
|
1401
|
-
return $path;
|
|
1402
|
-
}
|
|
1403
|
-
|
|
1404
|
-
sub untaint {
|
|
1405
|
-
my ($s) = @_;
|
|
1406
|
-
if ($s =~ /^([\w \-\@\(\),\.\/]+)$/) {
|
|
1407
|
-
$s = $1; # $data now untainted
|
|
1408
|
-
} else {
|
|
1409
|
-
die "Bad data in $s"; # log this somewhere
|
|
1410
|
-
}
|
|
1411
|
-
return $s;
|
|
1412
|
-
}
|
|
1413
|
-
|
|
1414
|
-
sub execute {
|
|
1415
|
-
my ($cmd) = @_;
|
|
1416
|
-
if($isDebug){
|
|
1417
|
-
print STDERR "Executing: $cmd\n";
|
|
1418
|
-
}
|
|
1419
|
-
$cmd = untaint($cmd);
|
|
1420
|
-
system($cmd);
|
|
1421
|
-
}
|
|
1422
|
-
|
|
1423
|
-
sub newTmpFile {
|
|
1424
|
-
my $tmpFile = `date '+%Y%m%d-%H%M%S-$$'`;
|
|
1425
|
-
chomp($tmpFile);
|
|
1426
|
-
return $tmpFile;
|
|
1427
|
-
}
|