biblicit 2.0.3 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +0 -2
- data/biblicit.gemspec +1 -1
- data/parscit/bin/citeExtract.pl +9 -161
- data/parscit/bin/sectExtract.pl +0 -14
- data/parscit/lib/ParsCit/Controller.pm +0 -59
- data/parscit/lib/ParsCit/PreProcess.pm +0 -4
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1 -7
- metadata +4 -24
- data/parscit/bin/sectLabel/processOmniXML.pl +0 -1427
- data/parscit/bin/sectLabel/processOmniXML_new.pl +0 -1025
- data/parscit/bin/sectLabel/processOmniXMLv2.pl +0 -1529
- data/parscit/bin/sectLabel/processOmniXMLv3.pl +0 -964
- data/parscit/bin/sectLabel/simplifyOmniXML.pl +0 -382
- data/parscit/bin/xml2train.pl +0 -193
- data/parscit/lib/Omni/Config.pm +0 -93
- data/parscit/lib/Omni/Omnicell.pm +0 -263
- data/parscit/lib/Omni/Omnicol.pm +0 -292
- data/parscit/lib/Omni/Omnidd.pm +0 -328
- data/parscit/lib/Omni/Omnidoc.pm +0 -153
- data/parscit/lib/Omni/Omniframe.pm +0 -223
- data/parscit/lib/Omni/Omniline.pm +0 -423
- data/parscit/lib/Omni/Omnipage.pm +0 -282
- data/parscit/lib/Omni/Omnipara.pm +0 -232
- data/parscit/lib/Omni/Omnirun.pm +0 -303
- data/parscit/lib/Omni/Omnitable.pm +0 -336
- data/parscit/lib/Omni/Omniword.pm +0 -162
- data/parscit/lib/Omni/Traversal.pm +0 -313
- data/parscit/lib/SectLabel/AAMatching.pm +0 -1949
|
@@ -1,1529 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/perl -wT
|
|
2
|
-
|
|
3
|
-
# Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
|
|
4
|
-
# Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
|
|
5
|
-
|
|
6
|
-
require 5.0;
|
|
7
|
-
use strict;
|
|
8
|
-
|
|
9
|
-
use Getopt::Long;
|
|
10
|
-
use HTML::Entities;
|
|
11
|
-
|
|
12
|
-
# I do not know a better solution to find a lib path in -T mode.
|
|
13
|
-
# So if you know a better solution, I'd be glad to hear.
|
|
14
|
-
# See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
|
|
15
|
-
use FindBin;
|
|
16
|
-
FindBin::again(); # to get correct path in case 2 scripts in different directories use FindBin
|
|
17
|
-
my $path;
|
|
18
|
-
BEGIN
|
|
19
|
-
{
|
|
20
|
-
if ($FindBin::Bin =~ /(.*)/) { $path = $1; }
|
|
21
|
-
}
|
|
22
|
-
|
|
23
|
-
use lib "$path/../../lib";
|
|
24
|
-
|
|
25
|
-
use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
|
|
26
|
-
use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";
|
|
27
|
-
|
|
28
|
-
use SectLabel::PreProcess;
|
|
29
|
-
|
|
30
|
-
### USER customizable section
|
|
31
|
-
$0 =~ /([^\/]+)$/; my $progname = $1;
|
|
32
|
-
my $outputVersion = "1.0";
|
|
33
|
-
### END user customizable section
|
|
34
|
-
|
|
35
|
-
sub License
|
|
36
|
-
{
|
|
37
|
-
print STDERR "# Copyright 2009 \251 by Luong Minh Thang\n";
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
### HELP Sub-procedure
|
|
41
|
-
sub Help
|
|
42
|
-
{
|
|
43
|
-
print STDERR "Process Omnipage XML output (concatenated results fromm all pages of a PDF file), and extract text lines together with other XML infos\n";
|
|
44
|
-
print STDERR "usage: $progname -h\t[invokes help]\n";
|
|
45
|
-
print STDERR " $progname -in xmlFile -out out_file [-xmlFeature -decode -markup -para] [-tag tag_file -allowEmptyLine -log]\n";
|
|
46
|
-
print STDERR "Options:\n";
|
|
47
|
-
print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
|
|
48
|
-
print STDERR "\t-xmlFeature: append XML feature together with text extracted\n";
|
|
49
|
-
print STDERR "\t-decode: decode HTML entities and then output, to avoid double entity encoding later\n";
|
|
50
|
-
print STDERR "\t-para: marking in the output each paragraph with # Para lineId num_lines\n";
|
|
51
|
-
print STDERR "\t-markup: marking in the output detailed word-level info ### Page w h\\n## Para l t r b\\n# Line l t r b\\nword l t r b\n";
|
|
52
|
-
print STDERR "\t-tag tag_file: count XML tags/values for statistics purpose\n";
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
my $quite = 0;
|
|
56
|
-
my $help = 0;
|
|
57
|
-
my $out_file = undef;
|
|
58
|
-
my $in_file = undef;
|
|
59
|
-
|
|
60
|
-
my $is_xml_feature = 0;
|
|
61
|
-
my $is_decode = 0;
|
|
62
|
-
my $is_markup = 0;
|
|
63
|
-
my $is_para_delimiter = 0;
|
|
64
|
-
my $is_allow_empty = 0;
|
|
65
|
-
my $is_debug = 0;
|
|
66
|
-
my $tag_file = "";
|
|
67
|
-
|
|
68
|
-
$help = 1 unless GetOptions(
|
|
69
|
-
'in=s' => \$in_file,
|
|
70
|
-
'out=s' => \$out_file,
|
|
71
|
-
'decode' => \$is_decode,
|
|
72
|
-
'xmlFeature' => \$is_xml_feature,
|
|
73
|
-
'tag=s' => \$tag_file,
|
|
74
|
-
'allowEmptyLine' => \$is_allow_empty,
|
|
75
|
-
'markup' => \$is_markup,
|
|
76
|
-
'para' => \$is_para_delimiter,
|
|
77
|
-
'log' => \$is_debug,
|
|
78
|
-
'h' => \$help,
|
|
79
|
-
'q' => \$quite );
|
|
80
|
-
|
|
81
|
-
if ($help || !defined $in_file || !defined $out_file)
|
|
82
|
-
{
|
|
83
|
-
Help();
|
|
84
|
-
exit(0);
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
if (!$quite)
|
|
88
|
-
{
|
|
89
|
-
License();
|
|
90
|
-
}
|
|
91
|
-
|
|
92
|
-
### Untaint ###
|
|
93
|
-
$in_file = UntaintPath($in_file);
|
|
94
|
-
$out_file = UntaintPath($out_file);
|
|
95
|
-
$tag_file = UntaintPath($tag_file);
|
|
96
|
-
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
|
|
97
|
-
### End untaint ###
|
|
98
|
-
|
|
99
|
-
# Mark page, para, line, word
|
|
100
|
-
my %g_page_hash = ();
|
|
101
|
-
|
|
102
|
-
# Mark paragraph
|
|
103
|
-
my @g_para = ();
|
|
104
|
-
|
|
105
|
-
# XML features
|
|
106
|
-
# Location feature
|
|
107
|
-
my @g_pos_hash = ();
|
|
108
|
-
my $g_minpos = 1000000;
|
|
109
|
-
my $g_maxpos = 0;
|
|
110
|
-
# Align feature
|
|
111
|
-
my @g_align = ();
|
|
112
|
-
# Bold feature
|
|
113
|
-
my @g_bold = ();
|
|
114
|
-
# Italic feature
|
|
115
|
-
my @g_italic = ();
|
|
116
|
-
# Font size feature
|
|
117
|
-
my %g_font_size_hash = ();
|
|
118
|
-
my @g_font_size = ();
|
|
119
|
-
# Font face feature
|
|
120
|
-
my %g_font_face_hash = ();
|
|
121
|
-
my @g_font_face = ();
|
|
122
|
-
# Pic feature
|
|
123
|
-
my @g_pic = ();
|
|
124
|
-
# Table feature
|
|
125
|
-
my @g_table = ();
|
|
126
|
-
# Bullet feature
|
|
127
|
-
my @g_bullet = ();
|
|
128
|
-
# Space feature
|
|
129
|
-
# my %g_space_hash = ();
|
|
130
|
-
# my @g_space = ();
|
|
131
|
-
|
|
132
|
-
my %tags = ();
|
|
133
|
-
|
|
134
|
-
if ($is_debug)
|
|
135
|
-
{
|
|
136
|
-
print STDERR "\n# Processing file $in_file & output to $out_file\n";
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
my $markup_output = "";
|
|
140
|
-
my $all_text = ProcessFile($in_file, $out_file, \%tags);
|
|
141
|
-
|
|
142
|
-
# Find header part
|
|
143
|
-
my @lines = split(/\n/, $all_text);
|
|
144
|
-
my $num_lines = scalar(@lines);
|
|
145
|
-
my ($header_length, $body_length, $body_start_id) = SectLabel::PreProcess::FindHeaderText(\@lines, 0, $num_lines);
|
|
146
|
-
|
|
147
|
-
# Output
|
|
148
|
-
if ($is_markup)
|
|
149
|
-
{
|
|
150
|
-
open(OF, ">:utf8", "$out_file") || die"#Can't open file \"$out_file\"\n";
|
|
151
|
-
print OF "$markup_output";
|
|
152
|
-
close OF;
|
|
153
|
-
}
|
|
154
|
-
else
|
|
155
|
-
{
|
|
156
|
-
Output(\@lines, $out_file);
|
|
157
|
-
}
|
|
158
|
-
|
|
159
|
-
if ($tag_file ne "")
|
|
160
|
-
{
|
|
161
|
-
PrintTagInfo(\%tags, $tag_file);
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
sub ProcessFile
|
|
165
|
-
{
|
|
166
|
-
my ($in_file, $tags) = @_;
|
|
167
|
-
|
|
168
|
-
if (!(-e $in_file)) { die "# $progname crash\t\tFile \"$in_file\" doesn't exist"; }
|
|
169
|
-
open (IF, "<:utf8", $in_file) || die "# $progname crash\t\tCan't open \"$in_file\"";
|
|
170
|
-
|
|
171
|
-
my $is_para = 0;
|
|
172
|
-
my $is_table = 0;
|
|
173
|
-
my $is_space = 0;
|
|
174
|
-
my $is_pic = 0;
|
|
175
|
-
my $all_text = "";
|
|
176
|
-
my $text = "";
|
|
177
|
-
my $line_id = 0;
|
|
178
|
-
|
|
179
|
-
# Each line contains a header
|
|
180
|
-
while (<IF>)
|
|
181
|
-
{
|
|
182
|
-
# Skip comments
|
|
183
|
-
if (/^\#/) { next; }
|
|
184
|
-
chomp;
|
|
185
|
-
|
|
186
|
-
# Remove ^M character at the end of the file if any
|
|
187
|
-
s/\cM$//;
|
|
188
|
-
my $line = $_;
|
|
189
|
-
|
|
190
|
-
if($tag_file ne "") { ProcessTagInfo($line, $tags); }
|
|
191
|
-
|
|
192
|
-
# if ($line =~ /<\?xml version.+>/) { } ### XML ###
|
|
193
|
-
# if ($line =~ /^<\/column>$/) { } ### Column ###
|
|
194
|
-
if ($is_markup && $line =~ /<theoreticalPage (.*)\/>/ && $is_markup) { $markup_output .= "### Page $1\n"; }
|
|
195
|
-
|
|
196
|
-
# Pic
|
|
197
|
-
if ($line =~ /^<dd (.*)>$/)
|
|
198
|
-
{
|
|
199
|
-
$is_pic = 1;
|
|
200
|
-
if($is_markup) { $markup_output .= "### Figure $1\n"; }
|
|
201
|
-
}
|
|
202
|
-
elsif ($line =~ /^<\/dd>$/)
|
|
203
|
-
{
|
|
204
|
-
$is_pic = 0;
|
|
205
|
-
}
|
|
206
|
-
|
|
207
|
-
# Table
|
|
208
|
-
if ($line =~ /^<table .*>$/)
|
|
209
|
-
{
|
|
210
|
-
$text .= $line."\n"; # we need the header
|
|
211
|
-
$is_table = 1;
|
|
212
|
-
}
|
|
213
|
-
elsif ($line =~ /^<\/table>$/)
|
|
214
|
-
{
|
|
215
|
-
my $table_text = ProcessTable($text, $is_pic);
|
|
216
|
-
$all_text .= $table_text;
|
|
217
|
-
|
|
218
|
-
my @tmp_lines = split(/\n/, $table_text);
|
|
219
|
-
$line_id += scalar(@tmp_lines);
|
|
220
|
-
|
|
221
|
-
$is_table = 0;
|
|
222
|
-
$text = "";
|
|
223
|
-
}
|
|
224
|
-
elsif ($is_table)
|
|
225
|
-
{
|
|
226
|
-
$text .= $line."\n";
|
|
227
|
-
next;
|
|
228
|
-
}
|
|
229
|
-
# Paragraph
|
|
230
|
-
# Note: table processing should have higher priority than paragraph, i.e. the priority does matter
|
|
231
|
-
elsif ($line =~ /^<para (.*)>$/)
|
|
232
|
-
{
|
|
233
|
-
$text .= $line."\n"; # we need the header
|
|
234
|
-
$is_para = 1;
|
|
235
|
-
if ($is_markup) { $markup_output .= "## Para $1\n"; }
|
|
236
|
-
}
|
|
237
|
-
elsif ($line =~ /^<\/para>$/)
|
|
238
|
-
{
|
|
239
|
-
my ($para_text, $l, $t, $r, $b) = undef;
|
|
240
|
-
($para_text, $l, $t, $r, $b, $is_space) = ProcessPara($text, 0, $is_pic);
|
|
241
|
-
$all_text .= $para_text;
|
|
242
|
-
|
|
243
|
-
my @tmp_lines = split(/\n/, $para_text);
|
|
244
|
-
$line_id += scalar(@tmp_lines);
|
|
245
|
-
$is_para = 0;
|
|
246
|
-
$text = "";
|
|
247
|
-
}
|
|
248
|
-
elsif ($is_para)
|
|
249
|
-
{
|
|
250
|
-
$text .= $line."\n";
|
|
251
|
-
next;
|
|
252
|
-
}
|
|
253
|
-
}
|
|
254
|
-
|
|
255
|
-
close IF;
|
|
256
|
-
return $all_text;
|
|
257
|
-
}
|
|
258
|
-
|
|
259
|
-
sub Output
|
|
260
|
-
{
|
|
261
|
-
my ($lines, $out_file) = @_;
|
|
262
|
-
open(OF, ">:utf8", "$out_file") || die"#Can't open file \"$out_file\"\n";
|
|
263
|
-
|
|
264
|
-
####### Final output ############
|
|
265
|
-
# XML feature label
|
|
266
|
-
my %g_font_size_labels = ();
|
|
267
|
-
# my %g_space_labels = (); # yes, no
|
|
268
|
-
|
|
269
|
-
if($is_xml_feature)
|
|
270
|
-
{
|
|
271
|
-
GetFontSizeLabels(\%g_font_size_hash, \%g_font_size_labels);
|
|
272
|
-
# GetSpaceLabels(\%g_space_hash, \%g_space_labels);
|
|
273
|
-
}
|
|
274
|
-
|
|
275
|
-
my $id = -1;
|
|
276
|
-
my $output = "";
|
|
277
|
-
my $para_line_id = -1;
|
|
278
|
-
my $para_line_count = 0;
|
|
279
|
-
|
|
280
|
-
foreach my $line (@{$lines})
|
|
281
|
-
{
|
|
282
|
-
$id++;
|
|
283
|
-
|
|
284
|
-
# Remove ^M character at the end of each line if any
|
|
285
|
-
$line =~ s/\cM$//;
|
|
286
|
-
|
|
287
|
-
# Empty lines
|
|
288
|
-
if($line =~ /^\s*$/)
|
|
289
|
-
{
|
|
290
|
-
if(!$is_allow_empty)
|
|
291
|
-
{
|
|
292
|
-
next;
|
|
293
|
-
}
|
|
294
|
-
else
|
|
295
|
-
{
|
|
296
|
-
if($is_debug) { print STDERR "#! Line $id empty!\n"; }
|
|
297
|
-
}
|
|
298
|
-
}
|
|
299
|
-
|
|
300
|
-
if ($g_para[$id] eq "yes")
|
|
301
|
-
{
|
|
302
|
-
# Mark para
|
|
303
|
-
if($output ne "")
|
|
304
|
-
{
|
|
305
|
-
if($is_para_delimiter)
|
|
306
|
-
{
|
|
307
|
-
print OF "# Para $para_line_id $para_line_count\n$output";
|
|
308
|
-
$para_line_count = 0;
|
|
309
|
-
}
|
|
310
|
-
else
|
|
311
|
-
{
|
|
312
|
-
if ($is_decode) { $output = decode_entities($output); }
|
|
313
|
-
print OF $output;
|
|
314
|
-
}
|
|
315
|
-
|
|
316
|
-
$output = "";
|
|
317
|
-
}
|
|
318
|
-
|
|
319
|
-
$para_line_id = $id;
|
|
320
|
-
}
|
|
321
|
-
|
|
322
|
-
$output .= $line;
|
|
323
|
-
$para_line_count++;
|
|
324
|
-
|
|
325
|
-
# Output XML features
|
|
326
|
-
if ($is_xml_feature)
|
|
327
|
-
{
|
|
328
|
-
# Loc feature
|
|
329
|
-
my $loc_feature;
|
|
330
|
-
if ($g_pos_hash[$id] != -1)
|
|
331
|
-
{
|
|
332
|
-
$loc_feature = "xmlLoc_".int(($g_pos_hash[$id] - $g_minpos)*8.0/($g_maxpos - $g_minpos + 1));
|
|
333
|
-
}
|
|
334
|
-
|
|
335
|
-
# Align feature
|
|
336
|
-
my $align_feature = "xmlAlign_" . $g_align[$id];
|
|
337
|
-
|
|
338
|
-
# Font_size feature
|
|
339
|
-
my $font_size_feature;
|
|
340
|
-
if ($g_font_size[$id] == -1)
|
|
341
|
-
{
|
|
342
|
-
$font_size_feature = "xmlFontSize_none";
|
|
343
|
-
}
|
|
344
|
-
else
|
|
345
|
-
{
|
|
346
|
-
$font_size_feature = "xmlFontSize_" . $g_font_size_labels{$g_font_size[$id]};
|
|
347
|
-
}
|
|
348
|
-
|
|
349
|
-
my $bold_feature = "xmlBold_" . $g_bold[$id]; # Bold feature
|
|
350
|
-
my $italic_feature = "xmlItalic_" . $g_italic[$id]; # Italic feature
|
|
351
|
-
my $pic_feature = "xmlPic_" . $g_pic[$id]; # Pic feature
|
|
352
|
-
my $table_feature = "xmlTable_" . $g_table[$id]; # Table feature
|
|
353
|
-
my $bullet_feature = "xmlBullet_" . $g_bullet[$id]; # Bullet feature
|
|
354
|
-
|
|
355
|
-
# Space feature
|
|
356
|
-
# my $space_feature;
|
|
357
|
-
# if($g_space[$id] eq "none")
|
|
358
|
-
# {
|
|
359
|
-
# $space_feature = "xmlSpace_none";
|
|
360
|
-
# }
|
|
361
|
-
# else
|
|
362
|
-
# {
|
|
363
|
-
# $space_feature = "xmlSpace_" . $g_space_labels{$g_space[$id]};
|
|
364
|
-
# }
|
|
365
|
-
|
|
366
|
-
# Differential features
|
|
367
|
-
my ($align_diff, $font_size_diff, $font_face_diff, $font_sf_diff, $font_sfbi_diff, $font_sfbia_diff, $para_diff) = GetDifferentialFeatures($id);
|
|
368
|
-
|
|
369
|
-
# Each line and its XML features
|
|
370
|
-
$output .= " |XML| $loc_feature $bold_feature $italic_feature $font_size_feature $pic_feature $table_feature $bullet_feature $font_sfbia_diff $para_diff\n";
|
|
371
|
-
}
|
|
372
|
-
else
|
|
373
|
-
{
|
|
374
|
-
$output .= "\n";
|
|
375
|
-
}
|
|
376
|
-
}
|
|
377
|
-
|
|
378
|
-
# Mark para
|
|
379
|
-
if ($output ne "")
|
|
380
|
-
{
|
|
381
|
-
if ($is_para_delimiter)
|
|
382
|
-
{
|
|
383
|
-
print OF "# Para $para_line_id $para_line_count\n$output";
|
|
384
|
-
$para_line_count = 0;
|
|
385
|
-
}
|
|
386
|
-
else
|
|
387
|
-
{
|
|
388
|
-
if($is_decode){ $output = decode_entities($output); }
|
|
389
|
-
print OF $output;
|
|
390
|
-
}
|
|
391
|
-
$output = ""
|
|
392
|
-
}
|
|
393
|
-
|
|
394
|
-
close OF;
|
|
395
|
-
}
|
|
396
|
-
|
|
397
|
-
sub GetDifferentialFeatures
|
|
398
|
-
{
|
|
399
|
-
my ($id) = @_;
|
|
400
|
-
|
|
401
|
-
# AlignChange feature
|
|
402
|
-
my $align_diff = "bi_xmlA_";
|
|
403
|
-
|
|
404
|
-
if ($id == 0)
|
|
405
|
-
{
|
|
406
|
-
$align_diff .= $g_align[$id];
|
|
407
|
-
}
|
|
408
|
-
elsif ($g_align[$id] eq $g_align[$id-1])
|
|
409
|
-
{
|
|
410
|
-
$align_diff .= "continue";
|
|
411
|
-
}
|
|
412
|
-
else
|
|
413
|
-
{
|
|
414
|
-
$align_diff .= $g_align[$id];
|
|
415
|
-
}
|
|
416
|
-
|
|
417
|
-
# FontFaceChange feature
|
|
418
|
-
my $font_face_diff = "bi_xmlF_";
|
|
419
|
-
if ($id == 0)
|
|
420
|
-
{
|
|
421
|
-
$font_face_diff .= "new";
|
|
422
|
-
}
|
|
423
|
-
elsif ($g_font_face[$id] eq $g_font_face[$id-1])
|
|
424
|
-
{
|
|
425
|
-
$font_face_diff .= "continue";
|
|
426
|
-
}
|
|
427
|
-
else
|
|
428
|
-
{
|
|
429
|
-
$font_face_diff .= "new";
|
|
430
|
-
}
|
|
431
|
-
|
|
432
|
-
# FontSizeChange feature
|
|
433
|
-
my $font_size_diff = "bi_xmlS_";
|
|
434
|
-
if ($id == 0)
|
|
435
|
-
{
|
|
436
|
-
$font_size_diff .= "new";
|
|
437
|
-
}
|
|
438
|
-
elsif ($g_font_size[$id] == $g_font_size[$id-1])
|
|
439
|
-
{
|
|
440
|
-
$font_size_diff .= "continue";
|
|
441
|
-
}
|
|
442
|
-
else
|
|
443
|
-
{
|
|
444
|
-
$font_size_diff .= "new";
|
|
445
|
-
}
|
|
446
|
-
|
|
447
|
-
# FontSFChange feature
|
|
448
|
-
my $font_sf_diff = "bi_xmlSF_";
|
|
449
|
-
if ($id == 0)
|
|
450
|
-
{
|
|
451
|
-
$font_sf_diff .= "new";
|
|
452
|
-
}
|
|
453
|
-
elsif ($g_font_size[$id] == $g_font_size[$id-1] && $g_font_face[$id] eq $g_font_face[$id-1])
|
|
454
|
-
{
|
|
455
|
-
$font_sf_diff .= "continue";
|
|
456
|
-
}
|
|
457
|
-
else
|
|
458
|
-
{
|
|
459
|
-
$font_sf_diff .= "new";
|
|
460
|
-
}
|
|
461
|
-
|
|
462
|
-
# FontSFBIChange feature
|
|
463
|
-
my $font_sfbi_diff = "bi_xmlSFBI_";
|
|
464
|
-
if ($id == 0)
|
|
465
|
-
{
|
|
466
|
-
$font_sfbi_diff .= "new";
|
|
467
|
-
}
|
|
468
|
-
elsif ($g_font_size[$id] == $g_font_size[$id-1] && $g_font_face[$id] eq $g_font_face[$id-1] && $g_bold[$id] eq $g_bold[$id-1] && $g_italic[$id] eq $g_italic[$id-1])
|
|
469
|
-
{
|
|
470
|
-
$font_sfbi_diff .= "continue";
|
|
471
|
-
}
|
|
472
|
-
else
|
|
473
|
-
{
|
|
474
|
-
$font_sfbi_diff .= "new";
|
|
475
|
-
}
|
|
476
|
-
|
|
477
|
-
# FontSFBIAChange feature
|
|
478
|
-
my $font_sfbia_diff = "bi_xmlSFBIA_";
|
|
479
|
-
if ($id == 0)
|
|
480
|
-
{
|
|
481
|
-
$font_sfbia_diff .= "new";
|
|
482
|
-
}
|
|
483
|
-
elsif ($g_font_size[$id] == $g_font_size[$id-1] && $g_font_face[$id] eq $g_font_face[$id-1] && $g_bold[$id] eq $g_bold[$id-1] && $g_italic[$id] eq $g_italic[$id-1] && $g_align[$id] eq $g_align[$id-1])
|
|
484
|
-
{
|
|
485
|
-
$font_sfbia_diff .= "continue";
|
|
486
|
-
}
|
|
487
|
-
else
|
|
488
|
-
{
|
|
489
|
-
$font_sfbia_diff .= "new";
|
|
490
|
-
}
|
|
491
|
-
|
|
492
|
-
# Para change feature
|
|
493
|
-
my $para_diff = "bi_xmlPara_";
|
|
494
|
-
# Header part, consider each line as a separate paragraph
|
|
495
|
-
if ($id < $body_start_id)
|
|
496
|
-
{
|
|
497
|
-
$para_diff .= "header";
|
|
498
|
-
}
|
|
499
|
-
else
|
|
500
|
-
{
|
|
501
|
-
if($g_para[$id] eq "yes")
|
|
502
|
-
{
|
|
503
|
-
$para_diff .= "new";
|
|
504
|
-
}
|
|
505
|
-
else
|
|
506
|
-
{
|
|
507
|
-
$para_diff .= "continue";
|
|
508
|
-
}
|
|
509
|
-
}
|
|
510
|
-
|
|
511
|
-
return ($align_diff, $font_size_diff, $font_face_diff, $font_sf_diff, $font_sfbi_diff, $font_sfbia_diff, $para_diff);
|
|
512
|
-
}
|
|
513
|
-
|
|
514
|
-
sub GetFontSizeLabels
|
|
515
|
-
{
|
|
516
|
-
my ($g_font_size_hash, $g_font_size_labels) = @_;
|
|
517
|
-
|
|
518
|
-
if ($is_debug) { print STDERR "# Map fonts\n"; }
|
|
519
|
-
my @sorted_fonts = sort { $g_font_size_hash->{$b} <=> $g_font_size_hash->{$a} } keys %{$g_font_size_hash}; # Sort by values, obtain keys
|
|
520
|
-
|
|
521
|
-
my $common_size = $sorted_fonts[0];
|
|
522
|
-
@sorted_fonts = sort { $a <=> $b } keys %{$g_font_size_hash}; # Sort by keys, obtain keys
|
|
523
|
-
|
|
524
|
-
# Index of common font size
|
|
525
|
-
my $common_index = 0;
|
|
526
|
-
foreach (@sorted_fonts)
|
|
527
|
-
{
|
|
528
|
-
# Found
|
|
529
|
-
if ($common_size == $_)
|
|
530
|
-
{
|
|
531
|
-
last;
|
|
532
|
-
}
|
|
533
|
-
$common_index++;
|
|
534
|
-
}
|
|
535
|
-
|
|
536
|
-
# Small fonts
|
|
537
|
-
for (my $i = 0; $i < $common_index; $i++)
|
|
538
|
-
{
|
|
539
|
-
$g_font_size_labels->{$sorted_fonts[$i]} = "smaller";
|
|
540
|
-
|
|
541
|
-
if($is_debug)
|
|
542
|
-
{
|
|
543
|
-
print STDERR "$sorted_fonts[$i] --> $g_font_size_labels->{$sorted_fonts[$i]}, freq = $g_font_size_hash->{$sorted_fonts[$i]}\n";
|
|
544
|
-
}
|
|
545
|
-
}
|
|
546
|
-
|
|
547
|
-
# Common fonts
|
|
548
|
-
$g_font_size_labels->{$common_size} = "common";
|
|
549
|
-
if ($is_debug)
|
|
550
|
-
{
|
|
551
|
-
print STDERR "$sorted_fonts[$common_index] --> $g_font_size_labels->{$sorted_fonts[$common_index]}, freq = $g_font_size_hash->{$sorted_fonts[$common_index]}\n";
|
|
552
|
-
}
|
|
553
|
-
|
|
554
|
-
# Large fonts
|
|
555
|
-
for (my $i = ($common_index + 1); $i < scalar(@sorted_fonts); $i++)
|
|
556
|
-
{
|
|
557
|
-
if ((scalar(@sorted_fonts)-$i) <= 3)
|
|
558
|
-
{
|
|
559
|
-
$g_font_size_labels->{$sorted_fonts[$i]} = "largest".($i+1-scalar(@sorted_fonts));
|
|
560
|
-
}
|
|
561
|
-
else
|
|
562
|
-
{
|
|
563
|
-
$g_font_size_labels->{$sorted_fonts[$i]} = "larger";
|
|
564
|
-
}
|
|
565
|
-
|
|
566
|
-
if($is_debug)
|
|
567
|
-
{
|
|
568
|
-
print STDERR "$sorted_fonts[$i] --> $g_font_size_labels->{$sorted_fonts[$i]}, freq = $g_font_size_hash->{$sorted_fonts[$i]}\n";
|
|
569
|
-
}
|
|
570
|
-
}
|
|
571
|
-
}
|
|
572
|
-
|
|
573
|
-
sub GetSpaceLabels
|
|
574
|
-
{
|
|
575
|
-
my ($g_space_hash, $g_space_labels) = @_;
|
|
576
|
-
|
|
577
|
-
if ($is_debug)
|
|
578
|
-
{
|
|
579
|
-
print STDERR "\n# Map space\n";
|
|
580
|
-
}
|
|
581
|
-
my @sorted_spaces = sort { $g_space_hash->{$b} <=> $g_space_hash->{$a} } keys %{$g_space_hash}; # sort by freqs, obtain space faces
|
|
582
|
-
|
|
583
|
-
my $common_space = $sorted_spaces[0];
|
|
584
|
-
my $common_freq = $g_space_hash->{$common_space};
|
|
585
|
-
|
|
586
|
-
# Find similar common freq with larger spaces
|
|
587
|
-
for (my $i = 0; $i < scalar(@sorted_spaces); $i++)
|
|
588
|
-
{
|
|
589
|
-
my $freq = $g_space_hash->{$sorted_spaces[$i]};
|
|
590
|
-
if ($freq/$common_freq > 0.8)
|
|
591
|
-
{
|
|
592
|
-
if($sorted_spaces[$i] > $common_space)
|
|
593
|
-
{
|
|
594
|
-
$common_space = $sorted_spaces[$i];
|
|
595
|
-
}
|
|
596
|
-
}
|
|
597
|
-
else
|
|
598
|
-
{
|
|
599
|
-
last;
|
|
600
|
-
}
|
|
601
|
-
}
|
|
602
|
-
|
|
603
|
-
for (my $i = 0; $i < scalar(@sorted_spaces); $i++)
|
|
604
|
-
{
|
|
605
|
-
if ($sorted_spaces[$i] > $common_space)
|
|
606
|
-
{
|
|
607
|
-
$g_space_labels->{$sorted_spaces[$i]} = "yes";
|
|
608
|
-
}
|
|
609
|
-
else
|
|
610
|
-
{
|
|
611
|
-
$g_space_labels->{$sorted_spaces[$i]} = "no";
|
|
612
|
-
}
|
|
613
|
-
|
|
614
|
-
if($is_debug)
|
|
615
|
-
{
|
|
616
|
-
print STDERR "$sorted_spaces[$i] --> $g_space_labels->{$sorted_spaces[$i]}, freq = $g_space_hash->{$sorted_spaces[$i]}\n";
|
|
617
|
-
}
|
|
618
|
-
}
|
|
619
|
-
}
|
|
620
|
-
|
|
621
|
-
sub GetAttrValue
|
|
622
|
-
{
|
|
623
|
-
my ($attr_text, $attr) = @_;
|
|
624
|
-
|
|
625
|
-
my $value = "none";
|
|
626
|
-
if ($attr_text =~ /^.*$attr=\"(.+?)\".*$/)
|
|
627
|
-
{
|
|
628
|
-
$value = $1;
|
|
629
|
-
}
|
|
630
|
-
|
|
631
|
-
return $value;
|
|
632
|
-
}
|
|
633
|
-
|
|
634
|
-
sub CheckFontAttr
|
|
635
|
-
{
|
|
636
|
-
my ($attr_text, $attr, $attr_hash, $count) = @_;
|
|
637
|
-
|
|
638
|
-
if ($attr_text =~ /^.*$attr=\"(.+?)\".*$/)
|
|
639
|
-
{
|
|
640
|
-
my $attr_value = $1;
|
|
641
|
-
$attr_hash->{$attr_value} = $attr_hash->{$attr_value} ? ($attr_hash->{$attr_value} + $count) : $count;
|
|
642
|
-
}
|
|
643
|
-
}
|
|
644
|
-
|
|
645
|
-
sub ProcessTable
|
|
646
|
-
{
|
|
647
|
-
my ($input_text, $is_pic) = @_;
|
|
648
|
-
|
|
649
|
-
# For table cell object
|
|
650
|
-
my $is_cell = 0;
|
|
651
|
-
|
|
652
|
-
my $all_text = "";
|
|
653
|
-
my $text = "";
|
|
654
|
-
|
|
655
|
-
my @lines = split(/\n/, $input_text);
|
|
656
|
-
|
|
657
|
-
my %table_pos = (); # $table_pos{$cellText} = "$l-$t-$r-$bottom"
|
|
658
|
-
my %table = (); # $table{$row}->{$col} = \@para_texts
|
|
659
|
-
my $row_from;
|
|
660
|
-
my $col_from;
|
|
661
|
-
my $row_till;
|
|
662
|
-
my $col_till;
|
|
663
|
-
|
|
664
|
-
# xml feature
|
|
665
|
-
my $align = "none";
|
|
666
|
-
my $pos = -1;
|
|
667
|
-
foreach my $line (@lines)
|
|
668
|
-
{
|
|
669
|
-
if ($line =~ /^<table (.+?)>$/)
|
|
670
|
-
{
|
|
671
|
-
my $attr = $1;
|
|
672
|
-
|
|
673
|
-
if($is_markup) { $markup_output .= "### Table $attr\n"; }
|
|
674
|
-
|
|
675
|
-
# Fix: wrong regex sequence, huydhn
|
|
676
|
-
#if ($attr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*alignment=\"(.+?)\".*$/)
|
|
677
|
-
#{
|
|
678
|
-
# my ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
|
|
679
|
-
# $align = $5;
|
|
680
|
-
|
|
681
|
-
# # pos feature
|
|
682
|
-
# $pos = ($t+$bottom)/2.0;
|
|
683
|
-
|
|
684
|
-
# if($pos < $g_minpos) { $g_minpos = $pos; }
|
|
685
|
-
# if($pos > $g_maxpos) { $g_maxpos = $pos; }
|
|
686
|
-
#}
|
|
687
|
-
#else
|
|
688
|
-
#{
|
|
689
|
-
# print STDERR "# no table alignment or location \"$line\"\n";
|
|
690
|
-
# $align = "";
|
|
691
|
-
#}
|
|
692
|
-
|
|
693
|
-
my ($l, $t, $r, $bottom) = undef;
|
|
694
|
-
if ($attr =~ /^.*l=\"(\d+)\".*$/) { $l = $1; }
|
|
695
|
-
if ($attr =~ /^.*t=\"(\d+)\".*$/) { $t = $1; }
|
|
696
|
-
if ($attr =~ /^.*r=\"(\d+)\".*$/) { $r = $1; }
|
|
697
|
-
if ($attr =~ /^.*b=\"(\d+)\".*$/) { $bottom = $1; }
|
|
698
|
-
|
|
699
|
-
if ($t && $bottom)
|
|
700
|
-
{
|
|
701
|
-
# pos feature
|
|
702
|
-
$pos = ($t + $bottom) / 2.0;
|
|
703
|
-
|
|
704
|
-
if($pos < $g_minpos) { $g_minpos = $pos; }
|
|
705
|
-
if($pos > $g_maxpos) { $g_maxpos = $pos; }
|
|
706
|
-
}
|
|
707
|
-
else
|
|
708
|
-
{
|
|
709
|
-
die "# Undefined table location \"$line\"\n";
|
|
710
|
-
}
|
|
711
|
-
|
|
712
|
-
if ($attr =~ /^.*alignment=\"(\d+)\".*$/)
|
|
713
|
-
{
|
|
714
|
-
$align = $1;
|
|
715
|
-
}
|
|
716
|
-
else
|
|
717
|
-
{
|
|
718
|
-
print STDERR "# no table alignment \"$line\"\n";
|
|
719
|
-
$align = "";
|
|
720
|
-
}
|
|
721
|
-
# End.
|
|
722
|
-
}
|
|
723
|
-
elsif ($line =~ /^<cell .*gridColFrom=\"(\d+)\" gridColTill=\"(\d+)\" gridRowFrom=\"(\d+)\" gridRowTill=\"(\d+)\".*>$/) # new cell
|
|
724
|
-
{
|
|
725
|
-
$col_from = $1;
|
|
726
|
-
$col_till = $2;
|
|
727
|
-
$row_from = $3;
|
|
728
|
-
$row_till = $4;
|
|
729
|
-
#print STDERR "$row_from $row_till $col_from $col_till\n";
|
|
730
|
-
$is_cell = 1;
|
|
731
|
-
}
|
|
732
|
-
elsif ($line =~ /^<\/cell>$/) # end cell
|
|
733
|
-
{
|
|
734
|
-
my @para_texts = ();
|
|
735
|
-
ProcessCell($text, \@para_texts, \%table_pos, $is_pic);
|
|
736
|
-
|
|
737
|
-
for(my $i = $row_from; $i<=$row_till; $i++)
|
|
738
|
-
{
|
|
739
|
-
for(my $j = $col_from; $j<=$col_till; $j++)
|
|
740
|
-
{
|
|
741
|
-
if(!$table{$i}) { $table{$i} = (); }
|
|
742
|
-
if(!$table{$i}->{$j}) { $table{$i}->{$j} = (); }
|
|
743
|
-
|
|
744
|
-
if($i == $row_from && $j == $col_from)
|
|
745
|
-
{
|
|
746
|
-
push(@{$table{$i}->{$j}}, @para_texts);
|
|
747
|
-
if(scalar(@para_texts) > 1) { last; }
|
|
748
|
-
}
|
|
749
|
-
else
|
|
750
|
-
{
|
|
751
|
-
push(@{$table{$i}->{$j}}, ""); #add stub "" for spanning rows or cols
|
|
752
|
-
}
|
|
753
|
-
}
|
|
754
|
-
}
|
|
755
|
-
|
|
756
|
-
$is_cell = 0;
|
|
757
|
-
$text = "";
|
|
758
|
-
}
|
|
759
|
-
elsif($is_cell)
|
|
760
|
-
{
|
|
761
|
-
$text .= $line."\n";
|
|
762
|
-
next;
|
|
763
|
-
}
|
|
764
|
-
}
|
|
765
|
-
|
|
766
|
-
# note: such a complicated code is because in the normal node, Omnipage doesn't seem to strictly print column by column given a row is fixed.
|
|
767
|
-
# E.g if col1: paraText1, col2: paraText21\n$paraText22, and col3: paraText31\n$paraText32
|
|
768
|
-
# It will print paraText1\tparaText21\tparaText31\n\t$paraText22\t$paraText32
|
|
769
|
-
my @sorted_rows = sort {$a <=> $b} keys %table;
|
|
770
|
-
my $is_first_line_para = 1;
|
|
771
|
-
foreach my $row (@sorted_rows)
|
|
772
|
-
{
|
|
773
|
-
my %table_r = %{$table{$row}};
|
|
774
|
-
my @sorted_cols = sort {$a <=> $b} keys %table_r;
|
|
775
|
-
while(1)
|
|
776
|
-
{
|
|
777
|
-
my $is_stop = 1;
|
|
778
|
-
my $row_text = "";
|
|
779
|
-
|
|
780
|
-
foreach my $col (@sorted_cols)
|
|
781
|
-
{
|
|
782
|
-
# there's still some thing to process
|
|
783
|
-
if(scalar(@{$table_r{$col}}) > 0)
|
|
784
|
-
{
|
|
785
|
-
$is_stop = 0;
|
|
786
|
-
$row_text .= shift(@{$table_r{$col}});
|
|
787
|
-
}
|
|
788
|
-
$row_text .= "\t";
|
|
789
|
-
}
|
|
790
|
-
|
|
791
|
-
if ((!$is_allow_empty && $row_text =~ /^\s*$/) || ($is_allow_empty && $row_text eq ""))
|
|
792
|
-
{
|
|
793
|
-
$is_stop = 1;
|
|
794
|
-
}
|
|
795
|
-
|
|
796
|
-
if($is_stop)
|
|
797
|
-
{
|
|
798
|
-
last;
|
|
799
|
-
}
|
|
800
|
-
else
|
|
801
|
-
{
|
|
802
|
-
$row_text =~ s/\t$/\n/;
|
|
803
|
-
$all_text .= $row_text;
|
|
804
|
-
# print STDERR "$row_text";
|
|
805
|
-
|
|
806
|
-
# para
|
|
807
|
-
if($is_first_line_para)
|
|
808
|
-
{
|
|
809
|
-
push(@g_para, "yes");
|
|
810
|
-
$is_first_line_para = 0;
|
|
811
|
-
}
|
|
812
|
-
else
|
|
813
|
-
{
|
|
814
|
-
push(@g_para, "no");
|
|
815
|
-
}
|
|
816
|
-
|
|
817
|
-
if($is_xml_feature)
|
|
818
|
-
{
|
|
819
|
-
# table feature
|
|
820
|
-
push(@g_table, "yes");
|
|
821
|
-
|
|
822
|
-
# pic feature
|
|
823
|
-
if($is_pic)
|
|
824
|
-
{
|
|
825
|
-
push(@g_pic, "yes");
|
|
826
|
-
}
|
|
827
|
-
else
|
|
828
|
-
{
|
|
829
|
-
push(@g_pic, "no");
|
|
830
|
-
}
|
|
831
|
-
|
|
832
|
-
push(@g_pos_hash, $pos); # update xml pos value
|
|
833
|
-
push(@g_align, $align); # update xml alignment value
|
|
834
|
-
|
|
835
|
-
### Not assign value ###
|
|
836
|
-
push(@g_font_size, -1); # fontSize feature
|
|
837
|
-
push(@g_font_face, "none"); # fontFace feature
|
|
838
|
-
push(@g_bold, "no"); # bold feature
|
|
839
|
-
push(@g_italic, "no"); # italic feature
|
|
840
|
-
push(@g_bullet, "no"); # bullet feature
|
|
841
|
-
# push(@gSpace, "none"); # space feature
|
|
842
|
-
} # end if xml feature
|
|
843
|
-
}
|
|
844
|
-
}
|
|
845
|
-
}
|
|
846
|
-
|
|
847
|
-
return $all_text;
|
|
848
|
-
}
|
|
849
|
-
|
|
850
|
-
sub ProcessCell
|
|
851
|
-
{
|
|
852
|
-
my ($input_text, $para_texts, $table_pos, $is_pic) = @_;
|
|
853
|
-
|
|
854
|
-
my $text = "";
|
|
855
|
-
my @lines = split(/\n/, $input_text);
|
|
856
|
-
my $is_para = 0;
|
|
857
|
-
my $flag = 0;
|
|
858
|
-
foreach my $line (@lines)
|
|
859
|
-
{
|
|
860
|
-
if ($line =~ /^<para (.*)>$/)
|
|
861
|
-
{
|
|
862
|
-
$text .= $line."\n"; # we need the header
|
|
863
|
-
$is_para = 1;
|
|
864
|
-
|
|
865
|
-
if($is_markup)
|
|
866
|
-
{
|
|
867
|
-
$markup_output .= "## ParaTable $1\n";
|
|
868
|
-
}
|
|
869
|
-
}
|
|
870
|
-
elsif ($line =~ /^<\/para>$/)
|
|
871
|
-
{
|
|
872
|
-
my ($para_text, $l, $t, $r, $b) = ProcessPara($text, 1, $is_pic);
|
|
873
|
-
my @tokens = split(/\n/, $para_text);
|
|
874
|
-
|
|
875
|
-
foreach my $token (@tokens)
|
|
876
|
-
{
|
|
877
|
-
if($token ne "")
|
|
878
|
-
{
|
|
879
|
-
push(@{$para_texts}, $token);
|
|
880
|
-
$flag = 1;
|
|
881
|
-
}
|
|
882
|
-
}
|
|
883
|
-
|
|
884
|
-
if(!$table_pos->{$para_text})
|
|
885
|
-
{
|
|
886
|
-
$table_pos->{$para_text} = "$l-$t-$r-$b";
|
|
887
|
-
}
|
|
888
|
-
else
|
|
889
|
-
{
|
|
890
|
-
#print STDERR "#! Warning: in method processCell, encounter the same para_text $para_text\n";
|
|
891
|
-
}
|
|
892
|
-
|
|
893
|
-
$is_para = 0;
|
|
894
|
-
$text = "";
|
|
895
|
-
}
|
|
896
|
-
elsif ($is_para)
|
|
897
|
-
{
|
|
898
|
-
$text .= $line."\n";
|
|
899
|
-
next;
|
|
900
|
-
}
|
|
901
|
-
}
|
|
902
|
-
|
|
903
|
-
# at least one value should be added for cell which is ""
|
|
904
|
-
if ($flag == 0)
|
|
905
|
-
{
|
|
906
|
-
push(@{$para_texts}, "");
|
|
907
|
-
}
|
|
908
|
-
}
|
|
909
|
-
|
|
910
|
-
sub ProcessPara
|
|
911
|
-
{
|
|
912
|
-
my ($input_text, $is_cell, $is_pic) = @_;
|
|
913
|
-
|
|
914
|
-
my $is_space = 0;
|
|
915
|
-
my $is_special_space = 0;
|
|
916
|
-
my $is_tab = 0;
|
|
917
|
-
my $is_bullet = 0;
|
|
918
|
-
my $is_forced_eof = "none"; # 3 signals for end of L: forcedEOF=\"true\" in attribute of <ln> or || <nl orig=\"true\"\/> || end of </para> without encountering any of the above signal in the para plus $is_space = 0
|
|
919
|
-
|
|
920
|
-
# XML feature
|
|
921
|
-
my $align = "none";
|
|
922
|
-
my ($l, $t, $r, $bottom);
|
|
923
|
-
|
|
924
|
-
my %font_size_hash = ();
|
|
925
|
-
my %font_face_hash = ();
|
|
926
|
-
my @bold_array = ();
|
|
927
|
-
my @italic_array = ();
|
|
928
|
-
my $space = "none";
|
|
929
|
-
|
|
930
|
-
my $ln_attr;
|
|
931
|
-
my $is_ln = 0;
|
|
932
|
-
my $ln_bold = "none";
|
|
933
|
-
my $ln_italic = "none";
|
|
934
|
-
|
|
935
|
-
my $run_attr;
|
|
936
|
-
my $run_text = "";
|
|
937
|
-
my $is_run = 0;
|
|
938
|
-
my $run_bold = "none";
|
|
939
|
-
my $run_italic = "none";
|
|
940
|
-
|
|
941
|
-
my $wd_attr;
|
|
942
|
-
my $wd_text = "";
|
|
943
|
-
my $is_wd = 0;
|
|
944
|
-
|
|
945
|
-
# Word index in a line. When encountering </ln>, this parameter indicates the number of words in a line
|
|
946
|
-
my $wd_index = 0;
|
|
947
|
-
my $ln_bold_count = 0;
|
|
948
|
-
my $ln_italic_count = 0;
|
|
949
|
-
|
|
950
|
-
my $all_text = "";
|
|
951
|
-
# Invariant: when never enter a new line, $text will be copied into $all_text, and $text is cleared
|
|
952
|
-
my $text = "";
|
|
953
|
-
|
|
954
|
-
binmode(STDERR, ":utf8");
|
|
955
|
-
|
|
956
|
-
my $is_first_line_para = 1;
|
|
957
|
-
my @lines = split(/\n/, $input_text);
|
|
958
|
-
|
|
959
|
-
for (my $i=0; $i < scalar(@lines); $i++)
|
|
960
|
-
{
|
|
961
|
-
my $line = $lines[$i];
|
|
962
|
-
|
|
963
|
-
# New para
|
|
964
|
-
if ($line =~ /^<para (.+?)>$/)
|
|
965
|
-
{
|
|
966
|
-
my $attr = $1;
|
|
967
|
-
$align = GetAttrValue($attr, "alignment");
|
|
968
|
-
# $indent = GetAttrValue($attr, "li");
|
|
969
|
-
$space = GetAttrValue($attr, "spaceBefore");
|
|
970
|
-
}
|
|
971
|
-
# New ln
|
|
972
|
-
elsif ($line =~ /^<ln (.+)>$/)
|
|
973
|
-
{
|
|
974
|
-
$ln_attr = $1;
|
|
975
|
-
$is_ln = 1;
|
|
976
|
-
|
|
977
|
-
if ($is_markup) { $markup_output .= "# Line $ln_attr\n"; }
|
|
978
|
-
|
|
979
|
-
# Fix: wrong regex sequence, huydhn
|
|
980
|
-
#if ($ln_attr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*$/)
|
|
981
|
-
#{
|
|
982
|
-
# ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
|
|
983
|
-
#}
|
|
984
|
-
|
|
985
|
-
if ($ln_attr =~ /^.*l=\"(\d+)\".*$/) { $l = $1; } else { $l = undef; }
|
|
986
|
-
if ($ln_attr =~ /^.*t=\"(\d+)\".*$/) { $t = $1; } else { $t = undef; }
|
|
987
|
-
if ($ln_attr =~ /^.*r=\"(\d+)\".*$/) { $r = $1; } else { $r = undef; }
|
|
988
|
-
if ($ln_attr =~ /^.*b=\"(\d+)\".*$/) { $bottom = $1; } else { $bottom = undef; }
|
|
989
|
-
# End.
|
|
990
|
-
|
|
991
|
-
$is_forced_eof = GetAttrValue($ln_attr, "forcedEOF");
|
|
992
|
-
|
|
993
|
-
# Bold & Italic
|
|
994
|
-
if ($is_xml_feature)
|
|
995
|
-
{
|
|
996
|
-
$ln_bold = GetAttrValue($ln_attr, "bold");
|
|
997
|
-
$ln_italic = GetAttrValue($ln_attr, "italic");
|
|
998
|
-
}
|
|
999
|
-
}
|
|
1000
|
-
# New run
|
|
1001
|
-
elsif ($line =~ /<run (.*)>$/)
|
|
1002
|
-
{
|
|
1003
|
-
$run_attr = $1;
|
|
1004
|
-
$is_space = 0;
|
|
1005
|
-
$is_tab = 0;
|
|
1006
|
-
$is_run = 1;
|
|
1007
|
-
|
|
1008
|
-
# New wd, that consists of many runs
|
|
1009
|
-
if ($line =~ /^<wd (.*?)>/)
|
|
1010
|
-
{
|
|
1011
|
-
$is_wd = 1;
|
|
1012
|
-
$wd_attr = $1;
|
|
1013
|
-
}
|
|
1014
|
-
|
|
1015
|
-
# Bold & Italic
|
|
1016
|
-
if ($is_xml_feature)
|
|
1017
|
-
{
|
|
1018
|
-
$run_bold = GetAttrValue($run_attr, "bold");
|
|
1019
|
-
$run_italic = GetAttrValue($run_attr, "italic");
|
|
1020
|
-
}
|
|
1021
|
-
}
|
|
1022
|
-
# Word
|
|
1023
|
-
elsif ($line =~ /^<wd (.+)?>(.+)<\/wd>$/)
|
|
1024
|
-
{
|
|
1025
|
-
$wd_attr = $1;
|
|
1026
|
-
my $word = $2;
|
|
1027
|
-
$is_space = 0;
|
|
1028
|
-
$is_tab = 0;
|
|
1029
|
-
|
|
1030
|
-
if ($is_markup)
|
|
1031
|
-
{
|
|
1032
|
-
$markup_output .= "$word $wd_attr\n";
|
|
1033
|
-
# If both bold and italic, then just use one
|
|
1034
|
-
if ($is_run && $run_attr =~ /(bold|italic)=\"true\"/)
|
|
1035
|
-
{
|
|
1036
|
-
$markup_output .= " $1=\"true\"";
|
|
1037
|
-
}
|
|
1038
|
-
$markup_output .= "\n";
|
|
1039
|
-
}
|
|
1040
|
-
|
|
1041
|
-
# FontSize & FontFace
|
|
1042
|
-
if ($is_xml_feature)
|
|
1043
|
-
{
|
|
1044
|
-
CheckFontAttr($wd_attr, "fontSize", \%font_size_hash, 1);
|
|
1045
|
-
CheckFontAttr($wd_attr, "fontFace", \%font_face_hash, 1);
|
|
1046
|
-
}
|
|
1047
|
-
|
|
1048
|
-
# Bold & Italic
|
|
1049
|
-
if ($is_xml_feature)
|
|
1050
|
-
{
|
|
1051
|
-
my $wd_bold = GetAttrValue($wd_attr, "bold");
|
|
1052
|
-
my $wd_italic = GetAttrValue($wd_attr, "italic");
|
|
1053
|
-
|
|
1054
|
-
if ($wd_bold eq "true" || $run_bold eq "true" || $ln_bold eq "true")
|
|
1055
|
-
{
|
|
1056
|
-
$bold_array[$wd_index] = 1;
|
|
1057
|
-
$ln_bold_count++;
|
|
1058
|
-
}
|
|
1059
|
-
|
|
1060
|
-
if ($wd_italic eq "true" || $run_italic eq "true" || $ln_italic eq "true")
|
|
1061
|
-
{
|
|
1062
|
-
$italic_array[$wd_index] = 1;
|
|
1063
|
-
$ln_italic_count++;
|
|
1064
|
-
}
|
|
1065
|
-
}
|
|
1066
|
-
|
|
1067
|
-
# Add text
|
|
1068
|
-
$text .= "$word";
|
|
1069
|
-
|
|
1070
|
-
if ($is_run)
|
|
1071
|
-
{
|
|
1072
|
-
$run_text .= "$word ";
|
|
1073
|
-
}
|
|
1074
|
-
|
|
1075
|
-
$wd_index++;
|
|
1076
|
-
}
|
|
1077
|
-
# End wd
|
|
1078
|
-
elsif ($line =~ /^<\/wd>$/)
|
|
1079
|
-
{
|
|
1080
|
-
$is_wd = 0;
|
|
1081
|
-
|
|
1082
|
-
if ($is_markup)
|
|
1083
|
-
{
|
|
1084
|
-
$markup_output .= "$wd_text $wd_attr\n";
|
|
1085
|
-
# If both bold and italic, then just use one
|
|
1086
|
-
if ($is_run && $run_attr =~ /(bold|italic)=\"true\"/)
|
|
1087
|
-
{
|
|
1088
|
-
$markup_output .= " $1=\"true\"";
|
|
1089
|
-
}
|
|
1090
|
-
|
|
1091
|
-
$markup_output .= "\n";
|
|
1092
|
-
$wd_attr = "";
|
|
1093
|
-
}
|
|
1094
|
-
}
|
|
1095
|
-
# End run
|
|
1096
|
-
elsif ($line =~ /^(.*)<\/run>$/)
|
|
1097
|
-
{
|
|
1098
|
-
my $word = $1;
|
|
1099
|
-
|
|
1100
|
-
# Add text
|
|
1101
|
-
if ($word ne "")
|
|
1102
|
-
{
|
|
1103
|
-
# Bold & Italic
|
|
1104
|
-
if ($is_xml_feature)
|
|
1105
|
-
{
|
|
1106
|
-
if ($run_bold eq "true" || $ln_bold eq "true")
|
|
1107
|
-
{
|
|
1108
|
-
$bold_array[$wd_index] = 1;
|
|
1109
|
-
$ln_bold_count++;
|
|
1110
|
-
}
|
|
1111
|
-
|
|
1112
|
-
if ($run_italic eq "true" || $ln_italic eq "true")
|
|
1113
|
-
{
|
|
1114
|
-
$italic_array[$wd_index] = 1;
|
|
1115
|
-
$ln_italic_count++;
|
|
1116
|
-
}
|
|
1117
|
-
}
|
|
1118
|
-
|
|
1119
|
-
# Appear in the final result
|
|
1120
|
-
if ($is_ln) { $text .= "$word"; }
|
|
1121
|
-
|
|
1122
|
-
# For internal record
|
|
1123
|
-
if ($is_run) { $run_text .= $word . " "; }
|
|
1124
|
-
if ($is_wd) { $wd_text .= $word; }
|
|
1125
|
-
|
|
1126
|
-
$wd_index++;
|
|
1127
|
-
}
|
|
1128
|
-
|
|
1129
|
-
# Xml feature
|
|
1130
|
-
# Not a space, tab or new-line run
|
|
1131
|
-
if ($is_xml_feature && $run_text ne "")
|
|
1132
|
-
{
|
|
1133
|
-
my @words = split(/\s+/, $run_text);
|
|
1134
|
-
my $num_words = scalar(@words);
|
|
1135
|
-
CheckFontAttr($run_attr, "fontSize", \%font_size_hash, $num_words, 1);
|
|
1136
|
-
CheckFontAttr($run_attr, "fontFace", \%font_face_hash, $num_words, 1);
|
|
1137
|
-
}
|
|
1138
|
-
|
|
1139
|
-
# Reset run
|
|
1140
|
-
# <run> not enclosed within <ln>
|
|
1141
|
-
if (!$is_ln)
|
|
1142
|
-
{
|
|
1143
|
-
$wd_index = 0;
|
|
1144
|
-
}
|
|
1145
|
-
|
|
1146
|
-
$run_text = "";
|
|
1147
|
-
$is_run = 0;
|
|
1148
|
-
$is_special_space = 0;
|
|
1149
|
-
|
|
1150
|
-
# Bold & Italic
|
|
1151
|
-
if ($is_xml_feature)
|
|
1152
|
-
{
|
|
1153
|
-
$run_bold = "none";
|
|
1154
|
-
$run_italic = "none";
|
|
1155
|
-
|
|
1156
|
-
# <run> not enclosed within <ln>
|
|
1157
|
-
if(!$is_ln)
|
|
1158
|
-
{
|
|
1159
|
-
$ln_bold_count = 0;
|
|
1160
|
-
$ln_italic_count = 0;
|
|
1161
|
-
}
|
|
1162
|
-
}
|
|
1163
|
-
}
|
|
1164
|
-
# End ln
|
|
1165
|
-
elsif ($line =~ /^<\/ln>$/)
|
|
1166
|
-
{
|
|
1167
|
-
if((!$is_allow_empty && $text !~ /^\s*$/) || ($is_allow_empty && $text ne ""))
|
|
1168
|
-
{
|
|
1169
|
-
if ($is_forced_eof eq "true" || (!$is_cell && !$is_special_space) )
|
|
1170
|
-
{
|
|
1171
|
-
$text .= "\n";
|
|
1172
|
-
# Update all_text
|
|
1173
|
-
$all_text .= $text;
|
|
1174
|
-
$text = "";
|
|
1175
|
-
}
|
|
1176
|
-
|
|
1177
|
-
my $num_words = $wd_index;
|
|
1178
|
-
|
|
1179
|
-
if (!$is_cell)
|
|
1180
|
-
{
|
|
1181
|
-
if ($is_first_line_para)
|
|
1182
|
-
{
|
|
1183
|
-
push(@g_para, "yes");
|
|
1184
|
-
$is_first_line_para = 0;
|
|
1185
|
-
}
|
|
1186
|
-
else
|
|
1187
|
-
{
|
|
1188
|
-
push(@g_para, "no");
|
|
1189
|
-
}
|
|
1190
|
-
}
|
|
1191
|
-
|
|
1192
|
-
if ($is_xml_feature && $num_words >= 1)
|
|
1193
|
-
{
|
|
1194
|
-
# XML feature
|
|
1195
|
-
# Assumtion that: font_size is either occur in <ln>, or within multiple <run> under <ln>, but not both
|
|
1196
|
-
CheckFontAttr($ln_attr, "fontSize", \%font_size_hash, $num_words);
|
|
1197
|
-
CheckFontAttr($ln_attr, "fontFace", \%font_face_hash, $num_words);
|
|
1198
|
-
}
|
|
1199
|
-
|
|
1200
|
-
if ($is_xml_feature && !$is_cell && !$is_special_space)
|
|
1201
|
-
{
|
|
1202
|
-
my $pos = ($t + $bottom)/2.0;
|
|
1203
|
-
if ($pos < $g_minpos) { $g_minpos = $pos; }
|
|
1204
|
-
if ($pos > $g_maxpos) { $g_maxpos = $pos; }
|
|
1205
|
-
|
|
1206
|
-
push(@g_pos_hash, $pos); # pos feature
|
|
1207
|
-
push(@g_align, $align); # alignment feature
|
|
1208
|
-
push(@g_table, "no"); # table feature
|
|
1209
|
-
|
|
1210
|
-
if ($is_pic)
|
|
1211
|
-
{
|
|
1212
|
-
push(@g_pic, "yes");
|
|
1213
|
-
|
|
1214
|
-
# Not assign value
|
|
1215
|
-
push(@g_font_size, -1); # bold feature
|
|
1216
|
-
push(@g_font_face, "none"); # bold feature
|
|
1217
|
-
push(@g_bold, "no"); # bold feature
|
|
1218
|
-
push(@g_italic, "no"); # italic feature
|
|
1219
|
-
push(@g_bullet, "no"); # bullet feature
|
|
1220
|
-
}
|
|
1221
|
-
else
|
|
1222
|
-
{
|
|
1223
|
-
push(@g_pic, "no");
|
|
1224
|
-
|
|
1225
|
-
UpdateXMLFontFeature(\%font_size_hash, \%font_face_hash);
|
|
1226
|
-
|
|
1227
|
-
%font_size_hash = ();
|
|
1228
|
-
%font_face_hash = ();
|
|
1229
|
-
|
|
1230
|
-
UpdateXMLFeatures($ln_bold_count, $ln_italic_count, $num_words, $is_bullet, $space);
|
|
1231
|
-
}
|
|
1232
|
-
}
|
|
1233
|
-
}
|
|
1234
|
-
|
|
1235
|
-
# Reset ln
|
|
1236
|
-
$is_ln = 0;
|
|
1237
|
-
$is_forced_eof = "none";
|
|
1238
|
-
$is_special_space = 0;
|
|
1239
|
-
$wd_index = 0;
|
|
1240
|
-
|
|
1241
|
-
# Bold & Italic
|
|
1242
|
-
if ($is_xml_feature)
|
|
1243
|
-
{
|
|
1244
|
-
$ln_bold = "none";
|
|
1245
|
-
$ln_italic = "none";
|
|
1246
|
-
$ln_bold_count = 0;
|
|
1247
|
-
$ln_italic_count = 0;
|
|
1248
|
-
}
|
|
1249
|
-
}
|
|
1250
|
-
# Newline signal
|
|
1251
|
-
elsif ($line =~ /^<nl orig=\"true\"\/>$/)
|
|
1252
|
-
{
|
|
1253
|
-
if($is_ln)
|
|
1254
|
-
{
|
|
1255
|
-
$is_space = 0;
|
|
1256
|
-
}
|
|
1257
|
-
else
|
|
1258
|
-
{
|
|
1259
|
-
if($is_debug)
|
|
1260
|
-
{
|
|
1261
|
-
print STDERR "#!!! Warning: found <nl orig=\"true\"\/> while not in tag <ln>: $line\n";
|
|
1262
|
-
}
|
|
1263
|
-
}
|
|
1264
|
-
}
|
|
1265
|
-
# Space
|
|
1266
|
-
elsif ($line =~ /^<space\/>$/)
|
|
1267
|
-
{
|
|
1268
|
-
my $start_tag = "";
|
|
1269
|
-
my $end_tag = "";
|
|
1270
|
-
|
|
1271
|
-
if ($i>0 && $lines[$i-1] =~ /^<(.+?)\b.*/) { $start_tag = $1; }
|
|
1272
|
-
|
|
1273
|
-
if ($i < (scalar(@lines) -1) && $lines[$i+1] =~ /^<\/(.+)>/) { $end_tag = $1; }
|
|
1274
|
-
|
|
1275
|
-
if ($start_tag eq $end_tag && $start_tag ne "")
|
|
1276
|
-
{
|
|
1277
|
-
# print STDERR "# Special space after \"$text\"\n";
|
|
1278
|
-
$is_special_space = 1;
|
|
1279
|
-
}
|
|
1280
|
-
|
|
1281
|
-
# Add text
|
|
1282
|
-
$text .= " ";
|
|
1283
|
-
$is_space = 1;
|
|
1284
|
-
}
|
|
1285
|
-
# Tab
|
|
1286
|
-
elsif ($line =~ /^<tab .*\/>$/)
|
|
1287
|
-
{
|
|
1288
|
-
# Add text
|
|
1289
|
-
$text .= "\t";
|
|
1290
|
-
$is_tab = 1;
|
|
1291
|
-
}
|
|
1292
|
-
# Bullet
|
|
1293
|
-
elsif ($line =~ /^<bullet .*>$/)
|
|
1294
|
-
{
|
|
1295
|
-
$is_bullet = 1;
|
|
1296
|
-
}
|
|
1297
|
-
}
|
|
1298
|
-
|
|
1299
|
-
$all_text .= $text;
|
|
1300
|
-
return ($all_text, $l, $t, $r, $bottom, $is_space);
|
|
1301
|
-
}
|
|
1302
|
-
|
|
1303
|
-
sub UpdateXMLFontFeature
|
|
1304
|
-
{
|
|
1305
|
-
my ($font_size_hash, $font_face_hash) = @_;
|
|
1306
|
-
|
|
1307
|
-
# Font size feature
|
|
1308
|
-
if (scalar(keys %{$font_size_hash}) == 0)
|
|
1309
|
-
{
|
|
1310
|
-
push(@g_font_size, -1);
|
|
1311
|
-
}
|
|
1312
|
-
else
|
|
1313
|
-
{
|
|
1314
|
-
my @sorted_fonts = sort { $font_size_hash->{$b} <=> $font_size_hash->{$a} } keys %{$font_size_hash};
|
|
1315
|
-
|
|
1316
|
-
my $font_size = $sorted_fonts[0];
|
|
1317
|
-
push(@g_font_size, $font_size);
|
|
1318
|
-
|
|
1319
|
-
$g_font_size_hash{$font_size} = $g_font_size_hash{$font_size} ? ($g_font_size_hash{$font_size}+1) : 1;
|
|
1320
|
-
}
|
|
1321
|
-
|
|
1322
|
-
# Font face feature
|
|
1323
|
-
if (scalar(keys %{$font_face_hash}) == 0)
|
|
1324
|
-
{
|
|
1325
|
-
push(@g_font_face, "none");
|
|
1326
|
-
}
|
|
1327
|
-
else
|
|
1328
|
-
{
|
|
1329
|
-
my @sorted_fonts = sort { $font_face_hash->{$b} <=> $font_face_hash->{$a} } keys %{$font_face_hash};
|
|
1330
|
-
my $font_face = $sorted_fonts[0];
|
|
1331
|
-
push(@g_font_face, $font_face);
|
|
1332
|
-
|
|
1333
|
-
$g_font_face_hash{$font_face} = $g_font_face_hash{$font_face} ? ($g_font_face_hash{$font_face}+1) : 1;
|
|
1334
|
-
}
|
|
1335
|
-
}
|
|
1336
|
-
|
|
1337
|
-
sub UpdateXMLFeatures
|
|
1338
|
-
{
|
|
1339
|
-
my ($ln_bold_count, $ln_italic_count, $num_words, $is_bullet, $space) = @_;
|
|
1340
|
-
|
|
1341
|
-
# Bold feature
|
|
1342
|
-
my $bold_feature;
|
|
1343
|
-
if ($ln_bold_count/$num_words >= 0.667)
|
|
1344
|
-
{
|
|
1345
|
-
$bold_feature = "yes";
|
|
1346
|
-
}
|
|
1347
|
-
else
|
|
1348
|
-
{
|
|
1349
|
-
$bold_feature = "no";
|
|
1350
|
-
}
|
|
1351
|
-
push(@g_bold, $bold_feature);
|
|
1352
|
-
|
|
1353
|
-
# Italic feature
|
|
1354
|
-
my $italic_feature;
|
|
1355
|
-
if ($ln_italic_count/$num_words >= 0.667)
|
|
1356
|
-
{
|
|
1357
|
-
$italic_feature = "yes";
|
|
1358
|
-
}
|
|
1359
|
-
else
|
|
1360
|
-
{
|
|
1361
|
-
$italic_feature = "no";
|
|
1362
|
-
}
|
|
1363
|
-
push(@g_italic, $italic_feature);
|
|
1364
|
-
|
|
1365
|
-
# Bullet feature
|
|
1366
|
-
if ($is_bullet)
|
|
1367
|
-
{
|
|
1368
|
-
push(@g_bullet, "yes");
|
|
1369
|
-
}
|
|
1370
|
-
else
|
|
1371
|
-
{
|
|
1372
|
-
push(@g_bullet, "no");
|
|
1373
|
-
}
|
|
1374
|
-
|
|
1375
|
-
# Space feature
|
|
1376
|
-
# push(@gSpace, $space);
|
|
1377
|
-
}
|
|
1378
|
-
|
|
1379
|
-
# Find the positions of header, body, and citation
|
|
1380
|
-
sub GetStructureInfo
|
|
1381
|
-
{
|
|
1382
|
-
my ($lines, $num_lines) = @_;
|
|
1383
|
-
|
|
1384
|
-
my ($body_length, $citation_length, $body_end_id) = SectLabel::PreProcess::findCitationText($lines, 0, $num_lines);
|
|
1385
|
-
|
|
1386
|
-
my ($header_length, $body_start_id);
|
|
1387
|
-
|
|
1388
|
-
($header_length, $body_length, $body_start_id) = SectLabel::PreProcess::findHeaderText($lines, 0, $body_length);
|
|
1389
|
-
|
|
1390
|
-
# Sanity check
|
|
1391
|
-
my $totalLength = $header_length + $body_length + $citation_length;
|
|
1392
|
-
|
|
1393
|
-
if ($num_lines != $totalLength)
|
|
1394
|
-
{
|
|
1395
|
-
print STDOUT "Die in getStructureInfo(): different num lines $num_lines != $totalLength\n"; # to display in Web
|
|
1396
|
-
die "Die in getStructureInfo(): different num lines $num_lines != $totalLength\n";
|
|
1397
|
-
}
|
|
1398
|
-
|
|
1399
|
-
return ($header_length, $body_length, $citation_length, $body_start_id, $body_end_id);
|
|
1400
|
-
}
|
|
1401
|
-
|
|
1402
|
-
# Count XML tags/values for statistics purpose
|
|
1403
|
-
sub ProcessTagInfo
|
|
1404
|
-
{
|
|
1405
|
-
my ($line, $tags) = @_;
|
|
1406
|
-
|
|
1407
|
-
my $tag;
|
|
1408
|
-
my $attr;
|
|
1409
|
-
|
|
1410
|
-
if ($line =~ /^<(.+?)\b(.*)/)
|
|
1411
|
-
{
|
|
1412
|
-
$tag = $1;
|
|
1413
|
-
$attr = $2;
|
|
1414
|
-
if (!$tags->{$tag})
|
|
1415
|
-
{
|
|
1416
|
-
$tags->{$tag} = ();
|
|
1417
|
-
}
|
|
1418
|
-
|
|
1419
|
-
if ($attr =~ /^\s*(.+?)\s*\/?>/)
|
|
1420
|
-
{
|
|
1421
|
-
$attr = $1;
|
|
1422
|
-
}
|
|
1423
|
-
|
|
1424
|
-
my @tokens = split(/\s+/, $attr);
|
|
1425
|
-
foreach my $token (@tokens)
|
|
1426
|
-
{
|
|
1427
|
-
if($token =~ /^(.+)=(.+)$/)
|
|
1428
|
-
{
|
|
1429
|
-
my $attr_name = $1;
|
|
1430
|
-
my $value = $2;
|
|
1431
|
-
|
|
1432
|
-
if (!$tags->{$tag}->{$attr_name})
|
|
1433
|
-
{
|
|
1434
|
-
$tags->{$tag}->{$attr_name} = ();
|
|
1435
|
-
}
|
|
1436
|
-
if (!$tags->{$tag}->{$attr_name}->{$value})
|
|
1437
|
-
{
|
|
1438
|
-
$tags->{$tag}->{$attr_name}->{$value} = 0;
|
|
1439
|
-
}
|
|
1440
|
-
$tags->{$tag}->{$attr_name}->{$value}++;
|
|
1441
|
-
}
|
|
1442
|
-
}
|
|
1443
|
-
}
|
|
1444
|
-
}
|
|
1445
|
-
|
|
1446
|
-
# Print tag info to file
|
|
1447
|
-
sub PrintTagInfo
|
|
1448
|
-
{
|
|
1449
|
-
my ($tags, $tag_file) = @_;
|
|
1450
|
-
|
|
1451
|
-
open(TAG, ">:utf8", "$tag_file") || die"#Can't open file \"$tag_file\"\n";
|
|
1452
|
-
|
|
1453
|
-
my @sortedTags = sort {$a cmp $b} keys %{$tags};
|
|
1454
|
-
|
|
1455
|
-
foreach(@sortedTags)
|
|
1456
|
-
{
|
|
1457
|
-
my @attrs = sort {$a cmp $b} keys %{$tags->{$_}};
|
|
1458
|
-
print TAG "# Tag = $_\n";
|
|
1459
|
-
|
|
1460
|
-
foreach my $attr (@attrs)
|
|
1461
|
-
{
|
|
1462
|
-
print TAG "$attr:";
|
|
1463
|
-
my @values = sort {$a cmp $b} keys %{$tags->{$_}->{$attr}};
|
|
1464
|
-
|
|
1465
|
-
foreach my $value (@values)
|
|
1466
|
-
{
|
|
1467
|
-
print TAG " $value-$tags->{$_}->{$attr}->{$value}";
|
|
1468
|
-
}
|
|
1469
|
-
|
|
1470
|
-
print TAG "\n";
|
|
1471
|
-
}
|
|
1472
|
-
}
|
|
1473
|
-
|
|
1474
|
-
close TAG;
|
|
1475
|
-
}
|
|
1476
|
-
|
|
1477
|
-
sub UntaintPath
|
|
1478
|
-
{
|
|
1479
|
-
my ($path) = @_;
|
|
1480
|
-
|
|
1481
|
-
if ( $path =~ /^([-_\/\w\.]*)$/ )
|
|
1482
|
-
{
|
|
1483
|
-
$path = $1;
|
|
1484
|
-
}
|
|
1485
|
-
else
|
|
1486
|
-
{
|
|
1487
|
-
die "Bad path \"$path\"\n";
|
|
1488
|
-
}
|
|
1489
|
-
|
|
1490
|
-
return $path;
|
|
1491
|
-
}
|
|
1492
|
-
|
|
1493
|
-
sub Untaint
|
|
1494
|
-
{
|
|
1495
|
-
my ($s) = @_;
|
|
1496
|
-
if ($s =~ /^([\w \-\@\(\),\.\/]+)$/)
|
|
1497
|
-
{
|
|
1498
|
-
$s = $1; # $data now untainted
|
|
1499
|
-
}
|
|
1500
|
-
else
|
|
1501
|
-
{
|
|
1502
|
-
die "Bad data in $s"; # log this somewhere
|
|
1503
|
-
}
|
|
1504
|
-
|
|
1505
|
-
return $s;
|
|
1506
|
-
}
|
|
1507
|
-
|
|
1508
|
-
sub Execute
|
|
1509
|
-
{
|
|
1510
|
-
my ($cmd) = @_;
|
|
1511
|
-
|
|
1512
|
-
if ($is_debug)
|
|
1513
|
-
{
|
|
1514
|
-
print STDERR "Executing: $cmd\n";
|
|
1515
|
-
}
|
|
1516
|
-
|
|
1517
|
-
$cmd = Untaint($cmd);
|
|
1518
|
-
system($cmd);
|
|
1519
|
-
}
|
|
1520
|
-
|
|
1521
|
-
sub NewTmpFile
|
|
1522
|
-
{
|
|
1523
|
-
my $tmpFile = `date '+%Y%m%d-%H%M%S-$$'`;
|
|
1524
|
-
chomp($tmpFile);
|
|
1525
|
-
return $tmpFile;
|
|
1526
|
-
}
|
|
1527
|
-
|
|
1528
|
-
|
|
1529
|
-
|