biblicit 2.0.3 → 2.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +0 -2
- data/biblicit.gemspec +1 -1
- data/parscit/bin/citeExtract.pl +9 -161
- data/parscit/bin/sectExtract.pl +0 -14
- data/parscit/lib/ParsCit/Controller.pm +0 -59
- data/parscit/lib/ParsCit/PreProcess.pm +0 -4
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1 -7
- metadata +4 -24
- data/parscit/bin/sectLabel/processOmniXML.pl +0 -1427
- data/parscit/bin/sectLabel/processOmniXML_new.pl +0 -1025
- data/parscit/bin/sectLabel/processOmniXMLv2.pl +0 -1529
- data/parscit/bin/sectLabel/processOmniXMLv3.pl +0 -964
- data/parscit/bin/sectLabel/simplifyOmniXML.pl +0 -382
- data/parscit/bin/xml2train.pl +0 -193
- data/parscit/lib/Omni/Config.pm +0 -93
- data/parscit/lib/Omni/Omnicell.pm +0 -263
- data/parscit/lib/Omni/Omnicol.pm +0 -292
- data/parscit/lib/Omni/Omnidd.pm +0 -328
- data/parscit/lib/Omni/Omnidoc.pm +0 -153
- data/parscit/lib/Omni/Omniframe.pm +0 -223
- data/parscit/lib/Omni/Omniline.pm +0 -423
- data/parscit/lib/Omni/Omnipage.pm +0 -282
- data/parscit/lib/Omni/Omnipara.pm +0 -232
- data/parscit/lib/Omni/Omnirun.pm +0 -303
- data/parscit/lib/Omni/Omnitable.pm +0 -336
- data/parscit/lib/Omni/Omniword.pm +0 -162
- data/parscit/lib/Omni/Traversal.pm +0 -313
- data/parscit/lib/SectLabel/AAMatching.pm +0 -1949
@@ -1,964 +0,0 @@
|
|
1
|
-
#!/usr/bin/perl
|
2
|
-
|
3
|
-
# Author: Do Hoang Nhat Huy <huydo@comp.nus.edu.sg>
|
4
|
-
# Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
|
5
|
-
|
6
|
-
require 5.0;
|
7
|
-
use strict;
|
8
|
-
|
9
|
-
# Dependencies
|
10
|
-
use FindBin;
|
11
|
-
use Getopt::Long;
|
12
|
-
use HTML::Entities;
|
13
|
-
|
14
|
-
# I do not know a better solution to find a lib path in -T mode.
|
15
|
-
# So if you know a better solution, I'd be glad to hear.
|
16
|
-
# See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
|
17
|
-
|
18
|
-
# To get correct path in case 2 scripts in different directories use FindBin
|
19
|
-
FindBin::again();
|
20
|
-
my $path = undef;
|
21
|
-
BEGIN
|
22
|
-
{
|
23
|
-
if ($FindBin::Bin =~ /(.*)/) { $path = $1; }
|
24
|
-
}
|
25
|
-
use lib "$path/../../lib";
|
26
|
-
|
27
|
-
use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
|
28
|
-
use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";
|
29
|
-
|
30
|
-
# Local libraries
|
31
|
-
use Omni::Config;
|
32
|
-
use Omni::Omnidoc;
|
33
|
-
use SectLabel::PreProcess;
|
34
|
-
|
35
|
-
# Omnilib configuration: object name
|
36
|
-
my $obj_list = $Omni::Config::obj_list;
|
37
|
-
|
38
|
-
### USER customizable section
|
39
|
-
$0 =~ /([^\/]+)$/; my $progname = $1;
|
40
|
-
my $version = "1.0";
|
41
|
-
### END user customizable section
|
42
|
-
|
43
|
-
# Print the copyright banner on STDERR.
sub License
{
	my $banner = "# Copyright 2011 \251 by Do Hoang Nhat Huy\n";
	print STDERR $banner;
}
|
47
|
-
|
48
|
-
# Print the usage text on STDERR.
# Reads the file-level $progname for the program name shown in the usage lines.
sub Help
{
	print STDERR
		"Process Omnipage XML output (concatenated results from all pages of a PDF file), and extract text lines together with other XML infos\n",
		"usage: $progname -h\t[invokes help]\n",
		" $progname -in xmlfile -out outfile [-decode] [-log]\n",
		"Options:\n",
		"\t-q \tQuiet Mode (don't echo license)\n",
		"\t-decode \tDecode HTML entities and then output, to avoid double entity encoding later\n";
}
|
57
|
-
|
58
|
-
# Command line state
my $quite		= 0;		# -q: do not print the license banner
my $help		= 0;		# -h, or set when option parsing fails
my $out_file	= undef;	# -out
my $in_file		= undef;	# -in
my $is_decode	= 0;		# -decode: decode HTML entities before writing
my $is_debug	= 0;		# -log
my $address		= 1;		# when 1, also write "<out_file>.address"

$help = 1 unless GetOptions(	'in=s' 		=> \$in_file,
								'out=s' 	=> \$out_file,
								'decode' 	=> \$is_decode,
								'log'		=> \$is_debug,
								'h' 		=> \$help,
								'q' 		=> \$quite	);

# Both -in and -out are mandatory
if ($help || ! defined $in_file || ! defined $out_file)
{
	Help();
	exit(0);
}

if (!$quite)
{
	License();
}

### Untaint ###
$in_file 		= UntaintPath($in_file);
$out_file 		= UntaintPath($out_file);
$ENV{'PATH'} 	= '/bin:/usr/bin:/usr/local/bin';
### End untaint ###

# The arrays below are parallel: index N of each one describes line N of @lines.
# They are filled by ProcessPara, ProcessTable, ProcessFrame and the UpdateXML* helpers.

# Mark page, para, line, word
my %g_page_hash = ();

# Mark paragraph: "yes" on a line that starts a paragraph, "no" otherwise
my @g_para = ();

# XML features
# Location feature: vertical middle of each line, plus the global extremes
my @g_pos_hash 	= ();
my $g_maxpos 	= 0;
my $g_minpos 	= 1000000;
# Align feature
my @g_align = ();
# Bold feature
my @g_bold = ();
# Italic feature
my @g_italic = ();
# Pic feature
my @g_pic = ();
# Table feature
my @g_table = ();
# Bullet feature
my @g_bullet = ();
# Font size feature: per-line dominant size, and how many lines use each size
my %g_font_size_hash 	= ();
my @g_font_size 		= ();
# Font face feature: per-line dominant face, and how many lines use each face
my %g_font_face_hash 	= ();
my @g_font_face 		= ();

# All lines
my @lines		= ();
# and their address (hash with keys L1..L4)
my @lines_addr	= ();

# BEGIN
ProcessFile($in_file);
# Find header part
my $num_lines = scalar(@lines);
my ($header_length, $body_length, $body_start_id) = SectLabel::PreProcess::FindHeaderText(\@lines, 0, $num_lines);
# Done
Output(\@lines, $out_file);

if ($address == 1)
{
	# Save the line address for further use
	my $address_file = $out_file . ".address";
	open(my $address_handle, ">:utf8", $address_file) or die "#Can't open file \"$address_file\": $!\n";

	foreach my $addr (@lines_addr)
	{
		print {$address_handle} join(" ", @{ $addr }{ qw(L1 L2 L3 L4) }), "\n";
	}

	# Buffered write errors only surface at close, so check it
	close($address_handle) or die "#Can't close file \"$address_file\": $!\n";
}
# END
|
146
|
-
|
147
|
-
# Read the Omnipage XML file, repair it into a single well-formed document,
# and walk the resulting object tree, dispatching each paragraph, table and
# frame to ProcessPara, ProcessTable and ProcessFrame.
#
# $in_file - path of the XML file to read
#
# Dies if the file cannot be opened.
sub ProcessFile
{
	my ($in_file) = @_;

	open(my $input_handle, "<:utf8", $in_file)
		or die "Could not open xml file " . $in_file . ": $!";
	my $xml = do { local $/; <$input_handle> };
	close $input_handle;

	###
	# Huydhn
	# NOTE: the omnipage xml is not well constructed (concatenated multiple xml files).
	# This merged xml needs to be fixed before it is passed to xml processing libraries, e.g. xml::twig
	###
	# Convert to Unix format
	$xml =~ s/\r//g;
	# Remove <?xml version="1.0" encoding="UTF-8"?>
	$xml =~ s/<\?xml.+?>\n//g;
	# Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
	$xml =~ s/<\!\-\-XML.+?>\n//g;
	# One declaration and one root around all the pages
	$xml = "<?xml version=\"1.0\"?>" . "\n" . "<root>" . "\n" . $xml . "\n" . "</root>";

	# New document
	my $doc = Omni::Omnidoc->new();
	$doc->set_raw($xml);

	# Address of the object being visited. The same hash is handed to every
	# Process* call, which fills in L4 and stores a copy per line.
	my %current = ();

	# Level 1: pages
	my $pages = $doc->get_objs_ref();
	for my $x (0 .. $#{ $pages })
	{
		$current{ 'L1' } = $x;

		# Level 2: column or dd
		my $level_2 = $pages->[ $x ]->get_objs_ref();
		for my $y (0 .. $#{ $level_2 })
		{
			# Thang considers <dd> tag as image, I just follow that
			my $is_pic = ($level_2->[ $y ]->get_name() eq $obj_list->{ 'OMNIDD' }) ? 1 : 0;

			$current{ 'L2' } = $y;

			# Level 3: table, paragraph or frame
			my $level_3 = $level_2->[ $y ]->get_objs_ref();
			for my $z (0 .. $#{ $level_3 })
			{
				$current{ 'L3' } = $z;

				my $child	= $level_3->[ $z ];
				my $name	= $child->get_name();

				if ($name eq $obj_list->{ 'OMNIPARA' })
				{
					ProcessPara($child, $is_pic, \%current);
				}
				elsif ($name eq $obj_list->{ 'OMNITABLE' })
				{
					ProcessTable($child, $is_pic, \%current, 0);
				}
				elsif ($name eq $obj_list->{ 'OMNIFRAME' })
				{
					# Frame contains multiple paragraph ?
					ProcessFrame($child, $is_pic, \%current);
				}
			}
		}
	}
}
|
250
|
-
|
251
|
-
# Write every collected line, followed by its " |XML| ..." feature string,
# to $out_file. Text is buffered per paragraph: the buffer is flushed whenever
# a line flagged "yes" in @g_para is reached, and once more at the end.
# With -decode, each buffer is passed through decode_entities before writing.
#
# $lines    - reference to the array of text lines (not modified)
# $out_file - path of the file to create
#
# Dies if the output file cannot be opened or closed.
sub Output
{
	my ($lines, $out_file) = @_;

	open(my $output_handle, ">:utf8", $out_file) or die "#Can't open file \"$out_file\": $!\n";

	# XML feature label
	my %g_font_size_labels = ();
	GetFontSizeLabels(\%g_font_size_hash, \%g_font_size_labels);

	# Text of the paragraph being assembled
	my $output = "";

	# This is the index of the line
	my $id = 0;
	# For each line in the whole document
	foreach my $line (@{ $lines })
	{
		# Trim a copy, so that the caller's array is left untouched
		(my $text = $line) =~ s/^\s+|\s+$//g;

		# New paragraph: write out the previous one first
		if (($g_para[ $id ] eq "yes") && ($output ne ""))
		{
			if ($is_decode) { $output = decode_entities($output); }
			print {$output_handle} $output;
			$output = "";
		}

		$output .= $text;

		# XML location feature: the line position scaled by the observed min/max.
		# Left empty when the position is -1.
		my $loc_feature = "";
		if ($g_pos_hash[ $id ] != (-1)) { $loc_feature = "xmlLoc_".int(($g_pos_hash[$id] - $g_minpos) * 8.0 / ($g_maxpos - $g_minpos + 1)); }

		# Font_size feature
		my $font_size_feature = undef;
		if (($g_font_size[$id] eq "") || ($g_font_size[$id] == -1))
		{
			$font_size_feature = "xmlFontSize_none";
		}
		else
		{
			$font_size_feature = "xmlFontSize_" . $g_font_size_labels{ $g_font_size[ $id ] };
		}

		my $bold_feature	= "xmlBold_"	. $g_bold[ $id ];
		my $italic_feature	= "xmlItalic_"	. $g_italic[ $id ];
		my $pic_feature		= "xmlPic_"		. $g_pic[ $id ];
		my $table_feature	= "xmlTable_"	. $g_table[ $id ];
		my $bullet_feature	= "xmlBullet_"	. $g_bullet[ $id ];

		# Differential features: only the last two returned values are written out
		my ($font_sfbia_diff, $para_diff) = (GetDifferentialFeatures($id))[5, 6];

		# Each line and its XML features
		$output .= " |XML| $loc_feature $bold_feature $italic_feature $font_size_feature $pic_feature $table_feature $bullet_feature $font_sfbia_diff $para_diff" . "\n";

		# Update line index
		$id++;
	}

	# Last paragraph
	if ($output ne "")
	{
		if ($is_decode) { $output = decode_entities($output); }
		print {$output_handle} $output;
		$output = "";
	}

	# Buffered write errors only surface at close, so check it
	close($output_handle) or die "#Can't close file \"$out_file\": $!\n";
}
|
338
|
-
|
339
|
-
# Build the "bi_xml*" features that compare line $id with line $id - 1.
#
# Each feature ends in "continue" when the compared attributes are unchanged
# and in "new" otherwise; the first line is always "new". The two exceptions:
# the align feature carries the alignment value itself instead of "new", and
# the para feature is "header" for lines before $body_start_id.
#
# Returns the list ($align_diff, $font_size_diff, $font_face_diff,
# $font_sf_diff, $font_sfbi_diff, $font_sfbia_diff, $para_diff).
sub GetDifferentialFeatures
{
	my ($id) = @_;

	# Line 0 has no predecessor, so nothing can be "same" for it
	my $has_prev	= ($id != 0);
	my $same_align	= $has_prev && ($g_align[ $id ]		eq $g_align[ $id - 1 ]);
	my $same_face	= $has_prev && ($g_font_face[ $id ]	eq $g_font_face[ $id - 1 ]);
	my $same_size	= $has_prev && ($g_font_size[ $id ]	== $g_font_size[ $id - 1 ]);
	my $same_bold	= $has_prev && ($g_bold[ $id ]		eq $g_bold[ $id - 1 ]);
	my $same_italic	= $has_prev && ($g_italic[ $id ]	eq $g_italic[ $id - 1 ]);

	# Cumulative combinations: size+face, then +bold+italic, then +align
	my $same_sf		= $same_size && $same_face;
	my $same_sfbi	= $same_sf && $same_bold && $same_italic;
	my $same_sfbia	= $same_sfbi && $same_align;

	my $state = sub { return $_[0] ? "continue" : "new"; };

	# AlignChange feature
	my $align_diff		= "bi_xmlA_" . ($same_align ? "continue" : $g_align[ $id ]);
	# FontFaceChange feature
	my $font_face_diff	= "bi_xmlF_" . $state->($same_face);
	# FontSizeChange feature
	my $font_size_diff	= "bi_xmlS_" . $state->($same_size);
	# FontSFChange feature
	my $font_sf_diff	= "bi_xmlSF_" . $state->($same_sf);
	# FontSFBIChange feature
	my $font_sfbi_diff	= "bi_xmlSFBI_" . $state->($same_sfbi);
	# FontSFBIAChange feature
	my $font_sfbia_diff	= "bi_xmlSFBIA_" . $state->($same_sfbia);

	# ParaChange feature
	my $para_state = undef;
	if ($id < $body_start_id)
	{
		# Header part, consider each line as a separate paragraph
		$para_state = "header";
	}
	elsif ($g_para[ $id ] eq "yes")
	{
		$para_state = "new";
	}
	else
	{
		$para_state = "continue";
	}
	my $para_diff = "bi_xmlPara_" . $para_state;

	return ($align_diff, $font_size_diff, $font_face_diff, $font_sf_diff, $font_sfbi_diff, $font_sfbia_diff, $para_diff);
}
|
454
|
-
|
455
|
-
# Turn a font size histogram into relative size labels.
#
# $g_font_size_hash   - hash ref: font size => number of occurrences
# $g_font_size_labels - hash ref filled in by this sub: font size => label
#
# The most frequent size is labelled "common"; when several sizes share the
# highest count, the largest of them is taken, so the result does not depend
# on hash ordering. Sizes below the common one are "smaller". Sizes above it
# are "larger", except for those among the three biggest sizes overall, which
# are labelled "largest-2", "largest-1" and "largest0" in ascending order.
#
# An empty histogram leaves $g_font_size_labels untouched.
sub GetFontSizeLabels
{
	my ($g_font_size_hash, $g_font_size_labels) = @_;

	# Sort by key in ascending order
	my @sorted_fonts = sort { $a <=> $b } keys %{ $g_font_size_hash };
	return if !@sorted_fonts;

	# Index of the common font size. Scanning upwards with >= lets the larger
	# size win on equal counts.
	my $common_index = 0;
	for my $i (1 .. $#sorted_fonts)
	{
		if ($g_font_size_hash->{ $sorted_fonts[ $i ] } >= $g_font_size_hash->{ $sorted_fonts[ $common_index ] })
		{
			$common_index = $i;
		}
	}

	# Small fonts
	for my $i (0 .. $common_index - 1)
	{
		$g_font_size_labels->{ $sorted_fonts[ $i ] } = "smaller";
	}

	# Common fonts
	$g_font_size_labels->{ $sorted_fonts[ $common_index ] } = "common";

	# Large fonts
	my $total = scalar(@sorted_fonts);
	for my $i ($common_index + 1 .. $total - 1)
	{
		if (($total - $i) <= 3)
		{
			# Offset from the biggest size: 0, -1 or -2
			$g_font_size_labels->{ $sorted_fonts[ $i ] } = "largest" . ($i + 1 - $total);
		}
		else
		{
			$g_font_size_labels->{ $sorted_fonts[ $i ] } = "larger";
		}
	}

	return;
}
|
498
|
-
|
499
|
-
# Collect the lines of every paragraph and table found inside a frame.
#
# $omniframe - frame object; its children are read through get_objs_ref()
# $is_pic    - true when the frame lies in an image area
# $line_addr - hash ref holding the current address; its L4 entry is set to
#              the running line index within the frame, and a copy of the
#              hash is stored for every line
#
# For each paragraph line, the text goes to @lines and one entry is appended
# to each per-line feature array. Tables are handed over to ProcessTable,
# which continues the same line index.
sub ProcessFrame
{
	my ($omniframe, $is_pic, $line_addr) = @_;

	# Line index in the whole frame
	my $lindex = 0;

	# For each paragraph or table in the frame
	foreach my $member (@{ $omniframe->get_objs_ref() })
	{
		if ($member->get_name() eq $obj_list->{ 'OMNIPARA' })
		{
			# Paragraph attributes
			my $align = $member->get_alignment();
			my $space = $member->get_space_before();

			# Position of the line inside its paragraph
			my $line_no = 0;

			# For each line in the paragraph
			foreach my $omniline (@{ $member->get_objs_ref() })
			{
				# Save the line and its address, then move on to the next index
				push @lines, $omniline->get_content();
				$line_addr->{ 'L4' } = $lindex++;
				push @lines_addr, { %{ $line_addr } };

				# Line attributes (left and right are fetched but not used below)
				my $left	= $omniline->get_left_pos();
				my $right	= $omniline->get_right_pos();
				my $top		= $omniline->get_top_pos();
				my $bottom	= $omniline->get_bottom_pos();

				# Per-line tallies, all counted in words
				my ($bold_count, $italic_count, $words_count) = (0, 0, 0);
				my %font_size_hash = ();
				my %font_face_hash = ();

				foreach my $run (@{ $omniline->get_objs_ref() })
				{
					# Thang's compatible code (instead of using get_objs_ref)
					my $rcontent = $run->get_content();
					$rcontent =~ s/^\s+|\s+$//g;

					my @words	= split(/\s+/, $rcontent);
					my $n_words	= scalar(@words);

					$words_count += $n_words;

					# Each run votes for its size and face with its word count
					my $font_size = $run->get_font_size();
					$font_size_hash{ $font_size } += $n_words;
					my $font_face = $run->get_font_face();
					$font_face_hash{ $font_face } += $n_words;

					if ($run->get_bold() eq "true")		{ $bold_count += $n_words; }
					if ($run->get_italic() eq "true")	{ $italic_count += $n_words; }
				}

				# Only the first line of a paragraph opens it
				push @g_para, ($line_no == 0) ? "yes" : "no";

				# Line position: vertical middle, tracked against the global extremes
				my $pos = ($top + $bottom) / 2.0;
				$g_minpos = $pos if $pos < $g_minpos;
				$g_maxpos = $pos if $pos > $g_maxpos;

				push @g_pos_hash, $pos;
				push @g_align, $align;
				push @g_table, "no";

				if ($is_pic)
				{
					# Lines in an image area get neutral values
					push @g_pic, "yes";
					push @g_bold, "no";
					push @g_italic, "no";
					push @g_bullet, "no";
					push @g_font_size, -1;
					push @g_font_face, "none";
				}
				else
				{
					push @g_pic, "no";
					UpdateXMLFontFeature(\%font_size_hash, \%font_face_hash);
					UpdateXMLFeatures($bold_count, $italic_count, $words_count, $omniline->get_bullet(), $space);
				}

				$line_no++;
			}
		}
		elsif ($member->get_name() eq $obj_list->{ 'OMNITABLE' })
		{
			$lindex = ProcessTable($member, $is_pic, $line_addr, $lindex);
		}
	}
}
|
628
|
-
|
629
|
-
# Collect the text lines of a table.
#
# $omnitable - table object
# $is_pic    - true when the table lies in an image area
# $line_addr - hash ref holding the current address; its L4 entry is set to
#              the running line index, and a copy is stored for every line
# $lindex    - line index to start from
#
# Every row returned by get_row_content() is split on newlines. Each
# resulting line goes to @lines and gets the table's position and alignment,
# with neutral font, bold, italic and bullet values.
#
# Returns the line index following the last line stored.
sub ProcessTable
{
	my ($omnitable, $is_pic, $line_addr, $lindex) = @_;

	# Table attributes (left and right are fetched but not used below)
	my $left	= $omnitable->get_left_pos();
	my $right	= $omnitable->get_right_pos();
	my $top		= $omnitable->get_top_pos();
	my $bottom	= $omnitable->get_bottom_pos();
	my $align	= $omnitable->get_alignment();

	# Thang's code: all lines of the table share the table's vertical middle
	my $pos = ($top + $bottom) / 2.0;
	$g_minpos = $pos if $pos < $g_minpos;
	$g_maxpos = $pos if $pos > $g_maxpos;

	my $pic_flag = $is_pic ? "yes" : "no";

	# All rows in the table
	my $rows = $omnitable->get_row_content();
	for my $i (0 .. $#{ $rows })
	{
		my @row_lines = split(/\n/, $rows->[ $i ]);

		for my $j (0 .. $#row_lines)
		{
			# Save the line and its address, then move on to the next index
			push @lines, $row_lines[ $j ];
			$line_addr->{ 'L4' } = $lindex++;
			push @lines_addr, { %{ $line_addr } };

			# Only the first line of the first row opens a paragraph
			push @g_para, (($i == 0) && ($j == 0)) ? "yes" : "no";

			push @g_table, "yes";
			push @g_pic, $pic_flag;
			push @g_pos_hash, $pos;
			push @g_align, $align;

			# No run information is read for table lines
			push @g_font_size, -1;
			push @g_font_face, "none";
			push @g_bold, "no";
			push @g_italic, "no";
			push @g_bullet, "no";
		}
	}

	return $lindex;
}
|
709
|
-
|
710
|
-
# Collect the non-blank lines of a paragraph.
#
# $paragraph - paragraph object; its lines are read through get_objs_ref()
# $is_pic    - true when the paragraph lies in an image area
# $line_addr - hash ref holding the current address; its L4 entry is set to
#              the index of the line inside the paragraph (blank lines
#              included in the numbering), and a copy is stored per line
#
# Lines that are empty after trimming are skipped. For every other line the
# text goes to @lines and one entry is appended to each per-line feature
# array. The first line actually stored is the one flagged as the paragraph
# start, so a paragraph with leading blank lines still gets its "yes" flag.
sub ProcessPara
{
	my ($paragraph, $is_pic, $line_addr) = @_;

	# Paragraph attributes
	my $align = $paragraph->get_alignment();
	my $space = $paragraph->get_space_before();

	# Becomes true once a line of this paragraph has been stored
	my $para_started = 0;

	# Lines
	my $omnilines = $paragraph->get_objs_ref();
	for my $t (0 .. $#{ $omnilines })
	{
		my $omniline = $omnilines->[ $t ];

		# Skip blank line
		my $content = $omniline->get_content();
		(my $trimmed = $content) =~ s/^\s+|\s+$//g;
		next if $trimmed eq "";

		# Save the line (untrimmed) and its address
		push @lines, $content;
		$line_addr->{ 'L4' } = $t;
		push @lines_addr, { %{ $line_addr } };

		# Line attributes
		my $top		= $omniline->get_top_pos();
		my $bottom	= $omniline->get_bottom_pos();

		# Per-line tallies, all counted in words
		my ($bold_count, $italic_count, $words_count) = (0, 0, 0);
		my %font_size_hash = ();
		my %font_face_hash = ();

		# Runs
		foreach my $run (@{ $omniline->get_objs_ref() })
		{
			# Thang's compatible code (instead of using get_objs_ref)
			my $rcontent = $run->get_content();
			$rcontent =~ s/^\s+|\s+$//g;

			my @words	= split(/\s+/, $rcontent);
			my $n_words	= scalar(@words);

			$words_count += $n_words;

			# Each run votes for its size and face with its word count
			my $font_size = $run->get_font_size();
			$font_size_hash{ $font_size } += $n_words;
			my $font_face = $run->get_font_face();
			$font_face_hash{ $font_face } += $n_words;

			if ($run->get_bold() eq "true")		{ $bold_count += $n_words; }
			if ($run->get_italic() eq "true")	{ $italic_count += $n_words; }
		}

		# Line attributes - relative position in paragraph
		push @g_para, $para_started ? "no" : "yes";
		$para_started = 1;

		# Line position: vertical middle, tracked against the global extremes
		my $pos = ($top + $bottom) / 2.0;
		$g_minpos = $pos if $pos < $g_minpos;
		$g_maxpos = $pos if $pos > $g_maxpos;

		push @g_pos_hash, $pos;
		push @g_align, $align;
		push @g_table, "no";

		if ($is_pic)
		{
			# Lines in an image area get neutral values
			push @g_pic, "yes";
			push @g_bold, "no";
			push @g_italic, "no";
			push @g_bullet, "no";
			push @g_font_size, -1;
			push @g_font_face, "none";
		}
		else
		{
			push @g_pic, "no";
			UpdateXMLFontFeature(\%font_size_hash, \%font_face_hash);
			UpdateXMLFeatures($bold_count, $italic_count, $words_count, $omniline->get_bullet(), $space);
		}
	}
}
|
832
|
-
|
833
|
-
# Record the dominant font size and font face of one line.
#
# $font_size_hash - hash ref: font size => number of words using it
# $font_face_hash - hash ref: font face => number of words using it
#
# Appends one entry to @g_font_size and one to @g_font_face, and counts the
# chosen values in %g_font_size_hash and %g_font_face_hash. An empty hash
# yields -1 (size) or "none" (face) and leaves the global counts alone.
#
# Equal word counts are resolved deterministically: the larger font size
# wins, and font faces fall back to string order.
sub UpdateXMLFontFeature
{
	my ($font_size_hash, $font_face_hash) = @_;

	# Font size feature
	if (scalar(keys %{ $font_size_hash }) == 0)
	{
		push @g_font_size, -1;
	}
	else
	{
		# Most used size first; if several sizes are equal in number, get the largest one
		my ($font_size) = sort {
			$font_size_hash->{ $b } <=> $font_size_hash->{ $a }
				or
			$b <=> $a
		} keys %{ $font_size_hash };

		# A run without a font size is counted under the empty key
		if ($font_size eq "") { $font_size = 0; }

		push @g_font_size, $font_size;
		$g_font_size_hash{ $font_size }++;
	}

	# Font face feature
	if (scalar(keys %{ $font_face_hash }) == 0)
	{
		push @g_font_face, "none";
	}
	else
	{
		my ($font_face) = sort {
			$font_face_hash->{ $b } <=> $font_face_hash->{ $a }
				or
			$a cmp $b
		} keys %{ $font_face_hash };

		push @g_font_face, $font_face;
		$g_font_face_hash{ $font_face }++;
	}
}
|
878
|
-
|
879
|
-
# Append the bold, italic and bullet flags of one line to @g_bold, @g_italic
# and @g_bullet.
#
# $bold_count   - number of bold words in the line
# $italic_count - number of italic words in the line
# $words_count  - total number of words in the line
# $is_bullet    - "true" when the line is a bullet (may be undef)
# $space        - accepted but not used here
#
# A line counts as bold (or italic) when at least 0.667 of its words are.
sub UpdateXMLFeatures
{
	my ($bold_count, $italic_count, $words_count, $is_bullet, $space) = @_;

	# Guard the divisions below against a line without words
	my $has_words = ($words_count != 0);

	# Bold feature
	my $is_bold = $has_words && ($bold_count / $words_count >= 0.667);
	push @g_bold, ($is_bold ? "yes" : "no");

	# Italic feature
	my $is_italic = $has_words && ($italic_count / $words_count >= 0.667);
	push @g_italic, ($is_italic ? "yes" : "no");

	# Bullet feature
	my $bullet_on = (defined $is_bullet) && ($is_bullet eq "true");
	push @g_bullet, ($bullet_on ? "yes" : "no");
}
|
917
|
-
|
918
|
-
# Validate a path and return the part captured by the pattern, which is how
# the value gets untainted.
#
# $path - path made only of word characters, '-', '_', '/' and '.'
#
# Dies with 'Bad path "..."' when any other character is present.
sub UntaintPath
{
	my ($path) = @_;

	$path =~ /^([-_\/\w\.]*)$/ or die "Bad path \"$path\"\n";

	return $1;
}
|
933
|
-
|
934
|
-
# Validate a string and return the part captured by the pattern, which is how
# the value gets untainted.
#
# $s - non-empty string made only of word characters, spaces and - @ ( ) , . /
#
# Dies with "Bad data in ..." otherwise.
sub Untaint
{
	my ($s) = @_;

	# log this somewhere
	$s =~ /^([\w \-\@\(\),\.\/]+)$/ or die "Bad data in $s";

	# The captured copy is untainted
	return $1;
}
|
948
|
-
|
949
|
-
# Run a command through system() after it has passed Untaint().
#
# $cmd - command line; Untaint dies if it contains forbidden characters
#
# Returns whatever system() returns.
sub Execute
{
	my ($cmd) = @_;

	my $clean_cmd = Untaint($cmd);

	return system($clean_cmd);
}
|
955
|
-
|
956
|
-
# Build a name for a temporary file from the current local time and the
# process id, in the form YYYYMMDD-HHMMSS-PID.
#
# The timestamp is formatted in-process with POSIX::strftime, so no external
# "date" command is needed.
#
# Returns the name; no file is created.
sub NewTmpFile
{
	require POSIX;

	my $tmp_file = POSIX::strftime('%Y%m%d-%H%M%S', localtime()) . '-' . $$;

	return $tmp_file;
}
|
962
|
-
|
963
|
-
|
964
|
-
|