biblicit 2.0.3 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +0 -2
- data/biblicit.gemspec +1 -1
- data/parscit/bin/citeExtract.pl +9 -161
- data/parscit/bin/sectExtract.pl +0 -14
- data/parscit/lib/ParsCit/Controller.pm +0 -59
- data/parscit/lib/ParsCit/PreProcess.pm +0 -4
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1 -7
- metadata +4 -24
- data/parscit/bin/sectLabel/processOmniXML.pl +0 -1427
- data/parscit/bin/sectLabel/processOmniXML_new.pl +0 -1025
- data/parscit/bin/sectLabel/processOmniXMLv2.pl +0 -1529
- data/parscit/bin/sectLabel/processOmniXMLv3.pl +0 -964
- data/parscit/bin/sectLabel/simplifyOmniXML.pl +0 -382
- data/parscit/bin/xml2train.pl +0 -193
- data/parscit/lib/Omni/Config.pm +0 -93
- data/parscit/lib/Omni/Omnicell.pm +0 -263
- data/parscit/lib/Omni/Omnicol.pm +0 -292
- data/parscit/lib/Omni/Omnidd.pm +0 -328
- data/parscit/lib/Omni/Omnidoc.pm +0 -153
- data/parscit/lib/Omni/Omniframe.pm +0 -223
- data/parscit/lib/Omni/Omniline.pm +0 -423
- data/parscit/lib/Omni/Omnipage.pm +0 -282
- data/parscit/lib/Omni/Omnipara.pm +0 -232
- data/parscit/lib/Omni/Omnirun.pm +0 -303
- data/parscit/lib/Omni/Omnitable.pm +0 -336
- data/parscit/lib/Omni/Omniword.pm +0 -162
- data/parscit/lib/Omni/Traversal.pm +0 -313
- data/parscit/lib/SectLabel/AAMatching.pm +0 -1949
|
@@ -1,964 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/perl
|
|
2
|
-
|
|
3
|
-
# Author: Do Hoang Nhat Huy <huydo@comp.nus.edu.sg>
|
|
4
|
-
# Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
|
|
5
|
-
|
|
6
|
-
require 5.0;
|
|
7
|
-
use strict;
|
|
8
|
-
|
|
9
|
-
# Dependencies
|
|
10
|
-
use FindBin;
|
|
11
|
-
use Getopt::Long;
|
|
12
|
-
use HTML::Entities;
|
|
13
|
-
|
|
14
|
-
# I do not know a better solution to find a lib path in -T mode.
|
|
15
|
-
# So if you know a better solution, I'd be glad to hear.
|
|
16
|
-
# See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
|
|
17
|
-
|
|
18
|
-
# To get correct path in case 2 scripts in different directories use FindBin
|
|
19
|
-
FindBin::again();
|
|
20
|
-
my $path = undef;
|
|
21
|
-
BEGIN
|
|
22
|
-
{
|
|
23
|
-
if ($FindBin::Bin =~ /(.*)/) { $path = $1; }
|
|
24
|
-
}
|
|
25
|
-
use lib "$path/../../lib";
|
|
26
|
-
|
|
27
|
-
use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
|
|
28
|
-
use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";
|
|
29
|
-
|
|
30
|
-
# Local libraries
|
|
31
|
-
use Omni::Config;
|
|
32
|
-
use Omni::Omnidoc;
|
|
33
|
-
use SectLabel::PreProcess;
|
|
34
|
-
|
|
35
|
-
# Omnilib configuration: table of object names exposed by Omni::Config.
# The traversal code below compares each node's get_name() against entries
# of this table (OMNIDD, OMNIPARA, OMNITABLE, OMNIFRAME).
my $obj_list = $Omni::Config::obj_list;

### USER customizable section
# Program name shown in the usage text: everything after the last "/" in $0.
my ($progname) = ($0 =~ /([^\/]+)$/);
# Version string of this script.
my $version = "1.0";
### END user customizable section
|
|
42
|
-
|
|
43
|
-
# License()
#
# Writes the one-line copyright banner to STDERR. Takes no arguments.
sub License
{
    my $banner = "# Copyright 2011 \251 by Do Hoang Nhat Huy\n";
    print STDERR $banner;
}
|
|
47
|
-
|
|
48
|
-
# Help()
#
# Prints the usage message to STDERR. Takes no arguments; reads the
# file-level $progname for the program name.
#
# Every option accepted by the GetOptions call of this script is listed:
# -in, -out, -decode, -log, -h and -q.
sub Help
{
    print STDERR "Process Omnipage XML output (concatenated results from all pages of a PDF file), and extract text lines together with other XML infos\n";
    print STDERR "usage: $progname -h\t[invokes help]\n";
    print STDERR " $progname -in xmlfile -out outfile [-decode] [-log]\n";
    print STDERR "Options:\n";
    print STDERR "\t-in xmlfile \tOmnipage XML input file (required)\n";
    print STDERR "\t-out outfile \tOutput file (required); line addresses go to outfile.address\n";
    print STDERR "\t-q \tQuiet Mode (don't echo license)\n";
    print STDERR "\t-decode \tDecode HTML entities and then output, to avoid double entity encoding later\n";
    # -log is parsed into $is_debug, which nothing in this script reads.
    print STDERR "\t-log \tDebug flag (accepted, currently has no effect)\n";
}
|
|
57
|
-
|
|
58
|
-
# ---------------------------------------------------------------------------
# Command-line options
# ---------------------------------------------------------------------------
my $quite     = 0;      # -q      : do not echo the license banner
my $help      = 0;      # -h      : print usage and exit
my $out_file  = undef;  # -out    : output file (required)
my $in_file   = undef;  # -in     : Omnipage XML input file (required)
my $is_decode = 0;      # -decode : decode HTML entities before writing
my $is_debug  = 0;      # -log    : parsed, but not read anywhere in this script
my $address   = 1;      # when 1, "<out_file>.address" is written as well

$help = 1 unless GetOptions( 'in=s'   => \$in_file,
                             'out=s'  => \$out_file,
                             'decode' => \$is_decode,
                             'log'    => \$is_debug,
                             'h'      => \$help,
                             'q'      => \$quite );

# Usage is shown (and the script exits with status 0) when -h is given, when
# option parsing fails, or when either required file argument is missing.
if ($help || ! defined $in_file || ! defined $out_file)
{
    Help();
    exit(0);
}

if (!$quite)
{
    License();
}

### Untaint ###
$in_file  = UntaintPath($in_file);
$out_file = UntaintPath($out_file);
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
### End untaint ###

# ---------------------------------------------------------------------------
# File-level state filled in by ProcessFile() and its helpers.
# The per-line arrays below receive exactly one entry per emitted text line,
# so index $id in any of them describes line $id of @lines.
# ---------------------------------------------------------------------------

# Mark page, para, line, word (declared but not used in this script)
my %g_page_hash = ();

# "yes" when the line starts a paragraph, "no" otherwise
my @g_para = ();

# XML features
# Location feature: vertical midpoint of each line (an array, despite the
# name), plus the smallest and largest midpoint seen so far
my @g_pos_hash = ();
my $g_maxpos   = 0;
my $g_minpos   = 1000000;
# Align feature
my @g_align = ();
# Bold feature ("yes"/"no")
my @g_bold = ();
# Italic feature ("yes"/"no")
my @g_italic = ();
# Pic feature ("yes"/"no")
my @g_pic = ();
# Table feature ("yes"/"no")
my @g_table = ();
# Bullet feature ("yes"/"no")
my @g_bullet = ();
# Font size feature: per-line dominant size (-1 when none) and a count of
# how many lines use each size
my %g_font_size_hash = ();
my @g_font_size      = ();
# Font face feature: per-line dominant face ("none" when none) and a count
# of how many lines use each face
my %g_font_face_hash = ();
my @g_font_face      = ();

# All lines
my @lines = ();
# and their address: one hash per line with keys L1..L4
my @lines_addr = ();

# BEGIN
ProcessFile($in_file);
# Find header part; $body_start_id is read later by GetDifferentialFeatures()
my $num_lines = scalar(@lines);
my ($header_length, $body_length, $body_start_id) = SectLabel::PreProcess::FindHeaderText(\@lines, 0, $num_lines);
# Done
Output(\@lines, $out_file);

if ($address == 1)
{
    # Save the line address for further use: one "L1 L2 L3 L4" record per line
    my $address_file = $out_file . ".address";
    open(my $address_handle, ">:utf8", $address_file)
        or die "#Can't open file \"$address_file\": $!\n";
    foreach my $addr (@lines_addr)
    {
        print $address_handle join(" ", @{ $addr }{ qw(L1 L2 L3 L4) }), "\n";
    }
    # Buffered write errors only surface at close time, so check it.
    close($address_handle)
        or die "#Can't close file \"$address_file\": $!\n";
}
# END
|
|
146
|
-
|
|
147
|
-
# ProcessFile($in_file)
#
# Reads the Omnipage XML file $in_file, repairs it into a single document,
# parses it with Omni::Omnidoc and walks the resulting object tree three
# levels deep, handing every paragraph, table and frame to ProcessPara(),
# ProcessTable() or ProcessFrame(). Those helpers fill the file-level line
# and feature arrays.
#
# Dies when the input file cannot be opened.
sub ProcessFile
{
    my ($in_file) = @_;

    open(my $input_handle, "<:utf8", $in_file)
        or die "Could not open xml file " . $in_file . ": $!";
    my $xml = do { local $/; <$input_handle> };
    close $input_handle;

    ###
    # Huydhn
    # NOTE: the omnipage xml is not well constructed (concatenated multiple xml files).
    # This merged xml needs to be fixed first before it is passed to XML processing
    # libraries, e.g. XML::Twig
    ###
    # Convert to Unix format
    $xml =~ s/\r//g;
    # Remove <?xml version="1.0" encoding="UTF-8"?>
    $xml =~ s/<\?xml.+?>\n//g;
    # Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
    $xml =~ s/<\!\-\-XML.+?>\n//g;
    # Single declaration and a single root element around everything
    $xml = "<?xml version=\"1.0\"?>" . "\n" . "<root>" . "\n" . $xml . "\n" . "</root>";

    # New document (direct method call instead of indirect object syntax)
    my $doc = Omni::Omnidoc->new();
    $doc->set_raw($xml);

    # Current position in the tree: L1 = page index, L2 = index of the
    # level-2 object in the page, L3 = index of the level-3 object in that.
    # The Process* helpers add L4 and copy the hash for every line.
    my %current = ();

    # All pages in the document
    my $pages = $doc->get_objs_ref();

    for my $x (0 .. $#{ $pages })
    {
        $current{ 'L1' } = $x;

        # Column or dd
        my $level_2 = $pages->[ $x ]->get_objs_ref();

        for my $y (0 .. $#{ $level_2 })
        {
            my $node_2 = $level_2->[ $y ];

            # Thang's code: everything under a <dd> object is treated as
            # belonging to an image area.
            my $is_pic = ($node_2->get_name() eq $obj_list->{ 'OMNIDD' }) ? 1 : 0;

            $current{ 'L2' } = $y;

            # Table, paragraph or frame
            my $level_3 = $node_2->get_objs_ref();

            for my $z (0 .. $#{ $level_3 })
            {
                $current{ 'L3' } = $z;

                my $node_3 = $level_3->[ $z ];
                my $name   = $node_3->get_name();

                if ($name eq $obj_list->{ 'OMNIPARA' })
                {
                    ProcessPara($node_3, $is_pic, \%current);
                }
                elsif ($name eq $obj_list->{ 'OMNITABLE' })
                {
                    # Line numbering inside a stand-alone table starts at 0
                    ProcessTable($node_3, $is_pic, \%current, 0);
                }
                elsif ($name eq $obj_list->{ 'OMNIFRAME' })
                {
                    # A frame may hold several paragraphs and tables
                    ProcessFrame($node_3, $is_pic, \%current);
                }
                # Any other object kind is ignored.
            }
        }
    }
}
|
|
250
|
-
|
|
251
|
-
# Output($lines, $out_file)
#
# Writes every line of @$lines to $out_file, each followed by " |XML| " and
# its space-separated XML features. Text is buffered one paragraph at a time
# (a paragraph starts where @g_para says "yes"); when $is_decode is set the
# buffered paragraph is passed through decode_entities() before printing.
#
#   $lines    - array reference of text lines. NOTE: each element is trimmed
#               in place, so the caller's array is modified.
#   $out_file - path of the file to create.
#
# Reads the file-level feature arrays and $g_minpos / $g_maxpos.
# Dies when the output file cannot be opened or closed.
sub Output
{
    my ($lines, $out_file) = @_;

    # This is the output
    open(my $output_handle, ">:utf8", $out_file)
        or die "#Can't open file \"$out_file\": $!\n";

    # Map every observed font size to its label (smaller/common/larger/...)
    my %g_font_size_labels = ();
    GetFontSizeLabels(\%g_font_size_hash, \%g_font_size_labels);

    # Text of the paragraph currently being assembled
    my $output = "";

    # Writes the buffered paragraph, if any, and empties the buffer
    my $flush_paragraph = sub
    {
        return if $output eq "";
        if ($is_decode) { $output = decode_entities($output); }
        print $output_handle $output;
        $output = "";
    };

    # This is the index of the line
    my $id = 0;
    # For each line in the whole document
    foreach my $line (@{ $lines })
    {
        # Trim leading and trailing whitespace (in place, see header comment)
        $line =~ s/^\s+|\s+$//g;

        # A new paragraph starts here: write out the previous one first
        if ($g_para[ $id ] eq "yes") { $flush_paragraph->(); }

        $output .= $line;

        # XML location feature: position scaled into buckets relative to the
        # smallest and largest midpoint seen. Left empty when the stored
        # position is -1.
        my $loc_feature = "";
        if ($g_pos_hash[ $id ] != (-1))
        {
            $loc_feature = "xmlLoc_" . int(($g_pos_hash[ $id ] - $g_minpos) * 8.0 / ($g_maxpos - $g_minpos + 1));
        }

        # Font size feature
        my $font_size_feature = undef;
        if (($g_font_size[ $id ] eq "") || ($g_font_size[ $id ] == -1))
        {
            $font_size_feature = "xmlFontSize_none";
        }
        else
        {
            $font_size_feature = "xmlFontSize_" . $g_font_size_labels{ $g_font_size[ $id ] };
        }

        my $bold_feature   = "xmlBold_"   . $g_bold[ $id ];
        my $italic_feature = "xmlItalic_" . $g_italic[ $id ];
        my $pic_feature    = "xmlPic_"    . $g_pic[ $id ];
        my $table_feature  = "xmlTable_"  . $g_table[ $id ];
        my $bullet_feature = "xmlBullet_" . $g_bullet[ $id ];

        # Differential features: only the last two of the seven returned
        # values (the combined font/align change and the paragraph change)
        # are written out.
        my ($font_sfbia_diff, $para_diff) = (GetDifferentialFeatures($id))[ 5, 6 ];

        # Each line and its XML features
        $output .= " |XML| $loc_feature $bold_feature $italic_feature $font_size_feature $pic_feature $table_feature $bullet_feature $font_sfbia_diff $para_diff" . "\n";

        # Update line index
        $id++;
    }

    # Last paragraph
    $flush_paragraph->();

    # Buffered write errors only surface at close time, so check it.
    close($output_handle)
        or die "#Can't close file \"$out_file\": $!\n";
}
|
|
338
|
-
|
|
339
|
-
# GetDifferentialFeatures($id)
#
# Compares line $id with line $id - 1 in the file-level feature arrays and
# returns seven feature strings, in this order:
#
#   bi_xmlA_*      alignment: "continue" if unchanged, else the alignment itself
#   bi_xmlS_*      font size:                               "continue" / "new"
#   bi_xmlF_*      font face:                               "continue" / "new"
#   bi_xmlSF_*     size and face together:                  "continue" / "new"
#   bi_xmlSFBI_*   size, face, bold and italic:             "continue" / "new"
#   bi_xmlSFBIA_*  size, face, bold, italic and alignment:  "continue" / "new"
#   bi_xmlPara_*   "header" before $body_start_id, otherwise "new" when the
#                  line starts a paragraph and "continue" when it does not
#
# Line 0 has no predecessor, so none of its attributes count as unchanged.
sub GetDifferentialFeatures
{
    my ($id) = @_;

    # One "same as the previous line" flag per attribute; all false for line 0
    my $has_prev    = ($id != 0);
    my $same_align  = $has_prev && ($g_align[ $id ]     eq $g_align[ $id - 1 ]);
    my $same_face   = $has_prev && ($g_font_face[ $id ] eq $g_font_face[ $id - 1 ]);
    my $same_size   = $has_prev && ($g_font_size[ $id ] == $g_font_size[ $id - 1 ]);
    my $same_bold   = $has_prev && ($g_bold[ $id ]      eq $g_bold[ $id - 1 ]);
    my $same_italic = $has_prev && ($g_italic[ $id ]    eq $g_italic[ $id - 1 ]);

    # Cumulative combinations used by the composite features
    my $same_sf    = $same_size && $same_face;
    my $same_sfbi  = $same_sf   && $same_bold && $same_italic;
    my $same_sfbia = $same_sfbi && $same_align;

    # AlignChange feature: carries the alignment value when it changes
    my $align_diff      = "bi_xmlA_"     . ($same_align ? "continue" : $g_align[ $id ]);
    # FontFaceChange feature
    my $font_face_diff  = "bi_xmlF_"     . ($same_face  ? "continue" : "new");
    # FontSizeChange feature
    my $font_size_diff  = "bi_xmlS_"     . ($same_size  ? "continue" : "new");
    # FontSFChange feature
    my $font_sf_diff    = "bi_xmlSF_"    . ($same_sf    ? "continue" : "new");
    # FontSFBIChange feature
    my $font_sfbi_diff  = "bi_xmlSFBI_"  . ($same_sfbi  ? "continue" : "new");
    # FontSFBIAChange feature
    my $font_sfbia_diff = "bi_xmlSFBIA_" . ($same_sfbia ? "continue" : "new");

    # ParaChange feature
    my $para_state;
    if ($id < $body_start_id)
    {
        # Header part: every line is labelled "header"
        $para_state = "header";
    }
    elsif ($g_para[ $id ] eq "yes")
    {
        $para_state = "new";
    }
    else
    {
        $para_state = "continue";
    }
    my $para_diff = "bi_xmlPara_" . $para_state;

    return ($align_diff, $font_size_diff, $font_face_diff, $font_sf_diff, $font_sfbi_diff, $font_sfbia_diff, $para_diff);
}
|
|
454
|
-
|
|
455
|
-
# GetFontSizeLabels($g_font_size_hash, $g_font_size_labels)
#
# Assigns a label to every font size, relative to the most frequent one.
#
#   $g_font_size_hash   - hash reference: font size => number of occurrences
#   $g_font_size_labels - hash reference filled in by this sub:
#                         font size => label
#
# With the sizes sorted in ascending numeric order:
#   - sizes below the most frequent size     => "smaller"
#   - the most frequent size                 => "common"
#   - the last three sizes, if above common  => "largest-2", "largest-1",
#                                               "largest0"
#   - every other size above common          => "larger"
#
# When several sizes share the highest count, the largest of them is taken
# as the common size, so the result does not depend on hash ordering. An
# empty input leaves $g_font_size_labels untouched.
sub GetFontSizeLabels
{
    my ($g_font_size_hash, $g_font_size_labels) = @_;

    # Sort by key in ascending order
    my @sorted_fonts = sort { $a <=> $b } keys %{ $g_font_size_hash };

    # Nothing to label
    return if !@sorted_fonts;

    # Index of the common (most frequent) font size. Walking the ascending
    # list with ">=" makes the largest size win a tie.
    my $common_index = 0;
    for my $i (1 .. $#sorted_fonts)
    {
        if ($g_font_size_hash->{ $sorted_fonts[ $i ] } >= $g_font_size_hash->{ $sorted_fonts[ $common_index ] })
        {
            $common_index = $i;
        }
    }

    my $total = scalar(@sorted_fonts);
    for my $i (0 .. $#sorted_fonts)
    {
        my $size = $sorted_fonts[ $i ];

        if ($i < $common_index)
        {
            # Small fonts
            $g_font_size_labels->{ $size } = "smaller";
        }
        elsif ($i == $common_index)
        {
            # Common font
            $g_font_size_labels->{ $size } = "common";
        }
        elsif (($total - $i) <= 3)
        {
            # Top three sizes. The suffix is 0 for the largest size and
            # negative below it; the exact strings are kept as they are
            # because they are emitted as feature values.
            $g_font_size_labels->{ $size } = "largest" . ($i + 1 - $total);
        }
        else
        {
            # Remaining large fonts
            $g_font_size_labels->{ $size } = "larger";
        }
    }

    return;
}
|
|
498
|
-
|
|
499
|
-
# ProcessFrame($omniframe, $is_pic, $line_addr)
#
# Records every line found in a frame. Paragraph children are handled here;
# table children are delegated to ProcessTable().
#
#   $omniframe - frame object; its children come from get_objs_ref()
#   $is_pic    - true when the frame lies inside an image area
#   $line_addr - hash reference holding the current L1..L3 position; L4 is
#                set here to the line's running index within the whole
#                frame, and a copy is stored for every line
#
# Unlike ProcessPara(), blank lines are not skipped here.
sub ProcessFrame
{
    my ($omniframe, $is_pic, $line_addr) = @_;

    # Line index in the whole frame
    my $lindex = 0;

    # For each paragraph or table in the frame
    foreach my $child (@{ $omniframe->get_objs_ref() })
    {
        if ($child->get_name() eq $obj_list->{ 'OMNIPARA' })
        {
            # Paragraph attributes
            my $align = $child->get_alignment();
            my $space = $child->get_space_before();

            # Position of the line inside its paragraph
            my $t = 0;

            # For each line in the paragraph
            foreach my $omniline (@{ $child->get_objs_ref() })
            {
                # Save the line and its address, then advance the frame-wide index
                push @lines, $omniline->get_content();
                $line_addr->{ 'L4' } = $lindex++;
                push @lines_addr, { %{ $line_addr } };

                # Line attributes (only top and bottom are used below)
                my $left   = $omniline->get_left_pos();
                my $right  = $omniline->get_right_pos();
                my $top    = $omniline->get_top_pos();
                my $bottom = $omniline->get_bottom_pos();

                # Per-line word statistics gathered from the runs
                my ($words_count, $bold_count, $italic_count) = (0, 0, 0);
                my (%font_size_hash, %font_face_hash);

                foreach my $run (@{ $omniline->get_objs_ref() })
                {
                    # Thang's compatible code: count the words of the
                    # trimmed run content
                    my $rcontent = $run->get_content();
                    $rcontent =~ s/^\s+|\s+$//g;
                    my @words   = split(/\s+/, $rcontent);
                    my $n_words = scalar(@words);

                    $words_count += $n_words;

                    # Words per font size and per font face
                    $font_size_hash{ $run->get_font_size() } += $n_words;
                    $font_face_hash{ $run->get_font_face() } += $n_words;

                    # Words set in bold / italic
                    $bold_count   += $n_words if $run->get_bold()   eq "true";
                    $italic_count += $n_words if $run->get_italic() eq "true";
                }

                # The first line of the paragraph starts a new paragraph
                push @g_para, (($t == 0) ? "yes" : "no");

                # Line position: vertical midpoint, also tracked as global min/max
                my $pos = ($top + $bottom) / 2.0;
                $g_minpos = $pos if $pos < $g_minpos;
                $g_maxpos = $pos if $pos > $g_maxpos;

                push @g_pos_hash, $pos;
                push @g_align,    $align;
                push @g_table,    "no";

                if ($is_pic)
                {
                    # Lines inside an image area get neutral formatting values
                    push @g_pic,       "yes";
                    push @g_bold,      "no";
                    push @g_italic,    "no";
                    push @g_bullet,    "no";
                    push @g_font_size, -1;
                    push @g_font_face, "none";
                }
                else
                {
                    push @g_pic, "no";
                    UpdateXMLFontFeature(\%font_size_hash, \%font_face_hash);
                    UpdateXMLFeatures($bold_count, $italic_count, $words_count, $omniline->get_bullet(), $space);
                }

                $t++;
            }
        }
        elsif ($child->get_name() eq $obj_list->{ 'OMNITABLE' })
        {
            # The table continues the frame-wide line numbering
            $lindex = ProcessTable($child, $is_pic, $line_addr, $lindex);
        }
    }
}
|
|
628
|
-
|
|
629
|
-
# ProcessTable($omnitable, $is_pic, $line_addr, $lindex)
#
# Records every text line of a table. All lines share the table's vertical
# midpoint and alignment and get neutral font, bold, italic and bullet
# values.
#
#   $omnitable - table object; get_row_content() yields one string per row,
#                which is split on "\n" into lines
#   $is_pic    - true when the table lies inside an image area
#   $line_addr - hash reference holding the current L1..L3 position; L4 is
#                set to the running line index and a copy is stored per line
#   $lindex    - line index to start numbering from
#
# Returns the line index following the last line recorded.
sub ProcessTable
{
    my ($omnitable, $is_pic, $line_addr, $lindex) = @_;

    # Table attributes (only top and bottom are used below)
    my $left   = $omnitable->get_left_pos();
    my $right  = $omnitable->get_right_pos();
    my $top    = $omnitable->get_top_pos();
    my $bottom = $omnitable->get_bottom_pos();
    my $align  = $omnitable->get_alignment();

    # Thang's code: the table midpoint takes part in the global min/max
    my $pos = ($top + $bottom) / 2.0;
    $g_minpos = $pos if $pos < $g_minpos;
    $g_maxpos = $pos if $pos > $g_maxpos;

    # Same for every line of the table
    my $pic_flag = $is_pic ? "yes" : "no";

    # All rows in the table
    my $rows = $omnitable->get_row_content();
    for my $i (0 .. $#{ $rows })
    {
        my @row_lines = split(/\n/, $rows->[ $i ]);

        for my $j (0 .. $#row_lines)
        {
            # Save the line and its address, then advance the index
            push @lines, $row_lines[ $j ];
            $line_addr->{ 'L4' } = $lindex;
            push @lines_addr, { %{ $line_addr } };
            $lindex++;

            # Only the first line of the first row starts a paragraph
            push @g_para, ((($i == 0) && ($j == 0)) ? "yes" : "no");

            push @g_table,     "yes";
            push @g_pic,       $pic_flag;
            push @g_pos_hash,  $pos;
            push @g_align,     $align;
            push @g_font_size, -1;
            push @g_font_face, "none";
            push @g_bold,      "no";
            push @g_italic,    "no";
            push @g_bullet,    "no";
        }
    }

    return $lindex;
}
|
|
709
|
-
|
|
710
|
-
# ProcessPara($paragraph, $is_pic, $line_addr)
#
# Records every non-blank line of a paragraph together with its features.
#
#   $paragraph - paragraph object; its lines come from get_objs_ref()
#   $is_pic    - true when the paragraph lies inside an image area
#   $line_addr - hash reference holding the current L1..L3 position; L4 is
#                set to the line's index within the paragraph (blank lines
#                included in the count) and a copy is stored per line
#
# The first line that is actually recorded is flagged as the start of the
# paragraph. Testing the raw index instead would lose the paragraph boundary
# whenever the paragraph begins with a blank line, because blank lines are
# skipped before the flag is written.
sub ProcessPara
{
    my ($paragraph, $is_pic, $line_addr) = @_;

    # Paragraph attributes
    my $align = $paragraph->get_alignment();
    my $space = $paragraph->get_space_before();

    # Lines
    my $omnilines = $paragraph->get_objs_ref();

    # True until the first non-blank line has been recorded
    my $is_first_line = 1;

    for my $t (0 .. $#{ $omnilines })
    {
        my $omniline = $omnilines->[ $t ];

        # Skip blank lines; the stored text is the untrimmed content
        my $raw_content = $omniline->get_content();
        (my $lcontent = $raw_content) =~ s/^\s+|\s+$//g;
        next if $lcontent eq "";

        # Save the line and its address
        push @lines, $raw_content;
        $line_addr->{ 'L4' } = $t;
        push @lines_addr, { %{ $line_addr } };

        # Line attributes
        my $top    = $omniline->get_top_pos();
        my $bottom = $omniline->get_bottom_pos();

        # Per-line word statistics gathered from the runs
        my ($words_count, $bold_count, $italic_count) = (0, 0, 0);
        my (%font_size_hash, %font_face_hash);

        foreach my $run (@{ $omniline->get_objs_ref() })
        {
            # Thang's compatible code: count the words of the trimmed run content
            my $rcontent = $run->get_content();
            $rcontent =~ s/^\s+|\s+$//g;
            my @words   = split(/\s+/, $rcontent);
            my $n_words = scalar(@words);

            $words_count += $n_words;

            # Words per font size and per font face
            $font_size_hash{ $run->get_font_size() } += $n_words;
            $font_face_hash{ $run->get_font_face() } += $n_words;

            # Words set in bold / italic
            if ($run->get_bold()   eq "true") { $bold_count   += $n_words; }
            if ($run->get_italic() eq "true") { $italic_count += $n_words; }
        }

        # Line attributes - relative position in paragraph
        push @g_para, ($is_first_line ? "yes" : "no");
        $is_first_line = 0;

        # Line attributes - vertical midpoint, also tracked as global min/max
        my $pos = ($top + $bottom) / 2.0;
        if ($pos < $g_minpos) { $g_minpos = $pos; }
        if ($pos > $g_maxpos) { $g_maxpos = $pos; }

        push @g_pos_hash, $pos;
        push @g_align,    $align;
        push @g_table,    "no";

        if ($is_pic)
        {
            # Lines inside an image area get neutral formatting values
            push @g_pic,       "yes";
            push @g_bold,      "no";
            push @g_italic,    "no";
            push @g_bullet,    "no";
            push @g_font_size, -1;
            push @g_font_face, "none";
        }
        else
        {
            push @g_pic, "no";
            UpdateXMLFontFeature(\%font_size_hash, \%font_face_hash);
            UpdateXMLFeatures($bold_count, $italic_count, $words_count, $omniline->get_bullet(), $space);
        }
    }
}
|
|
832
|
-
|
|
833
|
-
# UpdateXMLFontFeature($font_size_hash, $font_face_hash)
#
# Appends the dominant font size and font face of one line to @g_font_size
# and @g_font_face, and counts them in %g_font_size_hash / %g_font_face_hash.
#
#   $font_size_hash - hash reference: font size => number of words
#   $font_face_hash - hash reference: font face => number of words
#
# The dominant value is the one covering the most words. Ties are broken
# deterministically: the largest size wins, and the alphabetically first
# face wins. An empty size hash records -1 and an empty face hash records
# "none"; neither is counted in the global hashes.
sub UpdateXMLFontFeature
{
    my ($font_size_hash, $font_face_hash) = @_;

    # Font size feature
    if (scalar(keys %{ $font_size_hash }) == 0)
    {
        push @g_font_size, -1;
    }
    else
    {
        # Most words first; if several font sizes are equal in number, the
        # larger one comes first
        my ($font_size) = sort {
            $font_size_hash->{ $b } <=> $font_size_hash->{ $a }
                or $b <=> $a
        } keys %{ $font_size_hash };

        # A run without a font size yields an empty key; record it as 0
        if ($font_size eq "") { $font_size = 0; }

        push @g_font_size, $font_size;
        $g_font_size_hash{ $font_size }++;
    }

    # Font face feature
    if (scalar(keys %{ $font_face_hash }) == 0)
    {
        push @g_font_face, "none";
    }
    else
    {
        # Most words first; equal counts are ordered by name so the choice
        # does not depend on hash ordering
        my ($font_face) = sort {
            $font_face_hash->{ $b } <=> $font_face_hash->{ $a }
                or $a cmp $b
        } keys %{ $font_face_hash };

        push @g_font_face, $font_face;
        $g_font_face_hash{ $font_face }++;
    }
}
|
|
878
|
-
|
|
879
|
-
# UpdateXMLFeatures($bold_count, $italic_count, $words_count, $is_bullet, $space)
#
# Appends the bold, italic and bullet flags of one line to @g_bold,
# @g_italic and @g_bullet.
#
#   $bold_count   - number of bold words in the line
#   $italic_count - number of italic words in the line
#   $words_count  - total number of words in the line
#   $is_bullet    - "true" when the line is a bullet item (may be undef)
#   $space        - accepted but not used
#
# A line is bold (italic) when at least 0.667 of its words are; a line
# without words is neither.
sub UpdateXMLFeatures
{
    my ($bold_count, $italic_count, $words_count, $is_bullet, $space) = @_;

    # Guards the divisions below
    my $has_words = ($words_count != 0);

    # Bold feature
    my $is_bold = $has_words && ($bold_count / $words_count >= 0.667);
    push @g_bold, ($is_bold ? "yes" : "no");

    # Italic feature
    my $is_italic = $has_words && ($italic_count / $words_count >= 0.667);
    push @g_italic, ($is_italic ? "yes" : "no");

    # Bullet feature
    my $bulleted = (defined $is_bullet) && ($is_bullet eq "true");
    push @g_bullet, ($bulleted ? "yes" : "no");
}
|
|
917
|
-
|
|
918
|
-
# UntaintPath($path)
#
# Validates a file path and returns the captured (untainted) copy. Only word
# characters, "-", "_", "/" and "." are accepted; the empty string passes.
# Dies with "Bad path ..." for anything else.
sub UntaintPath
{
    my ($path) = @_;

    # Guard clause: reject first, then hand back the regex capture
    die "Bad path \"$path\"\n" unless $path =~ /^([-_\/\w\.]*)$/;

    return $1;
}
|
|
933
|
-
|
|
934
|
-
# Untaint($s)
#
# Validates a general string and returns the captured (untainted) copy. The
# string must be non-empty and consist only of word characters, space and
# the characters - @ ( ) , . /
# Dies with "Bad data in ..." otherwise.
sub Untaint
{
    my ($s) = @_;

    unless ($s =~ /^([\w \-\@\(\),\.\/]+)$/)
    {
        die "Bad data in $s"; # log this somewhere
    }

    # The capture is the untainted data
    return $1;
}
|
|
948
|
-
|
|
949
|
-
# Execute($cmd)
#
# Passes $cmd through Untaint() and runs the result with system(), whose
# status is returned. Untaint() dies when $cmd contains characters it does
# not accept.
sub Execute
{
    my ($cmd) = @_;

    return system(Untaint($cmd));
}
|
|
955
|
-
|
|
956
|
-
# NewTmpFile()
#
# Returns a name of the form "YYYYMMDD-HHMMSS-<pid>" built from the current
# local time and this process's id. No file is created.
#
# The timestamp is formatted in-process with POSIX::strftime rather than by
# running the external `date` command, so the sub no longer depends on the
# shell or on $ENV{PATH}.
sub NewTmpFile
{
    # Loaded here so the sub stands on its own without a file-level import
    require POSIX;

    my $tmp_file = POSIX::strftime('%Y%m%d-%H%M%S', localtime()) . "-$$";

    return $tmp_file;
}
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|