biblicit 2.0.3 → 2.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,964 +0,0 @@
1
- #!/usr/bin/perl
2
-
3
- # Author: Do Hoang Nhat Huy <huydo@comp.nus.edu.sg>
4
- # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
5
-
6
- require 5.0;
7
- use strict;
8
-
9
- # Dependencies
10
- use FindBin;
11
- use Getopt::Long;
12
- use HTML::Entities;
13
-
14
- # I do not know a better solution to find a lib path in -T mode.
15
- # So if you know a better solution, I'd be glad to hear.
16
- # See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
17
-
18
- # To get correct path in case 2 scripts in different directories use FindBin
19
# Locate the project lib directory relative to this script.
#
# Everything that feeds `use lib` has to run at compile time, because `use`
# is evaluated before any ordinary run-time statement.  FindBin::again()
# is therefore called inside the BEGIN block: placed outside it, it would run
# only after the lib path had already been computed and installed.
my $path;
BEGIN
{
    # Re-run FindBin's detection so $FindBin::Bin reflects this script.
    FindBin::again();
    # The catch-all capture copies the directory out of the match so it can be
    # used for the lib path under -T (see the note above about taint mode).
    if ($FindBin::Bin =~ /(.*)/) { $path = $1; }
}
use lib "$path/../../lib";
26
-
27
- use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
28
- use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";
29
-
30
- # Local libraries
31
- use Omni::Config;
32
- use Omni::Omnidoc;
33
- use SectLabel::PreProcess;
34
-
35
- # Omnilib configuration: object name
36
# Shared table from Omni::Config; the code below looks up the OMNIDD,
# OMNIPARA, OMNITABLE and OMNIFRAME entries to recognise each object's kind.
my $obj_list = $Omni::Config::obj_list;

### USER customizable section
# Name of this script with any leading directory removed (used by Help).
my ($progname) = $0 =~ /([^\/]+)$/;
my $version    = "1.0";
### END user customizable section
42
-
43
# Write the one-line copyright banner to STDERR.
sub License
{
    my $banner = "# Copyright 2011 \251 by Do Hoang Nhat Huy\n";
    print STDERR $banner;
}
47
-
48
# Print the usage message to STDERR.
#
# Reads the file-level $progname for the script name.  The only textual change
# from the previous version is the corrected spelling of "from" in the
# description line.
sub Help
{
    my @usage = (
        "Process Omnipage XML output (concatenated results from all pages of a PDF file), and extract text lines together with other XML infos\n",
        "usage: $progname -h\t[invokes help]\n",
        " $progname -in xmlfile -out outfile [-decode] [-log]\n",
        "Options:\n",
        "\t-q \tQuiet Mode (don't echo license)\n",
        "\t-decode \tDecode HTML entities and then output, to avoid double entity encoding later\n",
    );

    print STDERR join("", @usage);
}
57
-
58
# Command-line state, filled in by GetOptions below.
my $quiet     = 0;        # -q      : do not print the copyright banner
my $help      = 0;        # -h      : print usage and stop
my $out_file  = undef;    # -out    : output file (required)
my $in_file   = undef;    # -in     : Omnipage XML input file (required)
my $is_decode = 0;        # -decode : decode HTML entities before writing
my $is_debug  = 0;        # -log    : parsed here; not read anywhere else in this script
my $address   = 1;        # always on: also write "<out_file>.address"

my $options_ok = GetOptions(
    'in=s'   => \$in_file,
    'out=s'  => \$out_file,
    'decode' => \$is_decode,
    'log'    => \$is_debug,
    'h'      => \$help,
    'q'      => \$quiet,
);
# An unparsable command line is treated like an explicit -h.
$help = 1 unless $options_ok;

# Both file names are mandatory.  Note the exit status is 0 on this path too.
my $show_usage = $help || !defined($in_file) || !defined($out_file);
if ($show_usage)
{
    Help();
    exit(0);
}

License() unless $quiet;

### Untaint ###
# Validate the user-supplied paths and pin PATH to a fixed value.
$in_file     = UntaintPath($in_file);
$out_file    = UntaintPath($out_file);
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
### End untaint ###
89
-
90
# ---------------------------------------------------------------------------
# File-level state shared by the Process* / Update* / Output subs.
#
# The Process* subs push exactly one entry onto each per-line array for every
# line they append to @lines, and Output indexes all of them by line number.
# ---------------------------------------------------------------------------

# Mark page, para, line, word (declared here; not referenced by the subs below)
my %g_page_hash;

# "yes" for a line that starts a paragraph (or table), "no" otherwise
my @g_para;

# XML features
# Location: vertical midpoint of each line, plus the running min and max
my @g_pos_hash;
my $g_maxpos = 0;
my $g_minpos = 1000000;
# Alignment of the enclosing paragraph or table
my @g_align;
# Bold / italic flag per line ("yes" or "no")
my @g_bold;
my @g_italic;
# "yes" when the line sits inside an image (OMNIDD) area
my @g_pic;
# "yes" when the line comes from a table
my @g_table;
# "yes" when the line is a bullet item
my @g_bullet;
# Font size: document-wide histogram (size => number of lines) and per-line value
my %g_font_size_hash;
my @g_font_size;
# Font face: document-wide histogram (face => number of lines) and per-line value
my %g_font_face_hash;
my @g_font_face;

# Text of every extracted line ...
my @lines;
# ... and, in parallel, a hash of its L1..L4 address in the document tree
my @lines_addr;
124
-
125
# BEGIN
# 1. Parse the XML and fill @lines plus the per-line feature tables.
ProcessFile($in_file);

# 2. Find the header part.  $body_start_id is read later by
#    GetDifferentialFeatures, so it must stay a file-level lexical.
my $num_lines = scalar(@lines);
my ($header_length, $body_length, $body_start_id) = SectLabel::PreProcess::FindHeaderText(\@lines, 0, $num_lines);

# 3. Write the feature file.
Output(\@lines, $out_file);

# 4. Save each line's address (L1 L2 L3 L4, space separated) for further use.
if ($address == 1)
{
    my $address_file = $out_file . ".address";

    open(my $address_handle, ">:utf8", $address_file)
        or die "#Can't open file \"$address_file\": $!\n";

    foreach my $addr (@lines_addr)
    {
        print $address_handle join(" ", @{ $addr }{ 'L1', 'L2', 'L3', 'L4' }), "\n";
    }

    # Buffered write errors (e.g. a full disk) only surface at close time.
    close($address_handle)
        or die "#Can't write file \"$address_file\": $!\n";
}
# END
146
-
147
# Read an Omnipage XML file and feed every paragraph, table and frame it
# contains to the matching Process* sub.
#
# Parameters:
#   $in_file - path of the XML file (several per-page XML documents
#              concatenated into one file)
#
# Dies if the file cannot be opened.  Results are accumulated in the
# file-level @lines, @lines_addr and per-line feature tables by the Process*
# subs; nothing meaningful is returned.
sub ProcessFile
{
    my ($in_file) = @_;

    open(my $input_handle, "<:utf8", $in_file)
        or die "Could not open xml file " . $in_file . ": $!";
    my $xml = do { local $/; <$input_handle> };
    close $input_handle;

    ###
    # Huydhn
    # NOTE: the omnipage xml is not well constructed (concatenated multiple xml files).
    # The merged xml needs to be fixed before it is passed on for parsing.
    ###
    # Convert to Unix format
    $xml =~ s/\r//g;
    # Remove every <?xml version="1.0" encoding="UTF-8"?> line
    $xml =~ s/<\?xml.+?>\n//g;
    # Remove every <!--XML document generated using OCR technology ...--> line
    $xml =~ s/<\!\-\-XML.+?>\n//g;
    # Add a single declaration and wrap everything in one root element
    $xml = "<?xml version=\"1.0\"?>" . "\n" . "<root>" . "\n" . $xml . "\n" . "</root>";

    # New document (direct method call instead of indirect-object syntax)
    my $doc = Omni::Omnidoc->new();
    $doc->set_raw($xml);

    # Address of the object currently being visited: L1 = page index,
    # L2 = index inside the page, L3 = index inside that object.  The Process*
    # subs add L4 and copy the hash for every line they record.
    my %current = ();

    # All pages in the document
    my $pages     = $doc->get_objs_ref();
    my $last_page = scalar(@{ $pages }) - 1;

    for my $x (0 .. $last_page)
    {
        $current{ 'L1' } = $x;

        # Column or dd
        my $level_2 = $pages->[ $x ]->get_objs_ref();
        my $last_l2 = scalar(@{ $level_2 }) - 1;

        for my $y (0 .. $last_l2)
        {
            my $container = $level_2->[ $y ];

            # Thang's code: a <dd> tag is considered an image area
            my $is_pic = ($container->get_name() eq $obj_list->{ 'OMNIDD' }) ? 1 : 0;

            $current{ 'L2' } = $y;

            # Table, paragraph or frame
            my $level_3 = $container->get_objs_ref();
            my $last_l3 = scalar(@{ $level_3 }) - 1;

            for my $z (0 .. $last_l3)
            {
                $current{ 'L3' } = $z;

                my $node = $level_3->[ $z ];
                my $kind = $node->get_name();

                if ($kind eq $obj_list->{ 'OMNIPARA' })
                {
                    ProcessPara($node, $is_pic, \%current);
                }
                elsif ($kind eq $obj_list->{ 'OMNITABLE' })
                {
                    # Line numbering of a stand-alone table starts at 0
                    ProcessTable($node, $is_pic, \%current, 0);
                }
                elsif ($kind eq $obj_list->{ 'OMNIFRAME' })
                {
                    # A frame may hold several paragraphs and tables
                    ProcessFrame($node, $is_pic, \%current);
                }
                # Any other kind of object is ignored.
            }
        }
    }
}
250
-
251
# Write every line together with its XML features to the output file.
#
# Parameters:
#   $lines    - array ref of line texts; each element is trimmed IN PLACE
#   $out_file - path of the file to (over)write, opened as UTF-8
#
# Each output line has the form
#   <text> |XML| <loc> <bold> <italic> <font size> <pic> <table> <bullet> <SFBIA diff> <para diff>
# Lines are buffered one paragraph at a time; when -decode was given, HTML
# entities are decoded in each buffer just before it is written.
#
# Dies if the file cannot be opened or if closing it reports a write error.
sub Output
{
    my ($lines, $out_file) = @_;

    open(my $output_handle, ">:utf8", $out_file)
        or die "#Can't open file \"$out_file\": $!\n";

    # Map every font size seen in the document to its relative label
    my %g_font_size_labels = ();
    GetFontSizeLabels(\%g_font_size_hash, \%g_font_size_labels);

    # Text of the paragraph currently being assembled
    my $output = "";

    # Write the pending paragraph, if there is one, and start a new buffer
    my $flush = sub
    {
        return if $output eq "";
        if ($is_decode) { $output = decode_entities($output); }
        print $output_handle $output;
        $output = "";
    };

    my $last_id = scalar(@{ $lines }) - 1;
    for my $id (0 .. $last_id)
    {
        # Strip leading and trailing whitespace (modifies the caller's array)
        $lines->[ $id ] =~ s/^\s+|\s+$//g;

        # A line that starts a paragraph closes the previous one
        if ($g_para[ $id ] eq "yes") { $flush->(); }

        $output .= $lines->[ $id ];

        # XML location feature: the vertical position scaled into buckets 0..7.
        # Left empty for a position of -1 (this used to be an undefined value,
        # which prints the same way).
        my $loc_feature = "";
        if ($g_pos_hash[ $id ] != (-1))
        {
            $loc_feature = "xmlLoc_" . int(($g_pos_hash[ $id ] - $g_minpos) * 8.0 / ($g_maxpos - $g_minpos + 1));
        }

        # Font size feature
        my $font_size_feature = undef;
        if (($g_font_size[ $id ] eq "") || ($g_font_size[ $id ] == -1))
        {
            $font_size_feature = "xmlFontSize_none";
        }
        else
        {
            $font_size_feature = "xmlFontSize_" . $g_font_size_labels{ $g_font_size[ $id ] };
        }

        my $bold_feature   = "xmlBold_"   . $g_bold[ $id ];
        my $italic_feature = "xmlItalic_" . $g_italic[ $id ];
        my $pic_feature    = "xmlPic_"    . $g_pic[ $id ];
        my $table_feature  = "xmlTable_"  . $g_table[ $id ];
        my $bullet_feature = "xmlBullet_" . $g_bullet[ $id ];

        # Differential features: only the last two of the seven returned
        # values (combined font/bold/italic/align change, and paragraph
        # change) are written out.
        my @diff_features = GetDifferentialFeatures($id);
        my ($font_sfbia_diff, $para_diff) = @diff_features[ 5, 6 ];

        # Each line and its XML features
        $output .= " |XML| $loc_feature $bold_feature $italic_feature $font_size_feature $pic_feature $table_feature $bullet_feature $font_sfbia_diff $para_diff" . "\n";
    }

    # Last paragraph
    $flush->();

    # Buffered write errors (e.g. a full disk) only surface at close time.
    close($output_handle)
        or die "#Can't write file \"$out_file\": $!\n";
}
338
-
339
# Build the "did this attribute change since the previous line?" features
# for line $id.
#
# Parameters:
#   $id - index of the line in the file-level per-line tables
#
# Returns a seven-element list, in this order:
#   align, font size, font face, size+face, size+face+bold+italic,
#   size+face+bold+italic+align, paragraph.
# Each font feature ends in "continue" when every attribute it covers equals
# the previous line's, and in "new" otherwise; line 0 is always "new".  The
# align feature carries the line's own alignment instead of "new".  The
# paragraph feature is "header" for lines before $body_start_id, else "new"
# or "continue" according to @g_para.
sub GetDifferentialFeatures
{
    my ($id) = @_;

    # The first line has nothing to compare against; the && below keeps the
    # comparisons from being evaluated for it.
    my $has_prev = ($id != 0);
    my $prev     = $id - 1;

    my $same_align  = $has_prev && ($g_align[ $id ]     eq $g_align[ $prev ]);
    my $same_face   = $has_prev && ($g_font_face[ $id ] eq $g_font_face[ $prev ]);
    my $same_size   = $has_prev && ($g_font_size[ $id ] == $g_font_size[ $prev ]);
    my $same_bold   = $has_prev && ($g_bold[ $id ]      eq $g_bold[ $prev ]);
    my $same_italic = $has_prev && ($g_italic[ $id ]    eq $g_italic[ $prev ]);

    # Cumulative combinations used by the compound features
    my $same_sf    = $same_size && $same_face;
    my $same_sfbi  = $same_sf   && $same_bold && $same_italic;
    my $same_sfbia = $same_sfbi && $same_align;

    my $label = sub { return $_[0] ? "continue" : "new"; };

    # AlignChange feature: a change is reported as the new alignment itself
    my $align_diff      = "bi_xmlA_"     . ($same_align ? "continue" : $g_align[ $id ]);
    # FontFaceChange feature
    my $font_face_diff  = "bi_xmlF_"     . $label->($same_face);
    # FontSizeChange feature
    my $font_size_diff  = "bi_xmlS_"     . $label->($same_size);
    # FontSFChange feature
    my $font_sf_diff    = "bi_xmlSF_"    . $label->($same_sf);
    # FontSFBIChange feature
    my $font_sfbi_diff  = "bi_xmlSFBI_"  . $label->($same_sfbi);
    # FontSFBIAChange feature
    my $font_sfbia_diff = "bi_xmlSFBIA_" . $label->($same_sfbia);

    # ParaChange feature: in the header part every line is tagged "header"
    my $para_state = ($id < $body_start_id)    ? "header"
                   : ($g_para[ $id ] eq "yes") ? "new"
                   :                             "continue";
    my $para_diff = "bi_xmlPara_" . $para_state;

    return ($align_diff, $font_size_diff, $font_face_diff, $font_sf_diff, $font_sfbi_diff, $font_sfbia_diff, $para_diff);
}
454
-
455
# Turn the document-wide font size histogram into relative size labels.
#
# Parameters:
#   $g_font_size_hash   - hash ref: font size => number of lines using it
#   $g_font_size_labels - hash ref filled in by this sub: font size => label
#
# The most frequent size is labelled "common"; when several sizes are equally
# frequent the largest of them is chosen, so the result no longer depends on
# hash ordering.  Sizes below the common one are "smaller".  Of the sizes
# above it, the three largest in the document are "largest-2", "largest-1"
# and "largest0" (the very largest), and the rest are "larger".
#
# An empty histogram leaves the label hash untouched.
sub GetFontSizeLabels
{
    my ($g_font_size_hash, $g_font_size_labels) = @_;

    # All sizes in ascending numeric order
    my @sorted_fonts = sort { $a <=> $b } keys %{ $g_font_size_hash };
    return if !@sorted_fonts;

    # Most frequent size.  Walking upwards with ">=" lets the larger size win
    # a tie.
    my $common_size = $sorted_fonts[ 0 ];
    foreach my $size (@sorted_fonts)
    {
        if ($g_font_size_hash->{ $size } >= $g_font_size_hash->{ $common_size })
        {
            $common_size = $size;
        }
    }

    # Index of the common font size in the ascending list
    my $common_index = 0;
    foreach my $size (@sorted_fonts)
    {
        last if $common_size == $size;
        $common_index++;
    }

    my $total = scalar(@sorted_fonts);

    # Small fonts
    for my $i (0 .. $common_index - 1)
    {
        $g_font_size_labels->{ $sorted_fonts[ $i ] } = "smaller";
    }

    # Common font
    $g_font_size_labels->{ $common_size } = "common";

    # Large fonts: the offset from the end of the list (0, -1, -2) is part of
    # the label for the top three sizes.
    for my $i ($common_index + 1 .. $total - 1)
    {
        if (($total - $i) <= 3)
        {
            $g_font_size_labels->{ $sorted_fonts[ $i ] } = "largest" . ($i + 1 - $total);
        }
        else
        {
            $g_font_size_labels->{ $sorted_fonts[ $i ] } = "larger";
        }
    }

    return;
}
498
-
499
# Record every line of one frame.
#
# Parameters:
#   $omniframe - frame object; get_objs_ref() gives its child objects
#   $is_pic    - true when the frame lies inside an image (OMNIDD) area
#   $line_addr - hash ref holding the frame's address; its 'L4' entry is
#                overwritten with the running line index and a copy of the
#                hash is stored for every line
#
# Paragraph children are handled here; table children are handed to
# ProcessTable, which continues the same line numbering.  Any other child is
# ignored.  Unlike ProcessPara, blank lines are not skipped.
sub ProcessFrame
{
    my ($omniframe, $is_pic, $line_addr) = @_;

    # Line index inside the whole frame
    my $lindex = 0;

    my $children    = $omniframe->get_objs_ref();
    my $child_count = scalar(@{ $children });

    for my $i (0 .. $child_count - 1)
    {
        my $child = $children->[ $i ];
        my $kind  = $child->get_name();

        if ($kind ne $obj_list->{ 'OMNIPARA' })
        {
            # A table takes over the line counter and hands it back
            if ($kind eq $obj_list->{ 'OMNITABLE' })
            {
                $lindex = ProcessTable($child, $is_pic, $line_addr, $lindex);
            }
            next;
        }

        # Paragraph attributes
        my $align = $child->get_alignment();
        my $space = $child->get_space_before();

        my $omnilines  = $child->get_objs_ref();
        my $line_count = scalar(@{ $omnilines });

        for my $t (0 .. $line_count - 1)
        {
            my $omniline = $omnilines->[ $t ];

            # Save the line and its address, then advance the frame-wide index
            push @lines, $omniline->get_content();
            $line_addr->{ 'L4' } = $lindex++;
            push @lines_addr, { %{ $line_addr } };

            # Line attributes (left and right are read but not used below)
            my $left   = $omniline->get_left_pos();
            my $right  = $omniline->get_right_pos();
            my $top    = $omniline->get_top_pos();
            my $bottom = $omniline->get_bottom_pos();

            # Per-line tallies, all weighted by number of words
            my $words_count  = 0;
            my $bold_count   = 0;
            my $italic_count = 0;
            my %font_size_hash;
            my %font_face_hash;

            my $runs      = $omniline->get_objs_ref();
            my $run_count = scalar(@{ $runs });

            for my $u (0 .. $run_count - 1)
            {
                my $run = $runs->[ $u ];

                # Thang's compatible code: count words from the run's text
                my $rcontent = $run->get_content();
                $rcontent =~ s/^\s+|\s+$//g;
                my @words   = split(/\s+/, $rcontent);
                my $n_words = scalar(@words);

                $words_count += $n_words;

                my $font_size = $run->get_font_size();
                $font_size_hash{ $font_size } += $n_words;

                my $font_face = $run->get_font_face();
                $font_face_hash{ $font_face } += $n_words;

                if ($run->get_bold()   eq "true") { $bold_count   += $n_words; }
                if ($run->get_italic() eq "true") { $italic_count += $n_words; }
            }

            # The first line of the paragraph starts it
            push @g_para, (($t == 0) ? "yes" : "no");

            # Vertical midpoint of the line, tracked against the global range
            my $pos = ($top + $bottom) / 2.0;
            if ($pos < $g_minpos) { $g_minpos = $pos; }
            if ($pos > $g_maxpos) { $g_maxpos = $pos; }

            push @g_pos_hash, $pos;
            push @g_align, $align;
            push @g_table, "no";

            if ($is_pic)
            {
                # Lines in an image area get neutral font/style values
                push @g_pic, "yes";
                push @g_bold, "no";
                push @g_italic, "no";
                push @g_bullet, "no";
                push @g_font_size, -1;
                push @g_font_face, "none";
            }
            else
            {
                push @g_pic, "no";
                UpdateXMLFontFeature(\%font_size_hash, \%font_face_hash);
                UpdateXMLFeatures($bold_count, $italic_count, $words_count, $omniline->get_bullet(), $space);
            }
        }
    }
}
628
-
629
# Record every line of one table.
#
# Parameters:
#   $omnitable - table object
#   $is_pic    - true when the table lies inside an image (OMNIDD) area
#   $line_addr - hash ref holding the table's address; its 'L4' entry is
#                overwritten with the running line index and a copy of the
#                hash is stored for every line
#   $lindex    - line index to start numbering from
#
# Every row from get_row_content() is split on newlines and each piece
# becomes one line.  All lines share the table's own position and alignment
# and get neutral font, bold, italic and bullet values.
#
# Returns the line index following the last line recorded.
sub ProcessTable
{
    my ($omnitable, $is_pic, $line_addr, $lindex) = @_;

    # Table attributes (left and right are read but not used below)
    my $left   = $omnitable->get_left_pos();
    my $right  = $omnitable->get_right_pos();
    my $top    = $omnitable->get_top_pos();
    my $bottom = $omnitable->get_bottom_pos();
    my $align  = $omnitable->get_alignment();

    # Thang's code: one vertical midpoint for the whole table
    my $pos = ($top + $bottom) / 2.0;
    $g_minpos = $pos if $pos < $g_minpos;
    $g_maxpos = $pos if $pos > $g_maxpos;

    my $pic_flag = $is_pic ? "yes" : "no";

    my $rows      = $omnitable->get_row_content();
    my $row_count = scalar(@{ $rows });

    for my $i (0 .. $row_count - 1)
    {
        my @row_lines = split(/\n/, $rows->[ $i ]);

        for my $j (0 .. scalar(@row_lines) - 1)
        {
            # Save the line and its address, then advance the index
            push @lines, $row_lines[ $j ];
            $line_addr->{ 'L4' } = $lindex++;
            push @lines_addr, { %{ $line_addr } };

            # Only the very first line of the first row starts a "paragraph"
            my $starts_para = ($i == 0) && ($j == 0);
            push @g_para, ($starts_para ? "yes" : "no");

            push @g_table, "yes";
            push @g_pic, $pic_flag;
            push @g_pos_hash, $pos;
            push @g_align, $align;

            # No font or style information is collected for table lines
            push @g_font_size, -1;
            push @g_font_face, "none";
            push @g_bold, "no";
            push @g_italic, "no";
            push @g_bullet, "no";
        }
    }

    return $lindex;
}
709
-
710
# Record every non-blank line of one paragraph.
#
# Parameters:
#   $paragraph - paragraph object; get_objs_ref() gives its lines
#   $is_pic    - true when the paragraph lies inside an image (OMNIDD) area
#   $line_addr - hash ref holding the paragraph's address; its 'L4' entry is
#                overwritten with the line's index inside the paragraph and a
#                copy of the hash is stored for every recorded line
#
# Lines whose text is empty after trimming are skipped.  The first line that
# is actually recorded is marked as the paragraph start.  Previously the
# marker was tied to index 0, so a paragraph beginning with a blank line
# never got one.
sub ProcessPara
{
    my ($paragraph, $is_pic, $line_addr) = @_;

    # Paragraph attributes
    my $align = $paragraph->get_alignment();
    my $space = $paragraph->get_space_before();

    my $omnilines = $paragraph->get_objs_ref();
    my $end_l     = scalar(@{ $omnilines }) - 1;

    # True until the first non-blank line has been recorded
    my $is_first_recorded = 1;

    for my $t (0 .. $end_l)
    {
        my $omniline = $omnilines->[ $t ];

        # Skip blank lines; the stored text keeps its original whitespace
        my $raw_content = $omniline->get_content();
        (my $lcontent = $raw_content) =~ s/^\s+|\s+$//g;
        if ($lcontent eq "") { next; }

        # Save the line and its address
        push @lines, $raw_content;
        $line_addr->{ 'L4' } = $t;
        push @lines_addr, { %{ $line_addr } };

        # Line attributes
        my $top    = $omniline->get_top_pos();
        my $bottom = $omniline->get_bottom_pos();

        # Per-line tallies, all weighted by number of words
        my $words_count  = 0;
        my $bold_count   = 0;
        my $italic_count = 0;
        my %font_size_hash;
        my %font_face_hash;

        my $runs  = $omniline->get_objs_ref();
        my $end_r = scalar(@{ $runs }) - 1;

        for my $u (0 .. $end_r)
        {
            my $run = $runs->[ $u ];

            # Thang's compatible code: count words from the run's text
            my $rcontent = $run->get_content();
            $rcontent =~ s/^\s+|\s+$//g;
            my @words   = split(/\s+/, $rcontent);
            my $n_words = scalar(@words);

            $words_count += $n_words;

            my $font_size = $run->get_font_size();
            $font_size_hash{ $font_size } += $n_words;

            my $font_face = $run->get_font_face();
            $font_face_hash{ $font_face } += $n_words;

            if ($run->get_bold()   eq "true") { $bold_count   += $n_words; }
            if ($run->get_italic() eq "true") { $italic_count += $n_words; }
        }

        # Relative position in the paragraph
        push @g_para, ($is_first_recorded ? "yes" : "no");
        $is_first_recorded = 0;

        # Vertical midpoint of the line, tracked against the global range
        my $pos = ($top + $bottom) / 2.0;
        if ($pos < $g_minpos) { $g_minpos = $pos; }
        if ($pos > $g_maxpos) { $g_maxpos = $pos; }

        push @g_pos_hash, $pos;
        push @g_align, $align;
        push @g_table, "no";

        if ($is_pic)
        {
            # Lines in an image area get neutral font/style values
            push @g_pic, "yes";
            push @g_bold, "no";
            push @g_italic, "no";
            push @g_bullet, "no";
            push @g_font_size, -1;
            push @g_font_face, "none";
        }
        else
        {
            push @g_pic, "no";
            UpdateXMLFontFeature(\%font_size_hash, \%font_face_hash);
            UpdateXMLFeatures($bold_count, $italic_count, $words_count, $omniline->get_bullet(), $space);
        }
    }
}
832
-
833
# Record the dominant font size and font face of one line.
#
# Parameters:
#   $font_size_hash - hash ref: font size => number of words using it
#   $font_face_hash - hash ref: font face => number of words using it
#
# Appends one entry to @g_font_size and one to @g_font_face, and counts the
# chosen values in %g_font_size_hash and %g_font_face_hash.  A line without
# runs gets -1 and "none", and neither histogram is touched.
#
# When several font sizes are equally frequent the largest wins; this now
# holds for any number of tied sizes, not just the first two in sort order.
# Equally frequent font faces are resolved alphabetically so the result does
# not depend on hash ordering.
sub UpdateXMLFontFeature
{
    my ($font_size_hash, $font_face_hash) = @_;

    # Font size feature
    if (scalar(keys %{ $font_size_hash }) == 0)
    {
        push @g_font_size, -1;
    }
    else
    {
        # Most frequent first, larger size first among equals
        my ($font_size) = sort {
            $font_size_hash->{ $b } <=> $font_size_hash->{ $a }
                or
            $b <=> $a
        } keys %{ $font_size_hash };

        # A run without a font size is counted under size 0
        if ($font_size eq "") { $font_size = 0; }

        push @g_font_size, $font_size;
        $g_font_size_hash{ $font_size }++;
    }

    # Font face feature
    if (scalar(keys %{ $font_face_hash }) == 0)
    {
        push @g_font_face, "none";
    }
    else
    {
        # Most frequent first, alphabetical among equals
        my ($font_face) = sort {
            $font_face_hash->{ $b } <=> $font_face_hash->{ $a }
                or
            $a cmp $b
        } keys %{ $font_face_hash };

        push @g_font_face, $font_face;
        $g_font_face_hash{ $font_face }++;
    }
}
878
-
879
# Record the bold, italic and bullet flags of one line.
#
# Parameters:
#   $bold_count   - number of words in bold runs
#   $italic_count - number of words in italic runs
#   $words_count  - total number of words in the line
#   $is_bullet    - the line's bullet attribute; only the string "true" counts
#   $space        - accepted but not used
#
# A line is bold (italic) when at least 0.667 of its words are; a line
# without words is neither.  Appends one "yes"/"no" entry to each of
# @g_bold, @g_italic and @g_bullet.
sub UpdateXMLFeatures
{
    my ($bold_count, $italic_count, $words_count, $is_bullet, $space) = @_;

    # Checked first so the divisions below never see a zero word count
    my $has_words = ($words_count != 0);

    # Bold feature
    my $mostly_bold = $has_words && ($bold_count / $words_count >= 0.667);
    push @g_bold, ($mostly_bold ? "yes" : "no");

    # Italic feature
    my $mostly_italic = $has_words && ($italic_count / $words_count >= 0.667);
    push @g_italic, ($mostly_italic ? "yes" : "no");

    # Bullet feature
    my $bulleted = (defined $is_bullet) && ($is_bullet eq "true");
    push @g_bullet, ($bulleted ? "yes" : "no");
}
917
-
918
# Validate a file path and return the untainted copy.
#
# Parameters:
#   $path - candidate path
#
# Accepts only word characters, '-', '_', '/' and '.' (the empty string
# passes as well).  Returns the text captured by the pattern; dies with
# "Bad path ..." for anything else.
sub UntaintPath
{
    my ($path) = @_;

    my ($clean) = $path =~ /^([-_\/\w\.]*)$/
        or die "Bad path \"$path\"\n";

    return $clean;
}
933
-
934
# Validate a general string (such as a command line) and return the
# untainted copy.
#
# Parameters:
#   $s - candidate string
#
# Accepts one or more word characters, spaces and the characters - @ ( ) , . /
# Returns the text captured by the pattern; dies with "Bad data in ..." for
# anything else, including the empty string.
sub Untaint
{
    my ($s) = @_;

    my ($clean) = $s =~ /^([\w \-\@\(\),\.\/]+)$/
        or die "Bad data in $s";    # log this somewhere

    return $clean;
}
948
-
949
# Run a command line through system() after validating it with Untaint.
#
# Parameters:
#   $cmd - command line as a single string
#
# Dies (inside Untaint) if the string contains a character Untaint does not
# allow.  Returns whatever system() returns.
sub Execute
{
    my ($cmd) = @_;

    return system(Untaint($cmd));
}
955
-
956
# Build a name for a temporary file from the current local time and this
# process's id, e.g. "20110315-142501-12345".
#
# The timestamp is now formatted with POSIX::strftime rather than by running
# the external `date` command, so no shell is started and the result does not
# depend on PATH.  No file is created; only the name is returned.
sub NewTmpFile
{
    require POSIX;

    my $stamp = POSIX::strftime('%Y%m%d-%H%M%S', localtime());

    return $stamp . "-" . $$;
}
962
-
963
-
964
-