biblicit 2.0.3 → 2.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,1529 +0,0 @@
1
- #!/usr/bin/perl -wT
2
-
3
- # Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
4
- # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
5
-
6
- require 5.0;
7
- use strict;
8
-
9
- use Getopt::Long;
10
- use HTML::Entities;
11
-
12
- # I do not know a better solution to find a lib path in -T mode.
13
- # So if you know a better solution, I'd be glad to hear.
14
- # See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
15
- use FindBin;
16
- FindBin::again(); # to get correct path in case 2 scripts in different directories use FindBin
17
- my $path;
18
- BEGIN
19
- {
20
- if ($FindBin::Bin =~ /(.*)/) { $path = $1; }
21
- }
22
-
23
- use lib "$path/../../lib";
24
-
25
- use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
26
- use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";
27
-
28
- use SectLabel::PreProcess;
29
-
30
- ### USER customizable section
31
- $0 =~ /([^\/]+)$/; my $progname = $1;
32
- my $outputVersion = "1.0";
33
- ### END user customizable section
34
-
35
- sub License
36
- {
37
- print STDERR "# Copyright 2009 \251 by Luong Minh Thang\n";
38
- }
39
-
40
- ### HELP Sub-procedure
41
- sub Help
42
- {
43
- print STDERR "Process Omnipage XML output (concatenated results fromm all pages of a PDF file), and extract text lines together with other XML infos\n";
44
- print STDERR "usage: $progname -h\t[invokes help]\n";
45
- print STDERR " $progname -in xmlFile -out out_file [-xmlFeature -decode -markup -para] [-tag tag_file -allowEmptyLine -log]\n";
46
- print STDERR "Options:\n";
47
- print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
48
- print STDERR "\t-xmlFeature: append XML feature together with text extracted\n";
49
- print STDERR "\t-decode: decode HTML entities and then output, to avoid double entity encoding later\n";
50
- print STDERR "\t-para: marking in the output each paragraph with # Para lineId num_lines\n";
51
- print STDERR "\t-markup: marking in the output detailed word-level info ### Page w h\\n## Para l t r b\\n# Line l t r b\\nword l t r b\n";
52
- print STDERR "\t-tag tag_file: count XML tags/values for statistics purpose\n";
53
- }
54
-
55
- my $quite = 0;
56
- my $help = 0;
57
- my $out_file = undef;
58
- my $in_file = undef;
59
-
60
- my $is_xml_feature = 0;
61
- my $is_decode = 0;
62
- my $is_markup = 0;
63
- my $is_para_delimiter = 0;
64
- my $is_allow_empty = 0;
65
- my $is_debug = 0;
66
- my $tag_file = "";
67
-
68
- $help = 1 unless GetOptions(
69
- 'in=s' => \$in_file,
70
- 'out=s' => \$out_file,
71
- 'decode' => \$is_decode,
72
- 'xmlFeature' => \$is_xml_feature,
73
- 'tag=s' => \$tag_file,
74
- 'allowEmptyLine' => \$is_allow_empty,
75
- 'markup' => \$is_markup,
76
- 'para' => \$is_para_delimiter,
77
- 'log' => \$is_debug,
78
- 'h' => \$help,
79
- 'q' => \$quite );
80
-
81
- if ($help || !defined $in_file || !defined $out_file)
82
- {
83
- Help();
84
- exit(0);
85
- }
86
-
87
- if (!$quite)
88
- {
89
- License();
90
- }
91
-
92
- ### Untaint ###
93
- $in_file = UntaintPath($in_file);
94
- $out_file = UntaintPath($out_file);
95
- $tag_file = UntaintPath($tag_file);
96
- $ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
97
- ### End untaint ###
98
-
99
- # Mark page, para, line, word
100
- my %g_page_hash = ();
101
-
102
- # Mark paragraph
103
- my @g_para = ();
104
-
105
- # XML features
106
- # Location feature
107
- my @g_pos_hash = ();
108
- my $g_minpos = 1000000;
109
- my $g_maxpos = 0;
110
- # Align feature
111
- my @g_align = ();
112
- # Bold feature
113
- my @g_bold = ();
114
- # Italic feature
115
- my @g_italic = ();
116
- # Font size feature
117
- my %g_font_size_hash = ();
118
- my @g_font_size = ();
119
- # Font face feature
120
- my %g_font_face_hash = ();
121
- my @g_font_face = ();
122
- # Pic feature
123
- my @g_pic = ();
124
- # Table feature
125
- my @g_table = ();
126
- # Bullet feature
127
- my @g_bullet = ();
128
- # Space feature
129
- # my %g_space_hash = ();
130
- # my @g_space = ();
131
-
132
- my %tags = ();
133
-
134
- if ($is_debug)
135
- {
136
- print STDERR "\n# Processing file $in_file & output to $out_file\n";
137
- }
138
-
139
- my $markup_output = "";
140
- my $all_text = ProcessFile($in_file, $out_file, \%tags);
141
-
142
- # Find header part
143
- my @lines = split(/\n/, $all_text);
144
- my $num_lines = scalar(@lines);
145
- my ($header_length, $body_length, $body_start_id) = SectLabel::PreProcess::FindHeaderText(\@lines, 0, $num_lines);
146
-
147
- # Output
148
- if ($is_markup)
149
- {
150
- open(OF, ">:utf8", "$out_file") || die"#Can't open file \"$out_file\"\n";
151
- print OF "$markup_output";
152
- close OF;
153
- }
154
- else
155
- {
156
- Output(\@lines, $out_file);
157
- }
158
-
159
- if ($tag_file ne "")
160
- {
161
- PrintTagInfo(\%tags, $tag_file);
162
- }
163
-
164
- sub ProcessFile
165
- {
166
- my ($in_file, $tags) = @_;
167
-
168
- if (!(-e $in_file)) { die "# $progname crash\t\tFile \"$in_file\" doesn't exist"; }
169
- open (IF, "<:utf8", $in_file) || die "# $progname crash\t\tCan't open \"$in_file\"";
170
-
171
- my $is_para = 0;
172
- my $is_table = 0;
173
- my $is_space = 0;
174
- my $is_pic = 0;
175
- my $all_text = "";
176
- my $text = "";
177
- my $line_id = 0;
178
-
179
- # Each line contains a header
180
- while (<IF>)
181
- {
182
- # Skip comments
183
- if (/^\#/) { next; }
184
- chomp;
185
-
186
- # Remove ^M character at the end of the file if any
187
- s/\cM$//;
188
- my $line = $_;
189
-
190
- if($tag_file ne "") { ProcessTagInfo($line, $tags); }
191
-
192
- # if ($line =~ /<\?xml version.+>/) { } ### XML ###
193
- # if ($line =~ /^<\/column>$/) { } ### Column ###
194
- if ($is_markup && $line =~ /<theoreticalPage (.*)\/>/ && $is_markup) { $markup_output .= "### Page $1\n"; }
195
-
196
- # Pic
197
- if ($line =~ /^<dd (.*)>$/)
198
- {
199
- $is_pic = 1;
200
- if($is_markup) { $markup_output .= "### Figure $1\n"; }
201
- }
202
- elsif ($line =~ /^<\/dd>$/)
203
- {
204
- $is_pic = 0;
205
- }
206
-
207
- # Table
208
- if ($line =~ /^<table .*>$/)
209
- {
210
- $text .= $line."\n"; # we need the header
211
- $is_table = 1;
212
- }
213
- elsif ($line =~ /^<\/table>$/)
214
- {
215
- my $table_text = ProcessTable($text, $is_pic);
216
- $all_text .= $table_text;
217
-
218
- my @tmp_lines = split(/\n/, $table_text);
219
- $line_id += scalar(@tmp_lines);
220
-
221
- $is_table = 0;
222
- $text = "";
223
- }
224
- elsif ($is_table)
225
- {
226
- $text .= $line."\n";
227
- next;
228
- }
229
- # Paragraph
230
- # Note: table processing should have higher priority than paragraph, i.e. the priority does matter
231
- elsif ($line =~ /^<para (.*)>$/)
232
- {
233
- $text .= $line."\n"; # we need the header
234
- $is_para = 1;
235
- if ($is_markup) { $markup_output .= "## Para $1\n"; }
236
- }
237
- elsif ($line =~ /^<\/para>$/)
238
- {
239
- my ($para_text, $l, $t, $r, $b) = undef;
240
- ($para_text, $l, $t, $r, $b, $is_space) = ProcessPara($text, 0, $is_pic);
241
- $all_text .= $para_text;
242
-
243
- my @tmp_lines = split(/\n/, $para_text);
244
- $line_id += scalar(@tmp_lines);
245
- $is_para = 0;
246
- $text = "";
247
- }
248
- elsif ($is_para)
249
- {
250
- $text .= $line."\n";
251
- next;
252
- }
253
- }
254
-
255
- close IF;
256
- return $all_text;
257
- }
258
-
259
- sub Output
260
- {
261
- my ($lines, $out_file) = @_;
262
- open(OF, ">:utf8", "$out_file") || die"#Can't open file \"$out_file\"\n";
263
-
264
- ####### Final output ############
265
- # XML feature label
266
- my %g_font_size_labels = ();
267
- # my %g_space_labels = (); # yes, no
268
-
269
- if($is_xml_feature)
270
- {
271
- GetFontSizeLabels(\%g_font_size_hash, \%g_font_size_labels);
272
- # GetSpaceLabels(\%g_space_hash, \%g_space_labels);
273
- }
274
-
275
- my $id = -1;
276
- my $output = "";
277
- my $para_line_id = -1;
278
- my $para_line_count = 0;
279
-
280
- foreach my $line (@{$lines})
281
- {
282
- $id++;
283
-
284
- # Remove ^M character at the end of each line if any
285
- $line =~ s/\cM$//;
286
-
287
- # Empty lines
288
- if($line =~ /^\s*$/)
289
- {
290
- if(!$is_allow_empty)
291
- {
292
- next;
293
- }
294
- else
295
- {
296
- if($is_debug) { print STDERR "#! Line $id empty!\n"; }
297
- }
298
- }
299
-
300
- if ($g_para[$id] eq "yes")
301
- {
302
- # Mark para
303
- if($output ne "")
304
- {
305
- if($is_para_delimiter)
306
- {
307
- print OF "# Para $para_line_id $para_line_count\n$output";
308
- $para_line_count = 0;
309
- }
310
- else
311
- {
312
- if ($is_decode) { $output = decode_entities($output); }
313
- print OF $output;
314
- }
315
-
316
- $output = "";
317
- }
318
-
319
- $para_line_id = $id;
320
- }
321
-
322
- $output .= $line;
323
- $para_line_count++;
324
-
325
- # Output XML features
326
- if ($is_xml_feature)
327
- {
328
- # Loc feature
329
- my $loc_feature;
330
- if ($g_pos_hash[$id] != -1)
331
- {
332
- $loc_feature = "xmlLoc_".int(($g_pos_hash[$id] - $g_minpos)*8.0/($g_maxpos - $g_minpos + 1));
333
- }
334
-
335
- # Align feature
336
- my $align_feature = "xmlAlign_" . $g_align[$id];
337
-
338
- # Font_size feature
339
- my $font_size_feature;
340
- if ($g_font_size[$id] == -1)
341
- {
342
- $font_size_feature = "xmlFontSize_none";
343
- }
344
- else
345
- {
346
- $font_size_feature = "xmlFontSize_" . $g_font_size_labels{$g_font_size[$id]};
347
- }
348
-
349
- my $bold_feature = "xmlBold_" . $g_bold[$id]; # Bold feature
350
- my $italic_feature = "xmlItalic_" . $g_italic[$id]; # Italic feature
351
- my $pic_feature = "xmlPic_" . $g_pic[$id]; # Pic feature
352
- my $table_feature = "xmlTable_" . $g_table[$id]; # Table feature
353
- my $bullet_feature = "xmlBullet_" . $g_bullet[$id]; # Bullet feature
354
-
355
- # Space feature
356
- # my $space_feature;
357
- # if($g_space[$id] eq "none")
358
- # {
359
- # $space_feature = "xmlSpace_none";
360
- # }
361
- # else
362
- # {
363
- # $space_feature = "xmlSpace_" . $g_space_labels{$g_space[$id]};
364
- # }
365
-
366
- # Differential features
367
- my ($align_diff, $font_size_diff, $font_face_diff, $font_sf_diff, $font_sfbi_diff, $font_sfbia_diff, $para_diff) = GetDifferentialFeatures($id);
368
-
369
- # Each line and its XML features
370
- $output .= " |XML| $loc_feature $bold_feature $italic_feature $font_size_feature $pic_feature $table_feature $bullet_feature $font_sfbia_diff $para_diff\n";
371
- }
372
- else
373
- {
374
- $output .= "\n";
375
- }
376
- }
377
-
378
- # Mark para
379
- if ($output ne "")
380
- {
381
- if ($is_para_delimiter)
382
- {
383
- print OF "# Para $para_line_id $para_line_count\n$output";
384
- $para_line_count = 0;
385
- }
386
- else
387
- {
388
- if($is_decode){ $output = decode_entities($output); }
389
- print OF $output;
390
- }
391
- $output = ""
392
- }
393
-
394
- close OF;
395
- }
396
-
397
- sub GetDifferentialFeatures
398
- {
399
- my ($id) = @_;
400
-
401
- # AlignChange feature
402
- my $align_diff = "bi_xmlA_";
403
-
404
- if ($id == 0)
405
- {
406
- $align_diff .= $g_align[$id];
407
- }
408
- elsif ($g_align[$id] eq $g_align[$id-1])
409
- {
410
- $align_diff .= "continue";
411
- }
412
- else
413
- {
414
- $align_diff .= $g_align[$id];
415
- }
416
-
417
- # FontFaceChange feature
418
- my $font_face_diff = "bi_xmlF_";
419
- if ($id == 0)
420
- {
421
- $font_face_diff .= "new";
422
- }
423
- elsif ($g_font_face[$id] eq $g_font_face[$id-1])
424
- {
425
- $font_face_diff .= "continue";
426
- }
427
- else
428
- {
429
- $font_face_diff .= "new";
430
- }
431
-
432
- # FontSizeChange feature
433
- my $font_size_diff = "bi_xmlS_";
434
- if ($id == 0)
435
- {
436
- $font_size_diff .= "new";
437
- }
438
- elsif ($g_font_size[$id] == $g_font_size[$id-1])
439
- {
440
- $font_size_diff .= "continue";
441
- }
442
- else
443
- {
444
- $font_size_diff .= "new";
445
- }
446
-
447
- # FontSFChange feature
448
- my $font_sf_diff = "bi_xmlSF_";
449
- if ($id == 0)
450
- {
451
- $font_sf_diff .= "new";
452
- }
453
- elsif ($g_font_size[$id] == $g_font_size[$id-1] && $g_font_face[$id] eq $g_font_face[$id-1])
454
- {
455
- $font_sf_diff .= "continue";
456
- }
457
- else
458
- {
459
- $font_sf_diff .= "new";
460
- }
461
-
462
- # FontSFBIChange feature
463
- my $font_sfbi_diff = "bi_xmlSFBI_";
464
- if ($id == 0)
465
- {
466
- $font_sfbi_diff .= "new";
467
- }
468
- elsif ($g_font_size[$id] == $g_font_size[$id-1] && $g_font_face[$id] eq $g_font_face[$id-1] && $g_bold[$id] eq $g_bold[$id-1] && $g_italic[$id] eq $g_italic[$id-1])
469
- {
470
- $font_sfbi_diff .= "continue";
471
- }
472
- else
473
- {
474
- $font_sfbi_diff .= "new";
475
- }
476
-
477
- # FontSFBIAChange feature
478
- my $font_sfbia_diff = "bi_xmlSFBIA_";
479
- if ($id == 0)
480
- {
481
- $font_sfbia_diff .= "new";
482
- }
483
- elsif ($g_font_size[$id] == $g_font_size[$id-1] && $g_font_face[$id] eq $g_font_face[$id-1] && $g_bold[$id] eq $g_bold[$id-1] && $g_italic[$id] eq $g_italic[$id-1] && $g_align[$id] eq $g_align[$id-1])
484
- {
485
- $font_sfbia_diff .= "continue";
486
- }
487
- else
488
- {
489
- $font_sfbia_diff .= "new";
490
- }
491
-
492
- # Para change feature
493
- my $para_diff = "bi_xmlPara_";
494
- # Header part, consider each line as a separate paragraph
495
- if ($id < $body_start_id)
496
- {
497
- $para_diff .= "header";
498
- }
499
- else
500
- {
501
- if($g_para[$id] eq "yes")
502
- {
503
- $para_diff .= "new";
504
- }
505
- else
506
- {
507
- $para_diff .= "continue";
508
- }
509
- }
510
-
511
- return ($align_diff, $font_size_diff, $font_face_diff, $font_sf_diff, $font_sfbi_diff, $font_sfbia_diff, $para_diff);
512
- }
513
-
514
- sub GetFontSizeLabels
515
- {
516
- my ($g_font_size_hash, $g_font_size_labels) = @_;
517
-
518
- if ($is_debug) { print STDERR "# Map fonts\n"; }
519
- my @sorted_fonts = sort { $g_font_size_hash->{$b} <=> $g_font_size_hash->{$a} } keys %{$g_font_size_hash}; # Sort by values, obtain keys
520
-
521
- my $common_size = $sorted_fonts[0];
522
- @sorted_fonts = sort { $a <=> $b } keys %{$g_font_size_hash}; # Sort by keys, obtain keys
523
-
524
- # Index of common font size
525
- my $common_index = 0;
526
- foreach (@sorted_fonts)
527
- {
528
- # Found
529
- if ($common_size == $_)
530
- {
531
- last;
532
- }
533
- $common_index++;
534
- }
535
-
536
- # Small fonts
537
- for (my $i = 0; $i < $common_index; $i++)
538
- {
539
- $g_font_size_labels->{$sorted_fonts[$i]} = "smaller";
540
-
541
- if($is_debug)
542
- {
543
- print STDERR "$sorted_fonts[$i] --> $g_font_size_labels->{$sorted_fonts[$i]}, freq = $g_font_size_hash->{$sorted_fonts[$i]}\n";
544
- }
545
- }
546
-
547
- # Common fonts
548
- $g_font_size_labels->{$common_size} = "common";
549
- if ($is_debug)
550
- {
551
- print STDERR "$sorted_fonts[$common_index] --> $g_font_size_labels->{$sorted_fonts[$common_index]}, freq = $g_font_size_hash->{$sorted_fonts[$common_index]}\n";
552
- }
553
-
554
- # Large fonts
555
- for (my $i = ($common_index + 1); $i < scalar(@sorted_fonts); $i++)
556
- {
557
- if ((scalar(@sorted_fonts)-$i) <= 3)
558
- {
559
- $g_font_size_labels->{$sorted_fonts[$i]} = "largest".($i+1-scalar(@sorted_fonts));
560
- }
561
- else
562
- {
563
- $g_font_size_labels->{$sorted_fonts[$i]} = "larger";
564
- }
565
-
566
- if($is_debug)
567
- {
568
- print STDERR "$sorted_fonts[$i] --> $g_font_size_labels->{$sorted_fonts[$i]}, freq = $g_font_size_hash->{$sorted_fonts[$i]}\n";
569
- }
570
- }
571
- }
572
-
573
- sub GetSpaceLabels
574
- {
575
- my ($g_space_hash, $g_space_labels) = @_;
576
-
577
- if ($is_debug)
578
- {
579
- print STDERR "\n# Map space\n";
580
- }
581
- my @sorted_spaces = sort { $g_space_hash->{$b} <=> $g_space_hash->{$a} } keys %{$g_space_hash}; # sort by freqs, obtain space faces
582
-
583
- my $common_space = $sorted_spaces[0];
584
- my $common_freq = $g_space_hash->{$common_space};
585
-
586
- # Find similar common freq with larger spaces
587
- for (my $i = 0; $i < scalar(@sorted_spaces); $i++)
588
- {
589
- my $freq = $g_space_hash->{$sorted_spaces[$i]};
590
- if ($freq/$common_freq > 0.8)
591
- {
592
- if($sorted_spaces[$i] > $common_space)
593
- {
594
- $common_space = $sorted_spaces[$i];
595
- }
596
- }
597
- else
598
- {
599
- last;
600
- }
601
- }
602
-
603
- for (my $i = 0; $i < scalar(@sorted_spaces); $i++)
604
- {
605
- if ($sorted_spaces[$i] > $common_space)
606
- {
607
- $g_space_labels->{$sorted_spaces[$i]} = "yes";
608
- }
609
- else
610
- {
611
- $g_space_labels->{$sorted_spaces[$i]} = "no";
612
- }
613
-
614
- if($is_debug)
615
- {
616
- print STDERR "$sorted_spaces[$i] --> $g_space_labels->{$sorted_spaces[$i]}, freq = $g_space_hash->{$sorted_spaces[$i]}\n";
617
- }
618
- }
619
- }
620
-
621
- sub GetAttrValue
622
- {
623
- my ($attr_text, $attr) = @_;
624
-
625
- my $value = "none";
626
- if ($attr_text =~ /^.*$attr=\"(.+?)\".*$/)
627
- {
628
- $value = $1;
629
- }
630
-
631
- return $value;
632
- }
633
-
634
- sub CheckFontAttr
635
- {
636
- my ($attr_text, $attr, $attr_hash, $count) = @_;
637
-
638
- if ($attr_text =~ /^.*$attr=\"(.+?)\".*$/)
639
- {
640
- my $attr_value = $1;
641
- $attr_hash->{$attr_value} = $attr_hash->{$attr_value} ? ($attr_hash->{$attr_value} + $count) : $count;
642
- }
643
- }
644
-
645
- sub ProcessTable
646
- {
647
- my ($input_text, $is_pic) = @_;
648
-
649
- # For table cell object
650
- my $is_cell = 0;
651
-
652
- my $all_text = "";
653
- my $text = "";
654
-
655
- my @lines = split(/\n/, $input_text);
656
-
657
- my %table_pos = (); # $table_pos{$cellText} = "$l-$t-$r-$bottom"
658
- my %table = (); # $table{$row}->{$col} = \@para_texts
659
- my $row_from;
660
- my $col_from;
661
- my $row_till;
662
- my $col_till;
663
-
664
- # xml feature
665
- my $align = "none";
666
- my $pos = -1;
667
- foreach my $line (@lines)
668
- {
669
- if ($line =~ /^<table (.+?)>$/)
670
- {
671
- my $attr = $1;
672
-
673
- if($is_markup) { $markup_output .= "### Table $attr\n"; }
674
-
675
- # Fix: wrong regex sequence, huydhn
676
- #if ($attr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*alignment=\"(.+?)\".*$/)
677
- #{
678
- # my ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
679
- # $align = $5;
680
-
681
- # # pos feature
682
- # $pos = ($t+$bottom)/2.0;
683
-
684
- # if($pos < $g_minpos) { $g_minpos = $pos; }
685
- # if($pos > $g_maxpos) { $g_maxpos = $pos; }
686
- #}
687
- #else
688
- #{
689
- # print STDERR "# no table alignment or location \"$line\"\n";
690
- # $align = "";
691
- #}
692
-
693
- my ($l, $t, $r, $bottom) = undef;
694
- if ($attr =~ /^.*l=\"(\d+)\".*$/) { $l = $1; }
695
- if ($attr =~ /^.*t=\"(\d+)\".*$/) { $t = $1; }
696
- if ($attr =~ /^.*r=\"(\d+)\".*$/) { $r = $1; }
697
- if ($attr =~ /^.*b=\"(\d+)\".*$/) { $bottom = $1; }
698
-
699
- if ($t && $bottom)
700
- {
701
- # pos feature
702
- $pos = ($t + $bottom) / 2.0;
703
-
704
- if($pos < $g_minpos) { $g_minpos = $pos; }
705
- if($pos > $g_maxpos) { $g_maxpos = $pos; }
706
- }
707
- else
708
- {
709
- die "# Undefined table location \"$line\"\n";
710
- }
711
-
712
- if ($attr =~ /^.*alignment=\"(\d+)\".*$/)
713
- {
714
- $align = $1;
715
- }
716
- else
717
- {
718
- print STDERR "# no table alignment \"$line\"\n";
719
- $align = "";
720
- }
721
- # End.
722
- }
723
- elsif ($line =~ /^<cell .*gridColFrom=\"(\d+)\" gridColTill=\"(\d+)\" gridRowFrom=\"(\d+)\" gridRowTill=\"(\d+)\".*>$/) # new cell
724
- {
725
- $col_from = $1;
726
- $col_till = $2;
727
- $row_from = $3;
728
- $row_till = $4;
729
- #print STDERR "$row_from $row_till $col_from $col_till\n";
730
- $is_cell = 1;
731
- }
732
- elsif ($line =~ /^<\/cell>$/) # end cell
733
- {
734
- my @para_texts = ();
735
- ProcessCell($text, \@para_texts, \%table_pos, $is_pic);
736
-
737
- for(my $i = $row_from; $i<=$row_till; $i++)
738
- {
739
- for(my $j = $col_from; $j<=$col_till; $j++)
740
- {
741
- if(!$table{$i}) { $table{$i} = (); }
742
- if(!$table{$i}->{$j}) { $table{$i}->{$j} = (); }
743
-
744
- if($i == $row_from && $j == $col_from)
745
- {
746
- push(@{$table{$i}->{$j}}, @para_texts);
747
- if(scalar(@para_texts) > 1) { last; }
748
- }
749
- else
750
- {
751
- push(@{$table{$i}->{$j}}, ""); #add stub "" for spanning rows or cols
752
- }
753
- }
754
- }
755
-
756
- $is_cell = 0;
757
- $text = "";
758
- }
759
- elsif($is_cell)
760
- {
761
- $text .= $line."\n";
762
- next;
763
- }
764
- }
765
-
766
- # note: such a complicated code is because in the normal node, Omnipage doesn't seem to strictly print column by column given a row is fixed.
767
- # E.g if col1: paraText1, col2: paraText21\n$paraText22, and col3: paraText31\n$paraText32
768
- # It will print paraText1\tparaText21\tparaText31\n\t$paraText22\t$paraText32
769
- my @sorted_rows = sort {$a <=> $b} keys %table;
770
- my $is_first_line_para = 1;
771
- foreach my $row (@sorted_rows)
772
- {
773
- my %table_r = %{$table{$row}};
774
- my @sorted_cols = sort {$a <=> $b} keys %table_r;
775
- while(1)
776
- {
777
- my $is_stop = 1;
778
- my $row_text = "";
779
-
780
- foreach my $col (@sorted_cols)
781
- {
782
- # there's still some thing to process
783
- if(scalar(@{$table_r{$col}}) > 0)
784
- {
785
- $is_stop = 0;
786
- $row_text .= shift(@{$table_r{$col}});
787
- }
788
- $row_text .= "\t";
789
- }
790
-
791
- if ((!$is_allow_empty && $row_text =~ /^\s*$/) || ($is_allow_empty && $row_text eq ""))
792
- {
793
- $is_stop = 1;
794
- }
795
-
796
- if($is_stop)
797
- {
798
- last;
799
- }
800
- else
801
- {
802
- $row_text =~ s/\t$/\n/;
803
- $all_text .= $row_text;
804
- # print STDERR "$row_text";
805
-
806
- # para
807
- if($is_first_line_para)
808
- {
809
- push(@g_para, "yes");
810
- $is_first_line_para = 0;
811
- }
812
- else
813
- {
814
- push(@g_para, "no");
815
- }
816
-
817
- if($is_xml_feature)
818
- {
819
- # table feature
820
- push(@g_table, "yes");
821
-
822
- # pic feature
823
- if($is_pic)
824
- {
825
- push(@g_pic, "yes");
826
- }
827
- else
828
- {
829
- push(@g_pic, "no");
830
- }
831
-
832
- push(@g_pos_hash, $pos); # update xml pos value
833
- push(@g_align, $align); # update xml alignment value
834
-
835
- ### Not assign value ###
836
- push(@g_font_size, -1); # fontSize feature
837
- push(@g_font_face, "none"); # fontFace feature
838
- push(@g_bold, "no"); # bold feature
839
- push(@g_italic, "no"); # italic feature
840
- push(@g_bullet, "no"); # bullet feature
841
- # push(@gSpace, "none"); # space feature
842
- } # end if xml feature
843
- }
844
- }
845
- }
846
-
847
- return $all_text;
848
- }
849
-
850
- sub ProcessCell
851
- {
852
- my ($input_text, $para_texts, $table_pos, $is_pic) = @_;
853
-
854
- my $text = "";
855
- my @lines = split(/\n/, $input_text);
856
- my $is_para = 0;
857
- my $flag = 0;
858
- foreach my $line (@lines)
859
- {
860
- if ($line =~ /^<para (.*)>$/)
861
- {
862
- $text .= $line."\n"; # we need the header
863
- $is_para = 1;
864
-
865
- if($is_markup)
866
- {
867
- $markup_output .= "## ParaTable $1\n";
868
- }
869
- }
870
- elsif ($line =~ /^<\/para>$/)
871
- {
872
- my ($para_text, $l, $t, $r, $b) = ProcessPara($text, 1, $is_pic);
873
- my @tokens = split(/\n/, $para_text);
874
-
875
- foreach my $token (@tokens)
876
- {
877
- if($token ne "")
878
- {
879
- push(@{$para_texts}, $token);
880
- $flag = 1;
881
- }
882
- }
883
-
884
- if(!$table_pos->{$para_text})
885
- {
886
- $table_pos->{$para_text} = "$l-$t-$r-$b";
887
- }
888
- else
889
- {
890
- #print STDERR "#! Warning: in method processCell, encounter the same para_text $para_text\n";
891
- }
892
-
893
- $is_para = 0;
894
- $text = "";
895
- }
896
- elsif ($is_para)
897
- {
898
- $text .= $line."\n";
899
- next;
900
- }
901
- }
902
-
903
- # at least one value should be added for cell which is ""
904
- if ($flag == 0)
905
- {
906
- push(@{$para_texts}, "");
907
- }
908
- }
909
-
910
- sub ProcessPara
911
- {
912
- my ($input_text, $is_cell, $is_pic) = @_;
913
-
914
- my $is_space = 0;
915
- my $is_special_space = 0;
916
- my $is_tab = 0;
917
- my $is_bullet = 0;
918
- my $is_forced_eof = "none"; # 3 signals for end of L: forcedEOF=\"true\" in attribute of <ln> or || <nl orig=\"true\"\/> || end of </para> without encountering any of the above signal in the para plus $is_space = 0
919
-
920
- # XML feature
921
- my $align = "none";
922
- my ($l, $t, $r, $bottom);
923
-
924
- my %font_size_hash = ();
925
- my %font_face_hash = ();
926
- my @bold_array = ();
927
- my @italic_array = ();
928
- my $space = "none";
929
-
930
- my $ln_attr;
931
- my $is_ln = 0;
932
- my $ln_bold = "none";
933
- my $ln_italic = "none";
934
-
935
- my $run_attr;
936
- my $run_text = "";
937
- my $is_run = 0;
938
- my $run_bold = "none";
939
- my $run_italic = "none";
940
-
941
- my $wd_attr;
942
- my $wd_text = "";
943
- my $is_wd = 0;
944
-
945
- # Word index in a line. When encountering </ln>, this parameter indicates the number of words in a line
946
- my $wd_index = 0;
947
- my $ln_bold_count = 0;
948
- my $ln_italic_count = 0;
949
-
950
- my $all_text = "";
951
- # Invariant: when never enter a new line, $text will be copied into $all_text, and $text is cleared
952
- my $text = "";
953
-
954
- binmode(STDERR, ":utf8");
955
-
956
- my $is_first_line_para = 1;
957
- my @lines = split(/\n/, $input_text);
958
-
959
- for (my $i=0; $i < scalar(@lines); $i++)
960
- {
961
- my $line = $lines[$i];
962
-
963
- # New para
964
- if ($line =~ /^<para (.+?)>$/)
965
- {
966
- my $attr = $1;
967
- $align = GetAttrValue($attr, "alignment");
968
- # $indent = GetAttrValue($attr, "li");
969
- $space = GetAttrValue($attr, "spaceBefore");
970
- }
971
- # New ln
972
- elsif ($line =~ /^<ln (.+)>$/)
973
- {
974
- $ln_attr = $1;
975
- $is_ln = 1;
976
-
977
- if ($is_markup) { $markup_output .= "# Line $ln_attr\n"; }
978
-
979
- # Fix: wrong regex sequence, huydhn
980
- #if ($ln_attr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*$/)
981
- #{
982
- # ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
983
- #}
984
-
985
- if ($ln_attr =~ /^.*l=\"(\d+)\".*$/) { $l = $1; } else { $l = undef; }
986
- if ($ln_attr =~ /^.*t=\"(\d+)\".*$/) { $t = $1; } else { $t = undef; }
987
- if ($ln_attr =~ /^.*r=\"(\d+)\".*$/) { $r = $1; } else { $r = undef; }
988
- if ($ln_attr =~ /^.*b=\"(\d+)\".*$/) { $bottom = $1; } else { $bottom = undef; }
989
- # End.
990
-
991
- $is_forced_eof = GetAttrValue($ln_attr, "forcedEOF");
992
-
993
- # Bold & Italic
994
- if ($is_xml_feature)
995
- {
996
- $ln_bold = GetAttrValue($ln_attr, "bold");
997
- $ln_italic = GetAttrValue($ln_attr, "italic");
998
- }
999
- }
1000
- # New run
1001
- elsif ($line =~ /<run (.*)>$/)
1002
- {
1003
- $run_attr = $1;
1004
- $is_space = 0;
1005
- $is_tab = 0;
1006
- $is_run = 1;
1007
-
1008
- # New wd, that consists of many runs
1009
- if ($line =~ /^<wd (.*?)>/)
1010
- {
1011
- $is_wd = 1;
1012
- $wd_attr = $1;
1013
- }
1014
-
1015
- # Bold & Italic
1016
- if ($is_xml_feature)
1017
- {
1018
- $run_bold = GetAttrValue($run_attr, "bold");
1019
- $run_italic = GetAttrValue($run_attr, "italic");
1020
- }
1021
- }
1022
- # Word
1023
- elsif ($line =~ /^<wd (.+)?>(.+)<\/wd>$/)
1024
- {
1025
- $wd_attr = $1;
1026
- my $word = $2;
1027
- $is_space = 0;
1028
- $is_tab = 0;
1029
-
1030
- if ($is_markup)
1031
- {
1032
- $markup_output .= "$word $wd_attr\n";
1033
- # If both bold and italic, then just use one
1034
- if ($is_run && $run_attr =~ /(bold|italic)=\"true\"/)
1035
- {
1036
- $markup_output .= " $1=\"true\"";
1037
- }
1038
- $markup_output .= "\n";
1039
- }
1040
-
1041
- # FontSize & FontFace
1042
- if ($is_xml_feature)
1043
- {
1044
- CheckFontAttr($wd_attr, "fontSize", \%font_size_hash, 1);
1045
- CheckFontAttr($wd_attr, "fontFace", \%font_face_hash, 1);
1046
- }
1047
-
1048
- # Bold & Italic
1049
- if ($is_xml_feature)
1050
- {
1051
- my $wd_bold = GetAttrValue($wd_attr, "bold");
1052
- my $wd_italic = GetAttrValue($wd_attr, "italic");
1053
-
1054
- if ($wd_bold eq "true" || $run_bold eq "true" || $ln_bold eq "true")
1055
- {
1056
- $bold_array[$wd_index] = 1;
1057
- $ln_bold_count++;
1058
- }
1059
-
1060
- if ($wd_italic eq "true" || $run_italic eq "true" || $ln_italic eq "true")
1061
- {
1062
- $italic_array[$wd_index] = 1;
1063
- $ln_italic_count++;
1064
- }
1065
- }
1066
-
1067
- # Add text
1068
- $text .= "$word";
1069
-
1070
- if ($is_run)
1071
- {
1072
- $run_text .= "$word ";
1073
- }
1074
-
1075
- $wd_index++;
1076
- }
1077
- # End wd
1078
- elsif ($line =~ /^<\/wd>$/)
1079
- {
1080
- $is_wd = 0;
1081
-
1082
- if ($is_markup)
1083
- {
1084
- $markup_output .= "$wd_text $wd_attr\n";
1085
- # If both bold and italic, then just use one
1086
- if ($is_run && $run_attr =~ /(bold|italic)=\"true\"/)
1087
- {
1088
- $markup_output .= " $1=\"true\"";
1089
- }
1090
-
1091
- $markup_output .= "\n";
1092
- $wd_attr = "";
1093
- }
1094
- }
1095
- # End run
1096
- elsif ($line =~ /^(.*)<\/run>$/)
1097
- {
1098
- my $word = $1;
1099
-
1100
- # Add text
1101
- if ($word ne "")
1102
- {
1103
- # Bold & Italic
1104
- if ($is_xml_feature)
1105
- {
1106
- if ($run_bold eq "true" || $ln_bold eq "true")
1107
- {
1108
- $bold_array[$wd_index] = 1;
1109
- $ln_bold_count++;
1110
- }
1111
-
1112
- if ($run_italic eq "true" || $ln_italic eq "true")
1113
- {
1114
- $italic_array[$wd_index] = 1;
1115
- $ln_italic_count++;
1116
- }
1117
- }
1118
-
1119
- # Appear in the final result
1120
- if ($is_ln) { $text .= "$word"; }
1121
-
1122
- # For internal record
1123
- if ($is_run) { $run_text .= $word . " "; }
1124
- if ($is_wd) { $wd_text .= $word; }
1125
-
1126
- $wd_index++;
1127
- }
1128
-
1129
- # Xml feature
1130
- # Not a space, tab or new-line run
1131
- if ($is_xml_feature && $run_text ne "")
1132
- {
1133
- my @words = split(/\s+/, $run_text);
1134
- my $num_words = scalar(@words);
1135
- CheckFontAttr($run_attr, "fontSize", \%font_size_hash, $num_words, 1);
1136
- CheckFontAttr($run_attr, "fontFace", \%font_face_hash, $num_words, 1);
1137
- }
1138
-
1139
- # Reset run
1140
- # <run> not enclosed within <ln>
1141
- if (!$is_ln)
1142
- {
1143
- $wd_index = 0;
1144
- }
1145
-
1146
- $run_text = "";
1147
- $is_run = 0;
1148
- $is_special_space = 0;
1149
-
1150
- # Bold & Italic
1151
- if ($is_xml_feature)
1152
- {
1153
- $run_bold = "none";
1154
- $run_italic = "none";
1155
-
1156
- # <run> not enclosed within <ln>
1157
- if(!$is_ln)
1158
- {
1159
- $ln_bold_count = 0;
1160
- $ln_italic_count = 0;
1161
- }
1162
- }
1163
- }
1164
- # End ln
1165
- elsif ($line =~ /^<\/ln>$/)
1166
- {
1167
- if((!$is_allow_empty && $text !~ /^\s*$/) || ($is_allow_empty && $text ne ""))
1168
- {
1169
- if ($is_forced_eof eq "true" || (!$is_cell && !$is_special_space) )
1170
- {
1171
- $text .= "\n";
1172
- # Update all_text
1173
- $all_text .= $text;
1174
- $text = "";
1175
- }
1176
-
1177
- my $num_words = $wd_index;
1178
-
1179
- if (!$is_cell)
1180
- {
1181
- if ($is_first_line_para)
1182
- {
1183
- push(@g_para, "yes");
1184
- $is_first_line_para = 0;
1185
- }
1186
- else
1187
- {
1188
- push(@g_para, "no");
1189
- }
1190
- }
1191
-
1192
- if ($is_xml_feature && $num_words >= 1)
1193
- {
1194
- # XML feature
1195
- # Assumtion that: font_size is either occur in <ln>, or within multiple <run> under <ln>, but not both
1196
- CheckFontAttr($ln_attr, "fontSize", \%font_size_hash, $num_words);
1197
- CheckFontAttr($ln_attr, "fontFace", \%font_face_hash, $num_words);
1198
- }
1199
-
1200
- if ($is_xml_feature && !$is_cell && !$is_special_space)
1201
- {
1202
- my $pos = ($t + $bottom)/2.0;
1203
- if ($pos < $g_minpos) { $g_minpos = $pos; }
1204
- if ($pos > $g_maxpos) { $g_maxpos = $pos; }
1205
-
1206
- push(@g_pos_hash, $pos); # pos feature
1207
- push(@g_align, $align); # alignment feature
1208
- push(@g_table, "no"); # table feature
1209
-
1210
- if ($is_pic)
1211
- {
1212
- push(@g_pic, "yes");
1213
-
1214
- # Not assign value
1215
- push(@g_font_size, -1); # bold feature
1216
- push(@g_font_face, "none"); # bold feature
1217
- push(@g_bold, "no"); # bold feature
1218
- push(@g_italic, "no"); # italic feature
1219
- push(@g_bullet, "no"); # bullet feature
1220
- }
1221
- else
1222
- {
1223
- push(@g_pic, "no");
1224
-
1225
- UpdateXMLFontFeature(\%font_size_hash, \%font_face_hash);
1226
-
1227
- %font_size_hash = ();
1228
- %font_face_hash = ();
1229
-
1230
- UpdateXMLFeatures($ln_bold_count, $ln_italic_count, $num_words, $is_bullet, $space);
1231
- }
1232
- }
1233
- }
1234
-
1235
- # Reset ln
1236
- $is_ln = 0;
1237
- $is_forced_eof = "none";
1238
- $is_special_space = 0;
1239
- $wd_index = 0;
1240
-
1241
- # Bold & Italic
1242
- if ($is_xml_feature)
1243
- {
1244
- $ln_bold = "none";
1245
- $ln_italic = "none";
1246
- $ln_bold_count = 0;
1247
- $ln_italic_count = 0;
1248
- }
1249
- }
1250
- # Newline signal
1251
- elsif ($line =~ /^<nl orig=\"true\"\/>$/)
1252
- {
1253
- if($is_ln)
1254
- {
1255
- $is_space = 0;
1256
- }
1257
- else
1258
- {
1259
- if($is_debug)
1260
- {
1261
- print STDERR "#!!! Warning: found <nl orig=\"true\"\/> while not in tag <ln>: $line\n";
1262
- }
1263
- }
1264
- }
1265
- # Space
1266
- elsif ($line =~ /^<space\/>$/)
1267
- {
1268
- my $start_tag = "";
1269
- my $end_tag = "";
1270
-
1271
- if ($i>0 && $lines[$i-1] =~ /^<(.+?)\b.*/) { $start_tag = $1; }
1272
-
1273
- if ($i < (scalar(@lines) -1) && $lines[$i+1] =~ /^<\/(.+)>/) { $end_tag = $1; }
1274
-
1275
- if ($start_tag eq $end_tag && $start_tag ne "")
1276
- {
1277
- # print STDERR "# Special space after \"$text\"\n";
1278
- $is_special_space = 1;
1279
- }
1280
-
1281
- # Add text
1282
- $text .= " ";
1283
- $is_space = 1;
1284
- }
1285
- # Tab
1286
- elsif ($line =~ /^<tab .*\/>$/)
1287
- {
1288
- # Add text
1289
- $text .= "\t";
1290
- $is_tab = 1;
1291
- }
1292
- # Bullet
1293
- elsif ($line =~ /^<bullet .*>$/)
1294
- {
1295
- $is_bullet = 1;
1296
- }
1297
- }
1298
-
1299
- $all_text .= $text;
1300
- return ($all_text, $l, $t, $r, $bottom, $is_space);
1301
- }
1302
-
1303
- sub UpdateXMLFontFeature
1304
- {
1305
- my ($font_size_hash, $font_face_hash) = @_;
1306
-
1307
- # Font size feature
1308
- if (scalar(keys %{$font_size_hash}) == 0)
1309
- {
1310
- push(@g_font_size, -1);
1311
- }
1312
- else
1313
- {
1314
- my @sorted_fonts = sort { $font_size_hash->{$b} <=> $font_size_hash->{$a} } keys %{$font_size_hash};
1315
-
1316
- my $font_size = $sorted_fonts[0];
1317
- push(@g_font_size, $font_size);
1318
-
1319
- $g_font_size_hash{$font_size} = $g_font_size_hash{$font_size} ? ($g_font_size_hash{$font_size}+1) : 1;
1320
- }
1321
-
1322
- # Font face feature
1323
- if (scalar(keys %{$font_face_hash}) == 0)
1324
- {
1325
- push(@g_font_face, "none");
1326
- }
1327
- else
1328
- {
1329
- my @sorted_fonts = sort { $font_face_hash->{$b} <=> $font_face_hash->{$a} } keys %{$font_face_hash};
1330
- my $font_face = $sorted_fonts[0];
1331
- push(@g_font_face, $font_face);
1332
-
1333
- $g_font_face_hash{$font_face} = $g_font_face_hash{$font_face} ? ($g_font_face_hash{$font_face}+1) : 1;
1334
- }
1335
- }
1336
-
1337
- sub UpdateXMLFeatures
1338
- {
1339
- my ($ln_bold_count, $ln_italic_count, $num_words, $is_bullet, $space) = @_;
1340
-
1341
- # Bold feature
1342
- my $bold_feature;
1343
- if ($ln_bold_count/$num_words >= 0.667)
1344
- {
1345
- $bold_feature = "yes";
1346
- }
1347
- else
1348
- {
1349
- $bold_feature = "no";
1350
- }
1351
- push(@g_bold, $bold_feature);
1352
-
1353
- # Italic feature
1354
- my $italic_feature;
1355
- if ($ln_italic_count/$num_words >= 0.667)
1356
- {
1357
- $italic_feature = "yes";
1358
- }
1359
- else
1360
- {
1361
- $italic_feature = "no";
1362
- }
1363
- push(@g_italic, $italic_feature);
1364
-
1365
- # Bullet feature
1366
- if ($is_bullet)
1367
- {
1368
- push(@g_bullet, "yes");
1369
- }
1370
- else
1371
- {
1372
- push(@g_bullet, "no");
1373
- }
1374
-
1375
- # Space feature
1376
- # push(@gSpace, $space);
1377
- }
1378
-
1379
- # Find the positions of header, body, and citation
1380
- sub GetStructureInfo
1381
- {
1382
- my ($lines, $num_lines) = @_;
1383
-
1384
- my ($body_length, $citation_length, $body_end_id) = SectLabel::PreProcess::findCitationText($lines, 0, $num_lines);
1385
-
1386
- my ($header_length, $body_start_id);
1387
-
1388
- ($header_length, $body_length, $body_start_id) = SectLabel::PreProcess::findHeaderText($lines, 0, $body_length);
1389
-
1390
- # Sanity check
1391
- my $totalLength = $header_length + $body_length + $citation_length;
1392
-
1393
- if ($num_lines != $totalLength)
1394
- {
1395
- print STDOUT "Die in getStructureInfo(): different num lines $num_lines != $totalLength\n"; # to display in Web
1396
- die "Die in getStructureInfo(): different num lines $num_lines != $totalLength\n";
1397
- }
1398
-
1399
- return ($header_length, $body_length, $citation_length, $body_start_id, $body_end_id);
1400
- }
1401
-
1402
- # Count XML tags/values for statistics purpose
1403
- sub ProcessTagInfo
1404
- {
1405
- my ($line, $tags) = @_;
1406
-
1407
- my $tag;
1408
- my $attr;
1409
-
1410
- if ($line =~ /^<(.+?)\b(.*)/)
1411
- {
1412
- $tag = $1;
1413
- $attr = $2;
1414
- if (!$tags->{$tag})
1415
- {
1416
- $tags->{$tag} = ();
1417
- }
1418
-
1419
- if ($attr =~ /^\s*(.+?)\s*\/?>/)
1420
- {
1421
- $attr = $1;
1422
- }
1423
-
1424
- my @tokens = split(/\s+/, $attr);
1425
- foreach my $token (@tokens)
1426
- {
1427
- if($token =~ /^(.+)=(.+)$/)
1428
- {
1429
- my $attr_name = $1;
1430
- my $value = $2;
1431
-
1432
- if (!$tags->{$tag}->{$attr_name})
1433
- {
1434
- $tags->{$tag}->{$attr_name} = ();
1435
- }
1436
- if (!$tags->{$tag}->{$attr_name}->{$value})
1437
- {
1438
- $tags->{$tag}->{$attr_name}->{$value} = 0;
1439
- }
1440
- $tags->{$tag}->{$attr_name}->{$value}++;
1441
- }
1442
- }
1443
- }
1444
- }
1445
-
1446
- # Print tag info to file
1447
- sub PrintTagInfo
1448
- {
1449
- my ($tags, $tag_file) = @_;
1450
-
1451
- open(TAG, ">:utf8", "$tag_file") || die"#Can't open file \"$tag_file\"\n";
1452
-
1453
- my @sortedTags = sort {$a cmp $b} keys %{$tags};
1454
-
1455
- foreach(@sortedTags)
1456
- {
1457
- my @attrs = sort {$a cmp $b} keys %{$tags->{$_}};
1458
- print TAG "# Tag = $_\n";
1459
-
1460
- foreach my $attr (@attrs)
1461
- {
1462
- print TAG "$attr:";
1463
- my @values = sort {$a cmp $b} keys %{$tags->{$_}->{$attr}};
1464
-
1465
- foreach my $value (@values)
1466
- {
1467
- print TAG " $value-$tags->{$_}->{$attr}->{$value}";
1468
- }
1469
-
1470
- print TAG "\n";
1471
- }
1472
- }
1473
-
1474
- close TAG;
1475
- }
1476
-
1477
- sub UntaintPath
1478
- {
1479
- my ($path) = @_;
1480
-
1481
- if ( $path =~ /^([-_\/\w\.]*)$/ )
1482
- {
1483
- $path = $1;
1484
- }
1485
- else
1486
- {
1487
- die "Bad path \"$path\"\n";
1488
- }
1489
-
1490
- return $path;
1491
- }
1492
-
1493
- sub Untaint
1494
- {
1495
- my ($s) = @_;
1496
- if ($s =~ /^([\w \-\@\(\),\.\/]+)$/)
1497
- {
1498
- $s = $1; # $data now untainted
1499
- }
1500
- else
1501
- {
1502
- die "Bad data in $s"; # log this somewhere
1503
- }
1504
-
1505
- return $s;
1506
- }
1507
-
1508
- sub Execute
1509
- {
1510
- my ($cmd) = @_;
1511
-
1512
- if ($is_debug)
1513
- {
1514
- print STDERR "Executing: $cmd\n";
1515
- }
1516
-
1517
- $cmd = Untaint($cmd);
1518
- system($cmd);
1519
- }
1520
-
1521
- sub NewTmpFile
1522
- {
1523
- my $tmpFile = `date '+%Y%m%d-%H%M%S-$$'`;
1524
- chomp($tmpFile);
1525
- return $tmpFile;
1526
- }
1527
-
1528
-
1529
-