biblicit 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1529 +0,0 @@
1
- #!/usr/bin/perl -wT
2
-
3
- # Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
4
- # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
5
-
6
- require 5.0;
7
- use strict;
8
-
9
- use Getopt::Long;
10
- use HTML::Entities;
11
-
12
- # I do not know a better solution to find a lib path in -T mode.
13
- # So if you know a better solution, I'd be glad to hear.
14
- # See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
15
- use FindBin;
16
- FindBin::again(); # to get correct path in case 2 scripts in different directories use FindBin
17
- my $path;
18
- BEGIN
19
- {
20
- if ($FindBin::Bin =~ /(.*)/) { $path = $1; }
21
- }
22
-
23
- use lib "$path/../../lib";
24
-
25
- use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
26
- use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";
27
-
28
- use SectLabel::PreProcess;
29
-
30
- ### USER customizable section
31
- $0 =~ /([^\/]+)$/; my $progname = $1;
32
- my $outputVersion = "1.0";
33
- ### END user customizable section
34
-
35
- sub License
36
- {
37
- print STDERR "# Copyright 2009 \251 by Luong Minh Thang\n";
38
- }
39
-
40
- ### HELP Sub-procedure
41
- sub Help
42
- {
43
- print STDERR "Process Omnipage XML output (concatenated results fromm all pages of a PDF file), and extract text lines together with other XML infos\n";
44
- print STDERR "usage: $progname -h\t[invokes help]\n";
45
- print STDERR " $progname -in xmlFile -out out_file [-xmlFeature -decode -markup -para] [-tag tag_file -allowEmptyLine -log]\n";
46
- print STDERR "Options:\n";
47
- print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
48
- print STDERR "\t-xmlFeature: append XML feature together with text extracted\n";
49
- print STDERR "\t-decode: decode HTML entities and then output, to avoid double entity encoding later\n";
50
- print STDERR "\t-para: marking in the output each paragraph with # Para lineId num_lines\n";
51
- print STDERR "\t-markup: marking in the output detailed word-level info ### Page w h\\n## Para l t r b\\n# Line l t r b\\nword l t r b\n";
52
- print STDERR "\t-tag tag_file: count XML tags/values for statistics purpose\n";
53
- }
54
-
55
- my $quite = 0;
56
- my $help = 0;
57
- my $out_file = undef;
58
- my $in_file = undef;
59
-
60
- my $is_xml_feature = 0;
61
- my $is_decode = 0;
62
- my $is_markup = 0;
63
- my $is_para_delimiter = 0;
64
- my $is_allow_empty = 0;
65
- my $is_debug = 0;
66
- my $tag_file = "";
67
-
68
- $help = 1 unless GetOptions(
69
- 'in=s' => \$in_file,
70
- 'out=s' => \$out_file,
71
- 'decode' => \$is_decode,
72
- 'xmlFeature' => \$is_xml_feature,
73
- 'tag=s' => \$tag_file,
74
- 'allowEmptyLine' => \$is_allow_empty,
75
- 'markup' => \$is_markup,
76
- 'para' => \$is_para_delimiter,
77
- 'log' => \$is_debug,
78
- 'h' => \$help,
79
- 'q' => \$quite );
80
-
81
- if ($help || !defined $in_file || !defined $out_file)
82
- {
83
- Help();
84
- exit(0);
85
- }
86
-
87
- if (!$quite)
88
- {
89
- License();
90
- }
91
-
92
- ### Untaint ###
93
- $in_file = UntaintPath($in_file);
94
- $out_file = UntaintPath($out_file);
95
- $tag_file = UntaintPath($tag_file);
96
- $ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
97
- ### End untaint ###
98
-
99
- # Mark page, para, line, word
100
- my %g_page_hash = ();
101
-
102
- # Mark paragraph
103
- my @g_para = ();
104
-
105
- # XML features
106
- # Location feature
107
- my @g_pos_hash = ();
108
- my $g_minpos = 1000000;
109
- my $g_maxpos = 0;
110
- # Align feature
111
- my @g_align = ();
112
- # Bold feature
113
- my @g_bold = ();
114
- # Italic feature
115
- my @g_italic = ();
116
- # Font size feature
117
- my %g_font_size_hash = ();
118
- my @g_font_size = ();
119
- # Font face feature
120
- my %g_font_face_hash = ();
121
- my @g_font_face = ();
122
- # Pic feature
123
- my @g_pic = ();
124
- # Table feature
125
- my @g_table = ();
126
- # Bullet feature
127
- my @g_bullet = ();
128
- # Space feature
129
- # my %g_space_hash = ();
130
- # my @g_space = ();
131
-
132
- my %tags = ();
133
-
134
- if ($is_debug)
135
- {
136
- print STDERR "\n# Processing file $in_file & output to $out_file\n";
137
- }
138
-
139
- my $markup_output = "";
140
- my $all_text = ProcessFile($in_file, $out_file, \%tags);
141
-
142
- # Find header part
143
- my @lines = split(/\n/, $all_text);
144
- my $num_lines = scalar(@lines);
145
- my ($header_length, $body_length, $body_start_id) = SectLabel::PreProcess::FindHeaderText(\@lines, 0, $num_lines);
146
-
147
- # Output
148
- if ($is_markup)
149
- {
150
- open(OF, ">:utf8", "$out_file") || die"#Can't open file \"$out_file\"\n";
151
- print OF "$markup_output";
152
- close OF;
153
- }
154
- else
155
- {
156
- Output(\@lines, $out_file);
157
- }
158
-
159
- if ($tag_file ne "")
160
- {
161
- PrintTagInfo(\%tags, $tag_file);
162
- }
163
-
164
- sub ProcessFile
165
- {
166
- my ($in_file, $tags) = @_;
167
-
168
- if (!(-e $in_file)) { die "# $progname crash\t\tFile \"$in_file\" doesn't exist"; }
169
- open (IF, "<:utf8", $in_file) || die "# $progname crash\t\tCan't open \"$in_file\"";
170
-
171
- my $is_para = 0;
172
- my $is_table = 0;
173
- my $is_space = 0;
174
- my $is_pic = 0;
175
- my $all_text = "";
176
- my $text = "";
177
- my $line_id = 0;
178
-
179
- # Each line contains a header
180
- while (<IF>)
181
- {
182
- # Skip comments
183
- if (/^\#/) { next; }
184
- chomp;
185
-
186
- # Remove ^M character at the end of the file if any
187
- s/\cM$//;
188
- my $line = $_;
189
-
190
- if($tag_file ne "") { ProcessTagInfo($line, $tags); }
191
-
192
- # if ($line =~ /<\?xml version.+>/) { } ### XML ###
193
- # if ($line =~ /^<\/column>$/) { } ### Column ###
194
- if ($is_markup && $line =~ /<theoreticalPage (.*)\/>/ && $is_markup) { $markup_output .= "### Page $1\n"; }
195
-
196
- # Pic
197
- if ($line =~ /^<dd (.*)>$/)
198
- {
199
- $is_pic = 1;
200
- if($is_markup) { $markup_output .= "### Figure $1\n"; }
201
- }
202
- elsif ($line =~ /^<\/dd>$/)
203
- {
204
- $is_pic = 0;
205
- }
206
-
207
- # Table
208
- if ($line =~ /^<table .*>$/)
209
- {
210
- $text .= $line."\n"; # we need the header
211
- $is_table = 1;
212
- }
213
- elsif ($line =~ /^<\/table>$/)
214
- {
215
- my $table_text = ProcessTable($text, $is_pic);
216
- $all_text .= $table_text;
217
-
218
- my @tmp_lines = split(/\n/, $table_text);
219
- $line_id += scalar(@tmp_lines);
220
-
221
- $is_table = 0;
222
- $text = "";
223
- }
224
- elsif ($is_table)
225
- {
226
- $text .= $line."\n";
227
- next;
228
- }
229
- # Paragraph
230
- # Note: table processing should have higher priority than paragraph, i.e. the priority does matter
231
- elsif ($line =~ /^<para (.*)>$/)
232
- {
233
- $text .= $line."\n"; # we need the header
234
- $is_para = 1;
235
- if ($is_markup) { $markup_output .= "## Para $1\n"; }
236
- }
237
- elsif ($line =~ /^<\/para>$/)
238
- {
239
- my ($para_text, $l, $t, $r, $b) = undef;
240
- ($para_text, $l, $t, $r, $b, $is_space) = ProcessPara($text, 0, $is_pic);
241
- $all_text .= $para_text;
242
-
243
- my @tmp_lines = split(/\n/, $para_text);
244
- $line_id += scalar(@tmp_lines);
245
- $is_para = 0;
246
- $text = "";
247
- }
248
- elsif ($is_para)
249
- {
250
- $text .= $line."\n";
251
- next;
252
- }
253
- }
254
-
255
- close IF;
256
- return $all_text;
257
- }
258
-
259
- sub Output
260
- {
261
- my ($lines, $out_file) = @_;
262
- open(OF, ">:utf8", "$out_file") || die"#Can't open file \"$out_file\"\n";
263
-
264
- ####### Final output ############
265
- # XML feature label
266
- my %g_font_size_labels = ();
267
- # my %g_space_labels = (); # yes, no
268
-
269
- if($is_xml_feature)
270
- {
271
- GetFontSizeLabels(\%g_font_size_hash, \%g_font_size_labels);
272
- # GetSpaceLabels(\%g_space_hash, \%g_space_labels);
273
- }
274
-
275
- my $id = -1;
276
- my $output = "";
277
- my $para_line_id = -1;
278
- my $para_line_count = 0;
279
-
280
- foreach my $line (@{$lines})
281
- {
282
- $id++;
283
-
284
- # Remove ^M character at the end of each line if any
285
- $line =~ s/\cM$//;
286
-
287
- # Empty lines
288
- if($line =~ /^\s*$/)
289
- {
290
- if(!$is_allow_empty)
291
- {
292
- next;
293
- }
294
- else
295
- {
296
- if($is_debug) { print STDERR "#! Line $id empty!\n"; }
297
- }
298
- }
299
-
300
- if ($g_para[$id] eq "yes")
301
- {
302
- # Mark para
303
- if($output ne "")
304
- {
305
- if($is_para_delimiter)
306
- {
307
- print OF "# Para $para_line_id $para_line_count\n$output";
308
- $para_line_count = 0;
309
- }
310
- else
311
- {
312
- if ($is_decode) { $output = decode_entities($output); }
313
- print OF $output;
314
- }
315
-
316
- $output = "";
317
- }
318
-
319
- $para_line_id = $id;
320
- }
321
-
322
- $output .= $line;
323
- $para_line_count++;
324
-
325
- # Output XML features
326
- if ($is_xml_feature)
327
- {
328
- # Loc feature
329
- my $loc_feature;
330
- if ($g_pos_hash[$id] != -1)
331
- {
332
- $loc_feature = "xmlLoc_".int(($g_pos_hash[$id] - $g_minpos)*8.0/($g_maxpos - $g_minpos + 1));
333
- }
334
-
335
- # Align feature
336
- my $align_feature = "xmlAlign_" . $g_align[$id];
337
-
338
- # Font_size feature
339
- my $font_size_feature;
340
- if ($g_font_size[$id] == -1)
341
- {
342
- $font_size_feature = "xmlFontSize_none";
343
- }
344
- else
345
- {
346
- $font_size_feature = "xmlFontSize_" . $g_font_size_labels{$g_font_size[$id]};
347
- }
348
-
349
- my $bold_feature = "xmlBold_" . $g_bold[$id]; # Bold feature
350
- my $italic_feature = "xmlItalic_" . $g_italic[$id]; # Italic feature
351
- my $pic_feature = "xmlPic_" . $g_pic[$id]; # Pic feature
352
- my $table_feature = "xmlTable_" . $g_table[$id]; # Table feature
353
- my $bullet_feature = "xmlBullet_" . $g_bullet[$id]; # Bullet feature
354
-
355
- # Space feature
356
- # my $space_feature;
357
- # if($g_space[$id] eq "none")
358
- # {
359
- # $space_feature = "xmlSpace_none";
360
- # }
361
- # else
362
- # {
363
- # $space_feature = "xmlSpace_" . $g_space_labels{$g_space[$id]};
364
- # }
365
-
366
- # Differential features
367
- my ($align_diff, $font_size_diff, $font_face_diff, $font_sf_diff, $font_sfbi_diff, $font_sfbia_diff, $para_diff) = GetDifferentialFeatures($id);
368
-
369
- # Each line and its XML features
370
- $output .= " |XML| $loc_feature $bold_feature $italic_feature $font_size_feature $pic_feature $table_feature $bullet_feature $font_sfbia_diff $para_diff\n";
371
- }
372
- else
373
- {
374
- $output .= "\n";
375
- }
376
- }
377
-
378
- # Mark para
379
- if ($output ne "")
380
- {
381
- if ($is_para_delimiter)
382
- {
383
- print OF "# Para $para_line_id $para_line_count\n$output";
384
- $para_line_count = 0;
385
- }
386
- else
387
- {
388
- if($is_decode){ $output = decode_entities($output); }
389
- print OF $output;
390
- }
391
- $output = ""
392
- }
393
-
394
- close OF;
395
- }
396
-
397
- sub GetDifferentialFeatures
398
- {
399
- my ($id) = @_;
400
-
401
- # AlignChange feature
402
- my $align_diff = "bi_xmlA_";
403
-
404
- if ($id == 0)
405
- {
406
- $align_diff .= $g_align[$id];
407
- }
408
- elsif ($g_align[$id] eq $g_align[$id-1])
409
- {
410
- $align_diff .= "continue";
411
- }
412
- else
413
- {
414
- $align_diff .= $g_align[$id];
415
- }
416
-
417
- # FontFaceChange feature
418
- my $font_face_diff = "bi_xmlF_";
419
- if ($id == 0)
420
- {
421
- $font_face_diff .= "new";
422
- }
423
- elsif ($g_font_face[$id] eq $g_font_face[$id-1])
424
- {
425
- $font_face_diff .= "continue";
426
- }
427
- else
428
- {
429
- $font_face_diff .= "new";
430
- }
431
-
432
- # FontSizeChange feature
433
- my $font_size_diff = "bi_xmlS_";
434
- if ($id == 0)
435
- {
436
- $font_size_diff .= "new";
437
- }
438
- elsif ($g_font_size[$id] == $g_font_size[$id-1])
439
- {
440
- $font_size_diff .= "continue";
441
- }
442
- else
443
- {
444
- $font_size_diff .= "new";
445
- }
446
-
447
- # FontSFChange feature
448
- my $font_sf_diff = "bi_xmlSF_";
449
- if ($id == 0)
450
- {
451
- $font_sf_diff .= "new";
452
- }
453
- elsif ($g_font_size[$id] == $g_font_size[$id-1] && $g_font_face[$id] eq $g_font_face[$id-1])
454
- {
455
- $font_sf_diff .= "continue";
456
- }
457
- else
458
- {
459
- $font_sf_diff .= "new";
460
- }
461
-
462
- # FontSFBIChange feature
463
- my $font_sfbi_diff = "bi_xmlSFBI_";
464
- if ($id == 0)
465
- {
466
- $font_sfbi_diff .= "new";
467
- }
468
- elsif ($g_font_size[$id] == $g_font_size[$id-1] && $g_font_face[$id] eq $g_font_face[$id-1] && $g_bold[$id] eq $g_bold[$id-1] && $g_italic[$id] eq $g_italic[$id-1])
469
- {
470
- $font_sfbi_diff .= "continue";
471
- }
472
- else
473
- {
474
- $font_sfbi_diff .= "new";
475
- }
476
-
477
- # FontSFBIAChange feature
478
- my $font_sfbia_diff = "bi_xmlSFBIA_";
479
- if ($id == 0)
480
- {
481
- $font_sfbia_diff .= "new";
482
- }
483
- elsif ($g_font_size[$id] == $g_font_size[$id-1] && $g_font_face[$id] eq $g_font_face[$id-1] && $g_bold[$id] eq $g_bold[$id-1] && $g_italic[$id] eq $g_italic[$id-1] && $g_align[$id] eq $g_align[$id-1])
484
- {
485
- $font_sfbia_diff .= "continue";
486
- }
487
- else
488
- {
489
- $font_sfbia_diff .= "new";
490
- }
491
-
492
- # Para change feature
493
- my $para_diff = "bi_xmlPara_";
494
- # Header part, consider each line as a separate paragraph
495
- if ($id < $body_start_id)
496
- {
497
- $para_diff .= "header";
498
- }
499
- else
500
- {
501
- if($g_para[$id] eq "yes")
502
- {
503
- $para_diff .= "new";
504
- }
505
- else
506
- {
507
- $para_diff .= "continue";
508
- }
509
- }
510
-
511
- return ($align_diff, $font_size_diff, $font_face_diff, $font_sf_diff, $font_sfbi_diff, $font_sfbia_diff, $para_diff);
512
- }
513
-
514
- sub GetFontSizeLabels
515
- {
516
- my ($g_font_size_hash, $g_font_size_labels) = @_;
517
-
518
- if ($is_debug) { print STDERR "# Map fonts\n"; }
519
- my @sorted_fonts = sort { $g_font_size_hash->{$b} <=> $g_font_size_hash->{$a} } keys %{$g_font_size_hash}; # Sort by values, obtain keys
520
-
521
- my $common_size = $sorted_fonts[0];
522
- @sorted_fonts = sort { $a <=> $b } keys %{$g_font_size_hash}; # Sort by keys, obtain keys
523
-
524
- # Index of common font size
525
- my $common_index = 0;
526
- foreach (@sorted_fonts)
527
- {
528
- # Found
529
- if ($common_size == $_)
530
- {
531
- last;
532
- }
533
- $common_index++;
534
- }
535
-
536
- # Small fonts
537
- for (my $i = 0; $i < $common_index; $i++)
538
- {
539
- $g_font_size_labels->{$sorted_fonts[$i]} = "smaller";
540
-
541
- if($is_debug)
542
- {
543
- print STDERR "$sorted_fonts[$i] --> $g_font_size_labels->{$sorted_fonts[$i]}, freq = $g_font_size_hash->{$sorted_fonts[$i]}\n";
544
- }
545
- }
546
-
547
- # Common fonts
548
- $g_font_size_labels->{$common_size} = "common";
549
- if ($is_debug)
550
- {
551
- print STDERR "$sorted_fonts[$common_index] --> $g_font_size_labels->{$sorted_fonts[$common_index]}, freq = $g_font_size_hash->{$sorted_fonts[$common_index]}\n";
552
- }
553
-
554
- # Large fonts
555
- for (my $i = ($common_index + 1); $i < scalar(@sorted_fonts); $i++)
556
- {
557
- if ((scalar(@sorted_fonts)-$i) <= 3)
558
- {
559
- $g_font_size_labels->{$sorted_fonts[$i]} = "largest".($i+1-scalar(@sorted_fonts));
560
- }
561
- else
562
- {
563
- $g_font_size_labels->{$sorted_fonts[$i]} = "larger";
564
- }
565
-
566
- if($is_debug)
567
- {
568
- print STDERR "$sorted_fonts[$i] --> $g_font_size_labels->{$sorted_fonts[$i]}, freq = $g_font_size_hash->{$sorted_fonts[$i]}\n";
569
- }
570
- }
571
- }
572
-
573
- sub GetSpaceLabels
574
- {
575
- my ($g_space_hash, $g_space_labels) = @_;
576
-
577
- if ($is_debug)
578
- {
579
- print STDERR "\n# Map space\n";
580
- }
581
- my @sorted_spaces = sort { $g_space_hash->{$b} <=> $g_space_hash->{$a} } keys %{$g_space_hash}; # sort by freqs, obtain space faces
582
-
583
- my $common_space = $sorted_spaces[0];
584
- my $common_freq = $g_space_hash->{$common_space};
585
-
586
- # Find similar common freq with larger spaces
587
- for (my $i = 0; $i < scalar(@sorted_spaces); $i++)
588
- {
589
- my $freq = $g_space_hash->{$sorted_spaces[$i]};
590
- if ($freq/$common_freq > 0.8)
591
- {
592
- if($sorted_spaces[$i] > $common_space)
593
- {
594
- $common_space = $sorted_spaces[$i];
595
- }
596
- }
597
- else
598
- {
599
- last;
600
- }
601
- }
602
-
603
- for (my $i = 0; $i < scalar(@sorted_spaces); $i++)
604
- {
605
- if ($sorted_spaces[$i] > $common_space)
606
- {
607
- $g_space_labels->{$sorted_spaces[$i]} = "yes";
608
- }
609
- else
610
- {
611
- $g_space_labels->{$sorted_spaces[$i]} = "no";
612
- }
613
-
614
- if($is_debug)
615
- {
616
- print STDERR "$sorted_spaces[$i] --> $g_space_labels->{$sorted_spaces[$i]}, freq = $g_space_hash->{$sorted_spaces[$i]}\n";
617
- }
618
- }
619
- }
620
-
621
- sub GetAttrValue
622
- {
623
- my ($attr_text, $attr) = @_;
624
-
625
- my $value = "none";
626
- if ($attr_text =~ /^.*$attr=\"(.+?)\".*$/)
627
- {
628
- $value = $1;
629
- }
630
-
631
- return $value;
632
- }
633
-
634
- sub CheckFontAttr
635
- {
636
- my ($attr_text, $attr, $attr_hash, $count) = @_;
637
-
638
- if ($attr_text =~ /^.*$attr=\"(.+?)\".*$/)
639
- {
640
- my $attr_value = $1;
641
- $attr_hash->{$attr_value} = $attr_hash->{$attr_value} ? ($attr_hash->{$attr_value} + $count) : $count;
642
- }
643
- }
644
-
645
- sub ProcessTable
646
- {
647
- my ($input_text, $is_pic) = @_;
648
-
649
- # For table cell object
650
- my $is_cell = 0;
651
-
652
- my $all_text = "";
653
- my $text = "";
654
-
655
- my @lines = split(/\n/, $input_text);
656
-
657
- my %table_pos = (); # $table_pos{$cellText} = "$l-$t-$r-$bottom"
658
- my %table = (); # $table{$row}->{$col} = \@para_texts
659
- my $row_from;
660
- my $col_from;
661
- my $row_till;
662
- my $col_till;
663
-
664
- # xml feature
665
- my $align = "none";
666
- my $pos = -1;
667
- foreach my $line (@lines)
668
- {
669
- if ($line =~ /^<table (.+?)>$/)
670
- {
671
- my $attr = $1;
672
-
673
- if($is_markup) { $markup_output .= "### Table $attr\n"; }
674
-
675
- # Fix: wrong regex sequence, huydhn
676
- #if ($attr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*alignment=\"(.+?)\".*$/)
677
- #{
678
- # my ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
679
- # $align = $5;
680
-
681
- # # pos feature
682
- # $pos = ($t+$bottom)/2.0;
683
-
684
- # if($pos < $g_minpos) { $g_minpos = $pos; }
685
- # if($pos > $g_maxpos) { $g_maxpos = $pos; }
686
- #}
687
- #else
688
- #{
689
- # print STDERR "# no table alignment or location \"$line\"\n";
690
- # $align = "";
691
- #}
692
-
693
- my ($l, $t, $r, $bottom) = undef;
694
- if ($attr =~ /^.*l=\"(\d+)\".*$/) { $l = $1; }
695
- if ($attr =~ /^.*t=\"(\d+)\".*$/) { $t = $1; }
696
- if ($attr =~ /^.*r=\"(\d+)\".*$/) { $r = $1; }
697
- if ($attr =~ /^.*b=\"(\d+)\".*$/) { $bottom = $1; }
698
-
699
- if ($t && $bottom)
700
- {
701
- # pos feature
702
- $pos = ($t + $bottom) / 2.0;
703
-
704
- if($pos < $g_minpos) { $g_minpos = $pos; }
705
- if($pos > $g_maxpos) { $g_maxpos = $pos; }
706
- }
707
- else
708
- {
709
- die "# Undefined table location \"$line\"\n";
710
- }
711
-
712
- if ($attr =~ /^.*alignment=\"(\d+)\".*$/)
713
- {
714
- $align = $1;
715
- }
716
- else
717
- {
718
- print STDERR "# no table alignment \"$line\"\n";
719
- $align = "";
720
- }
721
- # End.
722
- }
723
- elsif ($line =~ /^<cell .*gridColFrom=\"(\d+)\" gridColTill=\"(\d+)\" gridRowFrom=\"(\d+)\" gridRowTill=\"(\d+)\".*>$/) # new cell
724
- {
725
- $col_from = $1;
726
- $col_till = $2;
727
- $row_from = $3;
728
- $row_till = $4;
729
- #print STDERR "$row_from $row_till $col_from $col_till\n";
730
- $is_cell = 1;
731
- }
732
- elsif ($line =~ /^<\/cell>$/) # end cell
733
- {
734
- my @para_texts = ();
735
- ProcessCell($text, \@para_texts, \%table_pos, $is_pic);
736
-
737
- for(my $i = $row_from; $i<=$row_till; $i++)
738
- {
739
- for(my $j = $col_from; $j<=$col_till; $j++)
740
- {
741
- if(!$table{$i}) { $table{$i} = (); }
742
- if(!$table{$i}->{$j}) { $table{$i}->{$j} = (); }
743
-
744
- if($i == $row_from && $j == $col_from)
745
- {
746
- push(@{$table{$i}->{$j}}, @para_texts);
747
- if(scalar(@para_texts) > 1) { last; }
748
- }
749
- else
750
- {
751
- push(@{$table{$i}->{$j}}, ""); #add stub "" for spanning rows or cols
752
- }
753
- }
754
- }
755
-
756
- $is_cell = 0;
757
- $text = "";
758
- }
759
- elsif($is_cell)
760
- {
761
- $text .= $line."\n";
762
- next;
763
- }
764
- }
765
-
766
- # note: such a complicated code is because in the normal node, Omnipage doesn't seem to strictly print column by column given a row is fixed.
767
- # E.g if col1: paraText1, col2: paraText21\n$paraText22, and col3: paraText31\n$paraText32
768
- # It will print paraText1\tparaText21\tparaText31\n\t$paraText22\t$paraText32
769
- my @sorted_rows = sort {$a <=> $b} keys %table;
770
- my $is_first_line_para = 1;
771
- foreach my $row (@sorted_rows)
772
- {
773
- my %table_r = %{$table{$row}};
774
- my @sorted_cols = sort {$a <=> $b} keys %table_r;
775
- while(1)
776
- {
777
- my $is_stop = 1;
778
- my $row_text = "";
779
-
780
- foreach my $col (@sorted_cols)
781
- {
782
- # there's still some thing to process
783
- if(scalar(@{$table_r{$col}}) > 0)
784
- {
785
- $is_stop = 0;
786
- $row_text .= shift(@{$table_r{$col}});
787
- }
788
- $row_text .= "\t";
789
- }
790
-
791
- if ((!$is_allow_empty && $row_text =~ /^\s*$/) || ($is_allow_empty && $row_text eq ""))
792
- {
793
- $is_stop = 1;
794
- }
795
-
796
- if($is_stop)
797
- {
798
- last;
799
- }
800
- else
801
- {
802
- $row_text =~ s/\t$/\n/;
803
- $all_text .= $row_text;
804
- # print STDERR "$row_text";
805
-
806
- # para
807
- if($is_first_line_para)
808
- {
809
- push(@g_para, "yes");
810
- $is_first_line_para = 0;
811
- }
812
- else
813
- {
814
- push(@g_para, "no");
815
- }
816
-
817
- if($is_xml_feature)
818
- {
819
- # table feature
820
- push(@g_table, "yes");
821
-
822
- # pic feature
823
- if($is_pic)
824
- {
825
- push(@g_pic, "yes");
826
- }
827
- else
828
- {
829
- push(@g_pic, "no");
830
- }
831
-
832
- push(@g_pos_hash, $pos); # update xml pos value
833
- push(@g_align, $align); # update xml alignment value
834
-
835
- ### Not assign value ###
836
- push(@g_font_size, -1); # fontSize feature
837
- push(@g_font_face, "none"); # fontFace feature
838
- push(@g_bold, "no"); # bold feature
839
- push(@g_italic, "no"); # italic feature
840
- push(@g_bullet, "no"); # bullet feature
841
- # push(@gSpace, "none"); # space feature
842
- } # end if xml feature
843
- }
844
- }
845
- }
846
-
847
- return $all_text;
848
- }
849
-
850
- sub ProcessCell
851
- {
852
- my ($input_text, $para_texts, $table_pos, $is_pic) = @_;
853
-
854
- my $text = "";
855
- my @lines = split(/\n/, $input_text);
856
- my $is_para = 0;
857
- my $flag = 0;
858
- foreach my $line (@lines)
859
- {
860
- if ($line =~ /^<para (.*)>$/)
861
- {
862
- $text .= $line."\n"; # we need the header
863
- $is_para = 1;
864
-
865
- if($is_markup)
866
- {
867
- $markup_output .= "## ParaTable $1\n";
868
- }
869
- }
870
- elsif ($line =~ /^<\/para>$/)
871
- {
872
- my ($para_text, $l, $t, $r, $b) = ProcessPara($text, 1, $is_pic);
873
- my @tokens = split(/\n/, $para_text);
874
-
875
- foreach my $token (@tokens)
876
- {
877
- if($token ne "")
878
- {
879
- push(@{$para_texts}, $token);
880
- $flag = 1;
881
- }
882
- }
883
-
884
- if(!$table_pos->{$para_text})
885
- {
886
- $table_pos->{$para_text} = "$l-$t-$r-$b";
887
- }
888
- else
889
- {
890
- #print STDERR "#! Warning: in method processCell, encounter the same para_text $para_text\n";
891
- }
892
-
893
- $is_para = 0;
894
- $text = "";
895
- }
896
- elsif ($is_para)
897
- {
898
- $text .= $line."\n";
899
- next;
900
- }
901
- }
902
-
903
- # at least one value should be added for cell which is ""
904
- if ($flag == 0)
905
- {
906
- push(@{$para_texts}, "");
907
- }
908
- }
909
-
910
- sub ProcessPara
911
- {
912
- my ($input_text, $is_cell, $is_pic) = @_;
913
-
914
- my $is_space = 0;
915
- my $is_special_space = 0;
916
- my $is_tab = 0;
917
- my $is_bullet = 0;
918
- my $is_forced_eof = "none"; # 3 signals for end of L: forcedEOF=\"true\" in attribute of <ln> or || <nl orig=\"true\"\/> || end of </para> without encountering any of the above signal in the para plus $is_space = 0
919
-
920
- # XML feature
921
- my $align = "none";
922
- my ($l, $t, $r, $bottom);
923
-
924
- my %font_size_hash = ();
925
- my %font_face_hash = ();
926
- my @bold_array = ();
927
- my @italic_array = ();
928
- my $space = "none";
929
-
930
- my $ln_attr;
931
- my $is_ln = 0;
932
- my $ln_bold = "none";
933
- my $ln_italic = "none";
934
-
935
- my $run_attr;
936
- my $run_text = "";
937
- my $is_run = 0;
938
- my $run_bold = "none";
939
- my $run_italic = "none";
940
-
941
- my $wd_attr;
942
- my $wd_text = "";
943
- my $is_wd = 0;
944
-
945
- # Word index in a line. When encountering </ln>, this parameter indicates the number of words in a line
946
- my $wd_index = 0;
947
- my $ln_bold_count = 0;
948
- my $ln_italic_count = 0;
949
-
950
- my $all_text = "";
951
- # Invariant: when never enter a new line, $text will be copied into $all_text, and $text is cleared
952
- my $text = "";
953
-
954
- binmode(STDERR, ":utf8");
955
-
956
- my $is_first_line_para = 1;
957
- my @lines = split(/\n/, $input_text);
958
-
959
- for (my $i=0; $i < scalar(@lines); $i++)
960
- {
961
- my $line = $lines[$i];
962
-
963
- # New para
964
- if ($line =~ /^<para (.+?)>$/)
965
- {
966
- my $attr = $1;
967
- $align = GetAttrValue($attr, "alignment");
968
- # $indent = GetAttrValue($attr, "li");
969
- $space = GetAttrValue($attr, "spaceBefore");
970
- }
971
- # New ln
972
- elsif ($line =~ /^<ln (.+)>$/)
973
- {
974
- $ln_attr = $1;
975
- $is_ln = 1;
976
-
977
- if ($is_markup) { $markup_output .= "# Line $ln_attr\n"; }
978
-
979
- # Fix: wrong regex sequence, huydhn
980
- #if ($ln_attr =~ /^.*l=\"(\d+)\" t=\"(\d+)\" r=\"(\d+)\" b=\"(\d+)\".*$/)
981
- #{
982
- # ($l, $t, $r, $bottom) = ($1, $2, $3, $4);
983
- #}
984
-
985
- if ($ln_attr =~ /^.*l=\"(\d+)\".*$/) { $l = $1; } else { $l = undef; }
986
- if ($ln_attr =~ /^.*t=\"(\d+)\".*$/) { $t = $1; } else { $t = undef; }
987
- if ($ln_attr =~ /^.*r=\"(\d+)\".*$/) { $r = $1; } else { $r = undef; }
988
- if ($ln_attr =~ /^.*b=\"(\d+)\".*$/) { $bottom = $1; } else { $bottom = undef; }
989
- # End.
990
-
991
- $is_forced_eof = GetAttrValue($ln_attr, "forcedEOF");
992
-
993
- # Bold & Italic
994
- if ($is_xml_feature)
995
- {
996
- $ln_bold = GetAttrValue($ln_attr, "bold");
997
- $ln_italic = GetAttrValue($ln_attr, "italic");
998
- }
999
- }
1000
- # New run
1001
- elsif ($line =~ /<run (.*)>$/)
1002
- {
1003
- $run_attr = $1;
1004
- $is_space = 0;
1005
- $is_tab = 0;
1006
- $is_run = 1;
1007
-
1008
- # New wd, that consists of many runs
1009
- if ($line =~ /^<wd (.*?)>/)
1010
- {
1011
- $is_wd = 1;
1012
- $wd_attr = $1;
1013
- }
1014
-
1015
- # Bold & Italic
1016
- if ($is_xml_feature)
1017
- {
1018
- $run_bold = GetAttrValue($run_attr, "bold");
1019
- $run_italic = GetAttrValue($run_attr, "italic");
1020
- }
1021
- }
1022
- # Word
1023
- elsif ($line =~ /^<wd (.+)?>(.+)<\/wd>$/)
1024
- {
1025
- $wd_attr = $1;
1026
- my $word = $2;
1027
- $is_space = 0;
1028
- $is_tab = 0;
1029
-
1030
- if ($is_markup)
1031
- {
1032
- $markup_output .= "$word $wd_attr\n";
1033
- # If both bold and italic, then just use one
1034
- if ($is_run && $run_attr =~ /(bold|italic)=\"true\"/)
1035
- {
1036
- $markup_output .= " $1=\"true\"";
1037
- }
1038
- $markup_output .= "\n";
1039
- }
1040
-
1041
- # FontSize & FontFace
1042
- if ($is_xml_feature)
1043
- {
1044
- CheckFontAttr($wd_attr, "fontSize", \%font_size_hash, 1);
1045
- CheckFontAttr($wd_attr, "fontFace", \%font_face_hash, 1);
1046
- }
1047
-
1048
- # Bold & Italic
1049
- if ($is_xml_feature)
1050
- {
1051
- my $wd_bold = GetAttrValue($wd_attr, "bold");
1052
- my $wd_italic = GetAttrValue($wd_attr, "italic");
1053
-
1054
- if ($wd_bold eq "true" || $run_bold eq "true" || $ln_bold eq "true")
1055
- {
1056
- $bold_array[$wd_index] = 1;
1057
- $ln_bold_count++;
1058
- }
1059
-
1060
- if ($wd_italic eq "true" || $run_italic eq "true" || $ln_italic eq "true")
1061
- {
1062
- $italic_array[$wd_index] = 1;
1063
- $ln_italic_count++;
1064
- }
1065
- }
1066
-
1067
- # Add text
1068
- $text .= "$word";
1069
-
1070
- if ($is_run)
1071
- {
1072
- $run_text .= "$word ";
1073
- }
1074
-
1075
- $wd_index++;
1076
- }
1077
- # End wd
1078
- elsif ($line =~ /^<\/wd>$/)
1079
- {
1080
- $is_wd = 0;
1081
-
1082
- if ($is_markup)
1083
- {
1084
- $markup_output .= "$wd_text $wd_attr\n";
1085
- # If both bold and italic, then just use one
1086
- if ($is_run && $run_attr =~ /(bold|italic)=\"true\"/)
1087
- {
1088
- $markup_output .= " $1=\"true\"";
1089
- }
1090
-
1091
- $markup_output .= "\n";
1092
- $wd_attr = "";
1093
- }
1094
- }
1095
- # End run
1096
- elsif ($line =~ /^(.*)<\/run>$/)
1097
- {
1098
- my $word = $1;
1099
-
1100
- # Add text
1101
- if ($word ne "")
1102
- {
1103
- # Bold & Italic
1104
- if ($is_xml_feature)
1105
- {
1106
- if ($run_bold eq "true" || $ln_bold eq "true")
1107
- {
1108
- $bold_array[$wd_index] = 1;
1109
- $ln_bold_count++;
1110
- }
1111
-
1112
- if ($run_italic eq "true" || $ln_italic eq "true")
1113
- {
1114
- $italic_array[$wd_index] = 1;
1115
- $ln_italic_count++;
1116
- }
1117
- }
1118
-
1119
- # Appear in the final result
1120
- if ($is_ln) { $text .= "$word"; }
1121
-
1122
- # For internal record
1123
- if ($is_run) { $run_text .= $word . " "; }
1124
- if ($is_wd) { $wd_text .= $word; }
1125
-
1126
- $wd_index++;
1127
- }
1128
-
1129
- # Xml feature
1130
- # Not a space, tab or new-line run
1131
- if ($is_xml_feature && $run_text ne "")
1132
- {
1133
- my @words = split(/\s+/, $run_text);
1134
- my $num_words = scalar(@words);
1135
- CheckFontAttr($run_attr, "fontSize", \%font_size_hash, $num_words, 1);
1136
- CheckFontAttr($run_attr, "fontFace", \%font_face_hash, $num_words, 1);
1137
- }
1138
-
1139
- # Reset run
1140
- # <run> not enclosed within <ln>
1141
- if (!$is_ln)
1142
- {
1143
- $wd_index = 0;
1144
- }
1145
-
1146
- $run_text = "";
1147
- $is_run = 0;
1148
- $is_special_space = 0;
1149
-
1150
- # Bold & Italic
1151
- if ($is_xml_feature)
1152
- {
1153
- $run_bold = "none";
1154
- $run_italic = "none";
1155
-
1156
- # <run> not enclosed within <ln>
1157
- if(!$is_ln)
1158
- {
1159
- $ln_bold_count = 0;
1160
- $ln_italic_count = 0;
1161
- }
1162
- }
1163
- }
1164
- # End ln
1165
- elsif ($line =~ /^<\/ln>$/)
1166
- {
1167
- if((!$is_allow_empty && $text !~ /^\s*$/) || ($is_allow_empty && $text ne ""))
1168
- {
1169
- if ($is_forced_eof eq "true" || (!$is_cell && !$is_special_space) )
1170
- {
1171
- $text .= "\n";
1172
- # Update all_text
1173
- $all_text .= $text;
1174
- $text = "";
1175
- }
1176
-
1177
- my $num_words = $wd_index;
1178
-
1179
- if (!$is_cell)
1180
- {
1181
- if ($is_first_line_para)
1182
- {
1183
- push(@g_para, "yes");
1184
- $is_first_line_para = 0;
1185
- }
1186
- else
1187
- {
1188
- push(@g_para, "no");
1189
- }
1190
- }
1191
-
1192
- if ($is_xml_feature && $num_words >= 1)
1193
- {
1194
- # XML feature
1195
- # Assumtion that: font_size is either occur in <ln>, or within multiple <run> under <ln>, but not both
1196
- CheckFontAttr($ln_attr, "fontSize", \%font_size_hash, $num_words);
1197
- CheckFontAttr($ln_attr, "fontFace", \%font_face_hash, $num_words);
1198
- }
1199
-
1200
- if ($is_xml_feature && !$is_cell && !$is_special_space)
1201
- {
1202
- my $pos = ($t + $bottom)/2.0;
1203
- if ($pos < $g_minpos) { $g_minpos = $pos; }
1204
- if ($pos > $g_maxpos) { $g_maxpos = $pos; }
1205
-
1206
- push(@g_pos_hash, $pos); # pos feature
1207
- push(@g_align, $align); # alignment feature
1208
- push(@g_table, "no"); # table feature
1209
-
1210
- if ($is_pic)
1211
- {
1212
- push(@g_pic, "yes");
1213
-
1214
- # Not assign value
1215
- push(@g_font_size, -1); # bold feature
1216
- push(@g_font_face, "none"); # bold feature
1217
- push(@g_bold, "no"); # bold feature
1218
- push(@g_italic, "no"); # italic feature
1219
- push(@g_bullet, "no"); # bullet feature
1220
- }
1221
- else
1222
- {
1223
- push(@g_pic, "no");
1224
-
1225
- UpdateXMLFontFeature(\%font_size_hash, \%font_face_hash);
1226
-
1227
- %font_size_hash = ();
1228
- %font_face_hash = ();
1229
-
1230
- UpdateXMLFeatures($ln_bold_count, $ln_italic_count, $num_words, $is_bullet, $space);
1231
- }
1232
- }
1233
- }
1234
-
1235
- # Reset ln
1236
- $is_ln = 0;
1237
- $is_forced_eof = "none";
1238
- $is_special_space = 0;
1239
- $wd_index = 0;
1240
-
1241
- # Bold & Italic
1242
- if ($is_xml_feature)
1243
- {
1244
- $ln_bold = "none";
1245
- $ln_italic = "none";
1246
- $ln_bold_count = 0;
1247
- $ln_italic_count = 0;
1248
- }
1249
- }
1250
- # Newline signal
1251
- elsif ($line =~ /^<nl orig=\"true\"\/>$/)
1252
- {
1253
- if($is_ln)
1254
- {
1255
- $is_space = 0;
1256
- }
1257
- else
1258
- {
1259
- if($is_debug)
1260
- {
1261
- print STDERR "#!!! Warning: found <nl orig=\"true\"\/> while not in tag <ln>: $line\n";
1262
- }
1263
- }
1264
- }
1265
- # Space
1266
- elsif ($line =~ /^<space\/>$/)
1267
- {
1268
- my $start_tag = "";
1269
- my $end_tag = "";
1270
-
1271
- if ($i>0 && $lines[$i-1] =~ /^<(.+?)\b.*/) { $start_tag = $1; }
1272
-
1273
- if ($i < (scalar(@lines) -1) && $lines[$i+1] =~ /^<\/(.+)>/) { $end_tag = $1; }
1274
-
1275
- if ($start_tag eq $end_tag && $start_tag ne "")
1276
- {
1277
- # print STDERR "# Special space after \"$text\"\n";
1278
- $is_special_space = 1;
1279
- }
1280
-
1281
- # Add text
1282
- $text .= " ";
1283
- $is_space = 1;
1284
- }
1285
- # Tab
1286
- elsif ($line =~ /^<tab .*\/>$/)
1287
- {
1288
- # Add text
1289
- $text .= "\t";
1290
- $is_tab = 1;
1291
- }
1292
- # Bullet
1293
- elsif ($line =~ /^<bullet .*>$/)
1294
- {
1295
- $is_bullet = 1;
1296
- }
1297
- }
1298
-
1299
- $all_text .= $text;
1300
- return ($all_text, $l, $t, $r, $bottom, $is_space);
1301
- }
1302
-
1303
- sub UpdateXMLFontFeature
1304
- {
1305
- my ($font_size_hash, $font_face_hash) = @_;
1306
-
1307
- # Font size feature
1308
- if (scalar(keys %{$font_size_hash}) == 0)
1309
- {
1310
- push(@g_font_size, -1);
1311
- }
1312
- else
1313
- {
1314
- my @sorted_fonts = sort { $font_size_hash->{$b} <=> $font_size_hash->{$a} } keys %{$font_size_hash};
1315
-
1316
- my $font_size = $sorted_fonts[0];
1317
- push(@g_font_size, $font_size);
1318
-
1319
- $g_font_size_hash{$font_size} = $g_font_size_hash{$font_size} ? ($g_font_size_hash{$font_size}+1) : 1;
1320
- }
1321
-
1322
- # Font face feature
1323
- if (scalar(keys %{$font_face_hash}) == 0)
1324
- {
1325
- push(@g_font_face, "none");
1326
- }
1327
- else
1328
- {
1329
- my @sorted_fonts = sort { $font_face_hash->{$b} <=> $font_face_hash->{$a} } keys %{$font_face_hash};
1330
- my $font_face = $sorted_fonts[0];
1331
- push(@g_font_face, $font_face);
1332
-
1333
- $g_font_face_hash{$font_face} = $g_font_face_hash{$font_face} ? ($g_font_face_hash{$font_face}+1) : 1;
1334
- }
1335
- }
1336
-
1337
- sub UpdateXMLFeatures
1338
- {
1339
- my ($ln_bold_count, $ln_italic_count, $num_words, $is_bullet, $space) = @_;
1340
-
1341
- # Bold feature
1342
- my $bold_feature;
1343
- if ($ln_bold_count/$num_words >= 0.667)
1344
- {
1345
- $bold_feature = "yes";
1346
- }
1347
- else
1348
- {
1349
- $bold_feature = "no";
1350
- }
1351
- push(@g_bold, $bold_feature);
1352
-
1353
- # Italic feature
1354
- my $italic_feature;
1355
- if ($ln_italic_count/$num_words >= 0.667)
1356
- {
1357
- $italic_feature = "yes";
1358
- }
1359
- else
1360
- {
1361
- $italic_feature = "no";
1362
- }
1363
- push(@g_italic, $italic_feature);
1364
-
1365
- # Bullet feature
1366
- if ($is_bullet)
1367
- {
1368
- push(@g_bullet, "yes");
1369
- }
1370
- else
1371
- {
1372
- push(@g_bullet, "no");
1373
- }
1374
-
1375
- # Space feature
1376
- # push(@gSpace, $space);
1377
- }
1378
-
1379
- # Find the positions of header, body, and citation
1380
- sub GetStructureInfo
1381
- {
1382
- my ($lines, $num_lines) = @_;
1383
-
1384
- my ($body_length, $citation_length, $body_end_id) = SectLabel::PreProcess::findCitationText($lines, 0, $num_lines);
1385
-
1386
- my ($header_length, $body_start_id);
1387
-
1388
- ($header_length, $body_length, $body_start_id) = SectLabel::PreProcess::findHeaderText($lines, 0, $body_length);
1389
-
1390
- # Sanity check
1391
- my $totalLength = $header_length + $body_length + $citation_length;
1392
-
1393
- if ($num_lines != $totalLength)
1394
- {
1395
- print STDOUT "Die in getStructureInfo(): different num lines $num_lines != $totalLength\n"; # to display in Web
1396
- die "Die in getStructureInfo(): different num lines $num_lines != $totalLength\n";
1397
- }
1398
-
1399
- return ($header_length, $body_length, $citation_length, $body_start_id, $body_end_id);
1400
- }
1401
-
1402
- # Count XML tags/values for statistics purpose
1403
- sub ProcessTagInfo
1404
- {
1405
- my ($line, $tags) = @_;
1406
-
1407
- my $tag;
1408
- my $attr;
1409
-
1410
- if ($line =~ /^<(.+?)\b(.*)/)
1411
- {
1412
- $tag = $1;
1413
- $attr = $2;
1414
- if (!$tags->{$tag})
1415
- {
1416
- $tags->{$tag} = ();
1417
- }
1418
-
1419
- if ($attr =~ /^\s*(.+?)\s*\/?>/)
1420
- {
1421
- $attr = $1;
1422
- }
1423
-
1424
- my @tokens = split(/\s+/, $attr);
1425
- foreach my $token (@tokens)
1426
- {
1427
- if($token =~ /^(.+)=(.+)$/)
1428
- {
1429
- my $attr_name = $1;
1430
- my $value = $2;
1431
-
1432
- if (!$tags->{$tag}->{$attr_name})
1433
- {
1434
- $tags->{$tag}->{$attr_name} = ();
1435
- }
1436
- if (!$tags->{$tag}->{$attr_name}->{$value})
1437
- {
1438
- $tags->{$tag}->{$attr_name}->{$value} = 0;
1439
- }
1440
- $tags->{$tag}->{$attr_name}->{$value}++;
1441
- }
1442
- }
1443
- }
1444
- }
1445
-
1446
- # Print tag info to file
1447
- sub PrintTagInfo
1448
- {
1449
- my ($tags, $tag_file) = @_;
1450
-
1451
- open(TAG, ">:utf8", "$tag_file") || die"#Can't open file \"$tag_file\"\n";
1452
-
1453
- my @sortedTags = sort {$a cmp $b} keys %{$tags};
1454
-
1455
- foreach(@sortedTags)
1456
- {
1457
- my @attrs = sort {$a cmp $b} keys %{$tags->{$_}};
1458
- print TAG "# Tag = $_\n";
1459
-
1460
- foreach my $attr (@attrs)
1461
- {
1462
- print TAG "$attr:";
1463
- my @values = sort {$a cmp $b} keys %{$tags->{$_}->{$attr}};
1464
-
1465
- foreach my $value (@values)
1466
- {
1467
- print TAG " $value-$tags->{$_}->{$attr}->{$value}";
1468
- }
1469
-
1470
- print TAG "\n";
1471
- }
1472
- }
1473
-
1474
- close TAG;
1475
- }
1476
-
1477
- sub UntaintPath
1478
- {
1479
- my ($path) = @_;
1480
-
1481
- if ( $path =~ /^([-_\/\w\.]*)$/ )
1482
- {
1483
- $path = $1;
1484
- }
1485
- else
1486
- {
1487
- die "Bad path \"$path\"\n";
1488
- }
1489
-
1490
- return $path;
1491
- }
1492
-
1493
- sub Untaint
1494
- {
1495
- my ($s) = @_;
1496
- if ($s =~ /^([\w \-\@\(\),\.\/]+)$/)
1497
- {
1498
- $s = $1; # $data now untainted
1499
- }
1500
- else
1501
- {
1502
- die "Bad data in $s"; # log this somewhere
1503
- }
1504
-
1505
- return $s;
1506
- }
1507
-
1508
- sub Execute
1509
- {
1510
- my ($cmd) = @_;
1511
-
1512
- if ($is_debug)
1513
- {
1514
- print STDERR "Executing: $cmd\n";
1515
- }
1516
-
1517
- $cmd = Untaint($cmd);
1518
- system($cmd);
1519
- }
1520
-
1521
- sub NewTmpFile
1522
- {
1523
- my $tmpFile = `date '+%Y%m%d-%H%M%S-$$'`;
1524
- chomp($tmpFile);
1525
- return $tmpFile;
1526
- }
1527
-
1528
-
1529
-