biblicit 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,382 +0,0 @@
1
- #!/usr/bin/perl -wT
2
- # Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
3
-
4
- # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
5
-
6
- require 5.0;
7
- use strict;
8
- use Getopt::Long;
9
- use HTML::Entities;
10
-
11
- # I do not know a better solution to find a lib path in -T mode.
12
- # So if you know a better solution, I'd be glad to hear.
13
- # See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
14
- use FindBin;
15
- FindBin::again(); # to get correct path in case 2 scripts in different directories use FindBin
16
- my $path;
17
- BEGIN {
18
- if ($FindBin::Bin =~ /(.*)/) {
19
- $path = $1;
20
- }
21
- }
22
- use lib "$path/../../lib";
23
- use SectLabel::PreProcess;
24
-
### USER customizable section
# Script name with any leading directory stripped; used in usage and error messages.
my ($progname) = $0 =~ m{([^/]+)$};
# Version tag for the output produced by this script.
my $outputVersion = "1.0";
### END user customizable section
29
-
# Write the copyright banner to STDERR.
sub License {
    my $banner = "# Copyright 2009 \251 by Luong Minh Thang\n";
    print STDERR $banner;
}
33
-
### HELP Sub-procedure
# Print the usage text to STDERR. Every option accepted by the GetOptions
# call in this script is listed here.
sub Help {
    print STDERR "Process Omnipage XML output (concatenated results from all pages of a PDF file), and extract necessary information. Marking in the output detailed word-level info ### Page\\n## Para\\n# Line\\nword\\n### Table\\n### Figure\n";

    print STDERR "usage: $progname -h\t[invokes help]\n";
    print STDERR " $progname -in xmlFile -out outFile [-decode -allowEmptyLine -log]\n";
    print STDERR "Options:\n";
    print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
    print STDERR "\t-in\tOmnipage XML input file\n";
    print STDERR "\t-out\tOutput file\n";
    print STDERR "\t-decode: decode HTML entities and then output, to avoid double entity encoding later\n";
    print STDERR "\t-allowEmptyLine: keep lines whose text consists only of whitespace\n";
    print STDERR "\t-log: print progress and warning messages to STDERR\n";
}
44
-
# Command-line state. $isAllowEmpty and $isDebug are also read by the
# subroutines defined further down in this file.
my $QUIET = 0;
my $HELP = 0;
my $outFile = undef;
my $inFile = undef;

my $isDecode = 0;
my $isAllowEmpty = 0;
my $isDebug = 0;
$HELP = 1 unless GetOptions('in=s' => \$inFile,
                            'out=s' => \$outFile,
                            'decode' => \$isDecode,
                            'allowEmptyLine' => \$isAllowEmpty,
                            'log' => \$isDebug,
                            'h' => \$HELP,
                            'q' => \$QUIET);

# Both -in and -out are mandatory; anything else shows the usage text.
if ($HELP || !defined $inFile || !defined $outFile) {
    Help();
    exit(0);
}

if (!$QUIET) {
    License();
}

### Untaint ###
$inFile = untaintPath($inFile);
$outFile = untaintPath($outFile);
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
### End untaint ###

if ($isDebug) {
    print STDERR "\n# Processing file $inFile & output to $outFile\n";
}

# Accumulator for the markup; processFile() and processPara() append to it.
my $markupOutput = "";
processFile($inFile);

if ($isDecode) {
    $markupOutput = decode_entities($markupOutput);
}

# Write the result. A lexical handle is used, and print/close are checked
# because buffered write errors (e.g. disk full) only surface there.
open(my $out, ">:utf8", $outFile) or die "#Can't open file \"$outFile\": $!\n";
print {$out} $markupOutput or die "#Can't write to file \"$outFile\": $!\n";
close($out) or die "#Can't close file \"$outFile\": $!\n";
90
-
# Read the Omnipage XML file line by line and append "||| ..." marker lines
# to the file-level $markupOutput.
#
#   $inFile - path of the XML file to read
#
# Page, figure (<dd>), table and paragraph headers are emitted here; the
# lines collected between <para ...> and </para> are handed to processPara().
# Dies if the file does not exist or cannot be opened.
sub processFile {
    my ($inFile) = @_;

    if (!(-e $inFile)) { die "# $progname crash\t\tFile \"$inFile\" doesn't exist"; }
    open(my $in, "<:utf8", $inFile) or die "# $progname crash\t\tCan't open \"$inFile\": $!";

    my $isPara = 0;     # currently between <para ...> and </para>
    my $isTable = 0;    # currently between <table ...> and </table>
    my $text = "";      # raw lines of the paragraph being collected

    while (my $line = <$in>) {
        next if $line =~ /^\#/;     # skip comments
        chomp $line;
        $line =~ s/\cM$//;          # remove a trailing ^M character if any

        # Page marker; checked independently of the chain below.
        if ($line =~ /<theoreticalPage (.*)\/>/) {
            $markupOutput .= "||| Page $1\n";
        }

        ### pic ###
        if ($line =~ /^<dd (.*)>$/) {
            $markupOutput .= "||| Figure $1\n";
        }
        elsif ($line =~ /^<\/dd>$/) {
            # Nothing to emit; the branch exists so that a closing </dd>
            # is never appended to the paragraph text below.
        }

        ### Table ###
        elsif ($line =~ /^<table (.*)>$/) {
            $isTable = 1;
            $markupOutput .= "||| Table $1\n";
        }
        elsif ($line =~ /^<\/table>$/) {
            $isTable = 0;
        }

        ### Paragraph ###
        # Note: table processing must come before paragraph processing,
        # i.e. the order of the branches matters.
        elsif ($line =~ /^<para (.*)>$/) {
            $text .= $line . "\n";  # we need the header
            $isPara = 1;

            if ($isTable) {
                $markupOutput .= "||| ParaTable $1\n";
            } else {
                $markupOutput .= "||| Para $1\n";
            }
        }
        elsif ($line =~ /^<\/para>$/) {
            processPara($text);

            $isPara = 0;
            $text = "";
        }
        elsif ($isPara) {
            $text .= $line . "\n";
        }
    }
    close $in;
}
161
-
# Return the value of attribute $attr inside the attribute text of an XML tag.
#
#   $attrText - attribute part of a tag, e.g. 'l="1" forcedEOF="true"'
#   $attr     - attribute name to look up
#
# Returns the (non-empty) attribute value, or the string "none" when the
# attribute is absent or its value is empty.
sub getAttrValue {
    my ($attrText, $attr) = @_;

    # \Q..\E keeps regex metacharacters in $attr literal. The look-behind
    # stops $attr from matching as the tail of a longer attribute name
    # (e.g. "x" inside max="9"). [^"]+ keeps the value inside its own quotes.
    if ($attrText =~ /(?<![\w:.\-])\Q$attr\E="([^"]+)"/) {
        return $1;
    }

    return "none";
}
172
-
# Add $count to the tally kept for the value of attribute $attr.
#
#   $attrText - attribute part of a tag, e.g. 'fontFace="Arial" fontSize="1000"'
#   $attr     - attribute name to look up
#   $attrHash - hash reference mapping attribute value => accumulated count
#   $count    - amount to add for this occurrence
#
# Nothing is recorded when the attribute is absent from $attrText.
sub checkFontAttr {
    my ($attrText, $attr, $attrHash, $count) = @_;

    # \Q..\E keeps $attr literal; the look-behind prevents matching the tail
    # of a longer attribute name; [^"]+ keeps the value inside its quotes.
    if ($attrText =~ /(?<![\w:.\-])\Q$attr\E="([^"]+)"/) {
        my $attrValue = $1;

        if ($attrHash->{$attrValue}) {
            $attrHash->{$attrValue} += $count;
        } else {
            $attrHash->{$attrValue} = $count;
        }
    }
}
182
-
# Convert the raw XML lines of one paragraph into markup lines and append
# them to the file-level $markupOutput.
#
#   $inputText - newline-separated lines of the paragraph, starting with the
#                <para ...> header line
#
# For every <ln> a "||| Line <attrs>" header is buffered, followed by one
# "<word> <attrs>" line per word. The buffer is flushed to $markupOutput at
# </ln> when the line carries text (see the </ln> branch for the exact rule)
# and is discarded when the line is empty.
sub processPara {
    my ($inputText) = @_;

    my $isSpecialSpace = 0;     # a <space/> was seen between <tag ...> and </tag> of the same name
    my $isForcedEOF = "none";   # value of forcedEOF in the current <ln>, "none" if absent

    my $lnAttr;                 # attribute text of the current <ln>
    my $isLn = 0;               # inside <ln> ... </ln>
    my $wdAttr;                 # attribute text of the current <wd>
    my $wdText = "";            # text of a <wd> that is made of several runs
    my $isWd = 0;               # inside a multi-run <wd>

    my $text = "";              # plain text seen so far in the current line
    my $tmpMarkupOutput = "";   # markup buffered until the line is accepted

    my @lines = split(/\n/, $inputText);
    for my $i (0 .. $#lines) {
        my $line = $lines[$i];

        ## new ln
        if ($line =~ /^<ln (.+)>$/) {
            $lnAttr = $1;
            $isLn = 1;

            $tmpMarkupOutput .= "||| Line $lnAttr\n";
            $isForcedEOF = getAttrValue($lnAttr, "forcedEOF");
        }

        ## new run
        elsif ($line =~ /<run (.*)>$/) {
            if ($line =~ /^<wd (.*?)>/) {   # new wd, that consists of many runs
                $isWd = 1;
                $wdAttr = $1;
            }
        }

        ## wd
        elsif ($line =~ /^<wd (.+)?>(.+)<\/wd>$/) {
            $wdAttr = $1;
            my $word = $2;

            $word =~ s/\cM$//g;     # remove ^M character
            $tmpMarkupOutput .= "$word $wdAttr\n";

            ## add text
            $text .= "$word";
        }

        ## end wd
        elsif ($line =~ /^<\/wd>$/) {
            $isWd = 0;

            $tmpMarkupOutput .= "$wdText $wdAttr\n";
            $wdAttr = "";
            $wdText = "";
        }

        ## end run
        elsif ($line =~ /^(.*)<\/run>$/) {
            my $word = $1;

            ## add text
            if ($word ne "") {
                $word =~ s/\cM$//g; # remove ^M character

                # appears in the final result
                if ($isLn) { $text .= "$word"; }

                # internal record for a multi-run word
                if ($isWd) { $wdText .= "$word"; }
            }

            ## reset run
            $isSpecialSpace = 0;
        }

        ## end ln
        elsif ($line =~ /^<\/ln>$/) {
            # With -allowEmptyLine any non-empty text counts; otherwise the
            # text must contain at least one non-whitespace character.
            my $hasText = $isAllowEmpty ? ($text ne "") : ($text !~ /^\s*$/);

            if ($hasText) {
                if ($isForcedEOF eq "true"      # there's a forced EOL?
                    || !$isSpecialSpace) {      # not an empty line with space character
                    $markupOutput .= $tmpMarkupOutput;
                    $tmpMarkupOutput = "";
                    $text = "";
                }
                # Otherwise the buffer and text are kept and carry over
                # into the next line.
            } else {
                $tmpMarkupOutput = "";
            }

            ## reset ln
            $isLn = 0;
            $isForcedEOF = "none";
            $isSpecialSpace = 0;
        }

        ## nl newline signal
        elsif ($line =~ /^<nl orig=\"true\"\/>$/) {
            if (!$isLn && $isDebug) {
                print STDERR "#!!! Warning: found <nl orig=\"true\"\/> while not in tag <ln>: $line\n";
            }
        }

        ## space
        elsif ($line =~ /^<space\/>$/) {
            my $startTag = "";
            my $endTag = "";
            if ($i > 0 && $lines[$i-1] =~ /^<(.+?)\b.*/) {
                $startTag = $1;
            }

            if ($i < $#lines && $lines[$i+1] =~ /^<\/(.+)>/) {
                $endTag = $1;
            }

            # A space that is the only content between <tag ...> and </tag>.
            if ($startTag eq $endTag && $startTag ne "") {
                $isSpecialSpace = 1;
            }

            ## add text
            $text .= " ";
        }

        ## tab
        elsif ($line =~ /^<tab .*\/>$/) {
            ## add text
            $text .= "\t";
        }

        # <bullet ...> lines (and anything else) contribute nothing to the output.
    }
}
346
-
# Untaint a file path.
#
#   $path - path as received from the command line
#
# Returns the path captured from the whitelist pattern (word characters,
# '-', '_', '/' and '.'); dies with "Bad path" when any other character
# is present.
sub untaintPath {
    my ($path) = @_;

    die "Bad path \"$path\"\n" unless $path =~ /^([-_\/\w\.]*)$/;

    return $1;
}
358
-
# Untaint a general string such as a command line.
#
#   $s - string to check
#
# Returns the text captured from the whitelist pattern (word characters,
# space, and - @ ( ) , . /); dies with "Bad data in ..." otherwise.
sub untaint {
    my ($s) = @_;

    die "Bad data in $s" unless $s =~ /^([\w \-\@\(\),\.\/]+)$/;   # log this somewhere

    return $1;  # the captured copy is untainted
}
368
-
# Run a shell command after passing it through untaint().
#
#   $cmd - command line to run
#
# Echoes the command to STDERR when -log was given. Returns the status
# reported by system(); dies inside untaint() when $cmd is rejected.
sub execute {
    my ($cmd) = @_;

    print STDERR "Executing: $cmd\n" if $isDebug;

    return system(untaint($cmd));
}
377
-
# Build a temporary file name of the form YYYYMMDD-HHMMSS-<pid> from the
# local time and the current process id.
#
# The timestamp is formatted in-process with POSIX::strftime instead of
# spawning the external `date` program, so no shell or PATH lookup is needed.
sub newTmpFile {
    require POSIX;

    my $tmpFile = POSIX::strftime('%Y%m%d-%H%M%S', localtime()) . "-$$";
    return $tmpFile;
}
@@ -1,193 +0,0 @@
1
- #!/usr/bin/perl
2
- # Author: Do Hoang Nhat Huy <dcsdhnh@nus.edu.sg>, generated at Fri, 3 Dec 2010 14:36:00
3
- # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
4
- require 5.0;
5
- use strict;
6
-
7
- use FindBin;
8
- use Getopt::Long;
9
-
10
- # I do not know a better solution to find a lib path in -T mode.
11
- # So if you know a better solution, I'd be glad to hear.
12
- # See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
13
- my $path; # Path to Parscit binary directory
14
- BEGIN
15
- {
16
- if ($FindBin::Bin =~ /(.*)/) { $path = $1; }
17
- }
18
-
19
- use lib "$path/../lib";
20
-
21
- # Local libraries
22
- use Omni::Omnidoc;
23
- use ParsCit::Tr2crfpp;
24
- use ParsCit::PreProcess;
25
- # Dependencies
26
-
27
-
### USER customizable section
# Version string of this script.
my $version = "1.0";
# Script name with any leading directory stripped; shown in the usage text.
my ($progname) = $0 =~ m{([^/]+)$};
### END user customizable section
32
-
# Write the copyright banner to STDERR.
sub License
{
    my $banner = "# Copyright 2011 \251 by Do Hoang Nhat Huy\n";
    print STDERR $banner;
}
37
-
### HELP Sub-procedure
# Print the usage text to STDERR. The usage line lists exactly the options
# accepted by the GetOptions call in this script; -opt and -codec are
# optional because both have defaults.
sub Help
{
    print STDERR "Process Omnipage XML output (Reference Section Only) and extract text lines together with other XML information\n";
    print STDERR "usage: $progname -h\t[invokes help]\n";
    print STDERR " $progname -in xmlfile -out outfile [-opt option] [-codec codec] [-q]\n";
    print STDERR "Options:\n";
    print STDERR "\t-q \tQuiet Mode (don't echo license)\n";
    print STDERR "\t-in \tXML input from Omnipage\n";
    print STDERR "\t-out \tOutput file\n";
    print STDERR "\t-codec \tCodec of the input XML: utf-16 or utf-8. Default is utf-8\n";
    print STDERR "\t-opt \tOption: train (output is train file for crf++) or xml (output is xml features). Default is train\n";
}
51
-
# Command-line state
my $help = 0;
my $quite = 0;
my $infile = undef;
my $outfile = undef;
my $option = "train";
my $codec = "utf-8";

$help = 1 unless GetOptions('in=s' => \$infile,
                            'out=s' => \$outfile,
                            'opt=s' => \$option,
                            'codec=s' => \$codec,
                            'h' => \$help,
                            'q' => \$quite);

# Both -in and -out are mandatory; anything else shows the usage text.
if ($help || !defined $infile || !defined $outfile)
{
    Help();
    exit(0);
}

if (!$quite)
{
    License();
}

# Sanity check
if (($option ne "train") && ($option ne "xml"))
{
    die "Die: -opt must equal \"train\" or \"xml\".\n";
}

if (($codec ne "utf-8") && ($codec ne "utf-16"))
{
    die "Die: -codec must equal \"utf-8\" or \"utf-16\".\n";
}

# Untaint check
$infile = UntaintPath($infile);
$outfile = UntaintPath($outfile);

$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
# End untaint check

# MAIN
my $infile_utf8 = $infile . "-utf8";

# Read the converted copy when the input had to be transcoded; reading the
# original UTF-16 file through a :utf8 layer would produce garbage.
my $xml_file = $infile;
if ($codec eq "utf-16")
{
    Convert($infile, "UTF16", $infile_utf8, "UTF8");
    $xml_file = $infile_utf8;
}

# This is file scope, so failure must be reported with die; `return` is
# only legal inside a subroutine.
open(my $in, "<:utf8", $xml_file) or die "Could not open xml file " . $xml_file . ": " . $! . "\n";
my $xml = do { local $/; <$in> };
close $in;

# Cleanup
CleanUp(\$xml);

# New document
my $doc = Omni::Omnidoc->new();
$doc->set_raw($xml);

# Extract the reference portion from the XML
my ($start_ref, $end_ref, $rcite_text_from_xml) = ParsCit::PreProcess::findCitationTextXML($doc);

if ($option eq "train")
{
    # Prepare to split unmarked reference portion
    my $tmp_file = ParsCit::Tr2crfpp::prepDataUnmarked($doc, $start_ref, $end_ref);

    # Save the temporary file. File::Copy::move is used instead of a
    # shell "mv" command line so that spaces or shell metacharacters in
    # either path cannot alter the command.
    require File::Copy;
    File::Copy::move($tmp_file, $outfile)
        or die "Could not move " . $tmp_file . " to " . $outfile . ": " . $! . "\n";
}
else
{
    # TODO: the "xml" output mode has no implementation in this script.
    warn "Warning: -opt xml is not implemented; nothing was written to " . $outfile . "\n";
}
127
-
128
- # END
129
-
# Convert the input XML from one encoding to another with the external
# iconv program.
#
#   $from_file   - file to read
#   $from_encode - encoding name of $from_file as understood by iconv
#   $to_file     - file to write
#   $to_encode   - encoding name for $to_file as understood by iconv
#   $log         - accepted for compatibility; not used
#
# Returns the status reported by system() (0 on success).
sub Convert
{
    my ($from_file, $from_encode, $to_file, $to_encode, $log) = @_;

    # List form of system(): the arguments reach iconv directly without a
    # shell, so file names containing spaces or shell metacharacters are safe.
    my @cmd = ("iconv", "-f", $from_encode, "-t", $to_encode, $from_file, "-o", $to_file);

    # Transformation
    print STDERR "Executing: @cmd\n";
    return system(@cmd);
}
141
-
# Clean up the input XML in place and wrap it in a single <root> element.
#
#   $ref_xml - reference to the string holding the XML text
#
# Every XML declaration and every "<!--XML ..." generator comment line is
# removed (the input may be several documents concatenated), whether the
# line ends in LF or CRLF.
sub CleanUp
{
    my ($ref_xml) = @_;

    # Remove <?xml version="1.0" encoding="UTF-8"?>
    $$ref_xml =~ s/<\?xml.+?>\r?\n//g;
    # Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
    $$ref_xml =~ s/<\!\-\-XML.+?>\r?\n//g;
    # Add the root tag
    $$ref_xml = "<root>" . "\n" . $$ref_xml . "\n" . "</root>";
}
154
-
# Untaint a file path.
#
#   $path - path as received from the command line
#
# Returns the text captured from the whitelist pattern; dies with "Bad path"
# when the path is empty or contains a character outside the class.
# NOTE(review): the class includes \p{C} (control characters) and \p{P}
# (all punctuation), so it is very permissive - confirm this is intended.
sub UntaintPath
{
    my ($path) = @_;

    die "Bad path \"$path\"\n" unless $path =~ /^([-_:" \/\w\.%\p{C}\p{P}]+)$/;

    return $1;
}
170
-
# Untaint a general string such as a command line.
#
#   $s - string to check
#
# Returns the text captured from the whitelist pattern; dies with
# "Bad data in ..." when the string is empty or contains a character
# outside the class.
sub Untaint
{
    my ($s) = @_;

    die "Bad data in $s" unless $s =~ /^([\w \-\@\(\),\.\/>\p{C}\p{P}]+)$/;    # log this somewhere

    return $1;  # the captured copy is untainted
}
184
-
# Run a shell command, echoing it to STDERR first.
#
#   $cmd - command line handed to system()
#
# Returns the raw status from system() (0 on success). A failure is no
# longer silent: a warning describing it is emitted, and the status is
# still returned so callers may act on it.
sub Execute
{
    my ($cmd) = @_;
    print STDERR "Executing: $cmd\n";

    my $status = system($cmd);
    if ($status == -1)
    {
        warn "Execute: could not run \"$cmd\": $!\n";
    }
    elsif ($status & 127)
    {
        warn "Execute: \"$cmd\" died with signal " . ($status & 127) . "\n";
    }
    elsif ($status != 0)
    {
        warn "Execute: \"$cmd\" failed with exit status " . ($status >> 8) . "\n";
    }

    return $status;
}
191
-
192
-
193
-
@@ -1,93 +0,0 @@
package Omni::Config;

# Constant tables for the Omni library: Omnipage XML tag names, attribute
# names and object type names. Everything is a package variable so that it
# can be read as $Omni::Config::NAME (or %Omni::Config::NAME) from outside.

use strict;
use warnings;

# Global
# Name of the classes
our $ALG_NAME = "Omni";
# Version
our $ALG_VERSION = "110505";

# All Omnipage XML tags
our %omni_tag_list = (
    'DOCUMENT'    => 'document',
    'PAGE'        => 'page',
    'COLUMN'      => 'column',
    'DESC'        => 'description',
    'SRC'         => 'source',
    'LANGUAGE'    => 'language',
    'STYLE'       => 'style',
    'STYLE-TABLE' => 'styleTable',
    'THEO-PAGE'   => 'theoreticalPage',
    'BODY'        => 'body',
    'SECTION'     => 'section',
    'COL'         => 'column',
    'PARA'        => 'para',
    'LINE'        => 'ln',
    'WORD'        => 'wd',
    'SPACE'       => 'space',
    'RUN'         => 'run',
    'BULLET'      => 'bullet',
    'TABLE'       => 'table',
    'GRID'        => 'gridTable',
    'GRID-COL'    => 'gridCol',
    'GRID-ROW'    => 'gridRow',
    'CELL'        => 'cell',
    'BOTTOM-CELL' => 'bottomBorder',
    'TOP-CELL'    => 'topBorder',
    'LEFT-CELL'   => 'leftBorder',
    'RIGHT-CELL'  => 'rightBorder',
    'NEWLINE'     => 'nl',
    'TAB'         => 'tab',
    'DD'          => 'dd',
    'PICTURE'     => 'picture',
    'FRAME'       => 'frame',
);
# Reference to the tag table
our $tag_list = \%omni_tag_list;

# All Omnipage XML attributes
our %omni_att_list = (
    'ALIGN'      => 'alignment',
    'FONTFACE'   => 'fontFace',
    'FONTFAMILY' => 'fontFamily',
    'FONTPITCH'  => 'fontPitch',
    'FONTSIZE'   => 'fontSize',
    'UNDERLINE'  => 'underline',
    'SPACING'    => 'spacing',
    'SCALE'      => 'scale',
    'BOTTOM'     => 'b',
    'TOP'        => 't',
    'LEFT'       => 'l',
    'RIGHT'      => 'r',
    'LANGUAGE'   => 'language',
    'SUSCRIPT'   => 'subsuperscript',
    'BASELINE'   => 'baseline',
    'BOLD'       => 'bold',
    'ITALIC'     => 'italic',
    'SPACEB'     => 'spaceBefore',
    # These attributes usually go with the <dd> tag
    'BOTTOMDIST' => 'bottomDistance',
    'TOPDIST'    => 'topDistance',
    'LEFTDIST'   => 'leftDistance',
    'RIGHTDIST'  => 'rightDistance',
    # These attributes usually go with the <cell> tag
    'GROWFROM'   => 'gridRowFrom',
    'GROWTO'     => 'gridRowTill',
    'GCOLFROM'   => 'gridColFrom',
    'GCOLTO'     => 'gridColTill',
    'VALIGN'     => 'verticalAlignment',
);
# Reference to the attribute table
our $att_list = \%omni_att_list;

# All object types in the Omni library
our %omni_obj_list = (
    'OMNIDOC'   => 'document',
    'OMNIPAGE'  => 'page',
    'OMNICOL'   => 'column',
    'OMNIDD'    => 'dd',
    'OMNITABLE' => 'table',
    'OMNIIMG'   => 'image',
    'OMNIPARA'  => 'paragraph',
    'OMNILINE'  => 'line',
    'OMNIRUN'   => 'run',
    'OMNIWORD'  => 'word',
    'OMNIFRAME' => 'frame',
);
# Reference to the object type table
our $obj_list = \%omni_obj_list;

1;