biblicit 2.0.3 → 2.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,382 +0,0 @@
1
- #!/usr/bin/perl -wT
2
- # Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
3
-
4
- # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
5
-
6
- require 5.0;
7
- use strict;
8
- use Getopt::Long;
9
- use HTML::Entities;
10
-
11
- # I do not know a better solution to find a lib path in -T mode.
12
- # So if you know a better solution, I'd be glad to hear.
13
- # See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
14
- use FindBin;
15
- FindBin::again(); # to get correct path in case 2 scripts in different directories use FindBin
16
- my $path;
17
- BEGIN {
18
- if ($FindBin::Bin =~ /(.*)/) {
19
- $path = $1;
20
- }
21
- }
22
- use lib "$path/../../lib";
23
- use SectLabel::PreProcess;
24
-
25
### USER customizable section
# Program name shown in usage and error messages: the last path component of $0.
my ($progname) = ($0 =~ /([^\/]+)$/);
my $outputVersion = "1.0";
### END user customizable section
29
-
30
# Write the one-line copyright banner to STDERR.
sub License {
    my $banner = "# Copyright 2009 \251 by Luong Minh Thang\n";
    print STDERR $banner;
}
33
-
34
- ### HELP Sub-procedure
35
# Print the usage message to STDERR.
# The marker names listed here ("||| Page", "||| Para", ...) are the ones the
# script actually appends to its output.
sub Help {
    print STDERR "Process Omnipage XML output (concatenated results from all pages of a PDF file), and extract necessary information. Marking in the output detailed word-level info ||| Page\\n||| Para\\n||| Line\\nword\\n||| Table\\n||| Figure\n";

    print STDERR "usage: $progname -h\t[invokes help]\n";
    print STDERR " $progname -in xmlFile -out outFile [-decode -allowEmptyLine -log -q]\n";
    print STDERR "Options:\n";
    print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
    print STDERR "\t-in xmlFile: Omnipage XML input file\n";
    print STDERR "\t-out outFile: file the extracted markup is written to\n";
    print STDERR "\t-decode: decode HTML entities and then output, to avoid double entity encoding later\n";
    print STDERR "\t-allowEmptyLine: also keep lines that contain only whitespace\n";
    print STDERR "\t-log: print progress and warning messages to STDERR\n";
}
44
-
45
# Command-line state. $isAllowEmpty, $isDebug and $markupOutput are file-level
# lexicals that the subs defined further down read and update.
my $QUIET = 0;
my $HELP = 0;
my $outFile = undef;
my $inFile = undef;

my $isDecode = 0;
my $isAllowEmpty = 0;
my $isDebug = 0;
$HELP = 1 unless GetOptions('in=s' => \$inFile,
                            'out=s' => \$outFile,
                            'decode' => \$isDecode,
                            'allowEmptyLine' => \$isAllowEmpty,
                            'log' => \$isDebug,
                            'h' => \$HELP,
                            'q' => \$QUIET);

# Both -in and -out are mandatory; show usage when either is missing.
if ($HELP || !defined $inFile || !defined $outFile) {
    Help();
    exit(0);
}

if (!$QUIET) {
    License();
}

### Untaint ###
$inFile = untaintPath($inFile);
$outFile = untaintPath($outFile);
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
### End untaint ###

if ($isDebug) {
    print STDERR "\n# Processing file $inFile & output to $outFile\n";
}

# processFile() appends everything it extracts to $markupOutput.
my $markupOutput = "";
processFile($inFile);

if ($isDecode) {
    $markupOutput = decode_entities($markupOutput);
}

# Lexical handle; report the OS error on failure, and check close() because
# buffered write errors only surface there.
open(my $outFh, ">:utf8", $outFile) or die "#Can't open file \"$outFile\": $!\n";
print {$outFh} $markupOutput;
close($outFh) or die "#Can't close file \"$outFile\": $!\n";
90
-
91
# Read the Omnipage XML file line by line and append "||| ..." markers to the
# file-level $markupOutput.
#
#   $inFile - path of the XML file to read
#
# Page, figure (<dd>) and table tags produce a marker directly. The lines of
# each <para> ... </para> are collected and handed to processPara().
# Dies when the file does not exist or cannot be opened.
sub processFile {
    my ($inFile) = @_;

    if (!(-e $inFile)) { die "# $progname crash\t\tFile \"$inFile\" doesn't exist"; }
    open(my $inFh, "<:utf8", $inFile) or die "# $progname crash\t\tCan't open \"$inFile\": $!";

    my $isPara = 0;     # inside <para> ... </para>
    my $isTable = 0;    # inside <table> ... </table>
    my $text = "";      # lines collected for the current paragraph

    while (my $line = <$inFh>) {
        next if $line =~ /^\#/;     # skip comments
        chomp $line;
        $line =~ s/\cM$//;          # remove a trailing ^M (CR) if any

        # Page marker; checked independently of the tag chain below.
        if ($line =~ /<theoreticalPage (.*)\/>/) {
            $markupOutput .= "||| Page $1\n";
        }

        ### pic ###
        if ($line =~ /^<dd (.*)>$/) {
            $markupOutput .= "||| Figure $1\n";
        }
        elsif ($line =~ /^<\/dd>$/) {
            # Nothing to emit; the branch only keeps </dd> out of paragraph text.
        }

        ### Table ###
        elsif ($line =~ /^<table (.*)>$/) {
            $isTable = 1;
            $markupOutput .= "||| Table $1\n";
        }
        elsif ($line =~ /^<\/table>$/) {
            $isTable = 0;
        }

        ### Paragraph ###
        # Note: table tags are tested before paragraph tags; the order matters.
        elsif ($line =~ /^<para (.*)>$/) {
            $text .= $line . "\n";  # the header line is passed on as well
            $isPara = 1;
            $markupOutput .= $isTable ? "||| ParaTable $1\n" : "||| Para $1\n";
        }
        elsif ($line =~ /^<\/para>$/) {
            processPara($text);
            $isPara = 0;
            $text = "";
        }
        elsif ($isPara) {
            $text .= $line . "\n";
        }
    }
    close $inFh;
}
161
-
162
# Return the value of attribute $attr inside the attribute string $attrText
# (e.g. 'l="10" forcedEOF="true"'), or the string "none" when the attribute is
# absent or has an empty value.
#
# The attribute name is matched literally (\Q...\E) and must not be the tail
# of a longer name, so asking for "b" does not pick up xb="...".
sub getAttrValue {
    my ($attrText, $attr) = @_;

    my $value = "none";
    return $value unless defined $attrText && defined $attr;

    # [^"]+ stops at the closing quote, so an empty value can never make the
    # capture run on into the following attribute.
    if ($attrText =~ /(?<![\w:.-])\Q$attr\E="([^"]+)"/) {
        $value = $1;
    }

    return $value;
}
172
-
173
# Tally one attribute value: when $attrText contains $attr="value", add $count
# to $attrHash->{value} (starting from $count on first sight).
#
#   $attrText - attribute string, e.g. 'fontSize="1000" bold="true"'
#   $attr     - attribute name; matched literally and only as a whole name
#   $attrHash - hash reference mapping value => accumulated count
#   $count    - amount to add
sub checkFontAttr {
    my ($attrText, $attr, $attrHash, $count) = @_;

    if ($attrText =~ /(?<![\w:.-])\Q$attr\E="([^"]+)"/) {
        $attrHash->{$1} += $count;
    }
}
182
-
183
# Walk the lines of one paragraph and append its line/word markers to the
# file-level $markupOutput.
#
#   $inputText - the paragraph's XML lines joined with "\n"
#
# Markers are first gathered in a per-line buffer. At </ln> the buffer is
# committed to $markupOutput when the line has text and either carries
# forcedEOF="true" or did not end in a "special" space; a line without text
# has its buffer thrown away. Whatever is still buffered when the paragraph
# ends is not emitted.
# NOTE(review): an older comment listed "end of </para>" as a third
# end-of-line signal; this loop does not act on it - confirm whether that is
# intended.
sub processPara {
    my ($inputText) = @_;

    my $isSpecialSpace = 0;     # <space/> sandwiched between <X ...> and </X>
    my $isForcedEOF = "none";   # forcedEOF attribute of the current <ln>
    my $isLn = 0;               # inside <ln> ... </ln>
    my $isWd = 0;               # inside a <wd> that is made of several runs
    my $wdAttr;                 # attributes of the current <wd>
    my $wdText = "";            # text gathered for a multi-run <wd>
    my $text = "";              # visible text of the current line
    my $tmpMarkupOutput = "";   # markers buffered until the line is committed

    my @lines = split(/\n/, $inputText);
    for my $i (0 .. $#lines) {
        my $line = $lines[$i];

        ## new ln
        if ($line =~ /^<ln (.+)>$/) {
            my $lnAttr = $1;
            $isLn = 1;

            $tmpMarkupOutput .= "||| Line $lnAttr\n";
            $isForcedEOF = getAttrValue($lnAttr, "forcedEOF");
        }

        ## new run
        elsif ($line =~ /<run (.*)>$/) {
            if ($line =~ /^<wd (.*?)>/) {   # new wd, that consists of many runs
                $isWd = 1;
                $wdAttr = $1;
            }
        }

        ## wd: complete word on one line
        elsif ($line =~ /^<wd (.+)?>(.+)<\/wd>$/) {
            $wdAttr = $1;
            my $word = $2;

            $word =~ s/\cM$//g;             # remove ^M character
            $tmpMarkupOutput .= "$word $wdAttr\n";
            $text .= $word;
        }

        ## end wd: emit the text gathered from its runs
        elsif ($line =~ /^<\/wd>$/) {
            $isWd = 0;

            $tmpMarkupOutput .= "$wdText $wdAttr\n";
            $wdAttr = "";
            $wdText = "";
        }

        ## end run
        elsif ($line =~ /^(.*)<\/run>$/) {
            my $word = $1;

            if ($word ne "") {
                $word =~ s/\cM$//g;         # remove ^M character

                if ($isLn) { $text .= $word; }
                if ($isWd) { $wdText .= $word; }
            }

            $isSpecialSpace = 0;
        }

        ## end ln
        elsif ($line =~ /^<\/ln>$/) {
            # With -allowEmptyLine any non-empty text counts; otherwise the
            # text must contain a non-whitespace character.
            my $hasText = $isAllowEmpty ? ($text ne "") : ($text !~ /^\s*$/);

            if ($hasText) {
                if ($isForcedEOF eq "true" || !$isSpecialSpace) {
                    $markupOutput .= $tmpMarkupOutput;
                    $tmpMarkupOutput = "";
                    $text = "";
                }
                # else: keep buffer and text; they carry over to the next line
            } else {
                $tmpMarkupOutput = "";
            }

            ## reset ln
            $isLn = 0;
            $isForcedEOF = "none";
            $isSpecialSpace = 0;
        }

        ## nl newline signal
        elsif ($line =~ /^<nl orig=\"true\"\/>$/) {
            if (!$isLn && $isDebug) {
                print STDERR "#!!! Warning: found <nl orig=\"true\"\/> while not in tag <ln>: $line\n";
            }
        }

        ## space
        elsif ($line =~ /^<space\/>$/) {
            my $startTag = "";
            my $endTag = "";
            if ($i > 0 && $lines[$i-1] =~ /^<(.+?)\b.*/) {
                $startTag = $1;
            }

            if ($i < (scalar(@lines) - 1) && $lines[$i+1] =~ /^<\/(.+)>/) {
                $endTag = $1;
            }

            # Previous line opens the tag that the next line closes.
            if ($startTag eq $endTag && $startTag ne "") {
                $isSpecialSpace = 1;
            }

            $text .= " ";
        }

        ## tab
        elsif ($line =~ /^<tab .*\/>$/) {
            $text .= "\t";
        }

        ## <bullet ...> and every other line contribute nothing
    }
}
346
-
347
# Validate and untaint a file path.
#
# Accepts a non-empty string made only of word characters, '-', '_', '/' and
# '.', and returns the captured (untainted) copy. Dies with "Bad path ..." for
# anything else, including an empty or undefined path.
sub untaintPath {
    my ($path) = @_;

    if (defined $path && $path =~ /^([-_\/\w\.]+)$/) {
        $path = $1;
    } else {
        $path = "" unless defined $path;
        die "Bad path \"$path\"\n";
    }

    return $path;
}
358
-
359
# Untaint a command string: return the captured copy when it consists only of
# word characters, spaces and - @ ( ) , . / ; die otherwise.
sub untaint {
    my ($s) = @_;

    die "Bad data in $s" unless $s =~ /^([\w \-\@\(\),\.\/]+)$/;   # log this somewhere

    return $1;   # captured text is untainted
}
368
-
369
# Run a shell command after validating it with untaint().
#
#   $cmd - command line; echoed to STDERR first when -log is active
#
# Returns the status from system() (0 on success). A non-zero status is
# reported on STDERR rather than silently dropped.
sub execute {
    my ($cmd) = @_;
    if ($isDebug) {
        print STDERR "Executing: $cmd\n";
    }
    $cmd = untaint($cmd);

    my $status = system($cmd);
    if ($status != 0) {
        warn "# Command failed (status $status): $cmd\n";
    }
    return $status;
}
377
-
378
# Build a temporary-file name of the form YYYYMMDD-HHMMSS-<pid> from the local
# time and the current process id.
# Uses POSIX::strftime instead of shelling out to date(1), so no external
# program is involved.
sub newTmpFile {
    require POSIX;
    my $tmpFile = POSIX::strftime('%Y%m%d-%H%M%S', localtime()) . "-$$";
    return $tmpFile;
}
@@ -1,193 +0,0 @@
1
- #!/usr/bin/perl
2
- # Author: Do Hoang Nhat Huy <dcsdhnh@nus.edu.sg>, generated at Fri, 3 Dec 2010 14:36:00
3
- # Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
4
- require 5.0;
5
- use strict;
6
-
7
- use FindBin;
8
- use Getopt::Long;
9
-
10
- # I do not know a better solution to find a lib path in -T mode.
11
- # So if you know a better solution, I'd be glad to hear.
12
- # See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
13
- my $path; # Path to Parscit binary directory
14
- BEGIN
15
- {
16
- if ($FindBin::Bin =~ /(.*)/) { $path = $1; }
17
- }
18
-
19
- use lib "$path/../lib";
20
-
21
- # Local libraries
22
- use Omni::Omnidoc;
23
- use ParsCit::Tr2crfpp;
24
- use ParsCit::PreProcess;
25
- # Dependencies
26
-
27
-
28
### USER customizable section
my $version = "1.0";
# Program name shown in the usage text: the last path component of $0.
my ($progname) = ($0 =~ /([^\/]+)$/);
### END user customizable section
32
-
33
# Write the one-line copyright banner to STDERR.
sub License
{
    my $banner = "# Copyright 2011 \251 by Do Hoang Nhat Huy\n";
    print STDERR $banner;
}
37
-
38
- ### HELP Sub-procedure
39
# Print the usage message to STDERR.
# Only options that the script actually registers with GetOptions are listed.
sub Help
{
    print STDERR "Process Omnipage XML output (Reference Section Only) and extract text lines together with other XML information\n";
    print STDERR "usage: $progname -h\t[invokes help]\n";
    print STDERR " $progname -in xmlfile -out outfile [-opt option -codec codec -q]\n";
    print STDERR "Options:\n";
    print STDERR "\t-q \tQuiet Mode (don't echo license)\n";
    print STDERR "\t-in \tXML input from Omnipage\n";
    print STDERR "\t-out \tOutput file\n";
    print STDERR "\t-codec \tCodec of the input XML: utf-16 or utf-8. Default is utf-8\n";
    print STDERR "\t-opt \tOption: train (output is train file for crf++) or xml (output is xml features). Default is train\n";
}
51
-
52
# Command-line state.
my $help = 0;
my $quiet = 0;
my $infile = undef;
my $outfile = undef;
my $option = "train";
my $codec = "utf-8";

$help = 1 unless GetOptions('in=s' => \$infile,
                            'out=s' => \$outfile,
                            'opt=s' => \$option,
                            'codec=s' => \$codec,
                            'h' => \$help,
                            'q' => \$quiet);

# Both -in and -out are mandatory; show usage when either is missing.
if ($help || !defined $infile || !defined $outfile)
{
    Help();
    exit(0);
}

if (!$quiet)
{
    License();
}

# Sanity check
if (($option ne "train") && ($option ne "xml"))
{
    die "Die: -opt must equal \"train\" or \"xml\".\n";
}

if (($codec ne "utf-8") && ($codec ne "utf-16"))
{
    die "Die: -codec must equal \"utf-8\" or \"utf-16\".\n";
}

# Untaint check
$infile = UntaintPath($infile);
$outfile = UntaintPath($outfile);

$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
# End untaint check

# MAIN
# For UTF-16 input, convert first and then read the converted copy; reading
# the original file through the :utf8 layer would yield garbage.
my $infile_utf8 = $infile . "-utf8";
my $xml_file = $infile;
if ($codec eq "utf-16")
{
    Convert($infile, "UTF16", $infile_utf8, "UTF8");
    $xml_file = $infile_utf8;
}

# This is file-level code, so failure is reported with die ("return" is only
# legal inside a subroutine).
open(my $in_fh, "<:utf8", $xml_file) or die "Could not open xml file " . $xml_file . ": " . $! . "\n";
my $xml = do { local $/; <$in_fh> };
close $in_fh;

# Cleanup
CleanUp(\$xml);

# New document
my $doc = Omni::Omnidoc->new();
$doc->set_raw($xml);

# Extract the reference portion from the XML
my ($start_ref, $end_ref, $rcite_text_from_xml) = ParsCit::PreProcess::findCitationTextXML($doc);

if ($option eq "train")
{
    # Prepare to split unmarked reference portion
    my $tmp_file = ParsCit::Tr2crfpp::prepDataUnmarked($doc, $start_ref, $end_ref);

    # Save the temporary file. List-form system() bypasses the shell, so
    # characters such as ';' or '&' in a path cannot be interpreted as shell
    # syntax.
    print STDERR "Executing: mv $tmp_file $outfile\n";
    system("mv", $tmp_file, $outfile) == 0
        or die "Could not move " . $tmp_file . " to " . $outfile . " (status $?)\n";
}
else
{
    # -opt xml: no output is produced by this branch.
}
127
-
128
- # END
129
-
130
- # Convert the input XML
131
# Convert a file from one encoding to another with the external iconv program.
#
#   $from_file   - input path
#   $from_encode - source encoding name passed to iconv -f
#   $to_file     - output path passed to iconv -o
#   $to_encode   - target encoding name passed to iconv -t
#   $log         - accepted for interface compatibility; not used
#
# Returns the status from system() (0 on success). The command is run in list
# form, so the shell never parses the file names.
sub Convert
{
    my ($from_file, $from_encode, $to_file, $to_encode, $log) = @_;

    # Call iconv program
    my @cmd = ("iconv", "-f", $from_encode, "-t", $to_encode, $from_file, "-o", $to_file);

    # Transformation
    print STDERR "Executing: @cmd\n";
    my $status = system(@cmd);
    if ($status != 0)
    {
        warn "Command failed (status $status): @cmd\n";
    }
    return $status;
}
141
-
142
- # Clean up the input XML
143
# Strip the XML declaration and the "<!--XML ..." generator comment from the
# document, then wrap what is left in a single <root> element.
#
#   $ref_xml - reference to the XML string; modified in place
#
# Each pattern ends at its own terminator ("?>" or "-->"), so a tag that
# follows on the same line is preserved, and an optional CR before the line
# feed is removed as well.
sub CleanUp
{
    my ($ref_xml) = @_;

    # Remove <?xml version="1.0" encoding="UTF-8"?>
    $$ref_xml =~ s/<\?xml\b.*?\?>\r?\n?//gs;
    # Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
    $$ref_xml =~ s/<\!\-\-XML.*?\-\->\r?\n?//gs;
    # Add the root tag
    $$ref_xml = "<root>" . "\n" . $$ref_xml . "\n" . "</root>";
}
154
-
155
# Validate a file path and return the captured copy; die with "Bad path ..."
# when the path is empty or contains a character outside the accepted set.
# NOTE(review): the accepted set includes \p{C} (control characters) and
# \p{P} (all punctuation), which is very permissive - confirm this is intended.
sub UntaintPath
{
    my ($path) = @_;

    die "Bad path \"$path\"\n" unless $path =~ /^([-_:" \/\w\.%\p{C}\p{P}]+)$/;

    return $1;
}
170
-
171
# Validate a command string and return the captured copy; die with
# "Bad data in ..." when it is empty or contains a character outside the
# accepted set (word characters, space, - @ ( ) , . / >, control characters
# and punctuation).
sub Untaint
{
    my ($s) = @_;

    die "Bad data in $s" unless $s =~ /^([\w \-\@\(\),\.\/>\p{C}\p{P}]+)$/;   # log this somewhere

    return $1;   # captured text is untainted
}
184
-
185
# Echo a command line to STDERR and run it through system().
#
#   $cmd - command line handed to system() as a single string
#
# Returns the status from system() (0 on success). A non-zero status is
# reported on STDERR rather than silently dropped.
sub Execute
{
    my ($cmd) = @_;
    print STDERR "Executing: $cmd\n";

    my $status = system($cmd);
    if ($status != 0)
    {
        warn "Command failed (status $status): $cmd\n";
    }
    return $status;
}
191
-
192
-
193
-
@@ -1,93 +0,0 @@
1
package Omni::Config;

# Lookup tables for the Omni library: Omnipage XML tag names, attribute names
# and the object type names. Every variable is a package global declared with
# "our", so it is reachable as $Omni::Config::NAME.

use strict;
use warnings;

# Global
# Names of the classes
our $ALG_NAME = "Omni";
# Version
our $ALG_VERSION = "110505";

# All Omnipage XML tags
our %omni_tag_list = (
    'DOCUMENT'    => 'document',
    'PAGE'        => 'page',
    'COLUMN'      => 'column',
    'DESC'        => 'description',
    'SRC'         => 'source',
    'LANGUAGE'    => 'language',
    'STYLE'       => 'style',
    'STYLE-TABLE' => 'styleTable',
    'THEO-PAGE'   => 'theoreticalPage',
    'BODY'        => 'body',
    'SECTION'     => 'section',
    'COL'         => 'column',
    'PARA'        => 'para',
    'LINE'        => 'ln',
    'WORD'        => 'wd',
    'SPACE'       => 'space',
    'RUN'         => 'run',
    'BULLET'      => 'bullet',
    'TABLE'       => 'table',
    'GRID'        => 'gridTable',
    'GRID-COL'    => 'gridCol',
    'GRID-ROW'    => 'gridRow',
    'CELL'        => 'cell',
    'BOTTOM-CELL' => 'bottomBorder',
    'TOP-CELL'    => 'topBorder',
    'LEFT-CELL'   => 'leftBorder',
    'RIGHT-CELL'  => 'rightBorder',
    'NEWLINE'     => 'nl',
    'TAB'         => 'tab',
    'DD'          => 'dd',
    'PICTURE'     => 'picture',
    'FRAME'       => 'frame'
);
our $tag_list = \%omni_tag_list;

# All Omnipage XML attributes
our %omni_att_list = (
    'ALIGN'      => 'alignment',
    'FONTFACE'   => 'fontFace',
    'FONTFAMILY' => 'fontFamily',
    'FONTPITCH'  => 'fontPitch',
    'FONTSIZE'   => 'fontSize',
    'UNDERLINE'  => 'underline',
    'SPACING'    => 'spacing',
    'SCALE'      => 'scale',
    'BOTTOM'     => 'b',
    'TOP'        => 't',
    'LEFT'       => 'l',
    'RIGHT'      => 'r',
    'LANGUAGE'   => 'language',
    'SUSCRIPT'   => 'subsuperscript',
    'BASELINE'   => 'baseline',
    'BOLD'       => 'bold',
    'ITALIC'     => 'italic',
    'SPACEB'     => 'spaceBefore',
    # These attributes usually go with the <dd> tag
    'BOTTOMDIST' => 'bottomDistance',
    'TOPDIST'    => 'topDistance',
    'LEFTDIST'   => 'leftDistance',
    'RIGHTDIST'  => 'rightDistance',
    # These attributes usually go with the <cell> tag
    'GROWFROM'   => 'gridRowFrom',
    'GROWTO'     => 'gridRowTill',
    'GCOLFROM'   => 'gridColFrom',
    'GCOLTO'     => 'gridColTill',
    'VALIGN'     => 'verticalAlignment',
);
our $att_list = \%omni_att_list;

# All object types in the Omni library
our %omni_obj_list = (
    'OMNIDOC'   => 'document',
    'OMNIPAGE'  => 'page',
    'OMNICOL'   => 'column',
    'OMNIDD'    => 'dd',
    'OMNITABLE' => 'table',
    'OMNIIMG'   => 'image',
    'OMNIPARA'  => 'paragraph',
    'OMNILINE'  => 'line',
    'OMNIRUN'   => 'run',
    'OMNIWORD'  => 'word',
    'OMNIFRAME' => 'frame',
);
our $obj_list = \%omni_obj_list;

1;