biblicit 2.0.3 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +0 -2
- data/biblicit.gemspec +1 -1
- data/parscit/bin/citeExtract.pl +9 -161
- data/parscit/bin/sectExtract.pl +0 -14
- data/parscit/lib/ParsCit/Controller.pm +0 -59
- data/parscit/lib/ParsCit/PreProcess.pm +0 -4
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1 -7
- metadata +4 -24
- data/parscit/bin/sectLabel/processOmniXML.pl +0 -1427
- data/parscit/bin/sectLabel/processOmniXML_new.pl +0 -1025
- data/parscit/bin/sectLabel/processOmniXMLv2.pl +0 -1529
- data/parscit/bin/sectLabel/processOmniXMLv3.pl +0 -964
- data/parscit/bin/sectLabel/simplifyOmniXML.pl +0 -382
- data/parscit/bin/xml2train.pl +0 -193
- data/parscit/lib/Omni/Config.pm +0 -93
- data/parscit/lib/Omni/Omnicell.pm +0 -263
- data/parscit/lib/Omni/Omnicol.pm +0 -292
- data/parscit/lib/Omni/Omnidd.pm +0 -328
- data/parscit/lib/Omni/Omnidoc.pm +0 -153
- data/parscit/lib/Omni/Omniframe.pm +0 -223
- data/parscit/lib/Omni/Omniline.pm +0 -423
- data/parscit/lib/Omni/Omnipage.pm +0 -282
- data/parscit/lib/Omni/Omnipara.pm +0 -232
- data/parscit/lib/Omni/Omnirun.pm +0 -303
- data/parscit/lib/Omni/Omnitable.pm +0 -336
- data/parscit/lib/Omni/Omniword.pm +0 -162
- data/parscit/lib/Omni/Traversal.pm +0 -313
- data/parscit/lib/SectLabel/AAMatching.pm +0 -1949
|
@@ -1,382 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/perl -wT
|
|
2
|
-
# Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
|
|
3
|
-
|
|
4
|
-
# Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
|
|
5
|
-
|
|
6
|
-
require 5.0;
|
|
7
|
-
use strict;
|
|
8
|
-
use Getopt::Long;
|
|
9
|
-
use HTML::Entities;
|
|
10
|
-
|
|
11
|
-
# I do not know a better solution to find a lib path in -T mode.
|
|
12
|
-
# So if you know a better solution, I'd be glad to hear.
|
|
13
|
-
# See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
|
|
14
|
-
use FindBin;
|
|
15
|
-
FindBin::again(); # to get correct path in case 2 scripts in different directories use FindBin
|
|
16
|
-
my $path;
|
|
17
|
-
BEGIN {
|
|
18
|
-
if ($FindBin::Bin =~ /(.*)/) {
|
|
19
|
-
$path = $1;
|
|
20
|
-
}
|
|
21
|
-
}
|
|
22
|
-
use lib "$path/../../lib";
|
|
23
|
-
use SectLabel::PreProcess;
|
|
24
|
-
|
|
25
|
-
### USER customizable section
|
|
26
|
-
$0 =~ /([^\/]+)$/; my $progname = $1;
|
|
27
|
-
my $outputVersion = "1.0";
|
|
28
|
-
### END user customizable section
|
|
29
|
-
|
|
30
|
-
# Print the one-line copyright banner to STDERR.
# Called from the main flow unless -q (quiet) was given.
sub License {
    my $banner = "# Copyright 2009 \251 by Luong Minh Thang\n";
    print STDERR $banner;
}
|
|
33
|
-
|
|
34
|
-
### HELP Sub-procedure
|
|
35
|
-
# Print the usage message to STDERR.
# Reads the file-scoped $progname for the program name shown in the usage
# lines. Every option listed here is one that the GetOptions call in the main
# flow actually accepts.
sub Help {
    print STDERR "Process Omnipage XML output (concatenated results from all pages of a PDF file), and extract necessary information. Marking in the output detailed word-level info ### Page\\n## Para\\n# Line\\nword\\n### Table\\n### Figure\n";

    print STDERR "usage: $progname -h\t[invokes help]\n";
    print STDERR " $progname -in xmlFile -out outFile [-decode -allowEmptyLine -log]\n";
    print STDERR "Options:\n";
    print STDERR "\t-h\tShow this help\n";
    print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
    print STDERR "\t-in: Omnipage XML input file\n";
    print STDERR "\t-out: output file\n";
    print STDERR "\t-decode: decode HTML entities and then output, to avoid double entity encoding later\n";
    print STDERR "\t-allowEmptyLine: keep lines that contain only whitespace\n";
    print STDERR "\t-log: print progress and warning messages to STDERR\n";
    return;
}
|
|
44
|
-
|
|
45
|
-
# ---- Command-line handling and main flow ----
# These are file-scoped lexicals on purpose: processPara reads $isAllowEmpty
# and $isDebug, and the parsing subs append their result to $markupOutput.
my $QUIET = 0;
my $HELP = 0;
my $outFile = undef;
my $inFile = undef;

my $isDecode = 0;
my $isAllowEmpty = 0;
my $isDebug = 0;
$HELP = 1 unless GetOptions('in=s' => \$inFile,
                            'out=s' => \$outFile,
                            'decode' => \$isDecode,
                            'allowEmptyLine' => \$isAllowEmpty,
                            'log' => \$isDebug,
                            'h' => \$HELP,
                            'q' => \$QUIET);

# Both -in and -out are mandatory; anything else shows the usage text.
if ($HELP || !defined $inFile || !defined $outFile) {
    Help();
    exit(0);
}

if (!$QUIET) {
    License();
}

### Untaint ###
$inFile = untaintPath($inFile);
$outFile = untaintPath($outFile);
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
### End untaint ###

if ($isDebug) {
    print STDERR "\n# Processing file $inFile & output to $outFile\n";
}

# processFile() fills $markupOutput as a side effect.
my $markupOutput = "";
processFile($inFile);

if ($isDecode) {
    $markupOutput = decode_entities($markupOutput);
}

# Use a lexical handle and check print/close: buffered write errors (disk
# full, quota, ...) only surface there and were silently ignored before.
open(my $out_fh, ">:utf8", "$outFile") || die"#Can't open file \"$outFile\"\n";
print {$out_fh} "$markupOutput" or die "#Can't write to file \"$outFile\": $!\n";
close($out_fh) or die "#Can't close file \"$outFile\": $!\n";
|
|
90
|
-
|
|
91
|
-
# Read an Omnipage XML file line by line and append a simplified markup to the
# file-scoped $markupOutput.
#
# Emits "||| Page", "||| Figure", "||| Table", "||| Para" / "||| ParaTable"
# header lines (each followed by the tag's raw attribute text), and hands the
# accumulated lines of every <para> ... </para> to processPara().
#
# Parameters:
#   $inFile - path of the XML file to read
# Dies if the file does not exist or cannot be opened.
sub processFile {
    my ($inFile) = @_;

    if (!(-e $inFile)) { die "# $progname crash\t\tFile \"$inFile\" doesn't exist"; }
    # Lexical handle and loop variable: the bareword IF handle and the implicit
    # $_ used before were both process-wide globals.
    open(my $in_fh, "<:utf8", $inFile) || die "# $progname crash\t\tCan't open \"$inFile\"";

    my $isPara = 0;     # currently between <para ...> and </para>
    my $isTable = 0;    # currently between <table ...> and </table>
    my $text = "";      # raw lines of the paragraph being collected

    while (my $line = <$in_fh>) {
        if ($line =~ /^\#/) { next; }   # skip comments
        chomp $line;
        $line =~ s/\cM$//;              # remove a trailing ^M (CR) if any

        # Page marker. This test is independent of the chain below.
        if ($line =~ /<theoreticalPage (.*)\/>/) {
            $markupOutput .= "||| Page $1\n";
        }

        ### pic ###
        if ($line =~ /^<dd (.*)>$/) {
            $markupOutput .= "||| Figure $1\n";
        }
        elsif ($line =~ /^<\/dd>$/) {
            # Nothing to emit; the branch stays so that a closing </dd> is
            # never collected as paragraph text by the final branch.
        }

        ### Table ###
        elsif ($line =~ /^<table (.*)>$/) {
            $isTable = 1;
            $markupOutput .= "||| Table $1\n";
        }
        elsif ($line =~ /^<\/table>$/) {
            $isTable = 0;
        }

        ### Paragraph ###
        # Note: table processing should have higher priority than paragraph,
        # i.e. the order of these branches matters.
        elsif ($line =~ /^<para (.*)>$/) {
            $text .= $line."\n";        # we need the header
            $isPara = 1;

            if ($isTable) {
                $markupOutput .= "||| ParaTable $1\n";
            } else {
                $markupOutput .= "||| Para $1\n";
            }
        }
        elsif ($line =~ /^<\/para>$/) {
            processPara($text);

            $isPara = 0;
            $text = "";
        }
        elsif ($isPara) {
            $text .= $line."\n";
            next;
        }
    }
    close($in_fh);
    return;
}
|
|
161
|
-
|
|
162
|
-
# Extract the value of one attribute from a tag's attribute text.
#
# Parameters:
#   $attrText - attribute text, e.g. 'l="10" forcedEOF="true"'
#   $attr     - attribute name to look up
# Returns the (non-empty) quoted value, or the string "none" when the
# attribute is absent. If the name occurs more than once, the last occurrence
# wins because of the greedy leading ".*".
sub getAttrValue {
    my ($attrText, $attr) = @_;

    my $value = "none";
    # \Q...\E: the attribute name is data, not a pattern. Without it a name
    # such as "a.b" would also match "axb".
    if ($attrText =~ /^.*\Q$attr\E=\"(.+?)\".*$/) {
        $value = $1;
    }

    return $value;
}
|
|
172
|
-
|
|
173
|
-
# Tally occurrences of an attribute value.
#
# Parameters:
#   $attrText - attribute text of a tag, e.g. 'fontSize="1000" bold="true"'
#   $attr     - attribute name to look up
#   $attrHash - hash reference mapping attribute value => accumulated count
#   $count    - amount to add for the value found in $attrText
# When $attr is present, $attrHash->{value} is increased by $count (created on
# first sight). Nothing happens when the attribute is absent.
sub checkFontAttr {
    my ($attrText, $attr, $attrHash, $count) = @_;

    # \Q...\E keeps regex metacharacters in the attribute name literal.
    if ($attrText =~ /^.*\Q$attr\E=\"(.+?)\".*$/) {
        my $attrValue = $1;

        $attrHash->{$attrValue} += $count;
    }
}
|
|
182
|
-
|
|
183
|
-
# Convert the raw XML lines of one paragraph into simplified markup and append
# it to the file-scoped $markupOutput.
#
# For each <ln ...> the markup "||| Line <attrs>" is buffered, followed by one
# "<word> <attrs>" line per <wd>. The buffer is flushed to $markupOutput at
# </ln> when the line has text (see $isAllowEmpty) and either carries
# forcedEOF="true" or is not a "special space" line; whitespace-only lines are
# discarded unless $isAllowEmpty is set.
#
# Parameters:
#   $inputText - newline-joined lines of the paragraph (header included)
# Reads the file-scoped $isAllowEmpty and $isDebug, and calls getAttrValue().
#
# Compared with the previous version, only state that was written but never
# read has been removed (space/tab/bullet/run flags, run text, bold/italic
# placeholders). <bullet ...> lines are ignored, exactly as before.
# NOTE(review): a buffer that is still pending when the paragraph ends is
# dropped; the original comment mentioned end-of-</para> as a third
# end-of-line signal, but no such handling exists - confirm intent.
sub processPara {
    my ($inputText) = @_;

    my $isSpecialSpace = 0;     # <space/> directly enclosed by a matching tag pair
    my $isForcedEOF = "none";   # value of forcedEOF on the current <ln>
    my $isLn = 0;               # inside <ln> ... </ln>
    my $isWd = 0;               # inside a multi-run <wd> ... </wd>
    my $wdAttr;                 # attribute text of the current <wd>
    my $wdText = "";            # text collected for a multi-run <wd>

    my $text = "";              # plain text of the current line
    my $tmpMarkupOutput = "";   # markup buffered until </ln> decides its fate

    my @lines = split(/\n/, $inputText);
    for my $i (0 .. $#lines) {
        my $line = $lines[$i];

        ## new ln
        if ($line =~ /^<ln (.+)>$/) {
            my $lnAttr = $1;
            $isLn = 1;

            $tmpMarkupOutput .= "||| Line $lnAttr\n";
            $isForcedEOF = getAttrValue($lnAttr, "forcedEOF");
        }

        ## new run
        elsif ($line =~ /<run (.*)>$/) {
            if ($line =~ /^<wd (.*?)>/) {   # new wd, that consists of many runs
                $isWd = 1;
                $wdAttr = $1;
            }
        }

        ## wd (complete word on one line)
        elsif ($line =~ /^<wd (.+)?>(.+)<\/wd>$/) {
            $wdAttr = $1;
            my $word = $2;

            $word =~ s/\cM$//g;             # remove ^M character
            $tmpMarkupOutput .= "$word $wdAttr\n";

            ## add text
            $text .= "$word";
        }

        ## end wd
        elsif ($line =~ /^<\/wd>$/) {
            $isWd = 0;

            $tmpMarkupOutput .= "$wdText $wdAttr\n";
            $wdAttr = "";
            $wdText = "";
        }

        ## end run
        elsif ($line =~ /^(.*)<\/run>$/) {
            my $word = $1;

            ## add text
            if ($word ne "") {
                $word =~ s/\cM$//g;         # remove ^M character

                # appears in the final result
                if ($isLn) { $text .= "$word"; }

                # collected for the enclosing multi-run word
                if ($isWd) { $wdText .= "$word"; }
            }

            $isSpecialSpace = 0;
        }

        ## end ln
        elsif ($line =~ /^<\/ln>$/) {
            if ((!$isAllowEmpty && $text !~ /^\s*$/)
                || ($isAllowEmpty && $text ne "")) {
                if ($isForcedEOF eq "true"  # there's a forced EOL?
                    || (!$isSpecialSpace)   # not an empty line with space character
                   ) {
                    $markupOutput .= $tmpMarkupOutput;
                    $tmpMarkupOutput = "";
                    $text = "";
                }
                # otherwise both the buffer and $text carry over to the next <ln>
            } else {
                # Line rejected: drop its markup. $text is deliberately left
                # as is, matching the previous behaviour.
                $tmpMarkupOutput = "";
            }

            ## reset ln
            $isLn = 0;
            $isForcedEOF = "none";
            $isSpecialSpace = 0;
        }

        ## nl newline signal: only checked for being inside a <ln>
        elsif ($line =~ /^<nl orig=\"true\"\/>$/) {
            if (!$isLn && $isDebug) {
                print STDERR "#!!! Warning: found <nl orig=\"true\"\/> while not in tag <ln>: $line\n";
            }
        }

        ## space
        elsif ($line =~ /^<space\/>$/) {
            my $startTag = "";
            my $endTag = "";
            if ($i > 0 && $lines[$i-1] =~ /^<(.+?)\b.*/) {
                $startTag = $1;
            }

            if ($i < (scalar(@lines) - 1) && $lines[$i+1] =~ /^<\/(.+)>/) {
                $endTag = $1;
            }

            # <x ...> <space/> </x>: the space is the only content of its tag
            if ($startTag eq $endTag && $startTag ne "") {
                $isSpecialSpace = 1;
            }

            ## add text
            $text .= " ";
        }

        ## tab
        elsif ($line =~ /^<tab .*\/>$/) {
            ## add text
            $text .= "\t";
        }
    }
    return;
}
|
|
346
|
-
|
|
347
|
-
# Validate a file path against an allow-list and return an untainted copy.
#
# Accepts only word characters, "-", "_", "/" and "." (the empty string also
# passes). Dies with 'Bad path "<path>"' for anything else.
sub untaintPath {
    my ($path) = @_;

    $path =~ /^([-_\/\w\.]*)$/
        or die "Bad path \"$path\"\n";

    # Returning the capture rather than the argument is what clears the taint
    # flag under -T.
    return $1;
}
|
|
358
|
-
|
|
359
|
-
# Validate a general string (e.g. a command line) against an allow-list and
# return an untainted copy.
#
# Accepts one or more of: word characters, space, "-", "@", "(", ")", ",",
# "." and "/". Dies with "Bad data in <string>" otherwise.
sub untaint {
    my ($s) = @_;

    $s =~ /^([\w \-\@\(\),\.\/]+)$/
        or die "Bad data in $s"; # log this somewhere

    return $1; # the captured copy is untainted
}
|
|
368
|
-
|
|
369
|
-
# Run a shell command after validating it with untaint().
# Echoes the command to STDERR first when the file-scoped $isDebug is set.
# Returns whatever system() returns.
sub execute {
    my ($cmd) = @_;

    print STDERR "Executing: $cmd\n" if $isDebug;

    my $clean_cmd = untaint($cmd);
    return system($clean_cmd);
}
|
|
377
|
-
|
|
378
|
-
# Build a temporary-file name of the form YYYYMMDD-HHMMSS-<pid>.
#
# The timestamp is local time, the same as the `date` command this replaces.
# It is formatted in-process with POSIX::strftime, so no shell or `date`
# subprocess is needed and the result does not depend on PATH.
sub newTmpFile {
    require POSIX;  # core module; loaded on demand because only this sub needs it

    my $tmpFile = POSIX::strftime('%Y%m%d-%H%M%S', localtime()) . "-$$";
    return $tmpFile;
}
|
data/parscit/bin/xml2train.pl
DELETED
|
@@ -1,193 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/perl
|
|
2
|
-
# Author: Do Hoang Nhat Huy <dcsdhnh@nus.edu.sg>, generated at Fri, 3 Dec 2010 14:36:00
|
|
3
|
-
# Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
|
|
4
|
-
require 5.0;
|
|
5
|
-
use strict;
|
|
6
|
-
|
|
7
|
-
use FindBin;
|
|
8
|
-
use Getopt::Long;
|
|
9
|
-
|
|
10
|
-
# I do not know a better solution to find a lib path in -T mode.
|
|
11
|
-
# So if you know a better solution, I'd be glad to hear.
|
|
12
|
-
# See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
|
|
13
|
-
my $path; # Path to Parscit binary directory
|
|
14
|
-
BEGIN
|
|
15
|
-
{
|
|
16
|
-
if ($FindBin::Bin =~ /(.*)/) { $path = $1; }
|
|
17
|
-
}
|
|
18
|
-
|
|
19
|
-
use lib "$path/../lib";
|
|
20
|
-
|
|
21
|
-
# Local libraries
|
|
22
|
-
use Omni::Omnidoc;
|
|
23
|
-
use ParsCit::Tr2crfpp;
|
|
24
|
-
use ParsCit::PreProcess;
|
|
25
|
-
# Dependencies
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
### USER customizable section
|
|
29
|
-
my $version = "1.0";
|
|
30
|
-
$0 =~ /([^\/]+)$/; my $progname = $1;
|
|
31
|
-
### END user customizable section
|
|
32
|
-
|
|
33
|
-
# Print the one-line copyright banner to STDERR.
# Called from the main flow unless -q was given.
sub License
{
    my $banner = "# Copyright 2011 \251 by Do Hoang Nhat Huy\n";
    print STDERR $banner;
}
|
|
37
|
-
|
|
38
|
-
### HELP Sub-procedure
|
|
39
|
-
# Print the usage message to STDERR.
# Reads the file-scoped $progname. The usage line lists only options that the
# GetOptions call in the main flow accepts: the old text advertised "-app",
# which is not defined, and showed -opt as mandatory although it has a default.
sub Help
{
    print STDERR "Process Omnipage XML output (Reference Section Only) and extract text lines together with other XML information\n";
    print STDERR "usage: $progname -h\t[invokes help]\n";
    print STDERR " $progname -in xmlfile -out outfile [-opt option] [-codec codec] [-q]\n";
    print STDERR "Options:\n";
    print STDERR "\t-q \tQuiet Mode (don't echo license)\n";
    print STDERR "\t-in \tXML input from Omnipage\n";
    print STDERR "\t-out \tOutput file\n";
    print STDERR "\t-codec \tCodec of the input XML: utf-16 or utf-8. Default is utf-8\n";
    print STDERR "\t-opt \tOption: train (output is train file for crf++) or xml (output is xml features). Default is train\n";
    return;
}
|
|
51
|
-
|
|
52
|
-
# ---- Command-line handling and main flow ----
my $help = 0;
my $quite = 0;          # "quiet" flag (-q); historical spelling kept
my $infile = undef;
my $outfile = undef;
my $option = "train";
my $codec = "utf-8";

$help = 1 unless GetOptions('in=s' => \$infile,
                            'out=s' => \$outfile,
                            'opt=s' => \$option,
                            'codec=s' => \$codec,
                            'h' => \$help,
                            'q' => \$quite);

# -in and -out are mandatory; anything else shows the usage text.
if ($help || !defined $infile || !defined $outfile)
{
    Help();
    exit(0);
}

if (!$quite)
{
    License();
}

# Sanity check
if (($option ne "train") && ($option ne "xml"))
{
    die "Die: -opt must equal \"train\" or \"xml\".\n";
}

if (($codec ne "utf-8") && ($codec ne "utf-16"))
{
    die "Die: -codec must equal \"utf-8\" or \"utf-16\".\n";
}

# Untaint check
$infile = UntaintPath($infile);
$outfile = UntaintPath($outfile);

$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
# End untaint check

# MAIN
my $infile_utf8 = $infile . "-utf8";
my $xml_source = $infile;
if ($codec eq "utf-16")
{
    Convert($infile, "UTF16", $infile_utf8, "UTF8");
    # Read the converted copy. The unconverted UTF-16 file used to be opened
    # with a :utf8 layer, which made the conversion pointless.
    $xml_source = $infile_utf8;
}

# This is file top level, so failure must be reported with die: a "return"
# here is itself a runtime error ("Can't return outside a subroutine").
open(my $in_fh, "<:utf8", $xml_source)
    or die "Could not open xml file " . $xml_source . ": " . $! . "\n";
my $xml = do { local $/; <$in_fh> };
close $in_fh;

# Cleanup
CleanUp(\$xml);

# New document
my $doc = Omni::Omnidoc->new();
$doc->set_raw($xml);

# Extract the reference portion from the XML
my ($start_ref, $end_ref, $rcite_text_from_xml) = ParsCit::PreProcess::findCitationTextXML($doc);

if ($option eq "train")
{
    # Prepare to split unmarked reference portion
    my $tmp_file = ParsCit::Tr2crfpp::prepDataUnmarked($doc, $start_ref, $end_ref);

    # Save the temporary file. List-form system() bypasses the shell, so an
    # output path containing spaces (which UntaintPath accepts) is passed to
    # mv as a single argument.
    print STDERR "Executing: mv $tmp_file $outfile\n";
    system("mv", $tmp_file, $outfile);
}
else
{
    # -opt xml: no output is produced by the code in this file.
}
|
|
127
|
-
|
|
128
|
-
# END
|
|
129
|
-
|
|
130
|
-
# Convert the input XML
|
|
131
|
-
# Convert a file from one encoding to another with the external iconv program.
#
# Parameters:
#   $from_file   - input file path
#   $from_encode - source encoding name as understood by iconv (e.g. "UTF16")
#   $to_file     - output file path
#   $to_encode   - target encoding name (e.g. "UTF8")
#   $log         - accepted for interface compatibility; not used
# Returns the status value of system().
#
# The command is run with list-form system(), which bypasses the shell. File
# names with spaces or shell punctuation (both accepted by UntaintPath) are
# therefore passed to iconv verbatim instead of being split or interpreted.
sub Convert
{
    my ($from_file, $from_encode, $to_file, $to_encode, $log) = @_;

    # Call iconv program
    my @cmd = ("iconv", "-f", $from_encode, "-t", $to_encode, $from_file, "-o", $to_file);

    # Transformation (echoed first, in the same form Execute() prints)
    print STDERR "Executing: @cmd\n";
    return system(@cmd);
}
|
|
141
|
-
|
|
142
|
-
# Clean up the input XML
|
|
143
|
-
# Strip the XML declaration and the Omnipage generator comment from a document
# and wrap what remains in a single <root> element. Modifies the string in
# place.
#
# Parameters:
#   $ref_xml - reference to the XML string
sub CleanUp
{
    my ($ref_xml) = @_;

    my @unwanted = (
        qr/<\?xml.+?>\n/,       # <?xml version="1.0" encoding="UTF-8"?>
        qr/<\!\-\-XML.+?>\n/,   # <!--XML document generated using OCR technology from ScanSoft, Inc.-->
    );
    for my $pattern (@unwanted)
    {
        $$ref_xml =~ s/$pattern//g;
    }

    # Add the root tag
    $$ref_xml = join("\n", "<root>", $$ref_xml, "</root>");
}
|
|
154
|
-
|
|
155
|
-
# Validate a file path and return an untainted copy.
#
# Accepts one or more of: word characters, "-", "_", ":", '"', space, "/",
# ".", "%", plus any character in the Unicode "Other" (\p{C}) or Punctuation
# (\p{P}) categories. Dies with 'Bad path "<path>"' otherwise.
sub UntaintPath
{
    my ($path) = @_;

    $path =~ /^([-_:" \/\w\.%\p{C}\p{P}]+)$/
        or die "Bad path \"$path\"\n";

    # The captured copy, not the argument, is what counts as untainted.
    return $1;
}
|
|
170
|
-
|
|
171
|
-
# Validate a general string (e.g. a command line) and return an untainted
# copy.
#
# Accepts one or more of: word characters, space, "-", "@", "(", ")", ",",
# ".", "/", ">", plus any character in the Unicode "Other" (\p{C}) or
# Punctuation (\p{P}) categories. Dies with "Bad data in <string>" otherwise.
sub Untaint
{
    my ($s) = @_;

    $s =~ /^([\w \-\@\(\),\.\/>\p{C}\p{P}]+)$/
        or die "Bad data in $s"; # log this somewhere

    return $1; # the captured copy is untainted
}
|
|
184
|
-
|
|
185
|
-
# Echo a command line to STDERR, then run it with system().
# Returns the status value of system().
sub Execute
{
    my ($command) = @_;

    print STDERR "Executing: $command\n";
    return system($command);
}
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
data/parscit/lib/Omni/Config.pm
DELETED
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
package Omni::Config;

# Shared constants for the Omni library: the Omnipage XML tag names, the
# attribute names, and the object type names, each keyed by a symbolic name.
#
# Everything is declared with "our", so the variables remain package globals
# reachable as $Omni::Config::tag_list etc., while the file itself compiles
# under strict and warnings.

use strict;
use warnings;

# Global
# Name of the algorithm
our $ALG_NAME = "Omni";
# Version
our $ALG_VERSION = "110505";

# All Omnipage XML tags
our %omni_tag_list = (
    'DOCUMENT'      => 'document',
    'PAGE'          => 'page',
    'COLUMN'        => 'column',
    'DESC'          => 'description',
    'SRC'           => 'source',
    'LANGUAGE'      => 'language',
    'STYLE'         => 'style',
    'STYLE-TABLE'   => 'styleTable',
    'THEO-PAGE'     => 'theoreticalPage',
    'BODY'          => 'body',
    'SECTION'       => 'section',
    'COL'           => 'column',
    'PARA'          => 'para',
    'LINE'          => 'ln',
    'WORD'          => 'wd',
    'SPACE'         => 'space',
    'RUN'           => 'run',
    'BULLET'        => 'bullet',
    'TABLE'         => 'table',
    'GRID'          => 'gridTable',
    'GRID-COL'      => 'gridCol',
    'GRID-ROW'      => 'gridRow',
    'CELL'          => 'cell',
    'BOTTOM-CELL'   => 'bottomBorder',
    'TOP-CELL'      => 'topBorder',
    'LEFT-CELL'     => 'leftBorder',
    'RIGHT-CELL'    => 'rightBorder',
    'NEWLINE'       => 'nl',
    'TAB'           => 'tab',
    'DD'            => 'dd',
    'PICTURE'       => 'picture',
    'FRAME'         => 'frame'
);
our $tag_list = \%omni_tag_list;

# All Omnipage XML attributes
our %omni_att_list = (
    'ALIGN'         => 'alignment',
    'FONTFACE'      => 'fontFace',
    'FONTFAMILY'    => 'fontFamily',
    'FONTPITCH'     => 'fontPitch',
    'FONTSIZE'      => 'fontSize',
    'UNDERLINE'     => 'underline',
    'SPACING'       => 'spacing',
    'SCALE'         => 'scale',
    'BOTTOM'        => 'b',
    'TOP'           => 't',
    'LEFT'          => 'l',
    'RIGHT'         => 'r',
    'LANGUAGE'      => 'language',
    'SUSCRIPT'      => 'subsuperscript',
    'BASELINE'      => 'baseline',
    'BOLD'          => 'bold',
    'ITALIC'        => 'italic',
    'SPACEB'        => 'spaceBefore',
    # These attributes usually go with the <dd> tag
    'BOTTOMDIST'    => 'bottomDistance',
    'TOPDIST'       => 'topDistance',
    'LEFTDIST'      => 'leftDistance',
    'RIGHTDIST'     => 'rightDistance',
    # These attributes usually go with the <cell> tag
    'GROWFROM'      => 'gridRowFrom',
    'GROWTO'        => 'gridRowTill',
    'GCOLFROM'      => 'gridColFrom',
    'GCOLTO'        => 'gridColTill',
    'VALIGN'        => 'verticalAlignment',
);
our $att_list = \%omni_att_list;

# All object types in the Omni library
our %omni_obj_list = (
    'OMNIDOC'       => 'document',
    'OMNIPAGE'      => 'page',
    'OMNICOL'       => 'column',
    'OMNIDD'        => 'dd',
    'OMNITABLE'     => 'table',
    'OMNIIMG'       => 'image',
    'OMNIPARA'      => 'paragraph',
    'OMNILINE'      => 'line',
    'OMNIRUN'       => 'run',
    'OMNIWORD'      => 'word',
    'OMNIFRAME'     => 'frame',
);
our $obj_list = \%omni_obj_list;

1;
|