biblicit 2.0.3 → 2.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +0 -2
- data/biblicit.gemspec +1 -1
- data/parscit/bin/citeExtract.pl +9 -161
- data/parscit/bin/sectExtract.pl +0 -14
- data/parscit/lib/ParsCit/Controller.pm +0 -59
- data/parscit/lib/ParsCit/PreProcess.pm +0 -4
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1 -7
- metadata +4 -24
- data/parscit/bin/sectLabel/processOmniXML.pl +0 -1427
- data/parscit/bin/sectLabel/processOmniXML_new.pl +0 -1025
- data/parscit/bin/sectLabel/processOmniXMLv2.pl +0 -1529
- data/parscit/bin/sectLabel/processOmniXMLv3.pl +0 -964
- data/parscit/bin/sectLabel/simplifyOmniXML.pl +0 -382
- data/parscit/bin/xml2train.pl +0 -193
- data/parscit/lib/Omni/Config.pm +0 -93
- data/parscit/lib/Omni/Omnicell.pm +0 -263
- data/parscit/lib/Omni/Omnicol.pm +0 -292
- data/parscit/lib/Omni/Omnidd.pm +0 -328
- data/parscit/lib/Omni/Omnidoc.pm +0 -153
- data/parscit/lib/Omni/Omniframe.pm +0 -223
- data/parscit/lib/Omni/Omniline.pm +0 -423
- data/parscit/lib/Omni/Omnipage.pm +0 -282
- data/parscit/lib/Omni/Omnipara.pm +0 -232
- data/parscit/lib/Omni/Omnirun.pm +0 -303
- data/parscit/lib/Omni/Omnitable.pm +0 -336
- data/parscit/lib/Omni/Omniword.pm +0 -162
- data/parscit/lib/Omni/Traversal.pm +0 -313
- data/parscit/lib/SectLabel/AAMatching.pm +0 -1949
@@ -1,382 +0,0 @@
|
|
1
|
-
#!/usr/bin/perl -wT
|
2
|
-
# Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
|
3
|
-
|
4
|
-
# Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
|
5
|
-
|
6
|
-
require 5.0;
|
7
|
-
use strict;
|
8
|
-
use Getopt::Long;
|
9
|
-
use HTML::Entities;
|
10
|
-
|
11
|
-
# I do not know a better solution to find a lib path in -T mode.
|
12
|
-
# So if you know a better solution, I'd be glad to hear.
|
13
|
-
# See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
|
14
|
-
use FindBin;
|
15
|
-
FindBin::again(); # to get correct path in case 2 scripts in different directories use FindBin
|
16
|
-
my $path;
|
17
|
-
BEGIN {
|
18
|
-
if ($FindBin::Bin =~ /(.*)/) {
|
19
|
-
$path = $1;
|
20
|
-
}
|
21
|
-
}
|
22
|
-
use lib "$path/../../lib";
|
23
|
-
use SectLabel::PreProcess;
|
24
|
-
|
25
|
-
### USER customizable section
|
26
|
-
$0 =~ /([^\/]+)$/; my $progname = $1;
|
27
|
-
my $outputVersion = "1.0";
|
28
|
-
### END user customizable section
|
29
|
-
|
30
|
-
sub License {
|
31
|
-
print STDERR "# Copyright 2009 \251 by Luong Minh Thang\n";
|
32
|
-
}
|
33
|
-
|
34
|
-
### HELP Sub-procedure
|
35
|
-
sub Help {
|
36
|
-
print STDERR "Process Omnipage XML output (concatenated results fromm all pages of a PDF file), and extract necessary information. Marking in the output detailed word-level info ### Page\\n## Para\\n# Line\\nword\\n### Table\\n### Figure\n";
|
37
|
-
|
38
|
-
print STDERR "usage: $progname -h\t[invokes help]\n";
|
39
|
-
print STDERR " $progname -in xmlFile -out outFile [-decode -allowEmptyLine -log]\n";
|
40
|
-
print STDERR "Options:\n";
|
41
|
-
print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
|
42
|
-
print STDERR "\t-decode: decode HTML entities and then output, to avoid double entity encoding later\n";
|
43
|
-
}
|
44
|
-
|
45
|
-
my $QUIET = 0;
|
46
|
-
my $HELP = 0;
|
47
|
-
my $outFile = undef;
|
48
|
-
my $inFile = undef;
|
49
|
-
|
50
|
-
my $isDecode = 0;
|
51
|
-
my $isAllowEmpty = 0;
|
52
|
-
my $isDebug = 0;
|
53
|
-
$HELP = 1 unless GetOptions('in=s' => \$inFile,
|
54
|
-
'out=s' => \$outFile,
|
55
|
-
'decode' => \$isDecode,
|
56
|
-
'allowEmptyLine' => \$isAllowEmpty,
|
57
|
-
'log' => \$isDebug,
|
58
|
-
'h' => \$HELP,
|
59
|
-
'q' => \$QUIET);
|
60
|
-
|
61
|
-
if ($HELP || !defined $inFile || !defined $outFile) {
|
62
|
-
Help();
|
63
|
-
exit(0);
|
64
|
-
}
|
65
|
-
|
66
|
-
if (!$QUIET) {
|
67
|
-
License();
|
68
|
-
}
|
69
|
-
|
70
|
-
### Untaint ###
|
71
|
-
$inFile = untaintPath($inFile);
|
72
|
-
$outFile = untaintPath($outFile);
|
73
|
-
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
|
74
|
-
### End untaint ###
|
75
|
-
|
76
|
-
if($isDebug){
|
77
|
-
print STDERR "\n# Processing file $inFile & output to $outFile\n";
|
78
|
-
}
|
79
|
-
|
80
|
-
my $markupOutput = "";
|
81
|
-
processFile($inFile);
|
82
|
-
|
83
|
-
if($isDecode){
|
84
|
-
$markupOutput = decode_entities($markupOutput);
|
85
|
-
}
|
86
|
-
|
87
|
-
open(OF, ">:utf8", "$outFile") || die"#Can't open file \"$outFile\"\n";
|
88
|
-
print OF "$markupOutput";
|
89
|
-
close OF;
|
90
|
-
|
91
|
-
sub processFile {
|
92
|
-
my ($inFile) = @_;
|
93
|
-
|
94
|
-
if (!(-e $inFile)) { die "# $progname crash\t\tFile \"$inFile\" doesn't exist"; }
|
95
|
-
open (IF, "<:utf8", $inFile) || die "# $progname crash\t\tCan't open \"$inFile\"";
|
96
|
-
|
97
|
-
my $isPara = 0;
|
98
|
-
my $isTable = 0;
|
99
|
-
my $isSpace = 0;
|
100
|
-
my $isPic = 0;
|
101
|
-
my $text = "";
|
102
|
-
|
103
|
-
while (<IF>) { #each line contains a header
|
104
|
-
if (/^\#/) { next; } # skip comments
|
105
|
-
chomp;
|
106
|
-
s/\cM$//; # remove ^M character at the end of the file if any
|
107
|
-
my $line = $_;
|
108
|
-
|
109
|
-
|
110
|
-
# if ($line =~ /<\?xml version.+>/){ } ### Xml ###
|
111
|
-
# if ($line =~ /^<\/column>$/){ } ### Column ###
|
112
|
-
if ($line =~ /<theoreticalPage (.*)\/>/){
|
113
|
-
$markupOutput .= "||| Page $1\n";
|
114
|
-
}
|
115
|
-
|
116
|
-
### pic ###
|
117
|
-
if ($line =~ /^<dd (.*)>$/){
|
118
|
-
$isPic = 1;
|
119
|
-
|
120
|
-
$markupOutput .= "||| Figure $1\n";
|
121
|
-
}
|
122
|
-
elsif ($line =~ /^<\/dd>$/){
|
123
|
-
$isPic = 0;
|
124
|
-
}
|
125
|
-
|
126
|
-
### Table ###
|
127
|
-
elsif ($line =~ /^<table (.*)>$/){
|
128
|
-
$isTable = 1;
|
129
|
-
$markupOutput .= "||| Table $1\n";
|
130
|
-
}
|
131
|
-
elsif ($line =~ /^<\/table>$/){
|
132
|
-
$isTable = 0;
|
133
|
-
}
|
134
|
-
|
135
|
-
### Paragraph ###
|
136
|
-
# Note: table processing should have higher priority than paragraph, i.e. the priority does matter
|
137
|
-
elsif ($line =~ /^<para (.*)>$/){
|
138
|
-
$text .= $line."\n"; # we need the header
|
139
|
-
$isPara = 1;
|
140
|
-
|
141
|
-
if($isTable){
|
142
|
-
$markupOutput .= "||| ParaTable $1\n";
|
143
|
-
} else {
|
144
|
-
$markupOutput .= "||| Para $1\n";
|
145
|
-
}
|
146
|
-
}
|
147
|
-
elsif ($line =~ /^<\/para>$/){
|
148
|
-
my $paraText;
|
149
|
-
processPara($text);
|
150
|
-
|
151
|
-
$isPara = 0;
|
152
|
-
$text = "";
|
153
|
-
}
|
154
|
-
elsif($isPara){
|
155
|
-
$text .= $line."\n";
|
156
|
-
next;
|
157
|
-
}
|
158
|
-
}
|
159
|
-
close IF;
|
160
|
-
}
|
161
|
-
|
162
|
-
sub getAttrValue {
|
163
|
-
my ($attrText, $attr) = @_;
|
164
|
-
|
165
|
-
my $value = "none";
|
166
|
-
if($attrText =~ /^.*$attr=\"(.+?)\".*$/){
|
167
|
-
$value = $1;
|
168
|
-
}
|
169
|
-
|
170
|
-
return $value;
|
171
|
-
}
|
172
|
-
|
173
|
-
sub checkFontAttr {
|
174
|
-
my ($attrText, $attr, $attrHash, $count) = @_;
|
175
|
-
|
176
|
-
if($attrText =~ /^.*$attr=\"(.+?)\".*$/){
|
177
|
-
my $attrValue = $1;
|
178
|
-
|
179
|
-
$attrHash->{$attrValue} = $attrHash->{$attrValue} ? ($attrHash->{$attrValue}+$count) : $count;
|
180
|
-
}
|
181
|
-
}
|
182
|
-
|
183
|
-
sub processPara {
|
184
|
-
my ($inputText) = @_;
|
185
|
-
|
186
|
-
my $isSpace = 0;
|
187
|
-
my $isSpecialSpace = 0;
|
188
|
-
my $isTab = 0;
|
189
|
-
my $isBullet = 0;
|
190
|
-
|
191
|
-
my $isForcedEOF = "none"; # 3 signals for end of L: forcedEOF=\"true\" in attribute of <ln> or || <nl orig=\"true\"\/> || end of </para> without encountering any of the above signal in the para plus $isSpace = 0
|
192
|
-
|
193
|
-
my $lnAttr; my $isLn = 0; my $lnBold = "none"; my $lnItalic = "none";
|
194
|
-
my $runAttr; my $runText = ""; my $isRun = 0; my $runBold = "none"; my $runItalic = "none";
|
195
|
-
my $wdAttr; my $wdText = ""; my $isWd = 0;
|
196
|
-
|
197
|
-
my $text = "";
|
198
|
-
my $tmpMarkupOutput = "";
|
199
|
-
# binmode(STDERR, ":utf8");
|
200
|
-
|
201
|
-
my @lines = split(/\n/, $inputText);
|
202
|
-
for(my $i=0; $i<scalar(@lines); $i++){
|
203
|
-
my $line = $lines[$i];
|
204
|
-
|
205
|
-
## new ln
|
206
|
-
if ($line =~ /^<ln (.+)>$/){
|
207
|
-
$lnAttr = $1;
|
208
|
-
$isLn = 1;
|
209
|
-
|
210
|
-
$tmpMarkupOutput .= "||| Line $lnAttr\n";
|
211
|
-
$isForcedEOF = getAttrValue($lnAttr, "forcedEOF");
|
212
|
-
}
|
213
|
-
|
214
|
-
## new run
|
215
|
-
elsif ($line =~ /<run (.*)>$/){
|
216
|
-
$runAttr = $1;
|
217
|
-
|
218
|
-
$isSpace = 0;
|
219
|
-
$isTab = 0;
|
220
|
-
$isRun = 1;
|
221
|
-
|
222
|
-
if($line =~ /^<wd (.*?)>/){ # new wd, that consists of many runs
|
223
|
-
$isWd = 1;
|
224
|
-
$wdAttr = $1;
|
225
|
-
}
|
226
|
-
}
|
227
|
-
|
228
|
-
## wd
|
229
|
-
elsif ($line =~ /^<wd (.+)?>(.+)<\/wd>$/){
|
230
|
-
$wdAttr = $1;
|
231
|
-
my $word = $2;
|
232
|
-
$isSpace = 0;
|
233
|
-
$isTab = 0;
|
234
|
-
|
235
|
-
$word =~ s/\cM$//g; # remove ^M character
|
236
|
-
$tmpMarkupOutput .= "$word $wdAttr\n";
|
237
|
-
|
238
|
-
## add text
|
239
|
-
$text .= "$word";
|
240
|
-
|
241
|
-
if($isRun) {
|
242
|
-
$runText .= "$word ";
|
243
|
-
}
|
244
|
-
}
|
245
|
-
|
246
|
-
## end wd
|
247
|
-
elsif ($line =~ /^<\/wd>$/){
|
248
|
-
$isWd = 0;
|
249
|
-
|
250
|
-
$tmpMarkupOutput .= "$wdText $wdAttr\n";
|
251
|
-
$wdAttr = "";
|
252
|
-
$wdText = "";
|
253
|
-
}
|
254
|
-
|
255
|
-
## end run
|
256
|
-
elsif ($line =~ /^(.*)<\/run>$/){
|
257
|
-
my $word = $1;
|
258
|
-
|
259
|
-
## add text
|
260
|
-
if($word ne ""){
|
261
|
-
$word =~ s/\cM$//g; # remove ^M character
|
262
|
-
|
263
|
-
# appear in the final result
|
264
|
-
if($isLn){ $text .= "$word"; }
|
265
|
-
|
266
|
-
# for internal record
|
267
|
-
if($isRun){ $runText .= "$word "; }
|
268
|
-
if($isWd){ $wdText .= "$word"; }
|
269
|
-
}
|
270
|
-
|
271
|
-
## reset run
|
272
|
-
$runText = "";
|
273
|
-
$isRun = 0;
|
274
|
-
$isSpecialSpace = 0;
|
275
|
-
}
|
276
|
-
|
277
|
-
## end ln
|
278
|
-
elsif ($line =~ /^<\/ln>$/){
|
279
|
-
if((!$isAllowEmpty && $text !~ /^\s*$/)
|
280
|
-
|| ($isAllowEmpty && $text ne "")){
|
281
|
-
if($isForcedEOF eq "true" || # there's a forced EOL?
|
282
|
-
(!$isSpecialSpace) # not an emply line with space character
|
283
|
-
){
|
284
|
-
$text .= "\n";
|
285
|
-
|
286
|
-
$markupOutput .= $tmpMarkupOutput;
|
287
|
-
$tmpMarkupOutput = "";
|
288
|
-
$text = "";
|
289
|
-
}
|
290
|
-
} else {
|
291
|
-
$tmpMarkupOutput = "";
|
292
|
-
}
|
293
|
-
|
294
|
-
## reset ln
|
295
|
-
$isLn = 0;
|
296
|
-
$isForcedEOF = "none";
|
297
|
-
$isSpecialSpace = 0;
|
298
|
-
} # end else </ln>
|
299
|
-
|
300
|
-
## nl newline signal
|
301
|
-
elsif ($line =~ /^<nl orig=\"true\"\/>$/){
|
302
|
-
if($isLn){
|
303
|
-
$isSpace = 0;
|
304
|
-
} else {
|
305
|
-
if($isDebug){
|
306
|
-
print STDERR "#!!! Warning: found <nl orig=\"true\"\/> while not in tag <ln>: $line\n";
|
307
|
-
}
|
308
|
-
}
|
309
|
-
}
|
310
|
-
|
311
|
-
## space
|
312
|
-
elsif ($line =~ /^<space\/>$/){
|
313
|
-
my $startTag = "";
|
314
|
-
my $endTag = "";
|
315
|
-
if($i>0 && $lines[$i-1] =~ /^<(.+?)\b.*/){
|
316
|
-
$startTag = $1;
|
317
|
-
}
|
318
|
-
|
319
|
-
if($i < (scalar(@lines) -1) && $lines[$i+1] =~ /^<\/(.+)>/){
|
320
|
-
$endTag = $1;
|
321
|
-
}
|
322
|
-
|
323
|
-
if($startTag eq $endTag && $startTag ne ""){
|
324
|
-
$isSpecialSpace = 1;
|
325
|
-
}
|
326
|
-
|
327
|
-
## addText
|
328
|
-
$text .= " ";
|
329
|
-
$isSpace = 1;
|
330
|
-
}
|
331
|
-
|
332
|
-
## tab
|
333
|
-
elsif ($line =~ /^<tab .*\/>$/){
|
334
|
-
## add Text
|
335
|
-
$text .= "\t";
|
336
|
-
|
337
|
-
$isTab = 1;
|
338
|
-
}
|
339
|
-
|
340
|
-
## bullet
|
341
|
-
elsif ($line =~ /^<bullet .*>$/){
|
342
|
-
$isBullet = 1;
|
343
|
-
}
|
344
|
-
}
|
345
|
-
}
|
346
|
-
|
347
|
-
sub untaintPath {
|
348
|
-
my ($path) = @_;
|
349
|
-
|
350
|
-
if ( $path =~ /^([-_\/\w\.]*)$/ ) {
|
351
|
-
$path = $1;
|
352
|
-
} else {
|
353
|
-
die "Bad path \"$path\"\n";
|
354
|
-
}
|
355
|
-
|
356
|
-
return $path;
|
357
|
-
}
|
358
|
-
|
359
|
-
sub untaint {
|
360
|
-
my ($s) = @_;
|
361
|
-
if ($s =~ /^([\w \-\@\(\),\.\/]+)$/) {
|
362
|
-
$s = $1; # $data now untainted
|
363
|
-
} else {
|
364
|
-
die "Bad data in $s"; # log this somewhere
|
365
|
-
}
|
366
|
-
return $s;
|
367
|
-
}
|
368
|
-
|
369
|
-
sub execute {
|
370
|
-
my ($cmd) = @_;
|
371
|
-
if($isDebug){
|
372
|
-
print STDERR "Executing: $cmd\n";
|
373
|
-
}
|
374
|
-
$cmd = untaint($cmd);
|
375
|
-
system($cmd);
|
376
|
-
}
|
377
|
-
|
378
|
-
sub newTmpFile {
|
379
|
-
my $tmpFile = `date '+%Y%m%d-%H%M%S-$$'`;
|
380
|
-
chomp($tmpFile);
|
381
|
-
return $tmpFile;
|
382
|
-
}
|
data/parscit/bin/xml2train.pl
DELETED
@@ -1,193 +0,0 @@
|
|
1
|
-
#!/usr/bin/perl
|
2
|
-
# Author: Do Hoang Nhat Huy <dcsdhnh@nus.edu.sg>, generated at Fri, 3 Dec 2010 14:36:00
|
3
|
-
# Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
|
4
|
-
require 5.0;
|
5
|
-
use strict;
|
6
|
-
|
7
|
-
use FindBin;
|
8
|
-
use Getopt::Long;
|
9
|
-
|
10
|
-
# I do not know a better solution to find a lib path in -T mode.
|
11
|
-
# So if you know a better solution, I'd be glad to hear.
|
12
|
-
# See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
|
13
|
-
my $path; # Path to Parscit binary directory
|
14
|
-
BEGIN
|
15
|
-
{
|
16
|
-
if ($FindBin::Bin =~ /(.*)/) { $path = $1; }
|
17
|
-
}
|
18
|
-
|
19
|
-
use lib "$path/../lib";
|
20
|
-
|
21
|
-
# Local libraries
|
22
|
-
use Omni::Omnidoc;
|
23
|
-
use ParsCit::Tr2crfpp;
|
24
|
-
use ParsCit::PreProcess;
|
25
|
-
# Dependencies
|
26
|
-
|
27
|
-
|
28
|
-
### USER customizable section
|
29
|
-
my $version = "1.0";
|
30
|
-
$0 =~ /([^\/]+)$/; my $progname = $1;
|
31
|
-
### END user customizable section
|
32
|
-
|
33
|
-
sub License
|
34
|
-
{
|
35
|
-
print STDERR "# Copyright 2011 \251 by Do Hoang Nhat Huy\n";
|
36
|
-
}
|
37
|
-
|
38
|
-
### HELP Sub-procedure
|
39
|
-
sub Help
|
40
|
-
{
|
41
|
-
print STDERR "Process Omnipage XML output (Reference Section Only) and extract text lines together with other XML information\n";
|
42
|
-
print STDERR "usage: $progname -h\t[invokes help]\n";
|
43
|
-
print STDERR " $progname -in xmlfile -out outfile -opt option [-codec -app]\n";
|
44
|
-
print STDERR "Options:\n";
|
45
|
-
print STDERR "\t-q \tQuiet Mode (don't echo license)\n";
|
46
|
-
print STDERR "\t-in \tXML input from Omnipage\n";
|
47
|
-
print STDERR "\t-out \tOutput file\n";
|
48
|
-
print STDERR "\t-codec \tCodec of the input XML: utf-16 or utf-8. Default is utf-8\n";
|
49
|
-
print STDERR "\t-opt \tOptio: train (output is train file for crf++) or xml (output is xml features). Default is train\n";
|
50
|
-
}
|
51
|
-
|
52
|
-
my $help = 0;
|
53
|
-
my $quite = 0;
|
54
|
-
my $infile = undef;
|
55
|
-
my $outfile = undef;
|
56
|
-
my $option = "train";
|
57
|
-
my $codec = "utf-8";
|
58
|
-
|
59
|
-
$help = 1 unless GetOptions('in=s' => \$infile,
|
60
|
-
'out=s' => \$outfile,
|
61
|
-
'opt=s' => \$option,
|
62
|
-
'codec=s' => \$codec,
|
63
|
-
'h' => \$help,
|
64
|
-
'q' => \$quite);
|
65
|
-
|
66
|
-
if ($help || !defined $infile || !defined $outfile)
|
67
|
-
{
|
68
|
-
Help();
|
69
|
-
exit(0);
|
70
|
-
}
|
71
|
-
|
72
|
-
if (!$quite)
|
73
|
-
{
|
74
|
-
License();
|
75
|
-
}
|
76
|
-
|
77
|
-
# Sanity check
|
78
|
-
if (($option ne "train") && ($option ne "xml"))
|
79
|
-
{
|
80
|
-
die "Die: -opt must equal \"train\" or \"xml\".\n";
|
81
|
-
}
|
82
|
-
|
83
|
-
if (($codec ne "utf-8") && ($codec ne "utf-16"))
|
84
|
-
{
|
85
|
-
die "Die: -codec must equal \"utf-8\" or \"utf-16\".\n";
|
86
|
-
}
|
87
|
-
|
88
|
-
# Untaint check
|
89
|
-
$infile = UntaintPath($infile);
|
90
|
-
$outfile = UntaintPath($outfile);
|
91
|
-
|
92
|
-
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
|
93
|
-
# End untaint check
|
94
|
-
|
95
|
-
# MAIN
|
96
|
-
my $infile_utf8 = $infile . "-utf8";
|
97
|
-
if ($codec eq "utf-16") { Convert($infile, "UTF16", $infile_utf8, "UTF8"); }
|
98
|
-
|
99
|
-
if (! open(IN, "<:utf8", $infile)) { return (-1, "Could not open xml file " . $infile . ": " . $!); }
|
100
|
-
my $xml = do { local $/; <IN> };
|
101
|
-
close IN;
|
102
|
-
|
103
|
-
# Cleanup
|
104
|
-
CleanUp(\$xml);
|
105
|
-
|
106
|
-
# New document
|
107
|
-
my $doc = new Omni::Omnidoc();
|
108
|
-
$doc->set_raw($xml);
|
109
|
-
|
110
|
-
# Extract the reference portion from the XML
|
111
|
-
my ($start_ref, $end_ref, $rcite_text_from_xml) = ParsCit::PreProcess::findCitationTextXML($doc);
|
112
|
-
|
113
|
-
if ($option eq "train")
|
114
|
-
{
|
115
|
-
# Prepare to split unmarked reference portion
|
116
|
-
my $tmp_file = ParsCit::Tr2crfpp::prepDataUnmarked($doc, $start_ref, $end_ref);
|
117
|
-
|
118
|
-
# Save the temporary file
|
119
|
-
my $cmd = "mv " . $tmp_file . " " . $outfile;
|
120
|
-
|
121
|
-
Execute($cmd);
|
122
|
-
}
|
123
|
-
else
|
124
|
-
{
|
125
|
-
|
126
|
-
}
|
127
|
-
|
128
|
-
# END
|
129
|
-
|
130
|
-
# Convert the input XML
|
131
|
-
sub Convert
|
132
|
-
{
|
133
|
-
my ($from_file, $from_encode, $to_file, $to_encode, $log) = @_;
|
134
|
-
|
135
|
-
# Call iconv program
|
136
|
-
my $cmd = "iconv" . " -f " . $from_encode . " -t " . $to_encode . " " . $from_file . " -o " . $to_file;
|
137
|
-
|
138
|
-
# Transformation
|
139
|
-
Execute($cmd);
|
140
|
-
}
|
141
|
-
|
142
|
-
# Clean up the input XML
|
143
|
-
sub CleanUp
|
144
|
-
{
|
145
|
-
my ($ref_xml) = @_;
|
146
|
-
|
147
|
-
# Remove <?xml version="1.0" encoding="UTF-8"?>
|
148
|
-
$$ref_xml =~ s/<\?xml.+?>\n//g;
|
149
|
-
# Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
|
150
|
-
$$ref_xml =~ s/<\!\-\-XML.+?>\n//g;
|
151
|
-
# Add the root tag
|
152
|
-
$$ref_xml = "<root>" . "\n" . $$ref_xml . "\n" . "</root>";
|
153
|
-
}
|
154
|
-
|
155
|
-
sub UntaintPath
|
156
|
-
{
|
157
|
-
my ($path) = @_;
|
158
|
-
|
159
|
-
if ($path =~ /^([-_:" \/\w\.%\p{C}\p{P}]+)$/ )
|
160
|
-
{
|
161
|
-
$path = $1;
|
162
|
-
}
|
163
|
-
else
|
164
|
-
{
|
165
|
-
die "Bad path \"$path\"\n";
|
166
|
-
}
|
167
|
-
|
168
|
-
return $path;
|
169
|
-
}
|
170
|
-
|
171
|
-
sub Untaint
|
172
|
-
{
|
173
|
-
my ($s) = @_;
|
174
|
-
if ($s =~ /^([\w \-\@\(\),\.\/>\p{C}\p{P}]+)$/)
|
175
|
-
{
|
176
|
-
$s = $1; # $data now untainted
|
177
|
-
}
|
178
|
-
else
|
179
|
-
{
|
180
|
-
die "Bad data in $s"; # log this somewhere
|
181
|
-
}
|
182
|
-
return $s;
|
183
|
-
}
|
184
|
-
|
185
|
-
sub Execute
|
186
|
-
{
|
187
|
-
my ($cmd) = @_;
|
188
|
-
print STDERR "Executing: $cmd\n";
|
189
|
-
system($cmd);
|
190
|
-
}
|
191
|
-
|
192
|
-
|
193
|
-
|
data/parscit/lib/Omni/Config.pm
DELETED
@@ -1,93 +0,0 @@
|
|
1
|
-
package Omni::Config;
|
2
|
-
|
3
|
-
# Global
|
4
|
-
# Names of the classes
|
5
|
-
$ALG_NAME = "Omni";
|
6
|
-
# Version
|
7
|
-
$ALG_VERSION = "110505";
|
8
|
-
|
9
|
-
# All Omnipage XML tags
|
10
|
-
%omni_tag_list = ( 'DOCUMENT' => 'document',
|
11
|
-
'PAGE' => 'page',
|
12
|
-
'COLUMN' => 'column',
|
13
|
-
'DESC' => 'description',
|
14
|
-
'SRC' => 'source',
|
15
|
-
'LANGUAGE' => 'language',
|
16
|
-
'STYLE' => 'style',
|
17
|
-
'STYLE-TABLE' => 'styleTable',
|
18
|
-
'THEO-PAGE' => 'theoreticalPage',
|
19
|
-
'BODY' => 'body',
|
20
|
-
'SECTION' => 'section',
|
21
|
-
'COL' => 'column',
|
22
|
-
'PARA' => 'para',
|
23
|
-
'LINE' => 'ln',
|
24
|
-
'WORD' => 'wd',
|
25
|
-
'SPACE' => 'space',
|
26
|
-
'RUN' => 'run',
|
27
|
-
'BULLET' => 'bullet',
|
28
|
-
'TABLE' => 'table',
|
29
|
-
'GRID' => 'gridTable',
|
30
|
-
'GRID-COL' => 'gridCol',
|
31
|
-
'GRID-ROW' => 'gridRow',
|
32
|
-
'CELL' => 'cell',
|
33
|
-
'BOTTOM-CELL' => 'bottomBorder',
|
34
|
-
'TOP-CELL' => 'topBorder',
|
35
|
-
'LEFT-CELL' => 'leftBorder',
|
36
|
-
'RIGHT-CELL' => 'rightBorder',
|
37
|
-
'NEWLINE' => 'nl',
|
38
|
-
'TAB' => 'tab',
|
39
|
-
'DD' => 'dd',
|
40
|
-
'PICTURE' => 'picture',
|
41
|
-
'FRAME' => 'frame'
|
42
|
-
);
|
43
|
-
$tag_list = \%omni_tag_list;
|
44
|
-
|
45
|
-
# All Omnipage XML attributes
|
46
|
-
%omni_att_list = ( 'ALIGN' => 'alignment',
|
47
|
-
'FONTFACE' => 'fontFace',
|
48
|
-
'FONTFAMILY' => 'fontFamily',
|
49
|
-
'FONTPITCH' => 'fontPitch',
|
50
|
-
'FONTSIZE' => 'fontSize',
|
51
|
-
'UNDERLINE' => 'underline',
|
52
|
-
'SPACING' => 'spacing',
|
53
|
-
'SCALE' => 'scale',
|
54
|
-
'BOTTOM' => 'b',
|
55
|
-
'TOP' => 't',
|
56
|
-
'LEFT' => 'l',
|
57
|
-
'RIGHT' => 'r',
|
58
|
-
'LANGUAGE' => 'language',
|
59
|
-
'SUSCRIPT' => 'subsuperscript',
|
60
|
-
'BASELINE' => 'baseline',
|
61
|
-
'BOLD' => 'bold',
|
62
|
-
'ITALIC' => 'italic',
|
63
|
-
'SPACEB' => 'spaceBefore',
|
64
|
-
# These attributes usually go with the <dd> tag
|
65
|
-
'BOTTOMDIST' => 'bottomDistance',
|
66
|
-
'TOPDIST' => 'topDistance',
|
67
|
-
'LEFTDIST' => 'leftDistance',
|
68
|
-
'RIGHTDIST' => 'rightDistance',
|
69
|
-
# These attributes usually go with the <cell> tag
|
70
|
-
'GROWFROM' => 'gridRowFrom',
|
71
|
-
'GROWTO' => 'gridRowTill',
|
72
|
-
'GCOLFROM' => 'gridColFrom',
|
73
|
-
'GCOLTO' => 'gridColTill',
|
74
|
-
'VALIGN' => 'verticalAlignment',
|
75
|
-
);
|
76
|
-
$att_list = \%omni_att_list;
|
77
|
-
|
78
|
-
# All object types in the Omni library
|
79
|
-
%omni_obj_list = ( 'OMNIDOC' => 'document',
|
80
|
-
'OMNIPAGE' => 'page',
|
81
|
-
'OMNICOL' => 'column',
|
82
|
-
'OMNIDD' => 'dd',
|
83
|
-
'OMNITABLE' => 'table',
|
84
|
-
'OMNIIMG' => 'image',
|
85
|
-
'OMNIPARA' => 'paragraph',
|
86
|
-
'OMNILINE' => 'line',
|
87
|
-
'OMNIRUN' => 'run',
|
88
|
-
'OMNIWORD' => 'word',
|
89
|
-
'OMNIFRAME' => 'frame',
|
90
|
-
);
|
91
|
-
$obj_list = \%omni_obj_list;
|
92
|
-
|
93
|
-
1;
|