biblicit 2.0.3 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +0 -2
- data/biblicit.gemspec +1 -1
- data/parscit/bin/citeExtract.pl +9 -161
- data/parscit/bin/sectExtract.pl +0 -14
- data/parscit/lib/ParsCit/Controller.pm +0 -59
- data/parscit/lib/ParsCit/PreProcess.pm +0 -4
- data/parscit/lib/ParsCit/Tr2crfpp.pm +1 -7
- metadata +4 -24
- data/parscit/bin/sectLabel/processOmniXML.pl +0 -1427
- data/parscit/bin/sectLabel/processOmniXML_new.pl +0 -1025
- data/parscit/bin/sectLabel/processOmniXMLv2.pl +0 -1529
- data/parscit/bin/sectLabel/processOmniXMLv3.pl +0 -964
- data/parscit/bin/sectLabel/simplifyOmniXML.pl +0 -382
- data/parscit/bin/xml2train.pl +0 -193
- data/parscit/lib/Omni/Config.pm +0 -93
- data/parscit/lib/Omni/Omnicell.pm +0 -263
- data/parscit/lib/Omni/Omnicol.pm +0 -292
- data/parscit/lib/Omni/Omnidd.pm +0 -328
- data/parscit/lib/Omni/Omnidoc.pm +0 -153
- data/parscit/lib/Omni/Omniframe.pm +0 -223
- data/parscit/lib/Omni/Omniline.pm +0 -423
- data/parscit/lib/Omni/Omnipage.pm +0 -282
- data/parscit/lib/Omni/Omnipara.pm +0 -232
- data/parscit/lib/Omni/Omnirun.pm +0 -303
- data/parscit/lib/Omni/Omnitable.pm +0 -336
- data/parscit/lib/Omni/Omniword.pm +0 -162
- data/parscit/lib/Omni/Traversal.pm +0 -313
- data/parscit/lib/SectLabel/AAMatching.pm +0 -1949
data/README.md
CHANGED
|
@@ -119,8 +119,6 @@ More than these might be required; this is what I had to add to my default insta
|
|
|
119
119
|
|
|
120
120
|
sudo cpan install Digest::SHA1
|
|
121
121
|
sudo cpan install String::Approx
|
|
122
|
-
sudo cpan install XML::Writer::String
|
|
123
|
-
sudo cpan install XML::Twig
|
|
124
122
|
|
|
125
123
|
## Required to use the ParsCit algorithm
|
|
126
124
|
|
data/biblicit.gemspec
CHANGED
|
@@ -5,7 +5,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
|
5
5
|
|
|
6
6
|
Gem::Specification.new do |gem|
|
|
7
7
|
gem.name = "biblicit"
|
|
8
|
-
gem.version = "2.0.3"
|
|
8
|
+
gem.version = "2.0.4"
|
|
9
9
|
gem.authors = ["David Judd"]
|
|
10
10
|
gem.email = ["david@academia.edu"]
|
|
11
11
|
gem.summary = %q{Extract citations from PDFs.}
|
data/parscit/bin/citeExtract.pl
CHANGED
|
@@ -41,10 +41,7 @@ use File::Spec;
|
|
|
41
41
|
use File::Basename;
|
|
42
42
|
|
|
43
43
|
# Local libraries
|
|
44
|
-
use Omni::Omnidoc;
|
|
45
|
-
use Omni::Traversal;
|
|
46
44
|
use ParsCit::Controller;
|
|
47
|
-
use SectLabel::AAMatching;
|
|
48
45
|
|
|
49
46
|
# USER customizable section
|
|
50
47
|
my $tmpfile .= $0;
|
|
@@ -195,113 +192,17 @@ if (defined $opt_e && $opt_e ne "")
|
|
|
195
192
|
}
|
|
196
193
|
|
|
197
194
|
my $doc = undef;
|
|
198
|
-
my $text_file =
|
|
199
|
-
# Extracting text from Omnipage XML output
|
|
200
|
-
if ($is_xml_input)
|
|
201
|
-
{
|
|
202
|
-
$text_file = "/tmp/" . NewTmpFile();
|
|
203
|
-
my $cmd = $FindBin::Bin . "/sectLabel/processOmniXMLv2.pl -q -in $in -out $text_file -decode";
|
|
204
|
-
system($cmd);
|
|
205
|
-
|
|
206
|
-
###
|
|
207
|
-
# Huydhn: input is xml from Omnipage
|
|
208
|
-
###
|
|
209
|
-
if (! open(IN, "<:utf8", $in)) { return (-1, "Could not open xml file " . $in . ": " . $!); }
|
|
210
|
-
my $xml = do { local $/; <IN> };
|
|
211
|
-
close IN;
|
|
212
|
-
|
|
213
|
-
###
|
|
214
|
-
# Huydhn
|
|
215
|
-
# NOTE: the omnipage xml is not well constructed (concatenated multiple xml files).
|
|
216
|
-
# This merged xml need to be fixed first before pass it to xml processing libraries, e.g. xml::twig
|
|
217
|
-
###
|
|
218
|
-
# Convert to Unix format
|
|
219
|
-
$xml =~ s/\r//g;
|
|
220
|
-
# Remove <?xml version="1.0" encoding="UTF-8"?>
|
|
221
|
-
$xml =~ s/<\?xml.+?>\n//g;
|
|
222
|
-
# Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
|
|
223
|
-
$xml =~ s/<\!\-\-XML.+?>\n//g;
|
|
224
|
-
# Declaration and root
|
|
225
|
-
$xml = "<?xml version=\"1.0\"?>" . "\n" . "<root>" . "\n" . $xml . "\n" . "</root>";
|
|
226
|
-
|
|
227
|
-
# New document
|
|
228
|
-
$doc = new Omni::Omnidoc();
|
|
229
|
-
$doc->set_raw($xml);
|
|
230
|
-
}
|
|
231
|
-
else
|
|
232
|
-
{
|
|
233
|
-
$text_file = $in;
|
|
234
|
-
}
|
|
195
|
+
my $text_file = $in;
|
|
235
196
|
|
|
236
197
|
# SECTLABEL
|
|
237
198
|
if (($mode & $SECTLABEL) == $SECTLABEL)
|
|
238
199
|
{
|
|
239
200
|
my $sect_label_input = $text_file;
|
|
240
201
|
|
|
241
|
-
|
|
242
|
-
if ($is_xml_input)
|
|
243
|
-
{
|
|
244
|
-
my $cmd = $FindBin::Bin . "/sectLabel/processOmniXMLv3.pl -q -in $in -out $text_file.feature -decode";
|
|
245
|
-
system($cmd);
|
|
246
|
-
|
|
247
|
-
my $address_file = $text_file . ".feature" . ".address";
|
|
248
|
-
if (! open(IN, "<:utf8", $address_file)) { return (-1, "Could not open address file " . $address_file . ": " . $!); }
|
|
249
|
-
|
|
250
|
-
my @omni_address = ();
|
|
251
|
-
# Read the address file provided by process OmniXML script
|
|
252
|
-
while (<IN>)
|
|
253
|
-
{
|
|
254
|
-
chomp;
|
|
255
|
-
# Save and split the line
|
|
256
|
-
my $line = $_;
|
|
257
|
-
my @element = split(/\s+/, $line);
|
|
258
|
-
|
|
259
|
-
my %addr = ();
|
|
260
|
-
# Address
|
|
261
|
-
$addr{ 'L1' } = $element[ 0 ];
|
|
262
|
-
$addr{ 'L2' } = $element[ 1 ];
|
|
263
|
-
$addr{ 'L3' } = $element[ 2 ];
|
|
264
|
-
$addr{ 'L4' } = $element[ 3 ];
|
|
265
|
-
|
|
266
|
-
# Save the address
|
|
267
|
-
push @omni_address, { %addr };
|
|
268
|
-
}
|
|
269
|
-
close IN;
|
|
270
|
-
unlink($address_file);
|
|
271
|
-
|
|
272
|
-
$sect_label_input .= ".feature";
|
|
273
|
-
my ($sl_xml, $aut_lines, $aff_lines) = SectLabel($sect_label_input, $is_xml_input, 0);
|
|
274
|
-
|
|
275
|
-
# Remove first line <?xml/>
|
|
276
|
-
$rxml .= RemoveTopLines($sl_xml, 1) . "\n";
|
|
277
|
-
|
|
278
|
-
# Only run author - affiliation if "something" is provided
|
|
279
|
-
if ($opt_a)
|
|
280
|
-
{
|
|
281
|
-
my @aut_addrs = ();
|
|
282
|
-
my @aff_addrs = ();
|
|
283
|
-
# Address of author section
|
|
284
|
-
for my $lindex (@{ $aut_lines }) { push @aut_addrs, $omni_address[ $lindex ]; }
|
|
285
|
-
# Address of affiliation section
|
|
286
|
-
for my $lindex (@{ $aff_lines }) { push @aff_addrs, $omni_address[ $lindex ]; }
|
|
287
|
-
|
|
288
|
-
# The tarpit
|
|
289
|
-
my $aa_xml = SectLabel::AAMatching::AAMatching($doc, \@aut_addrs, \@aff_addrs);
|
|
290
|
-
|
|
291
|
-
# Author-Affiliation Matching result
|
|
292
|
-
$rxml .= $aa_xml . "\n";
|
|
293
|
-
}
|
|
294
|
-
|
|
295
|
-
# Remove XML feature file
|
|
296
|
-
unlink($sect_label_input);
|
|
297
|
-
}
|
|
298
|
-
else
|
|
299
|
-
{
|
|
300
|
-
my ($sl_xml, $aut_lines, $aff_lines) = SectLabel($sect_label_input, $is_xml_input, 0);
|
|
202
|
+
my ($sl_xml, $aut_lines, $aff_lines) = SectLabel($sect_label_input, $is_xml_input, 0);
|
|
301
203
|
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
}
|
|
204
|
+
# Remove first line <?xml/>
|
|
205
|
+
$rxml .= RemoveTopLines($sl_xml, 1) . "\n";
|
|
305
206
|
}
|
|
306
207
|
|
|
307
208
|
# PARSHED
|
|
@@ -318,66 +219,13 @@ if (($mode & $PARSHED) == $PARSHED)
|
|
|
318
219
|
# PARSCIT
|
|
319
220
|
if (($mode & $PARSCIT) == $PARSCIT)
|
|
320
221
|
{
|
|
321
|
-
|
|
322
|
-
{
|
|
323
|
-
my $cmd = $FindBin::Bin . "/sectLabel/processOmniXMLv3.pl -q -in $in -out $text_file.feature -decode";
|
|
324
|
-
system($cmd);
|
|
325
|
-
|
|
326
|
-
my $address_file = $text_file . ".feature" . ".address";
|
|
327
|
-
if (! open(IN, "<:utf8", $address_file)) { return (-1, "Could not open address file " . $address_file . ": " . $!); }
|
|
328
|
-
|
|
329
|
-
my @omni_address = ();
|
|
330
|
-
# Read the address file provided by process OmniXML script
|
|
331
|
-
while (<IN>)
|
|
332
|
-
{
|
|
333
|
-
chomp;
|
|
334
|
-
# Save and split the line
|
|
335
|
-
my $line = $_;
|
|
336
|
-
my @element = split(/\s+/, $line);
|
|
337
|
-
|
|
338
|
-
my %addr = ();
|
|
339
|
-
# Address
|
|
340
|
-
$addr{ 'L1' } = $element[ 0 ];
|
|
341
|
-
$addr{ 'L2' } = $element[ 1 ];
|
|
342
|
-
$addr{ 'L3' } = $element[ 2 ];
|
|
343
|
-
$addr{ 'L4' } = $element[ 3 ];
|
|
344
|
-
|
|
345
|
-
# Save the address
|
|
346
|
-
push @omni_address, { %addr };
|
|
347
|
-
}
|
|
348
|
-
close IN;
|
|
349
|
-
unlink($address_file);
|
|
350
|
-
|
|
351
|
-
my $sect_label_input = $text_file . ".feature";
|
|
352
|
-
# Output of sectlabel becomes input for parscit
|
|
353
|
-
my ($all_text, $cit_lines) = SectLabel($sect_label_input, $is_xml_input, 1);
|
|
354
|
-
# Remove XML feature file
|
|
355
|
-
unlink($sect_label_input);
|
|
356
|
-
|
|
357
|
-
my @cit_addrs = ();
|
|
358
|
-
# Address of reference section
|
|
359
|
-
for my $lindex (@{ $cit_lines }) { push @cit_addrs, $omni_address[ $lindex ]; }
|
|
360
|
-
|
|
361
|
-
my $pc_xml = undef;
|
|
362
|
-
# Huydhn: add xml features to parscit in case of unmarked reference
|
|
363
|
-
$pc_xml = ParsCit::Controller::ExtractCitations2(\$all_text, $cit_lines, $is_xml_input, $doc, \@cit_addrs);
|
|
364
|
-
|
|
365
|
-
# Remove first line <?xml/>
|
|
366
|
-
$rxml .= RemoveTopLines($$pc_xml, 1) . "\n";
|
|
222
|
+
my $pc_xml = ParsCit::Controller::ExtractCitations($text_file, $in, $is_xml_input);
|
|
367
223
|
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
}
|
|
371
|
-
else
|
|
372
|
-
{
|
|
373
|
-
my $pc_xml = ParsCit::Controller::ExtractCitations($text_file, $in, $is_xml_input);
|
|
374
|
-
|
|
375
|
-
# Remove first line <?xml/>
|
|
376
|
-
$rxml .= RemoveTopLines($$pc_xml, 1) . "\n";
|
|
224
|
+
# Remove first line <?xml/>
|
|
225
|
+
$rxml .= RemoveTopLines($$pc_xml, 1) . "\n";
|
|
377
226
|
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
}
|
|
227
|
+
# Thang v100901: call to BiblioScript
|
|
228
|
+
if (scalar(@export_types) != 0) { BiblioScript(\@export_types, $$pc_xml, $out); }
|
|
381
229
|
}
|
|
382
230
|
|
|
383
231
|
$rxml .= "</algorithms>";
|
data/parscit/bin/sectExtract.pl
CHANGED
|
@@ -82,16 +82,6 @@ $modelFile = "$path/../$modelFile";
|
|
|
82
82
|
my $configFile = $isXmlInput ? $SectLabel::Config::configXmlFile : $SectLabel::Config::configFile;
|
|
83
83
|
$configFile = "$path/../$configFile";
|
|
84
84
|
|
|
85
|
-
if($isXmlInput){
|
|
86
|
-
my $xmlInFile = newTmpFile();
|
|
87
|
-
$xmlInFile = untaintPath($xmlInFile);
|
|
88
|
-
my $cmd = "$path/sectLabel/";
|
|
89
|
-
$cmd .= ($isNew) ? "processOmniXMLv2.pl" : "processOmniXML.pl";
|
|
90
|
-
$cmd .= " -in $inFile -out $xmlInFile -xmlFeature -decode";
|
|
91
|
-
execute($cmd);
|
|
92
|
-
$inFile = $xmlInFile;
|
|
93
|
-
}
|
|
94
|
-
|
|
95
85
|
my $dictFile = $SectLabel::Config::dictFile;
|
|
96
86
|
$dictFile = "$path/../$dictFile";
|
|
97
87
|
|
|
@@ -99,10 +89,6 @@ my $funcFile = $SectLabel::Config::funcFile;
|
|
|
99
89
|
$funcFile = "$path/../$funcFile";
|
|
100
90
|
my $rXML = SectLabel::Controller::extractSection($inFile, $isXmlOutput, $modelFile, $dictFile, $funcFile, $configFile, $isXmlInput, $isDebug);
|
|
101
91
|
|
|
102
|
-
if($isXmlInput){
|
|
103
|
-
unlink($inFile);
|
|
104
|
-
}
|
|
105
|
-
|
|
106
92
|
if (defined $outFile) {
|
|
107
93
|
$outFile = untaintPath($outFile);
|
|
108
94
|
|
|
@@ -21,8 +21,6 @@ use ParsCit::Tr2crfpp;
|
|
|
21
21
|
use ParsCit::PreProcess;
|
|
22
22
|
use ParsCit::PostProcess;
|
|
23
23
|
use ParsCit::CitationContext;
|
|
24
|
-
# Omnipage libraries
|
|
25
|
-
use Omni::Omnidoc;
|
|
26
24
|
# Dependencies
|
|
27
25
|
use CSXUtil::SafeText qw(cleanXML);
|
|
28
26
|
|
|
@@ -228,62 +226,6 @@ sub ExtractCitationsImpl
|
|
|
228
226
|
# Reference to an array of single reference
|
|
229
227
|
my $rraw_citations = undef;
|
|
230
228
|
|
|
231
|
-
# Find and separate reference
|
|
232
|
-
if ($is_xml)
|
|
233
|
-
{
|
|
234
|
-
###
|
|
235
|
-
# Huydhn: input is xml from Omnipage
|
|
236
|
-
###
|
|
237
|
-
if (! open(IN, "<:utf8", $orgfile)) { return (-1, "Could not open xml file " . $orgfile . ": " . $!); }
|
|
238
|
-
my $xml = do { local $/; <IN> };
|
|
239
|
-
close IN;
|
|
240
|
-
|
|
241
|
-
###
|
|
242
|
-
# Huydhn
|
|
243
|
-
# NOTE: the omnipage xml is not well constructed (concatenated multiple xml files).
|
|
244
|
-
# This merged xml need to be fixed first before pass it to xml processing libraries, e.g. xml::twig
|
|
245
|
-
###
|
|
246
|
-
# Convert to Unix format
|
|
247
|
-
$xml =~ s/\r//g;
|
|
248
|
-
# Remove <?xml version="1.0" encoding="UTF-8"?>
|
|
249
|
-
$xml =~ s/<\?xml.+?>\n//g;
|
|
250
|
-
# Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
|
|
251
|
-
$xml =~ s/<\!\-\-XML.+?>\n//g;
|
|
252
|
-
# Declaration and root
|
|
253
|
-
$xml = "<?xml version=\"1.0\"?>" . "\n" . "<root>" . "\n" . $xml . "\n" . "</root>";
|
|
254
|
-
|
|
255
|
-
# New document
|
|
256
|
-
my $doc = new Omni::Omnidoc();
|
|
257
|
-
$doc->set_raw($xml);
|
|
258
|
-
|
|
259
|
-
# Extract the reference portion from the XML
|
|
260
|
-
my ($start_ref, $end_ref, $rcite_text_from_xml, $rcit_addrs) = ParsCit::PreProcess::FindCitationTextXML($doc);
|
|
261
|
-
|
|
262
|
-
# Extract the reference portion from the text.
|
|
263
|
-
# TODO: NEED TO BE REMOVED FROM HERE
|
|
264
|
-
my $content = $doc->get_content();
|
|
265
|
-
($rcite_text, $rnorm_body_text, $rbody_text) = ParsCit::PreProcess::FindCitationText(\$content, \@pos_array);
|
|
266
|
-
|
|
267
|
-
my @norm_body_tokens = split(/\s+/, $$rnorm_body_text);
|
|
268
|
-
my @body_tokens = split(/\s+/, $$rbody_text);
|
|
269
|
-
|
|
270
|
-
my $size = scalar(@norm_body_tokens);
|
|
271
|
-
my $size1 = scalar(@pos_array);
|
|
272
|
-
|
|
273
|
-
if($size != $size1) { die "ParsCit::Controller::extractCitationsImpl: normBodyText size $size != posArray size $size1\n"; }
|
|
274
|
-
# TODO: TO HERE
|
|
275
|
-
|
|
276
|
-
# Filename initialization
|
|
277
|
-
if ($bwrite_split > 0) { ($citefile, $bodyfile) = WriteSplit($textfile, $rcite_text_from_xml, $rbody_text); }
|
|
278
|
-
|
|
279
|
-
# Prepare to split unmarked reference portion
|
|
280
|
-
my $tmp_file = ParsCit::Tr2crfpp::PrepDataUnmarked($doc, $rcit_addrs);
|
|
281
|
-
|
|
282
|
-
# Extract citations from citation text
|
|
283
|
-
$rraw_citations = ParsCit::PreProcess::SegmentCitationsXML($rcite_text_from_xml, $tmp_file);
|
|
284
|
-
}
|
|
285
|
-
else
|
|
286
|
-
{
|
|
287
229
|
if (! open(IN, "<:utf8", $textfile)) { return (-1, "Could not open text file " . $textfile . ": " . $!); }
|
|
288
230
|
my $text = do { local $/; <IN> };
|
|
289
231
|
close IN;
|
|
@@ -309,7 +251,6 @@ sub ExtractCitationsImpl
|
|
|
309
251
|
|
|
310
252
|
# Extract citations from citation text
|
|
311
253
|
$rraw_citations = ParsCit::PreProcess::SegmentCitations($rcite_text);
|
|
312
|
-
}
|
|
313
254
|
|
|
314
255
|
my @citations = ();
|
|
315
256
|
my @valid_citations = ();
|
|
@@ -11,7 +11,6 @@ package ParsCit::PreProcess;
|
|
|
11
11
|
use utf8;
|
|
12
12
|
use strict;
|
|
13
13
|
|
|
14
|
-
use Omni::Config;
|
|
15
14
|
use ParsCit::Citation;
|
|
16
15
|
|
|
17
16
|
my %marker_types = ( 'SQUARE' => '\\[.+?\\]',
|
|
@@ -22,9 +21,6 @@ my %marker_types = ( 'SQUARE' => '\\[.+?\\]',
|
|
|
22
21
|
#'NAKEDNUMDOT' => '\\d{1,3}\\.' # Modified by Artemy Kolchinsky (v090625)
|
|
23
22
|
);
|
|
24
23
|
|
|
25
|
-
# Omnilib configuration: object name
|
|
26
|
-
my $obj_list = $Omni::Config::obj_list;
|
|
27
|
-
|
|
28
24
|
###
|
|
29
25
|
# Huydhn: similar to findCitationText, find the citation portion using regular expression.
|
|
30
26
|
# However the input is an omnipage xml document object, not the raw text
|
|
@@ -15,7 +15,6 @@ use strict 'vars';
|
|
|
15
15
|
use FindBin;
|
|
16
16
|
use Encode ();
|
|
17
17
|
|
|
18
|
-
use Omni::Config;
|
|
19
18
|
use ParsCit::Config;
|
|
20
19
|
|
|
21
20
|
### USER customizable section
|
|
@@ -38,8 +37,6 @@ $split_model_file = "$FindBin::Bin/../$split_model_file";
|
|
|
38
37
|
# Huydhn: don't know its function
|
|
39
38
|
###
|
|
40
39
|
my %dict = ();
|
|
41
|
-
# Omnilib configuration: object name
|
|
42
|
-
my $obj_list = $Omni::Config::obj_list;
|
|
43
40
|
|
|
44
41
|
###
|
|
45
42
|
# Huydhn: prepare data for trfpp, segmenting unmarked reference
|
|
@@ -124,7 +121,7 @@ sub PrepDataUnmarked
|
|
|
124
121
|
# Trim line
|
|
125
122
|
$ln =~ s/^\s+|\s+$//g;
|
|
126
123
|
# Skip blank lines
|
|
127
|
-
if (($ln =~ m/^\s*$/)
|
|
124
|
+
if (($ln =~ m/^\s*$/))
|
|
128
125
|
{
|
|
129
126
|
$addr_index++;
|
|
130
127
|
next;
|
|
@@ -355,7 +352,6 @@ sub PrepDataUnmarked
|
|
|
355
352
|
# XML features
|
|
356
353
|
# Bullet
|
|
357
354
|
my $bullet = undef;
|
|
358
|
-
if ($lines->[ $t ]->get_name() eq $obj_list->{ 'OMNILINE' }) { $bullet = $lines->[ $t ]->get_bullet(); }
|
|
359
355
|
if ((defined $bullet) && ($bullet eq 'true'))
|
|
360
356
|
{
|
|
361
357
|
push @feats, 'xmlBullet_yes';
|
|
@@ -368,7 +364,6 @@ sub PrepDataUnmarked
|
|
|
368
364
|
|
|
369
365
|
# First word format: bold, italic, font size
|
|
370
366
|
my $xml_runs = undef;
|
|
371
|
-
if (($lines->[ $t ]->get_name() eq $obj_list->{ 'OMNILINE' })) { $xml_runs = $lines->[ $t ]->get_objs_ref(); }
|
|
372
367
|
|
|
373
368
|
# First word format: bold
|
|
374
369
|
my $bold = undef;
|
|
@@ -415,7 +410,6 @@ sub PrepDataUnmarked
|
|
|
415
410
|
|
|
416
411
|
# First word format: starting point, left alignment
|
|
417
412
|
my $start_point = undef;
|
|
418
|
-
if (($lines->[ $t ]->get_name() eq $obj_list->{ 'OMNILINE' })) { $start_point = $lines->[ $t ]->get_left_pos(); }
|
|
419
413
|
if ((defined $start_point) && ($start_point > $avg_start_point * $start_upper_ratio))
|
|
420
414
|
{
|
|
421
415
|
push @feats, 'xmlBeginLine_right';
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: biblicit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.0.3
|
|
4
|
+
version: 2.0.4
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -9,7 +9,7 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2013-
|
|
12
|
+
date: 2013-03-07 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: activesupport
|
|
@@ -402,30 +402,11 @@ files:
|
|
|
402
402
|
- parscit/bin/sectLabel/genericSect/extractFeature.rb
|
|
403
403
|
- parscit/bin/sectLabel/genericSectExtract.rb
|
|
404
404
|
- parscit/bin/sectLabel/getStructureInfo.pl
|
|
405
|
-
- parscit/bin/sectLabel/processOmniXML.pl
|
|
406
|
-
- parscit/bin/sectLabel/processOmniXML_new.pl
|
|
407
|
-
- parscit/bin/sectLabel/processOmniXMLv2.pl
|
|
408
|
-
- parscit/bin/sectLabel/processOmniXMLv3.pl
|
|
409
405
|
- parscit/bin/sectLabel/redo.sectLabel.pl
|
|
410
|
-
- parscit/bin/sectLabel/simplifyOmniXML.pl
|
|
411
406
|
- parscit/bin/sectLabel/single2multi.pl
|
|
412
407
|
- parscit/bin/sectLabel/tr2crfpp.pl
|
|
413
408
|
- parscit/bin/tr2crfpp.pl
|
|
414
|
-
- parscit/bin/xml2train.pl
|
|
415
409
|
- parscit/lib/CSXUtil/SafeText.pm
|
|
416
|
-
- parscit/lib/Omni/Config.pm
|
|
417
|
-
- parscit/lib/Omni/Omnicell.pm
|
|
418
|
-
- parscit/lib/Omni/Omnicol.pm
|
|
419
|
-
- parscit/lib/Omni/Omnidd.pm
|
|
420
|
-
- parscit/lib/Omni/Omnidoc.pm
|
|
421
|
-
- parscit/lib/Omni/Omniframe.pm
|
|
422
|
-
- parscit/lib/Omni/Omniline.pm
|
|
423
|
-
- parscit/lib/Omni/Omnipage.pm
|
|
424
|
-
- parscit/lib/Omni/Omnipara.pm
|
|
425
|
-
- parscit/lib/Omni/Omnirun.pm
|
|
426
|
-
- parscit/lib/Omni/Omnitable.pm
|
|
427
|
-
- parscit/lib/Omni/Omniword.pm
|
|
428
|
-
- parscit/lib/Omni/Traversal.pm
|
|
429
410
|
- parscit/lib/ParsCit/.PostProcess.pm.swp
|
|
430
411
|
- parscit/lib/ParsCit/Citation.pm
|
|
431
412
|
- parscit/lib/ParsCit/CitationContext.pm
|
|
@@ -439,7 +420,6 @@ files:
|
|
|
439
420
|
- parscit/lib/ParsHed/PostProcess.pm
|
|
440
421
|
- parscit/lib/ParsHed/Tr2crfpp.pm
|
|
441
422
|
- parscit/lib/ParsHed/Tr2crfpp_token.pm
|
|
442
|
-
- parscit/lib/SectLabel/AAMatching.pm
|
|
443
423
|
- parscit/lib/SectLabel/Config.pm
|
|
444
424
|
- parscit/lib/SectLabel/Controller.pm
|
|
445
425
|
- parscit/lib/SectLabel/PostProcess.pm
|
|
@@ -473,7 +453,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
473
453
|
version: '0'
|
|
474
454
|
segments:
|
|
475
455
|
- 0
|
|
476
|
-
hash:
|
|
456
|
+
hash: -2794280872000100021
|
|
477
457
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
478
458
|
none: false
|
|
479
459
|
requirements:
|
|
@@ -482,7 +462,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
482
462
|
version: '0'
|
|
483
463
|
segments:
|
|
484
464
|
- 0
|
|
485
|
-
hash:
|
|
465
|
+
hash: -2794280872000100021
|
|
486
466
|
requirements:
|
|
487
467
|
- For PDFs, Poppler or XPDF (try "which pdftotext")
|
|
488
468
|
- For Postscript files, Ghostscript (try "which ps2ascii")
|