biblicit 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -119,8 +119,6 @@ More than these might be required; this is what I had to add to my default insta
119
119
 
120
120
  sudo cpan install Digest::SHA1
121
121
  sudo cpan install String::Approx
122
- sudo cpan install XML::Writer::String
123
- sudo cpan install XML::Twig
124
122
 
125
123
  ## Required to use the ParsCit algorithm
126
124
 
data/biblicit.gemspec CHANGED
@@ -5,7 +5,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
5
 
6
6
  Gem::Specification.new do |gem|
7
7
  gem.name = "biblicit"
8
- gem.version = "2.0.3"
8
+ gem.version = "2.0.4"
9
9
  gem.authors = ["David Judd"]
10
10
  gem.email = ["david@academia.edu"]
11
11
  gem.summary = %q{Extract citations from PDFs.}
@@ -41,10 +41,7 @@ use File::Spec;
41
41
  use File::Basename;
42
42
 
43
43
  # Local libraries
44
- use Omni::Omnidoc;
45
- use Omni::Traversal;
46
44
  use ParsCit::Controller;
47
- use SectLabel::AAMatching;
48
45
 
49
46
  # USER customizable section
50
47
  my $tmpfile .= $0;
@@ -195,113 +192,17 @@ if (defined $opt_e && $opt_e ne "")
195
192
  }
196
193
 
197
194
  my $doc = undef;
198
- my $text_file = undef;
199
- # Extracting text from Omnipage XML output
200
- if ($is_xml_input)
201
- {
202
- $text_file = "/tmp/" . NewTmpFile();
203
- my $cmd = $FindBin::Bin . "/sectLabel/processOmniXMLv2.pl -q -in $in -out $text_file -decode";
204
- system($cmd);
205
-
206
- ###
207
- # Huydhn: input is xml from Omnipage
208
- ###
209
- if (! open(IN, "<:utf8", $in)) { return (-1, "Could not open xml file " . $in . ": " . $!); }
210
- my $xml = do { local $/; <IN> };
211
- close IN;
212
-
213
- ###
214
- # Huydhn
215
- # NOTE: the omnipage xml is not well constructed (concatenated multiple xml files).
216
- # This merged xml need to be fixed first before pass it to xml processing libraries, e.g. xml::twig
217
- ###
218
- # Convert to Unix format
219
- $xml =~ s/\r//g;
220
- # Remove <?xml version="1.0" encoding="UTF-8"?>
221
- $xml =~ s/<\?xml.+?>\n//g;
222
- # Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
223
- $xml =~ s/<\!\-\-XML.+?>\n//g;
224
- # Declaration and root
225
- $xml = "<?xml version=\"1.0\"?>" . "\n" . "<root>" . "\n" . $xml . "\n" . "</root>";
226
-
227
- # New document
228
- $doc = new Omni::Omnidoc();
229
- $doc->set_raw($xml);
230
- }
231
- else
232
- {
233
- $text_file = $in;
234
- }
195
+ my $text_file = $in;
235
196
 
236
197
  # SECTLABEL
237
198
  if (($mode & $SECTLABEL) == $SECTLABEL)
238
199
  {
239
200
  my $sect_label_input = $text_file;
240
201
 
241
- # Get XML features and append to $text_file
242
- if ($is_xml_input)
243
- {
244
- my $cmd = $FindBin::Bin . "/sectLabel/processOmniXMLv3.pl -q -in $in -out $text_file.feature -decode";
245
- system($cmd);
246
-
247
- my $address_file = $text_file . ".feature" . ".address";
248
- if (! open(IN, "<:utf8", $address_file)) { return (-1, "Could not open address file " . $address_file . ": " . $!); }
249
-
250
- my @omni_address = ();
251
- # Read the address file provided by process OmniXML script
252
- while (<IN>)
253
- {
254
- chomp;
255
- # Save and split the line
256
- my $line = $_;
257
- my @element = split(/\s+/, $line);
258
-
259
- my %addr = ();
260
- # Address
261
- $addr{ 'L1' } = $element[ 0 ];
262
- $addr{ 'L2' } = $element[ 1 ];
263
- $addr{ 'L3' } = $element[ 2 ];
264
- $addr{ 'L4' } = $element[ 3 ];
265
-
266
- # Save the address
267
- push @omni_address, { %addr };
268
- }
269
- close IN;
270
- unlink($address_file);
271
-
272
- $sect_label_input .= ".feature";
273
- my ($sl_xml, $aut_lines, $aff_lines) = SectLabel($sect_label_input, $is_xml_input, 0);
274
-
275
- # Remove first line <?xml/>
276
- $rxml .= RemoveTopLines($sl_xml, 1) . "\n";
277
-
278
- # Only run author - affiliation if "something" is provided
279
- if ($opt_a)
280
- {
281
- my @aut_addrs = ();
282
- my @aff_addrs = ();
283
- # Address of author section
284
- for my $lindex (@{ $aut_lines }) { push @aut_addrs, $omni_address[ $lindex ]; }
285
- # Address of affiliation section
286
- for my $lindex (@{ $aff_lines }) { push @aff_addrs, $omni_address[ $lindex ]; }
287
-
288
- # The tarpit
289
- my $aa_xml = SectLabel::AAMatching::AAMatching($doc, \@aut_addrs, \@aff_addrs);
290
-
291
- # Author-Affiliation Matching result
292
- $rxml .= $aa_xml . "\n";
293
- }
294
-
295
- # Remove XML feature file
296
- unlink($sect_label_input);
297
- }
298
- else
299
- {
300
- my ($sl_xml, $aut_lines, $aff_lines) = SectLabel($sect_label_input, $is_xml_input, 0);
202
+ my ($sl_xml, $aut_lines, $aff_lines) = SectLabel($sect_label_input, $is_xml_input, 0);
301
203
 
302
- # Remove first line <?xml/>
303
- $rxml .= RemoveTopLines($sl_xml, 1) . "\n";
304
- }
204
+ # Remove first line <?xml/>
205
+ $rxml .= RemoveTopLines($sl_xml, 1) . "\n";
305
206
  }
306
207
 
307
208
  # PARSHED
@@ -318,66 +219,13 @@ if (($mode & $PARSHED) == $PARSHED)
318
219
  # PARSCIT
319
220
  if (($mode & $PARSCIT) == $PARSCIT)
320
221
  {
321
- if ($is_xml_input)
322
- {
323
- my $cmd = $FindBin::Bin . "/sectLabel/processOmniXMLv3.pl -q -in $in -out $text_file.feature -decode";
324
- system($cmd);
325
-
326
- my $address_file = $text_file . ".feature" . ".address";
327
- if (! open(IN, "<:utf8", $address_file)) { return (-1, "Could not open address file " . $address_file . ": " . $!); }
328
-
329
- my @omni_address = ();
330
- # Read the address file provided by process OmniXML script
331
- while (<IN>)
332
- {
333
- chomp;
334
- # Save and split the line
335
- my $line = $_;
336
- my @element = split(/\s+/, $line);
337
-
338
- my %addr = ();
339
- # Address
340
- $addr{ 'L1' } = $element[ 0 ];
341
- $addr{ 'L2' } = $element[ 1 ];
342
- $addr{ 'L3' } = $element[ 2 ];
343
- $addr{ 'L4' } = $element[ 3 ];
344
-
345
- # Save the address
346
- push @omni_address, { %addr };
347
- }
348
- close IN;
349
- unlink($address_file);
350
-
351
- my $sect_label_input = $text_file . ".feature";
352
- # Output of sectlabel becomes input for parscit
353
- my ($all_text, $cit_lines) = SectLabel($sect_label_input, $is_xml_input, 1);
354
- # Remove XML feature file
355
- unlink($sect_label_input);
356
-
357
- my @cit_addrs = ();
358
- # Address of reference section
359
- for my $lindex (@{ $cit_lines }) { push @cit_addrs, $omni_address[ $lindex ]; }
360
-
361
- my $pc_xml = undef;
362
- # Huydhn: add xml features to parscit in case of unmarked reference
363
- $pc_xml = ParsCit::Controller::ExtractCitations2(\$all_text, $cit_lines, $is_xml_input, $doc, \@cit_addrs);
364
-
365
- # Remove first line <?xml/>
366
- $rxml .= RemoveTopLines($$pc_xml, 1) . "\n";
222
+ my $pc_xml = ParsCit::Controller::ExtractCitations($text_file, $in, $is_xml_input);
367
223
 
368
- # Thang v100901: call to BiblioScript
369
- if (scalar(@export_types) != 0) { BiblioScript(\@export_types, $$pc_xml, $out); }
370
- }
371
- else
372
- {
373
- my $pc_xml = ParsCit::Controller::ExtractCitations($text_file, $in, $is_xml_input);
374
-
375
- # Remove first line <?xml/>
376
- $rxml .= RemoveTopLines($$pc_xml, 1) . "\n";
224
+ # Remove first line <?xml/>
225
+ $rxml .= RemoveTopLines($$pc_xml, 1) . "\n";
377
226
 
378
- # Thang v100901: call to BiblioScript
379
- if (scalar(@export_types) != 0) { BiblioScript(\@export_types, $$pc_xml, $out); }
380
- }
227
+ # Thang v100901: call to BiblioScript
228
+ if (scalar(@export_types) != 0) { BiblioScript(\@export_types, $$pc_xml, $out); }
381
229
  }
382
230
 
383
231
  $rxml .= "</algorithms>";
@@ -82,16 +82,6 @@ $modelFile = "$path/../$modelFile";
82
82
  my $configFile = $isXmlInput ? $SectLabel::Config::configXmlFile : $SectLabel::Config::configFile;
83
83
  $configFile = "$path/../$configFile";
84
84
 
85
- if($isXmlInput){
86
- my $xmlInFile = newTmpFile();
87
- $xmlInFile = untaintPath($xmlInFile);
88
- my $cmd = "$path/sectLabel/";
89
- $cmd .= ($isNew) ? "processOmniXMLv2.pl" : "processOmniXML.pl";
90
- $cmd .= " -in $inFile -out $xmlInFile -xmlFeature -decode";
91
- execute($cmd);
92
- $inFile = $xmlInFile;
93
- }
94
-
95
85
  my $dictFile = $SectLabel::Config::dictFile;
96
86
  $dictFile = "$path/../$dictFile";
97
87
 
@@ -99,10 +89,6 @@ my $funcFile = $SectLabel::Config::funcFile;
99
89
  $funcFile = "$path/../$funcFile";
100
90
  my $rXML = SectLabel::Controller::extractSection($inFile, $isXmlOutput, $modelFile, $dictFile, $funcFile, $configFile, $isXmlInput, $isDebug);
101
91
 
102
- if($isXmlInput){
103
- unlink($inFile);
104
- }
105
-
106
92
  if (defined $outFile) {
107
93
  $outFile = untaintPath($outFile);
108
94
 
@@ -21,8 +21,6 @@ use ParsCit::Tr2crfpp;
21
21
  use ParsCit::PreProcess;
22
22
  use ParsCit::PostProcess;
23
23
  use ParsCit::CitationContext;
24
- # Omnipage libraries
25
- use Omni::Omnidoc;
26
24
  # Dependencies
27
25
  use CSXUtil::SafeText qw(cleanXML);
28
26
 
@@ -228,62 +226,6 @@ sub ExtractCitationsImpl
228
226
  # Reference to an array of single reference
229
227
  my $rraw_citations = undef;
230
228
 
231
- # Find and separate reference
232
- if ($is_xml)
233
- {
234
- ###
235
- # Huydhn: input is xml from Omnipage
236
- ###
237
- if (! open(IN, "<:utf8", $orgfile)) { return (-1, "Could not open xml file " . $orgfile . ": " . $!); }
238
- my $xml = do { local $/; <IN> };
239
- close IN;
240
-
241
- ###
242
- # Huydhn
243
- # NOTE: the omnipage xml is not well constructed (concatenated multiple xml files).
244
- # This merged xml need to be fixed first before pass it to xml processing libraries, e.g. xml::twig
245
- ###
246
- # Convert to Unix format
247
- $xml =~ s/\r//g;
248
- # Remove <?xml version="1.0" encoding="UTF-8"?>
249
- $xml =~ s/<\?xml.+?>\n//g;
250
- # Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
251
- $xml =~ s/<\!\-\-XML.+?>\n//g;
252
- # Declaration and root
253
- $xml = "<?xml version=\"1.0\"?>" . "\n" . "<root>" . "\n" . $xml . "\n" . "</root>";
254
-
255
- # New document
256
- my $doc = new Omni::Omnidoc();
257
- $doc->set_raw($xml);
258
-
259
- # Extract the reference portion from the XML
260
- my ($start_ref, $end_ref, $rcite_text_from_xml, $rcit_addrs) = ParsCit::PreProcess::FindCitationTextXML($doc);
261
-
262
- # Extract the reference portion from the text.
263
- # TODO: NEED TO BE REMOVED FROM HERE
264
- my $content = $doc->get_content();
265
- ($rcite_text, $rnorm_body_text, $rbody_text) = ParsCit::PreProcess::FindCitationText(\$content, \@pos_array);
266
-
267
- my @norm_body_tokens = split(/\s+/, $$rnorm_body_text);
268
- my @body_tokens = split(/\s+/, $$rbody_text);
269
-
270
- my $size = scalar(@norm_body_tokens);
271
- my $size1 = scalar(@pos_array);
272
-
273
- if($size != $size1) { die "ParsCit::Controller::extractCitationsImpl: normBodyText size $size != posArray size $size1\n"; }
274
- # TODO: TO HERE
275
-
276
- # Filename initialization
277
- if ($bwrite_split > 0) { ($citefile, $bodyfile) = WriteSplit($textfile, $rcite_text_from_xml, $rbody_text); }
278
-
279
- # Prepare to split unmarked reference portion
280
- my $tmp_file = ParsCit::Tr2crfpp::PrepDataUnmarked($doc, $rcit_addrs);
281
-
282
- # Extract citations from citation text
283
- $rraw_citations = ParsCit::PreProcess::SegmentCitationsXML($rcite_text_from_xml, $tmp_file);
284
- }
285
- else
286
- {
287
229
  if (! open(IN, "<:utf8", $textfile)) { return (-1, "Could not open text file " . $textfile . ": " . $!); }
288
230
  my $text = do { local $/; <IN> };
289
231
  close IN;
@@ -309,7 +251,6 @@ sub ExtractCitationsImpl
309
251
 
310
252
  # Extract citations from citation text
311
253
  $rraw_citations = ParsCit::PreProcess::SegmentCitations($rcite_text);
312
- }
313
254
 
314
255
  my @citations = ();
315
256
  my @valid_citations = ();
@@ -11,7 +11,6 @@ package ParsCit::PreProcess;
11
11
  use utf8;
12
12
  use strict;
13
13
 
14
- use Omni::Config;
15
14
  use ParsCit::Citation;
16
15
 
17
16
  my %marker_types = ( 'SQUARE' => '\\[.+?\\]',
@@ -22,9 +21,6 @@ my %marker_types = ( 'SQUARE' => '\\[.+?\\]',
22
21
  #'NAKEDNUMDOT' => '\\d{1,3}\\.' # Modified by Artemy Kolchinsky (v090625)
23
22
  );
24
23
 
25
- # Omnilib configuration: object name
26
- my $obj_list = $Omni::Config::obj_list;
27
-
28
24
  ###
29
25
  # Huydhn: similar to findCitationText, find the citation portion using regular expression.
30
26
  # However the input is an omnipage xml document object, not the raw text
@@ -15,7 +15,6 @@ use strict 'vars';
15
15
  use FindBin;
16
16
  use Encode ();
17
17
 
18
- use Omni::Config;
19
18
  use ParsCit::Config;
20
19
 
21
20
  ### USER customizable section
@@ -38,8 +37,6 @@ $split_model_file = "$FindBin::Bin/../$split_model_file";
38
37
  # Huydhn: don't know its function
39
38
  ###
40
39
  my %dict = ();
41
- # Omnilib configuration: object name
42
- my $obj_list = $Omni::Config::obj_list;
43
40
 
44
41
  ###
45
42
  # Huydhn: prepare data for trfpp, segmenting unmarked reference
@@ -124,7 +121,7 @@ sub PrepDataUnmarked
124
121
  # Trim line
125
122
  $ln =~ s/^\s+|\s+$//g;
126
123
  # Skip blank lines
127
- if (($ln =~ m/^\s*$/) || ($lines->[ $t ]->get_name() ne $obj_list->{ 'OMNILINE' }))
124
+ if (($ln =~ m/^\s*$/))
128
125
  {
129
126
  $addr_index++;
130
127
  next;
@@ -355,7 +352,6 @@ sub PrepDataUnmarked
355
352
  # XML features
356
353
  # Bullet
357
354
  my $bullet = undef;
358
- if ($lines->[ $t ]->get_name() eq $obj_list->{ 'OMNILINE' }) { $bullet = $lines->[ $t ]->get_bullet(); }
359
355
  if ((defined $bullet) && ($bullet eq 'true'))
360
356
  {
361
357
  push @feats, 'xmlBullet_yes';
@@ -368,7 +364,6 @@ sub PrepDataUnmarked
368
364
 
369
365
  # First word format: bold, italic, font size
370
366
  my $xml_runs = undef;
371
- if (($lines->[ $t ]->get_name() eq $obj_list->{ 'OMNILINE' })) { $xml_runs = $lines->[ $t ]->get_objs_ref(); }
372
367
 
373
368
  # First word format: bold
374
369
  my $bold = undef;
@@ -415,7 +410,6 @@ sub PrepDataUnmarked
415
410
 
416
411
  # First word format: starting point, left alignment
417
412
  my $start_point = undef;
418
- if (($lines->[ $t ]->get_name() eq $obj_list->{ 'OMNILINE' })) { $start_point = $lines->[ $t ]->get_left_pos(); }
419
413
  if ((defined $start_point) && ($start_point > $avg_start_point * $start_upper_ratio))
420
414
  {
421
415
  push @feats, 'xmlBeginLine_right';
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: biblicit
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.3
4
+ version: 2.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-28 00:00:00.000000000 Z
12
+ date: 2013-03-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport
@@ -402,30 +402,11 @@ files:
402
402
  - parscit/bin/sectLabel/genericSect/extractFeature.rb
403
403
  - parscit/bin/sectLabel/genericSectExtract.rb
404
404
  - parscit/bin/sectLabel/getStructureInfo.pl
405
- - parscit/bin/sectLabel/processOmniXML.pl
406
- - parscit/bin/sectLabel/processOmniXML_new.pl
407
- - parscit/bin/sectLabel/processOmniXMLv2.pl
408
- - parscit/bin/sectLabel/processOmniXMLv3.pl
409
405
  - parscit/bin/sectLabel/redo.sectLabel.pl
410
- - parscit/bin/sectLabel/simplifyOmniXML.pl
411
406
  - parscit/bin/sectLabel/single2multi.pl
412
407
  - parscit/bin/sectLabel/tr2crfpp.pl
413
408
  - parscit/bin/tr2crfpp.pl
414
- - parscit/bin/xml2train.pl
415
409
  - parscit/lib/CSXUtil/SafeText.pm
416
- - parscit/lib/Omni/Config.pm
417
- - parscit/lib/Omni/Omnicell.pm
418
- - parscit/lib/Omni/Omnicol.pm
419
- - parscit/lib/Omni/Omnidd.pm
420
- - parscit/lib/Omni/Omnidoc.pm
421
- - parscit/lib/Omni/Omniframe.pm
422
- - parscit/lib/Omni/Omniline.pm
423
- - parscit/lib/Omni/Omnipage.pm
424
- - parscit/lib/Omni/Omnipara.pm
425
- - parscit/lib/Omni/Omnirun.pm
426
- - parscit/lib/Omni/Omnitable.pm
427
- - parscit/lib/Omni/Omniword.pm
428
- - parscit/lib/Omni/Traversal.pm
429
410
  - parscit/lib/ParsCit/.PostProcess.pm.swp
430
411
  - parscit/lib/ParsCit/Citation.pm
431
412
  - parscit/lib/ParsCit/CitationContext.pm
@@ -439,7 +420,6 @@ files:
439
420
  - parscit/lib/ParsHed/PostProcess.pm
440
421
  - parscit/lib/ParsHed/Tr2crfpp.pm
441
422
  - parscit/lib/ParsHed/Tr2crfpp_token.pm
442
- - parscit/lib/SectLabel/AAMatching.pm
443
423
  - parscit/lib/SectLabel/Config.pm
444
424
  - parscit/lib/SectLabel/Controller.pm
445
425
  - parscit/lib/SectLabel/PostProcess.pm
@@ -473,7 +453,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
473
453
  version: '0'
474
454
  segments:
475
455
  - 0
476
- hash: 2279876521491945710
456
+ hash: -2794280872000100021
477
457
  required_rubygems_version: !ruby/object:Gem::Requirement
478
458
  none: false
479
459
  requirements:
@@ -482,7 +462,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
482
462
  version: '0'
483
463
  segments:
484
464
  - 0
485
- hash: 2279876521491945710
465
+ hash: -2794280872000100021
486
466
  requirements:
487
467
  - For PDFs, Poppler or XPDF (try "which pdftotext")
488
468
  - For Postscript files, Ghostscript (try "which ps2ascii")