biblicit 2.0.3 → 2.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -119,8 +119,6 @@ More than these might be required; this is what I had to add to my default insta
119
119
 
120
120
  sudo cpan install Digest::SHA1
121
121
  sudo cpan install String::Approx
122
- sudo cpan install XML::Writer::String
123
- sudo cpan install XML::Twig
124
122
 
125
123
  ## Required to use the ParsCit algorithm
126
124
 
data/biblicit.gemspec CHANGED
@@ -5,7 +5,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
5
 
6
6
  Gem::Specification.new do |gem|
7
7
  gem.name = "biblicit"
8
- gem.version = "2.0.3"
8
+ gem.version = "2.0.4"
9
9
  gem.authors = ["David Judd"]
10
10
  gem.email = ["david@academia.edu"]
11
11
  gem.summary = %q{Extract citations from PDFs.}
@@ -41,10 +41,7 @@ use File::Spec;
41
41
  use File::Basename;
42
42
 
43
43
  # Local libraries
44
- use Omni::Omnidoc;
45
- use Omni::Traversal;
46
44
  use ParsCit::Controller;
47
- use SectLabel::AAMatching;
48
45
 
49
46
  # USER customizable section
50
47
  my $tmpfile .= $0;
@@ -195,113 +192,17 @@ if (defined $opt_e && $opt_e ne "")
195
192
  }
196
193
 
197
194
  my $doc = undef;
198
- my $text_file = undef;
199
- # Extracting text from Omnipage XML output
200
- if ($is_xml_input)
201
- {
202
- $text_file = "/tmp/" . NewTmpFile();
203
- my $cmd = $FindBin::Bin . "/sectLabel/processOmniXMLv2.pl -q -in $in -out $text_file -decode";
204
- system($cmd);
205
-
206
- ###
207
- # Huydhn: input is xml from Omnipage
208
- ###
209
- if (! open(IN, "<:utf8", $in)) { return (-1, "Could not open xml file " . $in . ": " . $!); }
210
- my $xml = do { local $/; <IN> };
211
- close IN;
212
-
213
- ###
214
- # Huydhn
215
- # NOTE: the omnipage xml is not well constructed (concatenated multiple xml files).
216
- # This merged xml need to be fixed first before pass it to xml processing libraries, e.g. xml::twig
217
- ###
218
- # Convert to Unix format
219
- $xml =~ s/\r//g;
220
- # Remove <?xml version="1.0" encoding="UTF-8"?>
221
- $xml =~ s/<\?xml.+?>\n//g;
222
- # Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
223
- $xml =~ s/<\!\-\-XML.+?>\n//g;
224
- # Declaration and root
225
- $xml = "<?xml version=\"1.0\"?>" . "\n" . "<root>" . "\n" . $xml . "\n" . "</root>";
226
-
227
- # New document
228
- $doc = new Omni::Omnidoc();
229
- $doc->set_raw($xml);
230
- }
231
- else
232
- {
233
- $text_file = $in;
234
- }
195
+ my $text_file = $in;
235
196
 
236
197
  # SECTLABEL
237
198
  if (($mode & $SECTLABEL) == $SECTLABEL)
238
199
  {
239
200
  my $sect_label_input = $text_file;
240
201
 
241
- # Get XML features and append to $text_file
242
- if ($is_xml_input)
243
- {
244
- my $cmd = $FindBin::Bin . "/sectLabel/processOmniXMLv3.pl -q -in $in -out $text_file.feature -decode";
245
- system($cmd);
246
-
247
- my $address_file = $text_file . ".feature" . ".address";
248
- if (! open(IN, "<:utf8", $address_file)) { return (-1, "Could not open address file " . $address_file . ": " . $!); }
249
-
250
- my @omni_address = ();
251
- # Read the address file provided by process OmniXML script
252
- while (<IN>)
253
- {
254
- chomp;
255
- # Save and split the line
256
- my $line = $_;
257
- my @element = split(/\s+/, $line);
258
-
259
- my %addr = ();
260
- # Address
261
- $addr{ 'L1' } = $element[ 0 ];
262
- $addr{ 'L2' } = $element[ 1 ];
263
- $addr{ 'L3' } = $element[ 2 ];
264
- $addr{ 'L4' } = $element[ 3 ];
265
-
266
- # Save the address
267
- push @omni_address, { %addr };
268
- }
269
- close IN;
270
- unlink($address_file);
271
-
272
- $sect_label_input .= ".feature";
273
- my ($sl_xml, $aut_lines, $aff_lines) = SectLabel($sect_label_input, $is_xml_input, 0);
274
-
275
- # Remove first line <?xml/>
276
- $rxml .= RemoveTopLines($sl_xml, 1) . "\n";
277
-
278
- # Only run author - affiliation if "something" is provided
279
- if ($opt_a)
280
- {
281
- my @aut_addrs = ();
282
- my @aff_addrs = ();
283
- # Address of author section
284
- for my $lindex (@{ $aut_lines }) { push @aut_addrs, $omni_address[ $lindex ]; }
285
- # Address of affiliation section
286
- for my $lindex (@{ $aff_lines }) { push @aff_addrs, $omni_address[ $lindex ]; }
287
-
288
- # The tarpit
289
- my $aa_xml = SectLabel::AAMatching::AAMatching($doc, \@aut_addrs, \@aff_addrs);
290
-
291
- # Author-Affiliation Matching result
292
- $rxml .= $aa_xml . "\n";
293
- }
294
-
295
- # Remove XML feature file
296
- unlink($sect_label_input);
297
- }
298
- else
299
- {
300
- my ($sl_xml, $aut_lines, $aff_lines) = SectLabel($sect_label_input, $is_xml_input, 0);
202
+ my ($sl_xml, $aut_lines, $aff_lines) = SectLabel($sect_label_input, $is_xml_input, 0);
301
203
 
302
- # Remove first line <?xml/>
303
- $rxml .= RemoveTopLines($sl_xml, 1) . "\n";
304
- }
204
+ # Remove first line <?xml/>
205
+ $rxml .= RemoveTopLines($sl_xml, 1) . "\n";
305
206
  }
306
207
 
307
208
  # PARSHED
@@ -318,66 +219,13 @@ if (($mode & $PARSHED) == $PARSHED)
318
219
  # PARSCIT
319
220
  if (($mode & $PARSCIT) == $PARSCIT)
320
221
  {
321
- if ($is_xml_input)
322
- {
323
- my $cmd = $FindBin::Bin . "/sectLabel/processOmniXMLv3.pl -q -in $in -out $text_file.feature -decode";
324
- system($cmd);
325
-
326
- my $address_file = $text_file . ".feature" . ".address";
327
- if (! open(IN, "<:utf8", $address_file)) { return (-1, "Could not open address file " . $address_file . ": " . $!); }
328
-
329
- my @omni_address = ();
330
- # Read the address file provided by process OmniXML script
331
- while (<IN>)
332
- {
333
- chomp;
334
- # Save and split the line
335
- my $line = $_;
336
- my @element = split(/\s+/, $line);
337
-
338
- my %addr = ();
339
- # Address
340
- $addr{ 'L1' } = $element[ 0 ];
341
- $addr{ 'L2' } = $element[ 1 ];
342
- $addr{ 'L3' } = $element[ 2 ];
343
- $addr{ 'L4' } = $element[ 3 ];
344
-
345
- # Save the address
346
- push @omni_address, { %addr };
347
- }
348
- close IN;
349
- unlink($address_file);
350
-
351
- my $sect_label_input = $text_file . ".feature";
352
- # Output of sectlabel becomes input for parscit
353
- my ($all_text, $cit_lines) = SectLabel($sect_label_input, $is_xml_input, 1);
354
- # Remove XML feature file
355
- unlink($sect_label_input);
356
-
357
- my @cit_addrs = ();
358
- # Address of reference section
359
- for my $lindex (@{ $cit_lines }) { push @cit_addrs, $omni_address[ $lindex ]; }
360
-
361
- my $pc_xml = undef;
362
- # Huydhn: add xml features to parscit in case of unmarked reference
363
- $pc_xml = ParsCit::Controller::ExtractCitations2(\$all_text, $cit_lines, $is_xml_input, $doc, \@cit_addrs);
364
-
365
- # Remove first line <?xml/>
366
- $rxml .= RemoveTopLines($$pc_xml, 1) . "\n";
222
+ my $pc_xml = ParsCit::Controller::ExtractCitations($text_file, $in, $is_xml_input);
367
223
 
368
- # Thang v100901: call to BiblioScript
369
- if (scalar(@export_types) != 0) { BiblioScript(\@export_types, $$pc_xml, $out); }
370
- }
371
- else
372
- {
373
- my $pc_xml = ParsCit::Controller::ExtractCitations($text_file, $in, $is_xml_input);
374
-
375
- # Remove first line <?xml/>
376
- $rxml .= RemoveTopLines($$pc_xml, 1) . "\n";
224
+ # Remove first line <?xml/>
225
+ $rxml .= RemoveTopLines($$pc_xml, 1) . "\n";
377
226
 
378
- # Thang v100901: call to BiblioScript
379
- if (scalar(@export_types) != 0) { BiblioScript(\@export_types, $$pc_xml, $out); }
380
- }
227
+ # Thang v100901: call to BiblioScript
228
+ if (scalar(@export_types) != 0) { BiblioScript(\@export_types, $$pc_xml, $out); }
381
229
  }
382
230
 
383
231
  $rxml .= "</algorithms>";
@@ -82,16 +82,6 @@ $modelFile = "$path/../$modelFile";
82
82
  my $configFile = $isXmlInput ? $SectLabel::Config::configXmlFile : $SectLabel::Config::configFile;
83
83
  $configFile = "$path/../$configFile";
84
84
 
85
- if($isXmlInput){
86
- my $xmlInFile = newTmpFile();
87
- $xmlInFile = untaintPath($xmlInFile);
88
- my $cmd = "$path/sectLabel/";
89
- $cmd .= ($isNew) ? "processOmniXMLv2.pl" : "processOmniXML.pl";
90
- $cmd .= " -in $inFile -out $xmlInFile -xmlFeature -decode";
91
- execute($cmd);
92
- $inFile = $xmlInFile;
93
- }
94
-
95
85
  my $dictFile = $SectLabel::Config::dictFile;
96
86
  $dictFile = "$path/../$dictFile";
97
87
 
@@ -99,10 +89,6 @@ my $funcFile = $SectLabel::Config::funcFile;
99
89
  $funcFile = "$path/../$funcFile";
100
90
  my $rXML = SectLabel::Controller::extractSection($inFile, $isXmlOutput, $modelFile, $dictFile, $funcFile, $configFile, $isXmlInput, $isDebug);
101
91
 
102
- if($isXmlInput){
103
- unlink($inFile);
104
- }
105
-
106
92
  if (defined $outFile) {
107
93
  $outFile = untaintPath($outFile);
108
94
 
@@ -21,8 +21,6 @@ use ParsCit::Tr2crfpp;
21
21
  use ParsCit::PreProcess;
22
22
  use ParsCit::PostProcess;
23
23
  use ParsCit::CitationContext;
24
- # Omnipage libraries
25
- use Omni::Omnidoc;
26
24
  # Dependencies
27
25
  use CSXUtil::SafeText qw(cleanXML);
28
26
 
@@ -228,62 +226,6 @@ sub ExtractCitationsImpl
228
226
  # Reference to an array of single reference
229
227
  my $rraw_citations = undef;
230
228
 
231
- # Find and separate reference
232
- if ($is_xml)
233
- {
234
- ###
235
- # Huydhn: input is xml from Omnipage
236
- ###
237
- if (! open(IN, "<:utf8", $orgfile)) { return (-1, "Could not open xml file " . $orgfile . ": " . $!); }
238
- my $xml = do { local $/; <IN> };
239
- close IN;
240
-
241
- ###
242
- # Huydhn
243
- # NOTE: the omnipage xml is not well constructed (concatenated multiple xml files).
244
- # This merged xml need to be fixed first before pass it to xml processing libraries, e.g. xml::twig
245
- ###
246
- # Convert to Unix format
247
- $xml =~ s/\r//g;
248
- # Remove <?xml version="1.0" encoding="UTF-8"?>
249
- $xml =~ s/<\?xml.+?>\n//g;
250
- # Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
251
- $xml =~ s/<\!\-\-XML.+?>\n//g;
252
- # Declaration and root
253
- $xml = "<?xml version=\"1.0\"?>" . "\n" . "<root>" . "\n" . $xml . "\n" . "</root>";
254
-
255
- # New document
256
- my $doc = new Omni::Omnidoc();
257
- $doc->set_raw($xml);
258
-
259
- # Extract the reference portion from the XML
260
- my ($start_ref, $end_ref, $rcite_text_from_xml, $rcit_addrs) = ParsCit::PreProcess::FindCitationTextXML($doc);
261
-
262
- # Extract the reference portion from the text.
263
- # TODO: NEED TO BE REMOVED FROM HERE
264
- my $content = $doc->get_content();
265
- ($rcite_text, $rnorm_body_text, $rbody_text) = ParsCit::PreProcess::FindCitationText(\$content, \@pos_array);
266
-
267
- my @norm_body_tokens = split(/\s+/, $$rnorm_body_text);
268
- my @body_tokens = split(/\s+/, $$rbody_text);
269
-
270
- my $size = scalar(@norm_body_tokens);
271
- my $size1 = scalar(@pos_array);
272
-
273
- if($size != $size1) { die "ParsCit::Controller::extractCitationsImpl: normBodyText size $size != posArray size $size1\n"; }
274
- # TODO: TO HERE
275
-
276
- # Filename initialization
277
- if ($bwrite_split > 0) { ($citefile, $bodyfile) = WriteSplit($textfile, $rcite_text_from_xml, $rbody_text); }
278
-
279
- # Prepare to split unmarked reference portion
280
- my $tmp_file = ParsCit::Tr2crfpp::PrepDataUnmarked($doc, $rcit_addrs);
281
-
282
- # Extract citations from citation text
283
- $rraw_citations = ParsCit::PreProcess::SegmentCitationsXML($rcite_text_from_xml, $tmp_file);
284
- }
285
- else
286
- {
287
229
  if (! open(IN, "<:utf8", $textfile)) { return (-1, "Could not open text file " . $textfile . ": " . $!); }
288
230
  my $text = do { local $/; <IN> };
289
231
  close IN;
@@ -309,7 +251,6 @@ sub ExtractCitationsImpl
309
251
 
310
252
  # Extract citations from citation text
311
253
  $rraw_citations = ParsCit::PreProcess::SegmentCitations($rcite_text);
312
- }
313
254
 
314
255
  my @citations = ();
315
256
  my @valid_citations = ();
@@ -11,7 +11,6 @@ package ParsCit::PreProcess;
11
11
  use utf8;
12
12
  use strict;
13
13
 
14
- use Omni::Config;
15
14
  use ParsCit::Citation;
16
15
 
17
16
  my %marker_types = ( 'SQUARE' => '\\[.+?\\]',
@@ -22,9 +21,6 @@ my %marker_types = ( 'SQUARE' => '\\[.+?\\]',
22
21
  #'NAKEDNUMDOT' => '\\d{1,3}\\.' # Modified by Artemy Kolchinsky (v090625)
23
22
  );
24
23
 
25
- # Omnilib configuration: object name
26
- my $obj_list = $Omni::Config::obj_list;
27
-
28
24
  ###
29
25
  # Huydhn: similar to findCitationText, find the citation portion using regular expression.
30
26
  # However the input is an omnipage xml document object, not the raw text
@@ -15,7 +15,6 @@ use strict 'vars';
15
15
  use FindBin;
16
16
  use Encode ();
17
17
 
18
- use Omni::Config;
19
18
  use ParsCit::Config;
20
19
 
21
20
  ### USER customizable section
@@ -38,8 +37,6 @@ $split_model_file = "$FindBin::Bin/../$split_model_file";
38
37
  # Huydhn: don't know its function
39
38
  ###
40
39
  my %dict = ();
41
- # Omnilib configuration: object name
42
- my $obj_list = $Omni::Config::obj_list;
43
40
 
44
41
  ###
45
42
  # Huydhn: prepare data for trfpp, segmenting unmarked reference
@@ -124,7 +121,7 @@ sub PrepDataUnmarked
124
121
  # Trim line
125
122
  $ln =~ s/^\s+|\s+$//g;
126
123
  # Skip blank lines
127
- if (($ln =~ m/^\s*$/) || ($lines->[ $t ]->get_name() ne $obj_list->{ 'OMNILINE' }))
124
+ if (($ln =~ m/^\s*$/))
128
125
  {
129
126
  $addr_index++;
130
127
  next;
@@ -355,7 +352,6 @@ sub PrepDataUnmarked
355
352
  # XML features
356
353
  # Bullet
357
354
  my $bullet = undef;
358
- if ($lines->[ $t ]->get_name() eq $obj_list->{ 'OMNILINE' }) { $bullet = $lines->[ $t ]->get_bullet(); }
359
355
  if ((defined $bullet) && ($bullet eq 'true'))
360
356
  {
361
357
  push @feats, 'xmlBullet_yes';
@@ -368,7 +364,6 @@ sub PrepDataUnmarked
368
364
 
369
365
  # First word format: bold, italic, font size
370
366
  my $xml_runs = undef;
371
- if (($lines->[ $t ]->get_name() eq $obj_list->{ 'OMNILINE' })) { $xml_runs = $lines->[ $t ]->get_objs_ref(); }
372
367
 
373
368
  # First word format: bold
374
369
  my $bold = undef;
@@ -415,7 +410,6 @@ sub PrepDataUnmarked
415
410
 
416
411
  # First word format: starting point, left alignment
417
412
  my $start_point = undef;
418
- if (($lines->[ $t ]->get_name() eq $obj_list->{ 'OMNILINE' })) { $start_point = $lines->[ $t ]->get_left_pos(); }
419
413
  if ((defined $start_point) && ($start_point > $avg_start_point * $start_upper_ratio))
420
414
  {
421
415
  push @feats, 'xmlBeginLine_right';
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: biblicit
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.3
4
+ version: 2.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-28 00:00:00.000000000 Z
12
+ date: 2013-03-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport
@@ -402,30 +402,11 @@ files:
402
402
  - parscit/bin/sectLabel/genericSect/extractFeature.rb
403
403
  - parscit/bin/sectLabel/genericSectExtract.rb
404
404
  - parscit/bin/sectLabel/getStructureInfo.pl
405
- - parscit/bin/sectLabel/processOmniXML.pl
406
- - parscit/bin/sectLabel/processOmniXML_new.pl
407
- - parscit/bin/sectLabel/processOmniXMLv2.pl
408
- - parscit/bin/sectLabel/processOmniXMLv3.pl
409
405
  - parscit/bin/sectLabel/redo.sectLabel.pl
410
- - parscit/bin/sectLabel/simplifyOmniXML.pl
411
406
  - parscit/bin/sectLabel/single2multi.pl
412
407
  - parscit/bin/sectLabel/tr2crfpp.pl
413
408
  - parscit/bin/tr2crfpp.pl
414
- - parscit/bin/xml2train.pl
415
409
  - parscit/lib/CSXUtil/SafeText.pm
416
- - parscit/lib/Omni/Config.pm
417
- - parscit/lib/Omni/Omnicell.pm
418
- - parscit/lib/Omni/Omnicol.pm
419
- - parscit/lib/Omni/Omnidd.pm
420
- - parscit/lib/Omni/Omnidoc.pm
421
- - parscit/lib/Omni/Omniframe.pm
422
- - parscit/lib/Omni/Omniline.pm
423
- - parscit/lib/Omni/Omnipage.pm
424
- - parscit/lib/Omni/Omnipara.pm
425
- - parscit/lib/Omni/Omnirun.pm
426
- - parscit/lib/Omni/Omnitable.pm
427
- - parscit/lib/Omni/Omniword.pm
428
- - parscit/lib/Omni/Traversal.pm
429
410
  - parscit/lib/ParsCit/.PostProcess.pm.swp
430
411
  - parscit/lib/ParsCit/Citation.pm
431
412
  - parscit/lib/ParsCit/CitationContext.pm
@@ -439,7 +420,6 @@ files:
439
420
  - parscit/lib/ParsHed/PostProcess.pm
440
421
  - parscit/lib/ParsHed/Tr2crfpp.pm
441
422
  - parscit/lib/ParsHed/Tr2crfpp_token.pm
442
- - parscit/lib/SectLabel/AAMatching.pm
443
423
  - parscit/lib/SectLabel/Config.pm
444
424
  - parscit/lib/SectLabel/Controller.pm
445
425
  - parscit/lib/SectLabel/PostProcess.pm
@@ -473,7 +453,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
473
453
  version: '0'
474
454
  segments:
475
455
  - 0
476
- hash: 2279876521491945710
456
+ hash: -2794280872000100021
477
457
  required_rubygems_version: !ruby/object:Gem::Requirement
478
458
  none: false
479
459
  requirements:
@@ -482,7 +462,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
482
462
  version: '0'
483
463
  segments:
484
464
  - 0
485
- hash: 2279876521491945710
465
+ hash: -2794280872000100021
486
466
  requirements:
487
467
  - For PDFs, Poppler or XPDF (try "which pdftotext")
488
468
  - For Postscript files, Ghostscript (try "which ps2ascii")