RubyGems - biblicit - Versions diffs - 2.0.3 → 2.0.4 - Mend

biblicit 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

data/README.md +0 -2
data/biblicit.gemspec +1 -1
data/parscit/bin/citeExtract.pl +9 -161
data/parscit/bin/sectExtract.pl +0 -14
data/parscit/lib/ParsCit/Controller.pm +0 -59
data/parscit/lib/ParsCit/PreProcess.pm +0 -4
data/parscit/lib/ParsCit/Tr2crfpp.pm +1 -7
metadata +4 -24
data/parscit/bin/sectLabel/processOmniXML.pl +0 -1427
data/parscit/bin/sectLabel/processOmniXML_new.pl +0 -1025
data/parscit/bin/sectLabel/processOmniXMLv2.pl +0 -1529
data/parscit/bin/sectLabel/processOmniXMLv3.pl +0 -964
data/parscit/bin/sectLabel/simplifyOmniXML.pl +0 -382
data/parscit/bin/xml2train.pl +0 -193
data/parscit/lib/Omni/Config.pm +0 -93
data/parscit/lib/Omni/Omnicell.pm +0 -263
data/parscit/lib/Omni/Omnicol.pm +0 -292
data/parscit/lib/Omni/Omnidd.pm +0 -328
data/parscit/lib/Omni/Omnidoc.pm +0 -153
data/parscit/lib/Omni/Omniframe.pm +0 -223
data/parscit/lib/Omni/Omniline.pm +0 -423
data/parscit/lib/Omni/Omnipage.pm +0 -282
data/parscit/lib/Omni/Omnipara.pm +0 -232
data/parscit/lib/Omni/Omnirun.pm +0 -303
data/parscit/lib/Omni/Omnitable.pm +0 -336
data/parscit/lib/Omni/Omniword.pm +0 -162
data/parscit/lib/Omni/Traversal.pm +0 -313
data/parscit/lib/SectLabel/AAMatching.pm +0 -1949

data/README.md CHANGED Viewed

@@ -119,8 +119,6 @@ More than these might be required; this is what I had to add to my default insta
     sudo cpan install Digest::SHA1
     sudo cpan install String::Approx
-    sudo cpan install XML::Writer::String
-    sudo cpan install XML::Twig
 ## Required to use the ParsCit algorithm

data/biblicit.gemspec CHANGED Viewed

@@ -5,7 +5,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 Gem::Specification.new do |gem|
   gem.name          = "biblicit"
-  gem.version       = "2.0.3"
+  gem.version       = "2.0.4"
   gem.authors       = ["David Judd"]
   gem.email         = ["david@academia.edu"]
   gem.summary       = %q{Extract citations from PDFs.}

data/parscit/bin/citeExtract.pl CHANGED Viewed

@@ -41,10 +41,7 @@ use File::Spec;
 use File::Basename;
 # Local libraries
-use Omni::Omnidoc;
-use Omni::Traversal;
 use ParsCit::Controller;
-use SectLabel::AAMatching;
 # USER customizable section
 my $tmpfile	.= $0;
@@ -195,113 +192,17 @@ if (defined $opt_e && $opt_e ne "")
 }
 my $doc			= undef;
-my $text_file	= undef;
-# Extracting text from Omnipage XML output
-if ($is_xml_input)
-{
-	$text_file	= "/tmp/" . NewTmpFile();
-	my $cmd		= $FindBin::Bin . "/sectLabel/processOmniXMLv2.pl -q -in $in -out $text_file -decode";
-	system($cmd);
-	###
-	# Huydhn: input is xml from Omnipage
-	###
-	if (! open(IN, "<:utf8", $in)) { return (-1, "Could not open xml file " . $in . ": " . $!); }
-	my $xml = do { local $/; <IN> };
-	close IN;
-	###
-	# Huydhn
-	# NOTE: the omnipage xml is not well constructed (concatenated multiple xml files).
-	# This merged xml need to be fixed first before pass it to xml processing libraries, e.g. xml::twig
-	###
-	# Convert to Unix format
-	$xml =~ s/\r//g;
-	# Remove <?xml version="1.0" encoding="UTF-8"?>
-	$xml =~ s/<\?xml.+?>\n//g;
-	# Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
-	$xml =~ s/<\!\-\-XML.+?>\n//g;
-	# Declaration and root
-	$xml = "<?xml version=\"1.0\"?>" . "\n" . "<root>" . "\n" . $xml . "\n" . "</root>";
-	# New document
-	$doc = new Omni::Omnidoc();
-	$doc->set_raw($xml);
-}
-else
-{
-	$text_file	= $in;
-}
+my $text_file	= $in;
 # SECTLABEL
 if (($mode & $SECTLABEL) == $SECTLABEL)
 {
 	my $sect_label_input = $text_file;
-	# Get XML features and append to $text_file
-	if ($is_xml_input)
-	{
-		my $cmd	= $FindBin::Bin . "/sectLabel/processOmniXMLv3.pl -q -in $in -out $text_file.feature -decode";
-		system($cmd);
-		my $address_file = $text_file . ".feature" . ".address";
-		if (! open(IN, "<:utf8", $address_file)) { return (-1, "Could not open address file " . $address_file . ": " . $!); }
-		my @omni_address = ();
-		# Read the address file provided by process OmniXML script
-		while (<IN>)
-		{
-			chomp;
-			# Save and split the line
-			my $line	= $_;
-			my @element	= split(/\s+/, $line);
-			my %addr		= ();
-			# Address
-			$addr{ 'L1' }	= $element[ 0 ];
-			$addr{ 'L2' }	= $element[ 1 ];
-			$addr{ 'L3' }	= $element[ 2 ];
-			$addr{ 'L4' }	= $element[ 3 ];
-			# Save the address
-			push @omni_address, { %addr };
-		}
-		close IN;
-		unlink($address_file);
-		$sect_label_input .= ".feature";
-		my ($sl_xml, $aut_lines, $aff_lines) = SectLabel($sect_label_input, $is_xml_input, 0);
-		# Remove first line <?xml/>
-		$rxml .= RemoveTopLines($sl_xml, 1) . "\n";
-		# Only run author - affiliation if "something" is provided
-		if ($opt_a)
-		{
-			my @aut_addrs = ();
-			my @aff_addrs = ();
-			# Address of author section
-			for my $lindex (@{ $aut_lines }) { push @aut_addrs, $omni_address[ $lindex ]; }
-			# Address of affiliation section
-			for my $lindex (@{ $aff_lines }) { push @aff_addrs, $omni_address[ $lindex ]; }
-			# The tarpit
-			my $aa_xml = SectLabel::AAMatching::AAMatching($doc, \@aut_addrs, \@aff_addrs);
-			# Author-Affiliation Matching result
-			$rxml .= $aa_xml . "\n";
-		}
-		# Remove XML feature file
-		unlink($sect_label_input);
-	}
-	else
-	{
-		my ($sl_xml, $aut_lines, $aff_lines) = SectLabel($sect_label_input, $is_xml_input, 0);
+  my ($sl_xml, $aut_lines, $aff_lines) = SectLabel($sect_label_input, $is_xml_input, 0);
-		# Remove first line <?xml/>
-		$rxml .= RemoveTopLines($sl_xml, 1) . "\n";
-	}
+  # Remove first line <?xml/>
+  $rxml .= RemoveTopLines($sl_xml, 1) . "\n";
 }
 # PARSHED
@@ -318,66 +219,13 @@ if (($mode & $PARSHED) == $PARSHED)
 # PARSCIT
 if (($mode & $PARSCIT) == $PARSCIT)
 {
-	if ($is_xml_input)
-	{
-		my $cmd	= $FindBin::Bin . "/sectLabel/processOmniXMLv3.pl -q -in $in -out $text_file.feature -decode";
-		system($cmd);
-		my $address_file = $text_file . ".feature" . ".address";
-		if (! open(IN, "<:utf8", $address_file)) { return (-1, "Could not open address file " . $address_file . ": " . $!); }
-		my @omni_address = ();
-		# Read the address file provided by process OmniXML script
-		while (<IN>)
-		{
-			chomp;
-			# Save and split the line
-			my $line	= $_;
-			my @element	= split(/\s+/, $line);
-			my %addr		= ();
-			# Address
-			$addr{ 'L1' }	= $element[ 0 ];
-			$addr{ 'L2' }	= $element[ 1 ];
-			$addr{ 'L3' }	= $element[ 2 ];
-			$addr{ 'L4' }	= $element[ 3 ];
-			# Save the address
-			push @omni_address, { %addr };
-		}
-		close IN;
-		unlink($address_file);
-		my $sect_label_input = $text_file . ".feature";
-		# Output of sectlabel becomes input for parscit
-		my ($all_text, $cit_lines) = SectLabel($sect_label_input, $is_xml_input, 1);
-		# Remove XML feature file
-		unlink($sect_label_input);
-		my @cit_addrs = ();
-		# Address of reference section
-		for my $lindex (@{ $cit_lines }) { push @cit_addrs, $omni_address[ $lindex ]; }
-		my $pc_xml = undef;
-		# Huydhn: add xml features to parscit in case of unmarked reference
-		$pc_xml = ParsCit::Controller::ExtractCitations2(\$all_text, $cit_lines, $is_xml_input, $doc, \@cit_addrs);
-		# Remove first line <?xml/>
-		$rxml .= RemoveTopLines($$pc_xml, 1) . "\n";
+  my $pc_xml = ParsCit::Controller::ExtractCitations($text_file, $in, $is_xml_input);
-		# Thang v100901: call to BiblioScript
-		if (scalar(@export_types) != 0) { BiblioScript(\@export_types, $$pc_xml, $out); }
-	}
-	else
-	{
-		my $pc_xml = ParsCit::Controller::ExtractCitations($text_file, $in, $is_xml_input);
-		# Remove first line <?xml/>
-		$rxml .= RemoveTopLines($$pc_xml, 1) . "\n";
+  # Remove first line <?xml/>
+  $rxml .= RemoveTopLines($$pc_xml, 1) . "\n";
-		# Thang v100901: call to BiblioScript
-		if (scalar(@export_types) != 0) { BiblioScript(\@export_types, $$pc_xml, $out); }
-	}
+  # Thang v100901: call to BiblioScript
+  if (scalar(@export_types) != 0) { BiblioScript(\@export_types, $$pc_xml, $out); }
 }
 $rxml .= "</algorithms>";

data/parscit/bin/sectExtract.pl CHANGED Viewed

@@ -82,16 +82,6 @@ $modelFile = "$path/../$modelFile";
 my $configFile = $isXmlInput ? $SectLabel::Config::configXmlFile : $SectLabel::Config::configFile;
 $configFile = "$path/../$configFile";
-if($isXmlInput){
-  my $xmlInFile = newTmpFile();
-  $xmlInFile = untaintPath($xmlInFile);
-  my $cmd = "$path/sectLabel/";
-  $cmd .= ($isNew) ? "processOmniXMLv2.pl" : "processOmniXML.pl";
-  $cmd .= " -in $inFile -out $xmlInFile -xmlFeature -decode";
-  execute($cmd);
-  $inFile = $xmlInFile;
-}
 my $dictFile = $SectLabel::Config::dictFile;
 $dictFile = "$path/../$dictFile";
@@ -99,10 +89,6 @@ my $funcFile = $SectLabel::Config::funcFile;
 $funcFile = "$path/../$funcFile";
 my $rXML = SectLabel::Controller::extractSection($inFile, $isXmlOutput, $modelFile, $dictFile, $funcFile, $configFile, $isXmlInput, $isDebug);
-if($isXmlInput){
-  unlink($inFile);
-}
 if (defined $outFile) {
   $outFile = untaintPath($outFile);

data/parscit/lib/ParsCit/Controller.pm CHANGED Viewed

@@ -21,8 +21,6 @@ use ParsCit::Tr2crfpp;
 use ParsCit::PreProcess;
 use ParsCit::PostProcess;
 use ParsCit::CitationContext;
-# Omnipage libraries
-use Omni::Omnidoc;
 # Dependencies
 use CSXUtil::SafeText qw(cleanXML);
@@ -228,62 +226,6 @@ sub ExtractCitationsImpl
 	# Reference to an array of single reference
 	my $rraw_citations = undef;
-	# Find and separate reference
-	if ($is_xml)
-	{
-		###
-		# Huydhn: input is xml from Omnipage
-		###
-		if (! open(IN, "<:utf8", $orgfile)) { return (-1, "Could not open xml file " . $orgfile . ": " . $!); }
-		my $xml = do { local $/; <IN> };
-		close IN;
-		###
-		# Huydhn
-		# NOTE: the omnipage xml is not well constructed (concatenated multiple xml files).
-		# This merged xml need to be fixed first before pass it to xml processing libraries, e.g. xml::twig
-		###
-		# Convert to Unix format
-		$xml =~ s/\r//g;
-		# Remove <?xml version="1.0" encoding="UTF-8"?>
-		$xml =~ s/<\?xml.+?>\n//g;
-		# Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
-		$xml =~ s/<\!\-\-XML.+?>\n//g;
-		# Declaration and root
-		$xml = "<?xml version=\"1.0\"?>" . "\n" . "<root>" . "\n" . $xml . "\n" . "</root>";
-		# New document
-		my $doc = new Omni::Omnidoc();
-		$doc->set_raw($xml);
-		# Extract the reference portion from the XML
-		my ($start_ref, $end_ref, $rcite_text_from_xml, $rcit_addrs) = ParsCit::PreProcess::FindCitationTextXML($doc);
-		# Extract the reference portion from the text.
-		# TODO: NEED TO BE REMOVED FROM HERE
-		my $content = $doc->get_content();
-		($rcite_text, $rnorm_body_text, $rbody_text) = ParsCit::PreProcess::FindCitationText(\$content, \@pos_array);
-		my @norm_body_tokens	= split(/\s+/, $$rnorm_body_text);
-    	my @body_tokens			= split(/\s+/, $$rbody_text);
-		my $size	= scalar(@norm_body_tokens);
-    	my $size1	= scalar(@pos_array);
-	    if($size != $size1) { die "ParsCit::Controller::extractCitationsImpl: normBodyText size $size != posArray size $size1\n"; }
-		# TODO: TO HERE
-		# Filename initialization
-    	if ($bwrite_split > 0) { ($citefile, $bodyfile) = WriteSplit($textfile, $rcite_text_from_xml, $rbody_text); }
-		# Prepare to split unmarked reference portion
-		my $tmp_file = ParsCit::Tr2crfpp::PrepDataUnmarked($doc, $rcit_addrs);
-		# Extract citations from citation text
-	    $rraw_citations	= ParsCit::PreProcess::SegmentCitationsXML($rcite_text_from_xml, $tmp_file);
-	}
-	else
-	{
 		if (! open(IN, "<:utf8", $textfile)) { return (-1, "Could not open text file " . $textfile . ": " . $!); }
 		my $text = do { local $/; <IN> };
 		close IN;
@@ -309,7 +251,6 @@ sub ExtractCitationsImpl
 		# Extract citations from citation text
 	    $rraw_citations	= ParsCit::PreProcess::SegmentCitations($rcite_text);
-	}
 	my @citations		= ();
     my @valid_citations	= ();

data/parscit/lib/ParsCit/PreProcess.pm CHANGED Viewed

@@ -11,7 +11,6 @@ package ParsCit::PreProcess;
 use utf8;
 use strict;
-use Omni::Config;
 use ParsCit::Citation;
 my %marker_types =	(	'SQUARE'		=> '\\[.+?\\]',
@@ -22,9 +21,6 @@ my %marker_types =	(	'SQUARE'		=> '\\[.+?\\]',
 						#'NAKEDNUMDOT'	=> '\\d{1,3}\\.'	# Modified by Artemy Kolchinsky (v090625)
 					);
-# Omnilib configuration: object name
-my $obj_list = $Omni::Config::obj_list;
 ###
 # Huydhn: similar to findCitationText, find the citation portion using regular expression.
 # However the input is an omnipage xml document object, not the raw text

data/parscit/lib/ParsCit/Tr2crfpp.pm CHANGED Viewed

@@ -15,7 +15,6 @@ use strict 'vars';
 use FindBin;
 use Encode ();
-use Omni::Config;
 use ParsCit::Config;
 ### USER customizable section
@@ -38,8 +37,6 @@ $split_model_file		= "$FindBin::Bin/../$split_model_file";
 # Huydhn: don't know its function
 ###
 my %dict 	 = ();
-# Omnilib configuration: object name
-my $obj_list = $Omni::Config::obj_list;
 ###
 # Huydhn: prepare data for trfpp, segmenting unmarked reference
@@ -124,7 +121,7 @@ sub PrepDataUnmarked
 						# Trim line
 						$ln	=~ s/^\s+|\s+$//g;
 						# Skip blank lines
-						if (($ln =~ m/^\s*$/) || ($lines->[ $t ]->get_name() ne $obj_list->{ 'OMNILINE' }))
+						if (($ln =~ m/^\s*$/))
 						{
 							$addr_index++;
 							next;
@@ -355,7 +352,6 @@ sub PrepDataUnmarked
 					# XML features
 					# Bullet
 					my $bullet = undef;
-					if ($lines->[ $t ]->get_name() eq $obj_list->{ 'OMNILINE' }) { $bullet = $lines->[ $t ]->get_bullet(); }
 					if ((defined $bullet) && ($bullet eq 'true'))
 					{
 						push @feats, 'xmlBullet_yes';
@@ -368,7 +364,6 @@ sub PrepDataUnmarked
 					# First word format: bold, italic, font size
 					my $xml_runs = undef;
-					if (($lines->[ $t ]->get_name() eq $obj_list->{ 'OMNILINE' })) { $xml_runs = $lines->[ $t ]->get_objs_ref(); }
 					# First word format: bold
 					my $bold = undef;
@@ -415,7 +410,6 @@ sub PrepDataUnmarked
 					# First word format: starting point, left alignment
 					my $start_point = undef;
-					if (($lines->[ $t ]->get_name() eq $obj_list->{ 'OMNILINE' })) { $start_point = $lines->[ $t ]->get_left_pos(); }
 					if ((defined $start_point) && ($start_point > $avg_start_point * $start_upper_ratio))
 					{
 						push @feats, 'xmlBeginLine_right';

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: biblicit
 version: !ruby/object:Gem::Version
-  version: 2.0.3
+  version: 2.0.4
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-02-28 00:00:00.000000000 Z
+date: 2013-03-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activesupport
@@ -402,30 +402,11 @@ files:
 - parscit/bin/sectLabel/genericSect/extractFeature.rb
 - parscit/bin/sectLabel/genericSectExtract.rb
 - parscit/bin/sectLabel/getStructureInfo.pl
-- parscit/bin/sectLabel/processOmniXML.pl
-- parscit/bin/sectLabel/processOmniXML_new.pl
-- parscit/bin/sectLabel/processOmniXMLv2.pl
-- parscit/bin/sectLabel/processOmniXMLv3.pl
 - parscit/bin/sectLabel/redo.sectLabel.pl
-- parscit/bin/sectLabel/simplifyOmniXML.pl
 - parscit/bin/sectLabel/single2multi.pl
 - parscit/bin/sectLabel/tr2crfpp.pl
 - parscit/bin/tr2crfpp.pl
-- parscit/bin/xml2train.pl
 - parscit/lib/CSXUtil/SafeText.pm
-- parscit/lib/Omni/Config.pm
-- parscit/lib/Omni/Omnicell.pm
-- parscit/lib/Omni/Omnicol.pm
-- parscit/lib/Omni/Omnidd.pm
-- parscit/lib/Omni/Omnidoc.pm
-- parscit/lib/Omni/Omniframe.pm
-- parscit/lib/Omni/Omniline.pm
-- parscit/lib/Omni/Omnipage.pm
-- parscit/lib/Omni/Omnipara.pm
-- parscit/lib/Omni/Omnirun.pm
-- parscit/lib/Omni/Omnitable.pm
-- parscit/lib/Omni/Omniword.pm
-- parscit/lib/Omni/Traversal.pm
 - parscit/lib/ParsCit/.PostProcess.pm.swp
 - parscit/lib/ParsCit/Citation.pm
 - parscit/lib/ParsCit/CitationContext.pm
@@ -439,7 +420,6 @@ files:
 - parscit/lib/ParsHed/PostProcess.pm
 - parscit/lib/ParsHed/Tr2crfpp.pm
 - parscit/lib/ParsHed/Tr2crfpp_token.pm
-- parscit/lib/SectLabel/AAMatching.pm
 - parscit/lib/SectLabel/Config.pm
 - parscit/lib/SectLabel/Controller.pm
 - parscit/lib/SectLabel/PostProcess.pm
@@ -473,7 +453,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: 2279876521491945710
+      hash: -2794280872000100021
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
@@ -482,7 +462,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: 2279876521491945710
+      hash: -2794280872000100021
 requirements:
 - For PDFs, Poppler or XPDF (try "which pdftotext")
 - For Postscript files, Ghostscript (try "which ps2ascii")