RubyGems - biblicit - Versions diffs - 2.0.3 → 2.0.4 - Mend

biblicit 2.0.3 → 2.0.4

Files changed (28)

data/README.md +0 -2
data/biblicit.gemspec +1 -1
data/parscit/bin/citeExtract.pl +9 -161
data/parscit/bin/sectExtract.pl +0 -14
data/parscit/lib/ParsCit/Controller.pm +0 -59
data/parscit/lib/ParsCit/PreProcess.pm +0 -4
data/parscit/lib/ParsCit/Tr2crfpp.pm +1 -7
metadata +4 -24
data/parscit/bin/sectLabel/processOmniXML.pl +0 -1427
data/parscit/bin/sectLabel/processOmniXML_new.pl +0 -1025
data/parscit/bin/sectLabel/processOmniXMLv2.pl +0 -1529
data/parscit/bin/sectLabel/processOmniXMLv3.pl +0 -964
data/parscit/bin/sectLabel/simplifyOmniXML.pl +0 -382
data/parscit/bin/xml2train.pl +0 -193
data/parscit/lib/Omni/Config.pm +0 -93
data/parscit/lib/Omni/Omnicell.pm +0 -263
data/parscit/lib/Omni/Omnicol.pm +0 -292
data/parscit/lib/Omni/Omnidd.pm +0 -328
data/parscit/lib/Omni/Omnidoc.pm +0 -153
data/parscit/lib/Omni/Omniframe.pm +0 -223
data/parscit/lib/Omni/Omniline.pm +0 -423
data/parscit/lib/Omni/Omnipage.pm +0 -282
data/parscit/lib/Omni/Omnipara.pm +0 -232
data/parscit/lib/Omni/Omnirun.pm +0 -303
data/parscit/lib/Omni/Omnitable.pm +0 -336
data/parscit/lib/Omni/Omniword.pm +0 -162
data/parscit/lib/Omni/Traversal.pm +0 -313
data/parscit/lib/SectLabel/AAMatching.pm +0 -1949

data/README.md CHANGED Viewed

@@ -119,8 +119,6 @@ More than these might be required; this is what I had to add to my default insta
     sudo cpan install Digest::SHA1
     sudo cpan install String::Approx
-    sudo cpan install XML::Writer::String
-    sudo cpan install XML::Twig
 ## Required to use the ParsCit algorithm

data/biblicit.gemspec CHANGED Viewed

@@ -5,7 +5,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 Gem::Specification.new do |gem|
   gem.name          = "biblicit"
-  gem.version       = "2.0.3"
+  gem.version       = "2.0.4"
   gem.authors       = ["David Judd"]
   gem.email         = ["david@academia.edu"]
   gem.summary       = %q{Extract citations from PDFs.}

data/parscit/bin/citeExtract.pl CHANGED Viewed

@@ -41,10 +41,7 @@ use File::Spec;
 use File::Basename;
 # Local libraries
-use Omni::Omnidoc;
-use Omni::Traversal;
 use ParsCit::Controller;
-use SectLabel::AAMatching;
 # USER customizable section
 my $tmpfile	.= $0;
@@ -195,113 +192,17 @@ if (defined $opt_e && $opt_e ne "")
 }
 my $doc			= undef;
-my $text_file	= undef;
-# Extracting text from Omnipage XML output
-if ($is_xml_input)
-{
-	$text_file	= "/tmp/" . NewTmpFile();
-	my $cmd		= $FindBin::Bin . "/sectLabel/processOmniXMLv2.pl -q -in $in -out $text_file -decode";
-	system($cmd);
-	###
-	# Huydhn: input is xml from Omnipage
-	###
-	if (! open(IN, "<:utf8", $in)) { return (-1, "Could not open xml file " . $in . ": " . $!); }
-	my $xml = do { local $/; <IN> };
-	close IN;
-	###
-	# Huydhn
-	# NOTE: the omnipage xml is not well constructed (concatenated multiple xml files).
-	# This merged xml need to be fixed first before pass it to xml processing libraries, e.g. xml::twig
-	###
-	# Convert to Unix format
-	$xml =~ s/\r//g;
-	# Remove <?xml version="1.0" encoding="UTF-8"?>
-	$xml =~ s/<\?xml.+?>\n//g;
-	# Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
-	$xml =~ s/<\!\-\-XML.+?>\n//g;
-	# Declaration and root
-	$xml = "<?xml version=\"1.0\"?>" . "\n" . "<root>" . "\n" . $xml . "\n" . "</root>";
-	# New document
-	$doc = new Omni::Omnidoc();
-	$doc->set_raw($xml);
-}
-else
-{
-	$text_file	= $in;
-}
+my $text_file	= $in;
 # SECTLABEL
 if (($mode & $SECTLABEL) == $SECTLABEL)
 {
 	my $sect_label_input = $text_file;
-	# Get XML features and append to $text_file
-	if ($is_xml_input)
-	{
-		my $cmd	= $FindBin::Bin . "/sectLabel/processOmniXMLv3.pl -q -in $in -out $text_file.feature -decode";
-		system($cmd);
-		my $address_file = $text_file . ".feature" . ".address";
-		if (! open(IN, "<:utf8", $address_file)) { return (-1, "Could not open address file " . $address_file . ": " . $!); }
-		my @omni_address = ();
-		# Read the address file provided by process OmniXML script
-		while (<IN>)
-		{
-			chomp;
-			# Save and split the line
-			my $line	= $_;
-			my @element	= split(/\s+/, $line);
-			my %addr		= ();
-			# Address
-			$addr{ 'L1' }	= $element[ 0 ];
-			$addr{ 'L2' }	= $element[ 1 ];
-			$addr{ 'L3' }	= $element[ 2 ];
-			$addr{ 'L4' }	= $element[ 3 ];
-			# Save the address
-			push @omni_address, { %addr };
-		}
-		close IN;
-		unlink($address_file);
-		$sect_label_input .= ".feature";
-		my ($sl_xml, $aut_lines, $aff_lines) = SectLabel($sect_label_input, $is_xml_input, 0);
-		# Remove first line <?xml/>
-		$rxml .= RemoveTopLines($sl_xml, 1) . "\n";
-		# Only run author - affiliation if "something" is provided
-		if ($opt_a)
-		{
-			my @aut_addrs = ();
-			my @aff_addrs = ();
-			# Address of author section
-			for my $lindex (@{ $aut_lines }) { push @aut_addrs, $omni_address[ $lindex ]; }
-			# Address of affiliation section
-			for my $lindex (@{ $aff_lines }) { push @aff_addrs, $omni_address[ $lindex ]; }
-			# The tarpit
-			my $aa_xml = SectLabel::AAMatching::AAMatching($doc, \@aut_addrs, \@aff_addrs);
-			# Author-Affiliation Matching result
-			$rxml .= $aa_xml . "\n";
-		}
-		# Remove XML feature file
-		unlink($sect_label_input);
-	}
-	else
-	{
-		my ($sl_xml, $aut_lines, $aff_lines) = SectLabel($sect_label_input, $is_xml_input, 0);
+  my ($sl_xml, $aut_lines, $aff_lines) = SectLabel($sect_label_input, $is_xml_input, 0);
-		# Remove first line <?xml/>
-		$rxml .= RemoveTopLines($sl_xml, 1) . "\n";
-	}
+  # Remove first line <?xml/>
+  $rxml .= RemoveTopLines($sl_xml, 1) . "\n";
 }
 # PARSHED
@@ -318,66 +219,13 @@ if (($mode & $PARSHED) == $PARSHED)
 # PARSCIT
 if (($mode & $PARSCIT) == $PARSCIT)
 {
-	if ($is_xml_input)
-	{
-		my $cmd	= $FindBin::Bin . "/sectLabel/processOmniXMLv3.pl -q -in $in -out $text_file.feature -decode";
-		system($cmd);
-		my $address_file = $text_file . ".feature" . ".address";
-		if (! open(IN, "<:utf8", $address_file)) { return (-1, "Could not open address file " . $address_file . ": " . $!); }
-		my @omni_address = ();
-		# Read the address file provided by process OmniXML script
-		while (<IN>)
-		{
-			chomp;
-			# Save and split the line
-			my $line	= $_;
-			my @element	= split(/\s+/, $line);
-			my %addr		= ();
-			# Address
-			$addr{ 'L1' }	= $element[ 0 ];
-			$addr{ 'L2' }	= $element[ 1 ];
-			$addr{ 'L3' }	= $element[ 2 ];
-			$addr{ 'L4' }	= $element[ 3 ];
-			# Save the address
-			push @omni_address, { %addr };
-		}
-		close IN;
-		unlink($address_file);
-		my $sect_label_input = $text_file . ".feature";
-		# Output of sectlabel becomes input for parscit
-		my ($all_text, $cit_lines) = SectLabel($sect_label_input, $is_xml_input, 1);
-		# Remove XML feature file
-		unlink($sect_label_input);
-		my @cit_addrs = ();
-		# Address of reference section
-		for my $lindex (@{ $cit_lines }) { push @cit_addrs, $omni_address[ $lindex ]; }
-		my $pc_xml = undef;
-		# Huydhn: add xml features to parscit in case of unmarked reference
-		$pc_xml = ParsCit::Controller::ExtractCitations2(\$all_text, $cit_lines, $is_xml_input, $doc, \@cit_addrs);
-		# Remove first line <?xml/>
-		$rxml .= RemoveTopLines($$pc_xml, 1) . "\n";
+  my $pc_xml = ParsCit::Controller::ExtractCitations($text_file, $in, $is_xml_input);
-		# Thang v100901: call to BiblioScript
-		if (scalar(@export_types) != 0) { BiblioScript(\@export_types, $$pc_xml, $out); }
-	}
-	else
-	{
-		my $pc_xml = ParsCit::Controller::ExtractCitations($text_file, $in, $is_xml_input);
-		# Remove first line <?xml/>
-		$rxml .= RemoveTopLines($$pc_xml, 1) . "\n";
+  # Remove first line <?xml/>
+  $rxml .= RemoveTopLines($$pc_xml, 1) . "\n";
-		# Thang v100901: call to BiblioScript
-		if (scalar(@export_types) != 0) { BiblioScript(\@export_types, $$pc_xml, $out); }
-	}
+  # Thang v100901: call to BiblioScript
+  if (scalar(@export_types) != 0) { BiblioScript(\@export_types, $$pc_xml, $out); }
 }
 $rxml .= "</algorithms>";

data/parscit/bin/sectExtract.pl CHANGED Viewed

@@ -82,16 +82,6 @@ $modelFile = "$path/../$modelFile";
 my $configFile = $isXmlInput ? $SectLabel::Config::configXmlFile : $SectLabel::Config::configFile;
 $configFile = "$path/../$configFile";
-if($isXmlInput){
-  my $xmlInFile = newTmpFile();
-  $xmlInFile = untaintPath($xmlInFile);
-  my $cmd = "$path/sectLabel/";
-  $cmd .= ($isNew) ? "processOmniXMLv2.pl" : "processOmniXML.pl";
-  $cmd .= " -in $inFile -out $xmlInFile -xmlFeature -decode";
-  execute($cmd);
-  $inFile = $xmlInFile;
-}
 my $dictFile = $SectLabel::Config::dictFile;
 $dictFile = "$path/../$dictFile";
@@ -99,10 +89,6 @@ my $funcFile = $SectLabel::Config::funcFile;
 $funcFile = "$path/../$funcFile";
 my $rXML = SectLabel::Controller::extractSection($inFile, $isXmlOutput, $modelFile, $dictFile, $funcFile, $configFile, $isXmlInput, $isDebug);
-if($isXmlInput){
-  unlink($inFile);
-}
 if (defined $outFile) {
   $outFile = untaintPath($outFile);

data/parscit/lib/ParsCit/Controller.pm CHANGED Viewed

@@ -21,8 +21,6 @@ use ParsCit::Tr2crfpp;
 use ParsCit::PreProcess;
 use ParsCit::PostProcess;
 use ParsCit::CitationContext;
-# Omnipage libraries
-use Omni::Omnidoc;
 # Dependencies
 use CSXUtil::SafeText qw(cleanXML);
@@ -228,62 +226,6 @@ sub ExtractCitationsImpl
 	# Reference to an array of single reference
 	my $rraw_citations = undef;
-	# Find and separate reference
-	if ($is_xml)
-	{
-		###
-		# Huydhn: input is xml from Omnipage
-		###
-		if (! open(IN, "<:utf8", $orgfile)) { return (-1, "Could not open xml file " . $orgfile . ": " . $!); }
-		my $xml = do { local $/; <IN> };
-		close IN;
-		###
-		# Huydhn
-		# NOTE: the omnipage xml is not well constructed (concatenated multiple xml files).
-		# This merged xml need to be fixed first before pass it to xml processing libraries, e.g. xml::twig
-		###
-		# Convert to Unix format
-		$xml =~ s/\r//g;
-		# Remove <?xml version="1.0" encoding="UTF-8"?>
-		$xml =~ s/<\?xml.+?>\n//g;
-		# Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
-		$xml =~ s/<\!\-\-XML.+?>\n//g;
-		# Declaration and root
-		$xml = "<?xml version=\"1.0\"?>" . "\n" . "<root>" . "\n" . $xml . "\n" . "</root>";
-		# New document
-		my $doc = new Omni::Omnidoc();
-		$doc->set_raw($xml);
-		# Extract the reference portion from the XML
-		my ($start_ref, $end_ref, $rcite_text_from_xml, $rcit_addrs) = ParsCit::PreProcess::FindCitationTextXML($doc);
-		# Extract the reference portion from the text.
-		# TODO: NEED TO BE REMOVED FROM HERE
-		my $content = $doc->get_content();
-		($rcite_text, $rnorm_body_text, $rbody_text) = ParsCit::PreProcess::FindCitationText(\$content, \@pos_array);
-		my @norm_body_tokens	= split(/\s+/, $$rnorm_body_text);
-    	my @body_tokens			= split(/\s+/, $$rbody_text);
-		my $size	= scalar(@norm_body_tokens);
-    	my $size1	= scalar(@pos_array);
-	    if($size != $size1) { die "ParsCit::Controller::extractCitationsImpl: normBodyText size $size != posArray size $size1\n"; }
-		# TODO: TO HERE
-		# Filename initialization
-    	if ($bwrite_split > 0) { ($citefile, $bodyfile) = WriteSplit($textfile, $rcite_text_from_xml, $rbody_text); }
-		# Prepare to split unmarked reference portion
-		my $tmp_file = ParsCit::Tr2crfpp::PrepDataUnmarked($doc, $rcit_addrs);
-		# Extract citations from citation text
-	    $rraw_citations	= ParsCit::PreProcess::SegmentCitationsXML($rcite_text_from_xml, $tmp_file);
-	}
-	else
-	{
 		if (! open(IN, "<:utf8", $textfile)) { return (-1, "Could not open text file " . $textfile . ": " . $!); }
 		my $text = do { local $/; <IN> };
 		close IN;
@@ -309,7 +251,6 @@ sub ExtractCitationsImpl
 		# Extract citations from citation text
 	    $rraw_citations	= ParsCit::PreProcess::SegmentCitations($rcite_text);
-	}
 	my @citations		= ();
     my @valid_citations	= ();

data/parscit/lib/ParsCit/PreProcess.pm CHANGED Viewed

@@ -11,7 +11,6 @@ package ParsCit::PreProcess;
 use utf8;
 use strict;
-use Omni::Config;
 use ParsCit::Citation;
 my %marker_types =	(	'SQUARE'		=> '\\[.+?\\]',
@@ -22,9 +21,6 @@ my %marker_types =	(	'SQUARE'		=> '\\[.+?\\]',
 						#'NAKEDNUMDOT'	=> '\\d{1,3}\\.'	# Modified by Artemy Kolchinsky (v090625)
 					);
-# Omnilib configuration: object name
-my $obj_list = $Omni::Config::obj_list;
 ###
 # Huydhn: similar to findCitationText, find the citation portion using regular expression.
 # However the input is an omnipage xml document object, not the raw text

data/parscit/lib/ParsCit/Tr2crfpp.pm CHANGED Viewed

@@ -15,7 +15,6 @@ use strict 'vars';
 use FindBin;
 use Encode ();
-use Omni::Config;
 use ParsCit::Config;
 ### USER customizable section
@@ -38,8 +37,6 @@ $split_model_file		= "$FindBin::Bin/../$split_model_file";
 # Huydhn: don't know its function
 ###
 my %dict 	 = ();
-# Omnilib configuration: object name
-my $obj_list = $Omni::Config::obj_list;
 ###
 # Huydhn: prepare data for trfpp, segmenting unmarked reference
@@ -124,7 +121,7 @@ sub PrepDataUnmarked
 						# Trim line
 						$ln	=~ s/^\s+|\s+$//g;
 						# Skip blank lines
-						if (($ln =~ m/^\s*$/) || ($lines->[ $t ]->get_name() ne $obj_list->{ 'OMNILINE' }))
+						if (($ln =~ m/^\s*$/))
 						{
 							$addr_index++;
 							next;
@@ -355,7 +352,6 @@ sub PrepDataUnmarked
 					# XML features
 					# Bullet
 					my $bullet = undef;
-					if ($lines->[ $t ]->get_name() eq $obj_list->{ 'OMNILINE' }) { $bullet = $lines->[ $t ]->get_bullet(); }
 					if ((defined $bullet) && ($bullet eq 'true'))
 					{
 						push @feats, 'xmlBullet_yes';
@@ -368,7 +364,6 @@ sub PrepDataUnmarked
 					# First word format: bold, italic, font size
 					my $xml_runs = undef;
-					if (($lines->[ $t ]->get_name() eq $obj_list->{ 'OMNILINE' })) { $xml_runs = $lines->[ $t ]->get_objs_ref(); }
 					# First word format: bold
 					my $bold = undef;
@@ -415,7 +410,6 @@ sub PrepDataUnmarked
 					# First word format: starting point, left alignment
 					my $start_point = undef;
-					if (($lines->[ $t ]->get_name() eq $obj_list->{ 'OMNILINE' })) { $start_point = $lines->[ $t ]->get_left_pos(); }
 					if ((defined $start_point) && ($start_point > $avg_start_point * $start_upper_ratio))
 					{
 						push @feats, 'xmlBeginLine_right';

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: biblicit
 version: !ruby/object:Gem::Version
-  version: 2.0.3
+  version: 2.0.4
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-02-28 00:00:00.000000000 Z
+date: 2013-03-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activesupport
@@ -402,30 +402,11 @@ files:
 - parscit/bin/sectLabel/genericSect/extractFeature.rb
 - parscit/bin/sectLabel/genericSectExtract.rb
 - parscit/bin/sectLabel/getStructureInfo.pl
-- parscit/bin/sectLabel/processOmniXML.pl
-- parscit/bin/sectLabel/processOmniXML_new.pl
-- parscit/bin/sectLabel/processOmniXMLv2.pl
-- parscit/bin/sectLabel/processOmniXMLv3.pl
 - parscit/bin/sectLabel/redo.sectLabel.pl
-- parscit/bin/sectLabel/simplifyOmniXML.pl
 - parscit/bin/sectLabel/single2multi.pl
 - parscit/bin/sectLabel/tr2crfpp.pl
 - parscit/bin/tr2crfpp.pl
-- parscit/bin/xml2train.pl
 - parscit/lib/CSXUtil/SafeText.pm
-- parscit/lib/Omni/Config.pm
-- parscit/lib/Omni/Omnicell.pm
-- parscit/lib/Omni/Omnicol.pm
-- parscit/lib/Omni/Omnidd.pm
-- parscit/lib/Omni/Omnidoc.pm
-- parscit/lib/Omni/Omniframe.pm
-- parscit/lib/Omni/Omniline.pm
-- parscit/lib/Omni/Omnipage.pm
-- parscit/lib/Omni/Omnipara.pm
-- parscit/lib/Omni/Omnirun.pm
-- parscit/lib/Omni/Omnitable.pm
-- parscit/lib/Omni/Omniword.pm
-- parscit/lib/Omni/Traversal.pm
 - parscit/lib/ParsCit/.PostProcess.pm.swp
 - parscit/lib/ParsCit/Citation.pm
 - parscit/lib/ParsCit/CitationContext.pm
@@ -439,7 +420,6 @@ files:
 - parscit/lib/ParsHed/PostProcess.pm
 - parscit/lib/ParsHed/Tr2crfpp.pm
 - parscit/lib/ParsHed/Tr2crfpp_token.pm
-- parscit/lib/SectLabel/AAMatching.pm
 - parscit/lib/SectLabel/Config.pm
 - parscit/lib/SectLabel/Controller.pm
 - parscit/lib/SectLabel/PostProcess.pm
@@ -473,7 +453,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: 2279876521491945710
+      hash: -2794280872000100021
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
@@ -482,7 +462,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: 2279876521491945710
+      hash: -2794280872000100021
 requirements:
 - For PDFs, Poppler or XPDF (try "which pdftotext")
 - For Postscript files, Ghostscript (try "which ps2ascii")