RubyGems - biblicit - Versions diffs - 2.0.3 → 2.0.4 - Mend

biblicit 2.0.3 → 2.0.4

Files changed (28) hide show

data/README.md +0 -2
data/biblicit.gemspec +1 -1
data/parscit/bin/citeExtract.pl +9 -161
data/parscit/bin/sectExtract.pl +0 -14
data/parscit/lib/ParsCit/Controller.pm +0 -59
data/parscit/lib/ParsCit/PreProcess.pm +0 -4
data/parscit/lib/ParsCit/Tr2crfpp.pm +1 -7
metadata +4 -24
data/parscit/bin/sectLabel/processOmniXML.pl +0 -1427
data/parscit/bin/sectLabel/processOmniXML_new.pl +0 -1025
data/parscit/bin/sectLabel/processOmniXMLv2.pl +0 -1529
data/parscit/bin/sectLabel/processOmniXMLv3.pl +0 -964
data/parscit/bin/sectLabel/simplifyOmniXML.pl +0 -382
data/parscit/bin/xml2train.pl +0 -193
data/parscit/lib/Omni/Config.pm +0 -93
data/parscit/lib/Omni/Omnicell.pm +0 -263
data/parscit/lib/Omni/Omnicol.pm +0 -292
data/parscit/lib/Omni/Omnidd.pm +0 -328
data/parscit/lib/Omni/Omnidoc.pm +0 -153
data/parscit/lib/Omni/Omniframe.pm +0 -223
data/parscit/lib/Omni/Omniline.pm +0 -423
data/parscit/lib/Omni/Omnipage.pm +0 -282
data/parscit/lib/Omni/Omnipara.pm +0 -232
data/parscit/lib/Omni/Omnirun.pm +0 -303
data/parscit/lib/Omni/Omnitable.pm +0 -336
data/parscit/lib/Omni/Omniword.pm +0 -162
data/parscit/lib/Omni/Traversal.pm +0 -313
data/parscit/lib/SectLabel/AAMatching.pm +0 -1949

data/parscit/bin/sectLabel/simplifyOmniXML.pl DELETED Viewed

@@ -1,382 +0,0 @@
-#!/usr/bin/perl -wT
-# Author: Luong Minh Thang <luongmin@comp.nus.edu.sg>, generated at Tue, 02 Jun 2009 01:30:42
-# Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
-require 5.0;
-use strict;
-use Getopt::Long;
-use HTML::Entities;
-# I do not know a better solution to find a lib path in -T mode.
-# So if you know a better solution, I'd be glad to hear.
-# See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
-use FindBin;
-FindBin::again(); # to get correct path in case 2 scripts in different directories use FindBin
-my $path;
-BEGIN {
-  if ($FindBin::Bin =~ /(.*)/) {
-    $path = $1;
-  }
-}
-use lib "$path/../../lib";
-use SectLabel::PreProcess;
-### USER customizable section
-$0 =~ /([^\/]+)$/; my $progname = $1;
-my $outputVersion = "1.0";
-### END user customizable section
-# Write the copyright notice to STDERR.
-# Returns whatever print returns.
-sub License {
-  my $notice = "# Copyright 2009 \251 by Luong Minh Thang\n";
-  return print STDERR $notice;
-}
-### HELP Sub-procedure
-# Print the usage message and the option summary to STDERR.
-# Takes no arguments and does not exit; the caller decides what happens next.
-# Every option accepted by the GetOptions call in this script is listed.
-sub Help {
-  print STDERR "Process Omnipage XML output (concatenated results from all pages of a PDF file), and extract necessary information. Marking in the output detailed word-level info ### Page\\n## Para\\n# Line\\nword\\n### Table\\n### Figure\n";
-  print STDERR "usage: $progname -h\t[invokes help]\n";
-  print STDERR "       $progname -in xmlFile -out outFile [-decode -allowEmptyLine -log]\n";
-  print STDERR "Options:\n";
-  print STDERR "\t-q\tQuiet Mode (don't echo license)\n";
-  print STDERR "\t-in xmlFile: Omnipage XML file to read\n";
-  print STDERR "\t-out outFile: file that receives the extracted markup\n";
-  print STDERR "\t-decode: decode HTML entities and then output, to avoid double entity encoding later\n";
-  print STDERR "\t-allowEmptyLine: keep lines that contain only whitespace\n";
-  print STDERR "\t-log: print progress and warning messages to STDERR\n";
-}
-# Command-line state, filled in by GetOptions below.
-my $QUIET = 0;
-my $HELP = 0;
-my $outFile = undef;
-my $inFile = undef;
-my $isDecode = 0;       # -decode: decode HTML entities before writing
-my $isAllowEmpty = 0;   # -allowEmptyLine: read by processPara
-my $isDebug = 0;        # -log: read by processPara and execute
-$HELP = 1 unless GetOptions('in=s' => \$inFile,
-                            'out=s' => \$outFile,
-                            'decode' => \$isDecode,
-                            'allowEmptyLine' => \$isAllowEmpty,
-                            'log' => \$isDebug,
-                            'h' => \$HELP,
-                            'q' => \$QUIET);
-# Both file arguments are mandatory; anything else shows the usage text.
-if ($HELP || !defined $inFile || !defined $outFile) {
-  Help();
-  exit(0);
-}
-if (!$QUIET) {
-  License();
-}
-### Untaint ###
-$inFile = untaintPath($inFile);
-$outFile = untaintPath($outFile);
-$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
-### End untaint ###
-if($isDebug){
-  print STDERR "\n# Processing file $inFile & output to $outFile\n";
-}
-# processFile appends everything it extracts to this file-level buffer.
-my $markupOutput = "";
-processFile($inFile);
-if($isDecode){
-  $markupOutput = decode_entities($markupOutput);
-}
-# A lexical handle avoids the global bareword OF. Output is buffered, so a
-# failed write (for example a full disk) may only surface at print or close;
-# both are checked so that a truncated output file is not mistaken for success.
-open(my $outHandle, ">:utf8", $outFile) || die"#Can't open file \"$outFile\"\n";
-print {$outHandle} $markupOutput or die "#Can't write to file \"$outFile\": $!\n";
-close($outHandle) || die "#Can't close file \"$outFile\": $!\n";
-# Scan an Omnipage XML file line by line and append the extracted markup to
-# the file-level $markupOutput buffer.
-#
-# Parameters:
-#   $inFile - path of the XML file to read (opened as UTF-8).
-# Dies if the file does not exist or cannot be opened.
-#
-# Emits "||| Page", "||| Figure", "||| Table", "||| ParaTable" and "||| Para"
-# header lines followed by the captured tag attributes. The raw lines of each
-# paragraph are collected and handed to processPara when </para> is reached.
-sub processFile {
-  my ($inFile) = @_;
-  if (!(-e $inFile)) { die "# $progname crash\t\tFile \"$inFile\" doesn't exist"; }
-  # Lexical handle: no global bareword, and it is closed automatically on die.
-  open(my $inHandle, "<:utf8", $inFile) || die "# $progname crash\t\tCan't open \"$inFile\"";
-  my $isPara = 0;   # currently between <para ...> and </para>
-  my $isTable = 0;  # currently between <table ...> and </table>
-  my $text = "";    # raw lines of the paragraph being collected
-  # Reading into a lexical keeps the caller's $_ intact.
-  while (my $line = <$inHandle>) {
-    if ($line =~ /^\#/) { next; }  # skip comments
-    chomp($line);
-    $line =~ s/\cM$//; # remove ^M character at the end of the line if any
-    # The page marker is tested on its own, outside the chain below.
-    if ($line =~ /<theoreticalPage (.*)\/>/){
-      $markupOutput .= "||| Page $1\n";
-    }
-    ### pic ###
-    if ($line =~ /^<dd (.*)>$/){
-      $markupOutput .= "||| Figure $1\n";
-    }
-    elsif ($line =~ /^<\/dd>$/){
-      # End of a picture: nothing to emit. The branch stays so that this line
-      # is never collected as paragraph text.
-    }
-    ### Table ###
-    elsif ($line =~ /^<table (.*)>$/){
-      $isTable = 1;
-      $markupOutput .= "||| Table $1\n";
-    }
-    elsif ($line =~ /^<\/table>$/){
-      $isTable = 0;
-    }
-    ### Paragraph ###
-    # Note: table processing should have higher priority than paragraph, i.e. the priority does matter
-    elsif ($line =~ /^<para (.*)>$/){
-      $text .= $line."\n"; # we need the header
-      $isPara = 1;
-      if($isTable){
-        $markupOutput .= "||| ParaTable $1\n";
-      } else {
-        $markupOutput .= "||| Para $1\n";
-      }
-    }
-    elsif ($line =~ /^<\/para>$/){
-      processPara($text);
-      $isPara = 0;
-      $text = "";
-    }
-    elsif($isPara){
-      # Any other line inside a paragraph is kept for processPara.
-      $text .= $line."\n";
-    }
-  }
-  close($inHandle);
-}
-# Look up the value of one attribute inside a tag's attribute text.
-#
-# Parameters:
-#   $attrText - attribute text of a tag, e.g. 'l="1" forcedEOF="true"'.
-#   $attr     - attribute name to look for, taken literally.
-# Returns the quoted value, or the string "none" when the attribute is
-# absent or its value is empty. If the attribute occurs more than once, the
-# last occurrence wins (the leading .* is greedy).
-sub getAttrValue {
-  my ($attrText, $attr) = @_;
-  # \Q...\E keeps regex metacharacters in $attr from being interpreted.
-  # The lookbehind stops a name from matching as the tail of a longer one
-  # (e.g. "EOF" inside "forcedEOF").
-  # [^\"]+ keeps the capture inside one pair of quotes.
-  if($attrText =~ /^.*(?<![\w:-])\Q$attr\E=\"([^\"]+)\".*$/){
-    return $1;
-  }
-  return "none";
-}
-# Tally one attribute value: when $attr occurs in $attrText, add $count to the
-# entry for its value in the hash referenced by $attrHash. The hash is left
-# untouched when the attribute is not present.
-sub checkFontAttr {
-  my ($attrText, $attr, $attrHash, $count) = @_;
-  if($attrText =~ /^.*$attr=\"(.+?)\".*$/){
-    my $seenValue = $1;
-    my $soFar = $attrHash->{$seenValue};
-    # A missing (or zero) entry starts at $count, otherwise it accumulates.
-    $attrHash->{$seenValue} = $soFar ? ($soFar + $count) : $count;
-  }
-}
-# Walk the raw XML lines of one paragraph and append per-line / per-word
-# markup to the file-level $markupOutput buffer.
-#
-# Parameters:
-#   $inputText - the paragraph's XML, one tag per line, joined with "\n".
-#
-# Markup for the current <ln> is first collected in $tmpMarkupOutput and only
-# copied to $markupOutput when </ln> is reached and the line passes the
-# emptiness test below. Reads the file-level flags $isAllowEmpty and $isDebug.
-sub processPara {
-  my ($inputText) = @_;
-  my $isSpace = 0;
-  my $isSpecialSpace = 0;
-  my $isTab = 0;
-  my $isBullet = 0;
-  my $isForcedEOF = "none";  # 3 signals for end of L: forcedEOF=\"true\" in attribute of <ln> or || <nl orig=\"true\"\/> || end of </para> without encountering any of the above signal in the para plus $isSpace = 0
-  my $lnAttr; my $isLn = 0; my $lnBold = "none"; my $lnItalic = "none";
-  my $runAttr;  my $runText = ""; my $isRun = 0; my $runBold = "none"; my $runItalic = "none";
-  my $wdAttr; my $wdText = ""; my $isWd = 0;
-  # $text collects the visible text of the current line; it is only used to
-  # decide at </ln> whether the line counts as empty.
-  my $text = "";
-  my $tmpMarkupOutput = "";
-#  binmode(STDERR, ":utf8");
-  my @lines = split(/\n/, $inputText);
-  # An index loop is used because the <space/> branch looks at the previous
-  # and the next line.
-  for(my $i=0; $i<scalar(@lines); $i++){
-    my $line = $lines[$i];
-    ## new ln
-    if ($line =~ /^<ln (.+)>$/){
-      $lnAttr = $1;
-      $isLn = 1;
-      $tmpMarkupOutput .= "||| Line $lnAttr\n";
-      $isForcedEOF = getAttrValue($lnAttr, "forcedEOF");
-    }
-    ## new run
-    # The pattern is not anchored at the start, so it also matches a line that
-    # opens a <wd ...> and a <run ...> together.
-    elsif ($line =~ /<run (.*)>$/){
-      $runAttr = $1;
-      $isSpace = 0;
-      $isTab = 0;
-      $isRun = 1;
-      if($line =~ /^<wd (.*?)>/){  # new wd, that consists of many runs
-	$isWd = 1;
-	$wdAttr = $1;
-      }
-    }
-    ## wd
-    # A complete word on one line: emit "<word> <attributes>".
-    elsif ($line =~ /^<wd (.+)?>(.+)<\/wd>$/){
-      $wdAttr = $1;
-      my $word = $2;
-      $isSpace = 0;
-      $isTab = 0;
-      $word =~ s/\cM$//g; # remove ^M character
-      $tmpMarkupOutput .= "$word $wdAttr\n";
-      ## add text
-      $text .= "$word";
-      if($isRun) {
-	$runText .= "$word ";
-      }
-    }
-    ## end wd
-    # Closes a word that was built from several runs: emit the text gathered
-    # in $wdText together with the attributes saved when the word was opened.
-    elsif ($line =~ /^<\/wd>$/){
-      $isWd = 0;
-      $tmpMarkupOutput .= "$wdText $wdAttr\n";
-      $wdAttr = "";
-      $wdText = "";
-    }
-    ## end run
-    # Anything before </run> on the same line is the run's text.
-    elsif ($line =~ /^(.*)<\/run>$/){
-      my $word = $1;
-      ## add text
-      if($word ne ""){
-	$word =~ s/\cM$//g; # remove ^M character
-	# appear in the final result
-	if($isLn){ $text .= "$word"; }
-	# for internal record
-	if($isRun){ $runText .= "$word "; }
-	if($isWd){ $wdText .= "$word"; }
-      }
-      ## reset run
-      $runText = "";
-      $isRun = 0;
-      $isSpecialSpace = 0;
-    }
-    ## end ln
-    # Commit or drop the buffered markup of the line that just ended.
-    # Without -allowEmptyLine the line needs at least one non-whitespace
-    # character; with it, any non-empty text is enough.
-    # NOTE(review): when the outer test passes but the inner one fails,
-    # neither $tmpMarkupOutput nor $text is cleared, so both carry over into
-    # the next line - confirm this is intended.
-    elsif ($line =~ /^<\/ln>$/){
-      if((!$isAllowEmpty && $text !~ /^\s*$/)
-	 || ($isAllowEmpty && $text ne "")){
-	if($isForcedEOF eq "true" || # there's a forced EOL?
-	   (!$isSpecialSpace) # not an empty line with space character
-	  ){
-	  $text .= "\n";
-	  $markupOutput .= $tmpMarkupOutput;
-	  $tmpMarkupOutput = "";
-	  $text = "";
-	}
-      } else {
-	$tmpMarkupOutput = "";
-      }
-      ## reset ln
-      $isLn = 0;
-      $isForcedEOF = "none";
-      $isSpecialSpace = 0;
-    } # end else </ln>
-    ## nl newline signal
-    elsif ($line =~ /^<nl orig=\"true\"\/>$/){
-      if($isLn){
-	$isSpace = 0;
-      } else {
-	if($isDebug){
-	  print STDERR "#!!! Warning: found <nl orig=\"true\"\/> while not in tag <ln>: $line\n";
-	}
-      }
-    }
-    ## space
-    # A space is "special" when the previous line opens a tag and the next
-    # line closes a tag of the same name, i.e. the space is that tag's only
-    # content.
-    elsif ($line =~ /^<space\/>$/){
-      my $startTag = "";
-      my $endTag = "";
-      if($i>0 && $lines[$i-1] =~ /^<(.+?)\b.*/){
-	$startTag = $1;
-      }
-      if($i < (scalar(@lines) -1) && $lines[$i+1] =~ /^<\/(.+)>/){
-	$endTag = $1;
-      }
-      if($startTag eq $endTag && $startTag ne ""){
-	$isSpecialSpace = 1;
-      }
-      ## addText
-      $text .= " ";
-      $isSpace = 1;
-    }
-    ## tab
-    elsif ($line =~ /^<tab .*\/>$/){
-      ## add Text
-      $text .= "\t";
-      $isTab = 1;
-    }
-    ## bullet
-    elsif ($line =~ /^<bullet .*>$/){
-      $isBullet = 1;
-    }
-  }
-}
-# Validate a file path against a whitelist of characters (word characters,
-# "-", "_", "/" and ".") and return the captured, untainted copy.
-# Dies with "Bad path ..." when any other character is present.
-sub untaintPath {
-  my ($path) = @_;
-  die "Bad path \"$path\"\n" unless $path =~ /^([-_\/\w\.]*)$/;
-  return $1;
-}
-# Validate an arbitrary string against a whitelist (word characters, space and
-# - @ ( ) , . /) and return the captured, untainted copy.
-# Dies with "Bad data in ..." when the string is empty or holds anything else.
-sub untaint {
-  my ($s) = @_;
-  unless ($s =~ /^([\w \-\@\(\),\.\/]+)$/) {
-    die "Bad data in $s";  # log this somewhere
-  }
-  return $1;               # the capture is untainted
-}
-# Run a command line through system() after passing it through untaint().
-# With -log the command is echoed to STDERR first.
-# Returns the status reported by system().
-sub execute {
-  my ($cmd) = @_;
-  print STDERR "Executing: $cmd\n" if $isDebug;
-  my $safeCmd = untaint($cmd);
-  return system($safeCmd);
-}
-# Build a temporary-file name of the form YYYYMMDD-HHMMSS-PID from the local
-# time and the current process id.
-# The name is produced in-process with POSIX::strftime instead of spawning
-# date(1), so it depends neither on PATH nor on an external program.
-# Returns the name as a string without a trailing newline.
-sub newTmpFile {
-  require POSIX;
-  return POSIX::strftime("%Y%m%d-%H%M%S", localtime()) . "-$$";
-}

data/parscit/bin/xml2train.pl DELETED Viewed

@@ -1,193 +0,0 @@
-#!/usr/bin/perl
-# Author: Do Hoang Nhat Huy <dcsdhnh@nus.edu.sg>, generated at Fri, 3 Dec 2010 14:36:00
-# Modified from template by Min-Yen Kan <kanmy@comp.nus.edu.sg>
-require 5.0;
-use strict;
-use FindBin;
-use Getopt::Long;
-# I do not know a better solution to find a lib path in -T mode.
-# So if you know a better solution, I'd be glad to hear.
-# See this http://www.perlmonks.org/?node_id=585299 for why I used the below code
-my $path;	# Path to Parscit binary directory
-BEGIN
-{
-	if ($FindBin::Bin =~ /(.*)/) { $path  = $1; }
-}
-use lib "$path/../lib";
-# Local libraries
-use Omni::Omnidoc;
-use ParsCit::Tr2crfpp;
-use ParsCit::PreProcess;
-# Dependencies
-### USER customizable section
-my $version = "1.0";
-$0 =~ /([^\/]+)$/; my $progname = $1;
-### END user customizable section
-# Write the copyright notice to STDERR.
-# Returns whatever print returns.
-sub License
-{
-	my $notice = "# Copyright 2011 \251 by Do Hoang Nhat Huy\n";
-	return print STDERR $notice;
-}
-### HELP Sub-procedure
-# Print the usage message and the option summary to STDERR.
-# Takes no arguments and does not exit; the caller decides what happens next.
-# The usage line lists only options that the GetOptions call in this script
-# accepts.
-sub Help
-{
-	print STDERR "Process Omnipage XML output (Reference Section Only) and extract text lines together with other XML information\n";
-	print STDERR "usage: $progname -h\t[invokes help]\n";
-	print STDERR "       $progname -in xmlfile -out outfile -opt option [-codec codec -q]\n";
-	print STDERR "Options:\n";
-	print STDERR "\t-q\t\t\tQuiet Mode (don't echo license)\n";
-	print STDERR "\t-in\t\t\tXML input from Omnipage\n";
-	print STDERR "\t-out\t\tOutput file\n";
-	print STDERR "\t-codec\t\tCodec of the input XML: utf-16 or utf-8. Default is utf-8\n";
-	print STDERR "\t-opt\t\tOption: train (output is train file for crf++) or xml (output is xml features). Default is train\n";
-}
-# Command-line state, filled in by GetOptions below.
-my $help    = 0;
-my $quiet   = 0;
-my $infile  = undef;
-my $outfile = undef;
-my $option  = "train";
-my $codec   = "utf-8";
-$help = 1 unless GetOptions('in=s'    => \$infile,
-                            'out=s'   => \$outfile,
-                            'opt=s'   => \$option,
-                            'codec=s' => \$codec,
-                            'h'       => \$help,
-                            'q'       => \$quiet);
-# Both file arguments are mandatory; anything else shows the usage text.
-if ($help || !defined $infile || !defined $outfile)
-{
-	Help();
-	exit(0);
-}
-if (!$quiet)
-{
-	License();
-}
-# Sanity check
-if (($option ne "train") && ($option ne "xml"))
-{
-	die "Die: -opt must equal \"train\" or \"xml\".\n";
-}
-if (($codec ne "utf-8") && ($codec ne "utf-16"))
-{
-	die "Die: -codec must equal \"utf-8\" or \"utf-16\".\n";
-}
-# Untaint check
-$infile		= UntaintPath($infile);
-$outfile 	= UntaintPath($outfile);
-$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
-# End untaint check
-# MAIN
-# A UTF-16 input is first converted into a UTF-8 sibling file. That converted
-# file, not the original, is the one that has to be read afterwards.
-my $infile_utf8 = $infile . "-utf8";
-my $xml_path    = $infile;
-if ($codec eq "utf-16")
-{
-	Convert($infile, "UTF16", $infile_utf8, "UTF8");
-	$xml_path = $infile_utf8;
-}
-# "return" is not allowed at file scope, so a failed open aborts with die.
-open(my $in, "<:utf8", $xml_path) or die "Could not open xml file " . $xml_path . ": " . $! . "\n";
-my $xml = do { local $/; <$in> };
-close($in);
-# Cleanup
-CleanUp(\$xml);
-# New document
-my $doc = Omni::Omnidoc->new();
-$doc->set_raw($xml);
-# Extract the reference portion from the XML
-my ($start_ref, $end_ref, $rcite_text_from_xml) = ParsCit::PreProcess::findCitationTextXML($doc);
-if ($option eq "train")
-{
-	# Prepare to split unmarked reference portion
-	my $tmp_file = ParsCit::Tr2crfpp::prepDataUnmarked($doc, $start_ref, $end_ref);
-	# Save the temporary file
-	my $cmd = "mv " . $tmp_file . " " . $outfile;
-	Execute($cmd);
-}
-else
-{
-	# NOTE(review): the "xml" option is accepted above but produces no output
-	# here - confirm whether it was ever implemented.
-}
-# END
-# Convert the input XML from one character encoding to another with iconv(1).
-#
-# Parameters:
-#   $from_file   - file to read.
-#   $from_encode - encoding name of $from_file, as understood by iconv.
-#   $to_file     - file to write.
-#   $to_encode   - encoding name for $to_file.
-#   $log         - accepted for compatibility; not used.
-# Returns the status reported by system() (0 on success). A failure is
-# reported on STDERR but does not abort the program.
-sub Convert
-{
-	my ($from_file, $from_encode, $to_file, $to_encode, $log) = @_;
-	# The command is passed to system() as a list, so no shell ever parses the
-	# file names: spaces or shell metacharacters in a path cannot split or
-	# extend the command.
-	my @cmd = ("iconv", "-f", $from_encode, "-t", $to_encode, $from_file, "-o", $to_file);
-	print STDERR "Executing: @cmd\n";
-	# Transformation
-	my $status = system(@cmd);
-	if ($status != 0)
-	{
-		warn "iconv failed (status $status) while converting \"$from_file\"\n";
-	}
-	return $status;
-}
-# Clean up the input XML in place.
-# $ref_xml is a reference to the XML string: the XML declaration line and the
-# "<!--XML ...-->" generator comment line are removed, then the remaining text
-# is wrapped in a <root> element.
-sub CleanUp
-{
-	my ($ref_xml) = @_;
-	# First  pattern: <?xml version="1.0" encoding="UTF-8"?>
-	# Second pattern: <!--XML document generated using OCR technology from ScanSoft, Inc.-->
-	for my $noise (qr/<\?xml.+?>\n/, qr/<\!\-\-XML.+?>\n/)
-	{
-		$$ref_xml =~ s/$noise//g;
-	}
-	# Add the root tag
-	$$ref_xml = join("\n", "<root>", $$ref_xml, "</root>");
-}
-# Validate a file path against the character whitelist in the pattern below
-# and return the captured, untainted copy.
-# Dies with "Bad path ..." when the path is empty or holds any other character.
-sub UntaintPath
-{
-	my ($path) = @_;
-	die "Bad path \"$path\"\n" unless $path =~ /^([-_:" \/\w\.%\p{C}\p{P}]+)$/;
-	return $1;
-}
-# Validate an arbitrary string against the character whitelist in the pattern
-# below and return the captured, untainted copy.
-# Dies with "Bad data in ..." when the string is empty or holds anything else.
-sub Untaint
-{
-	my ($s) = @_;
-	unless ($s =~ /^([\w \-\@\(\),\.\/>\p{C}\p{P}]+)$/)
-	{
-		die "Bad data in $s";  # log this somewhere
-	}
-	return $1;                 # the capture is untainted
-}
-# Echo a command line to STDERR and run it with system().
-#
-# Parameters:
-#   $cmd - the complete command line, as a single string.
-# Returns the status reported by system() (0 on success). A non-zero status
-# is reported on STDERR instead of being dropped silently; the program is not
-# aborted.
-sub Execute
-{
-	my ($cmd) = @_;
-	print STDERR "Executing: $cmd\n";
-	my $status = system($cmd);
-	if ($status != 0)
-	{
-		warn "Command failed (status $status): $cmd\n";
-	}
-	return $status;
-}

data/parscit/lib/Omni/Config.pm DELETED Viewed

@@ -1,93 +0,0 @@
-package Omni::Config;
-# Shared constants for the Omni library: lookup tables that map symbolic keys
-# to Omnipage XML tag names, attribute names and object type names.
-# Every variable is a package global, reachable from other modules as
-# $Omni::Config::NAME or %Omni::Config::NAME.
-use strict;
-use warnings;
-# Global
-# Names of the classes
-our $ALG_NAME    = "Omni";
-# Version
-our $ALG_VERSION = "110505";
-# All Omnipage XML tags
-our %omni_tag_list = (
-    'DOCUMENT'    => 'document',
-    'PAGE'        => 'page',
-    'COLUMN'      => 'column',
-    'DESC'        => 'description',
-    'SRC'         => 'source',
-    'LANGUAGE'    => 'language',
-    'STYLE'       => 'style',
-    'STYLE-TABLE' => 'styleTable',
-    'THEO-PAGE'   => 'theoreticalPage',
-    'BODY'        => 'body',
-    'SECTION'     => 'section',
-    'COL'         => 'column',
-    'PARA'        => 'para',
-    'LINE'        => 'ln',
-    'WORD'        => 'wd',
-    'SPACE'       => 'space',
-    'RUN'         => 'run',
-    'BULLET'      => 'bullet',
-    'TABLE'       => 'table',
-    'GRID'        => 'gridTable',
-    'GRID-COL'    => 'gridCol',
-    'GRID-ROW'    => 'gridRow',
-    'CELL'        => 'cell',
-    'BOTTOM-CELL' => 'bottomBorder',
-    'TOP-CELL'    => 'topBorder',
-    'LEFT-CELL'   => 'leftBorder',
-    'RIGHT-CELL'  => 'rightBorder',
-    'NEWLINE'     => 'nl',
-    'TAB'         => 'tab',
-    'DD'          => 'dd',
-    'PICTURE'     => 'picture',
-    'FRAME'       => 'frame'
-);
-# Reference to the tag table
-our $tag_list = \%omni_tag_list;
-# All Omnipage XML attributes
-our %omni_att_list = (
-    'ALIGN'      => 'alignment',
-    'FONTFACE'   => 'fontFace',
-    'FONTFAMILY' => 'fontFamily',
-    'FONTPITCH'  => 'fontPitch',
-    'FONTSIZE'   => 'fontSize',
-    'UNDERLINE'  => 'underline',
-    'SPACING'    => 'spacing',
-    'SCALE'      => 'scale',
-    'BOTTOM'     => 'b',
-    'TOP'        => 't',
-    'LEFT'       => 'l',
-    'RIGHT'      => 'r',
-    'LANGUAGE'   => 'language',
-    'SUSCRIPT'   => 'subsuperscript',
-    'BASELINE'   => 'baseline',
-    'BOLD'       => 'bold',
-    'ITALIC'     => 'italic',
-    'SPACEB'     => 'spaceBefore',
-    # These attributes usually go with the <dd> tag
-    'BOTTOMDIST' => 'bottomDistance',
-    'TOPDIST'    => 'topDistance',
-    'LEFTDIST'   => 'leftDistance',
-    'RIGHTDIST'  => 'rightDistance',
-    # These attributes usually go with the <cell> tag
-    'GROWFROM'   => 'gridRowFrom',
-    'GROWTO'     => 'gridRowTill',
-    'GCOLFROM'   => 'gridColFrom',
-    'GCOLTO'     => 'gridColTill',
-    'VALIGN'     => 'verticalAlignment',
-);
-# Reference to the attribute table
-our $att_list = \%omni_att_list;
-# All object types in the Omni library
-our %omni_obj_list = (
-    'OMNIDOC'   => 'document',
-    'OMNIPAGE'  => 'page',
-    'OMNICOL'   => 'column',
-    'OMNIDD'    => 'dd',
-    'OMNITABLE' => 'table',
-    'OMNIIMG'   => 'image',
-    'OMNIPARA'  => 'paragraph',
-    'OMNILINE'  => 'line',
-    'OMNIRUN'   => 'run',
-    'OMNIWORD'  => 'word',
-    'OMNIFRAME' => 'frame',
-);
-# Reference to the object type table
-our $obj_list = \%omni_obj_list;
-1;