RubyGems - biblicit - Versions diffs - 2.0.3 → 2.0.4 - Mend

biblicit 2.0.3 → 2.0.4

Files changed (28) hide show

data/README.md +0 -2
data/biblicit.gemspec +1 -1
data/parscit/bin/citeExtract.pl +9 -161
data/parscit/bin/sectExtract.pl +0 -14
data/parscit/lib/ParsCit/Controller.pm +0 -59
data/parscit/lib/ParsCit/PreProcess.pm +0 -4
data/parscit/lib/ParsCit/Tr2crfpp.pm +1 -7
metadata +4 -24
data/parscit/bin/sectLabel/processOmniXML.pl +0 -1427
data/parscit/bin/sectLabel/processOmniXML_new.pl +0 -1025
data/parscit/bin/sectLabel/processOmniXMLv2.pl +0 -1529
data/parscit/bin/sectLabel/processOmniXMLv3.pl +0 -964
data/parscit/bin/sectLabel/simplifyOmniXML.pl +0 -382
data/parscit/bin/xml2train.pl +0 -193
data/parscit/lib/Omni/Config.pm +0 -93
data/parscit/lib/Omni/Omnicell.pm +0 -263
data/parscit/lib/Omni/Omnicol.pm +0 -292
data/parscit/lib/Omni/Omnidd.pm +0 -328
data/parscit/lib/Omni/Omnidoc.pm +0 -153
data/parscit/lib/Omni/Omniframe.pm +0 -223
data/parscit/lib/Omni/Omniline.pm +0 -423
data/parscit/lib/Omni/Omnipage.pm +0 -282
data/parscit/lib/Omni/Omnipara.pm +0 -232
data/parscit/lib/Omni/Omnirun.pm +0 -303
data/parscit/lib/Omni/Omnitable.pm +0 -336
data/parscit/lib/Omni/Omniword.pm +0 -162
data/parscit/lib/Omni/Traversal.pm +0 -313
data/parscit/lib/SectLabel/AAMatching.pm +0 -1949

data/parscit/lib/SectLabel/AAMatching.pm DELETED Viewed

@@ -1,1949 +0,0 @@
-package SectLabel::AAMatching;
-###
-# This package provides methods to solve the matching problem
-# between author and affiliation in a pdf
-#
-# Do Hoang Nhat Huy 21 Apr, 11
-###
-use strict;
-# Dependencies
-use POSIX;
-use IO::File;
-use XML::Writer;
-use XML::Writer::String;
-use	Class::Struct;
-# Local libraries
-use SectLabel::Config;
-use ParsCit::PostProcess;
-# Dictionary
-# Package-level lookup table shared by the subs of this module.
-# NOTE(review): presumably filled by ReadDict (called from AAMatching, defined
-# outside this part of the file) - confirm.
-my %dict = ();
-# CRF++
-# Command of the CRF++ tester; it is interpolated into the system() calls made
-# by AAMatchingImp, AffiliationExtraction and AuthorExtraction.
-my $crft = $SectLabel::Config::crf_test;
-# Record holding the matching (relational classifier) features of one author:
-#	signals						- list of the signals attached to the author
-#	top, bottom, left, right	- coordinates
-#	page, section, para, line	- position
-struct(
-	'aut_rcfeatures',
-	{
-		signals	=> '@',
-		( map { $_ => '$' } qw(top bottom left right page section para line) ),
-	}
-);
-# Record holding the matching (relational classifier) features of one
-# affiliation:
-#	signals						- list of the signals attached to the affiliation
-#	top, bottom, left, right	- coordinates
-#	page, section, para, line	- position
-struct(
-	'aff_rcfeatures',
-	{
-		signals	=> '@',
-		( map { $_ => '$' } qw(top bottom left right page section para line) ),
-	}
-);
-# Match the authors of a document with their affiliations and report the
-# result as XML.
-#
-# Parameters:
-#	$doc		- document object, handed to Omni::Traversal::OmniCollector
-#	$aut_addrs	- addresses of the author lines
-#	$aff_addrs	- addresses of the affiliation lines
-# Returns:
-#	the XML text: an <algorithm> element wrapping <results>, which lists every
-#	author with the institutions matched to him/her, followed by the list of
-#	all institutions found
-sub AAMatching
-{
-	my ($doc, $aut_addrs, $aff_addrs) = @_;
-	my $need_object	= 1;
-	# Get the author objects
-	my $aut_lines	= Omni::Traversal::OmniCollector($doc, $aut_addrs, $need_object);
-	# Get the affiliation objects
-	my $aff_lines	= Omni::Traversal::OmniCollector($doc, $aff_addrs, $need_object);
-	# Dictionary
-	ReadDict($FindBin::Bin . "/../" . $SectLabel::Config::dictFile);
-	# Authors
-	my ($aut_features, $aut_rc_features) = AuthorFeatureExtraction($aut_lines, $aut_addrs);
-	# Call CRF
-	my ($aut_signal, $aut_rc) = AuthorExtraction($aut_features, $aut_rc_features);
-	# Affiliations
-	my ($aff_features, $aff_rc_features) = AffiliationFeatureExtraction($aff_lines, $aff_addrs);
-	# Call CRF
-	my ($aff_signal, $aff_rc, $affs) = AffiliationExtraction($aff_features, $aff_rc_features);
-	# Matching features
-	my $aa_features = AAFeatureExtraction($aut_rc, $aff_rc);
-	# Matching
-	my $aa			= AAMatchingImp($aa_features);
-=pod
-	# DEBUG
-	my $aut_handle	= undef;
-	my $aff_handle	= undef;
-	my $aau_handle	= undef;
-	my $aaf_handle	= undef;
-	my $aut_debug	= undef;
-	my $aff_debug	= undef;
-	my $aa_handle	= undef;
-	open $aut_handle, ">:utf8", "aut.features";
-	open $aff_handle, ">:utf8", "aff.features";
-	open $aau_handle, ">:utf8", "aau.features";
-	open $aaf_handle, ">:utf8", "aaf.features";
-	open $aut_debug, ">:utf8", "aut.debug.features";
-	open $aff_debug, ">:utf8", "aff.debug.features";
-	open $aa_handle, ">:utf8", "aa.features";
-	print $aut_handle $aut_features;
-	print $aff_handle $aff_features;
-	print $aau_handle $aut_rc_features;
-	print $aaf_handle $aff_rc_features;
-	print $aa_handle $aa_features, "\n";
-	foreach my $author (keys %{ $aut_rc } )
-	{
-		print $aut_debug $author, ": ", "\n";
-		foreach my $signal (@{ $aut_rc->{ $author }->signals })
-		{
-			print $aut_debug "\t", $signal, "\n";
-		}
-		print $aut_debug "\t", $aut_rc->{ $author }->top, "\n";
-		print $aut_debug "\t", $aut_rc->{ $author }->bottom, "\n";
-		print $aut_debug "\t", $aut_rc->{ $author }->left, "\n";
-		print $aut_debug "\t", $aut_rc->{ $author }->right, "\n";
-		print $aut_debug "\t", $aut_rc->{ $author }->page, "\n";
-		print $aut_debug "\t", $aut_rc->{ $author }->section, "\n";
-		print $aut_debug "\t", $aut_rc->{ $author }->para, "\n";
-		print $aut_debug "\t", $aut_rc->{ $author }->line, "\n";
-	}
-	foreach my $affiliation (keys %{ $aff_rc } )
-	{
-		print $aff_debug $affiliation, ": ", "\n";
-		foreach my $signal (@{ $aff_rc->{ $affiliation }->signals })
-		{
-			print $aff_debug "\t", $signal, "\n";
-		}
-		print $aff_debug "\t", $aff_rc->{ $affiliation }->top, "\n";
-		print $aff_debug "\t", $aff_rc->{ $affiliation }->bottom, "\n";
-		print $aff_debug "\t", $aff_rc->{ $affiliation }->left, "\n";
-		print $aff_debug "\t", $aff_rc->{ $affiliation }->right, "\n";
-		print $aff_debug "\t", $aff_rc->{ $affiliation }->page, "\n";
-		print $aff_debug "\t", $aff_rc->{ $affiliation }->section, "\n";
-		print $aff_debug "\t", $aff_rc->{ $affiliation }->para, "\n";
-		print $aff_debug "\t", $aff_rc->{ $affiliation }->line, "\n";
-	}
-	close $aut_handle;
-	close $aff_handle;
-	close $aau_handle;
-	close $aaf_handle;
-	close $aut_debug;
-	close $aff_debug;
-	close $aa_handle;
-	# END
-=cut
-	# Serialize the matching
-	# XML string
-	my $sxml 	= "";
-	# and XML writer (direct method call instead of the ambiguous indirect
-	# object syntax "new XML::Writer(...)")
-	my $writer	= XML::Writer->new(OUTPUT => \$sxml, ENCODING => 'utf-8', DATA_MODE => 'true', DATA_INDENT => 2);
-	# Algorithm
-	$writer->startTag("algorithm", "name" => "AAMatching", "version" => $SectLabel::Config::algorithmVersion);
-	# Human readable time stamp, still taken from date(1) so that the format of
-	# the "date" attribute does not change
-	my $date = `date`; chomp($date);
-	# Seconds since the epoch: the value `date +%s` prints, without a subshell
-	my $time = time;
-	# Write XML header
-	$writer->startTag("results", "time" => $time, "date" => $date);
-	# Write authors
-	$writer->startTag("authors");
-	# Write the author name and his corresponding institution
-	foreach my $author (keys %{ $aut_signal })
-	{
-		$writer->startTag("author");
-		$writer->startTag("fullname", "source" => "parscit");
-		$writer->characters($author);
-		$writer->endTag("fullname");
-		$writer->startTag("institutions");
-=pod
-		foreach my $signal (@{ $aut_signal->{ $author } })
-		{
-			$signal =~ s/^\s+|\s+$//g;
-			# Skip blank
-			if ($signal eq "") { next; }
-			$writer->startTag("institution", "symbol" => $signal);
-			$writer->characters($aff_signal->{ $signal });
-			$writer->endTag("institution");
-		}
-=cut
-		# Institutions selected by the matching model; an author without any
-		# match gets an empty <institutions> element
-		foreach my $affiliation (@{ $aa->{ $author } })
-		{
-			$writer->startTag("institution");
-			$writer->characters($affiliation);
-			$writer->endTag("institution");
-		}
-		$writer->endTag("institutions");
-		$writer->endTag("author");
-	}
-	# Finish authors
-	$writer->endTag("authors");
-	# Write institutions
-	$writer->startTag("institutions");
-	# Write the institution name
-	foreach my $institute (@{ $affs })
-	{
-		$writer->startTag("institution");
-		$writer->characters($institute);
-		$writer->endTag("institution");
-	}
-	$writer->endTag("institutions");
-	# Done
-	$writer->endTag("results");
-	# Done
-	$writer->endTag("algorithm");
-	# Done
-	$writer->end();
-	# Return the xml content back to the caller
-	return $sxml;
-}
-# Build the input of the author-affiliation relational classifier.
-#
-# Parameters:
-#	$aut_rc	- hash ref, author name => record with signals, coordinates and position
-#	$aff_rc	- hash ref, affiliation name => record with signals, coordinates and position
-# Returns:
-#	a string with one line per (author, affiliation) pair holding the tab
-#	separated columns: "author#affiliation" (blanks replaced by "|||"),
-#	same/diff signal, same page, same section, same paragraph, same line,
-#	nearest affiliation on the x axis, nearest affiliation on the y axis
-sub AAFeatureExtraction
-{
-	my ($aut_rc, $aff_rc) = @_;
-	# All affiliation names, visited in the same order for every author
-	my @aff_names	= keys %{ $aff_rc };
-	# Relational features
-	my $features	= "";
-	foreach my $author (keys %{ $aut_rc })
-	{
-		my $aut_rec		= $aut_rc->{ $author };
-		my $author_nb	= join '|||', split(/\s/, $author);
-		# Nearest affiliation of this author on each axis (centre to centre)
-		my ($min_aff_x, $min_aff_y)		= (undef, undef);
-		my ($min_dist_x, $min_dist_y)	= (LONG_MAX, LONG_MAX);
-		foreach my $aff (@aff_names)
-		{
-			my $aff_rec	= $aff_rc->{ $aff };
-			my $dis_x	= abs( ($aut_rec->left + $aut_rec->right) / 2 - ($aff_rec->left + $aff_rec->right) / 2 );
-			my $dis_y	= abs( ($aut_rec->top + $aut_rec->bottom) / 2 - ($aff_rec->top + $aff_rec->bottom) / 2 );
-			# The first affiliation reaching the minimum wins
-			if ($dis_x < $min_dist_x) { ($min_dist_x, $min_aff_x) = ($dis_x, $aff); }
-			if ($dis_y < $min_dist_y) { ($min_dist_y, $min_aff_y) = ($dis_y, $aff); }
-		}
-		# One feature line for each affiliation
-		foreach my $aff (@aff_names)
-		{
-			my $aff_rec	= $aff_rc->{ $aff };
-			my $aff_nb	= join '|||', split(/\s/, $aff);
-			# Signal: "same" when one of the author signals is the first signal of
-			# the affiliation
-			my $signal	= "diff";
-			if ((scalar(@{ $aut_rec->signals }) != 0) && (scalar(@{ $aff_rec->signals }) != 0))
-			{
-				my $aff_sig = ${ $aff_rec->signals }[ 0 ];
-				if (grep { $_ eq $aff_sig } @{ $aut_rec->signals }) { $signal = "same"; }
-			}
-			# Same page, section, paragraph, line: a level can only be "yes" when
-			# all the levels above it are
-			my @same	= ();
-			my $so_far	= 1;
-			foreach my $level (qw(page section para line))
-			{
-				$so_far = ($so_far && ($aut_rec->$level == $aff_rec->$level)) ? 1 : 0;
-				push @same, ($so_far ? "yes" : "no");
-			}
-			# Is it the nearest affiliation on the x axis, on the y axis ?
-			my $nearest_x = ($aff eq $min_aff_x) ? "yes" : "no";
-			my $nearest_y = ($aff eq $min_aff_y) ? "yes" : "no";
-			$features .= join("\t", $author_nb . "#" . $aff_nb, $signal, @same, $nearest_x, $nearest_y) . "\n";
-		}
-	}
-	return $features;
-}
-# Actually do the matching between author and affiliation: run the CRF++
-# matching model over the relational features and collect the pairs labelled
-# "yes".
-#
-# Parameters:
-#	$features	- feature lines; the first column of each line is
-#				  "author#affiliation" with blanks encoded as "|||"
-# Returns:
-#	hash ref, author name => array ref of the affiliation names matched to it;
-#	authors without any match have no entry
-# Dies when one of the temporary files cannot be opened or written.
-sub AAMatchingImp
-{
-	my ($features) = @_;
-	# Temporary input file for CRF
-	my $infile	= BuildTmpFile("aa-input");
-	# Temporary output file for CRF
-	my $outfile	= BuildTmpFile("aa-output");
-	# Write the features to the temporary input, adding a dummy label as the
-	# last column of every non-blank line
-	open my $output_handle, ">:utf8", $infile
-		or die "AAMatchingImp: cannot write $infile: $!";
-	foreach my $line (split /\n/, $features)
-	{
-		if ($line eq "")
-		{
-			print $output_handle "\n";
-		}
-		else
-		{
-			print $output_handle $line, "\t", "no", "\n";
-		}
-	}
-	# Buffered write errors only show up here
-	close $output_handle
-		or die "AAMatchingImp: cannot close $infile: $!";
-	# AA matching model
-	my $match_model = $SectLabel::Config::matFile;
-	# Matching; a failure is reported but not fatal, whatever CRF++ managed to
-	# write is still read below
-	if (system("$crft -m $match_model $infile > $outfile") != 0)
-	{
-		print STDERR "# AAMatchingImp: CRF++ did not finish properly (status " . $? . ")", "\n";
-	}
-	# List of authors and their affiliation (if exists)
-	my %aa = ();
-	# Read the CRF output
-	open my $input_handle, "<:utf8", $outfile
-		or die "AAMatchingImp: cannot read $outfile: $!";
-	# A lexical line variable keeps the caller's $_ untouched
-	while (my $line = <$input_handle>)
-	{
-		# Trim
-		$line =~ s/^\s+|\s+$//g;
-		# Skip blank lines
-		if ($line eq "") { next; }
-		# The class is the last column, the content the first one
-		my @fields = split /\t/, $line;
-		# Only the matching pairs are of interest
-		if ($fields[ -1 ] ne "yes") { next; }
-		# Split at the first "#" only, so that an affiliation which itself
-		# contains "#" is not truncated
-		my ($author, $aff) = split /#/, $fields[ 0 ], 2;
-		# Not an "author#affiliation" pair, nothing to save
-		if (! defined $aff) { next; }
-		$author	=~ s/\|\|\|/ /g;
-		$aff	=~ s/\|\|\|/ /g;
-		# Save, the list of the author is created on first use
-		push @{ $aa{ $author } }, $aff;
-	}
-	# Done
-	close $input_handle;
-	# Clean up
-	unlink $infile;
-	unlink $outfile;
-	# Done
-	return (\%aa);
-}
-# Extract affiliations and their signals using CRF++.
-#
-# Parameters:
-#	$features		- feature lines of the affiliation model, a blank line
-#					  separates two sections
-#	$rc_features	- relational features, line i belongs to line i of $features
-# Returns a list of three references:
-#	hash,  signal => affiliation name (only for non-empty signals)
-#	hash,  affiliation name => aff_rcfeatures record
-#	array, all the affiliation names in order of appearance
-# Dies when one of the temporary files cannot be opened or written.
-#
-# TODO: The code assumes that an affiliation will have the following format: 1 foobar institute
-sub AffiliationExtraction
-{
-	my ($features, $rc_features) = @_;
-	# Temporary input file for CRF
-	my $infile	= BuildTmpFile("aff-input");
-	# Temporary output file for CRF
-	my $outfile	= BuildTmpFile("aff-output");
-	# Write the features to the temporary input, adding a dummy label as the
-	# last column of every non-blank line
-	open my $output_handle, ">:utf8", $infile
-		or die "AffiliationExtraction: cannot write $infile: $!";
-	foreach my $line (split /\n/, $features)
-	{
-		if ($line eq "")
-		{
-			print $output_handle "\n";
-		}
-		else
-		{
-			print $output_handle $line, "\t", "affiliation", "\n";
-		}
-	}
-	# Buffered write errors only show up here
-	close $output_handle
-		or die "AffiliationExtraction: cannot close $infile: $!";
-	# Affiliation model
-	my $aff_model = $SectLabel::Config::affFile;
-	# Label the tokens; a failure is reported but not fatal, whatever CRF++
-	# managed to write is still read below
-	if (system("$crft -m $aff_model $infile > $outfile") != 0)
-	{
-		print STDERR "# AffiliationExtraction: CRF++ did not finish properly (status " . $? . ")", "\n";
-	}
-	# Each signal points to only one affiliation
-	my %asg = ();
-	# Each affiliation can have only one struct
-	my %aaf	= ();
-	# List of all affiliations
-	my @aff = ();
-	# Each line in the relational features string
-	my @rc_lines = split /\n/, $rc_features;
-	# Class of the run of tokens being collected
-	my $prev_class	= "";
-	# Tokens of the current affiliation and their relational features
-	my @aff_str		= ();
-	my @aaf_rc		= ();
-	# Tokens of the current signal
-	my $signal_str	= "";
-	# Line counter, index into @rc_lines
-	my $counter		= 0;
-	# Last signal seen, it belongs to the affiliation(s) following it
-	my $ntl_signal	= "";
-	# Close the current run of tokens: an affiliation is saved together with
-	# the last signal, a signal becomes the last signal, any other class is
-	# dropped. The token buffers are emptied in all cases.
-	my $flush = sub
-	{
-		if ($prev_class eq "affiliation")
-		{
-			my ($affiliation, $rcs) = NormalizeAffiliationName(\@aff_str, \@aaf_rc);
-			# Save the affiliation
-			push @aff, $affiliation;
-			# and its signal
-			if ($ntl_signal ne "") { $asg{ $ntl_signal } = $affiliation; }
-			# The signal is recorded even when it is empty
-			push @{ $rcs->signals }, $ntl_signal;
-			# Save the record
-			$aaf{ $affiliation } = $rcs;
-		}
-		elsif ($prev_class eq "signal")
-		{
-			$ntl_signal = NormalizeAffiliationSignal($signal_str);
-		}
-		@aff_str	= ();
-		@aaf_rc		= ();
-		$signal_str	= "";
-	};
-	# Read the CRF output
-	open my $input_handle, "<:utf8", $outfile
-		or die "AffiliationExtraction: cannot read $outfile: $!";
-	# A lexical line variable keeps the caller's $_ untouched
-	while (my $line = <$input_handle>)
-	{
-		# Trim
-		$line =~ s/^\s+|\s+$//g;
-		# Blank line marks the end of an affiliation section
-		if ($line eq "")
-		{
-			$flush->();
-			# A signal is never carried over to the next section
-			$ntl_signal = "";
-			$prev_class = "";
-			$counter++;
-			next;
-		}
-		# The class is the last column, the content the first one
-		my @fields	= split /\t/, $line;
-		my $class	= $fields[ -1 ];
-		my $content	= $fields[ 0 ];
-		# A new class closes the previous run
-		if ($class ne $prev_class)
-		{
-			$flush->();
-			$prev_class = $class;
-		}
-		if ($class eq "affiliation")
-		{
-			push @aff_str, $content;
-			push @aaf_rc, $rc_lines[ $counter ];
-		}
-		elsif ($class eq "signal")
-		{
-			$signal_str .= $content . " ";
-		}
-		# Update the counter
-		$counter++;
-	}
-	# Final class
-	$flush->();
-	# Done
-	close $input_handle;
-	# Clean up
-	unlink $infile;
-	unlink $outfile;
-	# Done
-	return (\%asg, \%aaf, \@aff);
-}
-# Normalize an affiliation signal: the returned string is the signal with all
-# its white space removed.
-sub NormalizeAffiliationSignal
-{
-	my ($signal_str) = @_;
-	for ($signal_str)
-	{
-		# Trim
-		s/^\s+|\s+$//g;
-		# Remove all space inside the signal
-		s/\s+//g;
-	}
-	return $signal_str;
-}
-# Build the name and the relational record of one affiliation.
-#
-# Parameters:
-#	$aff_str	- array ref, tokens of the affiliation
-#	$aaf_rc		- array ref, relational feature line of each token
-# Returns:
-#	(name, record): the tokens joined by a blank and an aff_rcfeatures record
-#	filled with columns 1 to 8 of the feature line of the first token
-# Dies when the two arrays differ in size.
-sub NormalizeAffiliationName
-{
-	my ($aff_str, $aaf_rc) = @_;
-	# Constraint
-	if (scalar(@{ $aff_str }) != scalar(@{ $aaf_rc }))
-	{
-		print STDERR "# It cannot happen, if you encounter it, please consider report it as a bug", "\n";
-		die;
-	}
-	# Relational features of an affiliation are those of its first word
-	my @columns	= split /\s/, $aaf_rc->[ 0 ];
-	my %place	= ();
-	@place{ qw(top bottom left right page section para line) } = @columns[ 1 .. 8 ];
-	my $record	= aff_rcfeatures->new(signals => [], %place);
-	# Done
-	return (join(' ', @{ $aff_str }), $record);
-}
-# Extract author names and their signals using CRF++.
-#
-# Parameters:
-#	$features		- feature lines of the author model, a blank line separates
-#					  two sections
-#	$rc_features	- relational features, line i belongs to line i of $features
-# Returns a list of two references:
-#	hash, author name => array ref of the signals of the author (an empty
-#		  array when the author has no signal)
-#	hash, author name => aut_rcfeatures record, its signals list holds the
-#		  same signals
-# Dies when one of the temporary files cannot be opened or written.
-sub AuthorExtraction
-{
-	my ($features, $rc_features) = @_;
-	# Temporary input file for CRF
-	my $infile	= BuildTmpFile("aut-input");
-	# Temporary output file for CRF
-	my $outfile	= BuildTmpFile("aut-output");
-	# Write the features to the temporary input, adding a dummy label as the
-	# last column of every non-blank line
-	open my $output_handle, ">:utf8", $infile
-		or die "AuthorExtraction: cannot write $infile: $!";
-	foreach my $line (split /\n/, $features)
-	{
-		if ($line eq "")
-		{
-			print $output_handle "\n";
-		}
-		else
-		{
-			print $output_handle $line, "\t", "ns", "\n";
-		}
-	}
-	# Buffered write errors only show up here
-	close $output_handle
-		or die "AuthorExtraction: cannot close $infile: $!";
-	# Author model
-	my $author_model = $SectLabel::Config::autFile;
-	# Split the authors; a failure is reported but not fatal, whatever CRF++
-	# managed to write is still read below
-	if (system("$crft -m $author_model $infile > $outfile") != 0)
-	{
-		print STDERR "# AuthorExtraction: CRF++ did not finish properly (status " . $? . ")", "\n";
-	}
-	# Each author can have one or more signals
-	my %asg = ();
-	# Each author can have only one struct
-	my %aas = ();
-	# Each line in the relational features string
-	my @rc_lines = split /\n/, $rc_features;
-	# Class of the run of tokens being collected
-	my $prev_class	= "";
-	# Tokens of the current author run and their relational features
-	my @author_str	= ();
-	my @author_rc	= ();
-	# Tokens of the current signal
-	my $signal_str	= "";
-	# Line counter, index into @rc_lines
-	my $counter		= 0;
-	# Authors waiting for their signal: the next signal goes to all of them
-	my %ntl_asg 	= ();
-	# Set while authors are being collected, cleared by a signal or a blank line
-	my $is_authors	= 0;
-	# Close the current run of tokens: authors are registered and wait for their
-	# signal, a signal is given to all the waiting authors, any other class is
-	# dropped. The token buffers are emptied in all cases.
-	my $flush = sub
-	{
-		if ($prev_class eq "author")
-		{
-			my ($authors, $rcs) = NormalizeAuthorNames(\@author_str, \@author_rc);
-			# Save each author
-			foreach my $i (0 .. $#{ $authors })
-			{
-				my $name = $authors->[ $i ];
-				# An empty list: assigning () to the hash element would store undef
-				$asg{ $name } 		= [];
-				$aas{ $name }		= $rcs->[ $i ];
-				$ntl_asg{ $name }	= 0;
-			}
-		}
-		elsif ($prev_class eq "signal")
-		{
-			my $signals = NormalizeAuthorSignal($signal_str);
-			# Save each signal to its corresponding author
-			foreach my $author (keys %ntl_asg)
-			{
-				foreach my $signal (@{ $signals })
-				{
-					push @{ $asg{ $author } }, $signal;
-					push @{ $aas{ $author }->signals }, $signal;
-				}
-			}
-		}
-		@author_str	= ();
-		@author_rc	= ();
-		$signal_str	= "";
-	};
-	# Read the CRF output
-	open my $input_handle, "<:utf8", $outfile
-		or die "AuthorExtraction: cannot read $outfile: $!";
-	# A lexical line variable keeps the caller's $_ untouched
-	while (my $line = <$input_handle>)
-	{
-		# Trim
-		$line =~ s/^\s+|\s+$//g;
-		# Blank line marks the end of an author section
-		if ($line eq "")
-		{
-			$flush->();
-			# No author waits for a signal across sections
-			%ntl_asg	= ();
-			$prev_class	= "";
-			$is_authors	= 0;
-			$counter++;
-			next;
-		}
-		# The class is the last column, the content the first one
-		my @fields	= split /\t/, $line;
-		my $class	= $fields[ -1 ];
-		my $content	= $fields[ 0 ];
-		# A new class closes the previous run
-		if ($class ne $prev_class)
-		{
-			$flush->();
-			# The first author after a signal (or at the start of a section) starts
-			# a new list of waiting authors
-			if (($is_authors == 0) && ($class eq "author")) { %ntl_asg = (); $is_authors = 1; }
-			if ($class eq "signal") { $is_authors = 0; }
-			$prev_class = $class;
-		}
-		if ($class eq "author")
-		{
-			push @author_str, $content;
-			push @author_rc, $rc_lines[ $counter ];
-		}
-		elsif ($class eq "signal")
-		{
-			$signal_str .= $content . " ";
-		}
-		# Update the counter
-		$counter++;
-	}
-	# Final class
-	$flush->();
-	# Done
-	close $input_handle;
-	# Clean up
-	unlink $infile;
-	unlink $outfile;
-	# Done
-	return (\%asg, \%aas);
-}
-# Split a run of author tokens into individual author names.
-#
-# Parameters:
-#	$author_str	- array ref, tokens labelled as author
-#	$author_rc	- array ref, relational feature line of each token
-# Returns:
-#	(names, records): two array refs, the names given by
-#	ParsCit::PostProcess::NormalizeAuthorName and, for each name, an
-#	aut_rcfeatures record filled with columns 1 to 8 of the feature line of
-#	the first token of the name
-# Dies when the two arrays differ in size.
-sub NormalizeAuthorNames
-{
-	my ($author_str, $author_rc) = @_;
-	# Constraint
-	if (scalar(@{ $author_str }) != scalar(@{ $author_rc }))
-	{
-		print STDERR "# It cannot happen, if you encounter it, please consider report it as a bug", "\n";
-		die;
-	}
-	my @names		= ();
-	my @records		= ();
-	# Tokens of the name being collected and the index of its first token
-	my @pending		= ();
-	my $first_index	= 0;
-	# Close the name being collected, if any
-	my $finish = sub
-	{
-		if (scalar(@pending) != 0)
-		{
-			push @names, ParsCit::PostProcess::NormalizeAuthorName(@pending);
-			# Relational features of an author are those of its first word
-			my @fields = split /\s/, $author_rc->[ $first_index ];
-			push @records, aut_rcfeatures->new(	signals => [],
-												top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
-												page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ]	);
-		}
-		@pending = ();
-	};
-	foreach my $i (0 .. $#{ $author_str })
-	{
-		my $token = $author_str->[ $i ];
-		if ($token =~ m/^(&|and|,|;)$/i)
-		{
-			# A separator ends the current name and is not part of any name
-			$finish->();
-		}
-		elsif (scalar(@pending) == 0)
-		{
-			# First token of a name, it never ends the name
-			$first_index = $i;
-			push @pending, $token;
-		}
-		else
-		{
-			push @pending, $token;
-			# A token ending with a comma is the last token of the name
-			if ($token =~ m/,$/) { $finish->(); }
-		}
-	}
-	# Last author name
-	$finish->();
-	# Done
-	return (\@names, \@records);
-}
-# Split an author signal string into individual signals.
-# The string is trimmed, then cut at every blank, comma, colon and semicolon;
-# two adjacent separators give an empty signal. Returns an array ref.
-sub NormalizeAuthorSignal
-{
-	# Trim
-	(my $trimmed = shift) =~ s/^\s+|\s+$//g;
-	# Split into individual signal
-	return [ split / |,|:|;/, $trimmed ];
-}
-# Extract features from affiliation lines
-# The list of features include
-# Content
-# Content, lower case, no punctuation
-# Content length
-# First word in line
-#
-# XML features
-# Subscript, superscript
-# Bold
-# Italic
-# Underline
-# Relative font size
-# Differentiate features
-sub AffiliationFeatureExtraction
-{
-	my ($aff_lines, $aff_addrs) = @_;
-	# NOTE: Relational classifier features
-	my $rc_features		= "";
-	# Features will be stored here
-	my $features 		= "";
-	# First word in line
-	my $is_first_line	= undef;
-	# Font size
-	my %fonts = ();
-	# Each line contains many runs
-	foreach my $line (@{ $aff_lines })
-	{
-		my $runs = $line->get_objs_ref();
-		# Iterator though all work in all lines
-		foreach my $run (@{ $runs })
-		{
-			my $fsize = $run->get_font_size();
-			my $words = $run->get_objs_ref();
-			# Statistic
-			if (! exists $fonts{ $fsize })
-			{
-				$fonts{ $fsize } = scalar(@{ $words });
-			}
-			else
-			{
-				$fonts{ $fsize } += scalar(@{ $words });
-			}
-		}
-	}
-	my $dominate_font = undef;
-	# Sort all the font descend with the number of their appearance
-	my @sorted = sort { $fonts{ $b } <=> $fonts{ $a } } keys %fonts;
-	# Select the dominated font
-	$dominate_font = $sorted[ 0 ];
-	my $size_mismatch = undef;
-	# TODO: serious error if the size of aff_lines and the size of aff_addrs mismatch
-	if (scalar(@{ $aff_lines }) != scalar(@{ $aff_addrs }))
-	{
-		$size_mismatch = 1;
-		# Print the error but still try to continue
-		print STDERR "# Total number of affiliation lines (" . scalar(@{ $aff_lines }) . ") != Total number of affiliation addresses (" . scalar(@{ $aff_addrs }) . ")." . "\n";
-	}
-	my $prev_page = undef;
-	my $prev_sect = undef;
-	my $prev_para = undef;
-	# Each line contains many runs
-	for (my $counter = 0; $counter < scalar(@{ $aff_lines }); $counter++)
-	{
-		# Get the line object
-		my $line = $aff_lines->[ $counter ];
-		# Check the size of aff_lines and aff_addrs
-		if (! defined $size_mismatch)
-		{
-			# Check if two consecutive lines are from two different sections
-			if (! defined $prev_page)
-			{
-				# Init
-				$prev_page = $aff_addrs->[ $counter ]->{ 'L1' };
-				$prev_sect = $aff_addrs->[ $counter ]->{ 'L2' };
-				$prev_para = $aff_addrs->[ $counter ]->{ 'L3' };
-			}
-			else
-			{
-				# Affiliations from different sections will be separated immediately
-				if (($prev_page != $aff_addrs->[ $counter ]->{ 'L1' }) ||
-					($prev_sect != $aff_addrs->[ $counter ]->{ 'L2' }) ||
-					($prev_para != $aff_addrs->[ $counter ]->{ 'L3' }))
-				{
-					$features .= "\n";
-					# NOTE: Relational classifier features
-					$rc_features .= "\n";
-				}
-				# Save the paragraph index
-				$prev_page = $aff_addrs->[ $counter ]->{ 'L1' };
-				$prev_sect = $aff_addrs->[ $counter ]->{ 'L2' };
-				$prev_para = $aff_addrs->[ $counter ]->{ 'L3' };
-			}
-		}
-		# Set first word in line
-		$is_first_line = 1;
-		# Two previous words
-		my $prev_word		= undef;
-		my $prev_prev_word	= undef;
-		# Format of the previous word
-		my ($prev_bold, $prev_italic, $prev_underline, $prev_suscript, $prev_fontsize) = "unknown";
-		my $runs = $line->get_objs_ref();
-		# Iterator though all work in all lines
-		foreach my $run (@{ $runs })
-		{
-			# The run must be non-empty
-			my $tmp = $run->get_content();
-			# Trim
-			$tmp	=~ s/^\s+|\s+$//g;
-			# Skip blank run
-			if ($tmp eq "") { next; }
-			###
-			# The following features are XML features
-			###
-			# Bold format
-			my $bold = ($run->get_bold() eq "true") ? "bold" : "none";
-			# Italic format
-			my $italic = ($run->get_italic() eq "true") ? "italic" : "none";
-			# Underline
-			my $underline = ($run->get_underline() eq "true") ? "underline" : "none";
-			# Sub-Sup-script
-			my $suscript =	($run->get_suscript() eq "superscript")	? "super"	:
-							($run->get_suscript() eq "subscript")	? "sub"		: "none";
-			# Relative font size
-			my $fontsize =	($run->get_font_size() > $dominate_font)	? "large"	:
-							($run->get_font_size() < $dominate_font)	? "small"	: "normal";
-			###
-			# End of XML features
-			###
-			# All words in the run
-			my $words = $run->get_objs_ref();
-			# For each word
-			foreach my $word (@{ $words })
-			{
-				# Get word location
-				my $top 	= $word->get_top_pos();
-				my $bottom 	= $word->get_bottom_pos();
-				my $left	= $word->get_left_pos();
-				my $right	= $word->get_right_pos();
-				# NOTE: heuristic rule, for words in the same line
-				# If the x-axis distance between this word and the previous word is
-				# three times larger than the distance between the previous word and
-				# the word before it, then it marks the separator.
-				# The better way to do this is to introduce it as a new feature in the
-				# author and affiliation model but this step requires re-training these
-				# two models, so ...
-				#
-				# NOTE: Assuming left to right writing
-				if (! defined $prev_word)
-				{
-					$prev_word = $word;
-				}
-				elsif (! defined $prev_prev_word)
-				{
-					# NOTE: Words have the power to both destroy and heal, when words are both
-					# true and kind, they can change our world
-					if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
-					{
-						$prev_prev_word = $prev_word;
-						$prev_word		= $word;
-					}
-				}
-				else
-				{
-					# NOTE: Words have the power to both destroy and heal, when words are both
-					# true and kind, they can change our world
-					if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
-					{
-						my $prev_dist = abs ($prev_word->get_left_pos() - $prev_prev_word->get_right_pos());
-						my $curr_dist = abs ($word->get_left_pos() - $prev_word->get_right_pos());
-						if ($prev_dist * 5 < $curr_dist)
-						{
-							$features .= "\n";
-							# NOTE: Relational classifier features
-							$rc_features .= "\n";
-						}
-						$prev_prev_word = $prev_word;
-						$prev_word		= $word;
-					}
-				}
-				# Extract features
-				my $full_content = $word->get_content();
-				# Trim
-				$full_content	 =~ s/^\s+|\s+$//g;
-				# Skip blank run
-				if ($full_content eq "") { next; }
-				my @sub_content = ();
-				# This is the tricky part, one word e.g. **affiliation will be
-				# splitted into two parts: the signal, and the affiliation if
-				# possible using regular expression
-				while ($full_content =~ m/([\w|-]*)(\W*)/g)
-				{
-					my $first	= $1;
-					my $second	= $2;
-					# Trim
-					$first	=~ s/^\s+|\s+$//g;
-					$second	=~ s/^\s+|\s+$//g;
-					# Only keep non-blank content
-					if ($first ne "") { push @sub_content, $first; }
-					# Check the signal and separator
-					while ($second =~ m/([,|\.|:|;]*)([^,\.:;]*)/g)
-					{
-						my $sub_first	= $1;
-						my $sub_second	= $2;
-						# Trim
-						$sub_first	=~ s/^\s+|\s+$//g;
-						$sub_second	=~ s/^\s+|\s+$//g;
-						# Only keep non-blank separator
-						if ($sub_first ne "") { push @sub_content, $sub_first; }
-						# Only keep non-blank signal
-						if ($sub_second ne "") { push @sub_content, $sub_second; }
-					}
-				}
-				foreach my $content (@sub_content)
-				{
-					# Content
-					$features .= $content . "\t";
-					my $content_n	= $content;
-					# Remove punctuation
-					$content_n		=~ s/[^\w]//g;
-					# Lower case
-					my $content_l	= lc($content);
-					# Lower case, no punctuation
-					my $content_nl	= lc($content_n);
-					# Lower case
-					$features .= $content_l . "\t";
-					# Lower case, no punctuation
-					if ($content_nl ne "")
-					{
-						$features .= $content_nl . "\t";
-					}
-					else
-					{
-						$features .= $content_l . "\t";
-					}
-					# Split into character
-		      		my @chars = split(//, $content);
-					# Content length
-					my $length =	(scalar(@chars) == 1)	? "1-char"	:
-									(scalar(@chars) == 2)	? "2-char"	:
-									(scalar(@chars) == 3)	? "3-char"	: "4+char";
-					$features .= $length . "\t";
-					# First word in line
-					if ($is_first_line == 1)
-					{
-						$features .= "begin" . "\t";
-						# Next words are not the first in line anymore
-						$is_first_line = 0;
-					}
-					else
-					{
-						$features .= "continue" . "\t";
-					}
-					###
-					# The following features are XML features
-					###
-					# Bold format
-					$features .= $bold . "\t";
-					# Italic format
-					$features .= $italic . "\t";
-					# Underline
-					$features .= $underline . "\t";
-					# Sub-Sup-script
-					$features .= $suscript . "\t";
-					# Relative font size
-					$features .= $fontsize . "\t";
-					# First word in run
-					if (($prev_bold ne $bold) || ($prev_italic ne $italic) || ($prev_underline ne $underline) || ($prev_suscript ne $suscript) || ($prev_fontsize ne $fontsize))
-					{
-						$features .= "fbegin" . "\t";
-					}
-					else
-					{
-						$features .= "fcontinue" . "\t";
-					}
-					# New token
-					$features .= "\n";
-					# Save the XML format
-					$prev_bold		= $bold;
-					$prev_italic	= $italic;
-					$prev_underline	= $underline;
-					$prev_suscript	= $suscript;
-					$prev_fontsize	= $fontsize;
-					# NOTE: Relational classifier features
-					# Content
-					$rc_features .= $content . "\t";
-					# Location
-					$rc_features .= $top 	. "\t";
-					$rc_features .= $bottom . "\t";
-					$rc_features .= $left 	. "\t";
-					$rc_features .= $right	. "\t";
-					# Index
-					if (! defined $size_mismatch)
-					{
-						$rc_features .= $aff_addrs->[ $counter ]->{ 'L1' } . "\t";
-						$rc_features .= $aff_addrs->[ $counter ]->{ 'L2' } . "\t";
-						$rc_features .= $aff_addrs->[ $counter ]->{ 'L3' } . "\t";
-						$rc_features .= $aff_addrs->[ $counter ]->{ 'L4' } . "\t";
-					}
-					# Done
-					$rc_features .= "\n";
-				}
-			}
-		}
-	}
-	return ($features, $rc_features);
-}
-# AuthorFeatureExtraction($aut_lines, $aut_addrs): extract features from author lines
-# $aut_lines is an array ref of line objects; each line yields runs, each run yields words
-# $aut_addrs is an array ref of hash refs with keys L1..L4, one per line in $aut_lines
-# Returns ($features, $rc_features): tab-separated text, one row per token,
-# where an empty row separates groups of tokens
-# The $features columns are, in order:
-# content, lower case, lower case without punctuation, capitalization,
-# numeric property, punctuation class, content length,
-# first 1- to 4-grams, last 1- to 4-grams,
-# dictionary flags (male, female, last, month, place, publisher),
-# first word in line (begin/continue),
-# the XML features listed below, and format change (fbegin/fcontinue)
-#
-# XML features
-# Bold
-# Italic
-# Underline
-# Subscript, superscript
-# Relative font size
-# The $rc_features columns are: content, top, bottom, left, right, then L1..L4 when available
-sub AuthorFeatureExtraction
-{
-	my ($aut_lines, $aut_addrs) = @_;
-	# Relational classifier rows: token text, word coordinates and line address
-	my $rc_features		= "";
-	# Token feature rows are accumulated here
-	my $features 		= "";
-	# Set to 1 at the start of each line and cleared once its first token is emitted
-	my $is_first_line	= undef;
-	# First word in run
-	# my $is_first_run	= undef;
-	# Number of words seen per font size
-	my %fonts = ();
-	# First pass: count the words set in each font size
-	foreach my $line (@{ $aut_lines })
-	{
-		my $runs = $line->get_objs_ref();
-		# Iterate through all runs of the line
-		foreach my $run (@{ $runs })
-		{
-			my $fsize = $run->get_font_size();
-			my $words = $run->get_objs_ref();
-			# Add the number of words in this run to the count of its font size
-			if (! exists $fonts{ $fsize })
-			{
-				$fonts{ $fsize } = scalar(@{ $words });
-			}
-			else
-			{
-				$fonts{ $fsize } += scalar(@{ $words });
-			}
-		}
-	}
-	my $dominate_font = undef;
-	# Sort the font sizes by descending word count (NOTE(review): the order among equal counts follows hash key order)
-	my @sorted = sort { $fonts{ $b } <=> $fonts{ $a } } keys %fonts;
-	# The font size with the most words is the dominant one
-	$dominate_font = $sorted[ 0 ];
-	my $size_mismatch = undef;
-	# TODO: it is a serious error if the size of aut_lines and the size of aut_addrs mismatch
-	if (scalar(@{ $aut_lines }) != scalar(@{ $aut_addrs }))
-	{
-		$size_mismatch = 1;
-		# Print the error but still try to continue, without using the addresses
-		print STDERR "# Total number of author lines (" . scalar(@{ $aut_lines }) . ") != Total number of author addresses (" . scalar(@{ $aut_addrs }) . ")." . "\n";
-	}
-	my $prev_page = undef;
-	my $prev_sect = undef;
-	my $prev_para = undef;
-	# Second pass: emit one row per token, line by line
-	for (my $counter = 0; $counter < scalar(@{ $aut_lines }); $counter++)
-	{
-		# Get the line object
-		my $line = $aut_lines->[ $counter ];
-		# The addresses are only used when both arrays have the same size
-		if (! defined $size_mismatch)
-		{
-			# Check if two consecutive lines differ in L1, L2 or L3 (kept as page, section, paragraph)
-			if (! defined $prev_page)
-			{
-				# First line: nothing to compare with yet
-				$prev_page = $aut_addrs->[ $counter ]->{ 'L1' };
-				$prev_sect = $aut_addrs->[ $counter ]->{ 'L2' };
-				$prev_para = $aut_addrs->[ $counter ]->{ 'L3' };
-			}
-			else
-			{
-				# Authors from different sections are separated immediately by an empty row
-				if (($prev_page != $aut_addrs->[ $counter ]->{ 'L1' }) ||
-					($prev_sect != $aut_addrs->[ $counter ]->{ 'L2' }) ||
-					($prev_para != $aut_addrs->[ $counter ]->{ 'L3' }))
-				{
-					$features .= "\n";
-					# Keep the relational classifier rows aligned with the feature rows
-					$rc_features .= "\n";
-				}
-				# Remember the address of this line for the next comparison
-				$prev_page = $aut_addrs->[ $counter ]->{ 'L1' };
-				$prev_sect = $aut_addrs->[ $counter ]->{ 'L2' };
-				$prev_para = $aut_addrs->[ $counter ]->{ 'L3' };
-			}
-		}
-		# The next token emitted is the first one of this line
-		$is_first_line = 1;
-		# Previous word and the word before it, used by the gap heuristic below
-		my $prev_prev_word	= undef;
-		my $prev_word		= undef;
-		# Format of the previous token. NOTE(review): only $prev_bold is set to "unknown" here; the other four start as undef
-		my ($prev_bold, $prev_italic, $prev_underline, $prev_suscript, $prev_fontsize) = "unknown";
-		my $runs = $line->get_objs_ref();
-		# Iterate through all runs of the line
-		foreach my $run (@{ $runs })
-		{
-			# The run must be non-empty
-			my $tmp = $run->get_content();
-			# Trim
-			$tmp	=~ s/^\s+|\s+$//g;
-			# Skip blank run
-			if ($tmp eq "") { next; }
-			# Set first word in run
-			# $is_first_run = 1;
-			###
-			# The following features are XML features, shared by every token of the run
-			###
-			# Bold format
-			my $bold = ($run->get_bold() eq "true") ? "bold" : "none";
-			# Italic format
-			my $italic = ($run->get_italic() eq "true") ? "italic" : "none";
-			# Underline
-			my $underline = ($run->get_underline() eq "true") ? "underline" : "none";
-			# Sub-Sup-script
-			my $suscript =	($run->get_suscript() eq "superscript")	? "super"	:
-							($run->get_suscript() eq "subscript")	? "sub"		: "none";
-			# Font size relative to the dominant font size
-			my $fontsize =	($run->get_font_size() > $dominate_font)	? "large"	:
-							($run->get_font_size() < $dominate_font)	? "small"	: "normal";
-			###
-			# End of XML features
-			###
-			# All words in the run
-			my $words = $run->get_objs_ref();
-			# For each word
-			foreach my $word (@{ $words })
-			{
-				# Get word location
-				my $top 	= $word->get_top_pos();
-				my $bottom 	= $word->get_bottom_pos();
-				my $left	= $word->get_left_pos();
-				my $right	= $word->get_right_pos();
-				# NOTE: heuristic rule, for words in the same line
-				# If the x-axis distance between this word and the previous word is
-				# more than five times the distance between the previous word and
-				# the word before it, then it marks the separator (an empty row).
-				# The better way to do this is to introduce it as a new feature in the
-				# author and affiliation model but this step requires re-training these
-				# two models, so ...
-				#
-				# NOTE: Assuming left to right writing
-				if (! defined $prev_word)
-				{
-					$prev_word = $word;
-				}
-				elsif (! defined $prev_prev_word)
-				{
-					# Only advance the word history when both the left and the right position differ
-					# from those of the previous word; the word itself is still processed below
-					if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
-					{
-						$prev_prev_word = $prev_word;
-						$prev_word		= $word;
-					}
-				}
-				else
-				{
-					# Only compare gaps and advance the word history when both the left and the right
-					# position differ from those of the previous word; the word is still processed below
-					if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
-					{
-						my $prev_dist = abs ($prev_word->get_left_pos() - $prev_prev_word->get_right_pos());
-						my $curr_dist = abs ($word->get_left_pos() - $prev_word->get_right_pos());
-						if ($prev_dist * 5 < $curr_dist)
-						{
-							$features .= "\n";
-							# Keep the relational classifier rows aligned with the feature rows
-							$rc_features .= "\n";
-						}
-						$prev_prev_word = $prev_word;
-						$prev_word		= $word;
-					}
-				}
-				# Extract features
-				my $full_content = $word->get_content();
-				# Trim
-				$full_content	 =~ s/^\s+|\s+$//g;
-				# Skip blank word
-				if ($full_content eq "") { next; }
-				my @sub_content = ();
-				# This is the tricky part: one word, e.g. name**, will be split
-				# into several parts: the name, the signal, and the separator if
-				# possible, using regular expressions
-				while ($full_content =~ m/([\w|-]*)(\W*)/g)
-				{
-					my $first	= $1;
-					my $second	= $2;
-					# Trim
-					$first	=~ s/^\s+|\s+$//g;
-					$second	=~ s/^\s+|\s+$//g;
-					# Only keep non-blank content
-					if ($first ne "") { push @sub_content, $first; }
-					# Split the non-word part into separators (, . : ; |) and signals (anything else)
-					while ($second =~ m/([,|\.|:|;]*)([^,\.:;]*)/g)
-					{
-						my $sub_first	= $1;
-						my $sub_second	= $2;
-						# Trim
-						$sub_first	=~ s/^\s+|\s+$//g;
-						$sub_second	=~ s/^\s+|\s+$//g;
-						# Only keep non-blank separator
-						if ($sub_first ne "") { push @sub_content, $sub_first; }
-						# Only keep non-blank signal
-						if ($sub_second ne "") { push @sub_content, $sub_second; }
-					}
-				}
-				foreach my $content (@sub_content)
-				{
-					# Content
-					$features .= $content . "\t";
-					my $content_n	= $content;
-					# Remove punctuation
-					$content_n		=~ s/[^\w]//g;
-					# Lower case
-					my $content_l	= lc($content);
-					# Lower case, no punctuation
-					my $content_nl	= lc($content_n);
-					# Lower case
-					$features .= $content_l . "\t";
-					# Lower case, no punctuation; falls back to the lower case form when nothing is left
-					if ($content_nl ne "")
-					{
-						$features .= $content_nl . "\t";
-					}
-					else
-					{
-						$features .= $content_l . "\t";
-					}
-					# Capitalization (the first matching rule wins)
-					my $ortho = ($content =~ /^[\p{IsUpper}]$/)					? "single"	:
-								($content =~ /^[\p{IsUpper}][\p{IsLower}]+/)	? "init" 	:
-								($content =~ /^[\p{IsUpper}]+$/) 				? "all" 	: "others";
-					$features .= $ortho . "\t";
-					# Numeric property (the first matching rule wins)
-					my $num =	($content =~ /^[0-9]$/)					? "1dig" 	:
-								($content =~ /^[0-9][0-9]$/) 			? "2dig" 	:
-								($content =~ /^[0-9][0-9][0-9]$/) 		? "3dig" 	:
-								($content =~ /^[0-9]+$/) 				? "4+dig" 	:
-								($content =~ /^[0-9]+(th|st|nd|rd)$/)	? "ordinal"	:
-								($content =~ /[0-9]/) 					? "hasdig" 	: "nonnum";
-					$features .= $num . "\t";
-					# Punctuation class (the first matching rule wins)
-					my $punct = ($content =~ /^[\"\'\`]/) 						? "leadq" 	:
-								($content =~ /[\"\'\`][^s]?$/) 					? "endq" 	:
-	  							($content =~ /\-.*\-/) 							? "multi"	:
-	    						($content =~ /[\-\,\:\;]$/) 					? "cont" 	:
-	      						($content =~ /[\!\?\.\"\']$/) 					? "stop" 	:
-	        					($content =~ /^[\(\[\{\<].+[\)\]\}\>].?$/)		? "braces" 	: "others";
-					$features .= $punct . "\t";
-					# Split into characters
-		      		my @chars = split(//, $content);
-					my $clen  = scalar @chars;
-					# Content length, capped at "4+char"
-					my $length =	(scalar(@chars) == 1)	? "1-char"	:
-									(scalar(@chars) == 2)	? "2-char"	:
-									(scalar(@chars) == 3)	? "3-char"	: "4+char";
-					$features .= $length . "\t";
-					# First 1- to 4-grams; a shorter token repeats its longest prefix, a 1-char token the length label
-					$features .= $chars[ 0 ] . "\t";
-					if ($clen >= 2) {
-						$features .= join("", @chars[ 0..1 ]) . "\t";
-					} else {
-						$features .= $length . "\t";
-					}
-					if ($clen >= 3) {
-						$features .= join("", @chars[ 0..2 ]) . "\t";
-					} elsif ($clen >= 2) {
-						$features .= join("", @chars[ 0..1 ]) . "\t";
-					} else {
-						$features .= $length . "\t";
-					}
-					if ($clen >= 4) {
-						$features .= join("", @chars[ 0..3 ]) . "\t";
-					} elsif ($clen >= 3) {
-						$features .= join("", @chars[ 0..2 ]) . "\t";
-					} elsif ($clen >= 2) {
-						$features .= join("", @chars[ 0..1 ]) . "\t";
-					} else {
-						$features .= $length . "\t";
-					}
-	      			# Last 1- to 4-grams; a shorter token repeats its longest suffix, a 1-char token its only character
-					$features .= $chars[ -1 ] . "\t";
-					if ($clen >= 2) {
-						$features .= join("", @chars[ -2..-1 ]) . "\t";
-					} else {
-						$features .= $chars[ -1 ] . "\t";
-					}
-					if ($clen >= 3) {
-						$features .= join("", @chars[ -3..-1 ]) . "\t";
-					} elsif ($clen >= 2) {
-						$features .= join("", @chars[ -2..-1 ]) . "\t";
-					} else {
-						$features .= $chars[ -1 ] . "\t";
-					}
-					if ($clen >= 4) {
-						$features .= join("", @chars[ -4..-1 ]) . "\t";
-					} elsif ($clen >= 3) {
-						$features .= join("", @chars[ -3..-1 ]) . "\t";
-					} elsif ($clen >= 2) {
-						$features .= join("", @chars[ -2..-1 ]) . "\t";
-					} else {
-						$features .= $chars[ -1 ] . "\t";
-					}
-					# Dictionary flags of the lower case, no punctuation form (0 when it has no entry)
-					my $dict_status = (defined $dict{ $content_nl }) ? $dict{ $content_nl } : 0;
-					# Dictionary feature values. NOTE(review): the "= undef" is redundant, all six are assigned just below
-					my ($publisher_name, $place_name, $month_name, $last_name, $female_name, $male_name) = undef;
-   					# Decode the flag sum from the largest flag (32) down to the smallest (1)
-					if ($dict_status >= 32) { $dict_status -= 32; 	$publisher_name	= "publisher"	} else { $publisher_name	= "no"; }
-	    			if ($dict_status >= 16)	{ $dict_status -= 16; 	$place_name 	= "place" 		} else { $place_name 		= "no"; }
-		    		if ($dict_status >= 8)	{ $dict_status -= 8; 	$month_name 	= "month" 		} else { $month_name 		= "no"; }
-    				if ($dict_status >= 4)	{ $dict_status -= 4; 	$last_name 		= "last" 		} else { $last_name 		= "no"; }
-	    			if ($dict_status >= 2) 	{ $dict_status -= 2; 	$female_name 	= "female" 		} else { $female_name 		= "no"; }
-    				if ($dict_status >= 1) 	{ $dict_status -= 1; 	$male_name 		= "male" 		} else { $male_name 		= "no"; }
-		    		# Save the features, smallest flag first
-					$features .= $male_name 	 . "\t";
-					$features .= $female_name 	 . "\t";
-					$features .= $last_name 	 . "\t";
-					$features .= $month_name 	 . "\t";
-					$features .= $place_name 	 . "\t";
-					$features .= $publisher_name . "\t";
-					# First token in line
-					if ($is_first_line == 1)
-					{
-						$features .= "begin" . "\t";
-						# Next tokens are not the first in line anymore
-						$is_first_line = 0;
-					}
-					else
-					{
-						$features .= "continue" . "\t";
-					}
-					###
-					# The following features are XML features
-					###
-					# Bold format
-					$features .= $bold . "\t";
-					# Italic format
-					$features .= $italic . "\t";
-					# Underline
-					$features .= $underline . "\t";
-					# Sub-Sup-script
-					$features .= $suscript . "\t";
-					# Relative font size
-					$features .= $fontsize . "\t";
-					# "fbegin" when any format attribute differs from that of the previous token
-					if (($prev_bold ne $bold) || ($prev_italic ne $italic) || ($prev_underline ne $underline) || ($prev_suscript ne $suscript) || ($prev_fontsize ne $fontsize))
-					{
-						$features .= "fbegin" . "\t";
-						# Next words are not the first in line anymore
-						# $is_first_run = 0;
-					}
-					else
-					{
-						$features .= "fcontinue" . "\t";
-					}
-					# End of the feature row of this token
-					$features .= "\n";
-					# Save the XML format for the comparison with the next token
-					$prev_bold		= $bold;
-					$prev_italic	= $italic;
-					$prev_underline	= $underline;
-					$prev_suscript	= $suscript;
-					$prev_fontsize	= $fontsize;
-					# Relational classifier row of the same token
-					# Content
-					$rc_features .= $content . "\t";
-					# Location of the word the token comes from
-					$rc_features .= $top 	. "\t";
-					$rc_features .= $bottom . "\t";
-					$rc_features .= $left 	. "\t";
-					$rc_features .= $right	. "\t";
-					# Address of the line, left out when the sizes mismatch
-					if (! defined $size_mismatch)
-					{
-						$rc_features .= $aut_addrs->[ $counter ]->{ 'L1' } . "\t";
-						$rc_features .= $aut_addrs->[ $counter ]->{ 'L2' } . "\t";
-						$rc_features .= $aut_addrs->[ $counter ]->{ 'L3' } . "\t";
-						$rc_features .= $aut_addrs->[ $counter ]->{ 'L4' } . "\t";
-					}
-					# End of the relational classifier row
-					$rc_features .= "\n";
-				}
-			}
-		}
-	}
-	return ($features, $rc_features);
-}
-# Load a name dictionary file into the file-level %dict hash.
-#
-# $dictfile is the path of the dictionary; it is resolved to an absolute path
-# and read as UTF-8. The sub dies when the file cannot be opened.
-#
-# A line starting with "## Male", "## Female", "## Last", "## Chinese",
-# "## Months", "## Place" or "## Publisher" selects the flag (1, 2, 4, 4, 8,
-# 16, 32) given to the entries that follow it. Any other line starting with
-# "#" is skipped. Every remaining line is an entry: the text before the first
-# tab is the key, the rest of the line is ignored.
-#
-# An unseen key is stored with the current flag. A key that is already stored
-# gets the current flag added only when its stored value is smaller than the
-# flag, so repeated entries of one section are not counted twice.
-sub ReadDict
-{
-	my ($dictfile) = @_;
-	# File::Spec is loaded here so the sub does not depend on another module having loaded it
-	require File::Spec;
-	# Absolute path
-	my $dictfile_abs = File::Spec->rel2abs($dictfile);
-	# Dictionary handle
-	open (my $dict_handle, "<:utf8", $dictfile_abs) or die "Could not open dict file $dictfile_abs: $!";
-	# Flag of the section being read, 0 before the first section header
-	my $mode = 0;
-	# A lexical line variable is used so that the caller's $_ is left untouched
-	while (my $line = <$dict_handle>)
-	{
-		if    ($line =~ /^\#\# Male/)		{ $mode = 1; }		# male names
-		elsif ($line =~ /^\#\# Female/)		{ $mode = 2; }		# female names
-		elsif ($line =~ /^\#\# Last/)		{ $mode = 4; }		# last names
-		elsif ($line =~ /^\#\# Chinese/)	{ $mode = 4; }		# last names
-		elsif ($line =~ /^\#\# Months/)		{ $mode = 8; }		# month names
-		elsif ($line =~ /^\#\# Place/)		{ $mode = 16; }		# place names
-		elsif ($line =~ /^\#\# Publisher/)	{ $mode = 32; }		# publisher names
-		elsif ($line =~ /^\#/)				{ next; }			# other comment lines
-		else
-		{
-			# Remove only the line terminator (LF or CRLF). The former chop removed the
-			# last character whatever it was, which truncated a final entry without a
-			# trailing newline and left "\r" in every key of a CRLF file
-			$line =~ s/\r?\n\z//;
-			# The key is the text before the first tab; what follows it is not used
-			(my $key = $line) =~ s/\t.*//s;
-			if (! exists $dict{ $key })
-			{
-				# First time this key is seen
-				$dict{ $key } = $mode;
-			}
-			elsif ($dict{ $key } < $mode)
-			{
-				# Not yet tagged with the flag of this section
-				$dict{ $key } += $mode;
-			}
-			# Otherwise already tagged (some entries appear in the same part of the lexicon more than once)
-		}
-	}
-	close ($dict_handle);
-}
-# Derive a temporary file path under /tmp from a file name.
-# The name is the given file name without dots and slashes, followed by the
-# process id and the current epoch time. Nothing is created on disk here.
-sub BuildTmpFile
-{
-	my ($filename) = @_;
-	# Drop every dot and slash so that no directory part is left in the name
-	(my $stem = $filename) =~ s/[\.\/]//g;
-	# The process id and the time distinguish names built from the same file name
-	$stem .= $$ . time;
-	# Untaint the name when it only holds word characters, "-", "@" and "."
-	$stem = $1 if ($stem =~ /^([-\@\w.]+)$/);
-	# The directory was fixed to /tmp by Min (Thu Feb 28 13:08:59 SGT 2008)
-	return "/tmp/$stem";
-}
-1;