RubyGems - biblicit - Versions diffs - 1.0 - Mend

biblicit 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (322) hide show

data/perl/ParsCit/lib/ParsCit/PostProcess.pm ADDED Viewed

@@ -0,0 +1,367 @@
+package ParsCit::PostProcess;
+#
+# Utilities for normalizing the output of CRF++ into standard
+# representations.
+#
+# Isaac Councill, 07/20/07
+#
+use strict;
+use utf8;
+##
+# Main normalization subroutine.  Reads in a CRF++ output file
+# and normalizes each field of individual citations.  An intermediate
+# XML representation is used to keep track of the tags discovered by
+# the model.  Returns a reference to the raw XML (may not be encoded
+# safely) and a reference to a list of hashes containing the normalized
+# citation subfields, keyed by tag name.
+##
+sub readAndNormalize {
+    my ($inFile) = @_;
+    my $status = 1;
+    my $msg = "";
+    # A lexical handle cannot collide with a global IN handle used
+    # elsewhere and is released on every exit path.
+    open(my $in, "<:utf8", $inFile)
+        or return (undef, undef, 0, "couldn't open infile: $!");
+    my $currentTag;
+    my @currentTokens = ();
+    my $inCitation = 0;
+    my $xml = "";
+    # Read into a lexical so the caller's $_ is left untouched.
+    while (my $line = <$in>) {
+        if ($line =~ m/^\s*$/) {
+            # A blank line separates citations.  It never carries a
+            # token, so it is always skipped; leading or repeated blank
+            # lines must not open a citation or add an empty token.
+            if ($inCitation) {
+                finishCitation(\$xml, \$currentTag, \@currentTokens);
+                @currentTokens = ();
+                $inCitation = 0;
+            }
+            next;
+        }
+        if (!$inCitation) {
+            $xml .= "<citation>\n";
+            $inCitation = 1;
+        }
+        # split ' ' skips leading whitespace, so the first field is
+        # always the token itself; the last field is the assigned tag.
+        my @fields = split ' ', $line;
+        my $token = $fields[0];
+        my $tag = $fields[-1];
+        if (!defined $currentTag) {
+            $currentTag = $tag;
+        }
+        if ($tag eq $currentTag) {
+            push @currentTokens, $token;
+        } else {
+            # Tag changed: emit the finished segment, start a new one.
+            $xml .= makeSegment($currentTag, @currentTokens);
+            $currentTag = $tag;
+            @currentTokens = ($token);
+        }
+    }
+    close $in;
+    # Input may end without a trailing blank line.
+    if ($inCitation) {
+        finishCitation(\$xml, \$currentTag, \@currentTokens);
+        @currentTokens = ();
+    }
+    my $rCiteInfo = normalizeFields(\$xml);
+    return \$xml, $rCiteInfo, $status, $msg;
+}  # readAndNormalize
+##
+# Utility for adding a closing tag to a citation in the
+# intermediate XML, and setting the currentTag value to undef.
+##
+sub finishCitation {
+    my ($xmlRef, $tagRef, $tokensRef) = @_;
+    # Flush the segment still being collected, if any.
+    $$xmlRef .= makeSegment($$tagRef, @$tokensRef) if defined $$tagRef;
+    $$xmlRef .= "</citation>\n";
+    # The next citation starts without a current tag.
+    $$tagRef = undef;
+}  # finishCitation
+##
+# Makes an XML segment based on the specified tag and token list.
+##
+sub makeSegment {
+    # First argument is the tag name; the rest are the tokens, which
+    # are joined with single spaces.  No XML escaping is applied.
+    my $tag = shift;
+    return "<$tag>" . join(" ", @_) . "</$tag>\n";
+}
+##
+# Switching utility for reading through the intermediate XML
+# and passing control to an appropriate normalization routine
+# for each field encountered.  Returns a reference to a list
+# of hashes containing normalized fields, keyed by tag name.
+##
+sub normalizeFields {
+    my ($rXML) = @_;
+    my @citeInfos = ();
+    # Match on the referenced string directly; assigning it to the
+    # global $_ would overwrite whatever the caller kept there.
+    my @citeBlocks = ($$rXML =~ m/<citation>(.*?)<\/citation>/gs);
+    foreach my $block (@citeBlocks) {
+        my %citeInfo;
+        # Each <tag>content</tag> pair inside the citation block.
+        while ($block =~ m/<(.*?)>(.*?)<\/\1>/gs) {
+            my ($tag, $content) = ($1, $2);
+            if ($tag eq "author") {
+                # Stored under "authors"; the value is a list reference.
+                $tag = "authors";
+                $content = normalizeAuthorNames($content);
+            } elsif ($tag eq "date") {
+                $content = normalizeDate($content);
+            } elsif ($tag eq "volume" || $tag eq "number") {
+                $content = normalizeNumber($content);
+            } elsif ($tag eq "pages") {
+                $content = normalizePages($content);
+            } else {
+                $content = stripPunctuation($content);
+            }
+            # Heuristic - only get first instance of tag.
+            # TODO: we can do better than that...
+            # Undefined content (normalization failed) is skipped so a
+            # later instance of the same tag can still fill the slot.
+            if (defined $content && !defined $citeInfo{$tag}) {
+                $citeInfo{$tag} = $content;
+            }
+        }
+        push @citeInfos, \%citeInfo;
+    }
+    return \@citeInfos;
+}  # normalizeFields
+##
+# Removes, from both ends of the text, every character that is not
+# an upper-case letter, a lower-case letter or an ASCII digit.
+##
+sub stripPunctuation {
+    my ($text) = @_;
+    for ($text) {
+        s/^[^\p{IsLower}\p{IsUpper}0-9]+//;
+        s/[^\p{IsLower}\p{IsUpper}0-9]+$//;
+    }
+    return $text;
+}
+##
+# Tries to split the author tokens into individual author names
+# and then normalizes these names individually.  Returns a
+# reference to a list of author names.
+##
+sub normalizeAuthorNames {
+    my ($authorText) = @_;
+    my @names = ();
+    my @pending = ();       # tokens of the author being collected
+    my $atNameStart = 1;    # true until the author's first token is seen
+    # Turns the pending tokens into one normalized name (when there are
+    # any) and gets ready for the next author.
+    my $flush = sub {
+        if (@pending) {
+            my $name = normalizeAuthorName(@pending);
+            push @names, $name;
+        }
+        @pending = ();
+        $atNameStart = 1;
+    };
+    for my $tok (repairAndTokenizeAuthorText($authorText)) {
+        if ($tok =~ m/^(&|and)$/i) {
+            # A conjunction always closes the current author.
+            $flush->();
+        } elsif ($atNameStart > 0) {
+            # The first token never closes a name, even with a comma.
+            push @pending, $tok;
+            $atNameStart = 0;
+        } else {
+            push @pending, $tok;
+            # A trailing comma after at least two tokens ends the name.
+            $flush->() if $tok =~ m/,$/ && $#pending > 0;
+        }
+    }
+    if (@pending) {
+        my $name = normalizeAuthorName(@pending);
+        push @names, $name;
+    }
+    return \@names;
+}  # normalizeAuthorNames
+##
+# Strips unexpected punctuation and removes tokens that
+# are obviously not name words from the token list.
+##
+sub repairAndTokenizeAuthorText {
+    # Takes the raw author field text and returns a list (not a
+    # reference) of cleaned-up tokens.  The substitutions below are
+    # applied in order and each one works on the result of the previous.
+    my ($authorText) = @_;
+    # Repair obvious parse errors and weird notations.
+    # Drop "et al" (with optional periods) and everything after it.
+    $authorText =~ s/et\.? al\.?.*$//;
+    # Drop everything up to and including the first run of two or more
+    # letters that is immediately followed by a period and a space.
+    $authorText =~ s/^.*?[\p{IsUpper}\p{IsLower}][\p{IsUpper}\p{IsLower}]+\. //;
+    # Parentheses: remove balanced spans, then text up to a stray
+    # closing paren, then text from an unclosed opening paren onward.
+    $authorText =~ s/\(.*?\)//g;
+    $authorText =~ s/^.*?\)\.?//g;
+    $authorText =~ s/\(.*?$//g;
+    # Same three steps for square brackets.
+    $authorText =~ s/\[.*?\]//g;
+    $authorText =~ s/^.*?\]\.?//g;
+    $authorText =~ s/\[.*?$//g;
+    # Semicolons become commas, every comma gets a following space,
+    # colons become spaces, and remaining odd punctuation is deleted.
+    $authorText =~ s/;/,/g;
+    $authorText =~ s/,/, /g;
+    $authorText =~ s/\:/ /g;
+    $authorText =~ s/[\:\"\<\>\/\?\{\}\[\]\+\=\(\)\*\^\%\$\#\@\!\~\_]//g;
+    # Glue name particles to the following word with an underscore.
+    $authorText = joinMultiWordNames($authorText);
+    my @origTokens = split '\s+', $authorText;
+    my @tokens = ();
+    for (my $i=0; $i<=$#origTokens; $i++) {
+	my $tok = $origTokens[$i];
+	# A token with no letter and no '&' is treated as junk: in the
+	# first half of the input it discards everything kept so far,
+	# otherwise it ends tokenization.
+	if ($tok !~ m/[\p{IsUpper}\p{IsLower}&]/) {
+	    if ($i < $#origTokens/2) {
+		# Probably got junk up to now.
+		@tokens = ();
+		next;
+	    } else {
+		last;
+	    }
+	}
+	# Suffixes and titles (jr, sr, phd, md, esq) are dropped when the
+	# previously kept token ends with a comma; otherwise they are kept.
+	if ($tok =~ m/^(jr|sr|ph\.?d|m\.?d|esq)\.?\,?$/i) {
+	    if ($tokens[$#tokens] =~ m/\,$/) {
+		next;
+	    }
+	}
+	# Tokens made of two or more of the letters I, V, X (roman
+	# numeral suffixes such as "III") are always dropped.
+	if ($tok =~ m/^[IVX][IVX]+\.?\,?$/) {
+	    next;
+	}
+	push @tokens, $tok;
+    }
+    return @tokens;
+}  #repairAndTokenizeAuthorText
+##
+# Tries to normalize an individual author name into the form
+# "First Middle Last", without punctuation.
+##
+sub normalizeAuthorName {
+    my @authTokens = @_;
+    return "" unless @authTokens;
+    my $name = join " ", @authTokens;
+    # "Last, First" becomes "First Last" (split at the last comma that
+    # still has text after it).
+    $name = "$2 $1" if $name =~ m/(.+),\s*(.+)/;
+    # Keep hyphenated initials together, then turn the remaining commas
+    # and periods into spaces and collapse the runs of spaces.
+    for ($name) {
+        s/\.\-/-/g;
+        s/[\,\.]/ /g;
+        s/  +/ /g;
+    }
+    $name = trim($name);
+    # A multi-character word followed only by single characters or
+    # hyphenated initials is taken to be "Last F M": move it to the end.
+    if ($name =~ m/^[^\s][^\s]+(\s+[^\s]|\s+[^\s]\-[^\s])+$/) {
+        my ($leading, @others) = split '\s+', $name;
+        $name = join " ", @others, $leading;
+    }
+    return $name;
+}  # normalizeAuthorName
+##
+# Utility for creating an intermediate representation of multi-word
+# name components, e.g., transforms "van der Wald" to "van_der_Wald".
+# this helps keep things straight during normalization.  The
+# underscores can be stripped out later.
+##
+sub joinMultiWordNames {
+    my $authorText = shift;
+    # A name particle (matched case-insensitively as a whole leading
+    # word) followed by one whitespace character keeps its spelling and
+    # has that whitespace replaced by an underscore.  The replacement
+    # uses ${1}: "\1" is only a regex-side backreference, and the braces
+    # keep the underscore from being read as part of the variable name.
+    $authorText =~ s/\b(van|von|der|den|de|di|le|el)\s/${1}_/gi;
+    return $authorText;
+} # joinMultiWordNames
+##
+# Normalizes a date field into just the year.  Looks for a string of
+# four digits.
+##
+sub normalizeDate {
+    my $dateText = shift;
+    # Every failure path returns undef explicitly.  Falling off the end
+    # of the sub would return the value of the last evaluated condition,
+    # which is defined, so callers testing "defined" would store it.
+    # "return undef" (not a bare return) matches normalizePages and
+    # yields exactly one value in list context as well.
+    return undef unless defined $dateText && $dateText =~ m/(\d{4})/;
+    my $year = $1;
+    # check to see whether this is a sane year setting: allow up to
+    # three years past the current year.
+    my $currentYear = (localtime(time))[5] + 1900;
+    return undef if $year > $currentYear + 3;
+    return $year;
+}  # normalizeDate
+##
+# If a field should be numeric only, this utility is used
+# to extract the first number string only.
+##
+sub normalizeNumber {
+    my ($numText) = @_;
+    # First run of digits, or the unchanged input when there is none.
+    my ($digits) = $numText =~ m/(\d+)/;
+    return defined $digits ? $digits : $numText;
+}  # normalizeNumber
+##
+# Normalizes page fields into the form "start--end", or into a single
+# page number when only one number is present.  Returns undef when the
+# field holds no number or the range is not increasing.
+##
+sub normalizePages {
+    my ($pageText) = @_;
+    # Two numbers separated by at least one non-digit form a range.
+    if (my ($first, $last) = $pageText =~ m/(\d+)[^\d]+?(\d+)/) {
+        # A range that does not increase is rejected.
+        return undef if $first >= $last;
+        return "$first--$last";
+    }
+    # Otherwise fall back to a single page number, if there is one.
+    return $1 if $pageText =~ m/(\d+)/;
+    return undef;
+}  # normalizePages
+##
+# Returns a copy of the string without leading or trailing whitespace.
+##
+sub trim {
+    (my $trimmed = shift) =~ s/^\s+//;
+    $trimmed =~ s/\s+$//;
+    return $trimmed;
+}
+1;

data/perl/ParsCit/lib/ParsCit/PreProcess.pm ADDED Viewed

@@ -0,0 +1,333 @@
+package ParsCit::PreProcess;
+#
+# Utilities for finding and normalizing citations within
+# text files, including separating citation text from
+# body text and segmenting citations.
+#
+# Isaac Councill, 7/19/07
+#
+use strict;
+use utf8;
+use ParsCit::Citation;
+# Regular-expression source strings for the citation marker styles,
+# keyed by marker type name.  They are interpolated into larger
+# patterns by the subs in this file.
+my %markerTypes = (
+		   # text enclosed in square brackets
+		   'SQUARE' => '\\[.+?\\]',
+		   # text enclosed in parentheses
+		   'PAREN' => '\\(.+?\\)',
+		   # a bare run of digits
+		   'NAKEDNUM' => '\\d+',
+		   # a run of digits followed by a period
+		   'NAKEDNUMDOT' => '\\d+\\.',
+		   );
+##
+# Looks for reference section markers in the supplied text and
+# separates the citation text from the body text based on these
+# indicators.  If it looks like there is a reference section marker
+# too early in the document, this procedure will try to find later
+# ones.  If the final reference section is still too long, an empty
+# citation text string will be returned.  Returns references to
+# the citation text, normalized body text, and original body text.
+##
+sub findCitationText {
+    my ($rText) = @_;
+    my $text = $$rText;
+    # Until a reference-section heading is seen, the whole input is
+    # body text and there is no citation text.
+    my $bodyText = $text;
+    my $citeText = "";
+    my $foundMarker = 0;
+    # Every heading that ends its line is tried in turn, so the split
+    # point that survives the loop is the LAST heading in the document.
+    while ($text =~ m/\b(References?|REFERENCES?|Bibliography|BIBLIOGRAPHY|References?\s+and\s+Notes?|References?\s+Cited|REFERENCES?\s+CITED|REFERENCES?\s+AND\s+NOTES?):?\s*\n+/sg) {
+        my $splitAt = pos $text;
+        $bodyText = substr $text, 0, $splitAt;
+        $citeText = substr $text, $splitAt;
+        $foundMarker = 1;
+    }
+    if (!$foundMarker) {
+        # No heading at all: report it and hand back the full body.
+        print STDERR "warning: no citation text found\n";
+        return (\$citeText, normalizeBodyText(\$bodyText), \$bodyText);
+    }
+    # A reference section at least 80% as long as the body is taken to
+    # be a false hit (heading found too early in the document).
+    if (length($citeText) >= 0.8*length($bodyText)) {
+        print STDERR "Citation text longer than article body: ignoring\n";
+        $citeText = "";
+        # normalizeBodyText already returns a reference; taking a
+        # reference to its result would hand back a ref-to-ref.
+        return (\$citeText, normalizeBodyText(\$bodyText), \$bodyText);
+    }
+    # Cut the citation text at the first line that starts a trailing
+    # section (acknowledgements, tables, appendix, ...).  Only the
+    # piece in front of that line is of interest.
+    my ($sciteText) = split(/^([\s\d\.]+)?(Acknowledge?ments?|Autobiographical|Tables?|Appendix|Exhibit|Annex|Fig|Notes?)(.*?)\n+/m, $citeText);
+    if (defined $sciteText && length($sciteText) > 0) {
+        $citeText = $sciteText;
+    }
+    return (normalizeCiteText(\$citeText),
+            normalizeBodyText(\$bodyText),
+            \$bodyText);
+}  # findCitationText
+##
+# Removes lines that appear to be junk from the citation text.
+##
+sub normalizeCiteText {
+    my ($rCiteText) = @_;
+    # Keep only the lines that hold something other than whitespace
+    # and digits, and return a reference to the re-joined text.
+    my $cleaned = join "\n",
+        grep { $_ !~ m/^[\s\d]*$/ } split("\n", $$rCiteText);
+    return \$cleaned;
+}  # normalizeCiteText
+##
+# Removes lines that appear to be junk from the body text,
+# de-hyphenates words where a hyphen occurs at the end of
+# a line, and normalizes strings of blank spaces to only
+# single blanks.
+##
+sub normalizeBodyText {
+    my ($rText) = @_;
+    my $text = "";
+    foreach my $line (split "\n", $$rText) {
+        # Blank lines carry no content.
+        next if $line =~ m/^\s*$/;
+        # When the text so far ends in "<word char>-", the hyphen is
+        # dropped and the line is glued on; otherwise a space is used.
+        if ($text =~ s/(\w)\-$/$1/) {
+            $text .= $line;
+        } else {
+            $text .= " ".$line;
+        }
+    }
+    # Collapse each whitespace run to one space.  The replacement side
+    # is an ordinary string, so "\s" there would insert the letter "s".
+    $text =~ s/\s\s+/ /g;
+    return \$text;
+} # normalizeBodyText
+##
+# Controls the process by which citations are segmented,
+# based on the result of trying to guess the type of
+# citation marker used in the reference section.  Returns
+# a reference to a list of citation objects.
+##
+sub segmentCitations {
+    my ($rCiteText) = @_;
+    my $markerType = guessMarkerType($rCiteText);
+    # With a recognised marker style, split on the markers; otherwise
+    # fall back to the heuristic splitter.
+    my $rCitations = ($markerType eq 'UNKNOWN')
+        ? splitUnmarkedCitations($rCiteText)
+        : splitCitationsByMarker($rCiteText, $markerType);
+    return $rCitations;
+}  # segmentCitations
+##
+# Segments citations that have explicit markers in the
+# reference section.  Whenever a new line starts with an
+# expression that matches what we'd expect of a marker,
+# a new citation is started.  Returns a reference to a
+# list of citation objects.
+##
+sub splitCitationsByMarker {
+    my ($rCiteText, $markerType) = @_;
+    my @citations;
+    # Class->new() instead of the indirect "new Class()" form, which
+    # perl can misparse when a sub named "new" is in scope.
+    # This first object has no marker: it collects any lines that come
+    # before the first marker and is emitted as a citation of its own.
+    my $currentCitation = ParsCit::Citation->new();
+    my $currentCitationString;
+    # TODO: Might want to add a check that marker number is
+    # increasing as we'd expect, if the marker is numeric.
+    foreach my $line (split "\n", $$rCiteText) {
+        if ($line =~ m/^\s*($markerTypes{$markerType})\s*(.*)$/) {
+            # A marker at the start of the line opens a new citation;
+            # store the one collected so far first.
+            my ($marker, $citeString) = ($1, $2);
+            if (defined $currentCitationString) {
+                $currentCitation->setString($currentCitationString);
+                push @citations, $currentCitation;
+                $currentCitationString = undef;
+            }
+            $currentCitation = ParsCit::Citation->new();
+            $currentCitation->setMarkerType($markerType);
+            $currentCitation->setMarker($marker);
+            $currentCitationString = $citeString;
+        } elsif (defined $currentCitationString
+                 && $currentCitationString =~ m/\w\-$/) {
+            # merge words when lines are hyphenated
+            $currentCitationString =~ s/\-$//;
+            $currentCitationString .= $line;
+        } else {
+            # Continuation line: append after a single space.
+            $currentCitationString .= " ".$line;
+        }
+    }
+    if (defined $currentCitation && defined $currentCitationString) {
+        $currentCitation->setString($currentCitationString);
+        push @citations, $currentCitation;
+    }
+    return \@citations;
+}  # splitCitationsByMarker
+##
+# Uses several heuristics to decide where individual citations
+# begin and end based on the length of previous lines, strings
+# that look like author lists, and punctuation.  Returns a
+# reference to a list of citation objects.
+##
+sub splitUnmarkedCitations {
+    my ($rCiteText) = @_;
+    my @content = split "\n", $$rCiteText;
+    my @citeStarts = ();
+    my $citeStart = 0;
+    my @citations = ();
+    LINE: for (my $i=0; $i<=$#content; $i++) {
+        # Only a line holding something that looks like a year starts
+        # the backwards search for the beginning of its citation.
+        next LINE unless $content[$i] =~ m/\b\(?[1-2][0-9]{3}[\p{IsLower}]?[\)?\s,\.]*(\s|\b)/s;
+        CANDIDATE: for (my $k=$i; $k > $citeStart; $k--) {
+            # No /g on this test: with /g the match position is kept on
+            # the line between visits, so revisiting the same line
+            # would give a different answer.
+            next CANDIDATE unless $content[$k] =~ m/\s*[\p{IsUpper}]/;
+            # If length of previous line is extremely small,
+            # start a new citation here.
+            if (length($content[$k-1]) < 2) {
+                $citeStart = $k;
+                last CANDIDATE;
+            }
+            # Start looking backwards for lines that could
+            # be author lists - these usually start the
+            # citation, have several separation characters (,;),
+            # and shouldn't contain any numbers.
+            my $beginningAuthorLine = -1;
+            for (my $j=$k-1; $j>$citeStart; $j--) {
+                last if $content[$j] =~ m/\d/;
+                # tr counts the separators without touching $_.
+                my $nSep = ($content[$j] =~ tr/,;//);
+                last if $nSep < 3;
+                if (($content[$j-1] =~ m/\.\s*$/) || $j==0) {
+                    $beginningAuthorLine = $j;
+                }
+            }
+            if ($beginningAuthorLine >= 0) {
+                $citeStart = $beginningAuthorLine;
+                last CANDIDATE;
+            }
+            # Now that the backwards author search failed
+            # to find any extra lines, start a new citation
+            # here if the previous line ends with a ".".
+            if ($content[$k-1] =~ m/\.\s*$/) {
+                $citeStart = $k;
+                last CANDIDATE;
+            }
+        }
+        # Record each start line once, in increasing order.  Without
+        # the emptiness check, start line 0 would be pushed again for
+        # every later year line and yield empty citations.
+        push @citeStarts, $citeStart
+            unless @citeStarts && $citeStart <= $citeStarts[-1];
+    }
+    # Visit every start, including the last one, whose citation runs to
+    # the end of the text.
+    for my $k (0 .. $#citeStarts) {
+        my $firstLine = $citeStarts[$k];
+        my $lastLine = ($k==$#citeStarts) ? $#content : ($citeStarts[$k+1]-1);
+        my $citeString =
+            mergeLines(join "\n", @content[$firstLine .. $lastLine]);
+        my $citation = ParsCit::Citation->new();
+        $citation->setString($citeString);
+        push @citations, $citation;
+    }
+    return \@citations;
+}  # splitUnmarkedCitations
+##
+# Merges lines of text by dehyphenating where appropriate,
+# with normal spacing.
+##
+sub mergeLines {
+    my ($text) = @_;
+    my $merged = "";
+    for my $piece (map { scalar trim($_) } split "\n", $text) {
+        # If the text so far ends in "<word char>-", the hyphen goes
+        # and the next piece is glued on; otherwise a space separates.
+        if ($merged =~ s/(\w)\-$/$1/) {
+            $merged .= $piece;
+        } else {
+            $merged .= " ".$piece;
+        }
+    }
+    return trim($merged);
+}  # mergeLines
+##
+# Uses a list of regular expressions that match common citation
+# markers to count the number of matches for each type in the
+# text.  If a sufficient number of matches to a particular type
+# are found, we can be reasonably sure of the type.
+##
+sub guessMarkerType {
+    my ($rCiteText) = @_;
+    my $markerType = 'UNKNOWN';
+    my %markerObservations = map { $_ => 0 } keys %markerTypes;
+    # A leading newline lets every pattern anchor on "\n", first line
+    # included.
+    my $citeText = "\n".$$rCiteText;
+    # Count newlines with tr: nothing is modified and the caller's $_
+    # is left alone.
+    my $nLines = ($citeText =~ tr/\n//) - 1;
+    # A marker only counts at the start of a line, with more text
+    # following it on that line.
+    while ($citeText =~ m/\n\s*($markerTypes{'SQUARE'}([^\n]){10})/sg) {
+        $markerObservations{'SQUARE'}++;
+    }
+    while ($citeText =~ m/\n\s*($markerTypes{'PAREN'}([^\n]){10})/sg) {
+        $markerObservations{'PAREN'}++;
+    }
+    while ($citeText =~ m/\n\s*($markerTypes{'NAKEDNUM'} [^\n]{10}) /sg) {
+        $markerObservations{'NAKEDNUM'}++;
+    }
+    while ($citeText =~ m/\n\s*$markerTypes{'NAKEDNUMDOT'}([^\n]){10}/sg) {
+        $markerObservations{'NAKEDNUMDOT'}++;
+    }
+    # Most frequent type first; ties go by name so the result does not
+    # depend on hash ordering.
+    my @sortedObservations =
+        sort { $markerObservations{$b} <=> $markerObservations{$a}
+               or $a cmp $b }
+        keys %markerObservations;
+    my $best = $sortedObservations[0];
+    my $minMarkers = $nLines / 6;
+    # The winner must have been seen at least once: for very short
+    # text the threshold is 0 and a count of 0 would otherwise qualify.
+    if (defined $best
+        && $markerObservations{$best} > 0
+        && $markerObservations{$best} >= $minMarkers) {
+        $markerType = $best;
+    }
+    return $markerType;
+}  # guessMarkerType
+##
+# Returns a copy of the text with leading and trailing whitespace
+# removed.
+##
+sub trim {
+    my ($text) = @_;
+    $text =~ s/^\s+|\s+$//g;
+    return $text;
+}  # trim
+1;