RubyGems - name-spotter - Versions diffs - 0.1.11 → 0.2.0 - Mend

name-spotter 0.1.11 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

data/VERSION +1 -1
data/lib/name-spotter/scientific_name.rb +0 -2
data/lib/name-spotter/taxon_finder_client.rb +11 -9
data/name-spotter.gemspec +4 -3
data/spec/name-spotter_spec.rb +16 -3
data/tf_logic.txt +72 -0
metadata +4 -3

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.1.11
1	+ 0.2.0

data/lib/name-spotter/scientific_name.rb CHANGED Viewed

@@ -5,8 +5,6 @@ class NameSpotter
     def self.normalize(name)
       name = name.gsub(",", " ")
       name = name.gsub(/\s+/, " ")
-      name = UnicodeUtils.downcase(name)
-      UnicodeUtils.upcase(name[0]) + name[1..-1]
     end
     def initialize(verbatim_name, options={})

data/lib/name-spotter/taxon_finder_client.rb CHANGED Viewed

@@ -45,12 +45,13 @@ class NameSpotter
         response = parse_socket_response(output)
         return if not response
-        [response.return_string, response.return_string_2].each do |str|
+        [response.return_string, response.return_string_2].each_with_index do |str, i|
           next if !str || str.split(" ").size > 6
-          verbatim_string, scientific_string, start_position = process_response(str)
+          verbatim_string, scientific_string, start_position = process_response(str, i)
+          next if scientific_string.empty?
           add_name NameSpotter::ScientificName.new(verbatim_string, :start_position => start_position, :scientific_name => scientific_string)
         end
-        @current_index = nil
+        @current_index = @current_string.empty? ? nil : @cursor[-1][-1]
       end
     end
@@ -60,10 +61,10 @@ class NameSpotter
       @current_string_state = current_string_state
       @word_list_matches = word_list_matches
       @return_score = return_score
-      if @current_string.size > 0 && !@current_index
-        @current_index = @cursor[-1][-1]
+      if !@current_index && @current_string.size > 0
+          @current_index = @cursor[-1][-1]
       end
-      if not return_string.blank? or not return_string_2.blank?
+      if not return_string.blank? or not return_string_2.blank?
         OpenStruct.new( { :current_string       => current_string,
                        :current_string_state => current_string_state,
                        :word_list_matches    => word_list_matches,
@@ -77,13 +78,14 @@ class NameSpotter
       end
     end
-    def process_response(str)
+    def process_response(str, index)
+      is_return_string2 = (index == 1)
       str.force_encoding('utf-8')
       start_position = verbatim_string = nil
       if @current_index
-        start_position = @current_index
+        start_position = is_return_string2 ? @cursor[-1][-1] : @current_index
         words, indices = @cursor.transpose
-        verbatim_string = str.include?("[") ? words[indices.index(start_position)..-1].join(" ") : words[indices.index(start_position)...-1].join(" ")
+        verbatim_string = (str.include?("[") || is_return_string2) ? words[indices.index(start_position)..-1].join(" ") : words[indices.index(start_position)...-1].join(" ")
       else
         verbatim_string, start_position = @cursor[-1]
       end

data/name-spotter.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = "name-spotter"
-  s.version = "0.1.11"
+  s.version = "0.2.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Anthony Goddard", "Chuck Ha", "Dmitry Mozzherin"]
-  s.date = "2012-06-13"
+  s.date = "2012-06-18"
   s.description = "The gem searches for scientific names in texts using socket servers running TaxonFinder (by Patrick Leary) and NetiNeti (by Lakshmi Manohar Akella)"
   s.email = "dmozzherin@gmail.com"
   s.extra_rdoc_files = [
@@ -40,7 +40,8 @@ Gem::Specification.new do |s|
     "spec/files/journalofentomol13pomo_0063.txt",
     "spec/name-spotter_spec.rb",
     "spec/scientific_name_spec.rb",
-    "spec/spec_helper.rb"
+    "spec/spec_helper.rb",
+    "tf_logic.txt"
   ]
   s.homepage = "http://github.com/GlobalNamesArchitecture/name-spotter"
   s.licenses = ["MIT"]

data/spec/name-spotter_spec.rb CHANGED Viewed

@@ -93,14 +93,14 @@ describe "NameSpotter" do
     offsets[0].should == 67
   end
-  it "should normalize capitalization of found names" do
+  it "should not normalize capitalization of found names" do
+    #this is a problem we are aware of
     text = "We need to make sure that Ophioihrix nidis and OPHTOMVXIDAE and also  Ophiocynodus and especially ASTÉROCHEMIDAE and definitely STFROPHVTIDAE and may be Asleronyx excavata should all be capitalized correctly"
     res = @neti.find(text)
-    res.should == {:names=>[{:verbatim=>"Ophioihrix nidis", :scientificName=>"Ophioihrix nidis", :offsetStart=>26, :offsetEnd=>41}, {:verbatim=>"OPHTOMVXIDAE", :scientificName=>"Ophtomvxidae", :offsetStart=>47, :offsetEnd=>58}, {:verbatim=>"Ophiocynodus", :scientificName=>"Ophiocynodus", :offsetStart=>70, :offsetEnd=>81}, {:verbatim=>"ASTÉROCHEMIDAE", :scientificName=>"Astérochemidae", :offsetStart=>98, :offsetEnd=>111}, {:verbatim=>"STFROPHVTIDAE", :scientificName=>"Stfrophvtidae", :offsetStart=>128, :offsetEnd=>140}, {:verbatim=>"Asleronyx excavata", :scientificName=>"Asleronyx excavata", :offsetStart=>153, :offsetEnd=>170}]}
+    res.should == {:names=>[{:verbatim=>"Ophioihrix nidis", :scientificName=>"Ophioihrix nidis", :offsetStart=>26, :offsetEnd=>41}, {:verbatim=>"OPHTOMVXIDAE", :scientificName=>"OPHTOMVXIDAE", :offsetStart=>47, :offsetEnd=>58}, {:verbatim=>"Ophiocynodus", :scientificName=>"Ophiocynodus", :offsetStart=>70, :offsetEnd=>81}, {:verbatim=>"ASTÉROCHEMIDAE", :scientificName=>"ASTÉROCHEMIDAE", :offsetStart=>98, :offsetEnd=>111}, {:verbatim=>"STFROPHVTIDAE", :scientificName=>"STFROPHVTIDAE", :offsetStart=>128, :offsetEnd=>140}, {:verbatim=>"Asleronyx excavata", :scientificName=>"Asleronyx excavata", :offsetStart=>153, :offsetEnd=>170}]}
   end
   it "should not break NetiNeti results from processing OCR with | character in it" do
-    text = "We need to make sure that Oph|oihrix nidis and OPHTOMVX|DAE will not break results"
     text = "We need to make sure that Oph|oihrix nidis and OPHTOMVX|DAE will not break results"
     res = @neti.find(text)
     res.should == {:names=>[{:verbatim=>"Ophloihrix nidis", :scientificName=>"Ophloihrix nidis", :offsetStart=>26, :offsetEnd=>41}]}
@@ -111,5 +111,18 @@ describe "NameSpotter" do
     res = @tf.find(text)
     res.should == {:names=>[{:verbatim=>"Plantago major", :scientificName=>"Plantago major", :offsetStart=>16, :offsetEnd=>29}, {:verbatim=>"Plantago quercus quercus quercus quercus quercus", :scientificName=>"Plantago quercus quercus quercus quercus quercus", :offsetStart=>225, :offsetEnd=>272}, {:verbatim=>"Pardosa moesta var. moesta f. moesta", :scientificName=>"Pardosa moesta var. moesta f. moesta", :offsetStart=>340, :offsetEnd=>375}]}
   end
+  it "should register situations where new name started and prev name is finished in the same cycle in TF" do
+    text = "What  happens another called Pardosa moesta (Araneae: Lycosidae) is the species?"
+    res = @tf.find(text)
+    res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>29, :offsetEnd=>42}, {:verbatim=>"(Araneae:", :scientificName=>"Araneae", :offsetStart=>44, :offsetEnd=>52}, {:verbatim=>"Lycosidae)", :scientificName=>"Lycosidae", :offsetStart=>54, :offsetEnd=>63}]}
+  end
+  it "should ignore abbreviated genus before family for TaxonFinder" do
+    text = "What  happens another called P. (LYCOSIDAE) is the species?"
+    res = @tf.find(text)
+    res[:names].size.should == 1
+    res.should == {:names=>[{:verbatim=>"(LYCOSIDAE)", :scientificName=>"Lycosidae", :offsetStart=>32, :offsetEnd=>42}]}
+  end
 end

data/tf_logic.txt ADDED Viewed

@@ -0,0 +1,72 @@
+if no 5 items input:
+  return ||0||-1||
+  word list match - 0
+  score -1
+if word is empty:
+  return ||0|$currentString|$wordListMatches||
+  where currentString is from input stripped
+  and wordListMatches is from input stripped
+  wordlist match 0
+  score - wordListMatch??
+**********GENUS**********************
+for Genus + species
+  if genus found and species has punctuation at the end (end of the name because of comma, end of the sentence etc.)
+    return ||0|$currentString|$wordListMatches||
+    where $wordListMatches is a concatenation of strings, like 0S
+  if genus found and species, but not end of the sentence
+    $currentString $cleanCandidateWord|species|$wordListMatches"."$score||-1||
+    search continues... found species, can be subspecies
+  for abbreviated Genera (1-2 letters (where '-' can also be the first letter ???):
+    expand Genus if last used genus is known with these letters
+    if abbrev did not make sense (genus is not found):
+      ||0||-1||
+      - result is nothing
+      Also means 2 letter genera are not found by NameFinder
+  if next word starts right after genus
+    if next word is potential abbr genus
+      ****$cleanCandidateWord|genus|0|$currentString|$wordListMatches||\n"
+    if next word is subgenus
+      if genus is abbreviated find it and extend
+        "$currentString ($cleanCandidateWord)|genus|$wordListMatches"."$scoreG||-1||\n"
+      if abbr not found start new name from 'subgenus' as genus this time
+        "$cleanCandidateWord|genus|$scoreG||-1||\n"
+      else return genus (subgenus)
+        "$currentString ($cleanCandidateWord)|genus|$wordListMatches"."$scoreG||-1||\n"
+    if next word is genus
+      ****"$cleanCandidateWord|genus|$scoreG|$currentString|$wordListMatches||\n"
+  if next word is family
+    if genus was abbreviated make it ""
+      ****"||0|$currentString|$wordListMatches"; empty string genus -- should be ignored
+    if genus and family -- return both as return_string1 and return_string2 respectively
+      ****||0|genus|genus_score|family|family_score
+***********SPECIES**********
+if subspecies:
+  "$currentString $cleanCandidateWord|species|$wordListMatches"."$score||-1||\n"
+  currently the number of infraspecies is unlimited
+if rank
+  "$currentString $candidateWord|rank|$wordListMatches"."$score||-1||\n";
+  potentially also unlimited
+if potential abbr genus
+  ****"$cleanCandidateWord|genus|0|$currentString|$wordListMatches||\n"
+if genus
+  ****"$cleanCandidateWord|genus|$scoreG|$currentString|$wordListMatches||\n"
+if family
+  ****||0|species|species|family|family_score

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: name-spotter
 version: !ruby/object:Gem::Version
-  version: 0.1.11
+  version: 0.2.0
   prerelease:
 platform: ruby
 authors:
@@ -11,7 +11,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-06-13 00:00:00.000000000 Z
+date: 2012-06-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -254,6 +254,7 @@ files:
 - spec/name-spotter_spec.rb
 - spec/scientific_name_spec.rb
 - spec/spec_helper.rb
+- tf_logic.txt
 homepage: http://github.com/GlobalNamesArchitecture/name-spotter
 licenses:
 - MIT
@@ -269,7 +270,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: -2251568119704049389
+      hash: 2125020200384233400
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements: