RubyGems - name-spotter - Versions diffs - 0.1.11 → 0.2.0 - Mend

name-spotter 0.1.11 → 0.2.0

Files changed (7) hide show

data/VERSION +1 -1
data/lib/name-spotter/scientific_name.rb +0 -2
data/lib/name-spotter/taxon_finder_client.rb +11 -9
data/name-spotter.gemspec +4 -3
data/spec/name-spotter_spec.rb +16 -3
data/tf_logic.txt +72 -0
metadata +4 -3

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.1.11
1	+ 0.2.0

data/lib/name-spotter/scientific_name.rb CHANGED Viewed

@@ -5,8 +5,6 @@ class NameSpotter
     def self.normalize(name)
       name = name.gsub(",", " ")
       name = name.gsub(/\s+/, " ")
-      name = UnicodeUtils.downcase(name)
-      UnicodeUtils.upcase(name[0]) + name[1..-1]
     end
     def initialize(verbatim_name, options={})

data/lib/name-spotter/taxon_finder_client.rb CHANGED Viewed

@@ -45,12 +45,13 @@ class NameSpotter
         response = parse_socket_response(output)
         return if not response
-        [response.return_string, response.return_string_2].each do |str|
+        [response.return_string, response.return_string_2].each_with_index do |str, i|
           next if !str || str.split(" ").size > 6
-          verbatim_string, scientific_string, start_position = process_response(str)
+          verbatim_string, scientific_string, start_position = process_response(str, i)
+          next if scientific_string.empty?
           add_name NameSpotter::ScientificName.new(verbatim_string, :start_position => start_position, :scientific_name => scientific_string)
         end
-        @current_index = nil
+        @current_index = @current_string.empty? ? nil : @cursor[-1][-1]
       end
     end
@@ -60,10 +61,10 @@ class NameSpotter
       @current_string_state = current_string_state
       @word_list_matches = word_list_matches
       @return_score = return_score
-      if @current_string.size > 0 && !@current_index
-        @current_index = @cursor[-1][-1]
+      if !@current_index && @current_string.size > 0
+          @current_index = @cursor[-1][-1]
       end
-      if not return_string.blank? or not return_string_2.blank?
+      if not return_string.blank? or not return_string_2.blank?
         OpenStruct.new( { :current_string       => current_string,
                        :current_string_state => current_string_state,
                        :word_list_matches    => word_list_matches,
@@ -77,13 +78,14 @@ class NameSpotter
       end
     end
-    def process_response(str)
+    def process_response(str, index)
+      is_return_string2 = (index == 1)
       str.force_encoding('utf-8')
       start_position = verbatim_string = nil
       if @current_index
-        start_position = @current_index
+        start_position = is_return_string2 ? @cursor[-1][-1] : @current_index
         words, indices = @cursor.transpose
-        verbatim_string = str.include?("[") ? words[indices.index(start_position)..-1].join(" ") : words[indices.index(start_position)...-1].join(" ")
+        verbatim_string = (str.include?("[") || is_return_string2) ? words[indices.index(start_position)..-1].join(" ") : words[indices.index(start_position)...-1].join(" ")
       else
         verbatim_string, start_position = @cursor[-1]
       end

data/name-spotter.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = "name-spotter"
-  s.version = "0.1.11"
+  s.version = "0.2.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Anthony Goddard", "Chuck Ha", "Dmitry Mozzherin"]
-  s.date = "2012-06-13"
+  s.date = "2012-06-18"
   s.description = "The gem searches for scientific names in texts using socket servers running TaxonFinder (by Patrick Leary) and NetiNeti (by Lakshmi Manohar Akella)"
   s.email = "dmozzherin@gmail.com"
   s.extra_rdoc_files = [
@@ -40,7 +40,8 @@ Gem::Specification.new do |s|
     "spec/files/journalofentomol13pomo_0063.txt",
     "spec/name-spotter_spec.rb",
     "spec/scientific_name_spec.rb",
-    "spec/spec_helper.rb"
+    "spec/spec_helper.rb",
+    "tf_logic.txt"
   ]
   s.homepage = "http://github.com/GlobalNamesArchitecture/name-spotter"
   s.licenses = ["MIT"]

data/spec/name-spotter_spec.rb CHANGED Viewed

@@ -93,14 +93,14 @@ describe "NameSpotter" do
     offsets[0].should == 67
   end
-  it "should normalize capitalization of found names" do
+  it "should not normalize capitalization of found names" do
+    #this is a problem we are aware of
     text = "We need to make sure that Ophioihrix nidis and OPHTOMVXIDAE and also  Ophiocynodus and especially ASTÉROCHEMIDAE and definitely STFROPHVTIDAE and may be Asleronyx excavata should all be capitalized correctly"
     res = @neti.find(text)
-    res.should == {:names=>[{:verbatim=>"Ophioihrix nidis", :scientificName=>"Ophioihrix nidis", :offsetStart=>26, :offsetEnd=>41}, {:verbatim=>"OPHTOMVXIDAE", :scientificName=>"Ophtomvxidae", :offsetStart=>47, :offsetEnd=>58}, {:verbatim=>"Ophiocynodus", :scientificName=>"Ophiocynodus", :offsetStart=>70, :offsetEnd=>81}, {:verbatim=>"ASTÉROCHEMIDAE", :scientificName=>"Astérochemidae", :offsetStart=>98, :offsetEnd=>111}, {:verbatim=>"STFROPHVTIDAE", :scientificName=>"Stfrophvtidae", :offsetStart=>128, :offsetEnd=>140}, {:verbatim=>"Asleronyx excavata", :scientificName=>"Asleronyx excavata", :offsetStart=>153, :offsetEnd=>170}]}
+    res.should == {:names=>[{:verbatim=>"Ophioihrix nidis", :scientificName=>"Ophioihrix nidis", :offsetStart=>26, :offsetEnd=>41}, {:verbatim=>"OPHTOMVXIDAE", :scientificName=>"OPHTOMVXIDAE", :offsetStart=>47, :offsetEnd=>58}, {:verbatim=>"Ophiocynodus", :scientificName=>"Ophiocynodus", :offsetStart=>70, :offsetEnd=>81}, {:verbatim=>"ASTÉROCHEMIDAE", :scientificName=>"ASTÉROCHEMIDAE", :offsetStart=>98, :offsetEnd=>111}, {:verbatim=>"STFROPHVTIDAE", :scientificName=>"STFROPHVTIDAE", :offsetStart=>128, :offsetEnd=>140}, {:verbatim=>"Asleronyx excavata", :scientificName=>"Asleronyx excavata", :offsetStart=>153, :offsetEnd=>170}]}
   end
   it "should not break NetiNeti results from processing OCR with | character in it" do
-    text = "We need to make sure that Oph|oihrix nidis and OPHTOMVX|DAE will not break results"
     text = "We need to make sure that Oph|oihrix nidis and OPHTOMVX|DAE will not break results"
     res = @neti.find(text)
     res.should == {:names=>[{:verbatim=>"Ophloihrix nidis", :scientificName=>"Ophloihrix nidis", :offsetStart=>26, :offsetEnd=>41}]}
@@ -111,5 +111,18 @@ describe "NameSpotter" do
     res = @tf.find(text)
     res.should == {:names=>[{:verbatim=>"Plantago major", :scientificName=>"Plantago major", :offsetStart=>16, :offsetEnd=>29}, {:verbatim=>"Plantago quercus quercus quercus quercus quercus", :scientificName=>"Plantago quercus quercus quercus quercus quercus", :offsetStart=>225, :offsetEnd=>272}, {:verbatim=>"Pardosa moesta var. moesta f. moesta", :scientificName=>"Pardosa moesta var. moesta f. moesta", :offsetStart=>340, :offsetEnd=>375}]}
   end
+  it "should register situations where new name started and prev name is finished in the same cycle in TF" do
+    text = "What  happens another called Pardosa moesta (Araneae: Lycosidae) is the species?"
+    res = @tf.find(text)
+    res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>29, :offsetEnd=>42}, {:verbatim=>"(Araneae:", :scientificName=>"Araneae", :offsetStart=>44, :offsetEnd=>52}, {:verbatim=>"Lycosidae)", :scientificName=>"Lycosidae", :offsetStart=>54, :offsetEnd=>63}]}
+  end
+  it "should ignore abbreviated genus before family for TaxonFinder" do
+    text = "What  happens another called P. (LYCOSIDAE) is the species?"
+    res = @tf.find(text)
+    res[:names].size.should == 1
+    res.should == {:names=>[{:verbatim=>"(LYCOSIDAE)", :scientificName=>"Lycosidae", :offsetStart=>32, :offsetEnd=>42}]}
+  end
 end

data/tf_logic.txt ADDED Viewed

@@ -0,0 +1,72 @@
+if the input does not contain 5 items:
+  return ||0||-1||
+  word list match - 0
+  score -1
+if word is empty:
+  return ||0|$currentString|$wordListMatches||
+  where currentString is from input stripped
+  and wordListMatches is from input stripped
+  wordlist match 0
+  score - wordListMatch??
+**********GENUS**********************
+for Genus + species
+  if genus found and species has punctuation at the end (end of the name because of comma, end of the sentence etc.)
+    return ||0|$currentString|$wordListMatches||
+    where $wordListMatches is a concatenation of strings, like 0S
+  if genus found and species, but not end of the sentence
+    $currentString $cleanCandidateWord|species|$wordListMatches"."$score||-1||
+    search continues... found species, can be subspecies
+  for abbreviated Genera (1-2 letters, where '-' can also be the first letter ???):
+    expand Genus if the last used genus starting with these letters is known
+    if abbrev did not make sense (genus is not found):
+      ||0||-1||
+      - result is nothing
+      Also means 2 letter genera are not found by NameFinder
+  if next word starts right after genus
+    if next word is potential abbr genus
+      ****$cleanCandidateWord|genus|0|$currentString|$wordListMatches||\n"
+    if next word is subgenus
+      if genus is abbreviated find it and extend
+        "$currentString ($cleanCandidateWord)|genus|$wordListMatches"."$scoreG||-1||\n"
+      if abbr not found start new name from 'subgenus' as genus this time
+        "$cleanCandidateWord|genus|$scoreG||-1||\n"
+      else return genus (subgenus)
+        "$currentString ($cleanCandidateWord)|genus|$wordListMatches"."$scoreG||-1||\n"
+    if next word is genus
+      ****"$cleanCandidateWord|genus|$scoreG|$currentString|$wordListMatches||\n"
+  if next word is family
+    if genus was abbreviated make it ""
+      ****"||0|$currentString|$wordListMatches"; empty string genus -- should be ignored
+    if genus and family -- return both as return_string1 and return_string2 respectively
+      ****||0|genus|genus_score|family|family_score
+***********SPECIES**********
+if subspecies:
+  "$currentString $cleanCandidateWord|species|$wordListMatches"."$score||-1||\n"
+  currently the number of infraspecies is unlimited
+if rank
+  "$currentString $candidateWord|rank|$wordListMatches"."$score||-1||\n";
+  potentially also unlimited
+if potential abbr genus
+  ****"$cleanCandidateWord|genus|0|$currentString|$wordListMatches||\n"
+if genus
+  ****"$cleanCandidateWord|genus|$scoreG|$currentString|$wordListMatches||\n"
+if family
+  ****||0|species|species_score|family|family_score

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: name-spotter
 version: !ruby/object:Gem::Version
-  version: 0.1.11
+  version: 0.2.0
   prerelease:
 platform: ruby
 authors:
@@ -11,7 +11,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-06-13 00:00:00.000000000 Z
+date: 2012-06-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -254,6 +254,7 @@ files:
 - spec/name-spotter_spec.rb
 - spec/scientific_name_spec.rb
 - spec/spec_helper.rb
+- tf_logic.txt
 homepage: http://github.com/GlobalNamesArchitecture/name-spotter
 licenses:
 - MIT
@@ -269,7 +270,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: -2251568119704049389
+      hash: 2125020200384233400
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements: