name-spotter 0.1.11 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.11
1
+ 0.2.0
@@ -5,8 +5,6 @@ class NameSpotter
5
5
  def self.normalize(name)
6
6
  name = name.gsub(",", " ")
7
7
  name = name.gsub(/\s+/, " ")
8
- name = UnicodeUtils.downcase(name)
9
- UnicodeUtils.upcase(name[0]) + name[1..-1]
10
8
  end
11
9
 
12
10
  def initialize(verbatim_name, options={})
@@ -45,12 +45,13 @@ class NameSpotter
45
45
  response = parse_socket_response(output)
46
46
  return if not response
47
47
 
48
- [response.return_string, response.return_string_2].each do |str|
48
+ [response.return_string, response.return_string_2].each_with_index do |str, i|
49
49
  next if !str || str.split(" ").size > 6
50
- verbatim_string, scientific_string, start_position = process_response(str)
50
+ verbatim_string, scientific_string, start_position = process_response(str, i)
51
+ next if scientific_string.empty?
51
52
  add_name NameSpotter::ScientificName.new(verbatim_string, :start_position => start_position, :scientific_name => scientific_string)
52
53
  end
53
- @current_index = nil
54
+ @current_index = @current_string.empty? ? nil : @cursor[-1][-1]
54
55
  end
55
56
  end
56
57
 
@@ -60,10 +61,10 @@ class NameSpotter
60
61
  @current_string_state = current_string_state
61
62
  @word_list_matches = word_list_matches
62
63
  @return_score = return_score
63
- if @current_string.size > 0 && !@current_index
64
- @current_index = @cursor[-1][-1]
64
+ if !@current_index && @current_string.size > 0
65
+ @current_index = @cursor[-1][-1]
65
66
  end
66
- if not return_string.blank? or not return_string_2.blank?
67
+ if not return_string.blank? or not return_string_2.blank?
67
68
  OpenStruct.new( { :current_string => current_string,
68
69
  :current_string_state => current_string_state,
69
70
  :word_list_matches => word_list_matches,
@@ -77,13 +78,14 @@ class NameSpotter
77
78
  end
78
79
  end
79
80
 
80
- def process_response(str)
81
+ def process_response(str, index)
82
+ is_return_string2 = (index == 1)
81
83
  str.force_encoding('utf-8')
82
84
  start_position = verbatim_string = nil
83
85
  if @current_index
84
- start_position = @current_index
86
+ start_position = is_return_string2 ? @cursor[-1][-1] : @current_index
85
87
  words, indices = @cursor.transpose
86
- verbatim_string = str.include?("[") ? words[indices.index(start_position)..-1].join(" ") : words[indices.index(start_position)...-1].join(" ")
88
+ verbatim_string = (str.include?("[") || is_return_string2) ? words[indices.index(start_position)..-1].join(" ") : words[indices.index(start_position)...-1].join(" ")
87
89
  else
88
90
  verbatim_string, start_position = @cursor[-1]
89
91
  end
data/name-spotter.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "name-spotter"
8
- s.version = "0.1.11"
8
+ s.version = "0.2.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Anthony Goddard", "Chuck Ha", "Dmitry Mozzherin"]
12
- s.date = "2012-06-13"
12
+ s.date = "2012-06-18"
13
13
  s.description = "The gem searches for scientific names in texts using socket servers running TaxonFinder (by Patrick Leary) and NetiNeti (by Lakshmi Manohar Akella)"
14
14
  s.email = "dmozzherin@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -40,7 +40,8 @@ Gem::Specification.new do |s|
40
40
  "spec/files/journalofentomol13pomo_0063.txt",
41
41
  "spec/name-spotter_spec.rb",
42
42
  "spec/scientific_name_spec.rb",
43
- "spec/spec_helper.rb"
43
+ "spec/spec_helper.rb",
44
+ "tf_logic.txt"
44
45
  ]
45
46
  s.homepage = "http://github.com/GlobalNamesArchitecture/name-spotter"
46
47
  s.licenses = ["MIT"]
@@ -93,14 +93,14 @@ describe "NameSpotter" do
93
93
  offsets[0].should == 67
94
94
  end
95
95
 
96
- it "should normalize capitalization of found names" do
96
+ it "should not normalize capitalization of found names" do
97
+ #this is a problem we are aware of
97
98
  text = "We need to make sure that Ophioihrix nidis and OPHTOMVXIDAE and also Ophiocynodus and especially ASTÉROCHEMIDAE and definitely STFROPHVTIDAE and may be Asleronyx excavata should all be capitalized correctly"
98
99
  res = @neti.find(text)
99
- res.should == {:names=>[{:verbatim=>"Ophioihrix nidis", :scientificName=>"Ophioihrix nidis", :offsetStart=>26, :offsetEnd=>41}, {:verbatim=>"OPHTOMVXIDAE", :scientificName=>"Ophtomvxidae", :offsetStart=>47, :offsetEnd=>58}, {:verbatim=>"Ophiocynodus", :scientificName=>"Ophiocynodus", :offsetStart=>70, :offsetEnd=>81}, {:verbatim=>"ASTÉROCHEMIDAE", :scientificName=>"Astérochemidae", :offsetStart=>98, :offsetEnd=>111}, {:verbatim=>"STFROPHVTIDAE", :scientificName=>"Stfrophvtidae", :offsetStart=>128, :offsetEnd=>140}, {:verbatim=>"Asleronyx excavata", :scientificName=>"Asleronyx excavata", :offsetStart=>153, :offsetEnd=>170}]}
100
+ res.should == {:names=>[{:verbatim=>"Ophioihrix nidis", :scientificName=>"Ophioihrix nidis", :offsetStart=>26, :offsetEnd=>41}, {:verbatim=>"OPHTOMVXIDAE", :scientificName=>"OPHTOMVXIDAE", :offsetStart=>47, :offsetEnd=>58}, {:verbatim=>"Ophiocynodus", :scientificName=>"Ophiocynodus", :offsetStart=>70, :offsetEnd=>81}, {:verbatim=>"ASTÉROCHEMIDAE", :scientificName=>"ASTÉROCHEMIDAE", :offsetStart=>98, :offsetEnd=>111}, {:verbatim=>"STFROPHVTIDAE", :scientificName=>"STFROPHVTIDAE", :offsetStart=>128, :offsetEnd=>140}, {:verbatim=>"Asleronyx excavata", :scientificName=>"Asleronyx excavata", :offsetStart=>153, :offsetEnd=>170}]}
100
101
  end
101
102
 
102
103
  it "should not break NetiNeti results from processing OCR with | character in it" do
103
- text = "We need to make sure that Oph|oihrix nidis and OPHTOMVX|DAE will not break results"
104
104
  text = "We need to make sure that Oph|oihrix nidis and OPHTOMVX|DAE will not break results"
105
105
  res = @neti.find(text)
106
106
  res.should == {:names=>[{:verbatim=>"Ophloihrix nidis", :scientificName=>"Ophloihrix nidis", :offsetStart=>26, :offsetEnd=>41}]}
@@ -111,5 +111,18 @@ describe "NameSpotter" do
111
111
  res = @tf.find(text)
112
112
  res.should == {:names=>[{:verbatim=>"Plantago major", :scientificName=>"Plantago major", :offsetStart=>16, :offsetEnd=>29}, {:verbatim=>"Plantago quercus quercus quercus quercus quercus", :scientificName=>"Plantago quercus quercus quercus quercus quercus", :offsetStart=>225, :offsetEnd=>272}, {:verbatim=>"Pardosa moesta var. moesta f. moesta", :scientificName=>"Pardosa moesta var. moesta f. moesta", :offsetStart=>340, :offsetEnd=>375}]}
113
113
  end
114
+
115
+ it "should register situations where new name started and prev name is finished in the same cycle in TF" do
116
+ text = "What happens another called Pardosa moesta (Araneae: Lycosidae) is the species?"
117
+ res = @tf.find(text)
118
+ res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>29, :offsetEnd=>42}, {:verbatim=>"(Araneae:", :scientificName=>"Araneae", :offsetStart=>44, :offsetEnd=>52}, {:verbatim=>"Lycosidae)", :scientificName=>"Lycosidae", :offsetStart=>54, :offsetEnd=>63}]}
119
+ end
120
+
121
+ it "should ignore abbreviated genus before family for TaxonFinder" do
122
+ text = "What happens another called P. (LYCOSIDAE) is the species?"
123
+ res = @tf.find(text)
124
+ res[:names].size.should == 1
125
+ res.should == {:names=>[{:verbatim=>"(LYCOSIDAE)", :scientificName=>"Lycosidae", :offsetStart=>32, :offsetEnd=>42}]}
126
+ end
114
127
 
115
128
  end
data/tf_logic.txt ADDED
@@ -0,0 +1,72 @@
1
+ if the input does not contain 5 items:
2
+
3
+ return ||0||-1||
4
+ word list match - 0
5
+ score -1
6
+
7
+ if word is empty:
8
+
9
+ return ||0|$currentString|$wordListMatches||
10
+
11
+ where currentString is from input stripped
12
+ and wordListMatches is from input stripped
13
+ wordlist match 0
14
+ score - wordListMatch??
15
+
16
+ **********GENUS**********************
17
+ for Genus + species
18
+ if genus found and species has punctuation at the end (end of the name because of comma, end of the sentence etc.)
19
+
20
+ return ||0|$currentString|$wordListMatches||
21
+ where $wordListMatches is concatenation of strings, like 0S
22
+
23
+ if genus found and species, but not end of the sentence
24
+ $currentString $cleanCandidateWord|species|$wordListMatches"."$score||-1||
25
+ search continues... found species, can be subspecies
26
+
27
+ for abbreviated Genera (1-2 letters, where '-' can also be the first letter ???):
28
+ expand Genus if the last used genus is known with these letters
29
+ if abbrev did not make sense (genus is not found):
30
+ ||0||-1||
31
+ - result is nothing
32
+ Also means 2 letter genera are not found by NameFinder
33
+
34
+ if next word starts right after genus
35
+ if next word is potential abbr genus
36
+ ****$cleanCandidateWord|genus|0|$currentString|$wordListMatches||\n"
37
+ if next word is subgenus
38
+ if genus is abbreviated find it and extend
39
+ "$currentString ($cleanCandidateWord)|genus|$wordListMatches"."$scoreG||-1||\n"
40
+ if abbr not found start new name from 'subgenus' as genus this time
41
+ "$cleanCandidateWord|genus|$scoreG||-1||\n"
42
+ else return genus (subgenus)
43
+ "$currentString ($cleanCandidateWord)|genus|$wordListMatches"."$scoreG||-1||\n"
44
+ if next word is genus
45
+ ****"$cleanCandidateWord|genus|$scoreG|$currentString|$wordListMatches||\n"
46
+
47
+ if next word is family
48
+ if genus was abbreviated make it ""
49
+ ****"||0|$currentString|$wordListMatches"; empty string genus -- should be ignored
50
+ if genus and family -- return both as return_string1 and return_string2 respectively
51
+ ****||0|genus|genus_score|family|family_score
52
+
53
+ ***********SPECIES**********
54
+ if subspecies:
55
+ "$currentString $cleanCandidateWord|species|$wordListMatches"."$score||-1||\n"
56
+ currently number of infraspecies is unlimited
57
+ if rank
58
+ "$currentString $candidateWord|rank|$wordListMatches"."$score||-1||\n";
59
+ potentially also unlimited
60
+ if potential abbr genus
61
+ ****"$cleanCandidateWord|genus|0|$currentString|$wordListMatches||\n"
62
+ if genus
63
+ ****"$cleanCandidateWord|genus|$scoreG|$currentString|$wordListMatches||\n"
64
+ if family
65
+ ****||0|species|species|family|family_score
66
+
67
+
68
+
69
+
70
+
71
+
72
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: name-spotter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.11
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2012-06-13 00:00:00.000000000 Z
14
+ date: 2012-06-18 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rake
@@ -254,6 +254,7 @@ files:
254
254
  - spec/name-spotter_spec.rb
255
255
  - spec/scientific_name_spec.rb
256
256
  - spec/spec_helper.rb
257
+ - tf_logic.txt
257
258
  homepage: http://github.com/GlobalNamesArchitecture/name-spotter
258
259
  licenses:
259
260
  - MIT
@@ -269,7 +270,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
269
270
  version: '0'
270
271
  segments:
271
272
  - 0
272
- hash: -2251568119704049389
273
+ hash: 2125020200384233400
273
274
  required_rubygems_version: !ruby/object:Gem::Requirement
274
275
  none: false
275
276
  requirements: