name-spotter 0.1.11 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/name-spotter/scientific_name.rb +0 -2
- data/lib/name-spotter/taxon_finder_client.rb +11 -9
- data/name-spotter.gemspec +4 -3
- data/spec/name-spotter_spec.rb +16 -3
- data/tf_logic.txt +72 -0
- metadata +4 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.0
|
@@ -45,12 +45,13 @@ class NameSpotter
|
|
45
45
|
response = parse_socket_response(output)
|
46
46
|
return if not response
|
47
47
|
|
48
|
-
[response.return_string, response.return_string_2].
|
48
|
+
[response.return_string, response.return_string_2].each_with_index do |str, i|
|
49
49
|
next if !str || str.split(" ").size > 6
|
50
|
-
verbatim_string, scientific_string, start_position = process_response(str)
|
50
|
+
verbatim_string, scientific_string, start_position = process_response(str, i)
|
51
|
+
next if scientific_string.empty?
|
51
52
|
add_name NameSpotter::ScientificName.new(verbatim_string, :start_position => start_position, :scientific_name => scientific_string)
|
52
53
|
end
|
53
|
-
@current_index = nil
|
54
|
+
@current_index = @current_string.empty? ? nil : @cursor[-1][-1]
|
54
55
|
end
|
55
56
|
end
|
56
57
|
|
@@ -60,10 +61,10 @@ class NameSpotter
|
|
60
61
|
@current_string_state = current_string_state
|
61
62
|
@word_list_matches = word_list_matches
|
62
63
|
@return_score = return_score
|
63
|
-
if @current_string.size > 0
|
64
|
-
|
64
|
+
if !@current_index && @current_string.size > 0
|
65
|
+
@current_index = @cursor[-1][-1]
|
65
66
|
end
|
66
|
-
if not return_string.blank? or not return_string_2.blank?
|
67
|
+
if not return_string.blank? or not return_string_2.blank?
|
67
68
|
OpenStruct.new( { :current_string => current_string,
|
68
69
|
:current_string_state => current_string_state,
|
69
70
|
:word_list_matches => word_list_matches,
|
@@ -77,13 +78,14 @@ class NameSpotter
|
|
77
78
|
end
|
78
79
|
end
|
79
80
|
|
80
|
-
def process_response(str)
|
81
|
+
def process_response(str, index)
|
82
|
+
is_return_string2 = (index == 1)
|
81
83
|
str.force_encoding('utf-8')
|
82
84
|
start_position = verbatim_string = nil
|
83
85
|
if @current_index
|
84
|
-
start_position = @current_index
|
86
|
+
start_position = is_return_string2 ? @cursor[-1][-1] : @current_index
|
85
87
|
words, indices = @cursor.transpose
|
86
|
-
verbatim_string = str.include?("[") ? words[indices.index(start_position)..-1].join(" ") : words[indices.index(start_position)...-1].join(" ")
|
88
|
+
verbatim_string = (str.include?("[") || is_return_string2) ? words[indices.index(start_position)..-1].join(" ") : words[indices.index(start_position)...-1].join(" ")
|
87
89
|
else
|
88
90
|
verbatim_string, start_position = @cursor[-1]
|
89
91
|
end
|
data/name-spotter.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "name-spotter"
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Anthony Goddard", "Chuck Ha", "Dmitry Mozzherin"]
|
12
|
-
s.date = "2012-06-
|
12
|
+
s.date = "2012-06-18"
|
13
13
|
s.description = "The gem searches for scientific names in texts using socket servers running TaxonFinder (by Patrick Leary) and NetiNeti (by Lakshmi Manohar Akella)"
|
14
14
|
s.email = "dmozzherin@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -40,7 +40,8 @@ Gem::Specification.new do |s|
|
|
40
40
|
"spec/files/journalofentomol13pomo_0063.txt",
|
41
41
|
"spec/name-spotter_spec.rb",
|
42
42
|
"spec/scientific_name_spec.rb",
|
43
|
-
"spec/spec_helper.rb"
|
43
|
+
"spec/spec_helper.rb",
|
44
|
+
"tf_logic.txt"
|
44
45
|
]
|
45
46
|
s.homepage = "http://github.com/GlobalNamesArchitecture/name-spotter"
|
46
47
|
s.licenses = ["MIT"]
|
data/spec/name-spotter_spec.rb
CHANGED
@@ -93,14 +93,14 @@ describe "NameSpotter" do
|
|
93
93
|
offsets[0].should == 67
|
94
94
|
end
|
95
95
|
|
96
|
-
it "should normalize capitalization of found names" do
|
96
|
+
it "should not normalize capitalization of found names" do
|
97
|
+
#this is a problem we are aware of
|
97
98
|
text = "We need to make sure that Ophioihrix nidis and OPHTOMVXIDAE and also Ophiocynodus and especially ASTÉROCHEMIDAE and definitely STFROPHVTIDAE and may be Asleronyx excavata should all be capitalized correctly"
|
98
99
|
res = @neti.find(text)
|
99
|
-
res.should == {:names=>[{:verbatim=>"Ophioihrix nidis", :scientificName=>"Ophioihrix nidis", :offsetStart=>26, :offsetEnd=>41}, {:verbatim=>"OPHTOMVXIDAE", :scientificName=>"
|
100
|
+
res.should == {:names=>[{:verbatim=>"Ophioihrix nidis", :scientificName=>"Ophioihrix nidis", :offsetStart=>26, :offsetEnd=>41}, {:verbatim=>"OPHTOMVXIDAE", :scientificName=>"OPHTOMVXIDAE", :offsetStart=>47, :offsetEnd=>58}, {:verbatim=>"Ophiocynodus", :scientificName=>"Ophiocynodus", :offsetStart=>70, :offsetEnd=>81}, {:verbatim=>"ASTÉROCHEMIDAE", :scientificName=>"ASTÉROCHEMIDAE", :offsetStart=>98, :offsetEnd=>111}, {:verbatim=>"STFROPHVTIDAE", :scientificName=>"STFROPHVTIDAE", :offsetStart=>128, :offsetEnd=>140}, {:verbatim=>"Asleronyx excavata", :scientificName=>"Asleronyx excavata", :offsetStart=>153, :offsetEnd=>170}]}
|
100
101
|
end
|
101
102
|
|
102
103
|
it "should not break NetiNeti results from processing OCR with | character in it" do
|
103
|
-
text = "We need to make sure that Oph|oihrix nidis and OPHTOMVX|DAE will not break results"
|
104
104
|
text = "We need to make sure that Oph|oihrix nidis and OPHTOMVX|DAE will not break results"
|
105
105
|
res = @neti.find(text)
|
106
106
|
res.should == {:names=>[{:verbatim=>"Ophloihrix nidis", :scientificName=>"Ophloihrix nidis", :offsetStart=>26, :offsetEnd=>41}]}
|
@@ -111,5 +111,18 @@ describe "NameSpotter" do
|
|
111
111
|
res = @tf.find(text)
|
112
112
|
res.should == {:names=>[{:verbatim=>"Plantago major", :scientificName=>"Plantago major", :offsetStart=>16, :offsetEnd=>29}, {:verbatim=>"Plantago quercus quercus quercus quercus quercus", :scientificName=>"Plantago quercus quercus quercus quercus quercus", :offsetStart=>225, :offsetEnd=>272}, {:verbatim=>"Pardosa moesta var. moesta f. moesta", :scientificName=>"Pardosa moesta var. moesta f. moesta", :offsetStart=>340, :offsetEnd=>375}]}
|
113
113
|
end
|
114
|
+
|
115
|
+
it "should register situations where new name started and prev name is finished in the same cycle in TF" do
|
116
|
+
text = "What happens another called Pardosa moesta (Araneae: Lycosidae) is the species?"
|
117
|
+
res = @tf.find(text)
|
118
|
+
res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>29, :offsetEnd=>42}, {:verbatim=>"(Araneae:", :scientificName=>"Araneae", :offsetStart=>44, :offsetEnd=>52}, {:verbatim=>"Lycosidae)", :scientificName=>"Lycosidae", :offsetStart=>54, :offsetEnd=>63}]}
|
119
|
+
end
|
120
|
+
|
121
|
+
it "should ignore abbreviated genus before family for TaxonFinder" do
|
122
|
+
text = "What happens another called P. (LYCOSIDAE) is the species?"
|
123
|
+
res = @tf.find(text)
|
124
|
+
res[:names].size.should == 1
|
125
|
+
res.should == {:names=>[{:verbatim=>"(LYCOSIDAE)", :scientificName=>"Lycosidae", :offsetStart=>32, :offsetEnd=>42}]}
|
126
|
+
end
|
114
127
|
|
115
128
|
end
|
data/tf_logic.txt
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
if no 5 items input:
|
2
|
+
|
3
|
+
return ||0||-1||
|
4
|
+
word list match - 0
|
5
|
+
score -1
|
6
|
+
|
7
|
+
if word is empty:
|
8
|
+
|
9
|
+
return ||0|$currentString|$wordListMatches||
|
10
|
+
|
11
|
+
where currentString is from input stripped
|
12
|
+
and wordListMatches is from input stripped
|
13
|
+
wordlist match 0
|
14
|
+
score - wordListMatch??
|
15
|
+
|
16
|
+
**********GENUS**********************
|
17
|
+
for Genus + species
|
18
|
+
if genus found and species has punctuation at the end (end of the name because of comma, end of the sentence etc.)
|
19
|
+
|
20
|
+
return ||0|$currentString|$wordListMatches||
|
21
|
+
where $wordListMatches is a concatenation of strings, like 0S
|
22
|
+
|
23
|
+
if genus found and species, but not end of the sentence
|
24
|
+
$currentString $cleanCandidateWord|species|$wordListMatches"."$score||-1||
|
25
|
+
search continues... found species, can be subspecies
|
26
|
+
|
27
|
+
for abbreviated Genera (1-2 letters, where '-' can also be the first letter ???):
|
28
|
+
expand Genus if the last used genus is known with these letters
|
29
|
+
if abbrev did not make sense (genus is not found):
|
30
|
+
||0||-1||
|
31
|
+
- result is nothing
|
32
|
+
Also means 2 letter genera are not found by NameFinder
|
33
|
+
|
34
|
+
if next word starts right after genus
|
35
|
+
if next word is potential abbr genus
|
36
|
+
****$cleanCandidateWord|genus|0|$currentString|$wordListMatches||\n"
|
37
|
+
if next word is subgenus
|
38
|
+
if genus is abbreviated find it and extend
|
39
|
+
"$currentString ($cleanCandidateWord)|genus|$wordListMatches"."$scoreG||-1||\n"
|
40
|
+
if abbr not found start new name from 'subgenus' as genus this time
|
41
|
+
"$cleanCandidateWord|genus|$scoreG||-1||\n"
|
42
|
+
else return genus (subgenus)
|
43
|
+
"$currentString ($cleanCandidateWord)|genus|$wordListMatches"."$scoreG||-1||\n"
|
44
|
+
if next word is genus
|
45
|
+
****"$cleanCandidateWord|genus|$scoreG|$currentString|$wordListMatches||\n"
|
46
|
+
|
47
|
+
if next word is family
|
48
|
+
if genus was abbreviated make it ""
|
49
|
+
****"||0|$currentString|$wordListMatches"; empty string genus -- should be ignored
|
50
|
+
if genus and family -- return both as return_string1 and return_string2 respectively
|
51
|
+
****||0|genus|genus_score|family|family_score
|
52
|
+
|
53
|
+
***********SPECIES**********
|
54
|
+
if subspecies:
|
55
|
+
"$currentString $cleanCandidateWord|species|$wordListMatches"."$score||-1||\n"
|
56
|
+
currently the number of infraspecies is unlimited
|
57
|
+
if rank
|
58
|
+
"$currentString $candidateWord|rank|$wordListMatches"."$score||-1||\n";
|
59
|
+
potentially also unlimited
|
60
|
+
if potential abbr genus
|
61
|
+
****"$cleanCandidateWord|genus|0|$currentString|$wordListMatches||\n"
|
62
|
+
if genus
|
63
|
+
****"$cleanCandidateWord|genus|$scoreG|$currentString|$wordListMatches||\n"
|
64
|
+
if family
|
65
|
+
****||0|species|species|family|family_score
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
|
70
|
+
|
71
|
+
|
72
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: name-spotter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2012-06-
|
14
|
+
date: 2012-06-18 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rake
|
@@ -254,6 +254,7 @@ files:
|
|
254
254
|
- spec/name-spotter_spec.rb
|
255
255
|
- spec/scientific_name_spec.rb
|
256
256
|
- spec/spec_helper.rb
|
257
|
+
- tf_logic.txt
|
257
258
|
homepage: http://github.com/GlobalNamesArchitecture/name-spotter
|
258
259
|
licenses:
|
259
260
|
- MIT
|
@@ -269,7 +270,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
269
270
|
version: '0'
|
270
271
|
segments:
|
271
272
|
- 0
|
272
|
-
hash:
|
273
|
+
hash: 2125020200384233400
|
273
274
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
274
275
|
none: false
|
275
276
|
requirements:
|