name-spotter 0.1.11 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/name-spotter/scientific_name.rb +0 -2
- data/lib/name-spotter/taxon_finder_client.rb +11 -9
- data/name-spotter.gemspec +4 -3
- data/spec/name-spotter_spec.rb +16 -3
- data/tf_logic.txt +72 -0
- metadata +4 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.11
|
1
|
+
0.2.0
|
@@ -45,12 +45,13 @@ class NameSpotter
|
|
45
45
|
response = parse_socket_response(output)
|
46
46
|
return if not response
|
47
47
|
|
48
|
-
[response.return_string, response.return_string_2].
|
48
|
+
[response.return_string, response.return_string_2].each_with_index do |str, i|
|
49
49
|
next if !str || str.split(" ").size > 6
|
50
|
-
verbatim_string, scientific_string, start_position = process_response(str)
|
50
|
+
verbatim_string, scientific_string, start_position = process_response(str, i)
|
51
|
+
next if scientific_string.empty?
|
51
52
|
add_name NameSpotter::ScientificName.new(verbatim_string, :start_position => start_position, :scientific_name => scientific_string)
|
52
53
|
end
|
53
|
-
@current_index = nil
|
54
|
+
@current_index = @current_string.empty? ? nil : @cursor[-1][-1]
|
54
55
|
end
|
55
56
|
end
|
56
57
|
|
@@ -60,10 +61,10 @@ class NameSpotter
|
|
60
61
|
@current_string_state = current_string_state
|
61
62
|
@word_list_matches = word_list_matches
|
62
63
|
@return_score = return_score
|
63
|
-
if @current_string.size > 0
|
64
|
-
|
64
|
+
if !@current_index && @current_string.size > 0
|
65
|
+
@current_index = @cursor[-1][-1]
|
65
66
|
end
|
66
|
-
if not return_string.blank? or not return_string_2.blank?
|
67
|
+
if not return_string.blank? or not return_string_2.blank?
|
67
68
|
OpenStruct.new( { :current_string => current_string,
|
68
69
|
:current_string_state => current_string_state,
|
69
70
|
:word_list_matches => word_list_matches,
|
@@ -77,13 +78,14 @@ class NameSpotter
|
|
77
78
|
end
|
78
79
|
end
|
79
80
|
|
80
|
-
def process_response(str)
|
81
|
+
def process_response(str, index)
|
82
|
+
is_return_string2 = (index == 1)
|
81
83
|
str.force_encoding('utf-8')
|
82
84
|
start_position = verbatim_string = nil
|
83
85
|
if @current_index
|
84
|
-
start_position = @current_index
|
86
|
+
start_position = is_return_string2 ? @cursor[-1][-1] : @current_index
|
85
87
|
words, indices = @cursor.transpose
|
86
|
-
verbatim_string = str.include?("[") ? words[indices.index(start_position)..-1].join(" ") : words[indices.index(start_position)...-1].join(" ")
|
88
|
+
verbatim_string = (str.include?("[") || is_return_string2) ? words[indices.index(start_position)..-1].join(" ") : words[indices.index(start_position)...-1].join(" ")
|
87
89
|
else
|
88
90
|
verbatim_string, start_position = @cursor[-1]
|
89
91
|
end
|
data/name-spotter.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "name-spotter"
|
8
|
-
s.version = "0.1.11"
|
8
|
+
s.version = "0.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Anthony Goddard", "Chuck Ha", "Dmitry Mozzherin"]
|
12
|
-
s.date = "2012-06-
|
12
|
+
s.date = "2012-06-18"
|
13
13
|
s.description = "The gem searches for scientific names in texts using socket servers running TaxonFinder (by Patrick Leary) and NetiNeti (by Lakshmi Manohar Akella)"
|
14
14
|
s.email = "dmozzherin@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -40,7 +40,8 @@ Gem::Specification.new do |s|
|
|
40
40
|
"spec/files/journalofentomol13pomo_0063.txt",
|
41
41
|
"spec/name-spotter_spec.rb",
|
42
42
|
"spec/scientific_name_spec.rb",
|
43
|
-
"spec/spec_helper.rb"
|
43
|
+
"spec/spec_helper.rb",
|
44
|
+
"tf_logic.txt"
|
44
45
|
]
|
45
46
|
s.homepage = "http://github.com/GlobalNamesArchitecture/name-spotter"
|
46
47
|
s.licenses = ["MIT"]
|
data/spec/name-spotter_spec.rb
CHANGED
@@ -93,14 +93,14 @@ describe "NameSpotter" do
|
|
93
93
|
offsets[0].should == 67
|
94
94
|
end
|
95
95
|
|
96
|
-
it "should normalize capitalization of found names" do
|
96
|
+
it "should not normalize capitalization of found names" do
|
97
|
+
#this is a problem we are aware of
|
97
98
|
text = "We need to make sure that Ophioihrix nidis and OPHTOMVXIDAE and also Ophiocynodus and especially ASTÉROCHEMIDAE and definitely STFROPHVTIDAE and may be Asleronyx excavata should all be capitalized correctly"
|
98
99
|
res = @neti.find(text)
|
99
|
-
res.should == {:names=>[{:verbatim=>"Ophioihrix nidis", :scientificName=>"Ophioihrix nidis", :offsetStart=>26, :offsetEnd=>41}, {:verbatim=>"OPHTOMVXIDAE", :scientificName=>"
|
100
|
+
res.should == {:names=>[{:verbatim=>"Ophioihrix nidis", :scientificName=>"Ophioihrix nidis", :offsetStart=>26, :offsetEnd=>41}, {:verbatim=>"OPHTOMVXIDAE", :scientificName=>"OPHTOMVXIDAE", :offsetStart=>47, :offsetEnd=>58}, {:verbatim=>"Ophiocynodus", :scientificName=>"Ophiocynodus", :offsetStart=>70, :offsetEnd=>81}, {:verbatim=>"ASTÉROCHEMIDAE", :scientificName=>"ASTÉROCHEMIDAE", :offsetStart=>98, :offsetEnd=>111}, {:verbatim=>"STFROPHVTIDAE", :scientificName=>"STFROPHVTIDAE", :offsetStart=>128, :offsetEnd=>140}, {:verbatim=>"Asleronyx excavata", :scientificName=>"Asleronyx excavata", :offsetStart=>153, :offsetEnd=>170}]}
|
100
101
|
end
|
101
102
|
|
102
103
|
it "should not break NetiNeti results from processing OCR with | character in it" do
|
103
|
-
text = "We need to make sure that Oph|oihrix nidis and OPHTOMVX|DAE will not break results"
|
104
104
|
text = "We need to make sure that Oph|oihrix nidis and OPHTOMVX|DAE will not break results"
|
105
105
|
res = @neti.find(text)
|
106
106
|
res.should == {:names=>[{:verbatim=>"Ophloihrix nidis", :scientificName=>"Ophloihrix nidis", :offsetStart=>26, :offsetEnd=>41}]}
|
@@ -111,5 +111,18 @@ describe "NameSpotter" do
|
|
111
111
|
res = @tf.find(text)
|
112
112
|
res.should == {:names=>[{:verbatim=>"Plantago major", :scientificName=>"Plantago major", :offsetStart=>16, :offsetEnd=>29}, {:verbatim=>"Plantago quercus quercus quercus quercus quercus", :scientificName=>"Plantago quercus quercus quercus quercus quercus", :offsetStart=>225, :offsetEnd=>272}, {:verbatim=>"Pardosa moesta var. moesta f. moesta", :scientificName=>"Pardosa moesta var. moesta f. moesta", :offsetStart=>340, :offsetEnd=>375}]}
|
113
113
|
end
|
114
|
+
|
115
|
+
it "should register situations where new name started and prev name is finished in the same cycle in TF" do
|
116
|
+
text = "What happens another called Pardosa moesta (Araneae: Lycosidae) is the species?"
|
117
|
+
res = @tf.find(text)
|
118
|
+
res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>29, :offsetEnd=>42}, {:verbatim=>"(Araneae:", :scientificName=>"Araneae", :offsetStart=>44, :offsetEnd=>52}, {:verbatim=>"Lycosidae)", :scientificName=>"Lycosidae", :offsetStart=>54, :offsetEnd=>63}]}
|
119
|
+
end
|
120
|
+
|
121
|
+
it "should ignore abbreviated genus before family for TaxonFinder" do
|
122
|
+
text = "What happens another called P. (LYCOSIDAE) is the species?"
|
123
|
+
res = @tf.find(text)
|
124
|
+
res[:names].size.should == 1
|
125
|
+
res.should == {:names=>[{:verbatim=>"(LYCOSIDAE)", :scientificName=>"Lycosidae", :offsetStart=>32, :offsetEnd=>42}]}
|
126
|
+
end
|
114
127
|
|
115
128
|
end
|
data/tf_logic.txt
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
if no 5 items input:
|
2
|
+
|
3
|
+
return ||0||-1||
|
4
|
+
word list match - 0
|
5
|
+
score -1
|
6
|
+
|
7
|
+
if word is empty:
|
8
|
+
|
9
|
+
return ||0|$currentString|$wordListMatches||
|
10
|
+
|
11
|
+
where currentString is from input stripped
|
12
|
+
and wordListMatches is from input stripped
|
13
|
+
wordlist match 0
|
14
|
+
score - wordListMatch??
|
15
|
+
|
16
|
+
**********GENUS**********************
|
17
|
+
for Genus + species
|
18
|
+
if genus found and species has punctuation at the end (end of the name because of comma, end of the sentence etc.)
|
19
|
+
|
20
|
+
return ||0|$currentString|$wordListMatches||
|
21
|
+
where $wordListMatches is a concatenation of strings, like 0S
|
22
|
+
|
23
|
+
if genus found and species, but not end of the sentence
|
24
|
+
$currentString $cleanCandidateWord|species|$wordListMatches"."$score||-1||
|
25
|
+
search continues... found species, can be subspecies
|
26
|
+
|
27
|
+
for abbreviated Genera (1-2 letters (where '-' can also be the first letter ???):
|
28
|
+
expand Genus if last used genus is known with these letters
|
29
|
+
if abbrev did not make sense (genus is not found):
|
30
|
+
||0||-1||
|
31
|
+
- result is nothing
|
32
|
+
Also means 2 letter genera are not found by NameFinder
|
33
|
+
|
34
|
+
if next word starts right after genus
|
35
|
+
if next word is potential abbr genus
|
36
|
+
****$cleanCandidateWord|genus|0|$currentString|$wordListMatches||\n"
|
37
|
+
if next word is subgenus
|
38
|
+
if genus is abbreviated find it and extend
|
39
|
+
"$currentString ($cleanCandidateWord)|genus|$wordListMatches"."$scoreG||-1||\n"
|
40
|
+
if abbr not found start new name from 'subgenus' as genus this time
|
41
|
+
"$cleanCandidateWord|genus|$scoreG||-1||\n"
|
42
|
+
else return genus (subgenus)
|
43
|
+
"$currentString ($cleanCandidateWord)|genus|$wordListMatches"."$scoreG||-1||\n"
|
44
|
+
if next word is genus
|
45
|
+
****"$cleanCandidateWord|genus|$scoreG|$currentString|$wordListMatches||\n"
|
46
|
+
|
47
|
+
if next word is family
|
48
|
+
if genus was abbreviated make it ""
|
49
|
+
****"||0|$currentString|$wordListMatches"; empty string genus -- should be ignored
|
50
|
+
if genus and family -- return both as return_string1 and return_string2 respectively
|
51
|
+
****||0|genus|genus_score|family|family_score
|
52
|
+
|
53
|
+
***********SPECIES**********
|
54
|
+
if subspecies:
|
55
|
+
"$currentString $cleanCandidateWord|species|$wordListMatches"."$score||-1||\n"
|
56
|
+
currently the number of infraspecies is unlimited
|
57
|
+
if rank
|
58
|
+
"$currentString $candidateWord|rank|$wordListMatches"."$score||-1||\n";
|
59
|
+
potentially also unlimited
|
60
|
+
if potential abbr genus
|
61
|
+
****"$cleanCandidateWord|genus|0|$currentString|$wordListMatches||\n"
|
62
|
+
if genus
|
63
|
+
****"$cleanCandidateWord|genus|$scoreG|$currentString|$wordListMatches||\n"
|
64
|
+
if family
|
65
|
+
****||0|species|species|family|family_score
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
|
70
|
+
|
71
|
+
|
72
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: name-spotter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.11
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2012-06-
|
14
|
+
date: 2012-06-18 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rake
|
@@ -254,6 +254,7 @@ files:
|
|
254
254
|
- spec/name-spotter_spec.rb
|
255
255
|
- spec/scientific_name_spec.rb
|
256
256
|
- spec/spec_helper.rb
|
257
|
+
- tf_logic.txt
|
257
258
|
homepage: http://github.com/GlobalNamesArchitecture/name-spotter
|
258
259
|
licenses:
|
259
260
|
- MIT
|
@@ -269,7 +270,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
269
270
|
version: '0'
|
270
271
|
segments:
|
271
272
|
- 0
|
272
|
-
hash:
|
273
|
+
hash: 2125020200384233400
|
273
274
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
274
275
|
none: false
|
275
276
|
requirements:
|