name-spotter 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/name-spotter/taxon_finder_client.rb +25 -12
- data/name-spotter.gemspec +2 -2
- data/spec/name-spotter_spec.rb +12 -1
- metadata +3 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.0
|
1
|
+
0.2.1
|
data/lib/name-spotter/taxon_finder_client.rb
CHANGED
@@ -14,17 +14,20 @@ class NameSpotter
|
|
14
14
|
@current_string = ''
|
15
15
|
@current_string_state = ''
|
16
16
|
@word_list_matches = 0
|
17
|
-
@cursor = 8.times.inject([]) { |res| res << [''
|
17
|
+
@cursor = 8.times.inject([]) { |res| res << ['',0, 0] }
|
18
18
|
@current_index = nil
|
19
19
|
words = str.split(/\s/)
|
20
20
|
words.each do |word|
|
21
21
|
if word.empty?
|
22
|
-
@cursor[-1][
|
22
|
+
@cursor[-1][2] = @cursor[-1][2] + 1
|
23
23
|
else
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
24
|
+
abbr_no_space = word.match(/^([A-Z][a-z]?\.)([a-z|\p{Latin}]+)/)
|
25
|
+
if abbr_no_space
|
26
|
+
process_word(abbr_no_space[1], 0)
|
27
|
+
process_word(word[abbr_no_space[1].size..-1], 1)
|
28
|
+
else
|
29
|
+
process_word(word, 1)
|
30
|
+
end
|
28
31
|
end
|
29
32
|
end
|
30
33
|
socket.close
|
@@ -34,6 +37,13 @@ class NameSpotter
|
|
34
37
|
|
35
38
|
private
|
36
39
|
|
40
|
+
def process_word(word, word_separator_size)
|
41
|
+
cursor_entry = [word, @cursor[-1][0].size + @cursor[-1][1] + @cursor[-1][2], word_separator_size]
|
42
|
+
@cursor.shift
|
43
|
+
@cursor << cursor_entry
|
44
|
+
taxon_find(word)
|
45
|
+
end
|
46
|
+
|
37
47
|
def socket
|
38
48
|
@socket ||= TCPSocket.open @host, @port
|
39
49
|
end
|
@@ -51,7 +61,7 @@ class NameSpotter
|
|
51
61
|
next if scientific_string.empty?
|
52
62
|
add_name NameSpotter::ScientificName.new(verbatim_string, :start_position => start_position, :scientific_name => scientific_string)
|
53
63
|
end
|
54
|
-
@current_index = @current_string.empty? ? nil : @cursor[-1][
|
64
|
+
@current_index = @current_string.empty? ? nil : @cursor[-1][1]
|
55
65
|
end
|
56
66
|
end
|
57
67
|
|
@@ -62,7 +72,7 @@ class NameSpotter
|
|
62
72
|
@word_list_matches = word_list_matches
|
63
73
|
@return_score = return_score
|
64
74
|
if !@current_index && @current_string.size > 0
|
65
|
-
@current_index = @cursor[-1][
|
75
|
+
@current_index = @cursor[-1][1]
|
66
76
|
end
|
67
77
|
if not return_string.blank? or not return_string_2.blank?
|
68
78
|
OpenStruct.new( { :current_string => current_string,
|
@@ -83,11 +93,14 @@ class NameSpotter
|
|
83
93
|
str.force_encoding('utf-8')
|
84
94
|
start_position = verbatim_string = nil
|
85
95
|
if @current_index
|
86
|
-
start_position = is_return_string2 ? @cursor[-1][
|
87
|
-
|
88
|
-
|
96
|
+
start_position = is_return_string2 ? @cursor[-1][1] : @current_index
|
97
|
+
indices = @cursor.map { |item| item[1] }
|
98
|
+
verbatim_components = @cursor[indices.rindex(start_position)..-1]
|
99
|
+
sci_name_items_num = str.split(" ").size
|
100
|
+
verbatim_components = verbatim_components[0...sci_name_items_num]
|
101
|
+
verbatim_string = verbatim_components.map {|w| w[0] + (" " * w[2])}.join("").gsub(/[\.\,\!\;]*\s*$/, '')
|
89
102
|
else
|
90
|
-
verbatim_string, start_position = @cursor[-1]
|
103
|
+
verbatim_string, start_position, space_size = @cursor[-1]
|
91
104
|
end
|
92
105
|
scientific_string = str
|
93
106
|
[verbatim_string, scientific_string, start_position]
|
data/name-spotter.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "name-spotter"
|
8
|
-
s.version = "0.2.0"
|
8
|
+
s.version = "0.2.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Anthony Goddard", "Chuck Ha", "Dmitry Mozzherin"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-07-11"
|
13
13
|
s.description = "The gem searches for scientific names in texts using socket servers running TaxonFinder (by Patrick Leary) and NetiNeti (by Lakshmi Manohar Akella)"
|
14
14
|
s.email = "dmozzherin@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
data/spec/name-spotter_spec.rb
CHANGED
@@ -82,7 +82,7 @@ describe "NameSpotter" do
|
|
82
82
|
it "should properly handle abbreviated names found by taxonfinder" do
|
83
83
|
text = "Pardosa moesta Banks, 1892 is one spider, Schizocosa ocreata Keyserling, 1887 is a second and a third is Schizocosa saltatrix borealis. The abbreviations are P. moesta, S. ocreata, and S. saltatrix borealis is the third."
|
84
84
|
tf_res = @tf.find(text)
|
85
|
-
tf_res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>0, :offsetEnd=>13}, {:verbatim=>"Schizocosa ocreata", :scientificName=>"Schizocosa ocreata", :offsetStart=>42, :offsetEnd=>59}, {:verbatim=>"Schizocosa saltatrix borealis
|
85
|
+
tf_res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>0, :offsetEnd=>13}, {:verbatim=>"Schizocosa ocreata", :scientificName=>"Schizocosa ocreata", :offsetStart=>42, :offsetEnd=>59}, {:verbatim=>"Schizocosa saltatrix borealis", :scientificName=>"Schizocosa saltatrix borealis", :offsetStart=>105, :offsetEnd=>133}, {:verbatim=>"P. moesta", :scientificName=>"P[ardosa] moesta", :offsetStart=>158, :offsetEnd=>166}, {:verbatim=>"S. ocreata", :scientificName=>"S[chizocosa] ocreata", :offsetStart=>169, :offsetEnd=>178}, {:verbatim=>"S. saltatrix borealis", :scientificName=>"S[chizocosa] saltatrix borealis", :offsetStart=>185, :offsetEnd=>205}]}
|
86
86
|
end
|
87
87
|
|
88
88
|
it "should not make unsequential offsets on a page when using NetiNeti" do
|
@@ -111,6 +111,17 @@ describe "NameSpotter" do
|
|
111
111
|
res = @tf.find(text)
|
112
112
|
res.should == {:names=>[{:verbatim=>"Plantago major", :scientificName=>"Plantago major", :offsetStart=>16, :offsetEnd=>29}, {:verbatim=>"Plantago quercus quercus quercus quercus quercus", :scientificName=>"Plantago quercus quercus quercus quercus quercus", :offsetStart=>225, :offsetEnd=>272}, {:verbatim=>"Pardosa moesta var. moesta f. moesta", :scientificName=>"Pardosa moesta var. moesta f. moesta", :offsetStart=>340, :offsetEnd=>375}]}
|
113
113
|
end
|
114
|
+
|
115
|
+
it "should be able to recognize names like P.moesta by TaxonFinder" do
|
116
|
+
text = "Pardosa moesta! If we encounter Pardosa moesta and then P.modica another name I know is Xenopus laevis and also P.moesta. Again without space TaxonFinder should find both. And Plantago major foreva"
|
117
|
+
res = @tf.find(text)
|
118
|
+
res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>0, :offsetEnd=>13}, {:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>32, :offsetEnd=>45}, {:verbatim=>"P.modica", :scientificName=>"P[ardosa] modica", :offsetStart=>56, :offsetEnd=>63}, {:verbatim=>"Xenopus laevis", :scientificName=>"Xenopus laevis", :offsetStart=>88, :offsetEnd=>101}, {:verbatim=>"P.moesta", :scientificName=>"P[ardosa] moesta", :offsetStart=>112, :offsetEnd=>119}, {:verbatim=>"Plantago major", :scientificName=>"Plantago major", :offsetStart=>176, :offsetEnd=>189}]}
|
119
|
+
res[:names].map do |name|
|
120
|
+
verbatim = name[:verbatim]
|
121
|
+
found_name = text[name[:offsetStart]..name[:offsetEnd]]
|
122
|
+
found_name.should == verbatim
|
123
|
+
end
|
124
|
+
end
|
114
125
|
|
115
126
|
it "should register situations where new name started and prev name is finished in the same cycle in TF" do
|
116
127
|
text = "What happens another called Pardosa moesta (Araneae: Lycosidae) is the species?"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: name-spotter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2012-
|
14
|
+
date: 2012-07-11 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rake
|
@@ -270,7 +270,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
270
270
|
version: '0'
|
271
271
|
segments:
|
272
272
|
- 0
|
273
|
-
hash:
|
273
|
+
hash: 1101945023410070648
|
274
274
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
275
275
|
none: false
|
276
276
|
requirements:
|