name-spotter 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/name-spotter/taxon_finder_client.rb +25 -12
- data/name-spotter.gemspec +2 -2
- data/spec/name-spotter_spec.rb +12 -1
- metadata +3 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.0
|
1
|
+
0.2.1
|
@@ -14,17 +14,20 @@ class NameSpotter
|
|
14
14
|
@current_string = ''
|
15
15
|
@current_string_state = ''
|
16
16
|
@word_list_matches = 0
|
17
|
-
@cursor = 8.times.inject([]) { |res| res << [''
|
17
|
+
@cursor = 8.times.inject([]) { |res| res << ['',0, 0] }
|
18
18
|
@current_index = nil
|
19
19
|
words = str.split(/\s/)
|
20
20
|
words.each do |word|
|
21
21
|
if word.empty?
|
22
|
-
@cursor[-1][
|
22
|
+
@cursor[-1][2] = @cursor[-1][2] + 1
|
23
23
|
else
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
24
|
+
abbr_no_space = word.match(/^([A-Z][a-z]?\.)([a-z|\p{Latin}]+)/)
|
25
|
+
if abbr_no_space
|
26
|
+
process_word(abbr_no_space[1], 0)
|
27
|
+
process_word(word[abbr_no_space[1].size..-1], 1)
|
28
|
+
else
|
29
|
+
process_word(word, 1)
|
30
|
+
end
|
28
31
|
end
|
29
32
|
end
|
30
33
|
socket.close
|
@@ -34,6 +37,13 @@ class NameSpotter
|
|
34
37
|
|
35
38
|
private
|
36
39
|
|
40
|
+
def process_word(word, word_separator_size)
|
41
|
+
cursor_entry = [word, @cursor[-1][0].size + @cursor[-1][1] + @cursor[-1][2], word_separator_size]
|
42
|
+
@cursor.shift
|
43
|
+
@cursor << cursor_entry
|
44
|
+
taxon_find(word)
|
45
|
+
end
|
46
|
+
|
37
47
|
def socket
|
38
48
|
@socket ||= TCPSocket.open @host, @port
|
39
49
|
end
|
@@ -51,7 +61,7 @@ class NameSpotter
|
|
51
61
|
next if scientific_string.empty?
|
52
62
|
add_name NameSpotter::ScientificName.new(verbatim_string, :start_position => start_position, :scientific_name => scientific_string)
|
53
63
|
end
|
54
|
-
@current_index = @current_string.empty? ? nil : @cursor[-1][
|
64
|
+
@current_index = @current_string.empty? ? nil : @cursor[-1][1]
|
55
65
|
end
|
56
66
|
end
|
57
67
|
|
@@ -62,7 +72,7 @@ class NameSpotter
|
|
62
72
|
@word_list_matches = word_list_matches
|
63
73
|
@return_score = return_score
|
64
74
|
if !@current_index && @current_string.size > 0
|
65
|
-
@current_index = @cursor[-1][
|
75
|
+
@current_index = @cursor[-1][1]
|
66
76
|
end
|
67
77
|
if not return_string.blank? or not return_string_2.blank?
|
68
78
|
OpenStruct.new( { :current_string => current_string,
|
@@ -83,11 +93,14 @@ class NameSpotter
|
|
83
93
|
str.force_encoding('utf-8')
|
84
94
|
start_position = verbatim_string = nil
|
85
95
|
if @current_index
|
86
|
-
start_position = is_return_string2 ? @cursor[-1][
|
87
|
-
|
88
|
-
|
96
|
+
start_position = is_return_string2 ? @cursor[-1][1] : @current_index
|
97
|
+
indices = @cursor.map { |item| item[1] }
|
98
|
+
verbatim_components = @cursor[indices.rindex(start_position)..-1]
|
99
|
+
sci_name_items_num = str.split(" ").size
|
100
|
+
verbatim_components = verbatim_components[0...sci_name_items_num]
|
101
|
+
verbatim_string = verbatim_components.map {|w| w[0] + (" " * w[2])}.join("").gsub(/[\.\,\!\;]*\s*$/, '')
|
89
102
|
else
|
90
|
-
verbatim_string, start_position = @cursor[-1]
|
103
|
+
verbatim_string, start_position, space_size = @cursor[-1]
|
91
104
|
end
|
92
105
|
scientific_string = str
|
93
106
|
[verbatim_string, scientific_string, start_position]
|
data/name-spotter.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "name-spotter"
|
8
|
-
s.version = "0.2.0"
|
8
|
+
s.version = "0.2.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Anthony Goddard", "Chuck Ha", "Dmitry Mozzherin"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-07-11"
|
13
13
|
s.description = "The gem searches for scientific names in texts using socket servers running TaxonFinder (by Patrick Leary) and NetiNeti (by Lakshmi Manohar Akella)"
|
14
14
|
s.email = "dmozzherin@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
data/spec/name-spotter_spec.rb
CHANGED
@@ -82,7 +82,7 @@ describe "NameSpotter" do
|
|
82
82
|
it "should properly handle abbreviated names found by taxonfinder" do
|
83
83
|
text = "Pardosa moesta Banks, 1892 is one spider, Schizocosa ocreata Keyserling, 1887 is a second and a third is Schizocosa saltatrix borealis. The abbreviations are P. moesta, S. ocreata, and S. saltatrix borealis is the third."
|
84
84
|
tf_res = @tf.find(text)
|
85
|
-
tf_res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>0, :offsetEnd=>13}, {:verbatim=>"Schizocosa ocreata", :scientificName=>"Schizocosa ocreata", :offsetStart=>42, :offsetEnd=>59}, {:verbatim=>"Schizocosa saltatrix borealis
|
85
|
+
tf_res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>0, :offsetEnd=>13}, {:verbatim=>"Schizocosa ocreata", :scientificName=>"Schizocosa ocreata", :offsetStart=>42, :offsetEnd=>59}, {:verbatim=>"Schizocosa saltatrix borealis", :scientificName=>"Schizocosa saltatrix borealis", :offsetStart=>105, :offsetEnd=>133}, {:verbatim=>"P. moesta", :scientificName=>"P[ardosa] moesta", :offsetStart=>158, :offsetEnd=>166}, {:verbatim=>"S. ocreata", :scientificName=>"S[chizocosa] ocreata", :offsetStart=>169, :offsetEnd=>178}, {:verbatim=>"S. saltatrix borealis", :scientificName=>"S[chizocosa] saltatrix borealis", :offsetStart=>185, :offsetEnd=>205}]}
|
86
86
|
end
|
87
87
|
|
88
88
|
it "should not make unsequential offsets on a page when using NetiNeti" do
|
@@ -111,6 +111,17 @@ describe "NameSpotter" do
|
|
111
111
|
res = @tf.find(text)
|
112
112
|
res.should == {:names=>[{:verbatim=>"Plantago major", :scientificName=>"Plantago major", :offsetStart=>16, :offsetEnd=>29}, {:verbatim=>"Plantago quercus quercus quercus quercus quercus", :scientificName=>"Plantago quercus quercus quercus quercus quercus", :offsetStart=>225, :offsetEnd=>272}, {:verbatim=>"Pardosa moesta var. moesta f. moesta", :scientificName=>"Pardosa moesta var. moesta f. moesta", :offsetStart=>340, :offsetEnd=>375}]}
|
113
113
|
end
|
114
|
+
|
115
|
+
it "should be able to recognize names like P.moesta by TaxonFinder" do
|
116
|
+
text = "Pardosa moesta! If we encounter Pardosa moesta and then P.modica another name I know is Xenopus laevis and also P.moesta. Again without space TaxonFinder should find both. And Plantago major foreva"
|
117
|
+
res = @tf.find(text)
|
118
|
+
res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>0, :offsetEnd=>13}, {:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>32, :offsetEnd=>45}, {:verbatim=>"P.modica", :scientificName=>"P[ardosa] modica", :offsetStart=>56, :offsetEnd=>63}, {:verbatim=>"Xenopus laevis", :scientificName=>"Xenopus laevis", :offsetStart=>88, :offsetEnd=>101}, {:verbatim=>"P.moesta", :scientificName=>"P[ardosa] moesta", :offsetStart=>112, :offsetEnd=>119}, {:verbatim=>"Plantago major", :scientificName=>"Plantago major", :offsetStart=>176, :offsetEnd=>189}]}
|
119
|
+
res[:names].map do |name|
|
120
|
+
verbatim = name[:verbatim]
|
121
|
+
found_name = text[name[:offsetStart]..name[:offsetEnd]]
|
122
|
+
found_name.should == verbatim
|
123
|
+
end
|
124
|
+
end
|
114
125
|
|
115
126
|
it "should register situations where new name started and prev name is finished in the same cycle in TF" do
|
116
127
|
text = "What happens another called Pardosa moesta (Araneae: Lycosidae) is the species?"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: name-spotter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2012-
|
14
|
+
date: 2012-07-11 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rake
|
@@ -270,7 +270,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
270
270
|
version: '0'
|
271
271
|
segments:
|
272
272
|
- 0
|
273
|
-
hash:
|
273
|
+
hash: 1101945023410070648
|
274
274
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
275
275
|
none: false
|
276
276
|
requirements:
|