name-spotter 0.2.0 → 0.2.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.0
1
+ 0.2.1
@@ -14,17 +14,20 @@ class NameSpotter
14
14
  @current_string = ''
15
15
  @current_string_state = ''
16
16
  @word_list_matches = 0
17
- @cursor = 8.times.inject([]) { |res| res << ['',-1] }
17
+ @cursor = 8.times.inject([]) { |res| res << ['',0, 0] }
18
18
  @current_index = nil
19
19
  words = str.split(/\s/)
20
20
  words.each do |word|
21
21
  if word.empty?
22
- @cursor[-1][0] << " "
22
+ @cursor[-1][2] = @cursor[-1][2] + 1
23
23
  else
24
- cursor_entry = [word, 1 + @cursor[-1][0].size + @cursor[-1][-1]]
25
- @cursor.shift
26
- @cursor << cursor_entry
27
- taxon_find(word)
24
+ abbr_no_space = word.match(/^([A-Z][a-z]?\.)([a-z|\p{Latin}]+)/)
25
+ if abbr_no_space
26
+ process_word(abbr_no_space[1], 0)
27
+ process_word(word[abbr_no_space[1].size..-1], 1)
28
+ else
29
+ process_word(word, 1)
30
+ end
28
31
  end
29
32
  end
30
33
  socket.close
@@ -34,6 +37,13 @@ class NameSpotter
34
37
 
35
38
  private
36
39
 
40
+ def process_word(word, word_separator_size)
41
+ cursor_entry = [word, @cursor[-1][0].size + @cursor[-1][1] + @cursor[-1][2], word_separator_size]
42
+ @cursor.shift
43
+ @cursor << cursor_entry
44
+ taxon_find(word)
45
+ end
46
+
37
47
  def socket
38
48
  @socket ||= TCPSocket.open @host, @port
39
49
  end
@@ -51,7 +61,7 @@ class NameSpotter
51
61
  next if scientific_string.empty?
52
62
  add_name NameSpotter::ScientificName.new(verbatim_string, :start_position => start_position, :scientific_name => scientific_string)
53
63
  end
54
- @current_index = @current_string.empty? ? nil : @cursor[-1][-1]
64
+ @current_index = @current_string.empty? ? nil : @cursor[-1][1]
55
65
  end
56
66
  end
57
67
 
@@ -62,7 +72,7 @@ class NameSpotter
62
72
  @word_list_matches = word_list_matches
63
73
  @return_score = return_score
64
74
  if !@current_index && @current_string.size > 0
65
- @current_index = @cursor[-1][-1]
75
+ @current_index = @cursor[-1][1]
66
76
  end
67
77
  if not return_string.blank? or not return_string_2.blank?
68
78
  OpenStruct.new( { :current_string => current_string,
@@ -83,11 +93,14 @@ class NameSpotter
83
93
  str.force_encoding('utf-8')
84
94
  start_position = verbatim_string = nil
85
95
  if @current_index
86
- start_position = is_return_string2 ? @cursor[-1][-1] : @current_index
87
- words, indices = @cursor.transpose
88
- verbatim_string = (str.include?("[") || is_return_string2) ? words[indices.index(start_position)..-1].join(" ") : words[indices.index(start_position)...-1].join(" ")
96
+ start_position = is_return_string2 ? @cursor[-1][1] : @current_index
97
+ indices = @cursor.map { |item| item[1] }
98
+ verbatim_components = @cursor[indices.rindex(start_position)..-1]
99
+ sci_name_items_num = str.split(" ").size
100
+ verbatim_components = verbatim_components[0...sci_name_items_num]
101
+ verbatim_string = verbatim_components.map {|w| w[0] + (" " * w[2])}.join("").gsub(/[\.\,\!\;]*\s*$/, '')
89
102
  else
90
- verbatim_string, start_position = @cursor[-1]
103
+ verbatim_string, start_position, space_size = @cursor[-1]
91
104
  end
92
105
  scientific_string = str
93
106
  [verbatim_string, scientific_string, start_position]
data/name-spotter.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "name-spotter"
8
- s.version = "0.2.0"
8
+ s.version = "0.2.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Anthony Goddard", "Chuck Ha", "Dmitry Mozzherin"]
12
- s.date = "2012-06-18"
12
+ s.date = "2012-07-11"
13
13
  s.description = "The gem searches for scientific names in texts using socket servers running TaxonFinder (by Patrick Leary) and NetiNeti (by Lakshmi Manohar Akella)"
14
14
  s.email = "dmozzherin@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -82,7 +82,7 @@ describe "NameSpotter" do
82
82
  it "should properly handle abbreviated names found by taxonfinder" do
83
83
  text = "Pardosa moesta Banks, 1892 is one spider, Schizocosa ocreata Keyserling, 1887 is a second and a third is Schizocosa saltatrix borealis. The abbreviations are P. moesta, S. ocreata, and S. saltatrix borealis is the third."
84
84
  tf_res = @tf.find(text)
85
- tf_res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>0, :offsetEnd=>13}, {:verbatim=>"Schizocosa ocreata", :scientificName=>"Schizocosa ocreata", :offsetStart=>42, :offsetEnd=>59}, {:verbatim=>"Schizocosa saltatrix borealis.", :scientificName=>"Schizocosa saltatrix borealis", :offsetStart=>105, :offsetEnd=>134}, {:verbatim=>"P. moesta,", :scientificName=>"P[ardosa] moesta", :offsetStart=>158, :offsetEnd=>167}, {:verbatim=>"S. ocreata,", :scientificName=>"S[chizocosa] ocreata", :offsetStart=>169, :offsetEnd=>179}, {:verbatim=>"S. saltatrix borealis is", :scientificName=>"S[chizocosa] saltatrix borealis", :offsetStart=>185, :offsetEnd=>208}]}
85
+ tf_res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>0, :offsetEnd=>13}, {:verbatim=>"Schizocosa ocreata", :scientificName=>"Schizocosa ocreata", :offsetStart=>42, :offsetEnd=>59}, {:verbatim=>"Schizocosa saltatrix borealis", :scientificName=>"Schizocosa saltatrix borealis", :offsetStart=>105, :offsetEnd=>133}, {:verbatim=>"P. moesta", :scientificName=>"P[ardosa] moesta", :offsetStart=>158, :offsetEnd=>166}, {:verbatim=>"S. ocreata", :scientificName=>"S[chizocosa] ocreata", :offsetStart=>169, :offsetEnd=>178}, {:verbatim=>"S. saltatrix borealis", :scientificName=>"S[chizocosa] saltatrix borealis", :offsetStart=>185, :offsetEnd=>205}]}
86
86
  end
87
87
 
88
88
  it "should not make unsequential offsets on a page when using NetiNeti" do
@@ -111,6 +111,17 @@ describe "NameSpotter" do
111
111
  res = @tf.find(text)
112
112
  res.should == {:names=>[{:verbatim=>"Plantago major", :scientificName=>"Plantago major", :offsetStart=>16, :offsetEnd=>29}, {:verbatim=>"Plantago quercus quercus quercus quercus quercus", :scientificName=>"Plantago quercus quercus quercus quercus quercus", :offsetStart=>225, :offsetEnd=>272}, {:verbatim=>"Pardosa moesta var. moesta f. moesta", :scientificName=>"Pardosa moesta var. moesta f. moesta", :offsetStart=>340, :offsetEnd=>375}]}
113
113
  end
114
+
115
+ it "should be able to recognize names like P.moesta by TaxonFinder" do
116
+ text = "Pardosa moesta! If we encounter Pardosa moesta and then P.modica another name I know is Xenopus laevis and also P.moesta. Again without space TaxonFinder should find both. And Plantago major foreva"
117
+ res = @tf.find(text)
118
+ res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>0, :offsetEnd=>13}, {:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>32, :offsetEnd=>45}, {:verbatim=>"P.modica", :scientificName=>"P[ardosa] modica", :offsetStart=>56, :offsetEnd=>63}, {:verbatim=>"Xenopus laevis", :scientificName=>"Xenopus laevis", :offsetStart=>88, :offsetEnd=>101}, {:verbatim=>"P.moesta", :scientificName=>"P[ardosa] moesta", :offsetStart=>112, :offsetEnd=>119}, {:verbatim=>"Plantago major", :scientificName=>"Plantago major", :offsetStart=>176, :offsetEnd=>189}]}
119
+ res[:names].map do |name|
120
+ verbatim = name[:verbatim]
121
+ found_name = text[name[:offsetStart]..name[:offsetEnd]]
122
+ found_name.should == verbatim
123
+ end
124
+ end
114
125
 
115
126
  it "should register situations where new name started and prev name is finished in the same cycle in TF" do
116
127
  text = "What happens another called Pardosa moesta (Araneae: Lycosidae) is the species?"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: name-spotter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2012-06-18 00:00:00.000000000 Z
14
+ date: 2012-07-11 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rake
@@ -270,7 +270,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
270
270
  version: '0'
271
271
  segments:
272
272
  - 0
273
- hash: 2125020200384233400
273
+ hash: 1101945023410070648
274
274
  required_rubygems_version: !ruby/object:Gem::Requirement
275
275
  none: false
276
276
  requirements: