name-spotter 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.0
1
+ 0.2.1
@@ -14,17 +14,20 @@ class NameSpotter
14
14
  @current_string = ''
15
15
  @current_string_state = ''
16
16
  @word_list_matches = 0
17
- @cursor = 8.times.inject([]) { |res| res << ['',-1] }
17
+ @cursor = 8.times.inject([]) { |res| res << ['',0, 0] }
18
18
  @current_index = nil
19
19
  words = str.split(/\s/)
20
20
  words.each do |word|
21
21
  if word.empty?
22
- @cursor[-1][0] << " "
22
+ @cursor[-1][2] = @cursor[-1][2] + 1
23
23
  else
24
- cursor_entry = [word, 1 + @cursor[-1][0].size + @cursor[-1][-1]]
25
- @cursor.shift
26
- @cursor << cursor_entry
27
- taxon_find(word)
24
+ abbr_no_space = word.match(/^([A-Z][a-z]?\.)([a-z|\p{Latin}]+)/)
25
+ if abbr_no_space
26
+ process_word(abbr_no_space[1], 0)
27
+ process_word(word[abbr_no_space[1].size..-1], 1)
28
+ else
29
+ process_word(word, 1)
30
+ end
28
31
  end
29
32
  end
30
33
  socket.close
@@ -34,6 +37,13 @@ class NameSpotter
34
37
 
35
38
  private
36
39
 
40
+ def process_word(word, word_separator_size)
41
+ cursor_entry = [word, @cursor[-1][0].size + @cursor[-1][1] + @cursor[-1][2], word_separator_size]
42
+ @cursor.shift
43
+ @cursor << cursor_entry
44
+ taxon_find(word)
45
+ end
46
+
37
47
  def socket
38
48
  @socket ||= TCPSocket.open @host, @port
39
49
  end
@@ -51,7 +61,7 @@ class NameSpotter
51
61
  next if scientific_string.empty?
52
62
  add_name NameSpotter::ScientificName.new(verbatim_string, :start_position => start_position, :scientific_name => scientific_string)
53
63
  end
54
- @current_index = @current_string.empty? ? nil : @cursor[-1][-1]
64
+ @current_index = @current_string.empty? ? nil : @cursor[-1][1]
55
65
  end
56
66
  end
57
67
 
@@ -62,7 +72,7 @@ class NameSpotter
62
72
  @word_list_matches = word_list_matches
63
73
  @return_score = return_score
64
74
  if !@current_index && @current_string.size > 0
65
- @current_index = @cursor[-1][-1]
75
+ @current_index = @cursor[-1][1]
66
76
  end
67
77
  if not return_string.blank? or not return_string_2.blank?
68
78
  OpenStruct.new( { :current_string => current_string,
@@ -83,11 +93,14 @@ class NameSpotter
83
93
  str.force_encoding('utf-8')
84
94
  start_position = verbatim_string = nil
85
95
  if @current_index
86
- start_position = is_return_string2 ? @cursor[-1][-1] : @current_index
87
- words, indices = @cursor.transpose
88
- verbatim_string = (str.include?("[") || is_return_string2) ? words[indices.index(start_position)..-1].join(" ") : words[indices.index(start_position)...-1].join(" ")
96
+ start_position = is_return_string2 ? @cursor[-1][1] : @current_index
97
+ indices = @cursor.map { |item| item[1] }
98
+ verbatim_components = @cursor[indices.rindex(start_position)..-1]
99
+ sci_name_items_num = str.split(" ").size
100
+ verbatim_components = verbatim_components[0...sci_name_items_num]
101
+ verbatim_string = verbatim_components.map {|w| w[0] + (" " * w[2])}.join("").gsub(/[\.\,\!\;]*\s*$/, '')
89
102
  else
90
- verbatim_string, start_position = @cursor[-1]
103
+ verbatim_string, start_position, space_size = @cursor[-1]
91
104
  end
92
105
  scientific_string = str
93
106
  [verbatim_string, scientific_string, start_position]
data/name-spotter.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "name-spotter"
8
- s.version = "0.2.0"
8
+ s.version = "0.2.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Anthony Goddard", "Chuck Ha", "Dmitry Mozzherin"]
12
- s.date = "2012-06-18"
12
+ s.date = "2012-07-11"
13
13
  s.description = "The gem searches for scientific names in texts using socket servers running TaxonFinder (by Patrick Leary) and NetiNeti (by Lakshmi Manohar Akella)"
14
14
  s.email = "dmozzherin@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -82,7 +82,7 @@ describe "NameSpotter" do
82
82
  it "should properly handle abbreviated names found by taxonfinder" do
83
83
  text = "Pardosa moesta Banks, 1892 is one spider, Schizocosa ocreata Keyserling, 1887 is a second and a third is Schizocosa saltatrix borealis. The abbreviations are P. moesta, S. ocreata, and S. saltatrix borealis is the third."
84
84
  tf_res = @tf.find(text)
85
- tf_res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>0, :offsetEnd=>13}, {:verbatim=>"Schizocosa ocreata", :scientificName=>"Schizocosa ocreata", :offsetStart=>42, :offsetEnd=>59}, {:verbatim=>"Schizocosa saltatrix borealis.", :scientificName=>"Schizocosa saltatrix borealis", :offsetStart=>105, :offsetEnd=>134}, {:verbatim=>"P. moesta,", :scientificName=>"P[ardosa] moesta", :offsetStart=>158, :offsetEnd=>167}, {:verbatim=>"S. ocreata,", :scientificName=>"S[chizocosa] ocreata", :offsetStart=>169, :offsetEnd=>179}, {:verbatim=>"S. saltatrix borealis is", :scientificName=>"S[chizocosa] saltatrix borealis", :offsetStart=>185, :offsetEnd=>208}]}
85
+ tf_res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>0, :offsetEnd=>13}, {:verbatim=>"Schizocosa ocreata", :scientificName=>"Schizocosa ocreata", :offsetStart=>42, :offsetEnd=>59}, {:verbatim=>"Schizocosa saltatrix borealis", :scientificName=>"Schizocosa saltatrix borealis", :offsetStart=>105, :offsetEnd=>133}, {:verbatim=>"P. moesta", :scientificName=>"P[ardosa] moesta", :offsetStart=>158, :offsetEnd=>166}, {:verbatim=>"S. ocreata", :scientificName=>"S[chizocosa] ocreata", :offsetStart=>169, :offsetEnd=>178}, {:verbatim=>"S. saltatrix borealis", :scientificName=>"S[chizocosa] saltatrix borealis", :offsetStart=>185, :offsetEnd=>205}]}
86
86
  end
87
87
 
88
88
  it "should not make unsequential offsets on a page when using NetiNeti" do
@@ -111,6 +111,17 @@ describe "NameSpotter" do
111
111
  res = @tf.find(text)
112
112
  res.should == {:names=>[{:verbatim=>"Plantago major", :scientificName=>"Plantago major", :offsetStart=>16, :offsetEnd=>29}, {:verbatim=>"Plantago quercus quercus quercus quercus quercus", :scientificName=>"Plantago quercus quercus quercus quercus quercus", :offsetStart=>225, :offsetEnd=>272}, {:verbatim=>"Pardosa moesta var. moesta f. moesta", :scientificName=>"Pardosa moesta var. moesta f. moesta", :offsetStart=>340, :offsetEnd=>375}]}
113
113
  end
114
+
115
+ it "should be able to recognize names like P.moesta by TaxonFinder" do
116
+ text = "Pardosa moesta! If we encounter Pardosa moesta and then P.modica another name I know is Xenopus laevis and also P.moesta. Again without space TaxonFinder should find both. And Plantago major foreva"
117
+ res = @tf.find(text)
118
+ res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>0, :offsetEnd=>13}, {:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>32, :offsetEnd=>45}, {:verbatim=>"P.modica", :scientificName=>"P[ardosa] modica", :offsetStart=>56, :offsetEnd=>63}, {:verbatim=>"Xenopus laevis", :scientificName=>"Xenopus laevis", :offsetStart=>88, :offsetEnd=>101}, {:verbatim=>"P.moesta", :scientificName=>"P[ardosa] moesta", :offsetStart=>112, :offsetEnd=>119}, {:verbatim=>"Plantago major", :scientificName=>"Plantago major", :offsetStart=>176, :offsetEnd=>189}]}
119
+ res[:names].map do |name|
120
+ verbatim = name[:verbatim]
121
+ found_name = text[name[:offsetStart]..name[:offsetEnd]]
122
+ found_name.should == verbatim
123
+ end
124
+ end
114
125
 
115
126
  it "should register situations where new name started and prev name is finished in the same cycle in TF" do
116
127
  text = "What happens another called Pardosa moesta (Araneae: Lycosidae) is the species?"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: name-spotter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2012-06-18 00:00:00.000000000 Z
14
+ date: 2012-07-11 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rake
@@ -270,7 +270,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
270
270
  version: '0'
271
271
  segments:
272
272
  - 0
273
- hash: 2125020200384233400
273
+ hash: 1101945023410070648
274
274
  required_rubygems_version: !ruby/object:Gem::Requirement
275
275
  none: false
276
276
  requirements: