name-spotter 0.1.8 → 0.1.9

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.8
1
+ 0.1.9
@@ -1,11 +1,8 @@
1
+ # encoding: utf-8
1
2
  class NameSpotter
2
3
  class TaxonFinderClient < NameSpotter::Client
3
4
  def initialize(opts = { host: "0.0.0.0", port: "1234" })
4
5
  super
5
- # We keep track of the document to get accurate offsets.
6
- # Other methods such as keeping track of the character number
7
- # didn't work so well due to the nature of TaxonFinder.
8
- @document = ""
9
6
  end
10
7
 
11
8
  def find(str, from_web_form=false)
@@ -17,20 +14,21 @@ class NameSpotter
17
14
  @current_string = ''
18
15
  @current_string_state = ''
19
16
  @word_list_matches = 0
20
- @empty_count = 0
17
+ @cursor = 5.times.inject([]) { |res| res << ['',-1] }
18
+ @current_index = nil
21
19
  words = str.split(/\s/)
22
20
  words.each do |word|
23
- # Since we split on whitespace, this addition of a " " char
24
- # allows us to keep the document accurate and is basically
25
- # replacing all \s matches with " "
26
- @document << word + " "
27
- unless word.empty?
21
+ if word.empty?
22
+ @cursor[-1][0] << " "
23
+ else
24
+ cursor_entry = [word, 1 + @cursor[-1][0].size + @cursor[-1][-1]]
25
+ @cursor.shift
26
+ @cursor << cursor_entry
28
27
  taxon_find(word)
29
28
  end
30
29
  end
31
30
  socket.close
32
31
  @socket = nil
33
- @document = ""
34
32
  @names
35
33
  end
36
34
 
@@ -64,6 +62,9 @@ class NameSpotter
64
62
  @current_string_state = current_string_state
65
63
  @word_list_matches = word_list_matches
66
64
  @return_score = return_score
65
+ if @current_string.size > 0 && !@current_index
66
+ @current_index = @cursor[-1][-1]
67
+ end
67
68
  if not return_string.blank? or not return_string_2.blank?
68
69
  OpenStruct.new( { :current_string => current_string,
69
70
  :current_string_state => current_string_state,
@@ -73,16 +74,22 @@ class NameSpotter
73
74
  :return_string_2 => return_string_2,
74
75
  :return_score_2 => return_score_2 })
75
76
  else
77
+ @current_index = nil if @current_string.empty? && @current_index
76
78
  false
77
79
  end
78
80
  end
79
81
 
80
82
  def process_response(str)
81
83
  str.force_encoding('utf-8')
82
- verbatim_string = str.sub(/\[.*\]/, '.')
83
- verbatim_regex = Regexp.new(verbatim_string.split(/\s/).join('\s+'), true)
84
- start_position = @document.rindex(verbatim_regex)
85
- verbatim_string = @document_verbatim[start_position..-1].match(verbatim_regex)[0] if start_position
84
+ start_position = verbatim_string = nil
85
+ if @current_index
86
+ start_position = @current_index
87
+ words, indices = @cursor.transpose
88
+ verbatim_string = words[indices.index(start_position)...-1].join(" ")
89
+ @current_index = nil
90
+ else
91
+ verbatim_string, start_position = @cursor[-1]
92
+ end
86
93
  scientific_string = str
87
94
  [verbatim_string, scientific_string, start_position]
88
95
  end
data/name-spotter.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "name-spotter"
8
- s.version = "0.1.8"
8
+ s.version = "0.1.9"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Anthony Goddard", "Chuck Ha", "Dmitry Mozzherin"]
12
- s.date = "2012-06-06"
12
+ s.date = "2012-06-12"
13
13
  s.description = "The gem searches for scientific names in texts using socket servers running TaxonFinder (by Patrick Leary) and NetiNeti (by Lakshmi Manohar Akella)"
14
14
  s.email = "dmozzherin@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -39,7 +39,7 @@ describe "NameSpotter" do
39
39
  end
40
40
 
41
41
  it "should be able to find scientific names in text" do
42
- text = "Some text that has Betula\n alba and Mus musculus and \neven B. alba and even M. mus-\nculus and unicoded name Aranea röselii. Also it has name unknown before: Varanus bitatawa species"
42
+ text = "Some text that has Betula\n alba and Mus musculus and \neven B. alba and even M. mus-\nculus and unicoded name Aranea röselii. Also it has name unknown before: Varanus bitatawa species"
43
43
  res = @neti.find(text)[:names].map { |n| n[:scientificName] }
44
44
  res.should == ["Betula alba", "Mus musculus", "B. alba", "Aranea röselii", "Varanus bitatawa"]
45
45
  tf_res = @tf.find(text)
@@ -72,11 +72,11 @@ describe "NameSpotter" do
72
72
  end
73
73
 
74
74
  it "should be able to return offsets for all names found by taxonfinder" do
75
- text = "We have to be sure that Betula\n alba and PSEUDOSCORPIONIDA and Aranea röselii and capitalized ARANEA RÖSELII and Pardosa\n moesta f. moesta Banks, 1892 all get their offsets"
75
+ text = "We have to be sure that Betula\n alba and PSEUDOSCORPIONIDA and ×Inkea which is not Passeriformes. We also have another hybrid Passiflora ×rosea and Aranea röselii and capitalized ARANEA RÖSELII and Pardosa\n moesta f. moesta Banks, 1892 all get their offsets"
76
76
  res = @neti.find(text)
77
- res.should == {:names=>[{:verbatim=>"Betula\n alba", :scientificName=>"Betula alba", :offsetStart=>24, :offsetEnd=>35}, {:verbatim=>"Aranea röselii", :scientificName=>"Aranea röselii", :offsetStart=>63, :offsetEnd=>76}, {:verbatim=>"Pardosa\n moesta", :scientificName=>"Pardosa moesta", :offsetStart=>113, :offsetEnd=>127}]}
77
+ res.should == {:names=>[{:verbatim=>"Betula\n alba", :scientificName=>"Betula alba", :offsetStart=>24, :offsetEnd=>35}, {:verbatim=>"Passiflora ×rosea", :scientificName=>"Passiflora ×rosea", :offsetStart=>126, :offsetEnd=>142}, {:verbatim=>"Aranea röselii", :scientificName=>"Aranea röselii", :offsetStart=>148, :offsetEnd=>161}, {:verbatim=>"Pardosa\n moesta", :scientificName=>"Pardosa moesta", :offsetStart=>198, :offsetEnd=>212}]}
78
78
  tf_res = @tf.find(text)
79
- tf_res.should == {:names=>[{:verbatim=>"Betula\n alba", :scientificName=>"Betula alba", :offsetStart=>24, :offsetEnd=>35}, {:verbatim=>"PSEUDOSCORPIONIDA", :scientificName=>"Pseudoscorpionida", :offsetStart=>41, :offsetEnd=>57}, {:verbatim=>"Aranea röselii", :scientificName=>"Aranea röselii", :offsetStart=>63, :offsetEnd=>76}, {:verbatim=>"ARANEA", :scientificName=>"Aranea", :offsetStart=>94, :offsetEnd=>99}, {:verbatim=>"Pardosa\n moesta f. moesta", :scientificName=>"Pardosa moesta f. moesta", :offsetStart=>113, :offsetEnd=>137}]}
79
+ tf_res.should == {:names=>[{:verbatim=>"Betula alba", :scientificName=>"Betula alba", :offsetStart=>24, :offsetEnd=>35}, {:verbatim=>"PSEUDOSCORPIONIDA", :scientificName=>"Pseudoscorpionida", :offsetStart=>41, :offsetEnd=>57}, {:verbatim=>"Passeriformes.", :scientificName=>"Passeriformes", :offsetStart=>83, :offsetEnd=>96}, {:verbatim=>"Passiflora ×rosea", :scientificName=>"Passiflora rosea", :offsetStart=>126, :offsetEnd=>142}, {:verbatim=>"Aranea röselii", :scientificName=>"Aranea röselii", :offsetStart=>148, :offsetEnd=>161}, {:verbatim=>"ARANEA", :scientificName=>"Aranea", :offsetStart=>179, :offsetEnd=>184}, {:verbatim=>"Pardosa moesta f. moesta", :scientificName=>"Pardosa moesta f. moesta", :offsetStart=>198, :offsetEnd=>222}]}
80
80
  end
81
81
 
82
82
  it "should not make unsequential offsets on a page when using NetiNeti" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: name-spotter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.8
4
+ version: 0.1.9
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2012-06-06 00:00:00.000000000 Z
14
+ date: 2012-06-12 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rake
@@ -269,7 +269,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
269
269
  version: '0'
270
270
  segments:
271
271
  - 0
272
- hash: 765055699781609627
272
+ hash: 3944184144538713044
273
273
  required_rubygems_version: !ruby/object:Gem::Requirement
274
274
  none: false
275
275
  requirements: