name-spotter 0.1.8 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.8
1
+ 0.1.9
@@ -1,11 +1,8 @@
1
+ # encoding: utf-8
1
2
  class NameSpotter
2
3
  class TaxonFinderClient < NameSpotter::Client
3
4
  def initialize(opts = { host: "0.0.0.0", port: "1234" })
4
5
  super
5
- # We keep track of the document to get accurate offsets.
6
- # Other methods such as keeping track of the character number
7
- # didn't work so well due to the nature of TaxonFinder.
8
- @document = ""
9
6
  end
10
7
 
11
8
  def find(str, from_web_form=false)
@@ -17,20 +14,21 @@ class NameSpotter
17
14
  @current_string = ''
18
15
  @current_string_state = ''
19
16
  @word_list_matches = 0
20
- @empty_count = 0
17
+ @cursor = 5.times.inject([]) { |res| res << ['',-1] }
18
+ @current_index = nil
21
19
  words = str.split(/\s/)
22
20
  words.each do |word|
23
- # Since we split on whitespace, this addition of a " " char
24
- # allows us to keep the document accurate and is basically
25
- # replacing all \s matches with " "
26
- @document << word + " "
27
- unless word.empty?
21
+ if word.empty?
22
+ @cursor[-1][0] << " "
23
+ else
24
+ cursor_entry = [word, 1 + @cursor[-1][0].size + @cursor[-1][-1]]
25
+ @cursor.shift
26
+ @cursor << cursor_entry
28
27
  taxon_find(word)
29
28
  end
30
29
  end
31
30
  socket.close
32
31
  @socket = nil
33
- @document = ""
34
32
  @names
35
33
  end
36
34
 
@@ -64,6 +62,9 @@ class NameSpotter
64
62
  @current_string_state = current_string_state
65
63
  @word_list_matches = word_list_matches
66
64
  @return_score = return_score
65
+ if @current_string.size > 0 && !@current_index
66
+ @current_index = @cursor[-1][-1]
67
+ end
67
68
  if not return_string.blank? or not return_string_2.blank?
68
69
  OpenStruct.new( { :current_string => current_string,
69
70
  :current_string_state => current_string_state,
@@ -73,16 +74,22 @@ class NameSpotter
73
74
  :return_string_2 => return_string_2,
74
75
  :return_score_2 => return_score_2 })
75
76
  else
77
+ @current_index = nil if @current_string.empty? && @current_index
76
78
  false
77
79
  end
78
80
  end
79
81
 
80
82
  def process_response(str)
81
83
  str.force_encoding('utf-8')
82
- verbatim_string = str.sub(/\[.*\]/, '.')
83
- verbatim_regex = Regexp.new(verbatim_string.split(/\s/).join('\s+'), true)
84
- start_position = @document.rindex(verbatim_regex)
85
- verbatim_string = @document_verbatim[start_position..-1].match(verbatim_regex)[0] if start_position
84
+ start_position = verbatim_string = nil
85
+ if @current_index
86
+ start_position = @current_index
87
+ words, indices = @cursor.transpose
88
+ verbatim_string = words[indices.index(start_position)...-1].join(" ")
89
+ @current_index = nil
90
+ else
91
+ verbatim_string, start_position = @cursor[-1]
92
+ end
86
93
  scientific_string = str
87
94
  [verbatim_string, scientific_string, start_position]
88
95
  end
data/name-spotter.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "name-spotter"
8
- s.version = "0.1.8"
8
+ s.version = "0.1.9"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Anthony Goddard", "Chuck Ha", "Dmitry Mozzherin"]
12
- s.date = "2012-06-06"
12
+ s.date = "2012-06-12"
13
13
  s.description = "The gem searches for scientific names in texts using socket servers running TaxonFinder (by Patrick Leary) and NetiNeti (by Lakshmi Manohar Akella)"
14
14
  s.email = "dmozzherin@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -39,7 +39,7 @@ describe "NameSpotter" do
39
39
  end
40
40
 
41
41
  it "should be able to find scientific names in text" do
42
- text = "Some text that has Betula\n alba and Mus musculus and \neven B. alba and even M. mus-\nculus and unicoded name Aranea röselii. Also it has name unknown before: Varanus bitatawa species"
42
+ text = "Some text that has Betula\n alba and Mus musculus and \neven B. alba and even M. mus-\nculus and unicoded name Aranea röselii. Also it has name unknown before: Varanus bitatawa species"
43
43
  res = @neti.find(text)[:names].map { |n| n[:scientificName] }
44
44
  res.should == ["Betula alba", "Mus musculus", "B. alba", "Aranea röselii", "Varanus bitatawa"]
45
45
  tf_res = @tf.find(text)
@@ -72,11 +72,11 @@ describe "NameSpotter" do
72
72
  end
73
73
 
74
74
  it "should be able to return offsets for all names found by taxonfinder" do
75
- text = "We have to be sure that Betula\n alba and PSEUDOSCORPIONIDA and Aranea röselii and capitalized ARANEA RÖSELII and Pardosa\n moesta f. moesta Banks, 1892 all get their offsets"
75
+ text = "We have to be sure that Betula\n alba and PSEUDOSCORPIONIDA and ×Inkea which is not Passeriformes. We also have another hybrid Passiflora ×rosea and Aranea röselii and capitalized ARANEA RÖSELII and Pardosa\n moesta f. moesta Banks, 1892 all get their offsets"
76
76
  res = @neti.find(text)
77
- res.should == {:names=>[{:verbatim=>"Betula\n alba", :scientificName=>"Betula alba", :offsetStart=>24, :offsetEnd=>35}, {:verbatim=>"Aranea röselii", :scientificName=>"Aranea röselii", :offsetStart=>63, :offsetEnd=>76}, {:verbatim=>"Pardosa\n moesta", :scientificName=>"Pardosa moesta", :offsetStart=>113, :offsetEnd=>127}]}
77
+ res.should == {:names=>[{:verbatim=>"Betula\n alba", :scientificName=>"Betula alba", :offsetStart=>24, :offsetEnd=>35}, {:verbatim=>"Passiflora ×rosea", :scientificName=>"Passiflora ×rosea", :offsetStart=>126, :offsetEnd=>142}, {:verbatim=>"Aranea röselii", :scientificName=>"Aranea röselii", :offsetStart=>148, :offsetEnd=>161}, {:verbatim=>"Pardosa\n moesta", :scientificName=>"Pardosa moesta", :offsetStart=>198, :offsetEnd=>212}]}
78
78
  tf_res = @tf.find(text)
79
- tf_res.should == {:names=>[{:verbatim=>"Betula\n alba", :scientificName=>"Betula alba", :offsetStart=>24, :offsetEnd=>35}, {:verbatim=>"PSEUDOSCORPIONIDA", :scientificName=>"Pseudoscorpionida", :offsetStart=>41, :offsetEnd=>57}, {:verbatim=>"Aranea röselii", :scientificName=>"Aranea röselii", :offsetStart=>63, :offsetEnd=>76}, {:verbatim=>"ARANEA", :scientificName=>"Aranea", :offsetStart=>94, :offsetEnd=>99}, {:verbatim=>"Pardosa\n moesta f. moesta", :scientificName=>"Pardosa moesta f. moesta", :offsetStart=>113, :offsetEnd=>137}]}
79
+ tf_res.should == {:names=>[{:verbatim=>"Betula alba", :scientificName=>"Betula alba", :offsetStart=>24, :offsetEnd=>35}, {:verbatim=>"PSEUDOSCORPIONIDA", :scientificName=>"Pseudoscorpionida", :offsetStart=>41, :offsetEnd=>57}, {:verbatim=>"Passeriformes.", :scientificName=>"Passeriformes", :offsetStart=>83, :offsetEnd=>96}, {:verbatim=>"Passiflora ×rosea", :scientificName=>"Passiflora rosea", :offsetStart=>126, :offsetEnd=>142}, {:verbatim=>"Aranea röselii", :scientificName=>"Aranea röselii", :offsetStart=>148, :offsetEnd=>161}, {:verbatim=>"ARANEA", :scientificName=>"Aranea", :offsetStart=>179, :offsetEnd=>184}, {:verbatim=>"Pardosa moesta f. moesta", :scientificName=>"Pardosa moesta f. moesta", :offsetStart=>198, :offsetEnd=>222}]}
80
80
  end
81
81
 
82
82
  it "should not make unsequential offsets on a page when using NetiNeti" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: name-spotter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.8
4
+ version: 0.1.9
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2012-06-06 00:00:00.000000000 Z
14
+ date: 2012-06-12 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rake
@@ -269,7 +269,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
269
269
  version: '0'
270
270
  segments:
271
271
  - 0
272
- hash: 765055699781609627
272
+ hash: 3944184144538713044
273
273
  required_rubygems_version: !ruby/object:Gem::Requirement
274
274
  none: false
275
275
  requirements: