name-spotter 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7d5a8968a40ba3511bda226ce4e7cbbae11189e9
4
+ data.tar.gz: 02dd6325c7a5786737a3ddbf73f2282c9e3b9f3d
5
+ SHA512:
6
+ metadata.gz: 7c3b6c2c51b869b45b1dd7a057a5439595978efa967005871689392d9fdb5f6c5e5fb92beeebb0769aa3d1318417bb0ea1f819b5ecd28014ebd3b24a7694242a
7
+ data.tar.gz: a7acd16270b09005f0732319e15264b3373519852d940af4ed47db2d94d70b296e021abc64f2a9b6cb6397aa5874bd967ef73cf7e8ada4874a0cbf5ec4061c4c
@@ -0,0 +1,44 @@
1
+ q
2
+ c
3
+ q
4
+ p response
5
+ q
6
+ @names
7
+ c
8
+ words
9
+ q
10
+ fg
11
+ words
12
+ q
13
+ fg
14
+ words
15
+ q
16
+ fg
17
+ c
18
+ words
19
+ fg
20
+ q
21
+ fg
22
+ @names
23
+ q
24
+ res
25
+ q
26
+ fg
27
+ name
28
+ q
29
+ fg
30
+ response
31
+ q
32
+ p response
33
+ q
34
+ fg
35
+ text
36
+ q
37
+ p response
38
+ expect(NameSpotter.english?(eng3)).to be true
39
+ expect(NameSpotter.english?(eng2)).to be true
40
+ expect(NameSpotter.english?(eng)).to be true
41
+ NameSpotter.english?(not_eng)
42
+ NameSpotter.english?(eng3)
43
+ NameSpotter.english?(eng2)
44
+ NameSpotter.english?(eng)
@@ -0,0 +1,51 @@
1
+ #Gemfile.lock
2
+ Gemfile.lock
3
+
4
+ # rcov generated
5
+ coverage
6
+
7
+ # rdoc generated
8
+ rdoc
9
+
10
+ # yard generated
11
+ doc
12
+ .yardoc
13
+
14
+ # bundler
15
+ .bundle
16
+
17
+ # jeweler generated
18
+ pkg
19
+
20
+ # Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
21
+ #
22
+ # * Create a file at ~/.gitignore
23
+ # * Include files you want ignored
24
+ # * Run: git config --global core.excludesfile ~/.gitignore
25
+ #
26
+ # After doing this, these files will be ignored in all your git projects,
27
+ # saving you from having to 'pollute' every project you touch with them
28
+ #
29
+ # Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember to remove the leading # of the line.)
30
+ #
31
+ # For MacOS:
32
+ #
33
+ #.DS_Store
34
+
35
+ # For TextMate
36
+ #*.tmproj
37
+ #tmtags
38
+
39
+ # For emacs:
40
+ #*~
41
+ #\#*
42
+ #.\#*
43
+
44
+ # For vim:
45
+ #*.swp
46
+
47
+ # For redcar:
48
+ #.redcar
49
+
50
+ # For rubinius:
51
+ #*.rbc
data/.rspec CHANGED
@@ -1 +1,3 @@
1
+ --format progress
1
2
  --color
3
+ --require spec_helper
@@ -0,0 +1 @@
1
+ 2.1.6
@@ -0,0 +1,22 @@
1
+ sudo: required
2
+ language: ruby
3
+ services:
4
+ - docker
5
+
6
+ rvm:
7
+ - 2.0
8
+ - 2.1
9
+ - 2.2
10
+
11
+ # bundler_args: --without development
12
+
13
+ before_install:
14
+ - docker pull gnames/netineti
15
+ - docker pull gnames/taxonfinder
16
+ - docker run -d -p 0.0.0.0:1234:1234 --name tf gnames/taxonfinder
17
+ - docker run -d -p 0.0.0.0:6384:6384 --name nn gnames/netineti
18
+ # before_script:
19
+ # - sleep 100
20
+ branches:
21
+ only:
22
+ - master
data/CHANGELOG CHANGED
@@ -1,3 +1,5 @@
1
+ 0.3.0 fix trailing name bug, refactor tests, move to docker, add version method
2
+
1
3
  0.2.4 explicitly setting utf-8 encoding for taxon finder socket
2
4
 
3
5
  0.2.3 fixed typo
data/Gemfile CHANGED
@@ -1,23 +1,3 @@
1
- source "http://rubygems.org"
2
- # Add dependencies required to use your gem here.
3
- # Example:
4
- # gem "activesupport", ">= 2.3.5"
1
+ source 'https://rubygems.org'
5
2
 
6
- gem "rake"
7
- gem "rest-client"
8
- gem "builder"
9
- gem "json"
10
- gem "unicode_utils"
11
- gem "unsupervised-language-detection"
12
-
13
- # Add dependencies to develop your gem here.
14
- # Include everything needed to run rake, tests, features, etc.
15
- group :development do
16
- gem "rspec"
17
- gem "rspec-expectations"
18
- gem "cucumber", ">= 0"
19
- gem "capybara"
20
- gem "bundler"
21
- gem "jeweler", "~> 1.6.4"
22
- gem "debugger"
23
- end
3
+ gemspec
@@ -0,0 +1,116 @@
1
+ name-spotter
2
+ ============
3
+
4
+ [![Gem Version][1]][2]
5
+ [![Continuous Integration Status][3]][4]
6
+ [![Dependency Status][5]][6]
7
+
8
+
9
+ Finds biodiversity scientific names in texts using TaxonFinder
10
+ (by Patrick Leary) or NetiNeti (by Lakshmi Manohar Akella) libraries.
11
+ This gem works with Ruby >= 2.0
12
+
13
+ Requirements
14
+ ------------
15
+
16
+ * Docker
17
+
18
+ Installation
19
+ ------------
20
+
21
+ Install the gem
22
+
23
+ gem install name-spotter
24
+
25
+ Install and run TaxonFinder and NetiNeti docker containers
26
+
27
+ ```bash
28
+ docker pull gnames/netineti
29
+ docker pull gnames/taxonfinder
30
+ docker run -d -p 0.0.0.0:1234:1234 --name tf gnames/taxonfinder
31
+ docker run -d -p 0.0.0.0:6384:6384 --name nn gnames/netineti
32
+ ```
33
+
34
+ Usage
35
+ -----
36
+
37
+ If you are using localhost and default ports for NetiNeti and TaxonFinder:
38
+
39
+ ```ruby
40
+ require "name-spotter"
41
+
42
+ neti_client = NameSpotter::NetiNetiClient.new()
43
+ tf_client = NameSpotter::TaxonFinderClient.new()
44
+ neti_name_spotter = NameSpotter.new(neti_client)
45
+ tf_name_spotter = NameSpotter.new(tf_client)
46
+
47
+ neti_name_spotter.find(your_text)
48
+ tf_name_spotter.find(your_text)
49
+ ```
50
+
51
+ If you have installed NetiNeti and TaxonFinder on a machine
52
+ with a non-default host or port:
53
+
54
+ ```ruby
55
+ neti_client = NameSpotter::NetiNetiClient.new(host: "example.com",
56
+ port: 5555)
57
+ #or
58
+ neti_client = NameSpotter::NetiNetiClient.new(host: "123.123.123.111",
59
+ port: 5555)
60
+ ```
61
+
62
+ If you want to get results in JSON or XML formats
63
+
64
+ ```ruby
65
+ neti_name_spotter.find(your_text, "json")
66
+ neti_name_spotter.find(your_text, "xml")
67
+ ```
68
+
69
+ Development
70
+ -----------
71
+
72
+ To run tests, start TaxonFinder and NetiNeti on your local machine with
73
+ default configurations and run
74
+
75
+ ```
76
+ bundle exec rake
77
+ ```
78
+
79
+
80
+
81
+ Contributing to name-spotter
82
+ ----------------------------
83
+
84
+ * Check out the latest master to make sure the feature hasn't been implemented
85
+ or the bug hasn't been fixed yet
86
+ * Check out the issue tracker to make sure someone hasn't already requested
87
+ it and/or contributed it
88
+ * Fork the project
89
+ * Start a feature/bugfix branch
90
+ * Commit and push until you are happy with your contribution
91
+ * Make sure to add tests for it. This is important so I don't break it in a
92
+ future version unintentionally.
93
+ * Please try not to mess with the Rakefile, version, or history. If you want
94
+ to have your own version, or it is otherwise necessary, that is fine, but please
95
+ isolate it to its own commit so I can cherry-pick around it.
96
+
97
+ Copyright
98
+ ---------
99
+
100
+ Authors: [Chuck Ha][7], [Anthony Goddard][8], [Dmitry Mozzherin][9],
101
+ [David Shorthouse][10]
102
+
103
+ Copyright (c) 2012-2016 Marine Biological Laboratory. See [LICENSE.txt][11] for
104
+ further details.
105
+
106
+ [1]: https://badge.fury.io/rb/name-spotter.svg
107
+ [2]: http://badge.fury.io/rb/name-spotter
108
+ [3]: https://secure.travis-ci.org/GlobalNamesArchitecture/name-spotter.svg
109
+ [4]: http://travis-ci.org/GlobalNamesArchitecture/name-spotter
110
+ [5]: https://gemnasium.com/GlobalNamesArchitecture/name-spotter.svg
111
+ [6]: https://gemnasium.com/GlobalNamesArchitecture/name-spotter
112
+ [7]: https://github.com/ChuckHa
113
+ [8]: https://github.com/agoddard
114
+ [9]: https://github.com/dimus
115
+ [10]: https://github.com/dshorthouse
116
+ [11]: https://raw.githubusercontent.com/GlobalNamesArchitecture/name-spotter/master/LICENSE.txt
data/Rakefile CHANGED
@@ -1,7 +1,7 @@
1
- # encoding: utf-8
2
-
3
1
  require 'rubygems'
4
2
  require 'bundler'
3
+ require "bundler/gem_tasks"
4
+
5
5
  begin
6
6
  Bundler.setup(:default, :development)
7
7
  rescue Bundler::BundlerError => e
@@ -11,20 +11,6 @@ rescue Bundler::BundlerError => e
11
11
  end
12
12
  require 'rake'
13
13
 
14
- require 'jeweler'
15
- Jeweler::Tasks.new do |gem|
16
- # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
- gem.name = "name-spotter"
18
- gem.homepage = "http://github.com/GlobalNamesArchitecture/name-spotter"
19
- gem.license = "MIT"
20
- gem.summary = %Q{Scientific names finder}
21
- gem.description = %Q{The gem searches for scientific names in texts using socket servers running TaxonFinder (by Patrick Leary) and NetiNeti (by Lakshmi Manohar Akella)}
22
- gem.email = "dmozzherin@gmail.com"
23
- gem.authors = ["Anthony Goddard", "Chuck Ha", "Dmitry Mozzherin"]
24
- # dependencies defined in Gemfile
25
- end
26
- Jeweler::RubygemsDotOrgTasks.new
27
-
28
14
  require 'rspec/core'
29
15
  require 'rspec/core/rake_task'
30
16
  RSpec::Core::RakeTask.new(:spec) do |spec|
@@ -36,8 +22,5 @@ RSpec::Core::RakeTask.new(:rcov) do |spec|
36
22
  spec.rcov = true
37
23
  end
38
24
 
39
- require 'cucumber/rake/task'
40
- Cucumber::Rake::Task.new(:features)
41
-
42
25
  task :default => :spec
43
26
 
@@ -23,7 +23,9 @@ class NameSpotter
23
23
  end
24
24
  res
25
25
  end
26
- eng, not_eng = tweets.shuffle[0...50].partition {|a| UnsupervisedLanguageDetection.is_english_tweet?(a.join(" "))}
26
+ eng, not_eng = tweets.shuffle[0...50].partition do |a|
27
+ UnsupervisedLanguageDetection.is_english_tweet?(a.join(" "))
28
+ end
27
29
  percentage = eng.size.to_f/(not_eng.size + eng.size)
28
30
  percentage > 0.5
29
31
  end
@@ -8,11 +8,13 @@ end
8
8
  class String
9
9
  def constantize()
10
10
  camel_cased_word = self
11
- names = camel_cased_word.split('::')
11
+ names = camel_cased_word.split("::")
12
12
  names.shift if names.empty? || names.first.empty?
13
13
  constant = Object
14
14
  names.each do |name|
15
- constant = constant.const_defined?(name) ? constant.const_get(name) : constant.const_missing(name)
15
+ constant = constant.const_defined?(name) ?
16
+ constant.const_get(name) :
17
+ constant.const_missing(name)
16
18
  end
17
19
  constant
18
20
  end
@@ -1,22 +1,29 @@
1
1
  class NameSpotter
2
2
  class NetiNetiClient < Client
3
- def initialize(opts = { host: '0.0.0.0', port: 6384 })
4
- super
3
+ def initialize(opts = { host: "0.0.0.0", port: 6384 })
4
+ super
5
5
  end
6
6
 
7
7
  def find(text)
8
8
  # the form does not get sent if text is nil or empty
9
9
  return [] if text.nil? || text.empty?
10
- resource = RestClient::Resource.new("http://#{@host}:#{@port}", timeout: 9_000_000, open_timeout: 9_000_000, connection: "Keep-Alive")
11
- #TODO: we should figure out a better delimiter in NetiNeti (or use json) so we don't need to susbitute pipe with a letter here
10
+ text << " " # hack to find the last name
11
+ resource = RestClient::Resource.new("http://#{@host}:#{@port}",
12
+ timeout: 9_000_000,
13
+ open_timeout: 9_000_000,
14
+ connection: "Keep-Alive")
15
+ #TODO: we should figure out a better delimiter in NetiNeti (or use json)
16
+ # so we don't need to susbitute pipe with a letter here
12
17
  response = resource.post(data: text.gsub("|", "l")) #hhhhhhack
13
18
  response.body.split("|").collect do |info|
14
19
  res = info.split(",")
15
20
  name = res[0...-2].join(",")
16
21
  offset_start = res[-2]
17
- name.force_encoding('utf-8')
22
+ name.force_encoding("utf-8")
18
23
  normalized_name = NameSpotter::ScientificName.normalize(name)
19
- NameSpotter::ScientificName.new(name, :scientific_name => normalized_name, :start_position => offset_start.to_i)
24
+ NameSpotter::ScientificName.new(name,
25
+ scientific_name: normalized_name,
26
+ start_position: offset_start.to_i)
20
27
  end
21
28
  end
22
29
  end
@@ -3,7 +3,7 @@ class NameSpotter
3
3
  attr_reader :verbatim, :scientific, :start_pos, :end_pos, :score
4
4
 
5
5
  def self.normalize(name)
6
- name = name.gsub(",", " ")
6
+ name = name.gsub(",", " ")
7
7
  name = name.gsub(/\s+/, " ")
8
8
  end
9
9
 
@@ -22,8 +22,8 @@ class NameSpotter
22
22
  other_name.is_a?(Name) &&
23
23
  other_name.verbatim.eql?(verbatim) &&
24
24
  other_name.scientific.eql?(scientific) &&
25
- other_name.start_pos.eql?(start_pos) &&
26
- other_name.end_pos.eql?(end_pos) &&
25
+ other_name.start_pos.eql?(start_pos) &&
26
+ other_name.end_pos.eql?(end_pos) &&
27
27
  other_name.score.eql?(score)
28
28
  end
29
29
 
@@ -7,14 +7,14 @@ class NameSpotter
7
7
 
8
8
  def find(str, from_web_form=false)
9
9
  @names = []
10
- @document_verbatim = str
11
10
  return [] if str.nil? || str.empty?
12
-
11
+ str << " ." # hack to find last name
12
+ @document_verbatim = str
13
13
  # These are for the data-send-back that happens in TaxonFinder
14
- @current_string = ''
15
- @current_string_state = ''
14
+ @current_string = ""
15
+ @current_string_state = ""
16
16
  @word_list_matches = 0
17
- @cursor = 8.times.inject([]) { |res| res << ['',0, 0] }
17
+ @cursor = 8.times.inject([]) { |res| res << ["",0, 0] }
18
18
  @current_index = nil
19
19
  words = str.split(/\s/)
20
20
  words.each do |word|
@@ -34,11 +34,13 @@ class NameSpotter
34
34
  @socket = nil
35
35
  @names
36
36
  end
37
-
37
+
38
38
  private
39
39
 
40
40
  def process_word(word, word_separator_size)
41
- cursor_entry = [word, @cursor[-1][0].size + @cursor[-1][1] + @cursor[-1][2], word_separator_size]
41
+ cursor_entry = [word,
42
+ @cursor[-1][0].size + @cursor[-1][1] + @cursor[-1][2],
43
+ word_separator_size]
42
44
  @cursor.shift
43
45
  @cursor << cursor_entry
44
46
  taxon_find(word)
@@ -47,30 +49,37 @@ class NameSpotter
47
49
  def socket
48
50
  unless @socket
49
51
  @socket = TCPSocket.open(@host, @port)
50
- @socket.set_encoding('utf-8')
52
+ @socket.set_encoding("utf-8")
51
53
  end
52
54
  @socket
53
55
  end
54
56
 
55
57
  def taxon_find(word)
56
- input = "#{word}|#{@current_string}|#{@current_string_state}|#{@word_list_matches}|0"
58
+ input =
59
+ "#{word}|#{@current_string}|#{@current_string_state}|#{@word_list_matches}|0"
57
60
  socket.write(input + "\n")
58
61
  if output = socket.gets
59
62
  response = parse_socket_response(output)
60
63
  return if not response
61
-
62
- [response.return_string, response.return_string_2].each_with_index do |str, i|
64
+
65
+ [response.return_string,
66
+ response.return_string_2].each_with_index do |str, i|
63
67
  next if !str || str.split(" ").size > 6
64
- verbatim_string, scientific_string, start_position = process_response(str, i)
68
+ verbatim_string, scientific_string, start_position =
69
+ process_response(str, i)
65
70
  next if scientific_string.empty?
66
- add_name NameSpotter::ScientificName.new(verbatim_string, :start_position => start_position, :scientific_name => scientific_string)
71
+ add_name NameSpotter::ScientificName.new(verbatim_string,
72
+ start_position: start_position,
73
+ scientific_name: scientific_string)
67
74
  end
68
75
  @current_index = @current_string.empty? ? nil : @cursor[-1][1]
69
76
  end
70
77
  end
71
78
 
72
79
  def parse_socket_response(response)
73
- current_string, current_string_state, word_list_matches, return_string, return_score, return_string_2, return_score_2 = response.strip.split '|'
80
+ current_string, current_string_state, word_list_matches,
81
+ return_string, return_score, return_string_2,
82
+ return_score_2 = response.strip.split "|"
74
83
  @current_string = current_string
75
84
  @current_string_state = current_string_state
76
85
  @word_list_matches = word_list_matches
@@ -78,14 +87,14 @@ class NameSpotter
78
87
  if !@current_index && @current_string.size > 0
79
88
  @current_index = @cursor[-1][1]
80
89
  end
81
- if not return_string.blank? or not return_string_2.blank?
82
- OpenStruct.new( { :current_string => current_string,
83
- :current_string_state => current_string_state,
84
- :word_list_matches => word_list_matches,
85
- :return_string => return_string,
86
- :return_score => return_score,
87
- :return_string_2 => return_string_2,
88
- :return_score_2 => return_score_2 })
90
+ if not return_string.blank? or not return_string_2.blank?
91
+ OpenStruct.new( { current_string: current_string,
92
+ current_string_state: current_string_state,
93
+ word_list_matches: word_list_matches,
94
+ return_string: return_string,
95
+ return_score: return_score,
96
+ return_string_2: return_string_2,
97
+ return_score_2: return_score_2 })
89
98
  else
90
99
  @current_index = nil if @current_string.empty? && @current_index
91
100
  false
@@ -94,7 +103,7 @@ class NameSpotter
94
103
 
95
104
  def process_response(str, index)
96
105
  is_return_string2 = (index == 1)
97
- str.force_encoding('utf-8')
106
+ str.force_encoding("utf-8")
98
107
  start_position = verbatim_string = nil
99
108
  if @current_index
100
109
  start_position = is_return_string2 ? @cursor[-1][1] : @current_index
@@ -102,7 +111,9 @@ class NameSpotter
102
111
  verbatim_components = @cursor[indices.rindex(start_position)..-1]
103
112
  sci_name_items_num = str.split(" ").size
104
113
  verbatim_components = verbatim_components[0...sci_name_items_num]
105
- verbatim_string = verbatim_components.map {|w| w[0] + (" " * w[2])}.join("").gsub(/[\.\,\!\;]*\s*$/, '')
114
+ verbatim_string = verbatim_components.map do |w|
115
+ w[0] + (" " * w[2])
116
+ end.join("").gsub(/[\.\,\!\;]*\s*$/, "")
106
117
  else
107
118
  verbatim_string, start_position, space_size = @cursor[-1]
108
119
  end