name-spotter 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7d5a8968a40ba3511bda226ce4e7cbbae11189e9
4
+ data.tar.gz: 02dd6325c7a5786737a3ddbf73f2282c9e3b9f3d
5
+ SHA512:
6
+ metadata.gz: 7c3b6c2c51b869b45b1dd7a057a5439595978efa967005871689392d9fdb5f6c5e5fb92beeebb0769aa3d1318417bb0ea1f819b5ecd28014ebd3b24a7694242a
7
+ data.tar.gz: a7acd16270b09005f0732319e15264b3373519852d940af4ed47db2d94d70b296e021abc64f2a9b6cb6397aa5874bd967ef73cf7e8ada4874a0cbf5ec4061c4c
@@ -0,0 +1,44 @@
1
+ q
2
+ c
3
+ q
4
+ p response
5
+ q
6
+ @names
7
+ c
8
+ words
9
+ q
10
+ fg
11
+ words
12
+ q
13
+ fg
14
+ words
15
+ q
16
+ fg
17
+ c
18
+ words
19
+ fg
20
+ q
21
+ fg
22
+ @names
23
+ q
24
+ res
25
+ q
26
+ fg
27
+ name
28
+ q
29
+ fg
30
+ response
31
+ q
32
+ p response
33
+ q
34
+ fg
35
+ text
36
+ q
37
+ p response
38
+ expect(NameSpotter.english?(eng3)).to be true
39
+ expect(NameSpotter.english?(eng2)).to be true
40
+ expect(NameSpotter.english?(eng)).to be true
41
+ NameSpotter.english?(not_eng)
42
+ NameSpotter.english?(eng3)
43
+ NameSpotter.english?(eng2)
44
+ NameSpotter.english?(eng)
@@ -0,0 +1,51 @@
1
+ #Gemfile.lock
2
+ Gemfile.lock
3
+
4
+ # rcov generated
5
+ coverage
6
+
7
+ # rdoc generated
8
+ rdoc
9
+
10
+ # yard generated
11
+ doc
12
+ .yardoc
13
+
14
+ # bundler
15
+ .bundle
16
+
17
+ # jeweler generated
18
+ pkg
19
+
20
+ # Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
21
+ #
22
+ # * Create a file at ~/.gitignore
23
+ # * Include files you want ignored
24
+ # * Run: git config --global core.excludesfile ~/.gitignore
25
+ #
26
+ # After doing this, these files will be ignored in all your git projects,
27
+ # saving you from having to 'pollute' every project you touch with them
28
+ #
29
+ # Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember, remove the leading # of the line)
30
+ #
31
+ # For MacOS:
32
+ #
33
+ #.DS_Store
34
+
35
+ # For TextMate
36
+ #*.tmproj
37
+ #tmtags
38
+
39
+ # For emacs:
40
+ #*~
41
+ #\#*
42
+ #.\#*
43
+
44
+ # For vim:
45
+ #*.swp
46
+
47
+ # For redcar:
48
+ #.redcar
49
+
50
+ # For rubinius:
51
+ #*.rbc
data/.rspec CHANGED
@@ -1 +1,3 @@
1
+ --format progress
1
2
  --color
3
+ --require spec_helper
@@ -0,0 +1 @@
1
+ 2.1.6
@@ -0,0 +1,22 @@
1
+ sudo: required
2
+ language: ruby
3
+ services:
4
+ - docker
5
+
6
+ rvm:
7
+ - 2.0
8
+ - 2.1
9
+ - 2.2
10
+
11
+ # bundler_args: --without development
12
+
13
+ before_install:
14
+ - docker pull gnames/netineti
15
+ - docker pull gnames/taxonfinder
16
+ - docker run -d -p 0.0.0.0:1234:1234 --name tf gnames/taxonfinder
17
+ - docker run -d -p 0.0.0.0:6384:6384 --name nn gnames/netineti
18
+ # before_script:
19
+ # - sleep 100
20
+ branches:
21
+ only:
22
+ - master
data/CHANGELOG CHANGED
@@ -1,3 +1,5 @@
1
+ 0.3.0 fix trailing name bug, refactor tests, move to docker, add version method
2
+
1
3
  0.2.4 explicitly setting utf-8 encoding for taxon finder socket
2
4
 
3
5
  0.2.3 fixed typo
data/Gemfile CHANGED
@@ -1,23 +1,3 @@
1
- source "http://rubygems.org"
2
- # Add dependencies required to use your gem here.
3
- # Example:
4
- # gem "activesupport", ">= 2.3.5"
1
+ source 'https://rubygems.org'
5
2
 
6
- gem "rake"
7
- gem "rest-client"
8
- gem "builder"
9
- gem "json"
10
- gem "unicode_utils"
11
- gem "unsupervised-language-detection"
12
-
13
- # Add dependencies to develop your gem here.
14
- # Include everything needed to run rake, tests, features, etc.
15
- group :development do
16
- gem "rspec"
17
- gem "rspec-expectations"
18
- gem "cucumber", ">= 0"
19
- gem "capybara"
20
- gem "bundler"
21
- gem "jeweler", "~> 1.6.4"
22
- gem "debugger"
23
- end
3
+ gemspec
@@ -0,0 +1,116 @@
1
+ name-spotter
2
+ ============
3
+
4
+ [![Gem Version][1]][2]
5
+ [![Continuous Integration Status][3]][4]
6
+ [![Dependency Status][5]][6]
7
+
8
+
9
+ Finds biodiversity scientific names in texts using TaxonFinder
10
+ (by Patrick Leary) or NetiNeti (by Lakshmi Manohar Akella) libraries.
11
+ This gem works with Ruby >= 2.0
12
+
13
+ Requirements
14
+ ------------
15
+
16
+ * Docker
17
+
18
+ Installation
19
+ ------------
20
+
21
+ Install the gem
22
+
23
+ gem install name-spotter
24
+
25
+ Install and run TaxonFinder and NetiNeti docker containers
26
+
27
+ ```bash
28
+ docker pull gnames/netineti
29
+ docker pull gnames/taxonfinder
30
+ docker run -d -p 0.0.0.0:1234:1234 --name tf gnames/taxonfinder
31
+ docker run -d -p 0.0.0.0:6384:6384 --name nn gnames/netineti
32
+ ```
33
+
34
+ Usage
35
+ -----
36
+
37
+ If you are using localhost and default ports for NetiNeti and TaxonFinder:
38
+
39
+ ```ruby
40
+ require "name-spotter"
41
+
42
+ neti_client = NameSpotter::NetiNetiClient.new()
43
+ tf_client = NameSpotter::TaxonFinderClient.new()
44
+ neti_name_spotter = NameSpotter.new(neti_client)
45
+ tf_name_spotter = NameSpotter.new(tf_client)
46
+
47
+ neti_name_spotter.find(your_text)
48
+ tf_name_spotter.find(your_text)
49
+ ```
50
+
51
+ If you have installed NetiNeti and TaxonFinder on another machine
52
+ or with a non-default port:
53
+
54
+ ```ruby
55
+ neti_client = NameSpotter::NetiNetiClient.new(host: "example.com",
56
+ port: 5555)
57
+ #or
58
+ neti_client = NameSpotter::NetiNetiClient.new(host: "123.123.123.111",
59
+ port: 5555)
60
+ ```
61
+
62
+ If you want to get results in JSON or XML formats
63
+
64
+ ```ruby
65
+ neti_name_spotter.find(your_text, "json")
66
+ neti_name_spotter.find(your_text, "xml")
67
+ ```
68
+
69
+ Development
70
+ -----------
71
+
72
+ To run the tests, start TaxonFinder and NetiNeti on your local machine with
73
+ default configurations and run
74
+
75
+ ```
76
+ bundle exec rake
77
+ ```
78
+
79
+
80
+
81
+ Contributing to name-spotter
82
+ ----------------------------
83
+
84
+ * Check out the latest master to make sure the feature hasn't been implemented
85
+ or the bug hasn't been fixed yet
86
+ * Check out the issue tracker to make sure someone hasn't already requested
87
+ it and/or contributed it
88
+ * Fork the project
89
+ * Start a feature/bugfix branch
90
+ * Commit and push until you are happy with your contribution
91
+ * Make sure to add tests for it. This is important so I don't break it in a
92
+ future version unintentionally.
93
+ * Please try not to mess with the Rakefile, version, or history. If you want
94
+ to have your own version, or it is otherwise necessary, that is fine, but please
95
+ isolate it to its own commit so I can cherry-pick around it.
96
+
97
+ Copyright
98
+ ---------
99
+
100
+ Authors: [Chuck Ha][7], [Anthony Goddard][8], [Dmitry Mozzherin][9],
101
+ [David Shorthouse][10]
102
+
103
+ Copyright (c) 2012-2016 Marine Biological Laboratory. See [LICENSE.txt][11] for
104
+ further details.
105
+
106
+ [1]: https://badge.fury.io/rb/name-spotter.svg
107
+ [2]: http://badge.fury.io/rb/name-spotter
108
+ [3]: https://secure.travis-ci.org/GlobalNamesArchitecture/name-spotter.svg
109
+ [4]: http://travis-ci.org/GlobalNamesArchitecture/name-spotter
110
+ [5]: https://gemnasium.com/GlobalNamesArchitecture/name-spotter.svg
111
+ [6]: https://gemnasium.com/GlobalNamesArchitecture/name-spotter
112
+ [7]: https://github.com/ChuckHa
113
+ [8]: https://github.com/agoddard
114
+ [9]: https://github.com/dimus
115
+ [10]: https://github.com/dshorthouse
116
+ [11]: https://raw.githubusercontent.com/GlobalNamesArchitecture/name-spotter/master/LICENSE.txt
data/Rakefile CHANGED
@@ -1,7 +1,7 @@
1
- # encoding: utf-8
2
-
3
1
  require 'rubygems'
4
2
  require 'bundler'
3
+ require "bundler/gem_tasks"
4
+
5
5
  begin
6
6
  Bundler.setup(:default, :development)
7
7
  rescue Bundler::BundlerError => e
@@ -11,20 +11,6 @@ rescue Bundler::BundlerError => e
11
11
  end
12
12
  require 'rake'
13
13
 
14
- require 'jeweler'
15
- Jeweler::Tasks.new do |gem|
16
- # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
- gem.name = "name-spotter"
18
- gem.homepage = "http://github.com/GlobalNamesArchitecture/name-spotter"
19
- gem.license = "MIT"
20
- gem.summary = %Q{Scientific names finder}
21
- gem.description = %Q{The gem searches for scientific names in texts using socket servers running TaxonFinder (by Patrick Leary) and NetiNeti (by Lakshmi Manohar Akella)}
22
- gem.email = "dmozzherin@gmail.com"
23
- gem.authors = ["Anthony Goddard", "Chuck Ha", "Dmitry Mozzherin"]
24
- # dependencies defined in Gemfile
25
- end
26
- Jeweler::RubygemsDotOrgTasks.new
27
-
28
14
  require 'rspec/core'
29
15
  require 'rspec/core/rake_task'
30
16
  RSpec::Core::RakeTask.new(:spec) do |spec|
@@ -36,8 +22,5 @@ RSpec::Core::RakeTask.new(:rcov) do |spec|
36
22
  spec.rcov = true
37
23
  end
38
24
 
39
- require 'cucumber/rake/task'
40
- Cucumber::Rake::Task.new(:features)
41
-
42
25
  task :default => :spec
43
26
 
@@ -23,7 +23,9 @@ class NameSpotter
23
23
  end
24
24
  res
25
25
  end
26
- eng, not_eng = tweets.shuffle[0...50].partition {|a| UnsupervisedLanguageDetection.is_english_tweet?(a.join(" "))}
26
+ eng, not_eng = tweets.shuffle[0...50].partition do |a|
27
+ UnsupervisedLanguageDetection.is_english_tweet?(a.join(" "))
28
+ end
27
29
  percentage = eng.size.to_f/(not_eng.size + eng.size)
28
30
  percentage > 0.5
29
31
  end
@@ -8,11 +8,13 @@ end
8
8
  class String
9
9
  def constantize()
10
10
  camel_cased_word = self
11
- names = camel_cased_word.split('::')
11
+ names = camel_cased_word.split("::")
12
12
  names.shift if names.empty? || names.first.empty?
13
13
  constant = Object
14
14
  names.each do |name|
15
- constant = constant.const_defined?(name) ? constant.const_get(name) : constant.const_missing(name)
15
+ constant = constant.const_defined?(name) ?
16
+ constant.const_get(name) :
17
+ constant.const_missing(name)
16
18
  end
17
19
  constant
18
20
  end
@@ -1,22 +1,29 @@
1
1
  class NameSpotter
2
2
  class NetiNetiClient < Client
3
- def initialize(opts = { host: '0.0.0.0', port: 6384 })
4
- super
3
+ def initialize(opts = { host: "0.0.0.0", port: 6384 })
4
+ super
5
5
  end
6
6
 
7
7
  def find(text)
8
8
  # the form does not get sent if text is nil or empty
9
9
  return [] if text.nil? || text.empty?
10
- resource = RestClient::Resource.new("http://#{@host}:#{@port}", timeout: 9_000_000, open_timeout: 9_000_000, connection: "Keep-Alive")
11
- #TODO: we should figure out a better delimiter in NetiNeti (or use json) so we don't need to susbitute pipe with a letter here
10
+ text << " " # hack to find the last name
11
+ resource = RestClient::Resource.new("http://#{@host}:#{@port}",
12
+ timeout: 9_000_000,
13
+ open_timeout: 9_000_000,
14
+ connection: "Keep-Alive")
15
+ #TODO: we should figure out a better delimiter in NetiNeti (or use json)
16
+ # so we don't need to susbitute pipe with a letter here
12
17
  response = resource.post(data: text.gsub("|", "l")) #hhhhhhack
13
18
  response.body.split("|").collect do |info|
14
19
  res = info.split(",")
15
20
  name = res[0...-2].join(",")
16
21
  offset_start = res[-2]
17
- name.force_encoding('utf-8')
22
+ name.force_encoding("utf-8")
18
23
  normalized_name = NameSpotter::ScientificName.normalize(name)
19
- NameSpotter::ScientificName.new(name, :scientific_name => normalized_name, :start_position => offset_start.to_i)
24
+ NameSpotter::ScientificName.new(name,
25
+ scientific_name: normalized_name,
26
+ start_position: offset_start.to_i)
20
27
  end
21
28
  end
22
29
  end
@@ -3,7 +3,7 @@ class NameSpotter
3
3
  attr_reader :verbatim, :scientific, :start_pos, :end_pos, :score
4
4
 
5
5
  def self.normalize(name)
6
- name = name.gsub(",", " ")
6
+ name = name.gsub(",", " ")
7
7
  name = name.gsub(/\s+/, " ")
8
8
  end
9
9
 
@@ -22,8 +22,8 @@ class NameSpotter
22
22
  other_name.is_a?(Name) &&
23
23
  other_name.verbatim.eql?(verbatim) &&
24
24
  other_name.scientific.eql?(scientific) &&
25
- other_name.start_pos.eql?(start_pos) &&
26
- other_name.end_pos.eql?(end_pos) &&
25
+ other_name.start_pos.eql?(start_pos) &&
26
+ other_name.end_pos.eql?(end_pos) &&
27
27
  other_name.score.eql?(score)
28
28
  end
29
29
 
@@ -7,14 +7,14 @@ class NameSpotter
7
7
 
8
8
  def find(str, from_web_form=false)
9
9
  @names = []
10
- @document_verbatim = str
11
10
  return [] if str.nil? || str.empty?
12
-
11
+ str << " ." # hack to find last name
12
+ @document_verbatim = str
13
13
  # These are for the data-send-back that happens in TaxonFinder
14
- @current_string = ''
15
- @current_string_state = ''
14
+ @current_string = ""
15
+ @current_string_state = ""
16
16
  @word_list_matches = 0
17
- @cursor = 8.times.inject([]) { |res| res << ['',0, 0] }
17
+ @cursor = 8.times.inject([]) { |res| res << ["",0, 0] }
18
18
  @current_index = nil
19
19
  words = str.split(/\s/)
20
20
  words.each do |word|
@@ -34,11 +34,13 @@ class NameSpotter
34
34
  @socket = nil
35
35
  @names
36
36
  end
37
-
37
+
38
38
  private
39
39
 
40
40
  def process_word(word, word_separator_size)
41
- cursor_entry = [word, @cursor[-1][0].size + @cursor[-1][1] + @cursor[-1][2], word_separator_size]
41
+ cursor_entry = [word,
42
+ @cursor[-1][0].size + @cursor[-1][1] + @cursor[-1][2],
43
+ word_separator_size]
42
44
  @cursor.shift
43
45
  @cursor << cursor_entry
44
46
  taxon_find(word)
@@ -47,30 +49,37 @@ class NameSpotter
47
49
  def socket
48
50
  unless @socket
49
51
  @socket = TCPSocket.open(@host, @port)
50
- @socket.set_encoding('utf-8')
52
+ @socket.set_encoding("utf-8")
51
53
  end
52
54
  @socket
53
55
  end
54
56
 
55
57
  def taxon_find(word)
56
- input = "#{word}|#{@current_string}|#{@current_string_state}|#{@word_list_matches}|0"
58
+ input =
59
+ "#{word}|#{@current_string}|#{@current_string_state}|#{@word_list_matches}|0"
57
60
  socket.write(input + "\n")
58
61
  if output = socket.gets
59
62
  response = parse_socket_response(output)
60
63
  return if not response
61
-
62
- [response.return_string, response.return_string_2].each_with_index do |str, i|
64
+
65
+ [response.return_string,
66
+ response.return_string_2].each_with_index do |str, i|
63
67
  next if !str || str.split(" ").size > 6
64
- verbatim_string, scientific_string, start_position = process_response(str, i)
68
+ verbatim_string, scientific_string, start_position =
69
+ process_response(str, i)
65
70
  next if scientific_string.empty?
66
- add_name NameSpotter::ScientificName.new(verbatim_string, :start_position => start_position, :scientific_name => scientific_string)
71
+ add_name NameSpotter::ScientificName.new(verbatim_string,
72
+ start_position: start_position,
73
+ scientific_name: scientific_string)
67
74
  end
68
75
  @current_index = @current_string.empty? ? nil : @cursor[-1][1]
69
76
  end
70
77
  end
71
78
 
72
79
  def parse_socket_response(response)
73
- current_string, current_string_state, word_list_matches, return_string, return_score, return_string_2, return_score_2 = response.strip.split '|'
80
+ current_string, current_string_state, word_list_matches,
81
+ return_string, return_score, return_string_2,
82
+ return_score_2 = response.strip.split "|"
74
83
  @current_string = current_string
75
84
  @current_string_state = current_string_state
76
85
  @word_list_matches = word_list_matches
@@ -78,14 +87,14 @@ class NameSpotter
78
87
  if !@current_index && @current_string.size > 0
79
88
  @current_index = @cursor[-1][1]
80
89
  end
81
- if not return_string.blank? or not return_string_2.blank?
82
- OpenStruct.new( { :current_string => current_string,
83
- :current_string_state => current_string_state,
84
- :word_list_matches => word_list_matches,
85
- :return_string => return_string,
86
- :return_score => return_score,
87
- :return_string_2 => return_string_2,
88
- :return_score_2 => return_score_2 })
90
+ if not return_string.blank? or not return_string_2.blank?
91
+ OpenStruct.new( { current_string: current_string,
92
+ current_string_state: current_string_state,
93
+ word_list_matches: word_list_matches,
94
+ return_string: return_string,
95
+ return_score: return_score,
96
+ return_string_2: return_string_2,
97
+ return_score_2: return_score_2 })
89
98
  else
90
99
  @current_index = nil if @current_string.empty? && @current_index
91
100
  false
@@ -94,7 +103,7 @@ class NameSpotter
94
103
 
95
104
  def process_response(str, index)
96
105
  is_return_string2 = (index == 1)
97
- str.force_encoding('utf-8')
106
+ str.force_encoding("utf-8")
98
107
  start_position = verbatim_string = nil
99
108
  if @current_index
100
109
  start_position = is_return_string2 ? @cursor[-1][1] : @current_index
@@ -102,7 +111,9 @@ class NameSpotter
102
111
  verbatim_components = @cursor[indices.rindex(start_position)..-1]
103
112
  sci_name_items_num = str.split(" ").size
104
113
  verbatim_components = verbatim_components[0...sci_name_items_num]
105
- verbatim_string = verbatim_components.map {|w| w[0] + (" " * w[2])}.join("").gsub(/[\.\,\!\;]*\s*$/, '')
114
+ verbatim_string = verbatim_components.map do |w|
115
+ w[0] + (" " * w[2])
116
+ end.join("").gsub(/[\.\,\!\;]*\s*$/, "")
106
117
  else
107
118
  verbatim_string, start_position, space_size = @cursor[-1]
108
119
  end