RubyGems - name-spotter - Versions diffs - 0.2.4 → 0.3.0 - Mend

name-spotter 0.2.4 → 0.3.0

Files changed (29) hide show

checksums.yaml +7 -0
data/.byebug_history +44 -0
data/.gitignore +51 -0
data/.rspec +2 -0
data/.ruby-version +1 -0
data/.travis.yml +22 -0
data/CHANGELOG +2 -0
data/Gemfile +2 -22
data/README.md +116 -0
data/Rakefile +2 -19
data/lib/name-spotter.rb +3 -1
data/lib/name-spotter/monkey_patches.rb +4 -2
data/lib/name-spotter/neti_neti_client.rb +13 -6
data/lib/name-spotter/scientific_name.rb +3 -3
data/lib/name-spotter/taxon_finder_client.rb +35 -24
data/lib/name-spotter/version.rb +8 -0
data/name-spotter.gemspec +26 -98
data/spec/name-spotter_spec.rb +334 -131
data/spec/scientific_name_spec.rb +14 -19
data/spec/spec_helper.rb +2 -12
data/tf_logic.txt +3 -3
metadata +69 -142
data/.rvmrc +0 -1
data/Gemfile.lock +0 -84
data/README.rdoc +0 -95
data/VERSION +0 -1
data/features/name-spotter.feature +0 -9
data/features/step_definitions/name-spotter_steps.rb +0 -0
data/features/support/env.rb +0 -13

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 7d5a8968a40ba3511bda226ce4e7cbbae11189e9
+  data.tar.gz: 02dd6325c7a5786737a3ddbf73f2282c9e3b9f3d
+SHA512:
+  metadata.gz: 7c3b6c2c51b869b45b1dd7a057a5439595978efa967005871689392d9fdb5f6c5e5fb92beeebb0769aa3d1318417bb0ea1f819b5ecd28014ebd3b24a7694242a
+  data.tar.gz: a7acd16270b09005f0732319e15264b3373519852d940af4ed47db2d94d70b296e021abc64f2a9b6cb6397aa5874bd967ef73cf7e8ada4874a0cbf5ec4061c4c

data/.byebug_history ADDED

@@ -0,0 +1,44 @@
+q
+c
+q
+p response
+q
+@names
+c
+words
+q
+fg
+words
+q
+fg
+words
+q
+fg
+c
+words
+fg
+q
+fg
+@names
+q
+res
+q
+fg
+name
+q
+fg
+response
+q
+p response
+q
+fg
+text
+q
+p response
+expect(NameSpotter.english?(eng3)).to be true
+expect(NameSpotter.english?(eng2)).to be true
+expect(NameSpotter.english?(eng)).to be true
+NameSpotter.english?(not_eng)
+NameSpotter.english?(eng3)
+NameSpotter.english?(eng2)
+NameSpotter.english?(eng)

data/.gitignore ADDED

@@ -0,0 +1,51 @@
+#Gemfile.lock
+Gemfile.lock
+# rcov generated
+coverage
+# rdoc generated
+rdoc
+# yard generated
+doc
+.yardoc
+# bundler
+.bundle
+# jeweler generated
+pkg
+# Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
+#
+# * Create a file at ~/.gitignore
+# * Include files you want ignored
+# * Run: git config --global core.excludesfile ~/.gitignore
+#
+# After doing this, these files will be ignored in all your git projects,
+# saving you from having to 'pollute' every project you touch with them
+#
+# Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember, remove the leading # of the line)
+#
+# For MacOS:
+#
+#.DS_Store
+# For TextMate
+#*.tmproj
+#tmtags
+# For emacs:
+#*~
+#\#*
+#.\#*
+# For vim:
+#*.swp
+# For redcar:
+#.redcar
+# For rubinius:
+#*.rbc

data/.rspec CHANGED

@@ -1 +1,3 @@
+--format progress
 --color
+--require spec_helper

data/.ruby-version ADDED

@@ -0,0 +1 @@
+2.1.6

data/.travis.yml ADDED

@@ -0,0 +1,22 @@
+sudo: required
+language: ruby
+services:
+  - docker
+rvm:
+  - 2.0
+  - 2.1
+  - 2.2
+# bundler_args: --without development
+before_install:
+  - docker pull gnames/netineti
+  - docker pull gnames/taxonfinder
+  - docker run -d -p 0.0.0.0:1234:1234 --name tf gnames/taxonfinder
+  - docker run -d -p 0.0.0.0:6384:6384 --name nn gnames/netineti
+# before_script:
+#   - sleep 100
+branches:
+  only:
+      - master

data/CHANGELOG CHANGED

@@ -1,3 +1,5 @@
+0.3.0 fix trailing name bug, refactor tests, move to docker, add version method
 0.2.4 explicitly setting utf-8 encoding for taxon finder socket
 0.2.3 fixed typo

data/Gemfile CHANGED

@@ -1,23 +1,3 @@
-source "http://rubygems.org"
-# Add dependencies required to use your gem here.
-# Example:
-#   gem "activesupport", ">= 2.3.5"
+source 'https://rubygems.org'
-gem "rake"
-gem "rest-client"
-gem "builder"
-gem "json"
-gem "unicode_utils"
-gem "unsupervised-language-detection"
-# Add dependencies to develop your gem here.
-# Include everything needed to run rake, tests, features, etc.
-group :development do
-  gem "rspec"
-  gem "rspec-expectations"
-  gem "cucumber", ">= 0"
-  gem "capybara"
-  gem "bundler"
-  gem "jeweler", "~> 1.6.4"
-  gem "debugger"
-end
+gemspec

data/README.md ADDED

@@ -0,0 +1,116 @@
+name-spotter
+============
+[![Gem Version][1]][2]
+[![Continuous Integration Status][3]][4]
+[![Dependency Status][5]][6]
+Finds biodiversity scientific names in texts using TaxonFinder
+(by Patrick Leary) or NetiNeti (by Lakshmi Manohar Akella) libraries.
+This gem works with Ruby >= 2.0
+Requirements
+------------
+* Docker
+Installation
+------------
+Install the gem
+    gem install name-spotter
+Install and run TaxonFinder and NetiNeti docker containers
+```bash
+docker pull gnames/netineti
+docker pull gnames/taxonfinder
+docker run -d -p 0.0.0.0:1234:1234 --name tf gnames/taxonfinder
+docker run -d -p 0.0.0.0:6384:6384 --name nn gnames/netineti
+```
+Usage
+-----
+If you are using localhost and default ports for NetiNeti and TaxonFinder:
+```ruby
+require "name-spotter"
+neti_client       = NameSpotter::NetiNetiClient.new()
+tf_client         = NameSpotter::TaxonFinderClient.new()
+neti_name_spotter = NameSpotter.new(neti_client)
+tf_name_spotter   = NameSpotter.new(tf_client)
+neti_name_spotter.find(your_text)
+tf_name_spotter.find(your_text)
+```
+If you have installed NetiNeti and TaxonFinder on a different machine
+or with a non-default port:
+```ruby
+neti_client = NameSpotter::NetiNetiClient.new(host: "example.com",
+                                              port: 5555)
+#or
+neti_client = NameSpotter::NetiNetiClient.new(host: "123.123.123.111",
+                                              port: 5555)
+```
+If you want to get results in JSON or XML formats
+```ruby
+neti_name_spotter.find(your_text, "json")
+neti_name_spotter.find(your_text, "xml")
+```
+Development
+-----------
+To run tests start TaxonFinder and NetiNeti on your local machine with
+default configurations and run
+```
+bundle exec rake
+```
+Contributing to name-spotter
+----------------------------
+* Check out the latest master to make sure the feature hasn't been implemented
+or the bug hasn't been fixed yet
+* Check out the issue tracker to make sure someone hasn't already requested
+it and/or contributed it
+* Fork the project
+* Start a feature/bugfix branch
+* Commit and push until you are happy with your contribution
+* Make sure to add tests for it. This is important so I don't break it in a
+future version unintentionally.
+* Please try not to mess with the Rakefile, version, or history. If you want
+to have your own version, or it is otherwise necessary, that is fine, but please
+isolate the change in its own commit so I can cherry-pick around it.
+Copyright
+---------
+Authors: [Chuck Ha][7], [Anthony Goddard][8], [Dmitry Mozzherin][9],
+[David Shorthouse][10]
+Copyright (c) 2012-2016 Marine Biological Laboratory. See [LICENSE.txt][11] for
+further details.
+[1]: https://badge.fury.io/rb/name-spotter.svg
+[2]: http://badge.fury.io/rb/name-spotter
+[3]: https://secure.travis-ci.org/GlobalNamesArchitecture/name-spotter.svg
+[4]: http://travis-ci.org/GlobalNamesArchitecture/name-spotter
+[5]: https://gemnasium.com/GlobalNamesArchitecture/name-spotter.svg
+[6]: https://gemnasium.com/GlobalNamesArchitecture/name-spotter
+[7]: https://github.com/ChuckHa
+[8]: https://github.com/agoddard
+[9]: https://github.com/dimus
+[10]: https://github.com/dshorthouse
+[11]: https://raw.githubusercontent.com/GlobalNamesArchitecture/name-spotter/master/LICENSE.txt

data/Rakefile CHANGED

@@ -1,7 +1,7 @@
-# encoding: utf-8
 require 'rubygems'
 require 'bundler'
+require "bundler/gem_tasks"
 begin
   Bundler.setup(:default, :development)
 rescue Bundler::BundlerError => e
@@ -11,20 +11,6 @@ rescue Bundler::BundlerError => e
 end
 require 'rake'
-require 'jeweler'
-Jeweler::Tasks.new do |gem|
-  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
-  gem.name = "name-spotter"
-  gem.homepage = "http://github.com/GlobalNamesArchitecture/name-spotter"
-  gem.license = "MIT"
-  gem.summary = %Q{Scientific names finder}
-  gem.description = %Q{The gem searches for scientific names in texts using socket servers running TaxonFinder (by Patrick Leary) and NetiNeti (by Lakshmi Manohar Akella)}
-  gem.email = "dmozzherin@gmail.com"
-  gem.authors = ["Anthony Goddard", "Chuck Ha", "Dmitry Mozzherin"]
-  # dependencies defined in Gemfile
-end
-Jeweler::RubygemsDotOrgTasks.new
 require 'rspec/core'
 require 'rspec/core/rake_task'
 RSpec::Core::RakeTask.new(:spec) do |spec|
@@ -36,8 +22,5 @@ RSpec::Core::RakeTask.new(:rcov) do |spec|
   spec.rcov = true
 end
-require 'cucumber/rake/task'
-Cucumber::Rake::Task.new(:features)
 task :default => :spec

data/lib/name-spotter.rb CHANGED

@@ -23,7 +23,9 @@ class NameSpotter
       end
       res
     end
-    eng, not_eng = tweets.shuffle[0...50].partition {|a| UnsupervisedLanguageDetection.is_english_tweet?(a.join(" "))}
+    eng, not_eng = tweets.shuffle[0...50].partition do |a|
+      UnsupervisedLanguageDetection.is_english_tweet?(a.join(" "))
+    end
     percentage = eng.size.to_f/(not_eng.size + eng.size)
     percentage > 0.5
   end

data/lib/name-spotter/monkey_patches.rb CHANGED

@@ -8,11 +8,13 @@ end
 class String
   def constantize()
     camel_cased_word = self
-    names = camel_cased_word.split('::')
+    names = camel_cased_word.split("::")
     names.shift if names.empty? || names.first.empty?
     constant = Object
     names.each do |name|
-      constant = constant.const_defined?(name) ? constant.const_get(name) : constant.const_missing(name)
+      constant = constant.const_defined?(name) ?
+        constant.const_get(name) :
+        constant.const_missing(name)
     end
     constant
   end

data/lib/name-spotter/neti_neti_client.rb CHANGED

@@ -1,22 +1,29 @@
 class NameSpotter
   class NetiNetiClient < Client
-    def initialize(opts = { host: '0.0.0.0', port: 6384 })
-      super
+    def initialize(opts = { host: "0.0.0.0", port: 6384 })
+      super
     end
     def find(text)
       # the form does not get sent if text is nil or empty
       return [] if text.nil? || text.empty?
-      resource = RestClient::Resource.new("http://#{@host}:#{@port}", timeout: 9_000_000, open_timeout: 9_000_000, connection: "Keep-Alive")
-      #TODO: we should figure out a better delimiter in NetiNeti (or use json) so we don't need to susbitute pipe with a letter here
+      text << " " # hack to find the last name
+      resource = RestClient::Resource.new("http://#{@host}:#{@port}",
+                                          timeout: 9_000_000,
+                                          open_timeout: 9_000_000,
+                                          connection: "Keep-Alive")
+      #TODO: we should figure out a better delimiter in NetiNeti (or use json)
+      # so we don't need to substitute pipe with a letter here
       response = resource.post(data: text.gsub("|", "l")) #hhhhhhack
       response.body.split("|").collect do |info|
         res = info.split(",")
         name = res[0...-2].join(",")
         offset_start = res[-2]
-        name.force_encoding('utf-8')
+        name.force_encoding("utf-8")
         normalized_name = NameSpotter::ScientificName.normalize(name)
-        NameSpotter::ScientificName.new(name, :scientific_name => normalized_name, :start_position => offset_start.to_i)
+        NameSpotter::ScientificName.new(name,
+                                        scientific_name: normalized_name,
+                                        start_position: offset_start.to_i)
       end
     end
   end

data/lib/name-spotter/scientific_name.rb CHANGED

@@ -3,7 +3,7 @@ class NameSpotter
     attr_reader :verbatim, :scientific, :start_pos, :end_pos, :score
     def self.normalize(name)
-      name = name.gsub(",", " ")
+      name = name.gsub(",", " ")
       name = name.gsub(/\s+/, " ")
     end
@@ -22,8 +22,8 @@ class NameSpotter
       other_name.is_a?(Name) &&
         other_name.verbatim.eql?(verbatim) &&
         other_name.scientific.eql?(scientific) &&
-        other_name.start_pos.eql?(start_pos) &&
-        other_name.end_pos.eql?(end_pos) &&
+        other_name.start_pos.eql?(start_pos) &&
+        other_name.end_pos.eql?(end_pos) &&
         other_name.score.eql?(score)
     end

data/lib/name-spotter/taxon_finder_client.rb CHANGED

@@ -7,14 +7,14 @@ class NameSpotter
     def find(str, from_web_form=false)
       @names = []
-      @document_verbatim = str
       return [] if str.nil? || str.empty?
+      str << " ." # hack to find last name
+      @document_verbatim = str
       # These are for the data-send-back that happens in TaxonFinder
-      @current_string = ''
-      @current_string_state = ''
+      @current_string = ""
+      @current_string_state = ""
       @word_list_matches = 0
-      @cursor = 8.times.inject([]) { |res| res << ['',0, 0] }
+      @cursor = 8.times.inject([]) { |res| res << ["",0, 0] }
       @current_index = nil
       words = str.split(/\s/)
       words.each do |word|
@@ -34,11 +34,13 @@ class NameSpotter
       @socket = nil
       @names
     end
     private
     def process_word(word, word_separator_size)
-      cursor_entry = [word, @cursor[-1][0].size + @cursor[-1][1] + @cursor[-1][2], word_separator_size]
+      cursor_entry = [word,
+                      @cursor[-1][0].size + @cursor[-1][1] + @cursor[-1][2],
+                      word_separator_size]
       @cursor.shift
       @cursor << cursor_entry
       taxon_find(word)
@@ -47,30 +49,37 @@ class NameSpotter
     def socket
       unless @socket
         @socket = TCPSocket.open(@host, @port)
-        @socket.set_encoding('utf-8')
+        @socket.set_encoding("utf-8")
       end
       @socket
     end
     def taxon_find(word)
-      input = "#{word}|#{@current_string}|#{@current_string_state}|#{@word_list_matches}|0"
+      input =
+   "#{word}|#{@current_string}|#{@current_string_state}|#{@word_list_matches}|0"
         socket.write(input + "\n")
       if output = socket.gets
         response = parse_socket_response(output)
         return if not response
-        [response.return_string, response.return_string_2].each_with_index do |str, i|
+        [response.return_string,
+         response.return_string_2].each_with_index do |str, i|
           next if !str || str.split(" ").size > 6
-          verbatim_string, scientific_string, start_position = process_response(str, i)
+          verbatim_string, scientific_string, start_position =
+            process_response(str, i)
           next if scientific_string.empty?
-          add_name NameSpotter::ScientificName.new(verbatim_string, :start_position => start_position, :scientific_name => scientific_string)
+          add_name NameSpotter::ScientificName.new(verbatim_string,
+                                    start_position: start_position,
+                                    scientific_name: scientific_string)
         end
         @current_index = @current_string.empty? ? nil : @cursor[-1][1]
       end
     end
     def parse_socket_response(response)
-      current_string, current_string_state, word_list_matches, return_string, return_score, return_string_2, return_score_2 = response.strip.split '|'
+      current_string, current_string_state, word_list_matches,
+        return_string, return_score, return_string_2,
+        return_score_2 = response.strip.split "|"
       @current_string = current_string
       @current_string_state = current_string_state
       @word_list_matches = word_list_matches
@@ -78,14 +87,14 @@ class NameSpotter
       if !@current_index && @current_string.size > 0
           @current_index = @cursor[-1][1]
       end
-      if not return_string.blank? or not return_string_2.blank?
-        OpenStruct.new( { :current_string       => current_string,
-                       :current_string_state => current_string_state,
-                       :word_list_matches    => word_list_matches,
-                       :return_string        => return_string,
-                       :return_score         => return_score,
-                       :return_string_2      => return_string_2,
-                       :return_score_2       => return_score_2 })
+      if not return_string.blank? or not return_string_2.blank?
+        OpenStruct.new( { current_string:    current_string,
+                       current_string_state: current_string_state,
+                       word_list_matches:    word_list_matches,
+                       return_string:        return_string,
+                       return_score:         return_score,
+                       return_string_2:      return_string_2,
+                       return_score_2:       return_score_2 })
       else
         @current_index = nil if @current_string.empty? && @current_index
         false
@@ -94,7 +103,7 @@ class NameSpotter
     def process_response(str, index)
       is_return_string2 = (index == 1)
-      str.force_encoding('utf-8')
+      str.force_encoding("utf-8")
       start_position = verbatim_string = nil
       if @current_index
         start_position = is_return_string2 ? @cursor[-1][1] : @current_index
@@ -102,7 +111,9 @@ class NameSpotter
         verbatim_components = @cursor[indices.rindex(start_position)..-1]
         sci_name_items_num = str.split(" ").size
         verbatim_components = verbatim_components[0...sci_name_items_num]
-        verbatim_string = verbatim_components.map {|w| w[0] + (" " * w[2])}.join("").gsub(/[\.\,\!\;]*\s*$/, '')
+        verbatim_string = verbatim_components.map do |w|
+          w[0] + (" " * w[2])
+        end.join("").gsub(/[\.\,\!\;]*\s*$/, "")
       else
         verbatim_string, start_position, space_size = @cursor[-1]
       end