RubyGems - nlp-pure - Versions diffs - 0.1.0 → 0.2.0 - Mend

nlp-pure 0.1.0 → 0.2.0

Files changed (27) hide show

checksums.yaml +4 -4
data/.gitignore +2 -0
data/.rubocop.yml +20 -4
data/.travis.yml +10 -5
data/CHANGELOG.md +2 -0
data/Gemfile +5 -7
data/README.md +36 -21
data/Rakefile +17 -8
data/lib/nlp_pure.rb +5 -4
data/lib/nlp_pure/logging.rb +2 -1
data/lib/nlp_pure/segmenting.rb +1 -0
data/lib/nlp_pure/segmenting/default_sentence.rb +94 -0
data/lib/nlp_pure/segmenting/default_word.rb +4 -4
data/lib/nlp_pure/version.rb +2 -1
data/nlp-pure.gemspec +2 -1
data/test/fixtures/corpus_english_simple.rb +85 -0
data/test/lib/nlp_pure/segmenting/default_sentence_test.rb +123 -0
data/test/lib/nlp_pure/segmenting/default_word_test.rb +106 -0
data/test/lib/nlp_pure/segmenting_test.rb +13 -0
data/test/lib/nlp_pure_test.rb +13 -0
data/test/test_helper.rb +4 -0
metadata +26 -29
data/Guardfile +0 -20
data/spec/lib/nlp_pure_spec.rb +0 -11
data/spec/lib/segmenting/default_word_spec.rb +0 -207
data/spec/lib/segmenting_spec.rb +0 -11
data/spec/spec_helper.rb +0 -16

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: c5bbc92e65c96837a6e53f28248e15d48a35abe1
-  data.tar.gz: 79f767942ba8723a3f5f6eb04ea0ec4498e02591
+  metadata.gz: 040f2a0f166664553334b5b5c4bd8119b1033fd0
+  data.tar.gz: a414246804bbba1dd57a4ecd70e82d57eb92bbc5
 SHA512:
-  metadata.gz: 9e00458afc1dadd851ea8ccd4e312ec19c6b775b455ebf6d5e599480dde8f333704a8c9601f62970bdefe26ffea7bd509bb3cd52314775d642865587d94e7214
-  data.tar.gz: 72abbd773eb915a11f76526b9bdfb37cbcd05c258aab45fd3c7e18c9fc1591c84c97cc3f99641ecee20ad27ea47d10daf9e35128d572d95aeb17aeda809e8a93
+  metadata.gz: ffe7ee7f37f3b724f74ce7ddaf318fd38d5addcdcaa844f1abafe81bbdd7d5c20dfe53f9122a65155173491775e17819940ee2e7faac523403d184e508a60827
+  data.tar.gz: 7d69a2dfc54b6cf1e008b35d7fc040697f3e7f9decfab4613564814a4cc74e3affb7ba52eae4efddb751b2f6df6be2cfef617929d3cbbbe8691d62307cdd19b3

data/.gitignore CHANGED

@@ -29,3 +29,5 @@ Gemfile.lock
 *.swp
 .DS_Store
+TODO.md
+ideas

data/.rubocop.yml CHANGED

@@ -2,25 +2,41 @@ AllCops:
   Exclude:
     - Guardfile
     - 'vendor/**/*'
+    - 'test/**/*'
-  RunRailsCops: false
+Rails:
+  Enabled: false
+# NLP is hard
+AbcSize:
+  Max: 22.5
 AlignParameters:
   Enabled: false
 ClassAndModuleChildren:
   Enabled: false
+# NLP is hard
+CyclomaticComplexity:
+  Max: 12
 Encoding:
   Enabled: false
-LineLength:
-  Max: 200
 HashSyntax:
   Exclude:
     - Rakefile
     - 'spec/**/*'
     - 'test/**/*'
+LineLength:
+  Max: 200
+# NLP is hard
+MethodLength:
+  Max: 15
+PerceivedComplexity:
+  Max: 12
 # Don't fail on whitespace between method names and arguments
-Style/SingleSpaceBeforeFirstArg:
+Style/SpaceBeforeFirstArg:
+  Enabled: false
+Style/SymbolArray:
   Enabled: false
 # Indent private/protected/public as deep as method definitions

data/.travis.yml CHANGED

@@ -1,14 +1,19 @@
 language: ruby
 sudo: false
 cache: bundler
+dist: trusty
+addons:
+  apt:
+    packages:
+      - haveged
 # NOTE: these run in order
 rvm:
   - jruby
-  - rbx-2
-  - 2.0.0
-  - 2.1
-  - 2.2
+  - rbx-3.73
+  - 2.2.7
+  - 2.3.4
+  - 2.4.1
 matrix:
   allow_failures:
-    - rvm: rbx-2
+    - rvm: rbx-3.73
     - rvm: jruby

data/CHANGELOG.md CHANGED

@@ -4,6 +4,8 @@ Officially leaving a non-semantic versioning scheme.
 Added benchmarking test.
+Replaced RSpec with Minitest
 # 0.0.5
 Fixed bug in `NlpPure::Segmenting::DefaultWord` where leading ellipses could produce extra segmented words.

data/Gemfile CHANGED

@@ -1,22 +1,20 @@
 source 'https://rubygems.org'
 gemspec
+gem 'rubocop'
 platforms :rbx do
-  gem 'rubysl', '~> 2.0'         # if using anything in the ruby standard library
   gem 'psych'                    # if using yaml
-  gem 'minitest'                 # if using minitest
   gem 'rubinius-developer_tools' # if using any of coverage, debugger, profiler
+  gem 'rubysl', '~> 2.0'         # if using anything in the ruby standard library
 end
 platforms :jruby do
-  gem 'jruby-openssl'
   gem 'activerecord-jdbcsqlite3-adapter'
+  gem 'jruby-openssl'
 end
 group :test do
-  gem 'rake'
-  gem 'rspec', '~> 3.0.0'
-  gem 'guard-rspec'
-  gem 'guard-rubocop'
   gem 'coveralls', require: false
+  gem 'rake'
 end

data/README.md CHANGED

@@ -1,8 +1,8 @@
 # NLP Pure
-[![Code Climate](https://codeclimate.com/github/parhamr/nlp-pure/badges/gpa.svg)](https://codeclimate.com/github/parhamr/nlp-pure)
+[![Gem Version](https://badge.fury.io/rb/nlp-pure.svg)](https://badge.fury.io/rb/nlp-pure) [![Code Climate](https://codeclimate.com/github/parhamr/nlp-pure/badges/gpa.svg)](https://codeclimate.com/github/parhamr/nlp-pure)
 [![Build Status](https://travis-ci.org/parhamr/nlp-pure.svg?branch=master)](https://travis-ci.org/parhamr/nlp-pure)
-[![Coverage Status](https://coveralls.io/repos/parhamr/nlp-pure/badge.png?branch=master)](https://coveralls.io/r/parhamr/nlp-pure?branch=master)
+[![Coverage Status](https://coveralls.io/repos/github/parhamr/nlp-pure/badge.svg?branch=master)](https://coveralls.io/github/parhamr/nlp-pure?branch=master)
 Natural language processing algorithms implemented in pure Ruby with minimal dependencies.
@@ -14,7 +14,8 @@ This project aims to provide functionality similar to [Treat](https://github.com
 * [Installation](#installation)
 * [Usage](#usage)
-** [Word Segmentation](#word-segmentation)
+  * [Word Segmentation](#word-segmentation)
+  * [Sentence Segmentation](#sentence-segmentation)
 * [Supported Ruby Versions](#supported-ruby-versions)
 * [Versioning](#versioning)
 * [Contributing](CONTRIBUTING.md)
@@ -61,6 +62,20 @@ irb(main):005:0> NlpPure::Segmenting::DefaultWord.parse "Mary had a little lamb,
 => ["Mary", "had", "a", "little", "lamb,", "His", "fleece", "was", "white", "as", "snow,", "And", "everywhere", "that", "Mary", "went,", "The", "lamb", "was", "sure", "to", "go."]
 ```
+### Sentence Segmentation
+```
+M017-PDX:nlp-pure rp0616$ bundle exec irb
+irb(main):001:0> require 'nlp_pure/segmenting/default_sentence'
+=> true
+irb(main):002:0> NlpPure::Segmenting::DefaultSentence.parse 'The U.S.A. is a member of NATO.'
+=> ["The U.S.A. is a member of NATO."]
+irb(main):003:0> NlpPure::Segmenting::DefaultSentence.parse 'Mary had a little lamb. The lamb’s fleece was white as snow. Everywhere that Mary went, the lamb was sure to go.'
+=> ["Mary had a little lamb.", "The lamb’s fleece was white as snow.", "Everywhere that Mary went, the lamb was sure to go."]
+irb(main):004:0> NlpPure::Segmenting::DefaultSentence.parse 'I am excited! Today is Friday.'
+=> ["I am excited!", "Today is Friday."]
+```
 ## Supported Ruby Versions
@@ -101,29 +116,29 @@ spec.add_dependency 'nlp-pure', '~> 0.1'
 [Search “nlp” at ruby-toolbox.com](https://www.ruby-toolbox.com/search?q=nlp)
 * APIs
-** [alchemy_api](https://github.com/dbalatero/alchemy_api)
-** [napi-ruby](https://github.com/Maluuba/napi-ruby)
-** [poliqarpr](https://github.com/apohllo/poliqarpr)
-** [wlapi](https://github.com/arbox/wlapi)
+  * [alchemy_api](https://github.com/dbalatero/alchemy_api)
+  * [napi-ruby](https://github.com/Maluuba/napi-ruby)
+  * [poliqarpr](https://github.com/apohllo/poliqarpr)
+  * [wlapi](https://github.com/arbox/wlapi)
 * Bindings and Toolkits
-** [open-nlp](https://github.com/louismullie/open-nlp)
-** [stanford-core-nlp](https://github.com/louismullie/stanford-core-nlp)
-** [treat](https://github.com/louismullie/treat)
+  * [open-nlp](https://github.com/louismullie/open-nlp)
+  * [stanford-core-nlp](https://github.com/louismullie/stanford-core-nlp)
+  * [treat](https://github.com/louismullie/treat)
 * Classification
-** [linnaeus](https://github.com/djcp/linnaeus)
-** [maxent_string_classifier](https://github.com/mccraigmccraig/maxent_string_classifier)
+  * [linnaeus](https://github.com/djcp/linnaeus)
+  * [maxent_string_classifier](https://github.com/mccraigmccraig/maxent_string_classifier)
 * N-Grams
-** [ruby-ngram](https://github.com/tkellen/ruby-ngram)
+  * [ruby-ngram](https://github.com/tkellen/ruby-ngram)
 * Specific Languages
-** Polish
-*** [nlp](https://github.com/knife/nlp)
+  * Polish
+    * [nlp](https://github.com/knife/nlp)
 * Stopwords
-** [clarifier](https://github.com/meducation/clarifier)
-** [stopwords](https://github.com/brez/stopwords)
-** [stopwords-filter](https://github.com/brenes/stopwords-filter)
+  * [clarifier](https://github.com/meducation/clarifier)
+  * [stopwords](https://github.com/brez/stopwords)
+  * [stopwords-filter](https://github.com/brenes/stopwords-filter)
 * Tokenization
-** [rseg](https://rubygems.org/gems/rseg)
-** [Tokenizer](https://github.com/arbox/tokenizer)
+  * [rseg](https://rubygems.org/gems/rseg)
+  * [Tokenizer](https://github.com/arbox/tokenizer)
 * Word Counters
-** [words_counted](https://github.com/abitdodgy/words_counted)
+  * [words_counted](https://github.com/abitdodgy/words_counted)

data/Rakefile CHANGED

@@ -2,17 +2,26 @@ require 'bundler'
 Bundler::GemHelper.install_tasks
 begin
-  require 'rspec/core/rake_task'
-  require 'rubocop/rake_task'
-  RSpec::Core::RakeTask.new(:spec)
+  task :coverage do
+    require 'coveralls'
+    Coveralls.wear!
+    require 'minitest'
+  end
-  task :rubocop do
-    require 'rubocop'
-    cli = RuboCop::CLI.new
-    cli.run
+  require 'rake/testtask'
+  Rake::TestTask.new(:test) do |t|
+    require_relative 'test/test_helper'
+    t.verbose = true
+    t.pattern = 'test/**/*_test.rb'
+  end
+  require 'rubocop/rake_task'
+  RuboCop::RakeTask.new(:rubocop) do |task|
+    # don't abort rake on failure
+    task.fail_on_error = false
   end
-  task :default => [:spec, :rubocop]
+  task default: [:coverage, :test, :rubocop]
 rescue LoadError => e
   STDERR << "#{e.class}: #{e.message} (#{e.backtrace[0]})"
 end

data/lib/nlp_pure.rb CHANGED

@@ -1,13 +1,14 @@
 # encoding: utf-8
 require 'nlp_pure/version'
-fail "NLP Pure #{NlpPure::VERSION} does not support Ruby 1.9." if RUBY_PLATFORM != 'java' && RUBY_VERSION < '2.0.0'
+raise "NLP Pure #{NlpPure::VERSION} does not support Ruby 1.9." if RUBY_PLATFORM != 'java' && RUBY_VERSION < '2.0.0'
 #
 module NlpPure
-  NAME = 'NlpPure'
-  LICENSE = 'See LICENSE for details.'
+  NAME = 'NlpPure'.freeze
+  LICENSE = 'See LICENSE for details.'.freeze
-  DEFAULTS = {}
+  DEFAULTS = {}.freeze
   def self.logger
     NlpPure::Logging.logger

data/lib/nlp_pure/logging.rb CHANGED

@@ -1,4 +1,5 @@
 # encoding: utf-8
 require 'time'
 require 'logger'
@@ -8,7 +9,7 @@ module NlpPure
   module Logging
     #
     class Pretty < Logger::Formatter
-      def call(severity, time, program_name, message)
+      def call(severity, time, _program_name, message)
         "#{time.utc.iso8601(2)} #{::Process.pid} #{severity}: #{message}\n"
       end
     end

data/lib/nlp_pure/segmenting.rb CHANGED

@@ -1,4 +1,5 @@
 # encoding: utf-8
 #
 module NlpPure
   # Namespace for segmenting implementations

data/lib/nlp_pure/segmenting/default_sentence.rb ADDED

@@ -0,0 +1,94 @@
+# encoding: utf-8
+module NlpPure
+  module Segmenting
+    # SEE ALSO: Unsupervised Multilingual Sentence Boundary Detection. Kiss, Strunk; 2006.
+    # NOTE: this fails on some proper nouns with abbreviations (e.g. business names)
+    #       and fails on single-linebreak headings
+    module DefaultSentence
+      DEFAULT_OPTIONS = {
+        # punctuation or linebreaks
+        split: /([.?!]|\n{2,}|\r\n)+/,
+        # array of arrays; [0] should be regexp, [1] should be replacement
+        # NOTE: minor performance risk in letting this array grow long
+        gsub:  [
+          # period ellipses need reconstruction
+          [/\.{3,}/, '…']
+        ],
+        naive_sentence_word_count: 3,
+        segment_boundary: '. '
+      }.freeze
+      module_function
+      # NOTE: exposed as a method for easy mock/stub
+      def options
+        DEFAULT_OPTIONS
+      end
+      def parse(*args)
+        return nil if args.nil? || args.empty?
+        # naive split
+        segments = clean_input(args[0]).split(options.fetch(:split, nil))
+        # skip rejoin if one segment
+        return segments if segments.length == 1
+        returning = rejoin_segment_fragments(segments).compact
+        STDERR << "#{returning.inspect}\n" if ENV['DEBUG']
+        returning
+      end
+      def clean_input(text = nil)
+        input = text.to_s
+        # perform replacements to work around the limitations of the splitting regexp
+        options.fetch(:gsub, []).each do |gsub_pair|
+          input.gsub!(gsub_pair[0], gsub_pair[1])
+        end
+        # NOTE: leading whitespace is problematic; ref #12
+        input.strip
+      end
+      def rejoin_segment_fragments(segments)
+        reassociated_segments = []
+        # take all segments
+        while (segment = segments.shift)
+          STDERR << "#{segment.inspect}\n" if ENV['DEBUG']
+          # join segments if needed
+          reassociated_segments << handle_special_fragments(segments, segment)
+        end
+        reassociated_segments
+      end
+      # rejoin leading punctuation, abbreviation, and numbers
+      def handle_special_fragments(segments, segment)
+        # NOTE: always index zero because we're shifting
+        while next_segment_appears_included?(segments[0])
+          STDERR << "\t\t<< #{segments[0].inspect}\n" if ENV['DEBUG']
+          segment = "#{segment}#{segments.shift}"
+        end
+        segment.strip
+      end
+      def next_segment_appears_included?(segment)
+        return false unless segment
+        # NOTE: the logic is expanded for logging reasons (despite style violation)
+        if segment[0] =~ options.fetch(:split, nil)
+          STDERR << "\t! leading punctuation detected\n" if ENV['DEBUG']
+        elsif segment[0] =~ /^\w/
+          STDERR << "\t! assuming abbreviation\n" if ENV['DEBUG']
+        elsif segment =~ /^\s[a-z0-9]/
+          STDERR << "\t! greedily grabbing lowercase\n" if ENV['DEBUG']
+        elsif segment =~ /^\d/
+          STDERR << "\t! leading numeral detected\n" if ENV['DEBUG']
+        else
+          STDERR << "\t\tx\n" if ENV['DEBUG']
+          return false
+        end
+        true
+      end
+      def cleanup_segmenting(segments)
+        segments.compact
+      end
+    end
+  end
+end

data/lib/nlp_pure/segmenting/default_word.rb CHANGED

@@ -13,15 +13,15 @@ module NlpPure
         gsub:  [
           # ellipses at the start of a string are problematic; ref #12
           [/^\s?(…|\.{3,})/, ' ']
-        ]
+        ],
+        segment_boundary: ' '
       }.freeze
       module_function
       def parse(*args)
-        unless args.nil? || args.empty?
-          clean_input(args[0]).split(options.fetch(:split, nil))
-        end
+        return nil if args.nil? || args.empty?
+        clean_input(args[0]).split(options.fetch(:split, nil))
       end
       def clean_input(text = nil)

data/lib/nlp_pure/version.rb CHANGED

@@ -1,5 +1,6 @@
 # encoding: utf-8
 #
 module NlpPure
-  VERSION = '0.1.0'
+  VERSION = '0.2.0'.freeze
 end

data/nlp-pure.gemspec CHANGED

@@ -1,4 +1,5 @@
 # encoding: utf-8
 require File.expand_path('../lib/nlp_pure/version', __FILE__)
 Gem::Specification.new do |gem|
@@ -15,6 +16,6 @@ Gem::Specification.new do |gem|
   gem.require_paths = ['lib']
   gem.version       = NlpPure::VERSION
   gem.add_development_dependency      'rake', '~> 10.4'
-  gem.add_development_dependency      'rspec', '~> 3.0'
+  gem.add_development_dependency      'minitest', '~> 5.5'
   gem.add_development_dependency      'coveralls', '~> 0.7'
 end

data/test/fixtures/corpus_english_simple.rb ADDED

@@ -0,0 +1,85 @@
+module CorpusEnglishSimple
+  def english_simple_sentence
+    'The quick brown fox jumps over the lazy dog.'
+  end
+  def english_hyphen_sentence
+    'The New York-based company hired new staff.'
+  end
+  def english_dash_sentence
+    'The quick brown fox—full of energy—jumps over the lazy dog.'
+  end
+  def english_spaced_dash_sentence
+    'The quick brown fox — full of energy — jumps over the lazy dog.'
+  end
+  def english_twohyphen_sentence
+    'The quick brown fox--full of energy--jumps over the lazy dog.'
+  end
+  def english_ellipsis_sentence
+    'The quick brown fox…jumps over the lazy dog.'
+  end
+  def english_spaced_ellipsis_sentence
+    'The quick brown fox … jumps over the lazy dog.'
+  end
+  def english_period_ellipsis_sentence
+    'The quick brown fox...jumps over the lazy dog.'
+  end
+  def english_leading_ellipsis_sentence
+    ' … the quick brown fox jumps over the lazy dog.'
+  end
+  def english_leading_period_ellipsis_sentence
+    ' ... the quick brown fox jumps over the lazy dog.'
+  end
+  def english_trailing_ellipsis_sentence
+    'The quick brown fox jumps over the lazy dog … '
+  end
+  def english_spaced_period_ellipsis_sentence
+    'The quick brown fox ... jumps over the lazy dog.'
+  end
+  def english_abbreviation_sentence
+    'The U.S.A. is a member of NATO.'
+  end
+  def english_simple_paragraph
+    'Mary had a little lamb. The lamb’s fleece was white as snow. Everywhere that Mary went, the lamb was sure to go.'
+  end
+  def english_simple_line_breaks
+    "Mary had a little lamb,\nHis fleece was white as snow,\nAnd everywhere that Mary went,\nThe lamb was sure to go."
+  end
+  def english_financial_sentence
+    "AMERICAN INDUSTRY INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter A.B. Hammersmith & Co."
+  end
+  def english_short_sentence
+    "Go!"
+  end
+  def english_excalamations
+    "I am excited! Today is Friday."
+  end
+  def english_short_question
+    "You?"
+  end
+  def english_leading_question
+    "On which side of the road do you drive? In North America we drive on the right side."
+  end
+  def english_usa_constitution_preamble
+    "United States of America 1789 (rev. 1992)\r\nPREAMBLE\r\nWe the People of the United States, in Order to form a more perfect Union, establish Justice, insure domestic Tranquility, provide for the common defense, promote the general Welfare, and secure the Blessings of Liberty to ourselves and our Posterity, do ordain and establish this Constitution for the United States of America."
+  end
+end

data/test/lib/nlp_pure/segmenting/default_sentence_test.rb ADDED

@@ -0,0 +1,123 @@
+# encoding: utf-8
+require 'minitest/autorun'
+require 'nlp_pure/segmenting/default_sentence'
+require_relative '../../../fixtures/corpus_english_simple'
+#
+class TestNlpPureSegmentingDefaultSentence < Minitest::Test
+  describe '[module]' do
+    def test_module_is_defined
+      assert_equal defined?(NlpPure::Segmenting::DefaultSentence), 'constant'
+    end
+  end
+  describe '(English language)' do
+    include ::CorpusEnglishSimple
+    describe '.parse' do
+      describe 'with `nil` argument' do
+        def test_parse_returns_array
+          assert_equal [], NlpPure::Segmenting::DefaultSentence.parse(nil)
+        end
+      end
+      describe 'without arguments' do
+        def test_parse_returns_nil
+          assert_nil NlpPure::Segmenting::DefaultSentence.parse
+        end
+      end
+      describe 'with strings' do
+        def test_parse_returns_sentence_array
+          assert_instance_of Array, NlpPure::Segmenting::DefaultSentence.parse(english_simple_sentence)
+        end
+        def test_parse_correctly_counts_sentences
+          assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_simple_sentence).length
+        end
+        def test_parse_correctly_sentence_segments_hyphens
+          assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_hyphen_sentence).length
+        end
+        def test_parse_correctly_sentence_segments_doublehyphen_dashes
+          assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_twohyphen_sentence).length
+        end
+        def test_parse_correctly_sentence_segments_dashes
+          assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_dash_sentence).length
+        end
+        def test_parse_correctly_sentence_segments_spaced_dashes
+          assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_spaced_dash_sentence).length
+        end
+        def test_parse_correctly_sentence_segments_ellipses
+          assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_ellipsis_sentence).length
+        end
+        def test_parse_correctly_sentence_segments_spaced_ellipses
+          assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_spaced_ellipsis_sentence).length
+        end
+        def test_parse_correctly_sentence_segments_periodellipses
+          assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_period_ellipsis_sentence).length
+        end
+        def test_parse_correctly_sentence_segments_spaced_periodellipses
+          assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_spaced_period_ellipsis_sentence).length
+        end
+        def test_parse_correctly_sentence_segments_leading_spaced_periodellipses
+          assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_leading_ellipsis_sentence).length
+        end
+        def test_parse_correctly_sentence_segments_trailing_spaced_periodellipses
+          assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_trailing_ellipsis_sentence).length
+        end
+        def test_parse_does_not_sentence_segment_abbreviations
+          assert_equal NlpPure::Segmenting::DefaultSentence.parse(english_abbreviation_sentence),
+            ["The U.S.A. is a member of NATO."]
+          assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_abbreviation_sentence).length
+        end
+        def test_parse_does_not_sentence_segment_financial_jargon
+          skip("FIXME: financial jargon is hard")
+          assert_equal NlpPure::Segmenting::DefaultSentence.parse(english_financial_sentence),
+            [english_financial_sentence]
+          assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_financial_sentence).length
+        end
+        def test_parse_correctly_sentence_segments_longer_texts
+          assert_equal NlpPure::Segmenting::DefaultSentence.parse(english_simple_paragraph),
+            ["Mary had a little lamb.", "The lamb’s fleece was white as snow.", "Everywhere that Mary went, the lamb was sure to go."]
+          assert_equal 3, NlpPure::Segmenting::DefaultSentence.parse(english_simple_paragraph).length
+        end
+        def test_parse_correctly_sentence_segments_line_breaks
+          assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_simple_line_breaks).length
+        end
+        def test_parse_correctly_sentence_segments_exclamations
+          assert_equal NlpPure::Segmenting::DefaultSentence.parse(english_excalamations),
+            ["I am excited!", "Today is Friday."]
+          assert_equal 2, NlpPure::Segmenting::DefaultSentence.parse(english_excalamations).length
+        end
+        def test_parse_correctly_sentence_segments_questions
+          assert_equal NlpPure::Segmenting::DefaultSentence.parse(english_leading_question),
+            ["On which side of the road do you drive?", "In North America we drive on the right side."]
+          assert_equal 2, NlpPure::Segmenting::DefaultSentence.parse(english_leading_question).length
+        end
+        def test_parse_correctly_sentence_usa_constitution_preamble
+          assert_equal NlpPure::Segmenting::DefaultSentence.parse(english_usa_constitution_preamble)[0],
+            "United States of America 1789 (rev. 1992)"
+          assert_equal 3, NlpPure::Segmenting::DefaultSentence.parse(english_usa_constitution_preamble).length
+        end
+      end
+    end
+  end
+end

data/test/lib/nlp_pure/segmenting/default_word_test.rb ADDED

@@ -0,0 +1,106 @@
+# encoding: utf-8
+require 'minitest/autorun'
+require 'nlp_pure/segmenting/default_word'
+require_relative '../../../fixtures/corpus_english_simple'
+#
+class TestNlpPureSegmentingDefaultWord < Minitest::Test
+  describe '[module]' do
+    def test_module_is_defined
+      assert_equal defined?(NlpPure::Segmenting::DefaultWord), 'constant'
+    end
+  end
+  describe '(English language)' do
+    include ::CorpusEnglishSimple
+    describe '.parse' do
+      describe 'with `nil` argument' do
+        def test_parse_returns_array
+          assert_equal [], NlpPure::Segmenting::DefaultWord.parse(nil)
+        end
+      end
+      describe 'without arguments' do
+        def test_parse_returns_nil
+          assert_nil NlpPure::Segmenting::DefaultWord.parse
+        end
+      end
+      def test_parse_returns_word_array
+        assert_instance_of Array, NlpPure::Segmenting::DefaultWord.parse(english_simple_sentence)
+      end
+      def test_parse_correctly_counts_words
+        assert_equal 9, NlpPure::Segmenting::DefaultWord.parse(english_simple_sentence).length
+      end
+      def test_parse_does_not_mangle_english_simple_sentence
+        assert_equal english_simple_sentence, NlpPure::Segmenting::DefaultWord.parse(english_simple_sentence).join(NlpPure::Segmenting::DefaultWord.options[:segment_boundary])
+      end
+      def test_parse_correctly_word_segments_hyphens
+        assert_equal 8, NlpPure::Segmenting::DefaultWord.parse(english_hyphen_sentence).length
+      end
+      def test_parse_does_not_mangle_english_hyphen_sentence
+        skip("FIXME")
+        assert_equal english_simple_sentence, NlpPure::Segmenting::DefaultWord.parse(english_simple_sentence).join(NlpPure::Segmenting::DefaultWord.options[:segment_boundary])
+      end
+      def test_parse_correctly_word_segments_doublehyphen_dashes
+        assert_equal 12, NlpPure::Segmenting::DefaultWord.parse(english_twohyphen_sentence).length
+      end
+      def test_parse_does_not_mangle_english_twohyphen_sentence
+        skip("FIXME")
+        assert_equal english_twohyphen_sentence, NlpPure::Segmenting::DefaultWord.parse(english_twohyphen_sentence).join(NlpPure::Segmenting::DefaultWord.options[:segment_boundary])
+      end
+      def test_parse_correctly_word_segments_dashes
+        assert_equal 12, NlpPure::Segmenting::DefaultWord.parse(english_dash_sentence).length
+      end
+      def test_parse_correctly_word_segments_spaced_dashes
+        assert_equal 12, NlpPure::Segmenting::DefaultWord.parse(english_spaced_dash_sentence).length
+      end
+      def test_parse_correctly_word_segments_ellipses
+        assert_equal 9, NlpPure::Segmenting::DefaultWord.parse(english_ellipsis_sentence).length
+      end
+      def test_parse_correctly_word_segments_spaced_ellipses
+        assert_equal 9, NlpPure::Segmenting::DefaultWord.parse(english_spaced_ellipsis_sentence).length
+      end
+      def test_parse_correctly_word_segments_periodellipses
+        assert_equal 9, NlpPure::Segmenting::DefaultWord.parse(english_period_ellipsis_sentence).length
+      end
+      def test_parse_correctly_word_segments_spaced_periodellipses
+        assert_equal 9, NlpPure::Segmenting::DefaultWord.parse(english_spaced_period_ellipsis_sentence).length
+      end
+      def test_parse_correctly_word_segments_leading_spaced_periodellipses
+        assert_equal 9, NlpPure::Segmenting::DefaultWord.parse(english_leading_ellipsis_sentence).length
+      end
+      def test_parse_correctly_word_segments_trailing_spaced_periodellipses
+        assert_equal 9, NlpPure::Segmenting::DefaultWord.parse(english_trailing_ellipsis_sentence).length
+      end
+      def test_parse_does_not_word_segment_abbreviations
+        assert_equal 7, NlpPure::Segmenting::DefaultWord.parse(english_abbreviation_sentence).length
+      end
+      def test_parse_correctly_word_segments_longer_texts
+        assert_equal 22, NlpPure::Segmenting::DefaultWord.parse(english_simple_paragraph).length
+      end
+      def test_parse_correctly_word_segments_line_breaks
+        assert_equal 22, NlpPure::Segmenting::DefaultWord.parse(english_simple_line_breaks).length
+      end
+    end
+  end
+end

data/test/lib/nlp_pure/segmenting_test.rb ADDED

@@ -0,0 +1,13 @@
+# encoding: utf-8
+require 'minitest/autorun'
+require 'nlp_pure/segmenting'
+#
+class TestNlpPureSegmenting < Minitest::Test
+  describe '[module]' do
+    def test_module_is_defined
+      assert_equal defined?(NlpPure::Segmenting), 'constant'
+    end
+  end
+end

data/test/lib/nlp_pure_test.rb ADDED

@@ -0,0 +1,13 @@
+# encoding: utf-8
+require 'minitest/autorun'
+require 'nlp_pure'
+#
+class TestNlpPure < Minitest::Test
+  describe '[module]' do
+    def test_module_is_defined
+      assert_equal defined?(NlpPure), 'constant'
+    end
+  end
+end

data/test/test_helper.rb ADDED

@@ -0,0 +1,4 @@
+# encoding: utf-8
+require_relative 'fixtures/corpus_english_simple'

metadata CHANGED

@@ -1,55 +1,55 @@
 --- !ruby/object:Gem::Specification
 name: nlp-pure
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Reid Parham
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-02-16 00:00:00.000000000 Z
+date: 2017-04-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ~>
       - !ruby/object:Gem::Version
         version: '10.4'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ~>
       - !ruby/object:Gem::Version
         version: '10.4'
 - !ruby/object:Gem::Dependency
-  name: rspec
+  name: minitest
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '3.0'
+        version: '5.5'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ~>
       - !ruby/object:Gem::Version
-        version: '3.0'
+        version: '5.5'
 - !ruby/object:Gem::Dependency
   name: coveralls
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ~>
       - !ruby/object:Gem::Version
         version: '0.7'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ~>
       - !ruby/object:Gem::Version
         version: '0.7'
 description: Natural language processing algorithms implemented in pure Ruby with
@@ -60,27 +60,29 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- ".gitignore"
-- ".rspec"
-- ".rubocop.yml"
-- ".travis.yml"
+- .gitignore
+- .rspec
+- .rubocop.yml
+- .travis.yml
 - CHANGELOG.md
 - CONTRIBUTING.md
 - Gemfile
-- Guardfile
 - LICENSE
 - README.md
 - Rakefile
 - lib/nlp_pure.rb
 - lib/nlp_pure/logging.rb
 - lib/nlp_pure/segmenting.rb
+- lib/nlp_pure/segmenting/default_sentence.rb
 - lib/nlp_pure/segmenting/default_word.rb
 - lib/nlp_pure/version.rb
 - nlp-pure.gemspec
-- spec/lib/nlp_pure_spec.rb
-- spec/lib/segmenting/default_word_spec.rb
-- spec/lib/segmenting_spec.rb
-- spec/spec_helper.rb
+- test/fixtures/corpus_english_simple.rb
+- test/lib/nlp_pure/segmenting/default_sentence_test.rb
+- test/lib/nlp_pure/segmenting/default_word_test.rb
+- test/lib/nlp_pure/segmenting_test.rb
+- test/lib/nlp_pure_test.rb
+- test/test_helper.rb
 homepage: https://github.com/parhamr/nlp-pure
 licenses:
 - MIT
@@ -91,24 +93,19 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ">="
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ">="
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.2.2
+rubygems_version: 2.0.14.1
 signing_key:
 specification_version: 4
 summary: Natural language processing algorithms implemented in pure Ruby with minimal
   dependencies
-test_files:
-- spec/lib/nlp_pure_spec.rb
-- spec/lib/segmenting/default_word_spec.rb
-- spec/lib/segmenting_spec.rb
-- spec/spec_helper.rb
-has_rdoc:
+test_files: []

data/Guardfile DELETED

@@ -1,20 +0,0 @@
-guard :rspec, cmd: "bundle exec rspec", all_on_start: false, all_after_pass: false, failed_mode: :none do
-  require "guard/rspec/dsl"
-  dsl = Guard::RSpec::Dsl.new(self)
-  # RSpec files
-  rspec = dsl.rspec
-  watch(rspec.spec_helper) { rspec.spec_dir }
-  watch(rspec.spec_support) { rspec.spec_dir }
-  watch(rspec.spec_files)
-  # Ruby files
-  ruby = dsl.ruby
-  dsl.watch_spec_files_for(ruby.lib_files)
-end
-guard :rubocop, all_on_start: false, keep_failed: false do
-  watch(%r{.+\.rb$})
-  watch(%r{(?:.+/)?\.rubocop\.yml$}) { |m| File.dirname(m[0]) }
-end

data/spec/lib/nlp_pure_spec.rb DELETED

@@ -1,11 +0,0 @@
-# encoding: utf-8
-require 'spec_helper'
-require 'nlp_pure'
-describe NlpPure do
-  describe '[module]' do
-    it 'is defined' do
-      expect(defined?(NlpPure)).to be_truthy
-    end
-  end
-end

data/spec/lib/segmenting/default_word_spec.rb DELETED

@@ -1,207 +0,0 @@
-# encoding: utf-8
-require 'spec_helper'
-require 'nlp_pure/segmenting/default_word'
-describe NlpPure::Segmenting::DefaultWord do
-  describe '[module]' do
-    it 'is defined' do
-      expect(defined?(NlpPure::Segmenting::DefaultWord)).to be_truthy
-    end
-    describe '::DEFAULT_OPTIONS' do
-      it 'is Hash' do
-        expect(NlpPure::Segmenting::DefaultWord::DEFAULT_OPTIONS).to be_a Hash
-      end
-    end
-  end
-  describe '.parse' do
-    context 'English' do
-      let(:english_simple_sentence) { 'The quick brown fox jumps over the lazy dog.' }
-      let(:english_hyphen_sentence) { 'The New York-based company hired new staff.' }
-      let(:english_dash_sentence) { 'The quick brown fox—full of energy—jumps over the lazy dog.' }
-      let(:english_spaced_dash_sentence) { 'The quick brown fox — full of energy — jumps over the lazy dog.' }
-      let(:english_twohyphen_sentence) { 'The quick brown fox--full of energy--jumps over the lazy dog.' }
-      let(:english_ellipsis_sentence) { 'The quick brown fox…jumps over the lazy dog.' }
-      let(:english_spaced_ellipsis_sentence) { 'The quick brown fox … jumps over the lazy dog.' }
-      let(:english_period_ellipsis_sentence) { 'The quick brown fox...jumps over the lazy dog.' }
-      let(:english_leading_ellipsis_sentence) { ' … the quick brown fox jumps over the lazy dog.' }
-      let(:english_leading_period_ellipsis_sentence) { ' ... the quick brown fox jumps over the lazy dog.' }
-      let(:english_trailing_ellipsis_sentence) { 'The quick brown fox jumps over the lazy dog … ' }
-      let(:english_spaced_period_ellipsis_sentence) { 'The quick brown fox ... jumps over the lazy dog.' }
-      let(:english_abbreviation_sentence) { 'The U.S.A. is a member of NATO.' }
-      let(:english_simple_paragraph) { 'Mary had a little lamb. The lamb’s fleece was white as snow. Everywhere that Mary went, the lamb was sure to go.' }
-      let(:english_simple_line_breaks) { "Mary had a little lamb,\nHis fleece was white as snow,\nAnd everywhere that Mary went,\nThe lamb was sure to go." }
-      context '(with nil options)' do
-        before do
-          expect(NlpPure::Segmenting::DefaultWord).to receive(:options).at_least(:once).and_return(nil)
-        end
-        it 'raises NoMethodError' do
-          expect { NlpPure::Segmenting::DefaultWord.parse(english_simple_sentence) }.to raise_error
-        end
-      end
-      context '(with blank options)' do
-        before do
-          expect(NlpPure::Segmenting::DefaultWord).to receive(:options).at_least(:once).and_return({})
-        end
-        it 'returns Array' do
-          expect(NlpPure::Segmenting::DefaultWord.parse(english_simple_sentence)).to be_an Array
-        end
-      end
-      context '(with default options)' do
-        context 'with `nil` argument' do
-          it 'does not raise error' do
-            expect { NlpPure::Segmenting::DefaultWord.parse(nil) }.to_not raise_error
-          end
-          it 'returns Array' do
-            expect(NlpPure::Segmenting::DefaultWord.parse(nil)).to be_an Array
-          end
-        end
-        context 'without arguments' do
-          it 'does not raise error' do
-            expect { NlpPure::Segmenting::DefaultWord.parse }.to_not raise_error
-          end
-          it 'returns nil' do
-            expect(NlpPure::Segmenting::DefaultWord.parse).to eq nil
-          end
-        end
-        it 'returns Array' do
-          expect(NlpPure::Segmenting::DefaultWord.parse(english_simple_sentence)).to be_an Array
-        end
-        it 'correctly counts words' do
-          expect(NlpPure::Segmenting::DefaultWord.parse(english_simple_sentence).length).to eq(9)
-        end
-        it 'correctly segments hyphens' do
-          expect(NlpPure::Segmenting::DefaultWord.parse(english_hyphen_sentence).length).to eq(8)
-        end
-        it 'correctly segments double-hyphen dashes' do
-          expect(NlpPure::Segmenting::DefaultWord.parse(english_twohyphen_sentence).length).to eq(12)
-        end
-        it 'correctly segments dashes' do
-          expect(NlpPure::Segmenting::DefaultWord.parse(english_spaced_dash_sentence).length).to eq(12)
-        end
-        it 'correctly segments spaced dashes' do
-          expect(NlpPure::Segmenting::DefaultWord.parse(english_dash_sentence).length).to eq(12)
-        end
-        it 'correctly segments ellipses' do
-          expect(NlpPure::Segmenting::DefaultWord.parse(english_ellipsis_sentence).length).to eq(9)
-        end
-        it 'correctly segments spaced ellipses' do
-          expect(NlpPure::Segmenting::DefaultWord.parse(english_spaced_ellipsis_sentence).length).to eq(9)
-        end
-        it 'correctly segments period-ellipses' do
-          expect(NlpPure::Segmenting::DefaultWord.parse(english_period_ellipsis_sentence).length).to eq(9)
-        end
-        it 'correctly segments spaced period-ellipses' do
-          expect(NlpPure::Segmenting::DefaultWord.parse(english_spaced_period_ellipsis_sentence).length).to eq(9)
-        end
-        it 'correctly segments with leading, spaced ellipses' do
-          expect(NlpPure::Segmenting::DefaultWord.parse(english_leading_ellipsis_sentence).length).to eq(9)
-        end
-        it 'correctly segments with trailing, spaced ellipses' do
-          expect(NlpPure::Segmenting::DefaultWord.parse(english_trailing_ellipsis_sentence).length).to eq(9)
-        end
-        it 'does not segment abbreviations' do
-          expect(NlpPure::Segmenting::DefaultWord.parse(english_abbreviation_sentence).length).to eq(7)
-        end
-        it 'correctly counts with longer texts' do
-          expect(NlpPure::Segmenting::DefaultWord.parse(english_simple_paragraph).length).to eq(22)
-        end
-        it 'correctly counts with line breaks' do
-          expect(NlpPure::Segmenting::DefaultWord.parse(english_simple_line_breaks).length).to eq(22)
-        end
-        context 'benchmarking' do
-          before do
-            require 'benchmark'
-          end
-          it 'takes time', benchmarking: true do
-            expect(
-              Benchmark.realtime do
-                1000.times do
-                  NlpPure::Segmenting::DefaultWord.parse(english_simple_line_breaks)
-                end
-              end
-            ).to be < 0.1
-          end
-        end
-      end
-    end
-  end
-  describe '.clean_input' do
-    context 'English' do
-      let(:english_leading_ellipsis_sentence) { ' … the quick brown fox jumps over the lazy dog.' }
-      context '(with nil options)' do
-        before do
-          expect(NlpPure::Segmenting::DefaultWord).to receive(:options).at_least(:once).and_return(nil)
-        end
-        it 'raises NoMethodError' do
-          expect { NlpPure::Segmenting::DefaultWord.clean_input(english_leading_ellipsis_sentence) }.to raise_error
-        end
-      end
-      context '(with blank options)' do
-        before do
-          expect(NlpPure::Segmenting::DefaultWord).to receive(:options).at_least(:once).and_return({})
-        end
-        it 'only strips whitespace' do
-          expect(NlpPure::Segmenting::DefaultWord.clean_input(english_leading_ellipsis_sentence)).to eq english_leading_ellipsis_sentence.strip
-        end
-      end
-      context '(with default options)' do
-        context 'with `nil` argument' do
-          it 'does not raise error' do
-            expect { NlpPure::Segmenting::DefaultWord.clean_input(nil) }.to_not raise_error
-          end
-          it 'returns empty String' do
-            expect(NlpPure::Segmenting::DefaultWord.clean_input(nil)).to eq ''
-          end
-        end
-        context 'without arguments' do
-          it 'does not raise error' do
-            expect { NlpPure::Segmenting::DefaultWord.clean_input }.to_not raise_error
-          end
-          it 'returns nil' do
-            expect(NlpPure::Segmenting::DefaultWord.clean_input).to eq ''
-          end
-        end
-        it 'modifies the input' do
-          expect(NlpPure::Segmenting::DefaultWord.clean_input(english_leading_ellipsis_sentence)).to_not eq english_leading_ellipsis_sentence
-        end
-      end
-    end
-  end
-end

data/spec/lib/segmenting_spec.rb DELETED

@@ -1,11 +0,0 @@
-# encoding: utf-8
-require 'spec_helper'
-require 'nlp_pure/segmenting'
-describe NlpPure::Segmenting do
-  describe '[module]' do
-    it 'is defined' do
-      expect(defined?(NlpPure::Segmenting)).to be_truthy
-    end
-  end
-end

data/spec/spec_helper.rb DELETED

@@ -1,16 +0,0 @@
-# encoding: utf-8
-require 'rspec'
-require 'coveralls'
-Coveralls.wear! do
-  add_filter '/vendor/'
-  add_filter '/test/'
-  add_filter '/tmp/'
-  add_filter '/spec/'
-end
-RSpec.configure do |config|
-  config.expect_with :rspec do |c|
-    c.syntax = :expect
-  end
-end