nlp-pure 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.rubocop.yml +20 -4
- data/.travis.yml +10 -5
- data/CHANGELOG.md +2 -0
- data/Gemfile +5 -7
- data/README.md +36 -21
- data/Rakefile +17 -8
- data/lib/nlp_pure.rb +5 -4
- data/lib/nlp_pure/logging.rb +2 -1
- data/lib/nlp_pure/segmenting.rb +1 -0
- data/lib/nlp_pure/segmenting/default_sentence.rb +94 -0
- data/lib/nlp_pure/segmenting/default_word.rb +4 -4
- data/lib/nlp_pure/version.rb +2 -1
- data/nlp-pure.gemspec +2 -1
- data/test/fixtures/corpus_english_simple.rb +85 -0
- data/test/lib/nlp_pure/segmenting/default_sentence_test.rb +123 -0
- data/test/lib/nlp_pure/segmenting/default_word_test.rb +106 -0
- data/test/lib/nlp_pure/segmenting_test.rb +13 -0
- data/test/lib/nlp_pure_test.rb +13 -0
- data/test/test_helper.rb +4 -0
- metadata +26 -29
- data/Guardfile +0 -20
- data/spec/lib/nlp_pure_spec.rb +0 -11
- data/spec/lib/segmenting/default_word_spec.rb +0 -207
- data/spec/lib/segmenting_spec.rb +0 -11
- data/spec/spec_helper.rb +0 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 040f2a0f166664553334b5b5c4bd8119b1033fd0
|
4
|
+
data.tar.gz: a414246804bbba1dd57a4ecd70e82d57eb92bbc5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ffe7ee7f37f3b724f74ce7ddaf318fd38d5addcdcaa844f1abafe81bbdd7d5c20dfe53f9122a65155173491775e17819940ee2e7faac523403d184e508a60827
|
7
|
+
data.tar.gz: 7d69a2dfc54b6cf1e008b35d7fc040697f3e7f9decfab4613564814a4cc74e3affb7ba52eae4efddb751b2f6df6be2cfef617929d3cbbbe8691d62307cdd19b3
|
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
@@ -2,25 +2,41 @@ AllCops:
|
|
2
2
|
Exclude:
|
3
3
|
- Guardfile
|
4
4
|
- 'vendor/**/*'
|
5
|
+
- 'test/**/*'
|
5
6
|
|
6
|
-
|
7
|
+
Rails:
|
8
|
+
Enabled: false
|
7
9
|
|
10
|
+
# NLP is hard
|
11
|
+
AbcSize:
|
12
|
+
Max: 22.5
|
8
13
|
AlignParameters:
|
9
14
|
Enabled: false
|
10
15
|
ClassAndModuleChildren:
|
11
16
|
Enabled: false
|
17
|
+
# NLP is hard
|
18
|
+
CyclomaticComplexity:
|
19
|
+
Max: 12
|
12
20
|
Encoding:
|
13
21
|
Enabled: false
|
14
|
-
LineLength:
|
15
|
-
Max: 200
|
16
22
|
HashSyntax:
|
17
23
|
Exclude:
|
18
24
|
- Rakefile
|
19
25
|
- 'spec/**/*'
|
20
26
|
- 'test/**/*'
|
27
|
+
LineLength:
|
28
|
+
Max: 200
|
29
|
+
# NLP is hard
|
30
|
+
MethodLength:
|
31
|
+
Max: 15
|
32
|
+
PerceivedComplexity:
|
33
|
+
Max: 12
|
21
34
|
|
22
35
|
# Don't fail on whitespace between method names and arguments
|
23
|
-
Style/
|
36
|
+
Style/SpaceBeforeFirstArg:
|
37
|
+
Enabled: false
|
38
|
+
|
39
|
+
Style/SymbolArray:
|
24
40
|
Enabled: false
|
25
41
|
|
26
42
|
# Indent private/protected/public as deep as method definitions
|
data/.travis.yml
CHANGED
@@ -1,14 +1,19 @@
|
|
1
1
|
language: ruby
|
2
2
|
sudo: false
|
3
3
|
cache: bundler
|
4
|
+
dist: trusty
|
5
|
+
addons:
|
6
|
+
apt:
|
7
|
+
packages:
|
8
|
+
- haveged
|
4
9
|
# NOTE: these run in order
|
5
10
|
rvm:
|
6
11
|
- jruby
|
7
|
-
- rbx-
|
8
|
-
- 2.
|
9
|
-
- 2.
|
10
|
-
- 2.
|
12
|
+
- rbx-3.73
|
13
|
+
- 2.2.7
|
14
|
+
- 2.3.4
|
15
|
+
- 2.4.1
|
11
16
|
matrix:
|
12
17
|
allow_failures:
|
13
|
-
- rvm: rbx-
|
18
|
+
- rvm: rbx-3.73
|
14
19
|
- rvm: jruby
|
data/CHANGELOG.md
CHANGED
data/Gemfile
CHANGED
@@ -1,22 +1,20 @@
|
|
1
1
|
source 'https://rubygems.org'
|
2
2
|
gemspec
|
3
3
|
|
4
|
+
gem 'rubocop'
|
5
|
+
|
4
6
|
platforms :rbx do
|
5
|
-
gem 'rubysl', '~> 2.0' # if using anything in the ruby standard library
|
6
7
|
gem 'psych' # if using yaml
|
7
|
-
gem 'minitest' # if using minitest
|
8
8
|
gem 'rubinius-developer_tools' # if using any of coverage, debugger, profiler
|
9
|
+
gem 'rubysl', '~> 2.0' # if using anything in the ruby standard library
|
9
10
|
end
|
10
11
|
|
11
12
|
platforms :jruby do
|
12
|
-
gem 'jruby-openssl'
|
13
13
|
gem 'activerecord-jdbcsqlite3-adapter'
|
14
|
+
gem 'jruby-openssl'
|
14
15
|
end
|
15
16
|
|
16
17
|
group :test do
|
17
|
-
gem 'rake'
|
18
|
-
gem 'rspec', '~> 3.0.0'
|
19
|
-
gem 'guard-rspec'
|
20
|
-
gem 'guard-rubocop'
|
21
18
|
gem 'coveralls', require: false
|
19
|
+
gem 'rake'
|
22
20
|
end
|
data/README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# NLP Pure
|
2
2
|
|
3
|
-
[![Code Climate](https://codeclimate.com/github/parhamr/nlp-pure/badges/gpa.svg)](https://codeclimate.com/github/parhamr/nlp-pure)
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/nlp-pure.svg)](https://badge.fury.io/rb/nlp-pure) [![Code Climate](https://codeclimate.com/github/parhamr/nlp-pure/badges/gpa.svg)](https://codeclimate.com/github/parhamr/nlp-pure)
|
4
4
|
[![Build Status](https://travis-ci.org/parhamr/nlp-pure.svg?branch=master)](https://travis-ci.org/parhamr/nlp-pure)
|
5
|
-
[![Coverage Status](https://coveralls.io/repos/parhamr/nlp-pure/badge.
|
5
|
+
[![Coverage Status](https://coveralls.io/repos/github/parhamr/nlp-pure/badge.svg?branch=master)](https://coveralls.io/github/parhamr/nlp-pure?branch=master)
|
6
6
|
|
7
7
|
Natural language processing algorithms implemented in pure Ruby with minimal dependencies.
|
8
8
|
|
@@ -14,7 +14,8 @@ This project aims to provide functionality similar to [Treat](https://github.com
|
|
14
14
|
|
15
15
|
* [Installation](#installation)
|
16
16
|
* [Usage](#usage)
|
17
|
-
|
17
|
+
* [Word Segmentation](#word-segmentation)
|
18
|
+
* [Sentence Segmentation](#sentence-segmentation)
|
18
19
|
* [Supported Ruby Versions](#supported-ruby-versions)
|
19
20
|
* [Versioning](#versioning)
|
20
21
|
* [Contributing](CONTRIBUTING.md)
|
@@ -61,6 +62,20 @@ irb(main):005:0> NlpPure::Segmenting::DefaultWord.parse "Mary had a little lamb,
|
|
61
62
|
=> ["Mary", "had", "a", "little", "lamb,", "His", "fleece", "was", "white", "as", "snow,", "And", "everywhere", "that", "Mary", "went,", "The", "lamb", "was", "sure", "to", "go."]
|
62
63
|
```
|
63
64
|
|
65
|
+
### Sentence Segmentation
|
66
|
+
|
67
|
+
```
|
68
|
+
M017-PDX:nlp-pure rp0616$ bundle exec irb
|
69
|
+
irb(main):001:0> require 'nlp_pure/segmenting/default_sentence'
|
70
|
+
=> true
|
71
|
+
irb(main):002:0> NlpPure::Segmenting::DefaultSentence.parse 'The U.S.A. is a member of NATO.'
|
72
|
+
=> ["The U.S.A. is a member of NATO."]
|
73
|
+
irb(main):003:0> NlpPure::Segmenting::DefaultSentence.parse 'Mary had a little lamb. The lamb’s fleece was white as snow. Everywhere that Mary went, the lamb was sure to go.'
|
74
|
+
=> ["Mary had a little lamb.", "The lamb’s fleece was white as snow.", "Everywhere that Mary went, the lamb was sure to go."]
|
75
|
+
irb(main):004:0> NlpPure::Segmenting::DefaultSentence.parse 'I am excited! Today is Friday.'
|
76
|
+
=> ["I am excited!", "Today is Friday."]
|
77
|
+
```
|
78
|
+
|
64
79
|
|
65
80
|
## Supported Ruby Versions
|
66
81
|
|
@@ -101,29 +116,29 @@ spec.add_dependency 'nlp-pure', '~> 0.1'
|
|
101
116
|
[Search “nlp” at ruby-toolbox.com](https://www.ruby-toolbox.com/search?q=nlp)
|
102
117
|
|
103
118
|
* APIs
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
119
|
+
* [alchemy_api](https://github.com/dbalatero/alchemy_api)
|
120
|
+
* [napi-ruby](https://github.com/Maluuba/napi-ruby)
|
121
|
+
* [poliqarpr](https://github.com/apohllo/poliqarpr)
|
122
|
+
* [wlapi](https://github.com/arbox/wlapi)
|
108
123
|
* Bindings and Toolkits
|
109
|
-
|
110
|
-
|
111
|
-
|
124
|
+
* [open-nlp](https://github.com/louismullie/open-nlp)
|
125
|
+
* [stanford-core-nlp](https://github.com/louismullie/stanford-core-nlp)
|
126
|
+
* [treat](https://github.com/louismullie/treat)
|
112
127
|
* Classification
|
113
|
-
|
114
|
-
|
128
|
+
* [linnaeus](https://github.com/djcp/linnaeus)
|
129
|
+
* [maxent_string_classifier](https://github.com/mccraigmccraig/maxent_string_classifier)
|
115
130
|
* N-Grams
|
116
|
-
|
131
|
+
* [ruby-ngram](https://github.com/tkellen/ruby-ngram)
|
117
132
|
* Specific Languages
|
118
|
-
|
119
|
-
|
133
|
+
* Polish
|
134
|
+
* [nlp](https://github.com/knife/nlp)
|
120
135
|
* Stopwords
|
121
|
-
|
122
|
-
|
123
|
-
|
136
|
+
* [clarifier](https://github.com/meducation/clarifier)
|
137
|
+
* [stopwords](https://github.com/brez/stopwords)
|
138
|
+
* [stopwords-filter](https://github.com/brenes/stopwords-filter)
|
124
139
|
* Tokenization
|
125
|
-
|
126
|
-
|
140
|
+
* [rseg](https://rubygems.org/gems/rseg)
|
141
|
+
* [Tokenizer](https://github.com/arbox/tokenizer)
|
127
142
|
* Word Counters
|
128
|
-
|
143
|
+
* [words_counted](https://github.com/abitdodgy/words_counted)
|
129
144
|
|
data/Rakefile
CHANGED
@@ -2,17 +2,26 @@ require 'bundler'
|
|
2
2
|
Bundler::GemHelper.install_tasks
|
3
3
|
|
4
4
|
begin
|
5
|
-
|
6
|
-
|
7
|
-
|
5
|
+
task :coverage do
|
6
|
+
require 'coveralls'
|
7
|
+
Coveralls.wear!
|
8
|
+
require 'minitest'
|
9
|
+
end
|
8
10
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
11
|
+
require 'rake/testtask'
|
12
|
+
Rake::TestTask.new(:test) do |t|
|
13
|
+
require_relative 'test/test_helper'
|
14
|
+
t.verbose = true
|
15
|
+
t.pattern = 'test/**/*_test.rb'
|
16
|
+
end
|
17
|
+
|
18
|
+
require 'rubocop/rake_task'
|
19
|
+
RuboCop::RakeTask.new(:rubocop) do |task|
|
20
|
+
# don't abort rake on failure
|
21
|
+
task.fail_on_error = false
|
13
22
|
end
|
14
23
|
|
15
|
-
task :
|
24
|
+
task default: [:coverage, :test, :rubocop]
|
16
25
|
rescue LoadError => e
|
17
26
|
STDERR << "#{e.class}: #{e.message} (#{e.backtrace[0]})"
|
18
27
|
end
|
data/lib/nlp_pure.rb
CHANGED
@@ -1,13 +1,14 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
|
2
3
|
require 'nlp_pure/version'
|
3
|
-
|
4
|
+
raise "NLP Pure #{NlpPure::VERSION} does not support Ruby 1.9." if RUBY_PLATFORM != 'java' && RUBY_VERSION < '2.0.0'
|
4
5
|
|
5
6
|
#
|
6
7
|
module NlpPure
|
7
|
-
NAME = 'NlpPure'
|
8
|
-
LICENSE = 'See LICENSE for details.'
|
8
|
+
NAME = 'NlpPure'.freeze
|
9
|
+
LICENSE = 'See LICENSE for details.'.freeze
|
9
10
|
|
10
|
-
DEFAULTS = {}
|
11
|
+
DEFAULTS = {}.freeze
|
11
12
|
|
12
13
|
def self.logger
|
13
14
|
NlpPure::Logging.logger
|
data/lib/nlp_pure/logging.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
|
2
3
|
require 'time'
|
3
4
|
require 'logger'
|
4
5
|
|
@@ -8,7 +9,7 @@ module NlpPure
|
|
8
9
|
module Logging
|
9
10
|
#
|
10
11
|
class Pretty < Logger::Formatter
|
11
|
-
def call(severity, time,
|
12
|
+
def call(severity, time, _program_name, message)
|
12
13
|
"#{time.utc.iso8601(2)} #{::Process.pid} #{severity}: #{message}\n"
|
13
14
|
end
|
14
15
|
end
|
data/lib/nlp_pure/segmenting.rb
CHANGED
@@ -0,0 +1,94 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module NlpPure
|
4
|
+
module Segmenting
|
5
|
+
# SEE ALSO: Unsupervised Multilingual Sentence Boundary Detection. Kiss, Strunk; 2006.
|
6
|
+
# NOTE: this fails on some proper nouns with abbreviations (e.g. business names)
|
7
|
+
# and fails on single-linebreak headings
|
8
|
+
module DefaultSentence
|
9
|
+
DEFAULT_OPTIONS = {
|
10
|
+
# punctuation or linebreaks
|
11
|
+
split: /([.?!]|\n{2,}|\r\n)+/,
|
12
|
+
# array of arrays; [0] should be regexp, [1] should be replacement
|
13
|
+
# NOTE: minor performance risk in letting this array grow long
|
14
|
+
gsub: [
|
15
|
+
# period ellipses need reconstruction
|
16
|
+
[/\.{3,}/, '…']
|
17
|
+
],
|
18
|
+
naive_sentence_word_count: 3,
|
19
|
+
segment_boundary: '. '
|
20
|
+
}.freeze
|
21
|
+
|
22
|
+
module_function
|
23
|
+
|
24
|
+
# NOTE: exposed as a method for easy mock/stub
|
25
|
+
def options
|
26
|
+
DEFAULT_OPTIONS
|
27
|
+
end
|
28
|
+
|
29
|
+
def parse(*args)
|
30
|
+
return nil if args.nil? || args.empty?
|
31
|
+
# naive split
|
32
|
+
segments = clean_input(args[0]).split(options.fetch(:split, nil))
|
33
|
+
# skip rejoin if one segment
|
34
|
+
return segments if segments.length == 1
|
35
|
+
returning = rejoin_segment_fragments(segments).compact
|
36
|
+
STDERR << "#{returning.inspect}\n" if ENV['DEBUG']
|
37
|
+
returning
|
38
|
+
end
|
39
|
+
|
40
|
+
def clean_input(text = nil)
|
41
|
+
input = text.to_s
|
42
|
+
# perform replacements to work around the limitations of the splitting regexp
|
43
|
+
options.fetch(:gsub, []).each do |gsub_pair|
|
44
|
+
input.gsub!(gsub_pair[0], gsub_pair[1])
|
45
|
+
end
|
46
|
+
# NOTE: leading whitespace is problematic; ref #12
|
47
|
+
input.strip
|
48
|
+
end
|
49
|
+
|
50
|
+
def rejoin_segment_fragments(segments)
|
51
|
+
reassociated_segments = []
|
52
|
+
# take all segments
|
53
|
+
while (segment = segments.shift)
|
54
|
+
STDERR << "#{segment.inspect}\n" if ENV['DEBUG']
|
55
|
+
# join segments if needed
|
56
|
+
reassociated_segments << handle_special_fragments(segments, segment)
|
57
|
+
end
|
58
|
+
reassociated_segments
|
59
|
+
end
|
60
|
+
|
61
|
+
# rejoin leading punctuation, abbreviation, and numbers
|
62
|
+
def handle_special_fragments(segments, segment)
|
63
|
+
# NOTE: always index zero because we're shifting
|
64
|
+
while next_segment_appears_included?(segments[0])
|
65
|
+
STDERR << "\t\t<< #{segments[0].inspect}\n" if ENV['DEBUG']
|
66
|
+
segment = "#{segment}#{segments.shift}"
|
67
|
+
end
|
68
|
+
segment.strip
|
69
|
+
end
|
70
|
+
|
71
|
+
def next_segment_appears_included?(segment)
|
72
|
+
return false unless segment
|
73
|
+
# NOTE: the logic is expanded for logging reasons (despite style violation)
|
74
|
+
if segment[0] =~ options.fetch(:split, nil)
|
75
|
+
STDERR << "\t! leading punctuation detected\n" if ENV['DEBUG']
|
76
|
+
elsif segment[0] =~ /^\w/
|
77
|
+
STDERR << "\t! assuming abbreviation\n" if ENV['DEBUG']
|
78
|
+
elsif segment =~ /^\s[a-z0-9]/
|
79
|
+
STDERR << "\t! greedily grabbing lowercase\n" if ENV['DEBUG']
|
80
|
+
elsif segment =~ /^\d/
|
81
|
+
STDERR << "\t! leading numeral detected\n" if ENV['DEBUG']
|
82
|
+
else
|
83
|
+
STDERR << "\t\tx\n" if ENV['DEBUG']
|
84
|
+
return false
|
85
|
+
end
|
86
|
+
true
|
87
|
+
end
|
88
|
+
|
89
|
+
def cleanup_segmenting(segments)
|
90
|
+
segments.compact
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
@@ -13,15 +13,15 @@ module NlpPure
|
|
13
13
|
gsub: [
|
14
14
|
# ellipses at the start of a string are problematic; ref #12
|
15
15
|
[/^\s?(…|\.{3,})/, ' ']
|
16
|
-
]
|
16
|
+
],
|
17
|
+
segment_boundary: ' '
|
17
18
|
}.freeze
|
18
19
|
|
19
20
|
module_function
|
20
21
|
|
21
22
|
def parse(*args)
|
22
|
-
|
23
|
-
|
24
|
-
end
|
23
|
+
return nil if args.nil? || args.empty?
|
24
|
+
clean_input(args[0]).split(options.fetch(:split, nil))
|
25
25
|
end
|
26
26
|
|
27
27
|
def clean_input(text = nil)
|
data/lib/nlp_pure/version.rb
CHANGED
data/nlp-pure.gemspec
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
|
2
3
|
require File.expand_path('../lib/nlp_pure/version', __FILE__)
|
3
4
|
|
4
5
|
Gem::Specification.new do |gem|
|
@@ -15,6 +16,6 @@ Gem::Specification.new do |gem|
|
|
15
16
|
gem.require_paths = ['lib']
|
16
17
|
gem.version = NlpPure::VERSION
|
17
18
|
gem.add_development_dependency 'rake', '~> 10.4'
|
18
|
-
gem.add_development_dependency '
|
19
|
+
gem.add_development_dependency 'minitest', '~> 5.5'
|
19
20
|
gem.add_development_dependency 'coveralls', '~> 0.7'
|
20
21
|
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
module CorpusEnglishSimple
|
2
|
+
def english_simple_sentence
|
3
|
+
'The quick brown fox jumps over the lazy dog.'
|
4
|
+
end
|
5
|
+
|
6
|
+
def english_hyphen_sentence
|
7
|
+
'The New York-based company hired new staff.'
|
8
|
+
end
|
9
|
+
|
10
|
+
def english_dash_sentence
|
11
|
+
'The quick brown fox—full of energy—jumps over the lazy dog.'
|
12
|
+
end
|
13
|
+
|
14
|
+
def english_spaced_dash_sentence
|
15
|
+
'The quick brown fox — full of energy — jumps over the lazy dog.'
|
16
|
+
end
|
17
|
+
|
18
|
+
def english_twohyphen_sentence
|
19
|
+
'The quick brown fox--full of energy--jumps over the lazy dog.'
|
20
|
+
end
|
21
|
+
|
22
|
+
def english_ellipsis_sentence
|
23
|
+
'The quick brown fox…jumps over the lazy dog.'
|
24
|
+
end
|
25
|
+
|
26
|
+
def english_spaced_ellipsis_sentence
|
27
|
+
'The quick brown fox … jumps over the lazy dog.'
|
28
|
+
end
|
29
|
+
|
30
|
+
def english_period_ellipsis_sentence
|
31
|
+
'The quick brown fox...jumps over the lazy dog.'
|
32
|
+
end
|
33
|
+
|
34
|
+
def english_leading_ellipsis_sentence
|
35
|
+
' … the quick brown fox jumps over the lazy dog.'
|
36
|
+
end
|
37
|
+
|
38
|
+
def english_leading_period_ellipsis_sentence
|
39
|
+
' ... the quick brown fox jumps over the lazy dog.'
|
40
|
+
end
|
41
|
+
|
42
|
+
def english_trailing_ellipsis_sentence
|
43
|
+
'The quick brown fox jumps over the lazy dog … '
|
44
|
+
end
|
45
|
+
|
46
|
+
def english_spaced_period_ellipsis_sentence
|
47
|
+
'The quick brown fox ... jumps over the lazy dog.'
|
48
|
+
end
|
49
|
+
|
50
|
+
def english_abbreviation_sentence
|
51
|
+
'The U.S.A. is a member of NATO.'
|
52
|
+
end
|
53
|
+
|
54
|
+
def english_simple_paragraph
|
55
|
+
'Mary had a little lamb. The lamb’s fleece was white as snow. Everywhere that Mary went, the lamb was sure to go.'
|
56
|
+
end
|
57
|
+
|
58
|
+
def english_simple_line_breaks
|
59
|
+
"Mary had a little lamb,\nHis fleece was white as snow,\nAnd everywhere that Mary went,\nThe lamb was sure to go."
|
60
|
+
end
|
61
|
+
|
62
|
+
def english_financial_sentence
|
63
|
+
"AMERICAN INDUSTRY INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter A.B. Hammersmith & Co."
|
64
|
+
end
|
65
|
+
|
66
|
+
def english_short_sentence
|
67
|
+
"Go!"
|
68
|
+
end
|
69
|
+
|
70
|
+
def english_excalamations
|
71
|
+
"I am excited! Today is Friday."
|
72
|
+
end
|
73
|
+
|
74
|
+
def english_short_question
|
75
|
+
"You?"
|
76
|
+
end
|
77
|
+
|
78
|
+
def english_leading_question
|
79
|
+
"On which side of the road do you drive? In North America we drive on the right side."
|
80
|
+
end
|
81
|
+
|
82
|
+
def english_usa_constitution_preamble
|
83
|
+
"United States of America 1789 (rev. 1992)\r\nPREAMBLE\r\nWe the People of the United States, in Order to form a more perfect Union, establish Justice, insure domestic Tranquility, provide for the common defense, promote the general Welfare, and secure the Blessings of Liberty to ourselves and our Posterity, do ordain and establish this Constitution for the United States of America."
|
84
|
+
end
|
85
|
+
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'minitest/autorun'
|
4
|
+
require 'nlp_pure/segmenting/default_sentence'
|
5
|
+
require_relative '../../../fixtures/corpus_english_simple'
|
6
|
+
|
7
|
+
#
|
8
|
+
class TestNlpPureSegmentingDefaultSentence < Minitest::Test
|
9
|
+
describe '[module]' do
|
10
|
+
def test_module_is_defined
|
11
|
+
assert_equal defined?(NlpPure::Segmenting::DefaultSentence), 'constant'
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
describe '(English language)' do
|
16
|
+
include ::CorpusEnglishSimple
|
17
|
+
|
18
|
+
describe '.parse' do
|
19
|
+
describe 'with `nil` argument' do
|
20
|
+
def test_parse_returns_array
|
21
|
+
assert_equal [], NlpPure::Segmenting::DefaultSentence.parse(nil)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
describe 'without arguments' do
|
26
|
+
def test_parse_returns_nil
|
27
|
+
assert_nil NlpPure::Segmenting::DefaultSentence.parse
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
describe 'with strings' do
|
32
|
+
def test_parse_returns_sentence_array
|
33
|
+
assert_instance_of Array, NlpPure::Segmenting::DefaultSentence.parse(english_simple_sentence)
|
34
|
+
end
|
35
|
+
|
36
|
+
def test_parse_correctly_counts_sentences
|
37
|
+
assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_simple_sentence).length
|
38
|
+
end
|
39
|
+
|
40
|
+
def test_parse_correctly_sentence_segments_hyphens
|
41
|
+
assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_hyphen_sentence).length
|
42
|
+
end
|
43
|
+
|
44
|
+
def test_parse_correctly_sentence_segments_doublehyphen_dashes
|
45
|
+
assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_twohyphen_sentence).length
|
46
|
+
end
|
47
|
+
|
48
|
+
def test_parse_correctly_sentence_segments_dashes
|
49
|
+
assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_dash_sentence).length
|
50
|
+
end
|
51
|
+
|
52
|
+
def test_parse_correctly_sentence_segments_spaced_dashes
|
53
|
+
assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_spaced_dash_sentence).length
|
54
|
+
end
|
55
|
+
|
56
|
+
def test_parse_correctly_sentence_segments_ellipses
|
57
|
+
assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_ellipsis_sentence).length
|
58
|
+
end
|
59
|
+
|
60
|
+
def test_parse_correctly_sentence_segments_spaced_ellipses
|
61
|
+
assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_spaced_ellipsis_sentence).length
|
62
|
+
end
|
63
|
+
|
64
|
+
def test_parse_correctly_sentence_segments_periodellipses
|
65
|
+
assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_period_ellipsis_sentence).length
|
66
|
+
end
|
67
|
+
|
68
|
+
def test_parse_correctly_sentence_segments_spaced_periodellipses
|
69
|
+
assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_spaced_period_ellipsis_sentence).length
|
70
|
+
end
|
71
|
+
|
72
|
+
def test_parse_correctly_sentence_segments_leading_spaced_periodellipses
|
73
|
+
assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_leading_ellipsis_sentence).length
|
74
|
+
end
|
75
|
+
|
76
|
+
def test_parse_correctly_sentence_segments_trailing_spaced_periodellipses
|
77
|
+
assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_trailing_ellipsis_sentence).length
|
78
|
+
end
|
79
|
+
|
80
|
+
def test_parse_does_not_sentence_segment_abbreviations
|
81
|
+
assert_equal NlpPure::Segmenting::DefaultSentence.parse(english_abbreviation_sentence),
|
82
|
+
["The U.S.A. is a member of NATO."]
|
83
|
+
assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_abbreviation_sentence).length
|
84
|
+
end
|
85
|
+
|
86
|
+
def test_parse_does_not_sentence_segment_financial_jargon
|
87
|
+
skip("FIXME: financial jargon is hard")
|
88
|
+
assert_equal NlpPure::Segmenting::DefaultSentence.parse(english_financial_sentence),
|
89
|
+
[english_financial_sentence]
|
90
|
+
assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_financial_sentence).length
|
91
|
+
end
|
92
|
+
|
93
|
+
def test_parse_correctly_sentence_segments_longer_texts
|
94
|
+
assert_equal NlpPure::Segmenting::DefaultSentence.parse(english_simple_paragraph),
|
95
|
+
["Mary had a little lamb.", "The lamb’s fleece was white as snow.", "Everywhere that Mary went, the lamb was sure to go."]
|
96
|
+
assert_equal 3, NlpPure::Segmenting::DefaultSentence.parse(english_simple_paragraph).length
|
97
|
+
end
|
98
|
+
|
99
|
+
def test_parse_correctly_sentence_segments_line_breaks
|
100
|
+
assert_equal 1, NlpPure::Segmenting::DefaultSentence.parse(english_simple_line_breaks).length
|
101
|
+
end
|
102
|
+
|
103
|
+
def test_parse_correctly_sentence_segments_exclamations
|
104
|
+
assert_equal NlpPure::Segmenting::DefaultSentence.parse(english_excalamations),
|
105
|
+
["I am excited!", "Today is Friday."]
|
106
|
+
assert_equal 2, NlpPure::Segmenting::DefaultSentence.parse(english_excalamations).length
|
107
|
+
end
|
108
|
+
|
109
|
+
def test_parse_correctly_sentence_segments_questions
|
110
|
+
assert_equal NlpPure::Segmenting::DefaultSentence.parse(english_leading_question),
|
111
|
+
["On which side of the road do you drive?", "In North America we drive on the right side."]
|
112
|
+
assert_equal 2, NlpPure::Segmenting::DefaultSentence.parse(english_leading_question).length
|
113
|
+
end
|
114
|
+
|
115
|
+
def test_parse_correctly_sentence_usa_constitution_preamble
|
116
|
+
assert_equal NlpPure::Segmenting::DefaultSentence.parse(english_usa_constitution_preamble)[0],
|
117
|
+
"United States of America 1789 (rev. 1992)"
|
118
|
+
assert_equal 3, NlpPure::Segmenting::DefaultSentence.parse(english_usa_constitution_preamble).length
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
123
|
+
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'minitest/autorun'
|
4
|
+
require 'nlp_pure/segmenting/default_word'
|
5
|
+
require_relative '../../../fixtures/corpus_english_simple'
|
6
|
+
|
7
|
+
#
|
8
|
+
class TestNlpPureSegmentingDefaultWord < Minitest::Test
|
9
|
+
describe '[module]' do
|
10
|
+
def test_module_is_defined
|
11
|
+
assert_equal defined?(NlpPure::Segmenting::DefaultWord), 'constant'
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
describe '(English language)' do
|
16
|
+
include ::CorpusEnglishSimple
|
17
|
+
|
18
|
+
describe '.parse' do
|
19
|
+
describe 'with `nil` argument' do
|
20
|
+
def test_parse_returns_array
|
21
|
+
assert_equal [], NlpPure::Segmenting::DefaultWord.parse(nil)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
describe 'without arguments' do
|
26
|
+
def test_parse_returns_nil
|
27
|
+
assert_nil NlpPure::Segmenting::DefaultWord.parse
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def test_parse_returns_word_array
|
32
|
+
assert_instance_of Array, NlpPure::Segmenting::DefaultWord.parse(english_simple_sentence)
|
33
|
+
end
|
34
|
+
|
35
|
+
def test_parse_correctly_counts_words
|
36
|
+
assert_equal 9, NlpPure::Segmenting::DefaultWord.parse(english_simple_sentence).length
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_parse_does_not_mangle_english_simple_sentence
|
40
|
+
assert_equal english_simple_sentence, NlpPure::Segmenting::DefaultWord.parse(english_simple_sentence).join(NlpPure::Segmenting::DefaultWord.options[:segment_boundary])
|
41
|
+
end
|
42
|
+
|
43
|
+
def test_parse_correctly_word_segments_hyphens
|
44
|
+
assert_equal 8, NlpPure::Segmenting::DefaultWord.parse(english_hyphen_sentence).length
|
45
|
+
end
|
46
|
+
|
47
|
+
def test_parse_does_not_mangle_english_hyphen_sentence
|
48
|
+
skip("FIXME")
|
49
|
+
assert_equal english_simple_sentence, NlpPure::Segmenting::DefaultWord.parse(english_simple_sentence).join(NlpPure::Segmenting::DefaultWord.options[:segment_boundary])
|
50
|
+
end
|
51
|
+
|
52
|
+
def test_parse_correctly_word_segments_doublehyphen_dashes
|
53
|
+
assert_equal 12, NlpPure::Segmenting::DefaultWord.parse(english_twohyphen_sentence).length
|
54
|
+
end
|
55
|
+
|
56
|
+
def test_parse_does_not_mangle_english_twohyphen_sentence
|
57
|
+
skip("FIXME")
|
58
|
+
assert_equal english_twohyphen_sentence, NlpPure::Segmenting::DefaultWord.parse(english_twohyphen_sentence).join(NlpPure::Segmenting::DefaultWord.options[:segment_boundary])
|
59
|
+
end
|
60
|
+
|
61
|
+
def test_parse_correctly_word_segments_dashes
|
62
|
+
assert_equal 12, NlpPure::Segmenting::DefaultWord.parse(english_dash_sentence).length
|
63
|
+
end
|
64
|
+
|
65
|
+
def test_parse_correctly_word_segments_spaced_dashes
|
66
|
+
assert_equal 12, NlpPure::Segmenting::DefaultWord.parse(english_spaced_dash_sentence).length
|
67
|
+
end
|
68
|
+
|
69
|
+
def test_parse_correctly_word_segments_ellipses
|
70
|
+
assert_equal 9, NlpPure::Segmenting::DefaultWord.parse(english_ellipsis_sentence).length
|
71
|
+
end
|
72
|
+
|
73
|
+
def test_parse_correctly_word_segments_spaced_ellipses
|
74
|
+
assert_equal 9, NlpPure::Segmenting::DefaultWord.parse(english_spaced_ellipsis_sentence).length
|
75
|
+
end
|
76
|
+
|
77
|
+
def test_parse_correctly_word_segments_periodellipses
|
78
|
+
assert_equal 9, NlpPure::Segmenting::DefaultWord.parse(english_period_ellipsis_sentence).length
|
79
|
+
end
|
80
|
+
|
81
|
+
def test_parse_correctly_word_segments_spaced_periodellipses
|
82
|
+
assert_equal 9, NlpPure::Segmenting::DefaultWord.parse(english_spaced_period_ellipsis_sentence).length
|
83
|
+
end
|
84
|
+
|
85
|
+
def test_parse_correctly_word_segments_leading_spaced_periodellipses
|
86
|
+
assert_equal 9, NlpPure::Segmenting::DefaultWord.parse(english_leading_ellipsis_sentence).length
|
87
|
+
end
|
88
|
+
|
89
|
+
def test_parse_correctly_word_segments_trailing_spaced_periodellipses
|
90
|
+
assert_equal 9, NlpPure::Segmenting::DefaultWord.parse(english_trailing_ellipsis_sentence).length
|
91
|
+
end
|
92
|
+
|
93
|
+
def test_parse_does_not_word_segment_abbreviations
|
94
|
+
assert_equal 7, NlpPure::Segmenting::DefaultWord.parse(english_abbreviation_sentence).length
|
95
|
+
end
|
96
|
+
|
97
|
+
def test_parse_correctly_word_segments_longer_texts
|
98
|
+
assert_equal 22, NlpPure::Segmenting::DefaultWord.parse(english_simple_paragraph).length
|
99
|
+
end
|
100
|
+
|
101
|
+
def test_parse_correctly_word_segments_line_breaks
|
102
|
+
assert_equal 22, NlpPure::Segmenting::DefaultWord.parse(english_simple_line_breaks).length
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'minitest/autorun'
|
4
|
+
require 'nlp_pure/segmenting'
|
5
|
+
|
6
|
+
#
|
7
|
+
class TestNlpPureSegmenting < Minitest::Test
|
8
|
+
describe '[module]' do
|
9
|
+
def test_module_is_defined
|
10
|
+
assert_equal defined?(NlpPure::Segmenting), 'constant'
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
data/test/test_helper.rb
ADDED
metadata
CHANGED
@@ -1,55 +1,55 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nlp-pure
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Reid Parham
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-04-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - ~>
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '10.4'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - ~>
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '10.4'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: minitest
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - ~>
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '5.5'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - ~>
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '5.5'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: coveralls
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- -
|
45
|
+
- - ~>
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '0.7'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- -
|
52
|
+
- - ~>
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0.7'
|
55
55
|
description: Natural language processing algorithms implemented in pure Ruby with
|
@@ -60,27 +60,29 @@ executables: []
|
|
60
60
|
extensions: []
|
61
61
|
extra_rdoc_files: []
|
62
62
|
files:
|
63
|
-
-
|
64
|
-
-
|
65
|
-
-
|
66
|
-
-
|
63
|
+
- .gitignore
|
64
|
+
- .rspec
|
65
|
+
- .rubocop.yml
|
66
|
+
- .travis.yml
|
67
67
|
- CHANGELOG.md
|
68
68
|
- CONTRIBUTING.md
|
69
69
|
- Gemfile
|
70
|
-
- Guardfile
|
71
70
|
- LICENSE
|
72
71
|
- README.md
|
73
72
|
- Rakefile
|
74
73
|
- lib/nlp_pure.rb
|
75
74
|
- lib/nlp_pure/logging.rb
|
76
75
|
- lib/nlp_pure/segmenting.rb
|
76
|
+
- lib/nlp_pure/segmenting/default_sentence.rb
|
77
77
|
- lib/nlp_pure/segmenting/default_word.rb
|
78
78
|
- lib/nlp_pure/version.rb
|
79
79
|
- nlp-pure.gemspec
|
80
|
-
-
|
81
|
-
-
|
82
|
-
-
|
83
|
-
-
|
80
|
+
- test/fixtures/corpus_english_simple.rb
|
81
|
+
- test/lib/nlp_pure/segmenting/default_sentence_test.rb
|
82
|
+
- test/lib/nlp_pure/segmenting/default_word_test.rb
|
83
|
+
- test/lib/nlp_pure/segmenting_test.rb
|
84
|
+
- test/lib/nlp_pure_test.rb
|
85
|
+
- test/test_helper.rb
|
84
86
|
homepage: https://github.com/parhamr/nlp-pure
|
85
87
|
licenses:
|
86
88
|
- MIT
|
@@ -91,24 +93,19 @@ require_paths:
|
|
91
93
|
- lib
|
92
94
|
required_ruby_version: !ruby/object:Gem::Requirement
|
93
95
|
requirements:
|
94
|
-
- -
|
96
|
+
- - '>='
|
95
97
|
- !ruby/object:Gem::Version
|
96
98
|
version: '0'
|
97
99
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
98
100
|
requirements:
|
99
|
-
- -
|
101
|
+
- - '>='
|
100
102
|
- !ruby/object:Gem::Version
|
101
103
|
version: '0'
|
102
104
|
requirements: []
|
103
105
|
rubyforge_project:
|
104
|
-
rubygems_version: 2.
|
106
|
+
rubygems_version: 2.0.14.1
|
105
107
|
signing_key:
|
106
108
|
specification_version: 4
|
107
109
|
summary: Natural language processing algorithms implemented in pure Ruby with minimal
|
108
110
|
dependencies
|
109
|
-
test_files:
|
110
|
-
- spec/lib/nlp_pure_spec.rb
|
111
|
-
- spec/lib/segmenting/default_word_spec.rb
|
112
|
-
- spec/lib/segmenting_spec.rb
|
113
|
-
- spec/spec_helper.rb
|
114
|
-
has_rdoc:
|
111
|
+
test_files: []
|
data/Guardfile
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
guard :rspec, cmd: "bundle exec rspec", all_on_start: false, all_after_pass: false, failed_mode: :none do
|
2
|
-
require "guard/rspec/dsl"
|
3
|
-
dsl = Guard::RSpec::Dsl.new(self)
|
4
|
-
|
5
|
-
# RSpec files
|
6
|
-
rspec = dsl.rspec
|
7
|
-
watch(rspec.spec_helper) { rspec.spec_dir }
|
8
|
-
watch(rspec.spec_support) { rspec.spec_dir }
|
9
|
-
watch(rspec.spec_files)
|
10
|
-
|
11
|
-
# Ruby files
|
12
|
-
ruby = dsl.ruby
|
13
|
-
dsl.watch_spec_files_for(ruby.lib_files)
|
14
|
-
|
15
|
-
end
|
16
|
-
|
17
|
-
guard :rubocop, all_on_start: false, keep_failed: false do
|
18
|
-
watch(%r{.+\.rb$})
|
19
|
-
watch(%r{(?:.+/)?\.rubocop\.yml$}) { |m| File.dirname(m[0]) }
|
20
|
-
end
|
data/spec/lib/nlp_pure_spec.rb
DELETED
@@ -1,207 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
require 'spec_helper'
|
3
|
-
require 'nlp_pure/segmenting/default_word'
|
4
|
-
|
5
|
-
describe NlpPure::Segmenting::DefaultWord do
|
6
|
-
describe '[module]' do
|
7
|
-
it 'is defined' do
|
8
|
-
expect(defined?(NlpPure::Segmenting::DefaultWord)).to be_truthy
|
9
|
-
end
|
10
|
-
|
11
|
-
describe '::DEFAULT_OPTIONS' do
|
12
|
-
it 'is Hash' do
|
13
|
-
expect(NlpPure::Segmenting::DefaultWord::DEFAULT_OPTIONS).to be_a Hash
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
describe '.parse' do
|
19
|
-
context 'English' do
|
20
|
-
let(:english_simple_sentence) { 'The quick brown fox jumps over the lazy dog.' }
|
21
|
-
let(:english_hyphen_sentence) { 'The New York-based company hired new staff.' }
|
22
|
-
let(:english_dash_sentence) { 'The quick brown fox—full of energy—jumps over the lazy dog.' }
|
23
|
-
let(:english_spaced_dash_sentence) { 'The quick brown fox — full of energy — jumps over the lazy dog.' }
|
24
|
-
let(:english_twohyphen_sentence) { 'The quick brown fox--full of energy--jumps over the lazy dog.' }
|
25
|
-
let(:english_ellipsis_sentence) { 'The quick brown fox…jumps over the lazy dog.' }
|
26
|
-
let(:english_spaced_ellipsis_sentence) { 'The quick brown fox … jumps over the lazy dog.' }
|
27
|
-
let(:english_period_ellipsis_sentence) { 'The quick brown fox...jumps over the lazy dog.' }
|
28
|
-
let(:english_leading_ellipsis_sentence) { ' … the quick brown fox jumps over the lazy dog.' }
|
29
|
-
let(:english_leading_period_ellipsis_sentence) { ' ... the quick brown fox jumps over the lazy dog.' }
|
30
|
-
let(:english_trailing_ellipsis_sentence) { 'The quick brown fox jumps over the lazy dog … ' }
|
31
|
-
let(:english_spaced_period_ellipsis_sentence) { 'The quick brown fox ... jumps over the lazy dog.' }
|
32
|
-
let(:english_abbreviation_sentence) { 'The U.S.A. is a member of NATO.' }
|
33
|
-
let(:english_simple_paragraph) { 'Mary had a little lamb. The lamb’s fleece was white as snow. Everywhere that Mary went, the lamb was sure to go.' }
|
34
|
-
let(:english_simple_line_breaks) { "Mary had a little lamb,\nHis fleece was white as snow,\nAnd everywhere that Mary went,\nThe lamb was sure to go." }
|
35
|
-
|
36
|
-
context '(with nil options)' do
|
37
|
-
before do
|
38
|
-
expect(NlpPure::Segmenting::DefaultWord).to receive(:options).at_least(:once).and_return(nil)
|
39
|
-
end
|
40
|
-
|
41
|
-
it 'raises NoMethodError' do
|
42
|
-
expect { NlpPure::Segmenting::DefaultWord.parse(english_simple_sentence) }.to raise_error
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
context '(with blank options)' do
|
47
|
-
before do
|
48
|
-
expect(NlpPure::Segmenting::DefaultWord).to receive(:options).at_least(:once).and_return({})
|
49
|
-
end
|
50
|
-
|
51
|
-
it 'returns Array' do
|
52
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(english_simple_sentence)).to be_an Array
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
context '(with default options)' do
|
57
|
-
context 'with `nil` argument' do
|
58
|
-
it 'does not raise error' do
|
59
|
-
expect { NlpPure::Segmenting::DefaultWord.parse(nil) }.to_not raise_error
|
60
|
-
end
|
61
|
-
|
62
|
-
it 'returns Array' do
|
63
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(nil)).to be_an Array
|
64
|
-
end
|
65
|
-
end
|
66
|
-
|
67
|
-
context 'without arguments' do
|
68
|
-
it 'does not raise error' do
|
69
|
-
expect { NlpPure::Segmenting::DefaultWord.parse }.to_not raise_error
|
70
|
-
end
|
71
|
-
|
72
|
-
it 'returns nil' do
|
73
|
-
expect(NlpPure::Segmenting::DefaultWord.parse).to eq nil
|
74
|
-
end
|
75
|
-
end
|
76
|
-
|
77
|
-
it 'returns Array' do
|
78
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(english_simple_sentence)).to be_an Array
|
79
|
-
end
|
80
|
-
|
81
|
-
it 'correctly counts words' do
|
82
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(english_simple_sentence).length).to eq(9)
|
83
|
-
end
|
84
|
-
|
85
|
-
it 'correctly segments hyphens' do
|
86
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(english_hyphen_sentence).length).to eq(8)
|
87
|
-
end
|
88
|
-
|
89
|
-
it 'correctly segments double-hyphen dashes' do
|
90
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(english_twohyphen_sentence).length).to eq(12)
|
91
|
-
end
|
92
|
-
|
93
|
-
it 'correctly segments dashes' do
|
94
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(english_spaced_dash_sentence).length).to eq(12)
|
95
|
-
end
|
96
|
-
|
97
|
-
it 'correctly segments spaced dashes' do
|
98
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(english_dash_sentence).length).to eq(12)
|
99
|
-
end
|
100
|
-
|
101
|
-
it 'correctly segments ellipses' do
|
102
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(english_ellipsis_sentence).length).to eq(9)
|
103
|
-
end
|
104
|
-
|
105
|
-
it 'correctly segments spaced ellipses' do
|
106
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(english_spaced_ellipsis_sentence).length).to eq(9)
|
107
|
-
end
|
108
|
-
|
109
|
-
it 'correctly segments period-ellipses' do
|
110
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(english_period_ellipsis_sentence).length).to eq(9)
|
111
|
-
end
|
112
|
-
|
113
|
-
it 'correctly segments spaced period-ellipses' do
|
114
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(english_spaced_period_ellipsis_sentence).length).to eq(9)
|
115
|
-
end
|
116
|
-
|
117
|
-
it 'correctly segments with leading, spaced ellipses' do
|
118
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(english_leading_ellipsis_sentence).length).to eq(9)
|
119
|
-
end
|
120
|
-
|
121
|
-
it 'correctly segments with trailing, spaced ellipses' do
|
122
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(english_trailing_ellipsis_sentence).length).to eq(9)
|
123
|
-
end
|
124
|
-
|
125
|
-
it 'does not segment abbreviations' do
|
126
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(english_abbreviation_sentence).length).to eq(7)
|
127
|
-
end
|
128
|
-
|
129
|
-
it 'correctly counts with longer texts' do
|
130
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(english_simple_paragraph).length).to eq(22)
|
131
|
-
end
|
132
|
-
|
133
|
-
it 'correctly counts with line breaks' do
|
134
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(english_simple_line_breaks).length).to eq(22)
|
135
|
-
end
|
136
|
-
|
137
|
-
context 'benchmarking' do
|
138
|
-
before do
|
139
|
-
require 'benchmark'
|
140
|
-
end
|
141
|
-
|
142
|
-
it 'takes time', benchmarking: true do
|
143
|
-
expect(
|
144
|
-
Benchmark.realtime do
|
145
|
-
1000.times do
|
146
|
-
NlpPure::Segmenting::DefaultWord.parse(english_simple_line_breaks)
|
147
|
-
end
|
148
|
-
end
|
149
|
-
).to be < 0.1
|
150
|
-
end
|
151
|
-
end
|
152
|
-
end
|
153
|
-
end
|
154
|
-
end
|
155
|
-
|
156
|
-
describe '.clean_input' do
|
157
|
-
context 'English' do
|
158
|
-
let(:english_leading_ellipsis_sentence) { ' … the quick brown fox jumps over the lazy dog.' }
|
159
|
-
|
160
|
-
context '(with nil options)' do
|
161
|
-
before do
|
162
|
-
expect(NlpPure::Segmenting::DefaultWord).to receive(:options).at_least(:once).and_return(nil)
|
163
|
-
end
|
164
|
-
|
165
|
-
it 'raises NoMethodError' do
|
166
|
-
expect { NlpPure::Segmenting::DefaultWord.clean_input(english_leading_ellipsis_sentence) }.to raise_error
|
167
|
-
end
|
168
|
-
end
|
169
|
-
|
170
|
-
context '(with blank options)' do
|
171
|
-
before do
|
172
|
-
expect(NlpPure::Segmenting::DefaultWord).to receive(:options).at_least(:once).and_return({})
|
173
|
-
end
|
174
|
-
|
175
|
-
it 'only strips whitespace' do
|
176
|
-
expect(NlpPure::Segmenting::DefaultWord.clean_input(english_leading_ellipsis_sentence)).to eq english_leading_ellipsis_sentence.strip
|
177
|
-
end
|
178
|
-
end
|
179
|
-
|
180
|
-
context '(with default options)' do
|
181
|
-
context 'with `nil` argument' do
|
182
|
-
it 'does not raise error' do
|
183
|
-
expect { NlpPure::Segmenting::DefaultWord.clean_input(nil) }.to_not raise_error
|
184
|
-
end
|
185
|
-
|
186
|
-
it 'returns empty String' do
|
187
|
-
expect(NlpPure::Segmenting::DefaultWord.clean_input(nil)).to eq ''
|
188
|
-
end
|
189
|
-
end
|
190
|
-
|
191
|
-
context 'without arguments' do
|
192
|
-
it 'does not raise error' do
|
193
|
-
expect { NlpPure::Segmenting::DefaultWord.clean_input }.to_not raise_error
|
194
|
-
end
|
195
|
-
|
196
|
-
it 'returns nil' do
|
197
|
-
expect(NlpPure::Segmenting::DefaultWord.clean_input).to eq ''
|
198
|
-
end
|
199
|
-
end
|
200
|
-
|
201
|
-
it 'modifies the input' do
|
202
|
-
expect(NlpPure::Segmenting::DefaultWord.clean_input(english_leading_ellipsis_sentence)).to_not eq english_leading_ellipsis_sentence
|
203
|
-
end
|
204
|
-
end
|
205
|
-
end
|
206
|
-
end
|
207
|
-
end
|
data/spec/lib/segmenting_spec.rb
DELETED
data/spec/spec_helper.rb
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
require 'rspec'
|
3
|
-
require 'coveralls'
|
4
|
-
|
5
|
-
Coveralls.wear! do
|
6
|
-
add_filter '/vendor/'
|
7
|
-
add_filter '/test/'
|
8
|
-
add_filter '/tmp/'
|
9
|
-
add_filter '/spec/'
|
10
|
-
end
|
11
|
-
|
12
|
-
RSpec.configure do |config|
|
13
|
-
config.expect_with :rspec do |c|
|
14
|
-
c.syntax = :expect
|
15
|
-
end
|
16
|
-
end
|