nlp-pure 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +2 -2
- data/lib/nlp_pure/segmenting/default_word.rb +3 -4
- data/lib/nlp_pure/version.rb +1 -1
- data/spec/lib/segmenting/default_word_spec.rb +35 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7adc921eb51b54bd646cc0c7c57edbfd47a4d7fb
|
4
|
+
data.tar.gz: 54930de1e0cd9f5507bde731e057f22942f1d212
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a0ee82f8e519e712d36f89af779e9b42aed4887e0b87fa5c77e6b422f5b8ab6d66afc702c650f3bd9149a80d7c2fbb1434504fb53e132164c5dafb8c520b5102
|
7
|
+
data.tar.gz: db7bf9e6178c2e25a5d5bcdde763dbeb8fc9d3130800a0bfd41a8a6abae1ab64ad012ca27bd7334c241e8c4486566cd6f60a8f9c173086c53747f3296d7e9fed
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
# 0.0.4
|
2
|
+
|
3
|
+
Fixed bug in `NlpPure::Segmenting::DefaultWord` where ellipses without spaces would not segment.
|
4
|
+
|
5
|
+
# 0.0.3
|
6
|
+
|
7
|
+
Fixed bug in `NlpPure::Segmenting::DefaultWord` where double hyphens and spaced dashes would segment as empty words.
|
8
|
+
|
1
9
|
# 0.0.2
|
2
10
|
|
3
11
|
Added `NlpPure::Segmenting::DefaultWord` module for segmenting text into words.
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
[Code Climate](https://codeclimate.com/github/parhamr/nlp-pure)
|
4
4
|
[Travis CI](https://travis-ci.org/parhamr/nlp-pure)
|
5
|
-
[Coveralls](https://coveralls.io/r/parhamr/nlp-pure)
|
5
|
+
[Coveralls](https://coveralls.io/r/parhamr/nlp-pure?branch=master)
|
6
6
|
|
7
7
|
Natural language processing algorithms implemented in pure Ruby with minimal dependencies.
|
8
8
|
|
@@ -44,7 +44,7 @@ $ gem install nlp-pure
|
|
44
44
|
|
45
45
|
```
|
46
46
|
$ bundle exec irb
|
47
|
-
irb(main):001:0>
|
47
|
+
irb(main):001:0> require 'nlp_pure/segmenting/default_word'
|
48
48
|
=> true
|
49
49
|
irb(main):002:0> NlpPure::Segmenting::DefaultWord.parse 'The quick brown fox jumps over the lazy dog.'
|
50
50
|
=> ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog."]
|
data/lib/nlp_pure/segmenting/default_word.rb
CHANGED
@@ -5,8 +5,9 @@ module NlpPure
|
|
5
5
|
#
|
6
6
|
module DefaultWord
|
7
7
|
DEFAULT_OPTIONS = {
|
8
|
-
#
|
9
|
-
|
8
|
+
# 3+ periods as pseudo-ellipsis (with optional whitespace)
|
9
|
+
# OR a run of hyphens, en dashes, em dashes, ellipsis characters (…), and whitespace
|
10
|
+
split: /\s?\.{3,}+\s?|[\s\-–—…]+/
|
10
11
|
}.freeze
|
11
12
|
def self.parse(*args)
|
12
13
|
unless args.nil? || args.empty?
|
@@ -22,5 +23,3 @@ module NlpPure
|
|
22
23
|
end
|
23
24
|
end
|
24
25
|
end
|
25
|
-
|
26
|
-
require_relative '../segmenting'
|
data/lib/nlp_pure/version.rb
CHANGED
data/spec/lib/segmenting/default_word_spec.rb
CHANGED
@@ -14,6 +14,13 @@ describe NlpPure::Segmenting::DefaultWord do
|
|
14
14
|
let(:english_simple_sentence) { 'The quick brown fox jumps over the lazy dog.' }
|
15
15
|
let(:english_hyphen_sentence) { 'The New York-based company hired new staff.' }
|
16
16
|
let(:english_dash_sentence) { 'The quick brown fox—full of energy—jumps over the lazy dog.' }
|
17
|
+
let(:english_spaced_dash_sentence) { 'The quick brown fox — full of energy — jumps over the lazy dog.' }
|
18
|
+
let(:english_twohyphen_sentence) { 'The quick brown fox--full of energy--jumps over the lazy dog.' }
|
19
|
+
let(:english_ellipsis_sentence) { 'The quick brown fox…jumps over the lazy dog.' }
|
20
|
+
let(:english_spaced_ellipsis_sentence) { 'The quick brown fox … jumps over the lazy dog.' }
|
21
|
+
let(:english_period_ellipses_sentence) { 'The quick brown fox...jumps over the lazy dog.' }
|
22
|
+
let(:english_trailing_ellipses_sentence) { 'The quick brown fox jumps over the lazy dog …' }
|
23
|
+
let(:english_spaced_period_ellipses_sentence) { 'The quick brown fox ... jumps over the lazy dog.' }
|
17
24
|
let(:english_abbreviation_sentence) { 'The U.S.A. is a member of NATO.' }
|
18
25
|
let(:english_simple_paragraph) { 'Mary had a little lamb. The lamb’s fleece was white as snow. Everywhere that Mary went, the lamb was sure to go.' }
|
19
26
|
let(:english_simple_line_breaks) { "Mary had a little lamb,\nHis fleece was white as snow,\nAnd everywhere that Mary went,\nThe lamb was sure to go." }
|
@@ -51,10 +58,38 @@ describe NlpPure::Segmenting::DefaultWord do
|
|
51
58
|
expect(NlpPure::Segmenting::DefaultWord.parse(english_hyphen_sentence).length).to eq(8)
|
52
59
|
end
|
53
60
|
|
61
|
+
it 'correctly segments double-hyphen dashes' do
|
62
|
+
expect(NlpPure::Segmenting::DefaultWord.parse(english_twohyphen_sentence).length).to eq(12)
|
63
|
+
end
|
64
|
+
|
54
65
|
it 'correctly segments dashes' do
|
66
|
+
expect(NlpPure::Segmenting::DefaultWord.parse(english_spaced_dash_sentence).length).to eq(12)
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'correctly segments spaced dashes' do
|
55
70
|
expect(NlpPure::Segmenting::DefaultWord.parse(english_dash_sentence).length).to eq(12)
|
56
71
|
end
|
57
72
|
|
73
|
+
it 'correctly segments ellipses' do
|
74
|
+
expect(NlpPure::Segmenting::DefaultWord.parse(english_ellipsis_sentence).length).to eq(9)
|
75
|
+
end
|
76
|
+
|
77
|
+
it 'correctly segments spaced ellipses' do
|
78
|
+
expect(NlpPure::Segmenting::DefaultWord.parse(english_spaced_ellipsis_sentence).length).to eq(9)
|
79
|
+
end
|
80
|
+
|
81
|
+
it 'correctly segments period-ellipses' do
|
82
|
+
expect(NlpPure::Segmenting::DefaultWord.parse(english_period_ellipses_sentence).length).to eq(9)
|
83
|
+
end
|
84
|
+
|
85
|
+
it 'correctly segments spaced period-ellipses' do
|
86
|
+
expect(NlpPure::Segmenting::DefaultWord.parse(english_spaced_period_ellipses_sentence).length).to eq(9)
|
87
|
+
end
|
88
|
+
|
89
|
+
it 'correctly segments with trailing, spaced ellipses' do
|
90
|
+
expect(NlpPure::Segmenting::DefaultWord.parse(english_trailing_ellipses_sentence).length).to eq(9)
|
91
|
+
end
|
92
|
+
|
58
93
|
it 'does not segment abbreviations' do
|
59
94
|
expect(NlpPure::Segmenting::DefaultWord.parse(english_abbreviation_sentence).length).to eq(7)
|
60
95
|
end
|