nlp-pure 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/nlp_pure/segmenting/default_word.rb +19 -3
- data/lib/nlp_pure/version.rb +1 -1
- data/spec/lib/segmenting/default_word_spec.rb +12 -6
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8ae3951baabcafe913e157a575e3dc718a646f16
|
4
|
+
data.tar.gz: 14a6567449629a482bdc8863ffbfd04ae72af61b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f1766d42dd2916bdb0491448a9db0122b86f31325e3f12ce94c0d6b403cf5ecf50e4e95139f018f76896c1ef432e71a11dc36d8c2c597dc0870f400fb56bfeae
|
7
|
+
data.tar.gz: b3baa2f16339813070ffa978e03e8972046baebba815259d695da517baedc2830a55d0cfa75dcd234e307d2f09684194c02aa83a3b919f3db08be1426eb71537
|
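data/CHANGELOG.md
CHANGED
[hunk for this file (+4 -0) is missing from this extract]
data/lib/nlp_pure/segmenting/default_word.rb
CHANGED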
@@ -7,13 +7,29 @@ module NlpPure
|
|
7
7
|
DEFAULT_OPTIONS = {
|
8
8
|
# 3+ periods as pseudo-ellipsis (with optional whitespace)
|
9
9
|
# OR hyphen, en dash, em dash, and whitespace
|
10
|
-
split: /\s?\.{3,}
|
10
|
+
split: /\s?\.{3,}\s?|[\s\-–—…]+/,
|
11
|
+
# array of arrays; [0] should be regexp, [1] should be replacement
|
12
|
+
# NOTE: minor performance risk in letting this array grow long
|
13
|
+
gsub: [
|
14
|
+
# ellipses at the start of a string are problematic; ref #12
|
15
|
+
[/^\s?(…|\.{3,})/, ' ']
|
16
|
+
]
|
11
17
|
}.freeze
|
18
|
+
|
12
19
|
def self.parse(*args)
|
13
20
|
unless args.nil? || args.empty?
|
14
|
-
|
15
|
-
|
21
|
+
clean_input(args[0]).split(options[:split])
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def self.clean_input(text = nil)
|
26
|
+
input = text.to_s
|
27
|
+
# perform replacements to work around the limitations of the splitting regexp
|
28
|
+
options.fetch(:gsub, []).each do |gsub_pair|
|
29
|
+
input.gsub!(gsub_pair[0], gsub_pair[1])
|
16
30
|
end
|
31
|
+
# NOTE: leading whitespace is problematic; ref #12
|
32
|
+
input.strip
|
17
33
|
end
|
18
34
|
|
19
35
|
# NOTE: exposed as a method for easy mock/stub
|
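data/lib/nlp_pure/version.rb
CHANGED
[hunk for this file (+1 -1) is missing from this extract]
data/spec/lib/segmenting/default_word_spec.rb
CHANGED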
@@ -18,9 +18,11 @@ describe NlpPure::Segmenting::DefaultWord do
|
|
18
18
|
let(:english_twohyphen_sentence) { 'The quick brown fox--full of energy--jumps over the lazy dog.' }
|
19
19
|
let(:english_ellipsis_sentence) { 'The quick brown fox…jumps over the lazy dog.' }
|
20
20
|
let(:english_spaced_ellipsis_sentence) { 'The quick brown fox … jumps over the lazy dog.' }
|
21
|
-
let(:
|
22
|
-
let(:
|
23
|
-
let(:
|
21
|
+
let(:english_period_ellipsis_sentence) { 'The quick brown fox...jumps over the lazy dog.' }
|
22
|
+
let(:english_leading_ellipsis_sentence) { ' … the quick brown fox jumps over the lazy dog.' }
|
23
|
+
let(:english_leading_period_ellipsis_sentence) { ' ... the quick brown fox jumps over the lazy dog.' }
|
24
|
+
let(:english_trailing_ellipsis_sentence) { 'The quick brown fox jumps over the lazy dog … ' }
|
25
|
+
let(:english_spaced_period_ellipsis_sentence) { 'The quick brown fox ... jumps over the lazy dog.' }
|
24
26
|
let(:english_abbreviation_sentence) { 'The U.S.A. is a member of NATO.' }
|
25
27
|
let(:english_simple_paragraph) { 'Mary had a little lamb. The lamb’s fleece was white as snow. Everywhere that Mary went, the lamb was sure to go.' }
|
26
28
|
let(:english_simple_line_breaks) { "Mary had a little lamb,\nHis fleece was white as snow,\nAnd everywhere that Mary went,\nThe lamb was sure to go." }
|
@@ -79,15 +81,19 @@ describe NlpPure::Segmenting::DefaultWord do
|
|
79
81
|
end
|
80
82
|
|
81
83
|
it 'correctly segments period-ellipses' do
|
82
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(
|
84
|
+
expect(NlpPure::Segmenting::DefaultWord.parse(english_period_ellipsis_sentence).length).to eq(9)
|
83
85
|
end
|
84
86
|
|
85
87
|
it 'correctly segments spaced period-ellipses' do
|
86
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(
|
88
|
+
expect(NlpPure::Segmenting::DefaultWord.parse(english_spaced_period_ellipsis_sentence).length).to eq(9)
|
89
|
+
end
|
90
|
+
|
91
|
+
it 'correctly segments with leading, spaced ellipses' do
|
92
|
+
expect(NlpPure::Segmenting::DefaultWord.parse(english_leading_ellipsis_sentence).length).to eq(9)
|
87
93
|
end
|
88
94
|
|
89
95
|
it 'correctly segments with trailing, spaced ellipses' do
|
90
|
-
expect(NlpPure::Segmenting::DefaultWord.parse(
|
96
|
+
expect(NlpPure::Segmenting::DefaultWord.parse(english_trailing_ellipsis_sentence).length).to eq(9)
|
91
97
|
end
|
92
98
|
|
93
99
|
it 'does not segment abbreviations' do
|