nlp-pure 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7adc921eb51b54bd646cc0c7c57edbfd47a4d7fb
4
- data.tar.gz: 54930de1e0cd9f5507bde731e057f22942f1d212
3
+ metadata.gz: 8ae3951baabcafe913e157a575e3dc718a646f16
4
+ data.tar.gz: 14a6567449629a482bdc8863ffbfd04ae72af61b
5
5
  SHA512:
6
- metadata.gz: a0ee82f8e519e712d36f89af779e9b42aed4887e0b87fa5c77e6b422f5b8ab6d66afc702c650f3bd9149a80d7c2fbb1434504fb53e132164c5dafb8c520b5102
7
- data.tar.gz: db7bf9e6178c2e25a5d5bcdde763dbeb8fc9d3130800a0bfd41a8a6abae1ab64ad012ca27bd7334c241e8c4486566cd6f60a8f9c173086c53747f3296d7e9fed
6
+ metadata.gz: f1766d42dd2916bdb0491448a9db0122b86f31325e3f12ce94c0d6b403cf5ecf50e4e95139f018f76896c1ef432e71a11dc36d8c2c597dc0870f400fb56bfeae
7
+ data.tar.gz: b3baa2f16339813070ffa978e03e8972046baebba815259d695da517baedc2830a55d0cfa75dcd234e307d2f09684194c02aa83a3b919f3db08be1426eb71537
@@ -1,3 +1,7 @@
1
+ # 0.0.5
2
+
3
+ Fixed bug in `NlpPure::Segmenting::DefaultWord` where leading ellipses could produce extra segmented words.
4
+
1
5
  # 0.0.4
2
6
 
3
7
  Fixed bug in `NlpPure::Segmenting::DefaultWord` where ellipses without spaces would not segment.
@@ -7,13 +7,29 @@ module NlpPure
7
7
  DEFAULT_OPTIONS = {
8
8
  # 3+ periods as pseudo-ellipsis (with optional whitespace)
9
9
  # OR hyphen, en dash, em dash, and whitespace
10
- split: /\s?\.{3,}+\s?|[\s\-–—…]+/
10
+ split: /\s?\.{3,}\s?|[\s\-–—…]+/,
11
+ # array of arrays; [0] should be regexp, [1] should be replacement
12
+ # NOTE: minor performance risk in letting this array grow long
13
+ gsub: [
14
+ # ellipses at the start of a string are problematic; ref #12
15
+ [/^\s?(…|\.{3,})/, ' ']
16
+ ]
11
17
  }.freeze
18
+
12
19
  def self.parse(*args)
13
20
  unless args.nil? || args.empty?
14
- input = args[0].to_s
15
- input.split(options[:split])
21
+ clean_input(args[0]).split(options[:split])
22
+ end
23
+ end
24
+
25
+ def self.clean_input(text = nil)
26
+ input = text.to_s
27
+ # perform replacements to work around the limitations of the splitting regexp
28
+ options.fetch(:gsub, []).each do |gsub_pair|
29
+ input.gsub!(gsub_pair[0], gsub_pair[1])
16
30
  end
31
+ # NOTE: leading whitespace is problematic; ref #12
32
+ input.strip
17
33
  end
18
34
 
19
35
  # NOTE: exposed as a method for easy mock/stub
@@ -1,5 +1,5 @@
1
1
  # encoding: utf-8
2
2
  #
3
3
  module NlpPure
4
- VERSION = '0.0.4'
4
+ VERSION = '0.0.5'
5
5
  end
@@ -18,9 +18,11 @@ describe NlpPure::Segmenting::DefaultWord do
18
18
  let(:english_twohyphen_sentence) { 'The quick brown fox--full of energy--jumps over the lazy dog.' }
19
19
  let(:english_ellipsis_sentence) { 'The quick brown fox…jumps over the lazy dog.' }
20
20
  let(:english_spaced_ellipsis_sentence) { 'The quick brown fox … jumps over the lazy dog.' }
21
- let(:english_period_ellipses_sentence) { 'The quick brown fox...jumps over the lazy dog.' }
22
- let(:english_trailing_ellipses_sentence) { 'The quick brown fox jumps over the lazy dog' }
23
- let(:english_spaced_period_ellipses_sentence) { 'The quick brown fox ... jumps over the lazy dog.' }
21
+ let(:english_period_ellipsis_sentence) { 'The quick brown fox...jumps over the lazy dog.' }
22
+ let(:english_leading_ellipsis_sentence) { ' … the quick brown fox jumps over the lazy dog.' }
23
+ let(:english_leading_period_ellipsis_sentence) { ' ... the quick brown fox jumps over the lazy dog.' }
24
+ let(:english_trailing_ellipsis_sentence) { 'The quick brown fox jumps over the lazy dog … ' }
25
+ let(:english_spaced_period_ellipsis_sentence) { 'The quick brown fox ... jumps over the lazy dog.' }
24
26
  let(:english_abbreviation_sentence) { 'The U.S.A. is a member of NATO.' }
25
27
  let(:english_simple_paragraph) { 'Mary had a little lamb. The lamb’s fleece was white as snow. Everywhere that Mary went, the lamb was sure to go.' }
26
28
  let(:english_simple_line_breaks) { "Mary had a little lamb,\nHis fleece was white as snow,\nAnd everywhere that Mary went,\nThe lamb was sure to go." }
@@ -79,15 +81,19 @@ describe NlpPure::Segmenting::DefaultWord do
79
81
  end
80
82
 
81
83
  it 'correctly segments period-ellipses' do
82
- expect(NlpPure::Segmenting::DefaultWord.parse(english_period_ellipses_sentence).length).to eq(9)
84
+ expect(NlpPure::Segmenting::DefaultWord.parse(english_period_ellipsis_sentence).length).to eq(9)
83
85
  end
84
86
 
85
87
  it 'correctly segments spaced period-ellipses' do
86
- expect(NlpPure::Segmenting::DefaultWord.parse(english_spaced_period_ellipses_sentence).length).to eq(9)
88
+ expect(NlpPure::Segmenting::DefaultWord.parse(english_spaced_period_ellipsis_sentence).length).to eq(9)
89
+ end
90
+
91
+ it 'correctly segments with leading, spaced ellipses' do
92
+ expect(NlpPure::Segmenting::DefaultWord.parse(english_leading_ellipsis_sentence).length).to eq(9)
87
93
  end
88
94
 
89
95
  it 'correctly segments with trailing, spaced ellipses' do
90
- expect(NlpPure::Segmenting::DefaultWord.parse(english_trailing_ellipses_sentence).length).to eq(9)
96
+ expect(NlpPure::Segmenting::DefaultWord.parse(english_trailing_ellipsis_sentence).length).to eq(9)
91
97
  end
92
98
 
93
99
  it 'does not segment abbreviations' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nlp-pure
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Reid Parham