nlp-pure 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c4f6247383c48cd71d5ccebf6cc937023d1a880b
4
- data.tar.gz: b33e1b19f2bfb5d49c6082f10699f06e2052e32c
3
+ metadata.gz: 7adc921eb51b54bd646cc0c7c57edbfd47a4d7fb
4
+ data.tar.gz: 54930de1e0cd9f5507bde731e057f22942f1d212
5
5
  SHA512:
6
- metadata.gz: 7dee5b3c6947d08ef7e8b92a7332baeaaf8785969f98673613a7de2287dbba65dfb06a6f05990036013abaf01235f0017cb75cdf64be27770cc0c06046e30d99
7
- data.tar.gz: a01ebec79d05301998d3618c1cd4e1b9cfa23982b36834cee17c529559b2bf36b5a32ea11d843469574924c42be245f192a636a3ee7f288490f7516e39f0a589
6
+ metadata.gz: a0ee82f8e519e712d36f89af779e9b42aed4887e0b87fa5c77e6b422f5b8ab6d66afc702c650f3bd9149a80d7c2fbb1434504fb53e132164c5dafb8c520b5102
7
+ data.tar.gz: db7bf9e6178c2e25a5d5bcdde763dbeb8fc9d3130800a0bfd41a8a6abae1ab64ad012ca27bd7334c241e8c4486566cd6f60a8f9c173086c53747f3296d7e9fed
data/CHANGELOG.md CHANGED
@@ -1,3 +1,11 @@
1
+ # 0.0.4
2
+
3
+ Fixed bug in `NlpPure::Segmenting::DefaultWord` where ellipses without spaces would not segment.
4
+
5
+ # 0.0.3
6
+
7
+ Fixed bug in `NlpPure::Segmenting::DefaultWord` where double hyphens and spaced dashes would segment as empty words.
8
+
1
9
  # 0.0.2
2
10
 
3
11
  Added `NlpPure::Segmenting::DefaultWord` module for segmenting text into words.
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  [![Code Climate](https://codeclimate.com/github/parhamr/nlp-pure/badges/gpa.svg)](https://codeclimate.com/github/parhamr/nlp-pure)
4
4
  [![Build Status](https://travis-ci.org/parhamr/nlp-pure.svg?branch=master)](https://travis-ci.org/parhamr/nlp-pure)
5
- [![Coverage Status](https://coveralls.io/repos/parhamr/nlp-pure/badge.png?branch=master)](https://coveralls.io/r/parhamr/nlp-pure)
5
+ [![Coverage Status](https://coveralls.io/repos/parhamr/nlp-pure/badge.png?branch=master)](https://coveralls.io/r/parhamr/nlp-pure?branch=master)
6
6
 
7
7
  Natural language processing algorithms implemented in pure Ruby with minimal dependencies.
8
8
 
@@ -44,7 +44,7 @@ $ gem install nlp-pure
44
44
 
45
45
  ```
46
46
  $ bundle exec irb
47
- irb(main):001:0> require_relative './lib/nlp_pure/segmenting/default_word'
47
+ irb(main):001:0> require 'nlp_pure/segmenting/default_word'
48
48
  => true
49
49
  irb(main):002:0> NlpPure::Segmenting::DefaultWord.parse 'The quick brown fox jumps over the lazy dog.'
50
50
  => ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog."]
@@ -5,8 +5,9 @@ module NlpPure
5
5
  #
6
6
  module DefaultWord
7
7
  DEFAULT_OPTIONS = {
8
- # hyphen, en dash, em dash, and string
9
- split: /[\-–—\s]/
8
+ # 3+ periods as pseudo-ellipsis (with optional whitespace)
9
+ # OR a run of whitespace, hyphens, en dashes, em dashes, or ellipsis characters (…)
10
+ split: /\s?\.{3,}+\s?|[\s\-–—…]+/
10
11
  }.freeze
11
12
  def self.parse(*args)
12
13
  unless args.nil? || args.empty?
@@ -22,5 +23,3 @@ module NlpPure
22
23
  end
23
24
  end
24
25
  end
25
-
26
- require_relative '../segmenting'
@@ -1,5 +1,5 @@
1
1
  # encoding: utf-8
2
2
  #
3
3
  module NlpPure
4
- VERSION = '0.0.2'
4
+ VERSION = '0.0.4'
5
5
  end
@@ -14,6 +14,13 @@ describe NlpPure::Segmenting::DefaultWord do
14
14
  let(:english_simple_sentence) { 'The quick brown fox jumps over the lazy dog.' }
15
15
  let(:english_hyphen_sentence) { 'The New York-based company hired new staff.' }
16
16
  let(:english_dash_sentence) { 'The quick brown fox—full of energy—jumps over the lazy dog.' }
17
+ let(:english_spaced_dash_sentence) { 'The quick brown fox — full of energy — jumps over the lazy dog.' }
18
+ let(:english_twohyphen_sentence) { 'The quick brown fox--full of energy--jumps over the lazy dog.' }
19
+ let(:english_ellipsis_sentence) { 'The quick brown fox…jumps over the lazy dog.' }
20
+ let(:english_spaced_ellipsis_sentence) { 'The quick brown fox … jumps over the lazy dog.' }
21
+ let(:english_period_ellipses_sentence) { 'The quick brown fox...jumps over the lazy dog.' }
22
+ let(:english_trailing_ellipses_sentence) { 'The quick brown fox jumps over the lazy dog …' }
23
+ let(:english_spaced_period_ellipses_sentence) { 'The quick brown fox ... jumps over the lazy dog.' }
17
24
  let(:english_abbreviation_sentence) { 'The U.S.A. is a member of NATO.' }
18
25
  let(:english_simple_paragraph) { 'Mary had a little lamb. The lamb’s fleece was white as snow. Everywhere that Mary went, the lamb was sure to go.' }
19
26
  let(:english_simple_line_breaks) { "Mary had a little lamb,\nHis fleece was white as snow,\nAnd everywhere that Mary went,\nThe lamb was sure to go." }
@@ -51,10 +58,38 @@ describe NlpPure::Segmenting::DefaultWord do
51
58
  expect(NlpPure::Segmenting::DefaultWord.parse(english_hyphen_sentence).length).to eq(8)
52
59
  end
53
60
 
61
+ it 'correctly segments double-hyphen dashes' do
62
+ expect(NlpPure::Segmenting::DefaultWord.parse(english_twohyphen_sentence).length).to eq(12)
63
+ end
64
+
54
65
  it 'correctly segments dashes' do
66
+ expect(NlpPure::Segmenting::DefaultWord.parse(english_spaced_dash_sentence).length).to eq(12)
67
+ end
68
+
69
+ it 'correctly segments spaced dashes' do
55
70
  expect(NlpPure::Segmenting::DefaultWord.parse(english_dash_sentence).length).to eq(12)
56
71
  end
57
72
 
73
+ it 'correctly segments ellipses' do
74
+ expect(NlpPure::Segmenting::DefaultWord.parse(english_ellipsis_sentence).length).to eq(9)
75
+ end
76
+
77
+ it 'correctly segments spaced ellipses' do
78
+ expect(NlpPure::Segmenting::DefaultWord.parse(english_spaced_ellipsis_sentence).length).to eq(9)
79
+ end
80
+
81
+ it 'correctly segments period-ellipses' do
82
+ expect(NlpPure::Segmenting::DefaultWord.parse(english_period_ellipses_sentence).length).to eq(9)
83
+ end
84
+
85
+ it 'correctly segments spaced period-ellipses' do
86
+ expect(NlpPure::Segmenting::DefaultWord.parse(english_spaced_period_ellipses_sentence).length).to eq(9)
87
+ end
88
+
89
+ it 'correctly segments with trailing, spaced ellipses' do
90
+ expect(NlpPure::Segmenting::DefaultWord.parse(english_trailing_ellipses_sentence).length).to eq(9)
91
+ end
92
+
58
93
  it 'does not segment abbreviations' do
59
94
  expect(NlpPure::Segmenting::DefaultWord.parse(english_abbreviation_sentence).length).to eq(7)
60
95
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nlp-pure
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Reid Parham