pragmatic_segmenter 0.3.12 → 0.3.13
This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/NEWS +4 -0
- data/README.md +3 -0
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +1 -1
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter/languages/english_spec.rb +10 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c313d610281828819a76463bd3b42590927307e2
|
4
|
+
data.tar.gz: 3c9f340a197450a6dffac38f4a3c0b378f3b8edf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d8756402fcb03f456f27d8359b4e8adfd970fed42a89bdfebab9b7a75b25d83dd9b4db19b1f7982b71214c573500b519cbc1920b6a54fa2632a3521dabee9a68
|
7
|
+
data.tar.gz: 911a665d2609086e20aff601161e63ce22410e18c0d169fc9d8032cef029062aa7a3cdafa082c55db1ab78ae6f2e5f7a1f302246c716f5393a815ed9aec890dc
|
data/.gitignore
CHANGED
data/NEWS
CHANGED
data/README.md
CHANGED
@@ -847,6 +847,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
847
847
|
**Version 0.3.12**
|
848
848
|
* Fix issue involving words with leading apostrophes
|
849
849
|
|
850
|
+
**Version 0.3.13**
|
851
|
+
* Fix issue involving unexpected sentence break between abbreviation and hyphen
|
852
|
+
|
850
853
|
## Contributing
|
851
854
|
|
852
855
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
@@ -113,7 +113,7 @@ module PragmaticSegmenter
|
|
113
113
|
end
|
114
114
|
|
115
115
|
def replace_period_of_abbr(txt, abbr)
|
116
|
-
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((
|
116
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
|
117
117
|
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
|
118
118
|
txt
|
119
119
|
end
|
@@ -1359,5 +1359,15 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
|
|
1359
1359
|
ps = PragmaticSegmenter::Segmenter.new(text: "I wrote this in the ’nineties. It has four sentences. This is the third, isn't it? And this is the last")
|
1360
1360
|
expect(ps.segment).to eq(["I wrote this in the ’nineties.", "It has four sentences.", "This is the third, isn't it?", "And this is the last"])
|
1361
1361
|
end
|
1362
|
+
|
1363
|
+
it "correctly segments text #109" do
|
1364
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "He has Ph.D.-level training", clean: false)
|
1365
|
+
expect(ps.segment).to eq(["He has Ph.D.-level training"])
|
1366
|
+
end
|
1367
|
+
|
1368
|
+
it "correctly segments text #110" do
|
1369
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "He has Ph.D. level training", clean: false)
|
1370
|
+
expect(ps.segment).to eq(["He has Ph.D. level training"])
|
1371
|
+
end
|
1362
1372
|
end
|
1363
1373
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.12
|
4
|
+
version: 0.3.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-01-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode
|
@@ -178,7 +178,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
178
178
|
version: '0'
|
179
179
|
requirements: []
|
180
180
|
rubyforge_project:
|
181
|
-
rubygems_version: 2.
|
181
|
+
rubygems_version: 2.6.8
|
182
182
|
signing_key:
|
183
183
|
specification_version: 4
|
184
184
|
summary: A rule-based sentence boundary detection gem that works out-of-the-box across
|