pragmatic_segmenter 0.3.12 → 0.3.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/NEWS +4 -0
- data/README.md +3 -0
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +1 -1
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter/languages/english_spec.rb +10 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c313d610281828819a76463bd3b42590927307e2
|
4
|
+
data.tar.gz: 3c9f340a197450a6dffac38f4a3c0b378f3b8edf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d8756402fcb03f456f27d8359b4e8adfd970fed42a89bdfebab9b7a75b25d83dd9b4db19b1f7982b71214c573500b519cbc1920b6a54fa2632a3521dabee9a68
|
7
|
+
data.tar.gz: 911a665d2609086e20aff601161e63ce22410e18c0d169fc9d8032cef029062aa7a3cdafa082c55db1ab78ae6f2e5f7a1f302246c716f5393a815ed9aec890dc
|
data/.gitignore
CHANGED
data/NEWS
CHANGED
data/README.md
CHANGED
@@ -847,6 +847,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
847
847
|
**Version 0.3.12**
|
848
848
|
* Fix issue involving words with leading apostrophes
|
849
849
|
|
850
|
+
**Version 0.3.13**
|
851
|
+
* Fix issue involving unexpected sentence break between abbreviation and hyphen
|
852
|
+
|
850
853
|
## Contributing
|
851
854
|
|
852
855
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
data/lib/pragmatic_segmenter/abbreviation_replacer.rb
CHANGED
@@ -113,7 +113,7 @@ module PragmaticSegmenter
|
|
113
113
|
end
|
114
114
|
|
115
115
|
def replace_period_of_abbr(txt, abbr)
|
116
|
-
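txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')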
|
116
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
|
117
117
|
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
|
118
118
|
txt
|
119
119
|
end
|
data/spec/pragmatic_segmenter/languages/english_spec.rb
CHANGED
@@ -1359,5 +1359,15 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
|
|
1359
1359
|
ps = PragmaticSegmenter::Segmenter.new(text: "I wrote this in the ’nineties. It has four sentences. This is the third, isn't it? And this is the last")
|
1360
1360
|
expect(ps.segment).to eq(["I wrote this in the ’nineties.", "It has four sentences.", "This is the third, isn't it?", "And this is the last"])
|
1361
1361
|
end
|
1362
|
+
|
1363
|
+
it "correctly segments text #109" do
|
1364
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "He has Ph.D.-level training", clean: false)
|
1365
|
+
expect(ps.segment).to eq(["He has Ph.D.-level training"])
|
1366
|
+
end
|
1367
|
+
|
1368
|
+
it "correctly segments text #110" do
|
1369
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "He has Ph.D. level training", clean: false)
|
1370
|
+
expect(ps.segment).to eq(["He has Ph.D. level training"])
|
1371
|
+
end
|
1362
1372
|
end
|
1363
1373
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.12
|
4
|
+
version: 0.3.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-01-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode
|
@@ -178,7 +178,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
178
178
|
version: '0'
|
179
179
|
requirements: []
|
180
180
|
rubyforge_project:
|
181
|
-
rubygems_version: 2.
|
181
|
+
rubygems_version: 2.6.8
|
182
182
|
signing_key:
|
183
183
|
specification_version: 4
|
184
184
|
summary: A rule-based sentence boundary detection gem that works out-of-the-box across
|