pragmatic_segmenter 0.3.18 → 0.3.19
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/NEWS +4 -0
- data/README.md +3 -0
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +1 -1
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter/languages/danish_spec.rb +1 -1
- data/spec/pragmatic_segmenter/languages/english_spec.rb +6 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3f03709232bf5b433b73de74a04b254228bc3ce6
|
4
|
+
data.tar.gz: 087860da234619d431849de1f85614b876e6d31a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6d2759f5a302f6acf40b34990df8fd20d352b0aef5dfdd1799e5f7303d065466964566c068c505460837e838a8fa296754031084ef42715aab709f5d7a2f882c
|
7
|
+
data.tar.gz: 0ad1c988cff72e818fe9fdd4de77831e792e20834f27c6a40b34f594e8ef5a9927fdf1da259cdcd7a7d990b592c53f09bba11c59432b74d3b6a6f46615186378
|
data/NEWS
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
0.3.19 (2018-07-19):
|
2
|
+
|
3
|
+
* Bug Fix: A parenthetical following an abbreviation is now included as part of the same segment. Example: "The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)." is now treated as one segment.
|
4
|
+
|
1
5
|
0.3.18 (2018-03-27):
|
2
6
|
|
3
7
|
* Improvement: Performance optimizations
|
data/README.md
CHANGED
@@ -865,6 +865,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
865
865
|
**Version 0.3.18**
|
866
866
|
* Performance optimizations
|
867
867
|
|
868
|
+
**Version 0.3.19**
|
869
|
+
* Treat a parenthetical following an abbreviation as part of the same segment
|
870
|
+
|
868
871
|
## Contributing
|
869
872
|
|
870
873
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
@@ -108,7 +108,7 @@ module PragmaticSegmenter
|
|
108
108
|
end
|
109
109
|
|
110
110
|
def replace_period_of_abbr(txt, abbr)
|
111
|
-
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
|
111
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d|\())))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
|
112
112
|
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
|
113
113
|
txt
|
114
114
|
end
|
@@ -507,7 +507,7 @@ RSpec.describe PragmaticSegmenter::Languages::Danish, "(da)" do
|
|
507
507
|
|
508
508
|
it 'correctly segments text #048' do
|
509
509
|
ps = PragmaticSegmenter::Segmenter.new(text: "CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)", language: 'en')
|
510
|
-
expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co.", "(cited from WSJ 05/29/1987)"])
|
510
|
+
expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)"])
|
511
511
|
end
|
512
512
|
|
513
513
|
it 'correctly segments text #049' do
|
@@ -512,7 +512,7 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
|
|
512
512
|
|
513
513
|
it 'correctly segments text #048' do
|
514
514
|
ps = PragmaticSegmenter::Segmenter.new(text: "CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)", language: 'en')
|
515
|
-
expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co.", "(cited from WSJ 05/29/1987)"])
|
515
|
+
expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)"])
|
516
516
|
end
|
517
517
|
|
518
518
|
it 'correctly segments text #049' do
|
@@ -1406,5 +1406,10 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
|
|
1406
1406
|
expect(ps.segment).to eq(["In placebo-controlled studies of all uses of Tracleer, marked decreases in hemoglobin (>15% decrease from baseline resulting in values <11 g/ dL) were observed in 6% of Tracleer-treated patients and 3% of placebo-treated patients.", "Bosentan is highly bound (>98%) to plasma proteins, mainly albumin."])
|
1407
1407
|
end
|
1408
1408
|
|
1409
|
+
it 'correctly segments text #118' do
|
1410
|
+
text = "The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)."
|
1411
|
+
ps = PragmaticSegmenter::Segmenter.new(text: text, clean: false)
|
1412
|
+
expect(ps.segment).to eq(["The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)."])
|
1413
|
+
end
|
1409
1414
|
end
|
1410
1415
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.18
|
4
|
+
version: 0.3.19
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-03-27 00:00:00.000000000 Z
|
11
|
+
date: 2018-07-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode
|
@@ -180,7 +180,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
180
180
|
version: '0'
|
181
181
|
requirements: []
|
182
182
|
rubyforge_project:
|
183
|
-
rubygems_version: 2.
|
183
|
+
rubygems_version: 2.6.14
|
184
184
|
signing_key:
|
185
185
|
specification_version: 4
|
186
186
|
summary: A rule-based sentence boundary detection gem that works out-of-the-box across
|