pragmatic_segmenter 0.3.18 → 0.3.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/NEWS +4 -0
- data/README.md +3 -0
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +1 -1
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter/languages/danish_spec.rb +1 -1
- data/spec/pragmatic_segmenter/languages/english_spec.rb +6 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3f03709232bf5b433b73de74a04b254228bc3ce6
|
4
|
+
data.tar.gz: 087860da234619d431849de1f85614b876e6d31a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6d2759f5a302f6acf40b34990df8fd20d352b0aef5dfdd1799e5f7303d065466964566c068c505460837e838a8fa296754031084ef42715aab709f5d7a2f882c
|
7
|
+
data.tar.gz: 0ad1c988cff72e818fe9fdd4de77831e792e20834f27c6a40b34f594e8ef5a9927fdf1da259cdcd7a7d990b592c53f09bba11c59432b74d3b6a6f46615186378
|
data/NEWS
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
0.3.19 (2018-07-19):
|
2
|
+
|
3
|
+
* Bug Fix: A parenthetical following an abbreviation is now included as part of the same segment. Example: "The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)." is now treated as one segment.
|
4
|
+
|
1
5
|
0.3.18 (2018-03-27):
|
2
6
|
|
3
7
|
* Improvement: Performance optimizations
|
data/README.md
CHANGED
@@ -865,6 +865,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
865
865
|
**Version 0.3.18**
|
866
866
|
* Performance optimizations
|
867
867
|
|
868
|
+
**Version 0.3.19**
|
869
|
+
* Treat a parenthetical following an abbreviation as part of the same segment
|
870
|
+
|
868
871
|
## Contributing
|
869
872
|
|
870
873
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
data/lib/pragmatic_segmenter/abbreviation_replacer.rb
CHANGED
@@ -108,7 +108,7 @@ module PragmaticSegmenter
|
|
108
108
|
end
|
109
109
|
|
110
110
|
def replace_period_of_abbr(txt, abbr)
|
111
|
-
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
|
111
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d|\())))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
|
112
112
|
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
|
113
113
|
txt
|
114
114
|
end
|
data/spec/pragmatic_segmenter/languages/danish_spec.rb
CHANGED
@@ -507,7 +507,7 @@ RSpec.describe PragmaticSegmenter::Languages::Danish, "(da)" do
|
|
507
507
|
|
508
508
|
it 'correctly segments text #048' do
|
509
509
|
ps = PragmaticSegmenter::Segmenter.new(text: "CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)", language: 'en')
|
510
|
-
expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co.", "(cited from WSJ 05/29/1987)"])
|
510
|
+
expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)"])
|
511
511
|
end
|
512
512
|
|
513
513
|
it 'correctly segments text #049' do
|
data/spec/pragmatic_segmenter/languages/english_spec.rb
CHANGED
@@ -512,7 +512,7 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
|
|
512
512
|
|
513
513
|
it 'correctly segments text #048' do
|
514
514
|
ps = PragmaticSegmenter::Segmenter.new(text: "CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)", language: 'en')
|
515
|
-
expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co.", "(cited from WSJ 05/29/1987)"])
|
515
|
+
expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)"])
|
516
516
|
end
|
517
517
|
|
518
518
|
it 'correctly segments text #049' do
|
@@ -1406,5 +1406,10 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
|
|
1406
1406
|
expect(ps.segment).to eq(["In placebo-controlled studies of all uses of Tracleer, marked decreases in hemoglobin (>15% decrease from baseline resulting in values <11 g/ dL) were observed in 6% of Tracleer-treated patients and 3% of placebo-treated patients.", "Bosentan is highly bound (>98%) to plasma proteins, mainly albumin."])
|
1407
1407
|
end
|
1408
1408
|
|
1409
|
+
it 'correctly segments text #118' do
|
1410
|
+
text = "The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)."
|
1411
|
+
ps = PragmaticSegmenter::Segmenter.new(text: text, clean: false)
|
1412
|
+
expect(ps.segment).to eq(["The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)."])
|
1413
|
+
end
|
1409
1414
|
end
|
1410
1415
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.18
|
4
|
+
version: 0.3.19
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-07-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode
|
@@ -180,7 +180,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
180
180
|
version: '0'
|
181
181
|
requirements: []
|
182
182
|
rubyforge_project:
|
183
|
-
rubygems_version: 2.
|
183
|
+
rubygems_version: 2.6.14
|
184
184
|
signing_key:
|
185
185
|
specification_version: 4
|
186
186
|
summary: A rule-based sentence boundary detection gem that works out-of-the-box across
|