pragmatic_segmenter 0.3.18 → 0.3.19

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3bb581e56e988521adc41dbb94bc7281ee7dfa95
4
- data.tar.gz: 0c3b6fe877a5d39d36053b7ed68a860b5d17779b
3
+ metadata.gz: 3f03709232bf5b433b73de74a04b254228bc3ce6
4
+ data.tar.gz: 087860da234619d431849de1f85614b876e6d31a
5
5
  SHA512:
6
- metadata.gz: 1b1dd64b5a382e8bb7ed5d79fbb9264565d71088f234ba4a9bd7cae47184e1c64f78b32a72f39326aa31936c6c6742aa2ff75cd75cd1b328987a6061a4d2534b
7
- data.tar.gz: c150b178c93b7183300559e89c117cf8f9f93adf6ef33790a3ce0b292c66588fe6461c724210f7f93e078d2d08a10e995d7234bad963f8d5aa1c52c378effe5e
6
+ metadata.gz: 6d2759f5a302f6acf40b34990df8fd20d352b0aef5dfdd1799e5f7303d065466964566c068c505460837e838a8fa296754031084ef42715aab709f5d7a2f882c
7
+ data.tar.gz: 0ad1c988cff72e818fe9fdd4de77831e792e20834f27c6a40b34f594e8ef5a9927fdf1da259cdcd7a7d990b592c53f09bba11c59432b74d3b6a6f46615186378
data/NEWS CHANGED
@@ -1,3 +1,7 @@
1
+ 0.3.19 (2018-07-19):
2
+
3
+ * Bug Fix: A parenthetical following an abbreviation is now included as part of the same segment. Example: "The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)." is now treated as one segment.
4
+
1
5
  0.3.18 (2018-03-27):
2
6
 
3
7
  * Improvement: Performance optimizations
data/README.md CHANGED
@@ -865,6 +865,9 @@ To test the relative performance of different segmentation tools and libraries I
865
865
  **Version 0.3.18**
866
866
  * Performance optimizations
867
867
 
868
+ **Version 0.3.19**
869
+ * Treat a parenthetical following an abbreviation as part of the same segment
870
+
868
871
  ## Contributing
869
872
 
870
873
  If you find a text that is incorrectly segmented using this gem, please submit an issue.
@@ -108,7 +108,7 @@ module PragmaticSegmenter
108
108
  end
109
109
 
110
110
  def replace_period_of_abbr(txt, abbr)
111
- txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
111
+ txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d|\())))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
112
112
  txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
113
113
  txt
114
114
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module PragmaticSegmenter
4
- VERSION = "0.3.18"
4
+ VERSION = "0.3.19"
5
5
  end
@@ -507,7 +507,7 @@ RSpec.describe PragmaticSegmenter::Languages::Danish, "(da)" do
507
507
 
508
508
  it 'correctly segments text #048' do
509
509
  ps = PragmaticSegmenter::Segmenter.new(text: "CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)", language: 'en')
510
- expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co.", "(cited from WSJ 05/29/1987)"])
510
+ expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)"])
511
511
  end
512
512
 
513
513
  it 'correctly segments text #049' do
@@ -512,7 +512,7 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
512
512
 
513
513
  it 'correctly segments text #048' do
514
514
  ps = PragmaticSegmenter::Segmenter.new(text: "CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)", language: 'en')
515
- expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co.", "(cited from WSJ 05/29/1987)"])
515
+ expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)"])
516
516
  end
517
517
 
518
518
  it 'correctly segments text #049' do
@@ -1406,5 +1406,10 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
1406
1406
  expect(ps.segment).to eq(["In placebo-controlled studies of all uses of Tracleer, marked decreases in hemoglobin (>15% decrease from baseline resulting in values <11 g/ dL) were observed in 6% of Tracleer-treated patients and 3% of placebo-treated patients.", "Bosentan is highly bound (>98%) to plasma proteins, mainly albumin."])
1407
1407
  end
1408
1408
 
1409
+ it 'correctly segments text #118' do
1410
+ text = "The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)."
1411
+ ps = PragmaticSegmenter::Segmenter.new(text: text, clean: false)
1412
+ expect(ps.segment).to eq(["The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)."])
1413
+ end
1409
1414
  end
1410
1415
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.18
4
+ version: 0.3.19
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-03-27 00:00:00.000000000 Z
11
+ date: 2018-07-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode
@@ -180,7 +180,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
180
180
  version: '0'
181
181
  requirements: []
182
182
  rubyforge_project:
183
- rubygems_version: 2.4.1
183
+ rubygems_version: 2.6.14
184
184
  signing_key:
185
185
  specification_version: 4
186
186
  summary: A rule-based sentence boundary detection gem that works out-of-the-box across