pragmatic_segmenter 0.3.18 → 0.3.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3bb581e56e988521adc41dbb94bc7281ee7dfa95
4
- data.tar.gz: 0c3b6fe877a5d39d36053b7ed68a860b5d17779b
3
+ metadata.gz: 3f03709232bf5b433b73de74a04b254228bc3ce6
4
+ data.tar.gz: 087860da234619d431849de1f85614b876e6d31a
5
5
  SHA512:
6
- metadata.gz: 1b1dd64b5a382e8bb7ed5d79fbb9264565d71088f234ba4a9bd7cae47184e1c64f78b32a72f39326aa31936c6c6742aa2ff75cd75cd1b328987a6061a4d2534b
7
- data.tar.gz: c150b178c93b7183300559e89c117cf8f9f93adf6ef33790a3ce0b292c66588fe6461c724210f7f93e078d2d08a10e995d7234bad963f8d5aa1c52c378effe5e
6
+ metadata.gz: 6d2759f5a302f6acf40b34990df8fd20d352b0aef5dfdd1799e5f7303d065466964566c068c505460837e838a8fa296754031084ef42715aab709f5d7a2f882c
7
+ data.tar.gz: 0ad1c988cff72e818fe9fdd4de77831e792e20834f27c6a40b34f594e8ef5a9927fdf1da259cdcd7a7d990b592c53f09bba11c59432b74d3b6a6f46615186378
data/NEWS CHANGED
@@ -1,3 +1,7 @@
1
+ 0.3.19 (2018-07-19):
2
+
3
+ * Bug Fix: A parenthetical following an abbreviation is now included as part of the same segment. Example: "The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)." is now treated as one segment.
4
+
1
5
  0.3.18 (2018-03-27):
2
6
 
3
7
  * Improvement: Performance optimizations
data/README.md CHANGED
@@ -865,6 +865,9 @@ To test the relative performance of different segmentation tools and libraries I
865
865
  **Version 0.3.18**
866
866
  * Performance optimizations
867
867
 
868
+ **Version 0.3.19**
869
+ * Treat a parenthetical following an abbreviation as part of the same segment
870
+
868
871
  ## Contributing
869
872
 
870
873
  If you find a text that is incorrectly segmented using this gem, please submit an issue.
@@ -108,7 +108,7 @@ module PragmaticSegmenter
108
108
  end
109
109
 
110
110
  def replace_period_of_abbr(txt, abbr)
111
- txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
111
+ txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d|\())))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
112
112
  txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
113
113
  txt
114
114
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module PragmaticSegmenter
4
- VERSION = "0.3.18"
4
+ VERSION = "0.3.19"
5
5
  end
@@ -507,7 +507,7 @@ RSpec.describe PragmaticSegmenter::Languages::Danish, "(da)" do
507
507
 
508
508
  it 'correctly segments text #048' do
509
509
  ps = PragmaticSegmenter::Segmenter.new(text: "CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)", language: 'en')
510
- expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co.", "(cited from WSJ 05/29/1987)"])
510
+ expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)"])
511
511
  end
512
512
 
513
513
  it 'correctly segments text #049' do
@@ -512,7 +512,7 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
512
512
 
513
513
  it 'correctly segments text #048' do
514
514
  ps = PragmaticSegmenter::Segmenter.new(text: "CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)", language: 'en')
515
- expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co.", "(cited from WSJ 05/29/1987)"])
515
+ expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)"])
516
516
  end
517
517
 
518
518
  it 'correctly segments text #049' do
@@ -1406,5 +1406,10 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
1406
1406
  expect(ps.segment).to eq(["In placebo-controlled studies of all uses of Tracleer, marked decreases in hemoglobin (>15% decrease from baseline resulting in values <11 g/ dL) were observed in 6% of Tracleer-treated patients and 3% of placebo-treated patients.", "Bosentan is highly bound (>98%) to plasma proteins, mainly albumin."])
1407
1407
  end
1408
1408
 
1409
+ it 'correctly segments text #118' do
1410
+ text = "The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)."
1411
+ ps = PragmaticSegmenter::Segmenter.new(text: text, clean: false)
1412
+ expect(ps.segment).to eq(["The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)."])
1413
+ end
1409
1414
  end
1410
1415
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.18
4
+ version: 0.3.19
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-03-27 00:00:00.000000000 Z
11
+ date: 2018-07-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode
@@ -180,7 +180,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
180
180
  version: '0'
181
181
  requirements: []
182
182
  rubyforge_project:
183
- rubygems_version: 2.4.1
183
+ rubygems_version: 2.6.14
184
184
  signing_key:
185
185
  specification_version: 4
186
186
  summary: A rule-based sentence boundary detection gem that works out-of-the-box across