pragmatic_segmenter 0.3.14 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1f4cb40793083044dcf653f7c5c2c64d6e1f9476
4
- data.tar.gz: df80bb40afbb05d6f9d1109f0fbb412b34fd7171
3
+ metadata.gz: d67794ddb8ba2e0fdcdd4b0bda833ec07d7addc0
4
+ data.tar.gz: 04e7435b5148d865aca9062a9c72f88f8768860e
5
5
  SHA512:
6
- metadata.gz: 3296133dffb5dc2a1c3084f31ecd6115c9e5ffbcb53590ee751fa1e69159dbffadc0bbb6885f7fb09f38ec53aceaf07e2da8942ba6135ee344ae34fd1fac4d83
7
- data.tar.gz: 6c3230195453e40c8a4bbfcb9b864a369db7a3beebcf4f61f37b11f2f8dff454a173456a59fa298d5962560441828fbec6b3d926e4dd033785cd4b8a7a716a05
6
+ metadata.gz: c23512769b5ad8d8190ff84443662d41959960352b45e5c108ed8417d12953f44be37ce8117b2434bb72c411b4c905472525e1411984e5e3cd39de97d1865b93
7
+ data.tar.gz: 6e108d84ad98a09e5cc6bfa51881541eb1f90b7103e7c004a1cd2980a3bed0c970e850fb5df4231af326dd15b7eaf04e1a09f5c96e849e2942a3e8dbbf94638f
data/NEWS CHANGED
@@ -1,4 +1,8 @@
1
- 0.3.13 (2017-06-28):
1
+ 0.3.15 (2017-06-28):
2
+
3
+ * Improvement: Handle asides set off by em dashes in the middle of a sentence when the aside contains a sentence-ending punctuation mark
4
+
5
+ 0.3.14 (2017-06-28):
2
6
 
3
7
  * Improvement: Add English abbreviation Rs. to denote the Indian currency
4
8
 
data/README.md CHANGED
@@ -853,6 +853,9 @@ To test the relative performance of different segmentation tools and libraries I
853
853
  **Version 0.3.14**
854
854
  * Add English abbreviation Rs. to denote the Indian currency
855
855
 
856
+ **Version 0.3.15**
857
+ * Handle asides set off by em dashes in the middle of a sentence when the aside contains a sentence-ending punctuation mark
858
+
856
859
  ## Contributing
857
860
 
858
861
  If you find a text that is incorrectly segmented using this gem, please submit an issue.
@@ -25,6 +25,9 @@ module PragmaticSegmenter
25
25
  # Rubular: http://rubular.com/r/mXf8cW025o
26
26
  WORD_WITH_LEADING_APOSTROPHE = /(?<=\s)'(?:[^']|'[a-zA-Z])*'\S/
27
27
 
28
+ # Rubular: http://rubular.com/r/jTtDKfjxzr
29
+ BETWEEN_EM_DASHES_REGEX = /\-\-(?>[^\-\-])*\-\-/
30
+
28
31
  attr_reader :text
29
32
  def initialize(text:)
30
33
  @text = text
@@ -42,6 +45,7 @@ module PragmaticSegmenter
42
45
  sub_punctuation_between_square_brackets(txt)
43
46
  sub_punctuation_between_parens(txt)
44
47
  sub_punctuation_between_quotes_arrow(txt)
48
+ sub_punctuation_between_em_dashes(txt)
45
49
  sub_punctuation_between_quotes_slanted(txt)
46
50
  end
47
51
 
@@ -87,6 +91,13 @@ module PragmaticSegmenter
87
91
  ).replace
88
92
  end
89
93
 
94
+ def sub_punctuation_between_em_dashes(txt)
95
+ PragmaticSegmenter::PunctuationReplacer.new(
96
+ matches_array: txt.scan(BETWEEN_EM_DASHES_REGEX),
97
+ text: txt
98
+ ).replace
99
+ end
100
+
90
101
  def sub_punctuation_between_quotes_slanted(txt)
91
102
  PragmaticSegmenter::PunctuationReplacer.new(
92
103
  matches_array: txt.scan(BETWEEN_QUOTE_SLANTED_REGEX),
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.3.14"
2
+ VERSION = "0.3.15"
3
3
  end
@@ -1384,5 +1384,20 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
1384
1384
  ps = PragmaticSegmenter::Segmenter.new(text: "After completion of each Period, I will be paid an advance amount of rs. 1000 and this amount will be deducted from my final study compensation.", clean: false)
1385
1385
  expect(ps.segment).to eq(["After completion of each Period, I will be paid an advance amount of rs. 1000 and this amount will be deducted from my final study compensation."])
1386
1386
  end
1387
+
1388
+ it "correctly segments text #114" do
1389
+ ps = PragmaticSegmenter::Segmenter.new(text: "Mix it, put it in the oven, and -- voila! -- you have cake.", clean: false)
1390
+ expect(ps.segment).to eq(["Mix it, put it in the oven, and -- voila! -- you have cake."])
1391
+ end
1392
+
1393
+ it "correctly segments text #115" do
1394
+ ps = PragmaticSegmenter::Segmenter.new(text: "Some can be -- if I may say so? -- a bit questionable.", clean: false)
1395
+ expect(ps.segment).to eq(["Some can be -- if I may say so? -- a bit questionable."])
1396
+ end
1397
+
1398
+ it "correctly segments text #116" do
1399
+ ps = PragmaticSegmenter::Segmenter.new(text: "What do you see? - Posted like silent sentinels all around the town, stand thousands upon thousands of mortal men fixed in ocean reveries.", clean: false)
1400
+ expect(ps.segment).to eq(["What do you see?", "- Posted like silent sentinels all around the town, stand thousands upon thousands of mortal men fixed in ocean reveries."])
1401
+ end
1387
1402
  end
1388
1403
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.14
4
+ version: 0.3.15
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias