pragmatic_segmenter 0.3.14 → 0.3.15

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1f4cb40793083044dcf653f7c5c2c64d6e1f9476
4
- data.tar.gz: df80bb40afbb05d6f9d1109f0fbb412b34fd7171
3
+ metadata.gz: d67794ddb8ba2e0fdcdd4b0bda833ec07d7addc0
4
+ data.tar.gz: 04e7435b5148d865aca9062a9c72f88f8768860e
5
5
  SHA512:
6
- metadata.gz: 3296133dffb5dc2a1c3084f31ecd6115c9e5ffbcb53590ee751fa1e69159dbffadc0bbb6885f7fb09f38ec53aceaf07e2da8942ba6135ee344ae34fd1fac4d83
7
- data.tar.gz: 6c3230195453e40c8a4bbfcb9b864a369db7a3beebcf4f61f37b11f2f8dff454a173456a59fa298d5962560441828fbec6b3d926e4dd033785cd4b8a7a716a05
6
+ metadata.gz: c23512769b5ad8d8190ff84443662d41959960352b45e5c108ed8417d12953f44be37ce8117b2434bb72c411b4c905472525e1411984e5e3cd39de97d1865b93
7
+ data.tar.gz: 6e108d84ad98a09e5cc6bfa51881541eb1f90b7103e7c004a1cd2980a3bed0c970e850fb5df4231af326dd15b7eaf04e1a09f5c96e849e2942a3e8dbbf94638f
data/NEWS CHANGED
@@ -1,4 +1,8 @@
1
- 0.3.13 (2017-06-28):
1
+ 0.3.15 (2017-06-28):
2
+
3
+ * Improvement: Handle em dashes that appear in the middle of a sentence and include a sentence-ending punctuation mark
4
+
5
+ 0.3.14 (2017-06-28):
2
6
 
3
7
  * Improvement: Add English abbreviation Rs. to denote the Indian currency
4
8
 
data/README.md CHANGED
@@ -853,6 +853,9 @@ To test the relative performance of different segmentation tools and libraries I
853
853
  **Version 0.3.14**
854
854
  * Add English abbreviation Rs. to denote the Indian currency
855
855
 
856
+ **Version 0.3.15**
857
+ * Handle em dashes that appear in the middle of a sentence and include a sentence-ending punctuation mark
858
+
856
859
  ## Contributing
857
860
 
858
861
  If you find a text that is incorrectly segmented using this gem, please submit an issue.
@@ -25,6 +25,9 @@ module PragmaticSegmenter
25
25
  # Rubular: http://rubular.com/r/mXf8cW025o
26
26
  WORD_WITH_LEADING_APOSTROPHE = /(?<=\s)'(?:[^']|'[a-zA-Z])*'\S/
27
27
 
28
+ # Rubular: http://rubular.com/r/jTtDKfjxzr
29
+ BETWEEN_EM_DASHES_REGEX = /\-\-(?>[^\-\-])*\-\-/
30
+
28
31
  attr_reader :text
29
32
  def initialize(text:)
30
33
  @text = text
@@ -42,6 +45,7 @@ module PragmaticSegmenter
42
45
  sub_punctuation_between_square_brackets(txt)
43
46
  sub_punctuation_between_parens(txt)
44
47
  sub_punctuation_between_quotes_arrow(txt)
48
+ sub_punctuation_between_em_dashes(txt)
45
49
  sub_punctuation_between_quotes_slanted(txt)
46
50
  end
47
51
 
@@ -87,6 +91,13 @@ module PragmaticSegmenter
87
91
  ).replace
88
92
  end
89
93
 
94
+ def sub_punctuation_between_em_dashes(txt)
95
+ PragmaticSegmenter::PunctuationReplacer.new(
96
+ matches_array: txt.scan(BETWEEN_EM_DASHES_REGEX),
97
+ text: txt
98
+ ).replace
99
+ end
100
+
90
101
  def sub_punctuation_between_quotes_slanted(txt)
91
102
  PragmaticSegmenter::PunctuationReplacer.new(
92
103
  matches_array: txt.scan(BETWEEN_QUOTE_SLANTED_REGEX),
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.3.14"
2
+ VERSION = "0.3.15"
3
3
  end
@@ -1384,5 +1384,20 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
1384
1384
  ps = PragmaticSegmenter::Segmenter.new(text: "After completion of each Period, I will be paid an advance amount of rs. 1000 and this amount will be deducted from my final study compensation.", clean: false)
1385
1385
  expect(ps.segment).to eq(["After completion of each Period, I will be paid an advance amount of rs. 1000 and this amount will be deducted from my final study compensation."])
1386
1386
  end
1387
+
1388
+ it "correctly segments text #114" do
1389
+ ps = PragmaticSegmenter::Segmenter.new(text: "Mix it, put it in the oven, and -- voila! -- you have cake.", clean: false)
1390
+ expect(ps.segment).to eq(["Mix it, put it in the oven, and -- voila! -- you have cake."])
1391
+ end
1392
+
1393
+ it "correctly segments text #115" do
1394
+ ps = PragmaticSegmenter::Segmenter.new(text: "Some can be -- if I may say so? -- a bit questionable.", clean: false)
1395
+ expect(ps.segment).to eq(["Some can be -- if I may say so? -- a bit questionable."])
1396
+ end
1397
+
1398
+ it "correctly segments text #116" do
1399
+ ps = PragmaticSegmenter::Segmenter.new(text: "What do you see? - Posted like silent sentinels all around the town, stand thousands upon thousands of mortal men fixed in ocean reveries.", clean: false)
1400
+ expect(ps.segment).to eq(["What do you see?", "- Posted like silent sentinels all around the town, stand thousands upon thousands of mortal men fixed in ocean reveries."])
1401
+ end
1387
1402
  end
1388
1403
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.14
4
+ version: 0.3.15
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias