pragmatic_segmenter 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ac7a7ecd4d02a9d2df378b3c174ba338238b26df
4
- data.tar.gz: fa33b3bd39e88c065bef560f0d22b3124ed99da8
3
+ metadata.gz: 1708727dc62734ca577e6d9b1c05af49551563a7
4
+ data.tar.gz: 1f00d14609486e72068d3a189851ef9fd9f66a31
5
5
  SHA512:
6
- metadata.gz: ff3c13ce3db23f80d2051fe95b94dc7dd057ed36818f06b6ddff18f14179be88ef0d381250d1bebba3322b5439d59658522fb56440fb082fb69d32baefb54e7b
7
- data.tar.gz: 21b0f050be65715cf1ab020d7ec31780369ef6cb64a6de0ff18176eabd94e5a38073f57ed258bb21c62c1d8eaf4978a6cb66e1e4f10c697549fb319dc762b02c
6
+ metadata.gz: 9a45479d6c2cfa930e2d6bafcaf9648208dee53f79b5d16fa020fe5427210c3b7673663707bf1feda98344fe4765796a060fac8055f2641f107d7d1ce5e51022
7
+ data.tar.gz: c92f8ee9fb924a3b8e46d6efb1fa9fa27eb72a06a66f9e9d64f84efa4458efd42a6d88d63f4c28880cd7ada18f333cd8ab927bb9a580d319d1ff1ec38e3b3664
data/README.md CHANGED
@@ -769,7 +769,10 @@ To test the relative performance of different segmentation tools and libraries I
769
769
  **Version 0.1.7**
770
770
  * Add Alice in Wonderland specs
771
771
  * Fix parenthesis between double quotations bug
772
- * Fix split after quotation ending in dash bug
772
+ * Fix split after quotation ending in dash bug
773
+
774
+ **Version 0.1.8**
775
+ * Fix bug in splitting new sentence after single quotes
773
776
 
774
777
  ## Contributing
775
778
 
@@ -49,7 +49,8 @@ module PragmaticSegmenter
49
49
  def sub_punctuation_between_single_quotes(txt)
50
50
  PragmaticSegmenter::PunctuationReplacer.new(
51
51
  matches_array: txt.scan(BETWEEN_SINGLE_QUOTES_REGEX),
52
- text: txt
52
+ text: txt,
53
+ match_type: 'single'
53
54
  ).replace
54
55
  end
55
56
 
@@ -4,11 +4,15 @@ module PragmaticSegmenter
4
4
  # This class replaces punctuation that is typically a sentence boundary
5
5
  # but in this case is not a sentence boundary.
6
6
  class PunctuationReplacer
7
+ # Rubular: http://rubular.com/r/2YFrKWQUYi
8
+ BETWEEN_SINGLE_QUOTES_REGEX = /(?<=\s)'(?:[^']|'[a-zA-Z])*'/
9
+
7
10
  include Rules
8
- attr_reader :matches_array, :text
9
- def initialize(text:, matches_array:)
11
+ attr_reader :matches_array, :text, :match_type
12
+ def initialize(text:, matches_array:, **args)
10
13
  @text = text
11
14
  @matches_array = matches_array
15
+ @match_type = args[:match_type]
12
16
  end
13
17
 
14
18
  def replace
@@ -29,7 +33,9 @@ module PragmaticSegmenter
29
33
  sub_4 = sub_characters(txt, sub_3, '!', '&ᓴ&')
30
34
  sub_5 = sub_characters(txt, sub_4, '?', '&ᓷ&')
31
35
  sub_6 = sub_characters(txt, sub_5, '？', '&ᓸ&')
32
- sub_7 = sub_characters(txt, sub_6, "'", '&⎋&')
36
+ unless match_type.eql?('single')
37
+ sub_7 = sub_characters(txt, sub_6, "'", '&⎋&')
38
+ end
33
39
  end
34
40
  txt.apply(SubEscapedRegexReservedCharacters::All)
35
41
  end
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.1.7"
2
+ VERSION = "0.1.8"
3
3
  end
@@ -1498,6 +1498,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
1498
1498
  ps = PragmaticSegmenter::Segmenter.new(text: "\"Dinah'll miss me very much to-night, I should think!\" (Dinah was the cat.) \"I hope they'll remember her saucer of milk at tea-time. Dinah, my dear, I wish you were down here with me!\"")
1499
1499
  expect(ps.segment).to eq(["\"Dinah'll miss me very much to-night, I should think!\"", "(Dinah was the cat.)", "\"I hope they'll remember her saucer of milk at tea-time. Dinah, my dear, I wish you were down here with me!\""])
1500
1500
  end
1501
+
1502
+ it "correctly segments text #098" do
1503
+ ps = PragmaticSegmenter::Segmenter.new(text: "Hello. 'This is a test of single quotes.' A new sentence.")
1504
+ expect(ps.segment).to eq(["Hello.", "'This is a test of single quotes.'", "A new sentence."])
1505
+ end
1501
1506
  end
1502
1507
  end
1503
1508
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias