pragmatic_segmenter 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1708727dc62734ca577e6d9b1c05af49551563a7
|
4
|
+
data.tar.gz: 1f00d14609486e72068d3a189851ef9fd9f66a31
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9a45479d6c2cfa930e2d6bafcaf9648208dee53f79b5d16fa020fe5427210c3b7673663707bf1feda98344fe4765796a060fac8055f2641f107d7d1ce5e51022
|
7
|
+
data.tar.gz: c92f8ee9fb924a3b8e46d6efb1fa9fa27eb72a06a66f9e9d64f84efa4458efd42a6d88d63f4c28880cd7ada18f333cd8ab927bb9a580d319d1ff1ec38e3b3664
|
data/README.md
CHANGED
@@ -769,7 +769,10 @@ To test the relative performance of different segmentation tools and libraries I
|
|
769
769
|
**Version 0.1.7**
|
770
770
|
* Add Alice in Wonderland specs
|
771
771
|
* Fix parenthesis between double quotations bug
|
772
|
-
* Fix split after quotation ending in dash bug
|
772
|
+
* Fix split after quotation ending in dash bug
|
773
|
+
|
774
|
+
**Version 0.1.8**
|
775
|
+
* Fix bug in splitting new sentence after single quotes
|
773
776
|
|
774
777
|
## Contributing
|
775
778
|
|
@@ -4,11 +4,15 @@ module PragmaticSegmenter
|
|
4
4
|
# This class replaces punctuation that is typically a sentence boundary
|
5
5
|
# but in this case is not a sentence boundary.
|
6
6
|
class PunctuationReplacer
|
7
|
+
# Rubular: http://rubular.com/r/2YFrKWQUYi
|
8
|
+
BETWEEN_SINGLE_QUOTES_REGEX = /(?<=\s)'(?:[^']|'[a-zA-Z])*'/
|
9
|
+
|
7
10
|
include Rules
|
8
|
-
attr_reader :matches_array, :text
|
9
|
-
def initialize(text:, matches_array:)
|
11
|
+
attr_reader :matches_array, :text, :match_type
|
12
|
+
def initialize(text:, matches_array:, **args)
|
10
13
|
@text = text
|
11
14
|
@matches_array = matches_array
|
15
|
+
@match_type = args[:match_type]
|
12
16
|
end
|
13
17
|
|
14
18
|
def replace
|
@@ -29,7 +33,9 @@ module PragmaticSegmenter
|
|
29
33
|
sub_4 = sub_characters(txt, sub_3, '!', '&ᓴ&')
|
30
34
|
sub_5 = sub_characters(txt, sub_4, '?', '&ᓷ&')
|
31
35
|
sub_6 = sub_characters(txt, sub_5, '？', '&ᓸ&')
|
32
|
-
|
36
|
+
unless match_type.eql?('single')
|
37
|
+
sub_7 = sub_characters(txt, sub_6, "'", '&⎋&')
|
38
|
+
end
|
33
39
|
end
|
34
40
|
txt.apply(SubEscapedRegexReservedCharacters::All)
|
35
41
|
end
|
@@ -1498,6 +1498,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
1498
1498
|
ps = PragmaticSegmenter::Segmenter.new(text: "\"Dinah'll miss me very much to-night, I should think!\" (Dinah was the cat.) \"I hope they'll remember her saucer of milk at tea-time. Dinah, my dear, I wish you were down here with me!\"")
|
1499
1499
|
expect(ps.segment).to eq(["\"Dinah'll miss me very much to-night, I should think!\"", "(Dinah was the cat.)", "\"I hope they'll remember her saucer of milk at tea-time. Dinah, my dear, I wish you were down here with me!\""])
|
1500
1500
|
end
|
1501
|
+
|
1502
|
+
it "correctly segments text #098" do
|
1503
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Hello. 'This is a test of single quotes.' A new sentence.")
|
1504
|
+
expect(ps.segment).to eq(["Hello.", "'This is a test of single quotes.'", "A new sentence."])
|
1505
|
+
end
|
1501
1506
|
end
|
1502
1507
|
end
|
1503
1508
|
|