pragmatic_segmenter 0.1.7 → 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1708727dc62734ca577e6d9b1c05af49551563a7
|
4
|
+
data.tar.gz: 1f00d14609486e72068d3a189851ef9fd9f66a31
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9a45479d6c2cfa930e2d6bafcaf9648208dee53f79b5d16fa020fe5427210c3b7673663707bf1feda98344fe4765796a060fac8055f2641f107d7d1ce5e51022
|
7
|
+
data.tar.gz: c92f8ee9fb924a3b8e46d6efb1fa9fa27eb72a06a66f9e9d64f84efa4458efd42a6d88d63f4c28880cd7ada18f333cd8ab927bb9a580d319d1ff1ec38e3b3664
|
data/README.md
CHANGED
@@ -769,7 +769,10 @@ To test the relative performance of different segmentation tools and libraries I
|
|
769
769
|
**Version 0.1.7**
|
770
770
|
* Add Alice in Wonderland specs
|
771
771
|
* Fix parenthesis between double quotations bug
|
772
|
-
* Fix split after quotation ending in dash bug
|
772
|
+
* Fix split after quotation ending in dash bug
|
773
|
+
|
774
|
+
**Version 0.1.8**
|
775
|
+
* Fix bug in splitting new sentence after single quotes
|
773
776
|
|
774
777
|
## Contributing
|
775
778
|
|
@@ -4,11 +4,15 @@ module PragmaticSegmenter
|
|
4
4
|
# This class replaces punctuation that is typically a sentence boundary
|
5
5
|
# but in this case is not a sentence boundary.
|
6
6
|
class PunctuationReplacer
|
7
|
+
# Rubular: http://rubular.com/r/2YFrKWQUYi
|
8
|
+
BETWEEN_SINGLE_QUOTES_REGEX = /(?<=\s)'(?:[^']|'[a-zA-Z])*'/
|
9
|
+
|
7
10
|
include Rules
|
8
|
-
attr_reader :matches_array, :text
|
9
|
-
def initialize(text:, matches_array:)
|
11
|
+
attr_reader :matches_array, :text, :match_type
|
12
|
+
def initialize(text:, matches_array:, **args)
|
10
13
|
@text = text
|
11
14
|
@matches_array = matches_array
|
15
|
+
@match_type = args[:match_type]
|
12
16
|
end
|
13
17
|
|
14
18
|
def replace
|
@@ -29,7 +33,9 @@ module PragmaticSegmenter
|
|
29
33
|
sub_4 = sub_characters(txt, sub_3, '!', '&ᓴ&')
|
30
34
|
sub_5 = sub_characters(txt, sub_4, '?', '&ᓷ&')
|
31
35
|
sub_6 = sub_characters(txt, sub_5, '？', '&ᓸ&')
|
32
|
-
|
36
|
+
unless match_type.eql?('single')
|
37
|
+
sub_7 = sub_characters(txt, sub_6, "'", '&⎋&')
|
38
|
+
end
|
33
39
|
end
|
34
40
|
txt.apply(SubEscapedRegexReservedCharacters::All)
|
35
41
|
end
|
@@ -1498,6 +1498,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
1498
1498
|
ps = PragmaticSegmenter::Segmenter.new(text: "\"Dinah'll miss me very much to-night, I should think!\" (Dinah was the cat.) \"I hope they'll remember her saucer of milk at tea-time. Dinah, my dear, I wish you were down here with me!\"")
|
1499
1499
|
expect(ps.segment).to eq(["\"Dinah'll miss me very much to-night, I should think!\"", "(Dinah was the cat.)", "\"I hope they'll remember her saucer of milk at tea-time. Dinah, my dear, I wish you were down here with me!\""])
|
1500
1500
|
end
|
1501
|
+
|
1502
|
+
it "correctly segments text #098" do
|
1503
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Hello. 'This is a test of single quotes.' A new sentence.")
|
1504
|
+
expect(ps.segment).to eq(["Hello.", "'This is a test of single quotes.'", "A new sentence."])
|
1505
|
+
end
|
1501
1506
|
end
|
1502
1507
|
end
|
1503
1508
|
|