pragmatic_segmenter 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a2951fa2242e1eb7ce0898862d41fd5239d871f8
|
4
|
+
data.tar.gz: 01d84b2637e84598907ef08e59ec64e90855c2ec
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 25ec02bc41c649a57b1bc993848bed546514568c74e2e0ec5b6a2242fbbb579e61e199210f8ea4349ceadee5a06ad0fa76c7c60196762903cab984bbad3c2286
|
7
|
+
data.tar.gz: 925a24d8c131813ddb4f80a8865697baa0efe05ef781cabfc13dfb9b1efc878c2e6e276a46a9f1301049a99b2f64a558b6a9911e0a2f5f8f4c17622df3fd0b2a
|
data/README.md
CHANGED
@@ -760,6 +760,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
760
760
|
**Version 0.1.4**
|
761
761
|
* Fix missing abbreviations
|
762
762
|
|
763
|
+
**Version 0.1.5**
|
764
|
+
* Fix comma at end of quotation bug
|
765
|
+
|
763
766
|
## Contributing
|
764
767
|
|
765
768
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
@@ -3,7 +3,7 @@
|
|
3
3
|
module PragmaticSegmenter
|
4
4
|
# This class splits text at sentence boundary punctuation marks
|
5
5
|
class SentenceBoundaryPunctuation
|
6
|
-
SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*'(?=\s[A-Z])|"(?:[^"])*"(?=\s[A-Z])|“(?:[^”])
|
6
|
+
SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/
|
7
7
|
|
8
8
|
attr_reader :text
|
9
9
|
def initialize(text:)
|
@@ -928,6 +928,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
928
928
|
ps = PragmaticSegmenter::Segmenter.new(text: "[15: 12:32] [16: firma? 13:28]")
|
929
929
|
expect(ps.segment).to eq(["[15: 12:32] [16: firma? 13:28]"])
|
930
930
|
end
|
931
|
+
|
932
|
+
it "correctly segments text #092" do
|
933
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "\"It's a good thing that the water is really calm,\" I answered ironically.")
|
934
|
+
expect(ps.segment).to eq(["\"It's a good thing that the water is really calm,\" I answered ironically."])
|
935
|
+
end
|
931
936
|
end
|
932
937
|
end
|
933
938
|
|