pragmatic_segmenter 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a2951fa2242e1eb7ce0898862d41fd5239d871f8
|
4
|
+
data.tar.gz: 01d84b2637e84598907ef08e59ec64e90855c2ec
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 25ec02bc41c649a57b1bc993848bed546514568c74e2e0ec5b6a2242fbbb579e61e199210f8ea4349ceadee5a06ad0fa76c7c60196762903cab984bbad3c2286
|
7
|
+
data.tar.gz: 925a24d8c131813ddb4f80a8865697baa0efe05ef781cabfc13dfb9b1efc878c2e6e276a46a9f1301049a99b2f64a558b6a9911e0a2f5f8f4c17622df3fd0b2a
|
data/README.md
CHANGED
@@ -760,6 +760,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
760
760
|
**Version 0.1.4**
|
761
761
|
* Fix missing abbreviations
|
762
762
|
|
763
|
+
**Version 0.1.5**
|
764
|
+
* Fix comma at end of quotation bug
|
765
|
+
|
763
766
|
## Contributing
|
764
767
|
|
765
768
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
@@ -3,7 +3,7 @@
|
|
3
3
|
module PragmaticSegmenter
|
4
4
|
# This class splits text at sentence boundary punctuation marks
|
5
5
|
class SentenceBoundaryPunctuation
|
6
|
-
SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*'(?=\s[A-Z])|"(?:[^"])*"(?=\s[A-Z])|“(?:[^”])
|
6
|
+
SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/
|
7
7
|
|
8
8
|
attr_reader :text
|
9
9
|
def initialize(text:)
|
@@ -928,6 +928,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
928
928
|
ps = PragmaticSegmenter::Segmenter.new(text: "[15: 12:32] [16: firma? 13:28]")
|
929
929
|
expect(ps.segment).to eq(["[15: 12:32] [16: firma? 13:28]"])
|
930
930
|
end
|
931
|
+
|
932
|
+
it "correctly segments text #092" do
|
933
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "\"It's a good thing that the water is really calm,\" I answered ironically.")
|
934
|
+
expect(ps.segment).to eq(["\"It's a good thing that the water is really calm,\" I answered ironically."])
|
935
|
+
end
|
931
936
|
end
|
932
937
|
end
|
933
938
|
|