pragmatic_segmenter 0.3.14 → 0.3.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/NEWS +5 -1
- data/README.md +3 -0
- data/lib/pragmatic_segmenter/between_punctuation.rb +11 -0
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter/languages/english_spec.rb +15 -0
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: d67794ddb8ba2e0fdcdd4b0bda833ec07d7addc0
|
|
4
|
+
data.tar.gz: 04e7435b5148d865aca9062a9c72f88f8768860e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: c23512769b5ad8d8190ff84443662d41959960352b45e5c108ed8417d12953f44be37ce8117b2434bb72c411b4c905472525e1411984e5e3cd39de97d1865b93
|
|
7
|
+
data.tar.gz: 6e108d84ad98a09e5cc6bfa51881541eb1f90b7103e7c004a1cd2980a3bed0c970e850fb5df4231af326dd15b7eaf04e1a09f5c96e849e2942a3e8dbbf94638f
|
data/NEWS
CHANGED
|
@@ -1,4 +1,8 @@
|
|
|
1
|
-
0.3.
|
|
1
|
+
0.3.15 (2017-06-28):
|
|
2
|
+
|
|
3
|
+
* Improvement: Handle text set off by em dashes in the middle of a sentence when it contains a sentence-ending punctuation mark (e.g. "and -- voila! -- you have cake")
|
|
4
|
+
|
|
5
|
+
0.3.14 (2017-06-28):
|
|
2
6
|
|
|
3
7
|
* Improvement: Add English abbreviation Rs. to denote the Indian currency
|
|
4
8
|
|
data/README.md
CHANGED
|
@@ -853,6 +853,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
|
853
853
|
**Version 0.3.14**
|
|
854
854
|
* Add English abbreviation Rs. to denote the Indian currency
|
|
855
855
|
|
|
856
|
+
**Version 0.3.15**
|
|
857
|
+
* Handle text set off by em dashes in the middle of a sentence when it contains a sentence-ending punctuation mark
|
|
858
|
+
|
|
856
859
|
## Contributing
|
|
857
860
|
|
|
858
861
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
|
data/lib/pragmatic_segmenter/between_punctuation.rb
CHANGED
|
@@ -25,6 +25,9 @@ module PragmaticSegmenter
|
|
|
25
25
|
# Rubular: http://rubular.com/r/mXf8cW025o
|
|
26
26
|
WORD_WITH_LEADING_APOSTROPHE = /(?<=\s)'(?:[^']|'[a-zA-Z])*'\S/
|
|
27
27
|
|
|
28
|
+
# Matches a span opened and closed by a double-hyphen em dash ("--") with no hyphens in between. Rubular: http://rubular.com/r/jTtDKfjxzr
|
|
29
|
+
BETWEEN_EM_DASHES_REGEX = /\-\-(?>[^\-\-])*\-\-/
|
|
30
|
+
|
|
28
31
|
attr_reader :text
|
|
29
32
|
def initialize(text:)
|
|
30
33
|
@text = text
|
|
@@ -42,6 +45,7 @@ module PragmaticSegmenter
|
|
|
42
45
|
sub_punctuation_between_square_brackets(txt)
|
|
43
46
|
sub_punctuation_between_parens(txt)
|
|
44
47
|
sub_punctuation_between_quotes_arrow(txt)
|
|
48
|
+
sub_punctuation_between_em_dashes(txt)
|
|
45
49
|
sub_punctuation_between_quotes_slanted(txt)
|
|
46
50
|
end
|
|
47
51
|
|
|
@@ -87,6 +91,13 @@ module PragmaticSegmenter
|
|
|
87
91
|
).replace
|
|
88
92
|
end
|
|
89
93
|
|
|
94
|
+
def sub_punctuation_between_em_dashes(txt)
|
|
95
|
+
PragmaticSegmenter::PunctuationReplacer.new(
|
|
96
|
+
matches_array: txt.scan(BETWEEN_EM_DASHES_REGEX),
|
|
97
|
+
text: txt
|
|
98
|
+
).replace
|
|
99
|
+
end
|
|
100
|
+
|
|
90
101
|
def sub_punctuation_between_quotes_slanted(txt)
|
|
91
102
|
PragmaticSegmenter::PunctuationReplacer.new(
|
|
92
103
|
matches_array: txt.scan(BETWEEN_QUOTE_SLANTED_REGEX),
|
|
data/spec/pragmatic_segmenter/languages/english_spec.rb
CHANGED
|
@@ -1384,5 +1384,20 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
|
|
|
1384
1384
|
ps = PragmaticSegmenter::Segmenter.new(text: "After completion of each Period, I will be paid an advance amount of rs. 1000 and this amount will be deducted from my final study compensation.", clean: false)
|
|
1385
1385
|
expect(ps.segment).to eq(["After completion of each Period, I will be paid an advance amount of rs. 1000 and this amount will be deducted from my final study compensation."])
|
|
1386
1386
|
end
|
|
1387
|
+
|
|
1388
|
+
it "correctly segments text #114" do
|
|
1389
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Mix it, put it in the oven, and -- voila! -- you have cake.", clean: false)
|
|
1390
|
+
expect(ps.segment).to eq(["Mix it, put it in the oven, and -- voila! -- you have cake."])
|
|
1391
|
+
end
|
|
1392
|
+
|
|
1393
|
+
it "correctly segments text #115" do
|
|
1394
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Some can be -- if I may say so? -- a bit questionable.", clean: false)
|
|
1395
|
+
expect(ps.segment).to eq(["Some can be -- if I may say so? -- a bit questionable."])
|
|
1396
|
+
end
|
|
1397
|
+
|
|
1398
|
+
it "correctly segments text #116" do
|
|
1399
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "What do you see? - Posted like silent sentinels all around the town, stand thousands upon thousands of mortal men fixed in ocean reveries.", clean: false)
|
|
1400
|
+
expect(ps.segment).to eq(["What do you see?", "- Posted like silent sentinels all around the town, stand thousands upon thousands of mortal men fixed in ocean reveries."])
|
|
1401
|
+
end
|
|
1387
1402
|
end
|
|
1388
1403
|
end
|