pragmatic_segmenter 0.3.14 → 0.3.15
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/NEWS +5 -1
- data/README.md +3 -0
- data/lib/pragmatic_segmenter/between_punctuation.rb +11 -0
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter/languages/english_spec.rb +15 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d67794ddb8ba2e0fdcdd4b0bda833ec07d7addc0
|
4
|
+
data.tar.gz: 04e7435b5148d865aca9062a9c72f88f8768860e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c23512769b5ad8d8190ff84443662d41959960352b45e5c108ed8417d12953f44be37ce8117b2434bb72c411b4c905472525e1411984e5e3cd39de97d1865b93
|
7
|
+
data.tar.gz: 6e108d84ad98a09e5cc6bfa51881541eb1f90b7103e7c004a1cd2980a3bed0c970e850fb5df4231af326dd15b7eaf04e1a09f5c96e849e2942a3e8dbbf94638f
|
data/NEWS
CHANGED
@@ -1,4 +1,8 @@
|
|
1
|
-
0.3.
|
1
|
+
0.3.15 (2017-06-28):
|
2
|
+
|
3
|
+
* Improvement: Handle em dashes that appear in the middle of a sentence and include a sentence ending punctuation mark
|
4
|
+
|
5
|
+
0.3.14 (2017-06-28):
|
2
6
|
|
3
7
|
* Improvement: Add English abbreviation Rs. to denote the Indian currency
|
4
8
|
|
data/README.md
CHANGED
@@ -853,6 +853,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
853
853
|
**Version 0.3.14**
|
854
854
|
* Add English abbreviation Rs. to denote the Indian currency
|
855
855
|
|
856
|
+
**Version 0.3.15**
|
857
|
+
* Handle em dashes that appear in the middle of a sentence and include a sentence ending punctuation mark
|
858
|
+
|
856
859
|
## Contributing
|
857
860
|
|
858
861
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
data/lib/pragmatic_segmenter/between_punctuation.rb
CHANGED
@@ -25,6 +25,9 @@ module PragmaticSegmenter
|
|
25
25
|
# Rubular: http://rubular.com/r/mXf8cW025o
|
26
26
|
WORD_WITH_LEADING_APOSTROPHE = /(?<=\s)'(?:[^']|'[a-zA-Z])*'\S/
|
27
27
|
|
28
|
+
# Rubular: http://rubular.com/r/jTtDKfjxzr
|
29
|
+
BETWEEN_EM_DASHES_REGEX = /\-\-(?>[^\-\-])*\-\-/
|
30
|
+
|
28
31
|
attr_reader :text
|
29
32
|
def initialize(text:)
|
30
33
|
@text = text
|
@@ -42,6 +45,7 @@ module PragmaticSegmenter
|
|
42
45
|
sub_punctuation_between_square_brackets(txt)
|
43
46
|
sub_punctuation_between_parens(txt)
|
44
47
|
sub_punctuation_between_quotes_arrow(txt)
|
48
|
+
sub_punctuation_between_em_dashes(txt)
|
45
49
|
sub_punctuation_between_quotes_slanted(txt)
|
46
50
|
end
|
47
51
|
|
@@ -87,6 +91,13 @@ module PragmaticSegmenter
|
|
87
91
|
).replace
|
88
92
|
end
|
89
93
|
|
94
|
+
def sub_punctuation_between_em_dashes(txt)
|
95
|
+
PragmaticSegmenter::PunctuationReplacer.new(
|
96
|
+
matches_array: txt.scan(BETWEEN_EM_DASHES_REGEX),
|
97
|
+
text: txt
|
98
|
+
).replace
|
99
|
+
end
|
100
|
+
|
90
101
|
def sub_punctuation_between_quotes_slanted(txt)
|
91
102
|
PragmaticSegmenter::PunctuationReplacer.new(
|
92
103
|
matches_array: txt.scan(BETWEEN_QUOTE_SLANTED_REGEX),
|
data/spec/pragmatic_segmenter/languages/english_spec.rb
CHANGED
@@ -1384,5 +1384,20 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
|
|
1384
1384
|
ps = PragmaticSegmenter::Segmenter.new(text: "After completion of each Period, I will be paid an advance amount of rs. 1000 and this amount will be deducted from my final study compensation.", clean: false)
|
1385
1385
|
expect(ps.segment).to eq(["After completion of each Period, I will be paid an advance amount of rs. 1000 and this amount will be deducted from my final study compensation."])
|
1386
1386
|
end
|
1387
|
+
|
1388
|
+
it "correctly segments text #114" do
|
1389
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Mix it, put it in the oven, and -- voila! -- you have cake.", clean: false)
|
1390
|
+
expect(ps.segment).to eq(["Mix it, put it in the oven, and -- voila! -- you have cake."])
|
1391
|
+
end
|
1392
|
+
|
1393
|
+
it "correctly segments text #115" do
|
1394
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Some can be -- if I may say so? -- a bit questionable.", clean: false)
|
1395
|
+
expect(ps.segment).to eq(["Some can be -- if I may say so? -- a bit questionable."])
|
1396
|
+
end
|
1397
|
+
|
1398
|
+
it "correctly segments text #116" do
|
1399
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "What do you see? - Posted like silent sentinels all around the town, stand thousands upon thousands of mortal men fixed in ocean reveries.", clean: false)
|
1400
|
+
expect(ps.segment).to eq(["What do you see?", "- Posted like silent sentinels all around the town, stand thousands upon thousands of mortal men fixed in ocean reveries."])
|
1401
|
+
end
|
1387
1402
|
end
|
1388
1403
|
end
|