pragmatic_segmenter 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -0
- data/lib/pragmatic_segmenter/list.rb +5 -4
- data/lib/pragmatic_segmenter/rules.rb +3 -1
- data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +1 -1
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c54f1c34c1f0bd34858cbd9c3915ec47f8a4d807
|
4
|
+
data.tar.gz: 19a936bfd8fb6c0046f06a20f50319fb5a2c0ed7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 824392373d56549289ae89de98976d53afdf74ccde45f3b6069164d8882fbeef7ae04e2fec1261af2baf43223246b4f1f90ebab12b97fa0c22e0d81d6caa56fa
|
7
|
+
data.tar.gz: 4f43e20c69be0515d81c9d98e33b7ed2aadf05d5110ddd7acae3f971745c8406c03763fdc275afa3b14de3d5aa8846a85638845d90cc4c9fd34f4cf9e120d1d3
|
data/README.md
CHANGED
@@ -741,6 +741,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
741
741
|
**Version 0.0.8**
|
742
742
|
* Fix error in `list.rb`
|
743
743
|
|
744
|
+
**Version 0.0.9**
|
745
|
+
* Improve handling of alphabetical and roman numeral lists
|
746
|
+
|
744
747
|
## Contributing
|
745
748
|
|
746
749
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
@@ -32,7 +32,7 @@ module PragmaticSegmenter
|
|
32
32
|
|
33
33
|
# Rubular: http://rubular.com/r/NsNFSqrNvJ
|
34
34
|
EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX =
|
35
|
-
|
35
|
+
/\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))/i
|
36
36
|
|
37
37
|
# Rubular: http://rubular.com/r/wMpnVedEIb
|
38
38
|
ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX =
|
@@ -134,10 +134,10 @@ module PragmaticSegmenter
|
|
134
134
|
|
135
135
|
def replace_alphabet_list_parens(a, txt)
|
136
136
|
txt.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m|
|
137
|
-
if
|
138
|
-
a.eql?(m.dup.downcase) ? "\
|
137
|
+
if m.include?('(')
|
138
|
+
a.eql?(m.dup.downcase.gsub!(/\(/, '')) ? "\r&✂&#{Regexp.escape(m.gsub!(/\(/, ''))}" : "#{m}"
|
139
139
|
else
|
140
|
-
a.eql?(m.dup.downcase) ? "\r#{Regexp.escape(m
|
140
|
+
a.eql?(m.dup.downcase) ? "\r#{Regexp.escape(m)}" : "#{m}"
|
141
141
|
end
|
142
142
|
end
|
143
143
|
end
|
@@ -175,6 +175,7 @@ module PragmaticSegmenter
|
|
175
175
|
else
|
176
176
|
alphabet = ('a'..'z').to_a
|
177
177
|
end
|
178
|
+
list_array.delete_if { |item| !alphabet.any? { |a| a.include?(item) } }
|
178
179
|
list_array.each_with_index do |a, i|
|
179
180
|
if i.eql?(list_array.length - 1)
|
180
181
|
last_array_item_replacement(a, i, alphabet, list_array, txt, parens)
|
@@ -61,6 +61,7 @@ module PragmaticSegmenter
|
|
61
61
|
MixedDoubleQQ = Rule.new(/☇/, '??')
|
62
62
|
MixedDoubleEQ = Rule.new(/☈/, '!?')
|
63
63
|
MixedDoubleEE = Rule.new(/☄/, '!!')
|
64
|
+
LeftParens = Rule.new(/&✂&/, '(')
|
64
65
|
TemporaryEndingPunctutation = Rule.new('ȸ', '')
|
65
66
|
Newline = Rule.new(/ȹ/, "\n")
|
66
67
|
|
@@ -70,7 +71,8 @@ module PragmaticSegmenter
|
|
70
71
|
ExclamationPoint, QuestionMark,
|
71
72
|
FullWidthQuestionMark, MixedDoubleQE,
|
72
73
|
MixedDoubleQQ, MixedDoubleEQ,
|
73
|
-
MixedDoubleEE,
|
74
|
+
MixedDoubleEE, LeftParens,
|
75
|
+
TemporaryEndingPunctutation,
|
74
76
|
Newline ]
|
75
77
|
end
|
76
78
|
|
@@ -3,7 +3,7 @@
|
|
3
3
|
module PragmaticSegmenter
|
4
4
|
# This class splits text at sentence boundary punctuation marks
|
5
5
|
class SentenceBoundaryPunctuation
|
6
|
-
SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)])
|
6
|
+
SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*'(?=\s[A-Z])|"(?:[^"])*"(?=\s[A-Z])|“(?:[^”])*”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/
|
7
7
|
|
8
8
|
attr_reader :text
|
9
9
|
def initialize(text:)
|
@@ -900,7 +900,7 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
900
900
|
end
|
901
901
|
|
902
902
|
it "correctly segments text #086" do
|
903
|
-
ps = PragmaticSegmenter::Segmenter.new(text: "(a) Hello world.
|
903
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "(a) Hello world. (b) Hello world. (c) Hello world. (d) Hello world. (e) Hello world.\n(f) Hello world.", language: "en")
|
904
904
|
expect(ps.segment).to eq(["(a) Hello world.", "(b) Hello world.", "(c) Hello world.", "(d) Hello world.", "(e) Hello world.", "(f) Hello world."])
|
905
905
|
end
|
906
906
|
|