pragmatic_segmenter 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +3 -0
- data/lib/pragmatic_segmenter/list.rb +5 -4
- data/lib/pragmatic_segmenter/rules.rb +3 -1
- data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +1 -1
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c54f1c34c1f0bd34858cbd9c3915ec47f8a4d807
|
4
|
+
data.tar.gz: 19a936bfd8fb6c0046f06a20f50319fb5a2c0ed7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 824392373d56549289ae89de98976d53afdf74ccde45f3b6069164d8882fbeef7ae04e2fec1261af2baf43223246b4f1f90ebab12b97fa0c22e0d81d6caa56fa
|
7
|
+
data.tar.gz: 4f43e20c69be0515d81c9d98e33b7ed2aadf05d5110ddd7acae3f971745c8406c03763fdc275afa3b14de3d5aa8846a85638845d90cc4c9fd34f4cf9e120d1d3
|
data/README.md
CHANGED
@@ -741,6 +741,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
741
741
|
**Version 0.0.8**
|
742
742
|
* Fix error in `list.rb`
|
743
743
|
|
744
|
+
**Version 0.0.9**
|
745
|
+
* Improve handling of alphabetical and roman numeral lists
|
746
|
+
|
744
747
|
## Contributing
|
745
748
|
|
746
749
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
@@ -32,7 +32,7 @@ module PragmaticSegmenter
|
|
32
32
|
|
33
33
|
# Rubular: http://rubular.com/r/NsNFSqrNvJ
|
34
34
|
EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX =
|
35
|
-
|
35
|
+
/\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))/i
|
36
36
|
|
37
37
|
# Rubular: http://rubular.com/r/wMpnVedEIb
|
38
38
|
ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX =
|
@@ -134,10 +134,10 @@ module PragmaticSegmenter
|
|
134
134
|
|
135
135
|
def replace_alphabet_list_parens(a, txt)
|
136
136
|
txt.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m|
|
137
|
-
if
|
138
|
-
a.eql?(m.dup.downcase) ? "\
|
137
|
+
if m.include?('(')
|
138
|
+
a.eql?(m.dup.downcase.gsub!(/\(/, '')) ? "\r&✂&#{Regexp.escape(m.gsub!(/\(/, ''))}" : "#{m}"
|
139
139
|
else
|
140
|
-
a.eql?(m.dup.downcase) ? "\r#{Regexp.escape(m
|
140
|
+
a.eql?(m.dup.downcase) ? "\r#{Regexp.escape(m)}" : "#{m}"
|
141
141
|
end
|
142
142
|
end
|
143
143
|
end
|
@@ -175,6 +175,7 @@ module PragmaticSegmenter
|
|
175
175
|
else
|
176
176
|
alphabet = ('a'..'z').to_a
|
177
177
|
end
|
178
|
+
list_array.delete_if { |item| !alphabet.any? { |a| a.include?(item) } }
|
178
179
|
list_array.each_with_index do |a, i|
|
179
180
|
if i.eql?(list_array.length - 1)
|
180
181
|
last_array_item_replacement(a, i, alphabet, list_array, txt, parens)
|
@@ -61,6 +61,7 @@ module PragmaticSegmenter
|
|
61
61
|
MixedDoubleQQ = Rule.new(/☇/, '??')
|
62
62
|
MixedDoubleEQ = Rule.new(/☈/, '!?')
|
63
63
|
MixedDoubleEE = Rule.new(/☄/, '!!')
|
64
|
+
LeftParens = Rule.new(/&✂&/, '(')
|
64
65
|
TemporaryEndingPunctutation = Rule.new('ȸ', '')
|
65
66
|
Newline = Rule.new(/ȹ/, "\n")
|
66
67
|
|
@@ -70,7 +71,8 @@ module PragmaticSegmenter
|
|
70
71
|
ExclamationPoint, QuestionMark,
|
71
72
|
FullWidthQuestionMark, MixedDoubleQE,
|
72
73
|
MixedDoubleQQ, MixedDoubleEQ,
|
73
|
-
MixedDoubleEE,
|
74
|
+
MixedDoubleEE, LeftParens,
|
75
|
+
TemporaryEndingPunctutation,
|
74
76
|
Newline ]
|
75
77
|
end
|
76
78
|
|
@@ -3,7 +3,7 @@
|
|
3
3
|
module PragmaticSegmenter
|
4
4
|
# This class splits text at sentence boundary punctuation marks
|
5
5
|
class SentenceBoundaryPunctuation
|
6
|
-
SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)])
|
6
|
+
SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*'(?=\s[A-Z])|"(?:[^"])*"(?=\s[A-Z])|“(?:[^”])*”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/
|
7
7
|
|
8
8
|
attr_reader :text
|
9
9
|
def initialize(text:)
|
@@ -900,7 +900,7 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
900
900
|
end
|
901
901
|
|
902
902
|
it "correctly segments text #086" do
|
903
|
-
ps = PragmaticSegmenter::Segmenter.new(text: "(a) Hello world.
|
903
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "(a) Hello world. (b) Hello world. (c) Hello world. (d) Hello world. (e) Hello world.\n(f) Hello world.", language: "en")
|
904
904
|
expect(ps.segment).to eq(["(a) Hello world.", "(b) Hello world.", "(c) Hello world.", "(d) Hello world.", "(e) Hello world.", "(f) Hello world."])
|
905
905
|
end
|
906
906
|
|