pragmatic_segmenter 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pragmatic_segmenter/cleaner.rb +26 -12
- data/lib/pragmatic_segmenter/ellipsis.rb +0 -1
- data/lib/pragmatic_segmenter/language_support.rb +0 -1
- data/lib/pragmatic_segmenter/languages/deutsch.rb +1 -1
- data/lib/pragmatic_segmenter/list.rb +0 -2
- data/lib/pragmatic_segmenter/number.rb +5 -5
- data/lib/pragmatic_segmenter/process.rb +30 -46
- data/lib/pragmatic_segmenter/punctuation_replacer.rb +17 -37
- data/lib/pragmatic_segmenter/rules.rb +65 -4
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +5 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e0545b8e2fe6446107740b5c458b96e76b6edc51
|
4
|
+
data.tar.gz: 746d97aba038d8f23a6701d7df08205ff48203a8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5975faedda7f913678ea122317266722895376da90be3b4094e50d61d2eef1e531b3df890d93199393f1945a053e14646e8d2b7bc73287de9250751f332483aa
|
7
|
+
data.tar.gz: d48fcd09e289833f82a5e6aa5915b3faa5ecf4417874273a2b8bb7640f51454f1374736449478da8328006277b0727c16b3b8badd258791fb011dd23f351266e
|
@@ -50,9 +50,13 @@ module PragmaticSegmenter
|
|
50
50
|
# Rubular: http://rubular.com/r/DwNSuZrNtk
|
51
51
|
ConsecutivePeriodsRule = Rule.new(/\.{5,}/, ' ')
|
52
52
|
|
53
|
-
# http://rubular.com/r/IQ4TPfsbd8
|
53
|
+
# Rubular: http://rubular.com/r/IQ4TPfsbd8
|
54
54
|
ConsecutiveForwardSlashRule = Rule.new(/\/{3}/, '')
|
55
55
|
|
56
|
+
EscapedCarriageReturnRule = Rule.new(/\\r/, "\r")
|
57
|
+
|
58
|
+
EscapedNewLineRule = Rule.new(/\\n/, "\n")
|
59
|
+
|
56
60
|
ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")
|
57
61
|
|
58
62
|
QuotationsFirstRule = Rule.new(/''/, '"')
|
@@ -81,12 +85,14 @@ module PragmaticSegmenter
|
|
81
85
|
def clean
|
82
86
|
return unless text
|
83
87
|
@clean_text = remove_all_newlines(text)
|
84
|
-
|
85
|
-
|
86
|
-
@clean_text
|
87
|
-
@clean_text
|
88
|
-
@clean_text
|
89
|
-
|
88
|
+
replace_double_newlines(@clean_text)
|
89
|
+
replace_newlines(@clean_text)
|
90
|
+
replace_escaped_newlines(@clean_text)
|
91
|
+
@clean_text.apply(HtmlRules::All)
|
92
|
+
@clean_text.apply(InlineFormattingRule)
|
93
|
+
clean_quotations(@clean_text)
|
94
|
+
clean_table_of_contents(@clean_text)
|
95
|
+
clean_consecutive_characters(@clean_text)
|
90
96
|
end
|
91
97
|
|
92
98
|
private
|
@@ -110,6 +116,11 @@ module PragmaticSegmenter
|
|
110
116
|
txt.apply(NewLineInMiddleOfWordRule)
|
111
117
|
end
|
112
118
|
|
119
|
+
def replace_escaped_newlines(txt)
|
120
|
+
txt.apply(EscapedNewLineRule).
|
121
|
+
apply(EscapedCarriageReturnRule)
|
122
|
+
end
|
123
|
+
|
113
124
|
def replace_double_newlines(txt)
|
114
125
|
txt.apply(DoubleNewLineWithSpaceRule).
|
115
126
|
apply(DoubleNewLineRule)
|
@@ -117,13 +128,11 @@ module PragmaticSegmenter
|
|
117
128
|
|
118
129
|
def replace_newlines(txt)
|
119
130
|
if doc_type.eql?('pdf')
|
120
|
-
|
131
|
+
remove_pdf_line_breaks(txt)
|
121
132
|
else
|
122
|
-
txt
|
123
|
-
|
124
|
-
apply(ReplaceNewlineWithCarriageReturnRule)
|
133
|
+
txt.apply(NewLineFollowedByPeriodRule).
|
134
|
+
apply(ReplaceNewlineWithCarriageReturnRule)
|
125
135
|
end
|
126
|
-
txt
|
127
136
|
end
|
128
137
|
|
129
138
|
def remove_pdf_line_breaks(txt)
|
@@ -142,5 +151,10 @@ module PragmaticSegmenter
|
|
142
151
|
apply(ConsecutivePeriodsRule).
|
143
152
|
apply(ConsecutiveForwardSlashRule)
|
144
153
|
end
|
154
|
+
|
155
|
+
def clean_consecutive_characters(txt)
|
156
|
+
txt.apply(ConsecutivePeriodsRule).
|
157
|
+
apply(ConsecutiveForwardSlashRule)
|
158
|
+
end
|
145
159
|
end
|
146
160
|
end
|
@@ -124,14 +124,12 @@ module PragmaticSegmenter
|
|
124
124
|
txt.gsub!(ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX).with_index do |m|
|
125
125
|
a.eql?(m.chomp('.')) ? "\r#{Regexp.escape(a.to_s)}∯" : "#{m}"
|
126
126
|
end
|
127
|
-
txt
|
128
127
|
end
|
129
128
|
|
130
129
|
def replace_alphabet_list_parens(a, txt)
|
131
130
|
txt.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m|
|
132
131
|
a.eql?(m) ? "\r#{Regexp.escape(a.to_s)}" : "#{m}"
|
133
132
|
end
|
134
|
-
txt
|
135
133
|
end
|
136
134
|
|
137
135
|
def replace_correct_alphabet_list(a, txt, parens)
|
@@ -25,11 +25,11 @@ module PragmaticSegmenter
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def replace
|
28
|
-
@
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
28
|
+
@text.apply(PeriodBeforeNumberRule).
|
29
|
+
apply(NumberAfterPeriodBeforeLetterRule).
|
30
|
+
apply(NewLineNumberPeriodSpaceLetterRule).
|
31
|
+
apply(StartLineNumberPeriodRule).
|
32
|
+
apply(StartLineTwoDigitNumberPeriodRule)
|
33
33
|
end
|
34
34
|
end
|
35
35
|
end
|
@@ -30,57 +30,53 @@ module PragmaticSegmenter
|
|
30
30
|
reformatted_text = replace_abbreviations(reformatted_text)
|
31
31
|
reformatted_text = replace_numbers(reformatted_text)
|
32
32
|
reformatted_text = reformatted_text.apply(GeoLocationRule)
|
33
|
-
|
33
|
+
split_into_segments(reformatted_text)
|
34
34
|
end
|
35
35
|
|
36
36
|
private
|
37
37
|
|
38
|
-
def
|
39
|
-
|
38
|
+
def split_into_segments(txt)
|
39
|
+
txt.split("\r")
|
40
|
+
.map! { |segment| segment.apply(SingleNewLineRule, EllipsisRules::All, EmailRule) }
|
41
|
+
.map { |segment| check_for_punctuation(segment) }.flatten
|
42
|
+
.map! { |segment| segment.apply(SubSymbolsRules::All) }
|
43
|
+
.map { |segment| post_process_segments(segment) }
|
44
|
+
.flatten.compact.delete_if(&:empty?)
|
45
|
+
end
|
40
46
|
|
41
|
-
|
42
|
-
|
47
|
+
def post_process_segments(txt)
|
48
|
+
return if consecutive_underscore?(txt) || txt.length < 2
|
49
|
+
txt.apply(ReinsertEllipsisRules::All).apply(ExtraWhiteSpaceRule)
|
50
|
+
if txt =~ QUOTATION_AT_END_OF_SENTENCE_REGEX
|
51
|
+
txt.split(SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX)
|
52
|
+
else
|
53
|
+
txt.tr("\n", '').strip
|
43
54
|
end
|
55
|
+
end
|
44
56
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
sentence_array = []
|
50
|
-
segments.each_with_index do |line|
|
51
|
-
next if line.gsub(/_{3,}/, '').length.eql?(0) || line.length < 2
|
52
|
-
line = reinsert_ellipsis(line)
|
53
|
-
line = line.apply(ExtraWhiteSpaceRule)
|
54
|
-
if line =~ QUOTATION_AT_END_OF_SENTENCE_REGEX
|
55
|
-
subline = line.split(SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX)
|
56
|
-
subline.each do |s|
|
57
|
-
sentence_array << s
|
58
|
-
end
|
59
|
-
else
|
60
|
-
sentence_array << line.tr("\n", '').strip
|
61
|
-
end
|
62
|
-
end
|
63
|
-
sentence_array.reject(&:empty?)
|
57
|
+
def consecutive_underscore?(txt)
|
58
|
+
# Rubular: http://rubular.com/r/fTF2Ff3WBL
|
59
|
+
txt.gsub(/_{3,}/, '').length.eql?(0)
|
64
60
|
end
|
65
61
|
|
66
|
-
def
|
67
|
-
if punctuation_array.any? { |p|
|
68
|
-
process_text(
|
62
|
+
def check_for_punctuation(txt)
|
63
|
+
if punctuation_array.any? { |p| txt.include?(p) }
|
64
|
+
process_text(txt)
|
69
65
|
else
|
70
|
-
|
66
|
+
txt
|
71
67
|
end
|
72
68
|
end
|
73
69
|
|
74
|
-
def process_text(
|
75
|
-
|
76
|
-
PragmaticSegmenter::ExclamationWords.apply_rules(
|
77
|
-
between_punctutation(
|
78
|
-
|
70
|
+
def process_text(txt)
|
71
|
+
txt << 'ȸ' unless punctuation_array.any? { |p| txt[-1].include?(p) }
|
72
|
+
PragmaticSegmenter::ExclamationWords.apply_rules(txt)
|
73
|
+
between_punctutation(txt)
|
74
|
+
txt = txt.apply(
|
79
75
|
DoublePuctationRules::All,
|
80
76
|
QuestionMarkInQuotationRule,
|
81
77
|
ExclamationPointRules::All
|
82
78
|
)
|
83
|
-
sentence_boundary_punctuation(
|
79
|
+
sentence_boundary_punctuation(txt)
|
84
80
|
end
|
85
81
|
|
86
82
|
def replace_numbers(txt)
|
@@ -102,17 +98,5 @@ module PragmaticSegmenter
|
|
102
98
|
def sentence_boundary_punctuation(txt)
|
103
99
|
PragmaticSegmenter::SentenceBoundaryPunctuation.new(text: txt).split
|
104
100
|
end
|
105
|
-
|
106
|
-
def sub_symbols(txt)
|
107
|
-
txt.gsub(/∯/, '.').gsub(/♬/, '،').gsub(/♭/, ':').gsub(/ᓰ/, '。').gsub(/ᓱ/, '.')
|
108
|
-
.gsub(/ᓳ/, '!').gsub(/ᓴ/, '!').gsub(/ᓷ/, '?').gsub(/ᓸ/, '?').gsub(/☉/, '?!')
|
109
|
-
.gsub(/☈/, '!?').gsub(/☇/, '??').gsub(/☄/, '!!').delete('ȸ').gsub(/ȹ/, "\n")
|
110
|
-
end
|
111
|
-
|
112
|
-
def reinsert_ellipsis(line)
|
113
|
-
line.gsub(/ƪ/, '...').gsub(/♟/, ' . . . ')
|
114
|
-
.gsub(/♝/, '. . . .').gsub(/☏/, '..')
|
115
|
-
.gsub(/∮/, '.')
|
116
|
-
end
|
117
101
|
end
|
118
102
|
end
|
@@ -4,6 +4,7 @@ module PragmaticSegmenter
|
|
4
4
|
# This class replaces punctuation that is typically a sentence boundary
|
5
5
|
# but in this case is not a sentence boundary.
|
6
6
|
class PunctuationReplacer
|
7
|
+
include Rules
|
7
8
|
attr_reader :matches_array, :text
|
8
9
|
def initialize(text:, matches_array:)
|
9
10
|
@text = text
|
@@ -18,45 +19,24 @@ module PragmaticSegmenter
|
|
18
19
|
|
19
20
|
def replace_punctuation(array, txt)
|
20
21
|
return if !array || array.empty?
|
21
|
-
txt.
|
22
|
-
txt.gsub!(')', '\\)')
|
23
|
-
txt.gsub!(']', '\\]')
|
24
|
-
txt.gsub!('[', '\\[')
|
25
|
-
txt.gsub!('-', '\\-')
|
22
|
+
txt.apply(EscapeRegexReservedCharacters::All)
|
26
23
|
array.each do |a|
|
27
|
-
a.
|
28
|
-
a.
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
txt
|
35
|
-
|
36
|
-
sub_1 = sub.gsub('。', 'ᓰ')
|
37
|
-
txt.gsub!(/#{Regexp.escape(sub)}/, "#{sub_1}")
|
38
|
-
|
39
|
-
sub_2 = sub_1.gsub('.', 'ᓱ')
|
40
|
-
txt.gsub!(/#{Regexp.escape(sub_1)}/, "#{sub_2}")
|
41
|
-
|
42
|
-
sub_3 = sub_2.gsub('!', 'ᓳ')
|
43
|
-
txt.gsub!(/#{Regexp.escape(sub_2)}/, "#{sub_3}")
|
44
|
-
|
45
|
-
sub_4 = sub_3.gsub('!', 'ᓴ')
|
46
|
-
txt.gsub!(/#{Regexp.escape(sub_3)}/, "#{sub_4}")
|
47
|
-
|
48
|
-
sub_5 = sub_4.gsub('?', 'ᓷ')
|
49
|
-
txt.gsub!(/#{Regexp.escape(sub_4)}/, "#{sub_5}")
|
50
|
-
|
51
|
-
sub_6 = sub_5.gsub('?', 'ᓸ')
|
52
|
-
txt.gsub!(/#{Regexp.escape(sub_5)}/, "#{sub_6}")
|
24
|
+
a.apply(EscapeRegexReservedCharacters::All)
|
25
|
+
sub = sub_characters(txt, a, '.', '∯')
|
26
|
+
sub_1 = sub_characters(txt, sub, '。', '&ᓰ&')
|
27
|
+
sub_2 = sub_characters(txt, sub_1, '.', '&ᓱ&')
|
28
|
+
sub_3 = sub_characters(txt, sub_2, '!', '&ᓳ&')
|
29
|
+
sub_4 = sub_characters(txt, sub_3, '!', '&ᓴ&')
|
30
|
+
sub_5 = sub_characters(txt, sub_4, '?', '&ᓷ&')
|
31
|
+
sub_6 = sub_characters(txt, sub_5, '?', '&ᓸ&')
|
53
32
|
end
|
54
|
-
txt.
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
txt
|
33
|
+
txt.apply(SubEscapedRegexReservedCharacters::All)
|
34
|
+
end
|
35
|
+
|
36
|
+
def sub_characters(txt, string, char_a, char_b)
|
37
|
+
sub = string.gsub(char_a, char_b)
|
38
|
+
txt.gsub!(/#{Regexp.escape(string)}/, "#{sub}")
|
39
|
+
sub
|
60
40
|
end
|
61
41
|
end
|
62
42
|
end
|
@@ -11,17 +11,17 @@ module PragmaticSegmenter
|
|
11
11
|
ExtraWhiteSpaceRule = Rule.new(/\s{3,}/, ' ')
|
12
12
|
|
13
13
|
# Rubular: http://rubular.com/r/aXPUGm6fQh
|
14
|
-
QuestionMarkInQuotationRule = Rule.new(/\?(?=(\'|\"))/, '
|
14
|
+
QuestionMarkInQuotationRule = Rule.new(/\?(?=(\'|\"))/, '&ᓷ&')
|
15
15
|
|
16
16
|
module ExclamationPointRules
|
17
17
|
# Rubular: http://rubular.com/r/XS1XXFRfM2
|
18
|
-
InQuotationRule = Rule.new(/\!(?=(\'|\"))/, '
|
18
|
+
InQuotationRule = Rule.new(/\!(?=(\'|\"))/, '&ᓴ&')
|
19
19
|
|
20
20
|
# Rubular: http://rubular.com/r/sl57YI8LkA
|
21
|
-
BeforeCommaMidSentenceRule = Rule.new(/\!(?=\,\s[a-z])/, '
|
21
|
+
BeforeCommaMidSentenceRule = Rule.new(/\!(?=\,\s[a-z])/, '&ᓴ&')
|
22
22
|
|
23
23
|
# Rubular: http://rubular.com/r/f9zTjmkIPb
|
24
|
-
MidSentenceRule = Rule.new(/\!(?=\s[a-z])/, '
|
24
|
+
MidSentenceRule = Rule.new(/\!(?=\s[a-z])/, '&ᓴ&')
|
25
25
|
|
26
26
|
All = [ InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule ]
|
27
27
|
end
|
@@ -34,5 +34,66 @@ module PragmaticSegmenter
|
|
34
34
|
|
35
35
|
All = [ FirstRule, SecondRule, ThirdRule, ForthRule ]
|
36
36
|
end
|
37
|
+
|
38
|
+
module ReinsertEllipsisRules
|
39
|
+
ThreeConsecutivePeriod = Rule.new(/ƪ/, '...')
|
40
|
+
ThreeSpacePeriod = Rule.new(/♟/, ' . . . ')
|
41
|
+
FourSpacePeriod = Rule.new(/♝/, '. . . .')
|
42
|
+
TwoConsecutivePeriod = Rule.new(/☏/, '..')
|
43
|
+
OnePeriod = Rule.new(/∮/, '.')
|
44
|
+
|
45
|
+
All = [ ThreeConsecutivePeriod, ThreeSpacePeriod,
|
46
|
+
FourSpacePeriod, TwoConsecutivePeriod,
|
47
|
+
OnePeriod ]
|
48
|
+
end
|
49
|
+
|
50
|
+
module SubSymbolsRules
|
51
|
+
Period = Rule.new(/∯/, '.')
|
52
|
+
ArabicComma = Rule.new(/♬/, '،')
|
53
|
+
SemiColon = Rule.new(/♭/, ':')
|
54
|
+
FullWidthPeriod = Rule.new(/&ᓰ&/, '。')
|
55
|
+
SpecialPeriod = Rule.new(/&ᓱ&/, '.')
|
56
|
+
FullWidthExclamation = Rule.new(/&ᓳ&/, '!')
|
57
|
+
ExclamationPoint = Rule.new(/&ᓴ&/, '!')
|
58
|
+
QuestionMark = Rule.new(/&ᓷ&/, '?')
|
59
|
+
FullWidthQuestionMark = Rule.new(/&ᓸ&/, '?')
|
60
|
+
MixedDoubleQE = Rule.new(/☉/, '?!')
|
61
|
+
MixedDoubleQQ = Rule.new(/☇/, '??')
|
62
|
+
MixedDoubleEQ = Rule.new(/☈/, '!?')
|
63
|
+
MixedDoubleEE = Rule.new(/☄/, '!!')
|
64
|
+
TemporaryEndingPunctutation = Rule.new('ȸ', '')
|
65
|
+
Newline = Rule.new(/ȹ/, "\n")
|
66
|
+
|
67
|
+
All = [ Period, ArabicComma,
|
68
|
+
SemiColon, FullWidthPeriod,
|
69
|
+
SpecialPeriod, FullWidthExclamation,
|
70
|
+
ExclamationPoint, QuestionMark,
|
71
|
+
FullWidthQuestionMark, MixedDoubleQE,
|
72
|
+
MixedDoubleQQ, MixedDoubleEQ,
|
73
|
+
MixedDoubleEE, TemporaryEndingPunctutation,
|
74
|
+
Newline ]
|
75
|
+
end
|
76
|
+
|
77
|
+
module EscapeRegexReservedCharacters
|
78
|
+
LeftParen = Rule.new('(', '\\(')
|
79
|
+
RightParen = Rule.new(')', '\\)')
|
80
|
+
LeftBracket = Rule.new('[', '\\[')
|
81
|
+
RightBracket = Rule.new(']', '\\]')
|
82
|
+
Dash = Rule.new('-', '\\-')
|
83
|
+
|
84
|
+
All = [ LeftParen, RightParen,
|
85
|
+
LeftBracket, RightBracket, Dash ]
|
86
|
+
end
|
87
|
+
|
88
|
+
module SubEscapedRegexReservedCharacters
|
89
|
+
LeftParen = Rule.new('\\(', '(')
|
90
|
+
RightParen = Rule.new('\\)', ')')
|
91
|
+
LeftBracket = Rule.new('\\[', '[')
|
92
|
+
RightBracket = Rule.new('\\]', ']')
|
93
|
+
Dash = Rule.new('\\-', '-')
|
94
|
+
|
95
|
+
All = [ LeftParen, RightParen,
|
96
|
+
LeftBracket, RightBracket, Dash ]
|
97
|
+
end
|
37
98
|
end
|
38
99
|
end
|
@@ -868,6 +868,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
868
868
|
ps = PragmaticSegmenter::Segmenter.new(text: "////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////Header starts here\r////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////", language: 'en')
|
869
869
|
expect(ps.segment).to eq(["Header starts here"])
|
870
870
|
end
|
871
|
+
|
872
|
+
it 'correctly segments text #082' do
|
873
|
+
ps = PragmaticSegmenter::Segmenter.new(text: 'Hello World. \r\n Hello.', language: 'en')
|
874
|
+
expect(ps.segment).to eq(["Hello World.", "Hello."])
|
875
|
+
end
|
871
876
|
end
|
872
877
|
end
|
873
878
|
|