pragmatic_segmenter 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/pragmatic_segmenter/cleaner.rb +26 -12
- data/lib/pragmatic_segmenter/ellipsis.rb +0 -1
- data/lib/pragmatic_segmenter/language_support.rb +0 -1
- data/lib/pragmatic_segmenter/languages/deutsch.rb +1 -1
- data/lib/pragmatic_segmenter/list.rb +0 -2
- data/lib/pragmatic_segmenter/number.rb +5 -5
- data/lib/pragmatic_segmenter/process.rb +30 -46
- data/lib/pragmatic_segmenter/punctuation_replacer.rb +17 -37
- data/lib/pragmatic_segmenter/rules.rb +65 -4
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +5 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e0545b8e2fe6446107740b5c458b96e76b6edc51
|
4
|
+
data.tar.gz: 746d97aba038d8f23a6701d7df08205ff48203a8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5975faedda7f913678ea122317266722895376da90be3b4094e50d61d2eef1e531b3df890d93199393f1945a053e14646e8d2b7bc73287de9250751f332483aa
|
7
|
+
data.tar.gz: d48fcd09e289833f82a5e6aa5915b3faa5ecf4417874273a2b8bb7640f51454f1374736449478da8328006277b0727c16b3b8badd258791fb011dd23f351266e
|
@@ -50,9 +50,13 @@ module PragmaticSegmenter
|
|
50
50
|
# Rubular: http://rubular.com/r/DwNSuZrNtk
|
51
51
|
ConsecutivePeriodsRule = Rule.new(/\.{5,}/, ' ')
|
52
52
|
|
53
|
-
# http://rubular.com/r/IQ4TPfsbd8
|
53
|
+
# Rubular: http://rubular.com/r/IQ4TPfsbd8
|
54
54
|
ConsecutiveForwardSlashRule = Rule.new(/\/{3}/, '')
|
55
55
|
|
56
|
+
EscapedCarriageReturnRule = Rule.new(/\\r/, "\r")
|
57
|
+
|
58
|
+
EscapedNewLineRule = Rule.new(/\\n/, "\n")
|
59
|
+
|
56
60
|
ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")
|
57
61
|
|
58
62
|
QuotationsFirstRule = Rule.new(/''/, '"')
|
@@ -81,12 +85,14 @@ module PragmaticSegmenter
|
|
81
85
|
def clean
|
82
86
|
return unless text
|
83
87
|
@clean_text = remove_all_newlines(text)
|
84
|
-
|
85
|
-
|
86
|
-
@clean_text
|
87
|
-
@clean_text
|
88
|
-
@clean_text
|
89
|
-
|
88
|
+
replace_double_newlines(@clean_text)
|
89
|
+
replace_newlines(@clean_text)
|
90
|
+
replace_escaped_newlines(@clean_text)
|
91
|
+
@clean_text.apply(HtmlRules::All)
|
92
|
+
@clean_text.apply(InlineFormattingRule)
|
93
|
+
clean_quotations(@clean_text)
|
94
|
+
clean_table_of_contents(@clean_text)
|
95
|
+
clean_consecutive_characters(@clean_text)
|
90
96
|
end
|
91
97
|
|
92
98
|
private
|
@@ -110,6 +116,11 @@ module PragmaticSegmenter
|
|
110
116
|
txt.apply(NewLineInMiddleOfWordRule)
|
111
117
|
end
|
112
118
|
|
119
|
+
def replace_escaped_newlines(txt)
|
120
|
+
txt.apply(EscapedNewLineRule).
|
121
|
+
apply(EscapedCarriageReturnRule)
|
122
|
+
end
|
123
|
+
|
113
124
|
def replace_double_newlines(txt)
|
114
125
|
txt.apply(DoubleNewLineWithSpaceRule).
|
115
126
|
apply(DoubleNewLineRule)
|
@@ -117,13 +128,11 @@ module PragmaticSegmenter
|
|
117
128
|
|
118
129
|
def replace_newlines(txt)
|
119
130
|
if doc_type.eql?('pdf')
|
120
|
-
|
131
|
+
remove_pdf_line_breaks(txt)
|
121
132
|
else
|
122
|
-
txt
|
123
|
-
|
124
|
-
apply(ReplaceNewlineWithCarriageReturnRule)
|
133
|
+
txt.apply(NewLineFollowedByPeriodRule).
|
134
|
+
apply(ReplaceNewlineWithCarriageReturnRule)
|
125
135
|
end
|
126
|
-
txt
|
127
136
|
end
|
128
137
|
|
129
138
|
def remove_pdf_line_breaks(txt)
|
@@ -142,5 +151,10 @@ module PragmaticSegmenter
|
|
142
151
|
apply(ConsecutivePeriodsRule).
|
143
152
|
apply(ConsecutiveForwardSlashRule)
|
144
153
|
end
|
154
|
+
|
155
|
+
def clean_consecutive_characters(txt)
|
156
|
+
txt.apply(ConsecutivePeriodsRule).
|
157
|
+
apply(ConsecutiveForwardSlashRule)
|
158
|
+
end
|
145
159
|
end
|
146
160
|
end
|
@@ -124,14 +124,12 @@ module PragmaticSegmenter
|
|
124
124
|
txt.gsub!(ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX).with_index do |m|
|
125
125
|
a.eql?(m.chomp('.')) ? "\r#{Regexp.escape(a.to_s)}∯" : "#{m}"
|
126
126
|
end
|
127
|
-
txt
|
128
127
|
end
|
129
128
|
|
130
129
|
def replace_alphabet_list_parens(a, txt)
|
131
130
|
txt.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m|
|
132
131
|
a.eql?(m) ? "\r#{Regexp.escape(a.to_s)}" : "#{m}"
|
133
132
|
end
|
134
|
-
txt
|
135
133
|
end
|
136
134
|
|
137
135
|
def replace_correct_alphabet_list(a, txt, parens)
|
@@ -25,11 +25,11 @@ module PragmaticSegmenter
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def replace
|
28
|
-
@
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
28
|
+
@text.apply(PeriodBeforeNumberRule).
|
29
|
+
apply(NumberAfterPeriodBeforeLetterRule).
|
30
|
+
apply(NewLineNumberPeriodSpaceLetterRule).
|
31
|
+
apply(StartLineNumberPeriodRule).
|
32
|
+
apply(StartLineTwoDigitNumberPeriodRule)
|
33
33
|
end
|
34
34
|
end
|
35
35
|
end
|
@@ -30,57 +30,53 @@ module PragmaticSegmenter
|
|
30
30
|
reformatted_text = replace_abbreviations(reformatted_text)
|
31
31
|
reformatted_text = replace_numbers(reformatted_text)
|
32
32
|
reformatted_text = reformatted_text.apply(GeoLocationRule)
|
33
|
-
|
33
|
+
split_into_segments(reformatted_text)
|
34
34
|
end
|
35
35
|
|
36
36
|
private
|
37
37
|
|
38
|
-
def
|
39
|
-
|
38
|
+
def split_into_segments(txt)
|
39
|
+
txt.split("\r")
|
40
|
+
.map! { |segment| segment.apply(SingleNewLineRule, EllipsisRules::All, EmailRule) }
|
41
|
+
.map { |segment| check_for_punctuation(segment) }.flatten
|
42
|
+
.map! { |segment| segment.apply(SubSymbolsRules::All) }
|
43
|
+
.map { |segment| post_process_segments(segment) }
|
44
|
+
.flatten.compact.delete_if(&:empty?)
|
45
|
+
end
|
40
46
|
|
41
|
-
|
42
|
-
|
47
|
+
def post_process_segments(txt)
|
48
|
+
return if consecutive_underscore?(txt) || txt.length < 2
|
49
|
+
txt.apply(ReinsertEllipsisRules::All).apply(ExtraWhiteSpaceRule)
|
50
|
+
if txt =~ QUOTATION_AT_END_OF_SENTENCE_REGEX
|
51
|
+
txt.split(SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX)
|
52
|
+
else
|
53
|
+
txt.tr("\n", '').strip
|
43
54
|
end
|
55
|
+
end
|
44
56
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
sentence_array = []
|
50
|
-
segments.each_with_index do |line|
|
51
|
-
next if line.gsub(/_{3,}/, '').length.eql?(0) || line.length < 2
|
52
|
-
line = reinsert_ellipsis(line)
|
53
|
-
line = line.apply(ExtraWhiteSpaceRule)
|
54
|
-
if line =~ QUOTATION_AT_END_OF_SENTENCE_REGEX
|
55
|
-
subline = line.split(SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX)
|
56
|
-
subline.each do |s|
|
57
|
-
sentence_array << s
|
58
|
-
end
|
59
|
-
else
|
60
|
-
sentence_array << line.tr("\n", '').strip
|
61
|
-
end
|
62
|
-
end
|
63
|
-
sentence_array.reject(&:empty?)
|
57
|
+
def consecutive_underscore?(txt)
|
58
|
+
# Rubular: http://rubular.com/r/fTF2Ff3WBL
|
59
|
+
txt.gsub(/_{3,}/, '').length.eql?(0)
|
64
60
|
end
|
65
61
|
|
66
|
-
def
|
67
|
-
if punctuation_array.any? { |p|
|
68
|
-
process_text(
|
62
|
+
def check_for_punctuation(txt)
|
63
|
+
if punctuation_array.any? { |p| txt.include?(p) }
|
64
|
+
process_text(txt)
|
69
65
|
else
|
70
|
-
|
66
|
+
txt
|
71
67
|
end
|
72
68
|
end
|
73
69
|
|
74
|
-
def process_text(
|
75
|
-
|
76
|
-
PragmaticSegmenter::ExclamationWords.apply_rules(
|
77
|
-
between_punctutation(
|
78
|
-
|
70
|
+
def process_text(txt)
|
71
|
+
txt << 'ȸ' unless punctuation_array.any? { |p| txt[-1].include?(p) }
|
72
|
+
PragmaticSegmenter::ExclamationWords.apply_rules(txt)
|
73
|
+
between_punctutation(txt)
|
74
|
+
txt = txt.apply(
|
79
75
|
DoublePuctationRules::All,
|
80
76
|
QuestionMarkInQuotationRule,
|
81
77
|
ExclamationPointRules::All
|
82
78
|
)
|
83
|
-
sentence_boundary_punctuation(
|
79
|
+
sentence_boundary_punctuation(txt)
|
84
80
|
end
|
85
81
|
|
86
82
|
def replace_numbers(txt)
|
@@ -102,17 +98,5 @@ module PragmaticSegmenter
|
|
102
98
|
def sentence_boundary_punctuation(txt)
|
103
99
|
PragmaticSegmenter::SentenceBoundaryPunctuation.new(text: txt).split
|
104
100
|
end
|
105
|
-
|
106
|
-
def sub_symbols(txt)
|
107
|
-
txt.gsub(/∯/, '.').gsub(/♬/, '،').gsub(/♭/, ':').gsub(/ᓰ/, '。').gsub(/ᓱ/, '．')
|
108
|
-
.gsub(/ᓳ/, '！').gsub(/ᓴ/, '!').gsub(/ᓷ/, '?').gsub(/ᓸ/, '？').gsub(/☉/, '?!')
|
109
|
-
.gsub(/☈/, '!?').gsub(/☇/, '??').gsub(/☄/, '!!').delete('ȸ').gsub(/ȹ/, "\n")
|
110
|
-
end
|
111
|
-
|
112
|
-
def reinsert_ellipsis(line)
|
113
|
-
line.gsub(/ƪ/, '...').gsub(/♟/, ' . . . ')
|
114
|
-
.gsub(/♝/, '. . . .').gsub(/☏/, '..')
|
115
|
-
.gsub(/∮/, '.')
|
116
|
-
end
|
117
101
|
end
|
118
102
|
end
|
@@ -4,6 +4,7 @@ module PragmaticSegmenter
|
|
4
4
|
# This class replaces punctuation that is typically a sentence boundary
|
5
5
|
# but in this case is not a sentence boundary.
|
6
6
|
class PunctuationReplacer
|
7
|
+
include Rules
|
7
8
|
attr_reader :matches_array, :text
|
8
9
|
def initialize(text:, matches_array:)
|
9
10
|
@text = text
|
@@ -18,45 +19,24 @@ module PragmaticSegmenter
|
|
18
19
|
|
19
20
|
def replace_punctuation(array, txt)
|
20
21
|
return if !array || array.empty?
|
21
|
-
txt.
|
22
|
-
txt.gsub!(')', '\\)')
|
23
|
-
txt.gsub!(']', '\\]')
|
24
|
-
txt.gsub!('[', '\\[')
|
25
|
-
txt.gsub!('-', '\\-')
|
22
|
+
txt.apply(EscapeRegexReservedCharacters::All)
|
26
23
|
array.each do |a|
|
27
|
-
a.
|
28
|
-
a.
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
txt
|
35
|
-
|
36
|
-
sub_1 = sub.gsub('。', 'ᓰ')
|
37
|
-
txt.gsub!(/#{Regexp.escape(sub)}/, "#{sub_1}")
|
38
|
-
|
39
|
-
sub_2 = sub_1.gsub('．', 'ᓱ')
|
40
|
-
txt.gsub!(/#{Regexp.escape(sub_1)}/, "#{sub_2}")
|
41
|
-
|
42
|
-
sub_3 = sub_2.gsub('！', 'ᓳ')
|
43
|
-
txt.gsub!(/#{Regexp.escape(sub_2)}/, "#{sub_3}")
|
44
|
-
|
45
|
-
sub_4 = sub_3.gsub('!', 'ᓴ')
|
46
|
-
txt.gsub!(/#{Regexp.escape(sub_3)}/, "#{sub_4}")
|
47
|
-
|
48
|
-
sub_5 = sub_4.gsub('?', 'ᓷ')
|
49
|
-
txt.gsub!(/#{Regexp.escape(sub_4)}/, "#{sub_5}")
|
50
|
-
|
51
|
-
sub_6 = sub_5.gsub('？', 'ᓸ')
|
52
|
-
txt.gsub!(/#{Regexp.escape(sub_5)}/, "#{sub_6}")
|
24
|
+
a.apply(EscapeRegexReservedCharacters::All)
|
25
|
+
sub = sub_characters(txt, a, '.', '∯')
|
26
|
+
sub_1 = sub_characters(txt, sub, '。', '&ᓰ&')
|
27
|
+
sub_2 = sub_characters(txt, sub_1, '．', '&ᓱ&')
|
28
|
+
sub_3 = sub_characters(txt, sub_2, '！', '&ᓳ&')
|
29
|
+
sub_4 = sub_characters(txt, sub_3, '!', '&ᓴ&')
|
30
|
+
sub_5 = sub_characters(txt, sub_4, '?', '&ᓷ&')
|
31
|
+
sub_6 = sub_characters(txt, sub_5, '？', '&ᓸ&')
|
53
32
|
end
|
54
|
-
txt.
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
txt
|
33
|
+
txt.apply(SubEscapedRegexReservedCharacters::All)
|
34
|
+
end
|
35
|
+
|
36
|
+
def sub_characters(txt, string, char_a, char_b)
|
37
|
+
sub = string.gsub(char_a, char_b)
|
38
|
+
txt.gsub!(/#{Regexp.escape(string)}/, "#{sub}")
|
39
|
+
sub
|
60
40
|
end
|
61
41
|
end
|
62
42
|
end
|
@@ -11,17 +11,17 @@ module PragmaticSegmenter
|
|
11
11
|
ExtraWhiteSpaceRule = Rule.new(/\s{3,}/, ' ')
|
12
12
|
|
13
13
|
# Rubular: http://rubular.com/r/aXPUGm6fQh
|
14
|
-
QuestionMarkInQuotationRule = Rule.new(/\?(?=(\'|\"))/, '
|
14
|
+
QuestionMarkInQuotationRule = Rule.new(/\?(?=(\'|\"))/, '&ᓷ&')
|
15
15
|
|
16
16
|
module ExclamationPointRules
|
17
17
|
# Rubular: http://rubular.com/r/XS1XXFRfM2
|
18
|
-
InQuotationRule = Rule.new(/\!(?=(\'|\"))/, '
|
18
|
+
InQuotationRule = Rule.new(/\!(?=(\'|\"))/, '&ᓴ&')
|
19
19
|
|
20
20
|
# Rubular: http://rubular.com/r/sl57YI8LkA
|
21
|
-
BeforeCommaMidSentenceRule = Rule.new(/\!(?=\,\s[a-z])/, '
|
21
|
+
BeforeCommaMidSentenceRule = Rule.new(/\!(?=\,\s[a-z])/, '&ᓴ&')
|
22
22
|
|
23
23
|
# Rubular: http://rubular.com/r/f9zTjmkIPb
|
24
|
-
MidSentenceRule = Rule.new(/\!(?=\s[a-z])/, '
|
24
|
+
MidSentenceRule = Rule.new(/\!(?=\s[a-z])/, '&ᓴ&')
|
25
25
|
|
26
26
|
All = [ InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule ]
|
27
27
|
end
|
@@ -34,5 +34,66 @@ module PragmaticSegmenter
|
|
34
34
|
|
35
35
|
All = [ FirstRule, SecondRule, ThirdRule, ForthRule ]
|
36
36
|
end
|
37
|
+
|
38
|
+
module ReinsertEllipsisRules
|
39
|
+
ThreeConsecutivePeriod = Rule.new(/ƪ/, '...')
|
40
|
+
ThreeSpacePeriod = Rule.new(/♟/, ' . . . ')
|
41
|
+
FourSpacePeriod = Rule.new(/♝/, '. . . .')
|
42
|
+
TwoConsecutivePeriod = Rule.new(/☏/, '..')
|
43
|
+
OnePeriod = Rule.new(/∮/, '.')
|
44
|
+
|
45
|
+
All = [ ThreeConsecutivePeriod, ThreeSpacePeriod,
|
46
|
+
FourSpacePeriod, TwoConsecutivePeriod,
|
47
|
+
OnePeriod ]
|
48
|
+
end
|
49
|
+
|
50
|
+
module SubSymbolsRules
|
51
|
+
Period = Rule.new(/∯/, '.')
|
52
|
+
ArabicComma = Rule.new(/♬/, '،')
|
53
|
+
SemiColon = Rule.new(/♭/, ':')
|
54
|
+
FullWidthPeriod = Rule.new(/&ᓰ&/, '。')
|
55
|
+
SpecialPeriod = Rule.new(/&ᓱ&/, '．')
|
56
|
+
FullWidthExclamation = Rule.new(/&ᓳ&/, '！')
|
57
|
+
ExclamationPoint = Rule.new(/&ᓴ&/, '!')
|
58
|
+
QuestionMark = Rule.new(/&ᓷ&/, '?')
|
59
|
+
FullWidthQuestionMark = Rule.new(/&ᓸ&/, '？')
|
60
|
+
MixedDoubleQE = Rule.new(/☉/, '?!')
|
61
|
+
MixedDoubleQQ = Rule.new(/☇/, '??')
|
62
|
+
MixedDoubleEQ = Rule.new(/☈/, '!?')
|
63
|
+
MixedDoubleEE = Rule.new(/☄/, '!!')
|
64
|
+
TemporaryEndingPunctutation = Rule.new('ȸ', '')
|
65
|
+
Newline = Rule.new(/ȹ/, "\n")
|
66
|
+
|
67
|
+
All = [ Period, ArabicComma,
|
68
|
+
SemiColon, FullWidthPeriod,
|
69
|
+
SpecialPeriod, FullWidthExclamation,
|
70
|
+
ExclamationPoint, QuestionMark,
|
71
|
+
FullWidthQuestionMark, MixedDoubleQE,
|
72
|
+
MixedDoubleQQ, MixedDoubleEQ,
|
73
|
+
MixedDoubleEE, TemporaryEndingPunctutation,
|
74
|
+
Newline ]
|
75
|
+
end
|
76
|
+
|
77
|
+
module EscapeRegexReservedCharacters
|
78
|
+
LeftParen = Rule.new('(', '\\(')
|
79
|
+
RightParen = Rule.new(')', '\\)')
|
80
|
+
LeftBracket = Rule.new('[', '\\[')
|
81
|
+
RightBracket = Rule.new(']', '\\]')
|
82
|
+
Dash = Rule.new('-', '\\-')
|
83
|
+
|
84
|
+
All = [ LeftParen, RightParen,
|
85
|
+
LeftBracket, RightBracket, Dash ]
|
86
|
+
end
|
87
|
+
|
88
|
+
module SubEscapedRegexReservedCharacters
|
89
|
+
LeftParen = Rule.new('\\(', '(')
|
90
|
+
RightParen = Rule.new('\\)', ')')
|
91
|
+
LeftBracket = Rule.new('\\[', '[')
|
92
|
+
RightBracket = Rule.new('\\]', ']')
|
93
|
+
Dash = Rule.new('\\-', '-')
|
94
|
+
|
95
|
+
All = [ LeftParen, RightParen,
|
96
|
+
LeftBracket, RightBracket, Dash ]
|
97
|
+
end
|
37
98
|
end
|
38
99
|
end
|
@@ -868,6 +868,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
868
868
|
ps = PragmaticSegmenter::Segmenter.new(text: "////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////Header starts here\r////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////", language: 'en')
|
869
869
|
expect(ps.segment).to eq(["Header starts here"])
|
870
870
|
end
|
871
|
+
|
872
|
+
it 'correctly segments text #082' do
|
873
|
+
ps = PragmaticSegmenter::Segmenter.new(text: 'Hello World. \r\n Hello.', language: 'en')
|
874
|
+
expect(ps.segment).to eq(["Hello World.", "Hello."])
|
875
|
+
end
|
871
876
|
end
|
872
877
|
end
|
873
878
|
|