pragmatic_tokenizer 3.0.6 → 3.0.7
- checksums.yaml +4 -4
- data/lib/pragmatic_tokenizer/languages.rb +26 -26
- data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
- data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +10 -9
- data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/post_processor.rb +41 -93
- data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
- data/lib/pragmatic_tokenizer/regex.rb +149 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +76 -113
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +4 -5
- data/spec/performance_spec.rb +0 -1
- data/spec/spec_helper.rb +1 -1
- metadata +3 -3
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -58
data/lib/pragmatic_tokenizer/tokenizer.rb
CHANGED
@@ -1,70 +1,22 @@
-# -*- encoding : utf-8 -*-
 require 'set'
 require 'cgi'
+require 'pragmatic_tokenizer/regex'
 require 'pragmatic_tokenizer/languages'
 require 'pragmatic_tokenizer/pre_processor'
 require 'pragmatic_tokenizer/post_processor'
-require 'pragmatic_tokenizer/full_stop_separator'
 require 'unicode'
 
 module PragmaticTokenizer
   class Tokenizer
 
-
-    NUMBERS_OPTIONS = Set.new([
-    MENTIONS_OPTIONS = Set.new([
+    PUNCTUATION_OPTIONS = Set.new(%i[all semi none only]).freeze
+    NUMBERS_OPTIONS = Set.new(%i[all semi none only]).freeze
+    MENTIONS_OPTIONS = Set.new(%i[keep_original keep_and_clean remove]).freeze
     MAX_TOKEN_LENGTH = 50
-
-
-
-
-    REGEX_URL = /(http|https)(\.|:)/
-    REGEX_HYPHEN = /\-/
-    REGEX_LONG_WORD = /\-|\_/
-    REGEXP_SPLIT_CHECK = /@|@|(http)/
-    REGEX_CONTRACTIONS = /[‘’‚‛‹›'´`]/
-    REGEX_APOSTROPHE_S = /['’`́]s$/
-    REGEX_EMAIL = /\S+(@|@)\S+\.\S+/
-    REGEX_HASHTAG_OR_MENTION = /[@@#|#]/
-    REGEX_UNDERSCORE_AT_START = /(?<=\A)\_+/
-    REGEX_UNDERSCORE_AT_END = /\_+(?=\z)/
-    REGEX_ASTERISK = /\*+/
-    REGEX_UNIFIED1 = Regexp.union(REGEX_UNDERSCORE_AT_START,
-                                  REGEX_UNDERSCORE_AT_END,
-                                  REGEX_ASTERISK)
-    # https://en.wikipedia.org/wiki/Control_character
-    # matches any character with hexadecimal value 00 through 1F or 7F.
-    # Rubular: http://rubular.com/r/E83fpBoDjI
-    REGEXP_CONTROL = /[[:cntrl:]]/
-    REGEXP_ENDING_COLON = /\:(?=\z)/
-    REGEXP_EXCLAMATION_AT_START = /(?<=\A)!+(?=.+)/
-    REGEXP_EXCLAMATION_AT_END = /!+(1*!*)*(?=\z)/
-    REGEXP_HYPHEN_AT_START = /\A(-|–|\u{00AD})/
-    REGEXP_SPECIAL_SYMBOL = /[®©]/
-    REGEXP_PERCENT_AT_START = /\A\%/
-    # https://codepoints.net/enclosed_alphanumeric_supplement
-    REGEXP_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
-    REGEX_UNIFIED2 = Regexp.union(REGEXP_CONTROL,
-                                  REGEXP_ENDING_COLON,
-                                  REGEXP_EXCLAMATION_AT_START,
-                                  REGEXP_EXCLAMATION_AT_END,
-                                  REGEXP_HYPHEN_AT_START,
-                                  REGEXP_SPECIAL_SYMBOL,
-                                  REGEXP_PERCENT_AT_START,
-                                  REGEXP_ALPHANUMERIC_SUPPLEMENT)
-    REGEXP_ONE_AS_EXCLAMATION = /(?<=\D)1+(?=\z)/
-    REGEXP_HASHTAG_AT_START = /(?<=\A)(#|#)/
-    REGEXP_AT_SIGN_AT_START = /(?<=\A)(@|@)/
-    REGEXP_HYPHEN_HASTAG = /\A(#|#)\S+-/
-    REGEXP_EMOJI_SNOWFLAKE = /\u{2744}[\u{FE0F}|\u{FE0E}]?/
-    REGEX_EMOJI_UNIFIED = Regexp.union(REGEXP_EMOJI_SNOWFLAKE,
-                                       PragmaticTokenizer::Languages::Common::EMOJI_REGEX)
-    REGEXP_PUNCTUATION_ONLY = /\A[[:punct:]]+\z/
-    REGEXP_NUMBER_ONLY = /\A\d+\z/
-    REGEXP_NO_NUMBERS = /\A\D+\z/
-    REGEXP_NUMBER = /\D*\d+\d*/
-    REGEXP_CONSECUTIVE_DOTS = /\A\.{2,}\z/
-    REGEXP_CHUNK_STRING = /\S.{1,10000}(?!\S)/m
+    NOTHING = ''.freeze
+    DOT = '.'.freeze
+    SPACE = ' '.freeze
+    SINGLE_QUOTE = "'".freeze
 
     # @param [Hash] opts optional arguments
 
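The hunk above moves the tokenizer's many inline regex constants into the new pragmatic_tokenizer/regex module and rewrites the option whitelists as frozen Sets built from %i[] symbol-array literals, giving O(1) membership checks and immutability. A minimal standalone sketch of that validation pattern (the Options class and its names are illustrative, not the gem's):

require 'set'

class Options
  # Frozen Set: constant-time include?, safe from accidental mutation.
  PUNCTUATION = Set.new(%i[all semi none only]).freeze

  def initialize(punctuation: :all)
    raise "punctuation must be one of #{PUNCTUATION.to_a}" unless PUNCTUATION.include?(punctuation)
    @punctuation = punctuation
  end
end

Options.new(punctuation: :semi) # fine
begin
  Options.new(punctuation: :sometimes)
rescue RuntimeError => e
  puts e.message # "punctuation must be one of [:all, :semi, :none, :only]"
end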
@@ -124,7 +76,7 @@ module PragmaticTokenizer
       @abbreviations = Set.new(opts[:abbreviations])
       @stop_words = Set.new(opts[:stop_words])
 
-      #
+      # Why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
       @contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
       @abbreviations += @language_module::ABBREVIATIONS if @abbreviations.empty?
       @stop_words += @language_module::STOP_WORDS if @stop_words.empty?
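Note the merge direction in this hunk: language defaults are applied only when the caller passed nothing, so a custom set replaces rather than extends the defaults. A hedged usage sketch, assuming the gem's public Tokenizer#tokenize API and the abbreviations: option visible in this diff:

require 'pragmatic_tokenizer'

# Because abbreviations: is non-empty here, @language_module::ABBREVIATIONS
# is never merged in; the custom set replaces the language defaults entirely.
tokenizer = PragmaticTokenizer::Tokenizer.new(abbreviations: ['fig', 'col'])
p tokenizer.tokenize("See fig. 4, col. 2.")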
@@ -136,13 +88,13 @@ module PragmaticTokenizer
         @stop_words += language::STOP_WORDS
       end
 
-      raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless
+      raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTUATION_OPTIONS.include?(@punctuation)
       raise "Numbers argument can be only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
       raise "Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)
 
      integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer
 
-      raise "In Pragmatic Tokenizer minimum_length must be an Integer"
+      raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == integer_class || @minimum_length.nil?
       raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == integer_class || @long_word_split.nil?
     end
 
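These guards are bare raises, so callers see a plain RuntimeError; the integer_class dance exists because Ruby < 2.4 had Fixnum instead of a unified Integer. A sketch of what a caller sees with a bad option, assuming the constructor validates as shown above:

require 'pragmatic_tokenizer'

begin
  PragmaticTokenizer::Tokenizer.new(punctuation: :sometimes)
rescue RuntimeError => e
  puts e.message # "Punctuation argument can be only be nil, :all, :semi, :none, or :only"
end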
@@ -152,21 +104,27 @@ module PragmaticTokenizer
       return [] unless text
       raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
       CGI.unescapeHTML(text)
-          .scan(
-          .flat_map { |segment|
+          .scan(Regex::CHUNK_LONG_INPUT_TEXT)
+          .flat_map { |segment| process_segment(segment) }
     end
 
     private
 
-    def
-
+    def process_segment(segment)
+      pre_processed = pre_process(segment)
+      cased_segment = chosen_case(pre_processed)
+      @tokens = PostProcessor.new(text: cased_segment, abbreviations: @abbreviations, downcase: @downcase).call
+      post_process_tokens
+    end
+
+    def pre_process(segment)
+      segment
           .extend(PragmaticTokenizer::PreProcessor)
           .pre_process(language: @language_module)
     end
 
-    def
-
-      remove_various!
+    def post_process_tokens
+      remove_by_options!
       process_numbers!
       process_punctuation!
       expand_contractions! if @expand_contractions
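The rewritten tokenize is now a three-step pipeline: unescape HTML entities, chop the input into bounded chunks so no regex ever scans an unbounded string, then tokenize each chunk and flatten. A standalone sketch of that shape; the chunk pattern below is the old REGEXP_CHUNK_STRING, presumably what Regex::CHUNK_LONG_INPUT_TEXT now holds, and the whitespace split stands in for process_segment:

require 'cgi'

CHUNK = /\S.{1,10000}(?!\S)/m # chunks of up to ~10k chars that never end mid-word

def toy_tokenize(text)
  CGI.unescapeHTML(text)
     .scan(CHUNK)
     .flat_map { |segment| segment.split(/\s+/) }
end

p toy_tokenize("1 &lt; 2 &amp;&amp; 2 &gt; 1") # => ["1", "<", "2", "&&", "2", ">", "1"]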
@@ -180,45 +138,45 @@ module PragmaticTokenizer
       @tokens.reject(&:empty?)
     end
 
-    def run_post_processor(text)
-      PostProcessor.new(
-          text: chosen_case(text),
-          abbreviations: @abbreviations,
-          downcase: @downcase
-      ).post_process
-    end
-
     def expand_contractions!
-      @tokens = @tokens.flat_map { |
+      @tokens = @tokens.flat_map { |token| expand_token_contraction(token) }
     end
 
     def expand_token_contraction(token)
-      normalized = inverse_case(token.gsub(
+      normalized = inverse_case(token.gsub(Regex::CONTRACTIONS, SINGLE_QUOTE))
       return token unless @contractions.key?(normalized)
-      result = @contractions[normalized].split(
+      result = @contractions[normalized].split(SPACE)
       result[0] = Unicode.capitalize(result[0]) unless @downcase
       result
     end
 
     def clean!
       @tokens = @tokens
-          .flat_map
-          .map!
-          .
-
+          .flat_map { |token| split_underscores_asterisk(token) }
+          .map! { |token| remove_irrelevant_characters(token) }
+          .delete_if { |token| many_dots?(token) }
+    end
+
+    def split_underscores_asterisk(token)
+      return token if token =~ Regex::ONLY_HASHTAG_MENTION
+      token.split(Regex::UNDERSCORES_ASTERISK)
     end
 
-    def
-
-      return
-
-      token
+    def remove_irrelevant_characters(token)
+      token.gsub!(Regex::IRRELEVANT_CHARACTERS, NOTHING)
+      return token if token =~ Regex::ONLY_HASHTAG_MENTION
+      token.gsub!(Regex::ENDS_WITH_EXCITED_ONE, NOTHING)
+      token
+    end
+
+    def many_dots?(token)
+      token =~ Regex::MANY_PERIODS
     end
 
     def classic_filter!
       @tokens.map! do |token|
-        token.delete!(
-        token.sub!(
+        token.delete!(DOT) if @abbreviations.include?(token.chomp(DOT))
+        token.sub!(Regex::ENDS_WITH_APOSTROPHE_AND_S, NOTHING)
         token
       end
     end
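expand_token_contraction normalizes curly apostrophes to a plain quote before the dictionary lookup, then re-capitalizes the first expanded word when downcasing is off. A simplified standalone version of the same idea (it always downcases, unlike the original; the hash and pattern are stand-ins for @contractions and, presumably, Regex::CONTRACTIONS):

CONTRACTIONS = { "don't" => "do not", "it's" => "it is" } # stand-in for @contractions
APOSTROPHES  = /[‘’‚‛‹›'´`]/ # the old REGEX_CONTRACTIONS character class

def expand(token)
  normalized = token.downcase.gsub(APOSTROPHES, "'") # fold curly quotes to '
  return [token] unless CONTRACTIONS.key?(normalized)
  CONTRACTIONS[normalized].split(' ')
end

p ["Don’t", "stop"].flat_map { |t| expand(t) } # => ["do", "not", "stop"]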
@@ -226,26 +184,26 @@ module PragmaticTokenizer
     def process_numbers!
       case @numbers
       when :semi
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::ONLY_DECIMALS }
       when :none
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS_NO_NUMERALS }
       when :only
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS }
       end
     end
 
     def remove_short_tokens!
-      @tokens.delete_if { |
+      @tokens.delete_if { |token| token.length < @minimum_length }
     end
 
     def process_punctuation!
       case @punctuation
       when :semi
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::PUNCTUATION4 }
       when :none
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::ONLY_PUNCTUATION }
       when :only
-        @tokens.keep_if
+        @tokens.keep_if { |token| token =~ Regex::ONLY_PUNCTUATION }
       end
     end
 
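Each filter is now a single regex test per token: delete_if drops matching tokens, and :only inverts the logic by reusing the same pattern with keep_if. A hedged usage sketch of the observable behavior (option names match the validation above; the exact token lists depend on the language rules, so none are asserted here):

require 'pragmatic_tokenizer'

text = "Total: 42 items!"

PragmaticTokenizer::Tokenizer.new(punctuation: :none).tokenize(text) # no punctuation tokens
PragmaticTokenizer::Tokenizer.new(numbers: :none).tokenize(text)     # no number tokens
PragmaticTokenizer::Tokenizer.new(punctuation: :only).tokenize(text) # punctuation tokens only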
@@ -256,45 +214,50 @@ module PragmaticTokenizer
     def mentions!
       case @mentions
       when :remove
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::ONLY_MENTION }
       when :keep_and_clean
-        @tokens.map!
+        @tokens.map! { |token| token =~ Regex::ONLY_MENTION ? token[1..-1] : token }
       end
     end
 
     def hashtags!
       case @hashtags
       when :remove
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::ONLY_HASHTAG }
       when :keep_and_clean
-        @tokens
-            .flat_map { |t| t =~ REGEXP_HYPHEN_HASTAG ? t.split(REGEX_HYPHEN) : t }
-            .map { |t| t =~ REGEXP_HASHTAG_AT_START ? t.gsub!(REGEXP_HASHTAG_AT_START, EMPTY_STRING) : t }
+        @tokens.map! { |token| token =~ Regex::ONLY_HASHTAG ? token[1..-1] : token }
       end
     end
 
-    def
-      @tokens.delete_if { |
+    def remove_by_options!
+      @tokens.delete_if { |token| token =~ regex_by_options }
     end
 
-    def
-      @
+    def regex_by_options
+      @regex_by_options ||= begin
         regex_array = []
-        regex_array <<
-        regex_array <<
-        regex_array <<
-        regex_array <<
+        regex_array << Regex::RANGE_UNUSUAL_AND_EMOJI if @remove_emoji
+        regex_array << Regex::ONLY_EMAIL if @remove_emails
+        regex_array << Regex::STARTS_WITH_HTTP if @remove_urls
+        regex_array << Regex::ONLY_DOMAIN2 if @remove_domains
         Regexp.union(regex_array)
       end
     end
 
     def split_long_words!
-      @tokens = @tokens
-
+      @tokens = @tokens.flat_map { |token| split_long_word(token) }
+    end
+
+    def split_long_word(token)
+      return token unless @long_word_split
+      return token if token.length <= @long_word_split
+      return token if token =~ Regex::ONLY_HASHTAG_MENTION
+      return token if token =~ Regex::DOMAIN_OR_EMAIL
+      token.split(Regex::HYPHEN_OR_UNDERSCORE)
     end
 
-    def chosen_case(
-      @downcase ? Unicode.downcase(
+    def chosen_case(text)
+      @downcase ? Unicode.downcase(text) : text
     end
 
     def inverse_case(token)
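regex_by_options builds one Regexp.union of just the enabled removal patterns and memoizes it with ||=, so the union is constructed once instead of per token. The same pattern standalone (the two constants are illustrative stand-ins for the Regex module's):

ONLY_EMAIL       = /\A\S+@\S+\.\S+\z/
STARTS_WITH_HTTP = /\Ahttps?:/

def removal_regex(remove_emails:, remove_urls:)
  patterns = []
  patterns << ONLY_EMAIL       if remove_emails
  patterns << STARTS_WITH_HTTP if remove_urls
  Regexp.union(patterns) # union of [] matches nothing, so the filter becomes a no-op
end

regex = removal_regex(remove_emails: true, remove_urls: true)
p %w[hello http://x.com a@b.com].reject { |t| t =~ regex } # => ["hello"]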
data/pragmatic_tokenizer.gemspec
CHANGED
@@ -1,5 +1,4 @@
-
-lib = File.expand_path('../lib', __FILE__)
+lib = File.expand_path('lib', __dir__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'pragmatic_tokenizer/version'
 
@@ -9,9 +8,9 @@ Gem::Specification.new do |spec|
   spec.authors = ["Kevin S. Dias"]
   spec.email = ["diasks2@gmail.com"]
 
-  spec.summary =
-  spec.description =
-  spec.homepage =
+  spec.summary = 'A multilingual tokenizer'
+  spec.description = 'A multilingual tokenizer to split a string into tokens.'
+  spec.homepage = 'https://github.com/diasks2/pragmatic_tokenizer'
 
   spec.files = `git ls-files -z`.split("\x0")
   spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
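File.expand_path('lib', __dir__) is the modern spelling of File.expand_path('../lib', __FILE__): both resolve to the gem's lib directory, but __dir__ (Ruby 2.0+) drops the confusing '../' hop. They can differ only when the file is reached through a symlink, since __dir__ is realpath-based:

# Both forms point at <gem root>/lib when evaluated inside the gemspec:
File.expand_path('../lib', __FILE__) == File.expand_path('lib', __dir__) # => true (absent symlinks)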
data/spec/performance_spec.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -1,2 +1,2 @@
-$LOAD_PATH.unshift File.expand_path('
+$LOAD_PATH.unshift File.expand_path('../lib', __dir__)
 require 'pragmatic_tokenizer'
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 3.0.6
+  version: 3.0.7
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-03-
+date: 2018-03-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode
@@ -111,7 +111,6 @@ files:
 - README.md
 - Rakefile
 - lib/pragmatic_tokenizer.rb
-- lib/pragmatic_tokenizer/full_stop_separator.rb
 - lib/pragmatic_tokenizer/languages.rb
 - lib/pragmatic_tokenizer/languages/arabic.rb
 - lib/pragmatic_tokenizer/languages/bulgarian.rb
@@ -140,6 +139,7 @@ files:
 - lib/pragmatic_tokenizer/languages/turkish.rb
 - lib/pragmatic_tokenizer/post_processor.rb
 - lib/pragmatic_tokenizer/pre_processor.rb
+- lib/pragmatic_tokenizer/regex.rb
 - lib/pragmatic_tokenizer/tokenizer.rb
 - lib/pragmatic_tokenizer/version.rb
 - pragmatic_tokenizer.gemspec
data/lib/pragmatic_tokenizer/full_stop_separator.rb
DELETED
@@ -1,58 +0,0 @@
-# -*- encoding : utf-8 -*-
-
-module PragmaticTokenizer
-  # This class separates true full stops while ignoring
-  # periods that are part of an abbreviation
-  class FullStopSeparator
-
-    REGEXP_ENDS_WITH_DOT = /\A(.*\w)\.\z/
-    REGEXP_ONLY_LETTERS = /\A[a-z]\z/i
-    REGEXP_ABBREVIATION = /[a-z](?:\.[a-z])+\z/i
-    DOT = '.'.freeze
-
-    def initialize(tokens:, abbreviations:, downcase:)
-      @tokens = tokens
-      @abbreviations = abbreviations
-      @downcase = downcase
-    end
-
-    def separate
-      @cleaned_tokens = create_cleaned_tokens
-      replace_last_token unless @cleaned_tokens.empty?
-      @cleaned_tokens
-    end
-
-    private
-
-    def create_cleaned_tokens
-      @tokens[0..-2]
-        .flat_map { |token| abbreviation?(token) ? [token[0..-2], DOT] : token }
-        .push(@tokens.last)
-    end
-
-    def abbreviation?(token)
-      return false unless token.end_with?(DOT) && token.length > 1
-      shortened = token.chomp(DOT)
-      !defined_abbreviation?(shortened) && shortened !~ REGEXP_ONLY_LETTERS && shortened !~ REGEXP_ABBREVIATION
-    end
-
-    def defined_abbreviation?(token)
-      @abbreviations.include?(inverse_case(token))
-    end
-
-    def inverse_case(token)
-      @downcase ? token : Unicode.downcase(token)
-    end
-
-    def replace_last_token
-      last_token = @cleaned_tokens[-1]
-      return unless last_token.end_with?(DOT) && last_token.length > 1
-      shortened = last_token.chomp(DOT)
-      return if defined_abbreviation?(shortened) || last_token !~ REGEXP_ENDS_WITH_DOT
-      @cleaned_tokens[-1] = Regexp.last_match(1)
-      @cleaned_tokens << DOT
-    end
-
-  end
-
-end
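The deleted class split a sentence-final period off the last token unless the token was a known abbreviation, a single letter, or an a.b.c.-style initialism; per this diff, its duties presumably moved into the rewritten PostProcessor. Its core behavior in miniature:

ABBREVIATIONS = %w[dr etc].freeze # stand-in for the abbreviations set

def separate_full_stop(tokens)
  last = tokens.last
  return tokens unless last&.end_with?('.') && last.length > 1
  return tokens if ABBREVIATIONS.include?(last.chomp('.').downcase)
  tokens[0..-2] + [last.chomp('.'), '.'] # detach the true full stop
end

p separate_full_stop(%w[see dr. smith.]) # => ["see", "dr.", "smith", "."]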