pragmatic_tokenizer 3.0.6 → 3.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pragmatic_tokenizer/languages.rb +26 -26
- data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
- data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +10 -9
- data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/post_processor.rb +41 -93
- data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
- data/lib/pragmatic_tokenizer/regex.rb +149 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +76 -113
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +4 -5
- data/spec/performance_spec.rb +0 -1
- data/spec/spec_helper.rb +1 -1
- metadata +3 -3
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -58
data/lib/pragmatic_tokenizer/tokenizer.rb
CHANGED
@@ -1,70 +1,22 @@
-# -*- encoding : utf-8 -*-
 require 'set'
 require 'cgi'
+require 'pragmatic_tokenizer/regex'
 require 'pragmatic_tokenizer/languages'
 require 'pragmatic_tokenizer/pre_processor'
 require 'pragmatic_tokenizer/post_processor'
-require 'pragmatic_tokenizer/full_stop_separator'
 require 'unicode'
 
 module PragmaticTokenizer
   class Tokenizer
 
-
-    NUMBERS_OPTIONS = Set.new([
-    MENTIONS_OPTIONS = Set.new([
+    PUNCTUATION_OPTIONS = Set.new(%i[all semi none only]).freeze
+    NUMBERS_OPTIONS = Set.new(%i[all semi none only]).freeze
+    MENTIONS_OPTIONS = Set.new(%i[keep_original keep_and_clean remove]).freeze
     MAX_TOKEN_LENGTH = 50
-
-
-
-
-    REGEX_URL = /(http|https)(\.|:)/
-    REGEX_HYPHEN = /\-/
-    REGEX_LONG_WORD = /\-|\_/
-    REGEXP_SPLIT_CHECK = /@|@|(http)/
-    REGEX_CONTRACTIONS = /[‘’‚‛‹›'´`]/
-    REGEX_APOSTROPHE_S = /['’`́]s$/
-    REGEX_EMAIL = /\S+(@|@)\S+\.\S+/
-    REGEX_HASHTAG_OR_MENTION = /[@@#|#]/
-    REGEX_UNDERSCORE_AT_START = /(?<=\A)\_+/
-    REGEX_UNDERSCORE_AT_END = /\_+(?=\z)/
-    REGEX_ASTERISK = /\*+/
-    REGEX_UNIFIED1 = Regexp.union(REGEX_UNDERSCORE_AT_START,
-                                  REGEX_UNDERSCORE_AT_END,
-                                  REGEX_ASTERISK)
-    # https://en.wikipedia.org/wiki/Control_character
-    # matches any character with hexadecimal value 00 through 1F or 7F.
-    # Rubular: http://rubular.com/r/E83fpBoDjI
-    REGEXP_CONTROL = /[[:cntrl:]]/
-    REGEXP_ENDING_COLON = /\:(?=\z)/
-    REGEXP_EXCLAMATION_AT_START = /(?<=\A)!+(?=.+)/
-    REGEXP_EXCLAMATION_AT_END = /!+(1*!*)*(?=\z)/
-    REGEXP_HYPHEN_AT_START = /\A(-|–|\u{00AD})/
-    REGEXP_SPECIAL_SYMBOL = /[®©]/
-    REGEXP_PERCENT_AT_START = /\A\%/
-    # https://codepoints.net/enclosed_alphanumeric_supplement
-    REGEXP_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
-    REGEX_UNIFIED2 = Regexp.union(REGEXP_CONTROL,
-                                  REGEXP_ENDING_COLON,
-                                  REGEXP_EXCLAMATION_AT_START,
-                                  REGEXP_EXCLAMATION_AT_END,
-                                  REGEXP_HYPHEN_AT_START,
-                                  REGEXP_SPECIAL_SYMBOL,
-                                  REGEXP_PERCENT_AT_START,
-                                  REGEXP_ALPHANUMERIC_SUPPLEMENT)
-    REGEXP_ONE_AS_EXCLAMATION = /(?<=\D)1+(?=\z)/
-    REGEXP_HASHTAG_AT_START = /(?<=\A)(#|#)/
-    REGEXP_AT_SIGN_AT_START = /(?<=\A)(@|@)/
-    REGEXP_HYPHEN_HASTAG = /\A(#|#)\S+-/
-    REGEXP_EMOJI_SNOWFLAKE = /\u{2744}[\u{FE0F}|\u{FE0E}]?/
-    REGEX_EMOJI_UNIFIED = Regexp.union(REGEXP_EMOJI_SNOWFLAKE,
-                                       PragmaticTokenizer::Languages::Common::EMOJI_REGEX)
-    REGEXP_PUNCTUATION_ONLY = /\A[[:punct:]]+\z/
-    REGEXP_NUMBER_ONLY = /\A\d+\z/
-    REGEXP_NO_NUMBERS = /\A\D+\z/
-    REGEXP_NUMBER = /\D*\d+\d*/
-    REGEXP_CONSECUTIVE_DOTS = /\A\.{2,}\z/
-    REGEXP_CHUNK_STRING = /\S.{1,10000}(?!\S)/m
+    NOTHING = ''.freeze
+    DOT = '.'.freeze
+    SPACE = ' '.freeze
+    SINGLE_QUOTE = "'".freeze
 
     # @param [Hash] opts optional arguments
 
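Note on the option constants above: the change is purely syntactic, since %i[...] is Ruby's symbol-array literal. The old right-hand sides are truncated in this diff, so the bracketed form below is an assumption about the 3.0.6 code; the equality check itself is a plain Ruby fact.

  require 'set'

  # %i[...] is shorthand for an array of symbols, so both constants build the same Set.
  old_style = Set.new([:all, :semi, :none, :only]).freeze   # assumed pre-3.0.7 form
  new_style = Set.new(%i[all semi none only]).freeze        # form used in 3.0.7
  old_style == new_style  # => true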
@@ -124,7 +76,7 @@ module PragmaticTokenizer
       @abbreviations = Set.new(opts[:abbreviations])
       @stop_words = Set.new(opts[:stop_words])
 
-      #
+      # Why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
       @contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
       @abbreviations += @language_module::ABBREVIATIONS if @abbreviations.empty?
       @stop_words += @language_module::STOP_WORDS if @stop_words.empty?
@@ -136,13 +88,13 @@ module PragmaticTokenizer
         @stop_words += language::STOP_WORDS
       end
 
-      raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless
+      raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTUATION_OPTIONS.include?(@punctuation)
       raise "Numbers argument can be only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
       raise "Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)
 
       integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer
 
-      raise "In Pragmatic Tokenizer minimum_length must be an Integer"
+      raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == integer_class || @minimum_length.nil?
       raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == integer_class || @long_word_split.nil?
     end
 
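The Set constants defined earlier now back these argument checks directly. A hedged sketch of what passes and what raises; the accepted values come from the raise messages above, while the construction API is from the gem's README:

  require 'pragmatic_tokenizer'

  # Accepted values are exactly those in PUNCTUATION_OPTIONS, NUMBERS_OPTIONS and MENTIONS_OPTIONS.
  PragmaticTokenizer::Tokenizer.new(punctuation: :none, numbers: :semi, mentions: :remove)

  # Anything else fails fast in the constructor:
  PragmaticTokenizer::Tokenizer.new(punctuation: :most)
  # => RuntimeError: Punctuation argument can be only be nil, :all, :semi, :none, or :only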
@@ -152,21 +104,27 @@ module PragmaticTokenizer
       return [] unless text
       raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
       CGI.unescapeHTML(text)
-          .scan(
-          .flat_map { |segment|
+          .scan(Regex::CHUNK_LONG_INPUT_TEXT)
+          .flat_map { |segment| process_segment(segment) }
     end
 
     private
 
-    def
-
+    def process_segment(segment)
+      pre_processed = pre_process(segment)
+      cased_segment = chosen_case(pre_processed)
+      @tokens = PostProcessor.new(text: cased_segment, abbreviations: @abbreviations, downcase: @downcase).call
+      post_process_tokens
+    end
+
+    def pre_process(segment)
+      segment
           .extend(PragmaticTokenizer::PreProcessor)
           .pre_process(language: @language_module)
     end
 
-    def
-
-      remove_various!
+    def post_process_tokens
+      remove_by_options!
       process_numbers!
       process_punctuation!
       expand_contractions! if @expand_contractions
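tokenize now chunks the unescaped input with Regex::CHUNK_LONG_INPUT_TEXT and feeds each chunk through process_segment. A minimal sketch of that scan-then-flat_map pattern; the stand-in regex mirrors the removed REGEXP_CHUNK_STRING and is only an assumption about what CHUNK_LONG_INPUT_TEXT looks like:

  # Stand-in for Regex::CHUNK_LONG_INPUT_TEXT: up to ~10,000 chars, ending before whitespace.
  CHUNK = /\S.{1,10000}(?!\S)/m

  def tokenize_in_chunks(text)
    # Each scanned chunk is processed independently and the resulting arrays are flattened.
    text.scan(CHUNK).flat_map { |segment| segment.split }  # the gem calls process_segment here
  end

  tokenize_in_chunks("one two three")  # => ["one", "two", "three"]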
@@ -180,45 +138,45 @@ module PragmaticTokenizer
       @tokens.reject(&:empty?)
     end
 
-    def run_post_processor(text)
-      PostProcessor.new(
-          text: chosen_case(text),
-          abbreviations: @abbreviations,
-          downcase: @downcase
-      ).post_process
-    end
-
     def expand_contractions!
-      @tokens = @tokens.flat_map { |
+      @tokens = @tokens.flat_map { |token| expand_token_contraction(token) }
     end
 
     def expand_token_contraction(token)
-      normalized = inverse_case(token.gsub(
+      normalized = inverse_case(token.gsub(Regex::CONTRACTIONS, SINGLE_QUOTE))
       return token unless @contractions.key?(normalized)
-      result = @contractions[normalized].split(
+      result = @contractions[normalized].split(SPACE)
       result[0] = Unicode.capitalize(result[0]) unless @downcase
       result
     end
 
     def clean!
       @tokens = @tokens
-                .flat_map
-                .map!
-                .
-
+                .flat_map { |token| split_underscores_asterisk(token) }
+                .map! { |token| remove_irrelevant_characters(token) }
+                .delete_if { |token| many_dots?(token) }
+    end
+
+    def split_underscores_asterisk(token)
+      return token if token =~ Regex::ONLY_HASHTAG_MENTION
+      token.split(Regex::UNDERSCORES_ASTERISK)
     end
 
-    def
-
-      return
-
-      token
+    def remove_irrelevant_characters(token)
+      token.gsub!(Regex::IRRELEVANT_CHARACTERS, NOTHING)
+      return token if token =~ Regex::ONLY_HASHTAG_MENTION
+      token.gsub!(Regex::ENDS_WITH_EXCITED_ONE, NOTHING)
+      token
+    end
+
+    def many_dots?(token)
+      token =~ Regex::MANY_PERIODS
     end
 
     def classic_filter!
       @tokens.map! do |token|
-        token.delete!(
-        token.sub!(
+        token.delete!(DOT) if @abbreviations.include?(token.chomp(DOT))
+        token.sub!(Regex::ENDS_WITH_APOSTROPHE_AND_S, NOTHING)
         token
       end
     end
@@ -226,26 +184,26 @@ module PragmaticTokenizer
     def process_numbers!
       case @numbers
       when :semi
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::ONLY_DECIMALS }
       when :none
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS_NO_NUMERALS }
       when :only
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS }
       end
     end
 
     def remove_short_tokens!
-      @tokens.delete_if { |
+      @tokens.delete_if { |token| token.length < @minimum_length }
     end
 
     def process_punctuation!
       case @punctuation
       when :semi
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::PUNCTUATION4 }
       when :none
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::ONLY_PUNCTUATION }
       when :only
-        @tokens.keep_if
+        @tokens.keep_if { |token| token =~ Regex::ONLY_PUNCTUATION }
       end
     end
 
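The new Regex constants keep the old semantics of the :semi/:none/:only switches. A hedged sketch of the observable behaviour (API per the README; the outputs are illustrative, not verified against 3.0.7):

  require 'pragmatic_tokenizer'

  text = "Room 237 is haunted!"

  # punctuation: :none drops tokens that consist only of punctuation.
  PragmaticTokenizer::Tokenizer.new(punctuation: :none).tokenize(text)
  # => ["room", "237", "is", "haunted"]   (illustrative)

  # numbers: :only keeps only tokens that contain digits.
  PragmaticTokenizer::Tokenizer.new(numbers: :only).tokenize(text)
  # => ["237"]   (illustrative)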
@@ -256,45 +214,50 @@ module PragmaticTokenizer
     def mentions!
       case @mentions
       when :remove
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::ONLY_MENTION }
       when :keep_and_clean
-        @tokens.map!
+        @tokens.map! { |token| token =~ Regex::ONLY_MENTION ? token[1..-1] : token }
       end
     end
 
     def hashtags!
       case @hashtags
       when :remove
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::ONLY_HASHTAG }
       when :keep_and_clean
-        @tokens
-            .flat_map { |t| t =~ REGEXP_HYPHEN_HASTAG ? t.split(REGEX_HYPHEN) : t }
-            .map { |t| t =~ REGEXP_HASHTAG_AT_START ? t.gsub!(REGEXP_HASHTAG_AT_START, EMPTY_STRING) : t }
+        @tokens.map! { |token| token =~ Regex::ONLY_HASHTAG ? token[1..-1] : token }
       end
     end
 
-    def
-      @tokens.delete_if { |
+    def remove_by_options!
+      @tokens.delete_if { |token| token =~ regex_by_options }
     end
 
-    def
-      @
+    def regex_by_options
+      @regex_by_options ||= begin
        regex_array = []
-        regex_array <<
-        regex_array <<
-        regex_array <<
-        regex_array <<
+        regex_array << Regex::RANGE_UNUSUAL_AND_EMOJI if @remove_emoji
+        regex_array << Regex::ONLY_EMAIL if @remove_emails
+        regex_array << Regex::STARTS_WITH_HTTP if @remove_urls
+        regex_array << Regex::ONLY_DOMAIN2 if @remove_domains
        Regexp.union(regex_array)
       end
     end
 
     def split_long_words!
-      @tokens = @tokens
-
+      @tokens = @tokens.flat_map { |token| split_long_word(token) }
+    end
+
+    def split_long_word(token)
+      return token unless @long_word_split
+      return token if token.length <= @long_word_split
+      return token if token =~ Regex::ONLY_HASHTAG_MENTION
+      return token if token =~ Regex::DOMAIN_OR_EMAIL
+      token.split(Regex::HYPHEN_OR_UNDERSCORE)
     end
 
-    def chosen_case(
-      @downcase ? Unicode.downcase(
+    def chosen_case(text)
+      @downcase ? Unicode.downcase(text) : text
     end
 
     def inverse_case(token)
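regex_by_options builds one union regex from whichever removal options are enabled and memoizes it, so the per-token delete_if runs a single match. A standalone sketch of that pattern; the regexes here are stand-ins, not the gem's Regex constants:

  # Build the "unwanted token" regex once from the enabled options, then reuse it.
  def unwanted_regex(remove_emails:, remove_urls:)
    @unwanted_regex ||= begin
      parts = []
      parts << /\S+@\S+\.\S+/ if remove_emails   # stand-in for Regex::ONLY_EMAIL
      parts << /\Ahttps?\b/   if remove_urls     # stand-in for Regex::STARTS_WITH_HTTP
      Regexp.union(parts)
    end
  end

  tokens = %w[hello http://example.com bob@example.com]
  tokens.delete_if { |t| t =~ unwanted_regex(remove_emails: true, remove_urls: true) }
  # => ["hello"]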
data/pragmatic_tokenizer.gemspec
CHANGED
@@ -1,5 +1,4 @@
-
-lib = File.expand_path('../lib', __FILE__)
+lib = File.expand_path('lib', __dir__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'pragmatic_tokenizer/version'
 
@@ -9,9 +8,9 @@ Gem::Specification.new do |spec|
   spec.authors = ["Kevin S. Dias"]
   spec.email = ["diasks2@gmail.com"]
 
-  spec.summary =
-  spec.description =
-  spec.homepage =
+  spec.summary = 'A multilingual tokenizer'
+  spec.description = 'A multilingual tokenizer to split a string into tokens.'
+  spec.homepage = 'https://github.com/diasks2/pragmatic_tokenizer'
 
   spec.files = `git ls-files -z`.split("\x0")
   spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
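The path idiom change on the gemspec's first line is equivalent in effect; both expressions resolve the same lib/ directory next to the gemspec:

  File.expand_path('../lib', __FILE__)  # pre-3.0.7: '../lib' relative to this file's path
  File.expand_path('lib', __dir__)      # 3.0.7: 'lib' relative to this file's directory (Ruby >= 2.0)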
data/spec/performance_spec.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -1,2 +1,2 @@
-$LOAD_PATH.unshift File.expand_path('
+$LOAD_PATH.unshift File.expand_path('../lib', __dir__)
 require 'pragmatic_tokenizer'
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 3.0.6
+  version: 3.0.7
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-03-
+date: 2018-03-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode
@@ -111,7 +111,6 @@ files:
 - README.md
 - Rakefile
 - lib/pragmatic_tokenizer.rb
-- lib/pragmatic_tokenizer/full_stop_separator.rb
 - lib/pragmatic_tokenizer/languages.rb
 - lib/pragmatic_tokenizer/languages/arabic.rb
 - lib/pragmatic_tokenizer/languages/bulgarian.rb
@@ -140,6 +139,7 @@ files:
 - lib/pragmatic_tokenizer/languages/turkish.rb
 - lib/pragmatic_tokenizer/post_processor.rb
 - lib/pragmatic_tokenizer/pre_processor.rb
+- lib/pragmatic_tokenizer/regex.rb
 - lib/pragmatic_tokenizer/tokenizer.rb
 - lib/pragmatic_tokenizer/version.rb
 - pragmatic_tokenizer.gemspec
data/lib/pragmatic_tokenizer/full_stop_separator.rb
DELETED
@@ -1,58 +0,0 @@
-# -*- encoding : utf-8 -*-
-
-module PragmaticTokenizer
-  # This class separates true full stops while ignoring
-  # periods that are part of an abbreviation
-  class FullStopSeparator
-
-    REGEXP_ENDS_WITH_DOT = /\A(.*\w)\.\z/
-    REGEXP_ONLY_LETTERS = /\A[a-z]\z/i
-    REGEXP_ABBREVIATION = /[a-z](?:\.[a-z])+\z/i
-    DOT = '.'.freeze
-
-    def initialize(tokens:, abbreviations:, downcase:)
-      @tokens = tokens
-      @abbreviations = abbreviations
-      @downcase = downcase
-    end
-
-    def separate
-      @cleaned_tokens = create_cleaned_tokens
-      replace_last_token unless @cleaned_tokens.empty?
-      @cleaned_tokens
-    end
-
-    private
-
-    def create_cleaned_tokens
-      @tokens[0..-2]
-        .flat_map { |token| abbreviation?(token) ? [token[0..-2], DOT] : token }
-        .push(@tokens.last)
-    end
-
-    def abbreviation?(token)
-      return false unless token.end_with?(DOT) && token.length > 1
-      shortened = token.chomp(DOT)
-      !defined_abbreviation?(shortened) && shortened !~ REGEXP_ONLY_LETTERS && shortened !~ REGEXP_ABBREVIATION
-    end
-
-    def defined_abbreviation?(token)
-      @abbreviations.include?(inverse_case(token))
-    end
-
-    def inverse_case(token)
-      @downcase ? token : Unicode.downcase(token)
-    end
-
-    def replace_last_token
-      last_token = @cleaned_tokens[-1]
-      return unless last_token.end_with?(DOT) && last_token.length > 1
-      shortened = last_token.chomp(DOT)
-      return if defined_abbreviation?(shortened) || last_token !~ REGEXP_ENDS_WITH_DOT
-      @cleaned_tokens[-1] = Regexp.last_match(1)
-      @cleaned_tokens << DOT
-    end
-
-  end
-
-end