pragmatic_tokenizer 3.0.6 → 3.0.7

Files changed (34)
  1. checksums.yaml +4 -4
  2. data/lib/pragmatic_tokenizer/languages.rb +26 -26
  3. data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
  4. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
  5. data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
  6. data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
  7. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  8. data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
  9. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  10. data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
  11. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  12. data/lib/pragmatic_tokenizer/languages/french.rb +10 -9
  13. data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  15. data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
  16. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  17. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  19. data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
  20. data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
  21. data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
  22. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  23. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  25. data/lib/pragmatic_tokenizer/post_processor.rb +41 -93
  26. data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
  27. data/lib/pragmatic_tokenizer/regex.rb +149 -0
  28. data/lib/pragmatic_tokenizer/tokenizer.rb +76 -113
  29. data/lib/pragmatic_tokenizer/version.rb +1 -1
  30. data/pragmatic_tokenizer.gemspec +4 -5
  31. data/spec/performance_spec.rb +0 -1
  32. data/spec/spec_helper.rb +1 -1
  33. metadata +3 -3
  34. data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -58
data/lib/pragmatic_tokenizer/tokenizer.rb CHANGED
@@ -1,70 +1,22 @@
- # -*- encoding : utf-8 -*-
  require 'set'
  require 'cgi'
+ require 'pragmatic_tokenizer/regex'
  require 'pragmatic_tokenizer/languages'
  require 'pragmatic_tokenizer/pre_processor'
  require 'pragmatic_tokenizer/post_processor'
- require 'pragmatic_tokenizer/full_stop_separator'
  require 'unicode'
 
  module PragmaticTokenizer
    class Tokenizer
 
-     PUNCTIATION_OPTIONS = Set.new([:all, :semi, :none, :only]).freeze
-     NUMBERS_OPTIONS = Set.new([:all, :semi, :none, :only]).freeze
-     MENTIONS_OPTIONS = Set.new([:keep_original, :keep_and_clean, :remove]).freeze
+     PUNCTUATION_OPTIONS = Set.new(%i[all semi none only]).freeze
+     NUMBERS_OPTIONS = Set.new(%i[all semi none only]).freeze
+     MENTIONS_OPTIONS = Set.new(%i[keep_original keep_and_clean remove]).freeze
      MAX_TOKEN_LENGTH = 50
-     EMPTY_STRING = ''.freeze
-     DOT_STRING = '.'.freeze
-     SPACE_STRING = ' '.freeze
-     REGEX_DOMAIN = /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix
-     REGEX_URL = /(http|https)(\.|:)/
-     REGEX_HYPHEN = /\-/
-     REGEX_LONG_WORD = /\-|\_/
-     REGEXP_SPLIT_CHECK = /@|@|(http)/
-     REGEX_CONTRACTIONS = /[‘’‚‛‹›'´`]/
-     REGEX_APOSTROPHE_S = /['’`́]s$/
-     REGEX_EMAIL = /\S+(@|@)\S+\.\S+/
-     REGEX_HASHTAG_OR_MENTION = /[@@#|#]/
-     REGEX_UNDERSCORE_AT_START = /(?<=\A)\_+/
-     REGEX_UNDERSCORE_AT_END = /\_+(?=\z)/
-     REGEX_ASTERISK = /\*+/
-     REGEX_UNIFIED1 = Regexp.union(REGEX_UNDERSCORE_AT_START,
-                                   REGEX_UNDERSCORE_AT_END,
-                                   REGEX_ASTERISK)
-     # https://en.wikipedia.org/wiki/Control_character
-     # matches any character with hexadecimal value 00 through 1F or 7F.
-     # Rubular: http://rubular.com/r/E83fpBoDjI
-     REGEXP_CONTROL = /[[:cntrl:]]/
-     REGEXP_ENDING_COLON = /\:(?=\z)/
-     REGEXP_EXCLAMATION_AT_START = /(?<=\A)!+(?=.+)/
-     REGEXP_EXCLAMATION_AT_END = /!+(1*!*)*(?=\z)/
-     REGEXP_HYPHEN_AT_START = /\A(-|–|\u{00AD})/
-     REGEXP_SPECIAL_SYMBOL = /[®©]/
-     REGEXP_PERCENT_AT_START = /\A\%/
-     # https://codepoints.net/enclosed_alphanumeric_supplement
-     REGEXP_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
-     REGEX_UNIFIED2 = Regexp.union(REGEXP_CONTROL,
-                                   REGEXP_ENDING_COLON,
-                                   REGEXP_EXCLAMATION_AT_START,
-                                   REGEXP_EXCLAMATION_AT_END,
-                                   REGEXP_HYPHEN_AT_START,
-                                   REGEXP_SPECIAL_SYMBOL,
-                                   REGEXP_PERCENT_AT_START,
-                                   REGEXP_ALPHANUMERIC_SUPPLEMENT)
-     REGEXP_ONE_AS_EXCLAMATION = /(?<=\D)1+(?=\z)/
-     REGEXP_HASHTAG_AT_START = /(?<=\A)(#|#)/
-     REGEXP_AT_SIGN_AT_START = /(?<=\A)(@|@)/
-     REGEXP_HYPHEN_HASTAG = /\A(#|#)\S+-/
-     REGEXP_EMOJI_SNOWFLAKE = /\u{2744}[\u{FE0F}|\u{FE0E}]?/
-     REGEX_EMOJI_UNIFIED = Regexp.union(REGEXP_EMOJI_SNOWFLAKE,
-                                        PragmaticTokenizer::Languages::Common::EMOJI_REGEX)
-     REGEXP_PUNCTUATION_ONLY = /\A[[:punct:]]+\z/
-     REGEXP_NUMBER_ONLY = /\A\d+\z/
-     REGEXP_NO_NUMBERS = /\A\D+\z/
-     REGEXP_NUMBER = /\D*\d+\d*/
-     REGEXP_CONSECUTIVE_DOTS = /\A\.{2,}\z/
-     REGEXP_CHUNK_STRING = /\S.{1,10000}(?!\S)/m
+     NOTHING = ''.freeze
+     DOT = '.'.freeze
+     SPACE = ' '.freeze
+     SINGLE_QUOTE = "'".freeze
 
      # @param [Hash] opts optional arguments
 
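For context on the API behind this refactor (the regex constants above moved into the new lib/pragmatic_tokenizer/regex.rb), a minimal usage sketch; the output comment assumes the gem's documented defaults (downcase: true, punctuation: :all):

    require 'pragmatic_tokenizer'

    # With default options, tokens are downcased and punctuation is kept as tokens.
    PragmaticTokenizer::Tokenizer.new.tokenize("Hello, world.")
    # => ["hello", ",", "world", "."]
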
@@ -124,7 +76,7 @@ module PragmaticTokenizer
      @abbreviations = Set.new(opts[:abbreviations])
      @stop_words = Set.new(opts[:stop_words])
 
-     # TODO: why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
+     # Why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
      @contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
      @abbreviations += @language_module::ABBREVIATIONS if @abbreviations.empty?
      @stop_words += @language_module::STOP_WORDS if @stop_words.empty?
@@ -136,13 +88,13 @@ module PragmaticTokenizer
        @stop_words += language::STOP_WORDS
      end
 
-     raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTIATION_OPTIONS.include?(@punctuation)
+     raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTUATION_OPTIONS.include?(@punctuation)
      raise "Numbers argument can be only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
      raise "Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)
 
      integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer
 
-     raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == integer_class || @minimum_length.nil?
+     raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == integer_class || @minimum_length.nil?
      raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == integer_class || @long_word_split.nil?
    end
 
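The renamed *_OPTIONS sets back the argument validation shown above; a quick sketch of what an invalid value does (the message, original wording included, is quoted from the source):

    # A bare `raise "..."` produces a RuntimeError at construction time:
    PragmaticTokenizer::Tokenizer.new(punctuation: :everything)
    # => RuntimeError: Punctuation argument can be only be nil, :all, :semi, :none, or :only
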
@@ -152,21 +104,27 @@ module PragmaticTokenizer
      return [] unless text
      raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
      CGI.unescapeHTML(text)
-        .scan(REGEXP_CHUNK_STRING)
-        .flat_map { |segment| post_process(pre_process(segment)) }
+        .scan(Regex::CHUNK_LONG_INPUT_TEXT)
+        .flat_map { |segment| process_segment(segment) }
    end
 
    private
 
-   def pre_process(text)
-     text
+   def process_segment(segment)
+     pre_processed = pre_process(segment)
+     cased_segment = chosen_case(pre_processed)
+     @tokens = PostProcessor.new(text: cased_segment, abbreviations: @abbreviations, downcase: @downcase).call
+     post_process_tokens
+   end
+
+   def pre_process(segment)
+     segment
        .extend(PragmaticTokenizer::PreProcessor)
        .pre_process(language: @language_module)
    end
 
-   def post_process(text)
-     @tokens = run_post_processor(text)
-     remove_various!
+   def post_process_tokens
+     remove_by_options!
      process_numbers!
      process_punctuation!
      expand_contractions! if @expand_contractions
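The reworked tokenize pipeline scans long input into chunks of at most roughly 10,000 characters (Regex::CHUNK_LONG_INPUT_TEXT replaces the old REGEXP_CHUNK_STRING) and runs each segment through pre_process → chosen_case → PostProcessor → the token filters. A sketch of the end result; the output comment assumes the default English abbreviation list:

    PragmaticTokenizer::Tokenizer.new.tokenize("Mr. Smith arrived.")
    # => ["mr.", "smith", "arrived", "."]
    # "mr." keeps its period as an abbreviation; the true full stop is split off.
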
@@ -180,45 +138,45 @@ module PragmaticTokenizer
      @tokens.reject(&:empty?)
    end
 
-   def run_post_processor(text)
-     PostProcessor.new(
-         text: chosen_case(text),
-         abbreviations: @abbreviations,
-         downcase: @downcase
-     ).post_process
-   end
-
    def expand_contractions!
-     @tokens = @tokens.flat_map { |t| expand_token_contraction(t) }
+     @tokens = @tokens.flat_map { |token| expand_token_contraction(token) }
    end
 
    def expand_token_contraction(token)
-     normalized = inverse_case(token.gsub(REGEX_CONTRACTIONS, "'".freeze))
+     normalized = inverse_case(token.gsub(Regex::CONTRACTIONS, SINGLE_QUOTE))
      return token unless @contractions.key?(normalized)
-     result = @contractions[normalized].split(SPACE_STRING)
+     result = @contractions[normalized].split(SPACE)
      result[0] = Unicode.capitalize(result[0]) unless @downcase
      result
    end
 
    def clean!
      @tokens = @tokens
-         .flat_map { |t| t !~ REGEX_HASHTAG_OR_MENTION ? t.split(REGEX_UNIFIED1) : t }
-         .map! { |t| t !~ REGEX_HASHTAG_OR_MENTION ? t.gsub(REGEXP_ONE_AS_EXCLAMATION, EMPTY_STRING) : t }
-         .map! { |t| t.gsub(REGEX_UNIFIED2, EMPTY_STRING) }
-         .delete_if { |t| unclean_token?(t) }
+         .flat_map { |token| split_underscores_asterisk(token) }
+         .map! { |token| remove_irrelevant_characters(token) }
+         .delete_if { |token| many_dots?(token) }
+   end
+
+   def split_underscores_asterisk(token)
+     return token if token =~ Regex::ONLY_HASHTAG_MENTION
+     token.split(Regex::UNDERSCORES_ASTERISK)
    end
 
-   def unclean_token?(token)
-     return true if PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(token)
-     return true if token.length > MAX_TOKEN_LENGTH
-     return true if token.include?('\\'.freeze)
-     token =~ REGEXP_CONSECUTIVE_DOTS
+   def remove_irrelevant_characters(token)
+     token.gsub!(Regex::IRRELEVANT_CHARACTERS, NOTHING)
+     return token if token =~ Regex::ONLY_HASHTAG_MENTION
+     token.gsub!(Regex::ENDS_WITH_EXCITED_ONE, NOTHING)
+     token
+   end
+
+   def many_dots?(token)
+     token =~ Regex::MANY_PERIODS
    end
 
    def classic_filter!
      @tokens.map! do |token|
-       token.delete!(DOT_STRING) if @abbreviations.include?(token.chomp(DOT_STRING))
-       token.sub!(REGEX_APOSTROPHE_S, EMPTY_STRING)
+       token.delete!(DOT) if @abbreviations.include?(token.chomp(DOT))
+       token.sub!(Regex::ENDS_WITH_APOSTROPHE_AND_S, NOTHING)
        token
      end
    end
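expand_token_contraction normalizes apostrophe variants to a plain single quote and then looks the token up in the language's contraction map. A sketch with default options (downcase: true), assuming the English CONTRACTIONS entry for "isn't":

    PragmaticTokenizer::Tokenizer.new(expand_contractions: true).tokenize("Isn't it great?")
    # => ["is", "not", "it", "great", "?"]
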
@@ -226,26 +184,26 @@ module PragmaticTokenizer
    def process_numbers!
      case @numbers
      when :semi
-       @tokens.delete_if { |t| t =~ REGEXP_NUMBER_ONLY }
+       @tokens.delete_if { |token| token =~ Regex::ONLY_DECIMALS }
      when :none
-       @tokens.delete_if { |t| t =~ REGEXP_NUMBER || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(inverse_case(t)) }
+       @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS_NO_NUMERALS }
      when :only
-       @tokens.delete_if { |t| t =~ REGEXP_NO_NUMBERS }
+       @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS }
      end
    end
 
    def remove_short_tokens!
-     @tokens.delete_if { |t| t.length < @minimum_length }
+     @tokens.delete_if { |token| token.length < @minimum_length }
    end
 
    def process_punctuation!
      case @punctuation
      when :semi
-       @tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION.include?(t) }
+       @tokens.delete_if { |token| token =~ Regex::PUNCTUATION4 }
      when :none
-       @tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) || t =~ REGEXP_PUNCTUATION_ONLY }
+       @tokens.delete_if { |token| token =~ Regex::ONLY_PUNCTUATION }
      when :only
-       @tokens.keep_if { |t| PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
+       @tokens.keep_if { |token| token =~ Regex::ONLY_PUNCTUATION }
      end
    end
 
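The rewritten filters swap set-membership checks for anchored regexes (e.g. Regex::ONLY_PUNCTUATION matches tokens made entirely of punctuation). Expected behavior, sketched under the gem's documented option semantics:

    PragmaticTokenizer::Tokenizer.new(punctuation: :none).tokenize("Hello, world.")
    # => ["hello", "world"]   # punctuation-only tokens dropped
    PragmaticTokenizer::Tokenizer.new(numbers: :only).tokenize("area 51 rocks")
    # => ["51"]               # tokens without digits dropped
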
@@ -256,45 +214,50 @@ module PragmaticTokenizer
    def mentions!
      case @mentions
      when :remove
-       @tokens.delete_if { |t| t =~ REGEXP_AT_SIGN_AT_START }
+       @tokens.delete_if { |token| token =~ Regex::ONLY_MENTION }
      when :keep_and_clean
-       @tokens.map! { |t| t =~ REGEXP_AT_SIGN_AT_START ? t.gsub!(REGEXP_AT_SIGN_AT_START, EMPTY_STRING) : t }
+       @tokens.map! { |token| token =~ Regex::ONLY_MENTION ? token[1..-1] : token }
      end
    end
 
    def hashtags!
      case @hashtags
      when :remove
-       @tokens.delete_if { |t| t =~ REGEXP_HASHTAG_AT_START }
+       @tokens.delete_if { |token| token =~ Regex::ONLY_HASHTAG }
      when :keep_and_clean
-       @tokens = @tokens
-           .flat_map { |t| t =~ REGEXP_HYPHEN_HASTAG ? t.split(REGEX_HYPHEN) : t }
-           .map { |t| t =~ REGEXP_HASHTAG_AT_START ? t.gsub!(REGEXP_HASHTAG_AT_START, EMPTY_STRING) : t }
+       @tokens.map! { |token| token =~ Regex::ONLY_HASHTAG ? token[1..-1] : token }
      end
    end
 
-   def remove_various!
-     @tokens.delete_if { |t| t =~ regex_various }
+   def remove_by_options!
+     @tokens.delete_if { |token| token =~ regex_by_options }
    end
 
-   def regex_various
-     @regex_various ||= begin
+   def regex_by_options
+     @regex_by_options ||= begin
        regex_array = []
-       regex_array << REGEX_EMOJI_UNIFIED if @remove_emoji
-       regex_array << REGEX_EMAIL if @remove_emails
-       regex_array << REGEX_URL if @remove_urls
-       regex_array << REGEX_DOMAIN if @remove_domains
+       regex_array << Regex::RANGE_UNUSUAL_AND_EMOJI if @remove_emoji
+       regex_array << Regex::ONLY_EMAIL if @remove_emails
+       regex_array << Regex::STARTS_WITH_HTTP if @remove_urls
+       regex_array << Regex::ONLY_DOMAIN2 if @remove_domains
        Regexp.union(regex_array)
      end
    end
 
    def split_long_words!
-     @tokens = @tokens
-         .flat_map { |t| (t.length > @long_word_split && t !~ REGEXP_SPLIT_CHECK ) ? t.split(REGEX_LONG_WORD) : t }
+     @tokens = @tokens.flat_map { |token| split_long_word(token) }
+   end
+
+   def split_long_word(token)
+     return token unless @long_word_split
+     return token if token.length <= @long_word_split
+     return token if token =~ Regex::ONLY_HASHTAG_MENTION
+     return token if token =~ Regex::DOMAIN_OR_EMAIL
+     token.split(Regex::HYPHEN_OR_UNDERSCORE)
    end
 
-   def chosen_case(txt)
-     @downcase ? Unicode.downcase(txt) : txt
+   def chosen_case(text)
+     @downcase ? Unicode.downcase(text) : text
    end
 
    def inverse_case(token)
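keep_and_clean now strips the leading @/# with token[1..-1] instead of a destructive gsub!, and split_long_word spells out the guards that previously sat in one dense block. A sketch of the option behavior, with outputs assuming the gem's documented semantics:

    PragmaticTokenizer::Tokenizer.new(mentions: :keep_and_clean).tokenize("@user hi")
    # => ["user", "hi"]
    PragmaticTokenizer::Tokenizer.new(hashtags: :remove).tokenize("#ruby rocks")
    # => ["rocks"]
    PragmaticTokenizer::Tokenizer.new(long_word_split: 5).tokenize("self-driving")
    # => ["self", "driving"]
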
data/lib/pragmatic_tokenizer/version.rb CHANGED
@@ -1,3 +1,3 @@
  module PragmaticTokenizer
-   VERSION = "3.0.6".freeze
+   VERSION = "3.0.7".freeze
  end
data/pragmatic_tokenizer.gemspec CHANGED
@@ -1,5 +1,4 @@
- # coding: utf-8
- lib = File.expand_path('../lib', __FILE__)
+ lib = File.expand_path('lib', __dir__)
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
  require 'pragmatic_tokenizer/version'
 
@@ -9,9 +8,9 @@ Gem::Specification.new do |spec|
    spec.authors = ["Kevin S. Dias"]
    spec.email = ["diasks2@gmail.com"]
 
-   spec.summary = %q{A multilingual tokenizer}
-   spec.description = %q{A multilingual tokenizer to split a string into tokens.}
-   spec.homepage = "https://github.com/diasks2/pragmatic_tokenizer"
+   spec.summary = 'A multilingual tokenizer'
+   spec.description = 'A multilingual tokenizer to split a string into tokens.'
+   spec.homepage = 'https://github.com/diasks2/pragmatic_tokenizer'
 
    spec.files = `git ls-files -z`.split("\x0")
    spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
data/spec/performance_spec.rb CHANGED
@@ -1,4 +1,3 @@
- # -*- encoding : utf-8 -*-
  require 'benchmark'
  require 'spec_helper'
  require 'stackprof'
data/spec/spec_helper.rb CHANGED
@@ -1,2 +1,2 @@
- $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
+ $LOAD_PATH.unshift File.expand_path('../lib', __dir__)
  require 'pragmatic_tokenizer'
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: pragmatic_tokenizer
  version: !ruby/object:Gem::Version
-   version: 3.0.6
+   version: 3.0.7
  platform: ruby
  authors:
  - Kevin S. Dias
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2018-03-15 00:00:00.000000000 Z
+ date: 2018-03-18 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: unicode
@@ -111,7 +111,6 @@ files:
  - README.md
  - Rakefile
  - lib/pragmatic_tokenizer.rb
- - lib/pragmatic_tokenizer/full_stop_separator.rb
  - lib/pragmatic_tokenizer/languages.rb
  - lib/pragmatic_tokenizer/languages/arabic.rb
  - lib/pragmatic_tokenizer/languages/bulgarian.rb
@@ -140,6 +139,7 @@ files:
  - lib/pragmatic_tokenizer/languages/turkish.rb
  - lib/pragmatic_tokenizer/post_processor.rb
  - lib/pragmatic_tokenizer/pre_processor.rb
+ - lib/pragmatic_tokenizer/regex.rb
  - lib/pragmatic_tokenizer/tokenizer.rb
  - lib/pragmatic_tokenizer/version.rb
  - pragmatic_tokenizer.gemspec
data/lib/pragmatic_tokenizer/full_stop_separator.rb DELETED
@@ -1,58 +0,0 @@
- # -*- encoding : utf-8 -*-
-
- module PragmaticTokenizer
-   # This class separates true full stops while ignoring
-   # periods that are part of an abbreviation
-   class FullStopSeparator
-
-     REGEXP_ENDS_WITH_DOT = /\A(.*\w)\.\z/
-     REGEXP_ONLY_LETTERS = /\A[a-z]\z/i
-     REGEXP_ABBREVIATION = /[a-z](?:\.[a-z])+\z/i
-     DOT = '.'.freeze
-
-     def initialize(tokens:, abbreviations:, downcase:)
-       @tokens = tokens
-       @abbreviations = abbreviations
-       @downcase = downcase
-     end
-
-     def separate
-       @cleaned_tokens = create_cleaned_tokens
-       replace_last_token unless @cleaned_tokens.empty?
-       @cleaned_tokens
-     end
-
-     private
-
-     def create_cleaned_tokens
-       @tokens[0..-2]
-         .flat_map { |token| abbreviation?(token) ? [token[0..-2], DOT] : token }
-         .push(@tokens.last)
-     end
-
-     def abbreviation?(token)
-       return false unless token.end_with?(DOT) && token.length > 1
-       shortened = token.chomp(DOT)
-       !defined_abbreviation?(shortened) && shortened !~ REGEXP_ONLY_LETTERS && shortened !~ REGEXP_ABBREVIATION
-     end
-
-     def defined_abbreviation?(token)
-       @abbreviations.include?(inverse_case(token))
-     end
-
-     def inverse_case(token)
-       @downcase ? token : Unicode.downcase(token)
-     end
-
-     def replace_last_token
-       last_token = @cleaned_tokens[-1]
-       return unless last_token.end_with?(DOT) && last_token.length > 1
-       shortened = last_token.chomp(DOT)
-       return if defined_abbreviation?(shortened) || last_token !~ REGEXP_ENDS_WITH_DOT
-       @cleaned_tokens[-1] = Regexp.last_match(1)
-       @cleaned_tokens << DOT
-     end
-
-   end
-
- end
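For reference, roughly how the removed class was driven in 3.0.6 (its logic was presumably folded into the reworked PostProcessor). The input values are hypothetical, and this sketch only runs against pragmatic_tokenizer <= 3.0.6:

    require 'set'
    require 'pragmatic_tokenizer'  # 3.0.6 loads full_stop_separator via tokenizer.rb

    separator = PragmaticTokenizer::FullStopSeparator.new(
      tokens:        ["Mr.", "Smith", "arrived."],
      abbreviations: Set.new(["mr"]),
      downcase:      false
    )
    separator.separate
    # => ["Mr.", "Smith", "arrived", "."]
    # The abbreviation period is kept; the sentence-final period becomes its own token.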