pragmatic_tokenizer 3.0.6 → 3.0.7

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (34)
  1. checksums.yaml +4 -4
  2. data/lib/pragmatic_tokenizer/languages.rb +26 -26
  3. data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
  4. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
  5. data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
  6. data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
  7. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  8. data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
  9. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  10. data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
  11. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  12. data/lib/pragmatic_tokenizer/languages/french.rb +10 -9
  13. data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  15. data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
  16. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  17. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  19. data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
  20. data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
  21. data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
  22. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  23. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  25. data/lib/pragmatic_tokenizer/post_processor.rb +41 -93
  26. data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
  27. data/lib/pragmatic_tokenizer/regex.rb +149 -0
  28. data/lib/pragmatic_tokenizer/tokenizer.rb +76 -113
  29. data/lib/pragmatic_tokenizer/version.rb +1 -1
  30. data/pragmatic_tokenizer.gemspec +4 -5
  31. data/spec/performance_spec.rb +0 -1
  32. data/spec/spec_helper.rb +1 -1
  33. metadata +3 -3
  34. data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -58
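In short, 3.0.7 is an internal refactor: the tokenizer's inline regex constants move to the new regex.rb, FullStopSeparator is removed, the misspelled PUNCTIATION_OPTIONS constant is corrected, and encoding magic comments are dropped. The public interface appears unchanged; basic usage for orientation (per the gem's README; output illustrative):

    require 'pragmatic_tokenizer'

    # Default options downcase tokens and keep punctuation as separate tokens.
    PragmaticTokenizer::Tokenizer.new(language: :en).tokenize("Hello world.")
    # => ["hello", "world", "."]
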
data/lib/pragmatic_tokenizer/tokenizer.rb CHANGED
@@ -1,70 +1,22 @@
- # -*- encoding : utf-8 -*-
  require 'set'
  require 'cgi'
+ require 'pragmatic_tokenizer/regex'
  require 'pragmatic_tokenizer/languages'
  require 'pragmatic_tokenizer/pre_processor'
  require 'pragmatic_tokenizer/post_processor'
- require 'pragmatic_tokenizer/full_stop_separator'
  require 'unicode'

  module PragmaticTokenizer
  class Tokenizer

- PUNCTIATION_OPTIONS = Set.new([:all, :semi, :none, :only]).freeze
- NUMBERS_OPTIONS = Set.new([:all, :semi, :none, :only]).freeze
- MENTIONS_OPTIONS = Set.new([:keep_original, :keep_and_clean, :remove]).freeze
+ PUNCTUATION_OPTIONS = Set.new(%i[all semi none only]).freeze
+ NUMBERS_OPTIONS = Set.new(%i[all semi none only]).freeze
+ MENTIONS_OPTIONS = Set.new(%i[keep_original keep_and_clean remove]).freeze
  MAX_TOKEN_LENGTH = 50
- EMPTY_STRING = ''.freeze
- DOT_STRING = '.'.freeze
- SPACE_STRING = ' '.freeze
- REGEX_DOMAIN = /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix
- REGEX_URL = /(http|https)(\.|:)/
- REGEX_HYPHEN = /\-/
- REGEX_LONG_WORD = /\-|\_/
- REGEXP_SPLIT_CHECK = /＠|@|(http)/
- REGEX_CONTRACTIONS = /[‘’‚‛‹›'´`]/
- REGEX_APOSTROPHE_S = /['’`́]s$/
- REGEX_EMAIL = /\S+(＠|@)\S+\.\S+/
- REGEX_HASHTAG_OR_MENTION = /[@＠#＃]/
- REGEX_UNDERSCORE_AT_START = /(?<=\A)\_+/
- REGEX_UNDERSCORE_AT_END = /\_+(?=\z)/
- REGEX_ASTERISK = /\*+/
- REGEX_UNIFIED1 = Regexp.union(REGEX_UNDERSCORE_AT_START,
- REGEX_UNDERSCORE_AT_END,
- REGEX_ASTERISK)
- # https://en.wikipedia.org/wiki/Control_character
- # matches any character with hexadecimal value 00 through 1F or 7F.
- # Rubular: http://rubular.com/r/E83fpBoDjI
- REGEXP_CONTROL = /[[:cntrl:]]/
- REGEXP_ENDING_COLON = /\:(?=\z)/
- REGEXP_EXCLAMATION_AT_START = /(?<=\A)!+(?=.+)/
- REGEXP_EXCLAMATION_AT_END = /!+(1*!*)*(?=\z)/
- REGEXP_HYPHEN_AT_START = /\A(-|–|\u{00AD})/
- REGEXP_SPECIAL_SYMBOL = /[®©]/
- REGEXP_PERCENT_AT_START = /\A\%/
- # https://codepoints.net/enclosed_alphanumeric_supplement
- REGEXP_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
- REGEX_UNIFIED2 = Regexp.union(REGEXP_CONTROL,
- REGEXP_ENDING_COLON,
- REGEXP_EXCLAMATION_AT_START,
- REGEXP_EXCLAMATION_AT_END,
- REGEXP_HYPHEN_AT_START,
- REGEXP_SPECIAL_SYMBOL,
- REGEXP_PERCENT_AT_START,
- REGEXP_ALPHANUMERIC_SUPPLEMENT)
- REGEXP_ONE_AS_EXCLAMATION = /(?<=\D)1+(?=\z)/
- REGEXP_HASHTAG_AT_START = /(?<=\A)(#|＃)/
- REGEXP_AT_SIGN_AT_START = /(?<=\A)(@|＠)/
- REGEXP_HYPHEN_HASTAG = /\A(#|＃)\S+-/
- REGEXP_EMOJI_SNOWFLAKE = /\u{2744}[\u{FE0F}|\u{FE0E}]?/
- REGEX_EMOJI_UNIFIED = Regexp.union(REGEXP_EMOJI_SNOWFLAKE,
- PragmaticTokenizer::Languages::Common::EMOJI_REGEX)
- REGEXP_PUNCTUATION_ONLY = /\A[[:punct:]]+\z/
- REGEXP_NUMBER_ONLY = /\A\d+\z/
- REGEXP_NO_NUMBERS = /\A\D+\z/
- REGEXP_NUMBER = /\D*\d+\d*/
- REGEXP_CONSECUTIVE_DOTS = /\A\.{2,}\z/
- REGEXP_CHUNK_STRING = /\S.{1,10000}(?!\S)/m
+ NOTHING = ''.freeze
+ DOT = '.'.freeze
+ SPACE = ' '.freeze
+ SINGLE_QUOTE = "'".freeze

  # @param [Hash] opts optional arguments

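The removed constants reappear under descriptive names in the new Regex module (data/lib/pragmatic_tokenizer/regex.rb, +149 lines, not shown on this page). A hypothetical sketch of its shape, pairing names used in the hunks below with the 3.0.6 patterns they appear to replace:

    module PragmaticTokenizer
      module Regex
        # Names are taken from this diff; the pattern bodies are assumptions,
        # carried over from the old inline constants they seem to replace.
        CHUNK_LONG_INPUT_TEXT = /\S.{1,10000}(?!\S)/m # was REGEXP_CHUNK_STRING
        CONTRACTIONS          = /[‘’‚‛‹›'´`]/         # was REGEX_CONTRACTIONS
        ONLY_DECIMALS         = /\A\d+\z/             # was REGEXP_NUMBER_ONLY
        ONLY_PUNCTUATION      = /\A[[:punct:]]+\z/    # was REGEXP_PUNCTUATION_ONLY
      end
    end
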
@@ -124,7 +76,7 @@ module PragmaticTokenizer
  @abbreviations = Set.new(opts[:abbreviations])
  @stop_words = Set.new(opts[:stop_words])

- # TODO: why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
+ # Why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
  @contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
  @abbreviations += @language_module::ABBREVIATIONS if @abbreviations.empty?
  @stop_words += @language_module::STOP_WORDS if @stop_words.empty?
@@ -136,13 +88,13 @@ module PragmaticTokenizer
  @stop_words += language::STOP_WORDS
  end

- raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTIATION_OPTIONS.include?(@punctuation)
+ raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTUATION_OPTIONS.include?(@punctuation)
  raise "Numbers argument can be only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
  raise "Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)

  integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer

- raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == integer_class || @minimum_length.nil?
+ raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == integer_class || @minimum_length.nil?
  raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == integer_class || @long_word_split.nil?
  end

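The option guards are unchanged apart from the corrected constant name (the "can be only be" typo in the messages is upstream's, preserved verbatim). An unsupported value still raises at construction time:

    PragmaticTokenizer::Tokenizer.new(punctuation: :sometimes)
    # RuntimeError: Punctuation argument can be only be nil, :all, :semi, :none, or :only
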
@@ -152,21 +104,27 @@ module PragmaticTokenizer
  return [] unless text
  raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
  CGI.unescapeHTML(text)
- .scan(REGEXP_CHUNK_STRING)
- .flat_map { |segment| post_process(pre_process(segment)) }
+ .scan(Regex::CHUNK_LONG_INPUT_TEXT)
+ .flat_map { |segment| process_segment(segment) }
  end

  private

- def pre_process(text)
- text
+ def process_segment(segment)
+ pre_processed = pre_process(segment)
+ cased_segment = chosen_case(pre_processed)
+ @tokens = PostProcessor.new(text: cased_segment, abbreviations: @abbreviations, downcase: @downcase).call
+ post_process_tokens
+ end
+
+ def pre_process(segment)
+ segment
  .extend(PragmaticTokenizer::PreProcessor)
  .pre_process(language: @language_module)
  end

- def post_process(text)
- @tokens = run_post_processor(text)
- remove_various!
+ def post_process_tokens
+ remove_by_options!
  process_numbers!
  process_punctuation!
  expand_contractions! if @expand_contractions
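tokenize now routes each scanned chunk (at most 10,000 characters) through process_segment, which makes the pipeline order explicit: pre_process, then chosen_case, then PostProcessor#call for the initial split, then the option filters below. Calling code is unaffected; a minimal sketch (output illustrative):

    t = PragmaticTokenizer::Tokenizer.new
    # HTML entities are decoded by CGI.unescapeHTML before chunking.
    t.tokenize("Fish &amp; chips")
    # => ["fish", "&", "chips"]
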
@@ -180,45 +138,45 @@ module PragmaticTokenizer
  @tokens.reject(&:empty?)
  end

- def run_post_processor(text)
- PostProcessor.new(
- text: chosen_case(text),
- abbreviations: @abbreviations,
- downcase: @downcase
- ).post_process
- end
-
  def expand_contractions!
- @tokens = @tokens.flat_map { |t| expand_token_contraction(t) }
+ @tokens = @tokens.flat_map { |token| expand_token_contraction(token) }
  end

  def expand_token_contraction(token)
- normalized = inverse_case(token.gsub(REGEX_CONTRACTIONS, "'".freeze))
+ normalized = inverse_case(token.gsub(Regex::CONTRACTIONS, SINGLE_QUOTE))
  return token unless @contractions.key?(normalized)
- result = @contractions[normalized].split(SPACE_STRING)
+ result = @contractions[normalized].split(SPACE)
  result[0] = Unicode.capitalize(result[0]) unless @downcase
  result
  end

  def clean!
  @tokens = @tokens
- .flat_map { |t| t !~ REGEX_HASHTAG_OR_MENTION ? t.split(REGEX_UNIFIED1) : t }
- .map! { |t| t !~ REGEX_HASHTAG_OR_MENTION ? t.gsub(REGEXP_ONE_AS_EXCLAMATION, EMPTY_STRING) : t }
- .map! { |t| t.gsub(REGEX_UNIFIED2, EMPTY_STRING) }
- .delete_if { |t| unclean_token?(t) }
+ .flat_map { |token| split_underscores_asterisk(token) }
+ .map! { |token| remove_irrelevant_characters(token) }
+ .delete_if { |token| many_dots?(token) }
+ end
+
+ def split_underscores_asterisk(token)
+ return token if token =~ Regex::ONLY_HASHTAG_MENTION
+ token.split(Regex::UNDERSCORES_ASTERISK)
  end

- def unclean_token?(token)
- return true if PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(token)
- return true if token.length > MAX_TOKEN_LENGTH
- return true if token.include?('\\'.freeze)
- token =~ REGEXP_CONSECUTIVE_DOTS
+ def remove_irrelevant_characters(token)
+ token.gsub!(Regex::IRRELEVANT_CHARACTERS, NOTHING)
+ return token if token =~ Regex::ONLY_HASHTAG_MENTION
+ token.gsub!(Regex::ENDS_WITH_EXCITED_ONE, NOTHING)
+ token
+ end
+
+ def many_dots?(token)
+ token =~ Regex::MANY_PERIODS
  end

  def classic_filter!
  @tokens.map! do |token|
- token.delete!(DOT_STRING) if @abbreviations.include?(token.chomp(DOT_STRING))
- token.sub!(REGEX_APOSTROPHE_S, EMPTY_STRING)
+ token.delete!(DOT) if @abbreviations.include?(token.chomp(DOT))
+ token.sub!(Regex::ENDS_WITH_APOSTROPHE_AND_S, NOTHING)
  token
  end
  end
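expand_token_contraction normalizes curly apostrophes and similar marks to a plain single quote before looking the token up in the language's CONTRACTIONS map, so "isn’t" and "isn't" expand alike. A sketch (expansion per the English contractions list; output illustrative):

    t = PragmaticTokenizer::Tokenizer.new(expand_contractions: true)
    t.tokenize("Hello, that isn’t mine.")
    # => ["hello", ",", "that", "is", "not", "mine", "."]
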
@@ -226,26 +184,26 @@ module PragmaticTokenizer
  def process_numbers!
  case @numbers
  when :semi
- @tokens.delete_if { |t| t =~ REGEXP_NUMBER_ONLY }
+ @tokens.delete_if { |token| token =~ Regex::ONLY_DECIMALS }
  when :none
- @tokens.delete_if { |t| t =~ REGEXP_NUMBER || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(inverse_case(t)) }
+ @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS_NO_NUMERALS }
  when :only
- @tokens.delete_if { |t| t =~ REGEXP_NO_NUMBERS }
+ @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS }
  end
  end

  def remove_short_tokens!
- @tokens.delete_if { |t| t.length < @minimum_length }
+ @tokens.delete_if { |token| token.length < @minimum_length }
  end

  def process_punctuation!
  case @punctuation
  when :semi
- @tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION.include?(t) }
+ @tokens.delete_if { |token| token =~ Regex::PUNCTUATION4 }
  when :none
- @tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) || t =~ REGEXP_PUNCTUATION_ONLY }
+ @tokens.delete_if { |token| token =~ Regex::ONLY_PUNCTUATION }
  when :only
- @tokens.keep_if { |t| PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
+ @tokens.keep_if { |token| token =~ Regex::ONLY_PUNCTUATION }
  end
  end

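The punctuation and number filters now test anchored regexes instead of set membership; the observable filtering is intended to be equivalent. For example (illustrative):

    t = PragmaticTokenizer::Tokenizer.new(punctuation: :none, numbers: :none)
    t.tokenize("Version 3.0.7 was released!")
    # => ["version", "was", "released"]
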
@@ -256,45 +214,50 @@ module PragmaticTokenizer
  def mentions!
  case @mentions
  when :remove
- @tokens.delete_if { |t| t =~ REGEXP_AT_SIGN_AT_START }
+ @tokens.delete_if { |token| token =~ Regex::ONLY_MENTION }
  when :keep_and_clean
- @tokens.map! { |t| t =~ REGEXP_AT_SIGN_AT_START ? t.gsub!(REGEXP_AT_SIGN_AT_START, EMPTY_STRING) : t }
+ @tokens.map! { |token| token =~ Regex::ONLY_MENTION ? token[1..-1] : token }
  end
  end

  def hashtags!
  case @hashtags
  when :remove
- @tokens.delete_if { |t| t =~ REGEXP_HASHTAG_AT_START }
+ @tokens.delete_if { |token| token =~ Regex::ONLY_HASHTAG }
  when :keep_and_clean
- @tokens = @tokens
- .flat_map { |t| t =~ REGEXP_HYPHEN_HASTAG ? t.split(REGEX_HYPHEN) : t }
- .map { |t| t =~ REGEXP_HASHTAG_AT_START ? t.gsub!(REGEXP_HASHTAG_AT_START, EMPTY_STRING) : t }
+ @tokens.map! { |token| token =~ Regex::ONLY_HASHTAG ? token[1..-1] : token }
  end
  end

- def remove_various!
- @tokens.delete_if { |t| t =~ regex_various }
+ def remove_by_options!
+ @tokens.delete_if { |token| token =~ regex_by_options }
  end

- def regex_various
- @regex_various ||= begin
+ def regex_by_options
+ @regex_by_options ||= begin
  regex_array = []
- regex_array << REGEX_EMOJI_UNIFIED if @remove_emoji
- regex_array << REGEX_EMAIL if @remove_emails
- regex_array << REGEX_URL if @remove_urls
- regex_array << REGEX_DOMAIN if @remove_domains
+ regex_array << Regex::RANGE_UNUSUAL_AND_EMOJI if @remove_emoji
+ regex_array << Regex::ONLY_EMAIL if @remove_emails
+ regex_array << Regex::STARTS_WITH_HTTP if @remove_urls
+ regex_array << Regex::ONLY_DOMAIN2 if @remove_domains
  Regexp.union(regex_array)
  end
  end

  def split_long_words!
- @tokens = @tokens
- .flat_map { |t| (t.length > @long_word_split && t !~ REGEXP_SPLIT_CHECK ) ? t.split(REGEX_LONG_WORD) : t }
+ @tokens = @tokens.flat_map { |token| split_long_word(token) }
+ end
+
+ def split_long_word(token)
+ return token unless @long_word_split
+ return token if token.length <= @long_word_split
+ return token if token =~ Regex::ONLY_HASHTAG_MENTION
+ return token if token =~ Regex::DOMAIN_OR_EMAIL
+ token.split(Regex::HYPHEN_OR_UNDERSCORE)
  end

- def chosen_case(txt)
- @downcase ? Unicode.downcase(txt) : txt
+ def chosen_case(text)
+ @downcase ? Unicode.downcase(text) : text
  end

  def inverse_case(token)
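:keep_and_clean now strips the leading marker by slicing (token[1..-1]) rather than gsub!, and the old special case that split hyphenated hashtags is gone. For example (illustrative):

    t = PragmaticTokenizer::Tokenizer.new(mentions: :keep_and_clean, hashtags: :remove)
    t.tokenize("@maria loves #nlproc")
    # => ["maria", "loves"]
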
data/lib/pragmatic_tokenizer/version.rb CHANGED
@@ -1,3 +1,3 @@
  module PragmaticTokenizer
- VERSION = "3.0.6".freeze
+ VERSION = "3.0.7".freeze
  end
data/pragmatic_tokenizer.gemspec CHANGED
@@ -1,5 +1,4 @@
- # coding: utf-8
- lib = File.expand_path('../lib', __FILE__)
+ lib = File.expand_path('lib', __dir__)
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
  require 'pragmatic_tokenizer/version'

@@ -9,9 +8,9 @@ Gem::Specification.new do |spec|
  spec.authors = ["Kevin S. Dias"]
  spec.email = ["diasks2@gmail.com"]

- spec.summary = %q{A multilingual tokenizer}
- spec.description = %q{A multilingual tokenizer to split a string into tokens.}
- spec.homepage = "https://github.com/diasks2/pragmatic_tokenizer"
+ spec.summary = 'A multilingual tokenizer'
+ spec.description = 'A multilingual tokenizer to split a string into tokens.'
+ spec.homepage = 'https://github.com/diasks2/pragmatic_tokenizer'

  spec.files = `git ls-files -z`.split("\x0")
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
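The gemspec and spec_helper swap the pre-Ruby-2.0 __FILE__ idiom for __dir__; both forms resolve to the same directory:

    File.expand_path('../lib', __FILE__) # relative to this file's path, hence the '../'
    File.expand_path('lib', __dir__)     # relative to this file's directory
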
data/spec/performance_spec.rb CHANGED
@@ -1,4 +1,3 @@
- # -*- encoding : utf-8 -*-
  require 'benchmark'
  require 'spec_helper'
  require 'stackprof'
data/spec/spec_helper.rb CHANGED
@@ -1,2 +1,2 @@
- $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
+ $LOAD_PATH.unshift File.expand_path('../lib', __dir__)
  require 'pragmatic_tokenizer'
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: pragmatic_tokenizer
  version: !ruby/object:Gem::Version
- version: 3.0.6
+ version: 3.0.7
  platform: ruby
  authors:
  - Kevin S. Dias
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2018-03-15 00:00:00.000000000 Z
+ date: 2018-03-18 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: unicode
@@ -111,7 +111,6 @@ files:
  - README.md
  - Rakefile
  - lib/pragmatic_tokenizer.rb
- - lib/pragmatic_tokenizer/full_stop_separator.rb
  - lib/pragmatic_tokenizer/languages.rb
  - lib/pragmatic_tokenizer/languages/arabic.rb
  - lib/pragmatic_tokenizer/languages/bulgarian.rb
@@ -140,6 +139,7 @@ files:
  - lib/pragmatic_tokenizer/languages/turkish.rb
  - lib/pragmatic_tokenizer/post_processor.rb
  - lib/pragmatic_tokenizer/pre_processor.rb
+ - lib/pragmatic_tokenizer/regex.rb
  - lib/pragmatic_tokenizer/tokenizer.rb
  - lib/pragmatic_tokenizer/version.rb
  - pragmatic_tokenizer.gemspec
data/lib/pragmatic_tokenizer/full_stop_separator.rb DELETED
@@ -1,58 +0,0 @@
- # -*- encoding : utf-8 -*-
-
- module PragmaticTokenizer
- # This class separates true full stops while ignoring
- # periods that are part of an abbreviation
- class FullStopSeparator
-
- REGEXP_ENDS_WITH_DOT = /\A(.*\w)\.\z/
- REGEXP_ONLY_LETTERS = /\A[a-z]\z/i
- REGEXP_ABBREVIATION = /[a-z](?:\.[a-z])+\z/i
- DOT = '.'.freeze
-
- def initialize(tokens:, abbreviations:, downcase:)
- @tokens = tokens
- @abbreviations = abbreviations
- @downcase = downcase
- end
-
- def separate
- @cleaned_tokens = create_cleaned_tokens
- replace_last_token unless @cleaned_tokens.empty?
- @cleaned_tokens
- end
-
- private
-
- def create_cleaned_tokens
- @tokens[0..-2]
- .flat_map { |token| abbreviation?(token) ? [token[0..-2], DOT] : token }
- .push(@tokens.last)
- end
-
- def abbreviation?(token)
- return false unless token.end_with?(DOT) && token.length > 1
- shortened = token.chomp(DOT)
- !defined_abbreviation?(shortened) && shortened !~ REGEXP_ONLY_LETTERS && shortened !~ REGEXP_ABBREVIATION
- end
-
- def defined_abbreviation?(token)
- @abbreviations.include?(inverse_case(token))
- end
-
- def inverse_case(token)
- @downcase ? token : Unicode.downcase(token)
- end
-
- def replace_last_token
- last_token = @cleaned_tokens[-1]
- return unless last_token.end_with?(DOT) && last_token.length > 1
- shortened = last_token.chomp(DOT)
- return if defined_abbreviation?(shortened) || last_token !~ REGEXP_ENDS_WITH_DOT
- @cleaned_tokens[-1] = Regexp.last_match(1)
- @cleaned_tokens << DOT
- end
-
- end
-
- end
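With this class and its require gone, separating a sentence-final period from the last token while leaving abbreviation periods attached presumably moved into the rewritten PostProcessor. The behavior it implemented, for reference (abbreviation list per the English defaults; output illustrative):

    t = PragmaticTokenizer::Tokenizer.new(language: :en)
    t.tokenize("Mr. Smith arrived.")
    # => ["mr.", "smith", "arrived", "."]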