pragmatic_tokenizer 3.0.3 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +5 -5
  2. data/README.md +1 -1
  3. data/lib/pragmatic_tokenizer/languages.rb +26 -26
  4. data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
  5. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
  6. data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
  7. data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
  8. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  9. data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
  10. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  11. data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
  12. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  13. data/lib/pragmatic_tokenizer/languages/french.rb +36 -9
  14. data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
  15. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  16. data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
  17. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  19. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  20. data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
  21. data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
  22. data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
  23. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  25. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  26. data/lib/pragmatic_tokenizer/post_processor.rb +42 -88
  27. data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
  28. data/lib/pragmatic_tokenizer/regex.rb +149 -0
  29. data/lib/pragmatic_tokenizer/tokenizer.rb +82 -116
  30. data/lib/pragmatic_tokenizer/version.rb +1 -1
  31. data/pragmatic_tokenizer.gemspec +5 -6
  32. data/spec/languages/deutsch_spec.rb +1 -1
  33. data/spec/languages/english_spec.rb +52 -0
  34. data/spec/languages/french_spec.rb +2 -2
  35. data/spec/performance_spec.rb +1 -1
  36. data/spec/spec_helper.rb +1 -1
  37. metadata +8 -8
  38. data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -63
data/lib/pragmatic_tokenizer/regex.rb
@@ -0,0 +1,149 @@
+module PragmaticTokenizer
+  class Regex
+
+    # Things that can or should be done:
+    # - check where the use of unicode categories helps (\p{Abbreviation})
+    # - use URI.parse and other libraries instead of regexp to identify urls, domains, emails
+    # - check multiple domain regex, we have spec issues when using one or the other
+    # - check multiple punctuation regex
+
+    # Text that needs to be tokenized is initially split into chunks of this length:
+    CHUNK_LONG_INPUT_TEXT = /\S.{1,10000}(?!\S)/m
+
+    # Ranges
+    RANGE_DINGBATS = /[\u2701-\u27BE]/ # e.g. ✁✎✳❄➾
+    RANGE_VARIATION_SELECTORS = /[\uFE00-\uFE0F]/ # alter the previous character
+    RANGE_FULLWIDTH = /[\uFF01-\ufF1F]/ # e.g. ！＂＃＇？
+    RANGE_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
+    RANGE_UNUSUAL_AND_EMOJI = /[\u203C-\u3299\u{1F000}-\u{1F644}]/
+
+    # Regular expressions which do not need to capture anything are enclosed in /(?: … )/ to enhance performance
+    COLON1 = /(?:(:)([[:print:]]{2,}))/ # two non-space after colon prevent matching emoticons
+    COLON2 = /(?::)/
+    COMMAS = /(?:([,‚])+)/
+    ENCLOSED_PLUS = /(?:([[:print:]]+)\+([[:print:]]+))/
+    EMAIL = /(?:[[:print:]]+[@＠][[:print:]]+\.[[:print:]]+)/
+    DIGIT = /(?:[[:digit:]]+)/
+    ASTERISK = /(?:\*+)/
+    UNDERSCORE = /(?:_+)/
+    HYPHEN_OR_UNDERSCORE = /(?:[-_])/
+    PERIOD_AND_PRIOR = /(?:(.+\.))/
+    PERIOD_ONLY = /(?:(\.))/
+    CONTRACTIONS = /(?:[‘’‚‛‹›'´`])/
+    PUNCTUATION1 = /(?:([\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Ps}])+)/ # all punctuation categories except Pc (Connector) and Po (other)
+    PUNCTUATION2 = /(?:(?<=\S)([!?#{RANGE_FULLWIDTH.source}]+))/
+    PUNCTUATION3 = /(?:[!%\-–\u00AD]+)/
+    PUNCTUATION4 = /(?:[.．。]+)/
+    DINGBATS = /(?:(#{RANGE_DINGBATS.source}#{RANGE_VARIATION_SELECTORS.source}*+))/
+    NO_BREAK_SPACE = /(?:\u00A0+)/
+    HTTP = /(?:https?:\/\/)/
+    TIME_WITH_COLON = /(?:\d:\d)/
+    DOMAIN_PREFIX = /(?:https?:\/\/|www\.|[[:alpha:]]\.)/
+    DOMAIN_SUFFIX = /(?:[[:alpha:]]\.(?:com|net|org|edu|gov|mil|int|[[:alpha:]]{2}))/
+    DOMAIN1 = /(?:((https?:\/\/|)[[:print:]]+\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?))/
+    DOMAIN2 = /(?:[[:alnum:]]{2,}([\-.][[:alnum:]]+)*\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?)/
+    NOT_URL = /(?:^(?!#{DOMAIN_PREFIX.source})([[:print:]]*))/
+    HASHTAG_OR_MENTION = /(?:[@#＠＃][[:print:]]+)/
+    HASHTAG = /(?:[#＃][[:print:]]+)/
+    MENTION = /(?:[@＠][[:print:]]+)/
+    HASHTAG_WITH_HYPHEN = /(?:^([#＃][[:digit:]]+)-)/
+    ONE_AS_EXCLAMATION = /(?:\D1+)/
+    ONES_EXCLAMATIONS = /(?:!+(1*+!*+)*+)/
+    MANY_PERIODS = /(?:^\.{2,}$)/
+    COPYRIGHT_TRADEMARK = /(?:[®©™]+)/
+    CONTROL_CHARACTER = /(?:[[:cntrl:]]+)/ # matches any character with hexadecimal value 00 through 1F or 7F.
+    APOSTROPHE_AND_S = /(?:['’`́]s)/
+    ALSO_DECIMALS = /(?:[[:alpha:]]*+[[:digit:]]+)/
+    ACUTE_ACCENT_S = /(?:\s\u0301(?=s))/
+
+    # Regular expressions used to capture items
+    CAPTURE_UNUSUAL_AND_EMOJI = /(#{RANGE_UNUSUAL_AND_EMOJI.source})/
+    QUESTION_MARK_NOT_URL = /#{NOT_URL.source}(\?)/
+    # Should we change specs and also capture "/", just like we capture ":" and "?"
+    SLASH_NOT_URL = /#{NOT_URL.source}\//
+    SHIFT_BOUNDARY_CHARACTERS = /([;^&|…«»„“¿¡≠]+)/
+    MULTIPLE_DOTS = /(\.{2,})/ # we keep all dashes
+    MULTIPLE_DASHES = /(-){2,}/ # we only keep first dash
+    BRACKET = /([{}()\[\]])/
+    EXCLAMATION_BETWEEN_ALPHA = /(?<=[[:alpha:]])(!)(?=[[:alpha:]])/
+    PERCENT_BEFORE_DIGIT = /(%)\d+/
+    COMMA_BEFORE_NON_DIGIT = /(,)(?=\D)/
+    COMMA_AFTER_NON_DIGIT = /(?<=\D)(,)/
+    COLON_IN_URL = /(?<=[(https?|ftp)]):(?=\/\/)/
+    QUOTE_BEFORE_PRINT = /(('')|["“])(?=[[:print:]])/
+    QUOTE = /('')|["”]/
+    HYPHEN_AFTER_NON_WORD = /(?<=\W)(-)/
+    HYPHEN_BEFORE_NON_WORD = /(-)(?=\W)/
+
+    STARTS_WITH_COMMAS = /^#{COMMAS.source}/
+    STARTS_WITH_HTTP = /^#{HTTP.source}/
+    STARTS_WITH_DOMAIN = /^#{DOMAIN_PREFIX.source}/
+    STARTS_WITH_COLON1 = /^#{COLON1.source}/
+    STARTS_WITH_UNDERSCORE = /^#{UNDERSCORE.source}/
+    STARTS_WITH_PUNCTUATION3 = /^#{PUNCTUATION3.source}/
+
+    ENDS_WITH_DOMAIN = /#{DOMAIN_SUFFIX.source}$/
+    ENDS_WITH_PUNCTUATION1 = /#{PUNCTUATION1.source}$/
+    ENDS_WITH_PUNCTUATION2 = /#{PUNCTUATION2.source}$/
+    ENDS_WITH_COLON2 = /#{COLON2.source}$/
+    ENDS_WITH_UNDERSCORE = /#{UNDERSCORE.source}$/
+    ENDS_WITH_ONES_EXCLAMATIONS = /#{ONES_EXCLAMATIONS.source}$/
+    ENDS_WITH_EXCITED_ONE = /#{ONE_AS_EXCLAMATION.source}$/
+    ENDS_WITH_APOSTROPHE_AND_S = /#{APOSTROPHE_AND_S.source}$/
+    ENDS_WITH_ALPHA = /[[:alpha:]]$/
+    ENDS_WITH_DIGIT = /[[:digit:]]$/
+
+    ONLY_DECIMALS = /(?:^[[:digit:]]+$)/
+    NO_DECIMALS = /(?:^\D+$)/
+    ONLY_PUNCTUATION = /^[[[:punct:]]^|+]+$/
+    ONLY_ROMAN_NUMERALS = /^(?=[MDCLXVI])M*(C[MD]|D?C*)(X[CL]|L?X*)(I[XV]|V?I*)$/i
+    ONLY_EMAIL = /^#{EMAIL}$/
+    ONLY_HASHTAG_MENTION = /^#{HASHTAG_OR_MENTION}$/
+    ONLY_HASHTAG = /^#{HASHTAG}$/
+    ONLY_MENTION = /^#{MENTION}$/
+    ONLY_DOMAIN1 = /^#{DOMAIN1}$/
+    ONLY_DOMAIN2 = /^#{DOMAIN2}$/
+    ONLY_DOMAIN3 = Regexp.union(STARTS_WITH_DOMAIN, ENDS_WITH_DOMAIN)
+    DOMAIN_OR_EMAIL = Regexp.union(ONLY_DOMAIN1, ONLY_EMAIL)
+    UNDERSCORES_ASTERISK = Regexp.union(STARTS_WITH_UNDERSCORE, ENDS_WITH_UNDERSCORE, ASTERISK)
+    NO_DECIMALS_NO_NUMERALS = Regexp.union(ALSO_DECIMALS, ONLY_ROMAN_NUMERALS)
+
+    COMMAS_OR_PUNCTUATION = Regexp.union(
+        STARTS_WITH_COMMAS,
+        ENDS_WITH_PUNCTUATION1,
+        ENDS_WITH_PUNCTUATION2
+    )
+
+    # Can this constant name be clarified?
+    VARIOUS = Regexp.union(
+        SLASH_NOT_URL,
+        QUESTION_MARK_NOT_URL,
+        ENCLOSED_PLUS,
+        STARTS_WITH_COLON1,
+        DINGBATS,
+        HASHTAG_WITH_HYPHEN,
+        CAPTURE_UNUSUAL_AND_EMOJI
+    )
+
+    IRRELEVANT_CHARACTERS = Regexp.union(
+        STARTS_WITH_PUNCTUATION3,
+        ENDS_WITH_COLON2,
+        ENDS_WITH_ONES_EXCLAMATIONS,
+        CONTROL_CHARACTER,
+        COPYRIGHT_TRADEMARK,
+        RANGE_ALPHANUMERIC_SUPPLEMENT
+    )
+
+    PRE_PROCESS = Regexp.union(
+        SHIFT_BOUNDARY_CHARACTERS,
+        MULTIPLE_DOTS,
+        BRACKET,
+        MULTIPLE_DASHES,
+        EXCLAMATION_BETWEEN_ALPHA,
+        PERCENT_BEFORE_DIGIT,
+        COMMA_BEFORE_NON_DIGIT,
+        COMMA_AFTER_NON_DIGIT
+    )
+
+  end
+end
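Note on the composition style used throughout the new Regex class: `#{PATTERN.source}` interpolates an existing pattern's body so anchors can be wrapped around it, and `Regexp.union` OR-joins finished patterns. A minimal standalone sketch of the idiom, using constants copied from the file above (plain Ruby, no gem required):

    # Mirrors UNDERSCORE/ASTERISK and their derived constants in regex.rb
    UNDERSCORE             = /(?:_+)/
    ASTERISK               = /(?:\*+)/
    STARTS_WITH_UNDERSCORE = /^#{UNDERSCORE.source}/  # .source drops the slashes, so ^ anchors the body
    ENDS_WITH_UNDERSCORE   = /#{UNDERSCORE.source}$/
    UNDERSCORES_ASTERISK   = Regexp.union(STARTS_WITH_UNDERSCORE, ENDS_WITH_UNDERSCORE, ASTERISK)

    p "__hello*world__".split(UNDERSCORES_ASTERISK)
    # => ["", "hello", "world"]

The leading empty string is expected: the tokenizer's post-processing ends with `@tokens.reject(&:empty?)` (see the tokenizer.rb diff below), which discards it.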
data/lib/pragmatic_tokenizer/tokenizer.rb
@@ -1,69 +1,22 @@
-# -*- encoding : utf-8 -*-
 require 'set'
 require 'cgi'
+require 'pragmatic_tokenizer/regex'
 require 'pragmatic_tokenizer/languages'
 require 'pragmatic_tokenizer/pre_processor'
 require 'pragmatic_tokenizer/post_processor'
-require 'pragmatic_tokenizer/full_stop_separator'
 require 'unicode'
 
 module PragmaticTokenizer
   class Tokenizer
 
-    PUNCTIATION_OPTIONS = Set.new([:all, :semi, :none, :only]).freeze
-    NUMBERS_OPTIONS = Set.new([:all, :semi, :none, :only]).freeze
-    MENTIONS_OPTIONS = Set.new([:keep_original, :keep_and_clean, :remove]).freeze
+    PUNCTUATION_OPTIONS = Set.new(%i[all semi none only]).freeze
+    NUMBERS_OPTIONS = Set.new(%i[all semi none only]).freeze
+    MENTIONS_OPTIONS = Set.new(%i[keep_original keep_and_clean remove]).freeze
     MAX_TOKEN_LENGTH = 50
-    EMPTY_STRING = ''.freeze
-    DOT_STRING = '.'.freeze
-    SPACE_STRING = ' '.freeze
-    REGEX_DOMAIN = /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix
-    REGEX_URL = /(http|https)(\.|:)/
-    REGEX_HYPHEN = /\-/
-    REGEX_UNDERSCORE = /\_/
-    REGEX_CONTRACTIONS = /[‘’‚‛‹›'´`]/
-    REGEX_APOSTROPHE_S = /['’`́]s$/
-    REGEX_EMAIL = /\S+(@|＠)\S+\.\S+/
-    REGEX_HASHTAG_OR_MENTION = /[@＠#|＃]/
-    REGEX_UNDERSCORE_AT_START = /(?<=\A)\_+/
-    REGEX_UNDERSCORE_AT_END = /\_+(?=\z)/
-    REGEX_ASTERISK = /\*+/
-    REGEX_UNIFIED1 = Regexp.union(REGEX_UNDERSCORE_AT_START,
-                                  REGEX_UNDERSCORE_AT_END,
-                                  REGEX_ASTERISK)
-    # https://en.wikipedia.org/wiki/Control_character
-    # matches any character with hexadecimal value 00 through 1F or 7F.
-    # Rubular: http://rubular.com/r/E83fpBoDjI
-    REGEXP_CONTROL = /[[:cntrl:]]/
-    REGEXP_ENDING_COLON = /\:(?=\z)/
-    REGEXP_EXCLAMATION_AT_START = /(?<=\A)!+(?=.+)/
-    REGEXP_EXCLAMATION_AT_END = /!+(1*!*)*(?=\z)/
-    REGEXP_HYPHEN_AT_START = /\A(-|–|\u{00AD})/
-    REGEXP_SPECIAL_SYMBOL = /[®©]/
-    REGEXP_PERCENT_AT_START = /\A\%/
-    # https://codepoints.net/enclosed_alphanumeric_supplement
-    REGEXP_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
-    REGEX_UNIFIED2 = Regexp.union(REGEXP_CONTROL,
-                                  REGEXP_ENDING_COLON,
-                                  REGEXP_EXCLAMATION_AT_START,
-                                  REGEXP_EXCLAMATION_AT_END,
-                                  REGEXP_HYPHEN_AT_START,
-                                  REGEXP_SPECIAL_SYMBOL,
-                                  REGEXP_PERCENT_AT_START,
-                                  REGEXP_ALPHANUMERIC_SUPPLEMENT)
-    REGEXP_ONE_AS_EXCLAMATION = /(?<=\D)1+(?=\z)/
-    REGEXP_HASHTAG_AT_START = /(?<=\A)(#|＃)/
-    REGEXP_AT_SIGN_AT_START = /(?<=\A)(@|＠)/
-    REGEXP_HYPHEN_HASTAG = /\A(#|＃)\S+-/
-    REGEXP_EMOJI_SNOWFLAKE = /\u{2744}[\u{FE0F}|\u{FE0E}]?/
-    REGEX_EMOJI_UNIFIED = Regexp.union(REGEXP_EMOJI_SNOWFLAKE,
-                                       PragmaticTokenizer::Languages::Common::EMOJI_REGEX)
-    REGEXP_PUNCTUATION_ONLY = /\A[[:punct:]]+\z/
-    REGEXP_NUMBER_ONLY = /\A\d+\z/
-    REGEXP_NO_NUMBERS = /\A\D+\z/
-    REGEXP_NUMBER = /\D*\d+\d*/
-    REGEXP_CONSECUTIVE_DOTS = /\A\.{2,}\z/
-    REGEXP_CHUNK_STRING = /.{,10000}(?=\s|\z)/m
+    NOTHING = ''.freeze
+    DOT = '.'.freeze
+    SPACE = ' '.freeze
+    SINGLE_QUOTE = "'".freeze
 
     # @param [Hash] opts optional arguments
 
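The chunking change here is more than a rename: the old `REGEXP_CHUNK_STRING = /.{,10000}(?=\s|\z)/m` permitted zero-length matches (`{,10000}` means 0 to 10,000), whereas the new `Regex::CHUNK_LONG_INPUT_TEXT = /\S.{1,10000}(?!\S)/m` forces every chunk to start with a non-space and stop at a whitespace boundary. A toy demonstration with the window shrunk to 10 characters (plain Ruby; the 10 is illustrative only):

    # Same shape as CHUNK_LONG_INPUT_TEXT, with a 10-character window for readability
    chunk = /\S.{1,10}(?!\S)/m

    p "the quick brown fox jumps".scan(chunk)
    # => ["the quick", "brown fox", "jumps"]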
@@ -123,10 +76,10 @@ module PragmaticTokenizer
       @abbreviations = Set.new(opts[:abbreviations])
       @stop_words = Set.new(opts[:stop_words])
 
-      # TODO: why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
+      # Why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
       @contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
       @abbreviations += @language_module::ABBREVIATIONS if @abbreviations.empty?
-      @stop_words += @language_module::STOP_WORDS if @stop_words.empty? && @filter_languages.empty?
+      @stop_words += @language_module::STOP_WORDS if @stop_words.empty?
 
       @filter_languages.each do |lang|
         language = Languages.get_language_by_code(lang)
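A behavior change worth flagging: in 3.0.3 the `@filter_languages.empty?` guard skipped the primary language's stop words as soon as `filter_languages` was given; in 3.1.0 they merge whenever the caller supplies no custom list, and the `@filter_languages` loop adds each filtered language's list on top. A hedged usage sketch (option names as documented in the gem's README; exact output depends on the bundled stop-word lists):

    require 'pragmatic_tokenizer'

    tokenizer = PragmaticTokenizer::Tokenizer.new(
      language:          :en,   # its STOP_WORDS now merge in even with filter_languages set
      filter_languages:  [:de], # German stop words are added on top
      remove_stop_words: true
    )

    p tokenizer.tokenize("this is ein kleines Haus")
    # expected: English and German stop words removed, e.g. ["kleines", "haus"]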
@@ -135,34 +88,43 @@ module PragmaticTokenizer
         @stop_words += language::STOP_WORDS
       end
 
-      raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTIATION_OPTIONS.include?(@punctuation)
+      raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTUATION_OPTIONS.include?(@punctuation)
       raise "Numbers argument can be only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
       raise "Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)
-      raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == Fixnum || @minimum_length.nil?
-      raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == Fixnum || @long_word_split.nil?
+
+      integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer
+
+      raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == integer_class || @minimum_length.nil?
+      raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == integer_class || @long_word_split.nil?
     end
 
     # @param [String] text to be tokenized
 
     def tokenize(text)
       return [] unless text
-      raise "In Pragmatic Tokenizer text must be a String" unless text.class == String
+      raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
       CGI.unescapeHTML(text)
-         .scan(REGEXP_CHUNK_STRING)
-         .flat_map { |segment| post_process(pre_process(segment)) }
+         .scan(Regex::CHUNK_LONG_INPUT_TEXT)
+         .flat_map { |segment| process_segment(segment) }
     end
 
     private
 
-    def pre_process(text)
-      text
+    def process_segment(segment)
+      pre_processed = pre_process(segment)
+      cased_segment = chosen_case(pre_processed)
+      @tokens = PostProcessor.new(text: cased_segment, abbreviations: @abbreviations, downcase: @downcase).call
+      post_process_tokens
+    end
+
+    def pre_process(segment)
+      segment
         .extend(PragmaticTokenizer::PreProcessor)
         .pre_process(language: @language_module)
     end
 
-    def post_process(text)
-      @tokens = run_post_processor(text)
-      remove_various!
+    def post_process_tokens
+      remove_by_options!
       process_numbers!
       process_punctuation!
       expand_contractions! if @expand_contractions
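With chunking moved into `Regex`, `tokenize` now reads as a three-step pipeline: unescape HTML, scan the text into chunks, and hand each chunk to `process_segment` (pre-process, apply casing, run `PostProcessor#call`, then the token-level filters below). A quick usage sketch (expected output based on the gem's documented defaults of downcasing and keeping punctuation):

    require 'pragmatic_tokenizer'

    p PragmaticTokenizer::Tokenizer.new(language: :en).tokenize("Hello, world. It's a test.")
    # expected: ["hello", ",", "world", ".", "it's", "a", "test", "."]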
@@ -176,45 +138,45 @@ module PragmaticTokenizer
       @tokens.reject(&:empty?)
     end
 
-    def run_post_processor(text)
-      PostProcessor.new(
-          text: chosen_case(text),
-          abbreviations: @abbreviations,
-          downcase: @downcase
-      ).post_process
-    end
-
     def expand_contractions!
-      @tokens = @tokens.flat_map { |t| expand_token_contraction(t) }
+      @tokens = @tokens.flat_map { |token| expand_token_contraction(token) }
     end
 
     def expand_token_contraction(token)
-      normalized = inverse_case(token.gsub(REGEX_CONTRACTIONS, "'".freeze))
+      normalized = inverse_case(token.gsub(Regex::CONTRACTIONS, SINGLE_QUOTE))
       return token unless @contractions.key?(normalized)
-      result = @contractions[normalized].split(SPACE_STRING)
+      result = @contractions[normalized].split(SPACE)
       result[0] = Unicode.capitalize(result[0]) unless @downcase
       result
     end
 
     def clean!
       @tokens = @tokens
-                .flat_map { |t| t !~ REGEX_HASHTAG_OR_MENTION ? t.split(REGEX_UNIFIED1) : t }
-                .map! { |t| t !~ REGEX_HASHTAG_OR_MENTION ? t.gsub(REGEXP_ONE_AS_EXCLAMATION, EMPTY_STRING) : t }
-                .map! { |t| t.gsub(REGEX_UNIFIED2, EMPTY_STRING) }
-                .delete_if { |t| unclean_token?(t) }
+                .flat_map { |token| split_underscores_asterisk(token) }
+                .map! { |token| remove_irrelevant_characters(token) }
+                .delete_if { |token| many_dots?(token) }
     end
 
-    def unclean_token?(token)
-      return true if PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(token)
-      return true if token.length > MAX_TOKEN_LENGTH
-      return true if token.include?('\\'.freeze)
-      token =~ REGEXP_CONSECUTIVE_DOTS
+    def split_underscores_asterisk(token)
+      return token if token =~ Regex::ONLY_HASHTAG_MENTION
+      token.split(Regex::UNDERSCORES_ASTERISK)
+    end
+
+    def remove_irrelevant_characters(token)
+      token.gsub!(Regex::IRRELEVANT_CHARACTERS, NOTHING)
+      return token if token =~ Regex::ONLY_HASHTAG_MENTION
+      token.gsub!(Regex::ENDS_WITH_EXCITED_ONE, NOTHING)
+      token
+    end
+
+    def many_dots?(token)
+      token =~ Regex::MANY_PERIODS
     end
 
     def classic_filter!
       @tokens.map! do |token|
-        token.delete!(DOT_STRING) if @abbreviations.include?(token.chomp(DOT_STRING))
-        token.sub!(REGEX_APOSTROPHE_S, EMPTY_STRING)
+        token.delete!(DOT) if @abbreviations.include?(token.chomp(DOT))
+        token.sub!(Regex::ENDS_WITH_APOSTROPHE_AND_S, NOTHING)
         token
       end
     end
222
184
  def process_numbers!
223
185
  case @numbers
224
186
  when :semi
225
- @tokens.delete_if { |t| t =~ REGEXP_NUMBER_ONLY }
187
+ @tokens.delete_if { |token| token =~ Regex::ONLY_DECIMALS }
226
188
  when :none
227
- @tokens.delete_if { |t| t =~ REGEXP_NUMBER || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(inverse_case(t)) }
189
+ @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS_NO_NUMERALS }
228
190
  when :only
229
- @tokens.delete_if { |t| t =~ REGEXP_NO_NUMBERS }
191
+ @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS }
230
192
  end
231
193
  end
232
194
 
233
195
  def remove_short_tokens!
234
- @tokens.delete_if { |t| t.length < @minimum_length }
196
+ @tokens.delete_if { |token| token.length < @minimum_length }
235
197
  end
236
198
 
237
199
  def process_punctuation!
238
200
  case @punctuation
239
201
  when :semi
240
- @tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION.include?(t) }
202
+ @tokens.delete_if { |token| token =~ Regex::PUNCTUATION4 }
241
203
  when :none
242
- @tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) || t =~ REGEXP_PUNCTUATION_ONLY }
204
+ @tokens.delete_if { |token| token =~ Regex::ONLY_PUNCTUATION }
243
205
  when :only
244
- @tokens.keep_if { |t| PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
206
+ @tokens.keep_if { |token| token =~ Regex::ONLY_PUNCTUATION }
245
207
  end
246
208
  end
247
209
 
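`numbers: :none` previously combined an unanchored digit regex with a lookup in a Roman-numerals list; it now tests the single union `Regex::NO_DECIMALS_NO_NUMERALS` (tokens containing a digit, or tokens that are entirely a Roman numeral). The punctuation cases likewise switch from set membership in `Languages::Common` to pattern matches. A hedged sketch of the options (expected output is illustrative):

    require 'pragmatic_tokenizer'

    p PragmaticTokenizer::Tokenizer.new(numbers: :none, punctuation: :none)
       .tokenize("Call 911 about Chapter VII on page 10.")
    # expected: ["call", "about", "chapter", "on", "page"]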
@@ -252,46 +214,50 @@ module PragmaticTokenizer
     def mentions!
       case @mentions
       when :remove
-        @tokens.delete_if { |t| t =~ REGEXP_AT_SIGN_AT_START }
+        @tokens.delete_if { |token| token =~ Regex::ONLY_MENTION }
       when :keep_and_clean
-        @tokens.map! { |t| t =~ REGEXP_AT_SIGN_AT_START ? t.gsub!(REGEXP_AT_SIGN_AT_START, EMPTY_STRING) : t }
+        @tokens.map! { |token| token =~ Regex::ONLY_MENTION ? token[1..-1] : token }
       end
     end
 
     def hashtags!
       case @hashtags
       when :remove
-        @tokens.delete_if { |t| t =~ REGEXP_HASHTAG_AT_START }
+        @tokens.delete_if { |token| token =~ Regex::ONLY_HASHTAG }
       when :keep_and_clean
-        @tokens = @tokens
-                  .flat_map { |t| t =~ REGEXP_HYPHEN_HASTAG ? t.split(REGEX_HYPHEN) : t }
-                  .map { |t| t =~ REGEXP_HASHTAG_AT_START ? t.gsub!(REGEXP_HASHTAG_AT_START, EMPTY_STRING) : t }
+        @tokens.map! { |token| token =~ Regex::ONLY_HASHTAG ? token[1..-1] : token }
      end
    end
 
-    def remove_various!
-      @tokens.delete_if { |t| t =~ regex_various }
+    def remove_by_options!
+      @tokens.delete_if { |token| token =~ regex_by_options }
     end
 
-    def regex_various
-      @regex_various ||= begin
+    def regex_by_options
+      @regex_by_options ||= begin
         regex_array = []
-        regex_array << REGEX_EMOJI_UNIFIED if @remove_emoji
-        regex_array << REGEX_EMAIL if @remove_emails
-        regex_array << REGEX_URL if @remove_urls
-        regex_array << REGEX_DOMAIN if @remove_domains
+        regex_array << Regex::RANGE_UNUSUAL_AND_EMOJI if @remove_emoji
+        regex_array << Regex::ONLY_EMAIL if @remove_emails
+        regex_array << Regex::STARTS_WITH_HTTP if @remove_urls
+        regex_array << Regex::ONLY_DOMAIN2 if @remove_domains
         Regexp.union(regex_array)
       end
     end
 
     def split_long_words!
-      @tokens = @tokens
-                .flat_map { |t| t.length > @long_word_split ? t.split(REGEX_HYPHEN) : t }
-                .flat_map { |t| t.length > @long_word_split ? t.split(REGEX_UNDERSCORE) : t }
+      @tokens = @tokens.flat_map { |token| split_long_word(token) }
+    end
+
+    def split_long_word(token)
+      return token unless @long_word_split
+      return token if token.length <= @long_word_split
+      return token if token =~ Regex::ONLY_HASHTAG_MENTION
+      return token if token =~ Regex::DOMAIN_OR_EMAIL
+      token.split(Regex::HYPHEN_OR_UNDERSCORE)
     end
 
-    def chosen_case(token)
-      @downcase ? Unicode.downcase(token) : token
+    def chosen_case(text)
+      @downcase ? Unicode.downcase(text) : text
     end
 
     def inverse_case(token)
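The mention and hashtag paths now act only on tokens that are entirely a mention or hashtag (`Regex::ONLY_MENTION`, `Regex::ONLY_HASHTAG`) and strip the leading sigil with `token[1..-1]` instead of a guarded `gsub!`; `split_long_word` likewise leaves hashtags, mentions, domains, and emails intact before splitting on hyphens or underscores. A hedged closing sketch (expected outputs are illustrative):

    require 'pragmatic_tokenizer'

    p PragmaticTokenizer::Tokenizer.new(mentions: :keep_and_clean, hashtags: :remove)
       .tokenize("@alice loves #ruby")
    # expected: ["alice", "loves"]

    p PragmaticTokenizer::Tokenizer.new(long_word_split: 5).tokenize("state-of-the-art snake_case_name")
    # expected: ["state", "of", "the", "art", "snake", "case", "name"]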