pragmatic_tokenizer 3.0.3 → 3.1.0

Files changed (38)
  1. checksums.yaml +5 -5
  2. data/README.md +1 -1
  3. data/lib/pragmatic_tokenizer/languages.rb +26 -26
  4. data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
  5. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
  6. data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
  7. data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
  8. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  9. data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
  10. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  11. data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
  12. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  13. data/lib/pragmatic_tokenizer/languages/french.rb +36 -9
  14. data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
  15. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  16. data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
  17. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  19. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  20. data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
  21. data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
  22. data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
  23. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  25. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  26. data/lib/pragmatic_tokenizer/post_processor.rb +42 -88
  27. data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
  28. data/lib/pragmatic_tokenizer/regex.rb +149 -0
  29. data/lib/pragmatic_tokenizer/tokenizer.rb +82 -116
  30. data/lib/pragmatic_tokenizer/version.rb +1 -1
  31. data/pragmatic_tokenizer.gemspec +5 -6
  32. data/spec/languages/deutsch_spec.rb +1 -1
  33. data/spec/languages/english_spec.rb +52 -0
  34. data/spec/languages/french_spec.rb +2 -2
  35. data/spec/performance_spec.rb +1 -1
  36. data/spec/spec_helper.rb +1 -1
  37. metadata +8 -8
  38. data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -63
@@ -0,0 +1,149 @@
+ module PragmaticTokenizer
+   class Regex
+
+     # Things that can or should be done:
+     # - check where the use of unicode categories helps (\p{Abbreviation})
+     # - use URI.parse and other libraries instead of regexp to identify urls, domains, emails
+     # - check multiple domain regex, we have spec issues when using one or the other
+     # - check multiple punctuation regex
+
+     # Text that needs to be tokenized is initially split into chunks of this length:
+     CHUNK_LONG_INPUT_TEXT = /\S.{1,10000}(?!\S)/m
+
+     # Ranges
+     RANGE_DINGBATS = /[\u2701-\u27BE]/ # e.g. ✁✎✳❄➾
+     RANGE_VARIATION_SELECTORS = /[\uFE00-\uFE0F]/ # alter the previous character
+     RANGE_FULLWIDTH = /[\uFF01-\ufF1F]/ # e.g. !"#'?
+     RANGE_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
+     RANGE_UNUSUAL_AND_EMOJI = /[\u203C-\u3299\u{1F000}-\u{1F644}]/
+
+     # Regular expressions which do not need to capture anything are enclosed in /(?: … )/ to enhance performance
+     COLON1 = /(?:(:)([[:print:]]{2,}))/ # two non-space after colon prevent matching emoticons
+     COLON2 = /(?::)/
+     COMMAS = /(?:([,‚])+)/
+     ENCLOSED_PLUS = /(?:([[:print:]]+)\+([[:print:]]+))/
+     EMAIL = /(?:[[:print:]]+[@@][[:print:]]+\.[[:print:]]+)/
+     DIGIT = /(?:[[:digit:]]+)/
+     ASTERISK = /(?:\*+)/
+     UNDERSCORE = /(?:_+)/
+     HYPHEN_OR_UNDERSCORE = /(?:[-_])/
+     PERIOD_AND_PRIOR = /(?:(.+\.))/
+     PERIOD_ONLY = /(?:(\.))/
+     CONTRACTIONS = /(?:[‘’‚‛‹›'´`])/
+     PUNCTUATION1 = /(?:([\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Ps}])+)/ # all punctuation categories except Pc (Connector) and Po (other)
+     PUNCTUATION2 = /(?:(?<=\S)([!?#{RANGE_FULLWIDTH.source}]+))/
+     PUNCTUATION3 = /(?:[!%\-–\u00AD]+)/
+     PUNCTUATION4 = /(?:[..。]+)/
+     DINGBATS = /(?:(#{RANGE_DINGBATS.source}#{RANGE_VARIATION_SELECTORS.source}*+))/
+     NO_BREAK_SPACE = /(?:\u00A0+)/
+     HTTP = /(?:https?:\/\/)/
+     TIME_WITH_COLON = /(?:\d:\d)/
+     DOMAIN_PREFIX = /(?:https?:\/\/|www\.|[[:alpha:]]\.)/
+     DOMAIN_SUFFIX = /(?:[[:alpha:]]\.(?:com|net|org|edu|gov|mil|int|[[:alpha:]]{2}))/
+     DOMAIN1 = /(?:((https?:\/\/|)[[:print:]]+\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?))/
+     DOMAIN2 = /(?:[[:alnum:]]{2,}([\-.][[:alnum:]]+)*\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?)/
+     NOT_URL = /(?:^(?!#{DOMAIN_PREFIX.source})([[:print:]]*))/
+     HASHTAG_OR_MENTION = /(?:[@#@#][[:print:]]+)/
+     HASHTAG = /(?:[##][[:print:]]+)/
+     MENTION = /(?:[@@][[:print:]]+)/
+     HASHTAG_WITH_HYPHEN = /(?:^([##][[:digit:]]+)-)/
+     ONE_AS_EXCLAMATION = /(?:\D1+)/
+     ONES_EXCLAMATIONS = /(?:!+(1*+!*+)*+)/
+     MANY_PERIODS = /(?:^\.{2,}$)/
+     COPYRIGHT_TRADEMARK = /(?:[®©™]+)/
+     CONTROL_CHARACTER = /(?:[[:cntrl:]]+)/ # matches any character with hexadecimal value 00 through 1F or 7F.
+     APOSTROPHE_AND_S = /(?:['’`́]s)/
+     ALSO_DECIMALS = /(?:[[:alpha:]]*+[[:digit:]]+)/
+     ACUTE_ACCENT_S = /(?:\s\u0301(?=s))/
+
+     # Regular expressions used to capture items
+     CAPTURE_UNUSUAL_AND_EMOJI = /(#{RANGE_UNUSUAL_AND_EMOJI.source})/
+     QUESTION_MARK_NOT_URL = /#{NOT_URL.source}(\?)/
+     # Should we change specs and also capture "/", just like we capture ":" and "?"
+     SLASH_NOT_URL = /#{NOT_URL.source}\//
+     SHIFT_BOUNDARY_CHARACTERS = /([;^&|…«»„“¿¡≠]+)/
+     MULTIPLE_DOTS = /(\.{2,})/ # we keep all dashes
+     MULTIPLE_DASHES = /(-){2,}/ # we only keep first dash
+     BRACKET = /([{}()\[\]])/
+     EXCLAMATION_BETWEEN_ALPHA = /(?<=[[:alpha:]])(!)(?=[[:alpha:]])/
+     PERCENT_BEFORE_DIGIT = /(%)\d+/
+     COMMA_BEFORE_NON_DIGIT = /(,)(?=\D)/
+     COMMA_AFTER_NON_DIGIT = /(?<=\D)(,)/
+     COLON_IN_URL = /(?<=[(https?|ftp)]):(?=\/\/)/
+     QUOTE_BEFORE_PRINT = /(('')|["“])(?=[[:print:]])/
+     QUOTE = /('')|["”]/
+     HYPHEN_AFTER_NON_WORD = /(?<=\W)(-)/
+     HYPHEN_BEFORE_NON_WORD = /(-)(?=\W)/
+
+     STARTS_WITH_COMMAS = /^#{COMMAS.source}/
+     STARTS_WITH_HTTP = /^#{HTTP.source}/
+     STARTS_WITH_DOMAIN = /^#{DOMAIN_PREFIX.source}/
+     STARTS_WITH_COLON1 = /^#{COLON1.source}/
+     STARTS_WITH_UNDERSCORE = /^#{UNDERSCORE.source}/
+     STARTS_WITH_PUNCTUATION3 = /^#{PUNCTUATION3.source}/
+
+     ENDS_WITH_DOMAIN = /#{DOMAIN_SUFFIX.source}$/
+     ENDS_WITH_PUNCTUATION1 = /#{PUNCTUATION1.source}$/
+     ENDS_WITH_PUNCTUATION2 = /#{PUNCTUATION2.source}$/
+     ENDS_WITH_COLON2 = /#{COLON2.source}$/
+     ENDS_WITH_UNDERSCORE = /#{UNDERSCORE.source}$/
+     ENDS_WITH_ONES_EXCLAMATIONS = /#{ONES_EXCLAMATIONS.source}$/
+     ENDS_WITH_EXCITED_ONE = /#{ONE_AS_EXCLAMATION.source}$/
+     ENDS_WITH_APOSTROPHE_AND_S = /#{APOSTROPHE_AND_S.source}$/
+     ENDS_WITH_ALPHA = /[[:alpha:]]$/
+     ENDS_WITH_DIGIT = /[[:digit:]]$/
+
+     ONLY_DECIMALS = /(?:^[[:digit:]]+$)/
+     NO_DECIMALS = /(?:^\D+$)/
+     ONLY_PUNCTUATION = /^[[[:punct:]]^|+]+$/
+     ONLY_ROMAN_NUMERALS = /^(?=[MDCLXVI])M*(C[MD]|D?C*)(X[CL]|L?X*)(I[XV]|V?I*)$/i
+     ONLY_EMAIL = /^#{EMAIL}$/
+     ONLY_HASHTAG_MENTION = /^#{HASHTAG_OR_MENTION}$/
+     ONLY_HASHTAG = /^#{HASHTAG}$/
+     ONLY_MENTION = /^#{MENTION}$/
+     ONLY_DOMAIN1 = /^#{DOMAIN1}$/
+     ONLY_DOMAIN2 = /^#{DOMAIN2}$/
+     ONLY_DOMAIN3 = Regexp.union(STARTS_WITH_DOMAIN, ENDS_WITH_DOMAIN)
+     DOMAIN_OR_EMAIL = Regexp.union(ONLY_DOMAIN1, ONLY_EMAIL)
+     UNDERSCORES_ASTERISK = Regexp.union(STARTS_WITH_UNDERSCORE, ENDS_WITH_UNDERSCORE, ASTERISK)
+     NO_DECIMALS_NO_NUMERALS = Regexp.union(ALSO_DECIMALS, ONLY_ROMAN_NUMERALS)
+
+     COMMAS_OR_PUNCTUATION = Regexp.union(
+         STARTS_WITH_COMMAS,
+         ENDS_WITH_PUNCTUATION1,
+         ENDS_WITH_PUNCTUATION2
+     )
+
+     # Can this constant name be clarified?
+     VARIOUS = Regexp.union(
+         SLASH_NOT_URL,
+         QUESTION_MARK_NOT_URL,
+         ENCLOSED_PLUS,
+         STARTS_WITH_COLON1,
+         DINGBATS,
+         HASHTAG_WITH_HYPHEN,
+         CAPTURE_UNUSUAL_AND_EMOJI
+     )
+
+     IRRELEVANT_CHARACTERS = Regexp.union(
+         STARTS_WITH_PUNCTUATION3,
+         ENDS_WITH_COLON2,
+         ENDS_WITH_ONES_EXCLAMATIONS,
+         CONTROL_CHARACTER,
+         COPYRIGHT_TRADEMARK,
+         RANGE_ALPHANUMERIC_SUPPLEMENT
+     )
+
+     PRE_PROCESS = Regexp.union(
+         SHIFT_BOUNDARY_CHARACTERS,
+         MULTIPLE_DOTS,
+         BRACKET,
+         MULTIPLE_DASHES,
+         EXCLAMATION_BETWEEN_ALPHA,
+         PERCENT_BEFORE_DIGIT,
+         COMMA_BEFORE_NON_DIGIT,
+         COMMA_AFTER_NON_DIGIT
+     )
+
+   end
+ end
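
The header comment notes that input text is first split into chunks of at most roughly 10,000 characters. A minimal sketch (assuming the released 3.1.0 gem is installed) of how CHUNK_LONG_INPUT_TEXT and the Regexp.union-composed constants behave:

  require 'pragmatic_tokenizer/regex'

  # CHUNK_LONG_INPUT_TEXT matches up to ~10,001 characters ending on a
  # word boundary, so String#scan yields whole-word chunks:
  chunks = ("word " * 5_000).scan(PragmaticTokenizer::Regex::CHUNK_LONG_INPUT_TEXT)
  chunks.each { |chunk| raise unless chunk.length <= 10_001 }

  # The composed constants are plain Regexp objects, usable directly:
  "user@example.com" =~ PragmaticTokenizer::Regex::DOMAIN_OR_EMAIL # => 0 (a match)
  "Straße 42"        =~ PragmaticTokenizer::Regex::DOMAIN_OR_EMAIL # => nil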
@@ -1,69 +1,22 @@
- # -*- encoding : utf-8 -*-
  require 'set'
  require 'cgi'
+ require 'pragmatic_tokenizer/regex'
  require 'pragmatic_tokenizer/languages'
  require 'pragmatic_tokenizer/pre_processor'
  require 'pragmatic_tokenizer/post_processor'
- require 'pragmatic_tokenizer/full_stop_separator'
  require 'unicode'

  module PragmaticTokenizer
    class Tokenizer

-     PUNCTIATION_OPTIONS = Set.new([:all, :semi, :none, :only]).freeze
-     NUMBERS_OPTIONS = Set.new([:all, :semi, :none, :only]).freeze
-     MENTIONS_OPTIONS = Set.new([:keep_original, :keep_and_clean, :remove]).freeze
+     PUNCTUATION_OPTIONS = Set.new(%i[all semi none only]).freeze
+     NUMBERS_OPTIONS = Set.new(%i[all semi none only]).freeze
+     MENTIONS_OPTIONS = Set.new(%i[keep_original keep_and_clean remove]).freeze
      MAX_TOKEN_LENGTH = 50
-     EMPTY_STRING = ''.freeze
-     DOT_STRING = '.'.freeze
-     SPACE_STRING = ' '.freeze
-     REGEX_DOMAIN = /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix
-     REGEX_URL = /(http|https)(\.|:)/
-     REGEX_HYPHEN = /\-/
-     REGEX_UNDERSCORE = /\_/
-     REGEX_CONTRACTIONS = /[‘’‚‛‹›'´`]/
-     REGEX_APOSTROPHE_S = /['’`́]s$/
-     REGEX_EMAIL = /\S+(@|@)\S+\.\S+/
-     REGEX_HASHTAG_OR_MENTION = /[@@#|#]/
-     REGEX_UNDERSCORE_AT_START = /(?<=\A)\_+/
-     REGEX_UNDERSCORE_AT_END = /\_+(?=\z)/
-     REGEX_ASTERISK = /\*+/
-     REGEX_UNIFIED1 = Regexp.union(REGEX_UNDERSCORE_AT_START,
-                                   REGEX_UNDERSCORE_AT_END,
-                                   REGEX_ASTERISK)
-     # https://en.wikipedia.org/wiki/Control_character
-     # matches any character with hexadecimal value 00 through 1F or 7F.
-     # Rubular: http://rubular.com/r/E83fpBoDjI
-     REGEXP_CONTROL = /[[:cntrl:]]/
-     REGEXP_ENDING_COLON = /\:(?=\z)/
-     REGEXP_EXCLAMATION_AT_START = /(?<=\A)!+(?=.+)/
-     REGEXP_EXCLAMATION_AT_END = /!+(1*!*)*(?=\z)/
-     REGEXP_HYPHEN_AT_START = /\A(-|–|\u{00AD})/
-     REGEXP_SPECIAL_SYMBOL = /[®©]/
-     REGEXP_PERCENT_AT_START = /\A\%/
-     # https://codepoints.net/enclosed_alphanumeric_supplement
-     REGEXP_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
-     REGEX_UNIFIED2 = Regexp.union(REGEXP_CONTROL,
-                                   REGEXP_ENDING_COLON,
-                                   REGEXP_EXCLAMATION_AT_START,
-                                   REGEXP_EXCLAMATION_AT_END,
-                                   REGEXP_HYPHEN_AT_START,
-                                   REGEXP_SPECIAL_SYMBOL,
-                                   REGEXP_PERCENT_AT_START,
-                                   REGEXP_ALPHANUMERIC_SUPPLEMENT)
-     REGEXP_ONE_AS_EXCLAMATION = /(?<=\D)1+(?=\z)/
-     REGEXP_HASHTAG_AT_START = /(?<=\A)(#|#)/
-     REGEXP_AT_SIGN_AT_START = /(?<=\A)(@|@)/
-     REGEXP_HYPHEN_HASTAG = /\A(#|#)\S+-/
-     REGEXP_EMOJI_SNOWFLAKE = /\u{2744}[\u{FE0F}|\u{FE0E}]?/
-     REGEX_EMOJI_UNIFIED = Regexp.union(REGEXP_EMOJI_SNOWFLAKE,
-                                        PragmaticTokenizer::Languages::Common::EMOJI_REGEX)
-     REGEXP_PUNCTUATION_ONLY = /\A[[:punct:]]+\z/
-     REGEXP_NUMBER_ONLY = /\A\d+\z/
-     REGEXP_NO_NUMBERS = /\A\D+\z/
-     REGEXP_NUMBER = /\D*\d+\d*/
-     REGEXP_CONSECUTIVE_DOTS = /\A\.{2,}\z/
-     REGEXP_CHUNK_STRING = /.{,10000}(?=\s|\z)/m
+     NOTHING = ''.freeze
+     DOT = '.'.freeze
+     SPACE = ' '.freeze
+     SINGLE_QUOTE = "'".freeze

      # @param [Hash] opts optional arguments

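The misspelled PUNCTIATION_OPTIONS becomes PUNCTUATION_OPTIONS, and the option sets move to %i[] symbol-array literals. A hedged sketch of how these sets gate construction (error message quoted verbatim from the validation further below):

  require 'pragmatic_tokenizer'

  # Valid option values pass the Set membership check in #initialize:
  PragmaticTokenizer::Tokenizer.new(punctuation: :none, numbers: :semi)

  # Anything outside the set raises:
  PragmaticTokenizer::Tokenizer.new(punctuation: :almost)
  # => RuntimeError: Punctuation argument can be only be nil, :all, :semi, :none, or :only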
@@ -123,10 +76,10 @@ module PragmaticTokenizer
      @abbreviations = Set.new(opts[:abbreviations])
      @stop_words = Set.new(opts[:stop_words])

-     # TODO: why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
+     # Why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
      @contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
      @abbreviations += @language_module::ABBREVIATIONS if @abbreviations.empty?
-     @stop_words += @language_module::STOP_WORDS if @stop_words.empty? && @filter_languages.empty?
+     @stop_words += @language_module::STOP_WORDS if @stop_words.empty?

      @filter_languages.each do |lang|
        language = Languages.get_language_by_code(lang)
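
With the `&& @filter_languages.empty?` guard dropped, the tokenizer's own stop words are now merged even when :filter_languages is supplied. A hypothetical illustration (option names taken from the gem's README; exact output depends on the stop-word lists):

  t = PragmaticTokenizer::Tokenizer.new(
    language:          :en,
    filter_languages:  [:de],
    remove_stop_words: true
  )
  t.tokenize("the street und die Straße")
  # => roughly ["street", "straße"]; both English and German stop words go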
@@ -135,34 +88,43 @@ module PragmaticTokenizer
        @stop_words += language::STOP_WORDS
      end

-     raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTIATION_OPTIONS.include?(@punctuation)
+     raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTUATION_OPTIONS.include?(@punctuation)
      raise "Numbers argument can be only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
      raise "Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)
-     raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == Fixnum || @minimum_length.nil?
-     raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == Fixnum || @long_word_split.nil?
+
+     integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer
+
+     raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == integer_class || @minimum_length.nil?
+     raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == integer_class || @long_word_split.nil?
    end

    # @param [String] text to be tokenized

    def tokenize(text)
      return [] unless text
-     raise "In Pragmatic Tokenizer text must be a String" unless text.class == String
+     raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
      CGI.unescapeHTML(text)
-        .scan(REGEXP_CHUNK_STRING)
-        .flat_map { |segment| post_process(pre_process(segment)) }
+        .scan(Regex::CHUNK_LONG_INPUT_TEXT)
+        .flat_map { |segment| process_segment(segment) }
    end

    private

-   def pre_process(text)
-     text
+   def process_segment(segment)
+     pre_processed = pre_process(segment)
+     cased_segment = chosen_case(pre_processed)
+     @tokens = PostProcessor.new(text: cased_segment, abbreviations: @abbreviations, downcase: @downcase).call
+     post_process_tokens
+   end
+
+   def pre_process(segment)
+     segment
        .extend(PragmaticTokenizer::PreProcessor)
        .pre_process(language: @language_module)
    end

-   def post_process(text)
-     @tokens = run_post_processor(text)
-     remove_various!
+   def post_process_tokens
+     remove_by_options!
      process_numbers!
      process_punctuation!
      expand_contractions! if @expand_contractions
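
The integer_class guard exists because Ruby 2.4 unified Fixnum and Bignum into Integer and later Rubies removed the old constants; a ternary only resolves the constant on the branch it takes, so referencing Fixnum stays safe on newer Rubies. A minimal sketch of the same check:

  integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer
  42.class == integer_class # => true on Ruby 2.3 and on Ruby 2.4+
  42.is_a?(Integer)         # simpler alternative that works on every version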
@@ -176,45 +138,45 @@ module PragmaticTokenizer
      @tokens.reject(&:empty?)
    end

-   def run_post_processor(text)
-     PostProcessor.new(
-       text: chosen_case(text),
-       abbreviations: @abbreviations,
-       downcase: @downcase
-     ).post_process
-   end
-
    def expand_contractions!
-     @tokens = @tokens.flat_map { |t| expand_token_contraction(t) }
+     @tokens = @tokens.flat_map { |token| expand_token_contraction(token) }
    end

    def expand_token_contraction(token)
-     normalized = inverse_case(token.gsub(REGEX_CONTRACTIONS, "'".freeze))
+     normalized = inverse_case(token.gsub(Regex::CONTRACTIONS, SINGLE_QUOTE))
      return token unless @contractions.key?(normalized)
-     result = @contractions[normalized].split(SPACE_STRING)
+     result = @contractions[normalized].split(SPACE)
      result[0] = Unicode.capitalize(result[0]) unless @downcase
      result
    end

    def clean!
      @tokens = @tokens
-       .flat_map { |t| t !~ REGEX_HASHTAG_OR_MENTION ? t.split(REGEX_UNIFIED1) : t }
-       .map! { |t| t !~ REGEX_HASHTAG_OR_MENTION ? t.gsub(REGEXP_ONE_AS_EXCLAMATION, EMPTY_STRING) : t }
-       .map! { |t| t.gsub(REGEX_UNIFIED2, EMPTY_STRING) }
-       .delete_if { |t| unclean_token?(t) }
+       .flat_map { |token| split_underscores_asterisk(token) }
+       .map! { |token| remove_irrelevant_characters(token) }
+       .delete_if { |token| many_dots?(token) }
    end

-   def unclean_token?(token)
-     return true if PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(token)
-     return true if token.length > MAX_TOKEN_LENGTH
-     return true if token.include?('\\'.freeze)
-     token =~ REGEXP_CONSECUTIVE_DOTS
+   def split_underscores_asterisk(token)
+     return token if token =~ Regex::ONLY_HASHTAG_MENTION
+     token.split(Regex::UNDERSCORES_ASTERISK)
+   end
+
+   def remove_irrelevant_characters(token)
+     token.gsub!(Regex::IRRELEVANT_CHARACTERS, NOTHING)
+     return token if token =~ Regex::ONLY_HASHTAG_MENTION
+     token.gsub!(Regex::ENDS_WITH_EXCITED_ONE, NOTHING)
+     token
+   end
+
+   def many_dots?(token)
+     token =~ Regex::MANY_PERIODS
    end

    def classic_filter!
      @tokens.map! do |token|
-       token.delete!(DOT_STRING) if @abbreviations.include?(token.chomp(DOT_STRING))
-       token.sub!(REGEX_APOSTROPHE_S, EMPTY_STRING)
+       token.delete!(DOT) if @abbreviations.include?(token.chomp(DOT))
+       token.sub!(Regex::ENDS_WITH_APOSTROPHE_AND_S, NOTHING)
        token
      end
    end
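
A rough sketch of expand_token_contraction in action: Regex::CONTRACTIONS normalizes curly apostrophes to a straight quote before the lookup in the language's contractions map (output approximate; it depends on that map's entries):

  t = PragmaticTokenizer::Tokenizer.new(expand_contractions: true)
  t.tokenize("I won’t go")
  # => ["i", "will", "not", "go"] (assuming the English CONTRACTIONS map
  #    contains "won't" => "will not", as it does for common contractions)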
@@ -222,26 +184,26 @@ module PragmaticTokenizer
    def process_numbers!
      case @numbers
      when :semi
-       @tokens.delete_if { |t| t =~ REGEXP_NUMBER_ONLY }
+       @tokens.delete_if { |token| token =~ Regex::ONLY_DECIMALS }
      when :none
-       @tokens.delete_if { |t| t =~ REGEXP_NUMBER || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(inverse_case(t)) }
+       @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS_NO_NUMERALS }
      when :only
-       @tokens.delete_if { |t| t =~ REGEXP_NO_NUMBERS }
+       @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS }
      end
    end

    def remove_short_tokens!
-     @tokens.delete_if { |t| t.length < @minimum_length }
+     @tokens.delete_if { |token| token.length < @minimum_length }
    end

    def process_punctuation!
      case @punctuation
      when :semi
-       @tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION.include?(t) }
+       @tokens.delete_if { |token| token =~ Regex::PUNCTUATION4 }
      when :none
-       @tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) || t =~ REGEXP_PUNCTUATION_ONLY }
+       @tokens.delete_if { |token| token =~ Regex::ONLY_PUNCTUATION }
      when :only
-       @tokens.keep_if { |t| PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
+       @tokens.keep_if { |token| token =~ Regex::ONLY_PUNCTUATION }
      end
    end
@@ -252,46 +214,50 @@ module PragmaticTokenizer
    def mentions!
      case @mentions
      when :remove
-       @tokens.delete_if { |t| t =~ REGEXP_AT_SIGN_AT_START }
+       @tokens.delete_if { |token| token =~ Regex::ONLY_MENTION }
      when :keep_and_clean
-       @tokens.map! { |t| t =~ REGEXP_AT_SIGN_AT_START ? t.gsub!(REGEXP_AT_SIGN_AT_START, EMPTY_STRING) : t }
+       @tokens.map! { |token| token =~ Regex::ONLY_MENTION ? token[1..-1] : token }
      end
    end

    def hashtags!
      case @hashtags
      when :remove
-       @tokens.delete_if { |t| t =~ REGEXP_HASHTAG_AT_START }
+       @tokens.delete_if { |token| token =~ Regex::ONLY_HASHTAG }
      when :keep_and_clean
-       @tokens = @tokens
-         .flat_map { |t| t =~ REGEXP_HYPHEN_HASTAG ? t.split(REGEX_HYPHEN) : t }
-         .map { |t| t =~ REGEXP_HASHTAG_AT_START ? t.gsub!(REGEXP_HASHTAG_AT_START, EMPTY_STRING) : t }
+       @tokens.map! { |token| token =~ Regex::ONLY_HASHTAG ? token[1..-1] : token }
      end
    end

-   def remove_various!
-     @tokens.delete_if { |t| t =~ regex_various }
+   def remove_by_options!
+     @tokens.delete_if { |token| token =~ regex_by_options }
    end

-   def regex_various
-     @regex_various ||= begin
+   def regex_by_options
+     @regex_by_options ||= begin
        regex_array = []
-       regex_array << REGEX_EMOJI_UNIFIED if @remove_emoji
-       regex_array << REGEX_EMAIL if @remove_emails
-       regex_array << REGEX_URL if @remove_urls
-       regex_array << REGEX_DOMAIN if @remove_domains
+       regex_array << Regex::RANGE_UNUSUAL_AND_EMOJI if @remove_emoji
+       regex_array << Regex::ONLY_EMAIL if @remove_emails
+       regex_array << Regex::STARTS_WITH_HTTP if @remove_urls
+       regex_array << Regex::ONLY_DOMAIN2 if @remove_domains
        Regexp.union(regex_array)
      end
    end

    def split_long_words!
-     @tokens = @tokens
-       .flat_map { |t| t.length > @long_word_split ? t.split(REGEX_HYPHEN) : t }
-       .flat_map { |t| t.length > @long_word_split ? t.split(REGEX_UNDERSCORE) : t }
+     @tokens = @tokens.flat_map { |token| split_long_word(token) }
+   end
+
+   def split_long_word(token)
+     return token unless @long_word_split
+     return token if token.length <= @long_word_split
+     return token if token =~ Regex::ONLY_HASHTAG_MENTION
+     return token if token =~ Regex::DOMAIN_OR_EMAIL
+     token.split(Regex::HYPHEN_OR_UNDERSCORE)
    end

-   def chosen_case(token)
-     @downcase ? Unicode.downcase(token) : token
+   def chosen_case(text)
+     @downcase ? Unicode.downcase(text) : text
    end

    def inverse_case(token)