pragmatic_tokenizer 3.0.4 → 3.2.0

Files changed (36)
  1. checksums.yaml +5 -5
  2. data/lib/pragmatic_tokenizer/languages.rb +26 -26
  3. data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
  4. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
  5. data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
  6. data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
  7. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  8. data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
  9. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  10. data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
  11. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  12. data/lib/pragmatic_tokenizer/languages/french.rb +36 -9
  13. data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  15. data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
  16. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  17. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  19. data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
  20. data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
  21. data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
  22. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  23. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  25. data/lib/pragmatic_tokenizer/post_processor.rb +42 -88
  26. data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
  27. data/lib/pragmatic_tokenizer/regex.rb +150 -0
  28. data/lib/pragmatic_tokenizer/tokenizer.rb +81 -115
  29. data/lib/pragmatic_tokenizer/version.rb +1 -1
  30. data/pragmatic_tokenizer.gemspec +5 -6
  31. data/spec/languages/english_spec.rb +13 -0
  32. data/spec/languages/french_spec.rb +2 -2
  33. data/spec/performance_spec.rb +0 -1
  34. data/spec/spec_helper.rb +1 -1
  35. metadata +12 -12
  36. data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -62
@@ -0,0 +1,150 @@
+ module PragmaticTokenizer
+   class Regex
+
+     # Things that can or should be done:
+     # - check where the use of unicode categories helps (\p{Abbreviation})
+     # - use URI.parse and other libraries instead of regexp to identify urls, domains, emails
+     # - check multiple domain regex, we have spec issues when using one or the other
+     # - check multiple punctuation regex
+
+     # Text that needs to be tokenized is initially split into chunks of this length:
+     CHUNK_LONG_INPUT_TEXT = /\S.{1,10000}(?!\S)/m
+
+     # Ranges
+     RANGE_DINGBATS = /[\u2701-\u27BE]/ # e.g. ✁✎✳❄➾
+     RANGE_VARIATION_SELECTORS = /[\uFE00-\uFE0F]/ # alter the previous character
+     RANGE_FULLWIDTH = /[\uFF01-\uFF1F]/ # e.g. ！＂＃＇？
+     RANGE_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
+     RANGE_UNUSUAL_AND_EMOJI = /[\u203C-\u3299\u{1F000}-\u{1F644}]/
+
+     # Regular expressions which do not need to capture anything are enclosed in /(?: … )/ to enhance performance
+     COLON1 = /(?:(:)([[:print:]]{2,}))/ # two non-space after colon prevent matching emoticons
+     COLON2 = /(?::)/
+     COMMAS = /(?:([,‚])+)/
+     ENCLOSED_PLUS = /(?:([[:print:]]+)\+([[:print:]]+))/
+     EMAIL = /(?:[[:print:]]+[@＠][[:print:]]+\.[[:print:]]+)/
+     DIGIT = /(?:[[:digit:]]+)/
+     ASTERISK = /(?:\*+)/
+     UNDERSCORE = /(?:_+)/
+     HYPHEN_OR_UNDERSCORE = /(?:[-_])/
+     LONG_WORD_SPLIT = /(?:[-_\/—–])/
+     PERIOD_AND_PRIOR = /(?:(.+\.))/
+     PERIOD_ONLY = /(?:(\.))/
+     CONTRACTIONS = /(?:[‘’‚‛‹›'´`])/
+     PUNCTUATION1 = /(?:([\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Ps}])+)/ # all punctuation categories except Pc (Connector) and Po (other)
+     PUNCTUATION2 = /(?:(?<=\S)([!?#{RANGE_FULLWIDTH.source}]+))/
+     PUNCTUATION3 = /(?:[!%\-–\u00AD]+)/
+     PUNCTUATION4 = /(?:[.．。]+)/
+     DINGBATS = /(?:(#{RANGE_DINGBATS.source}#{RANGE_VARIATION_SELECTORS.source}*+))/
+     NO_BREAK_SPACE = /(?:\u00A0+)/
+     HTTP = /(?:https?:\/\/)/
+     TIME_WITH_COLON = /(?:\d:\d)/
+     DOMAIN_PREFIX = /(?:https?:\/\/|www\.|[[:alpha:]]\.)/
+     DOMAIN_SUFFIX = /(?:[[:alpha:]]\.(?:com|net|org|edu|gov|mil|int|[[:alpha:]]{2}))/
+     DOMAIN1 = /(?:((https?:\/\/|)[[:print:]]+\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?))/
+     DOMAIN2 = /(?:[[:alnum:]]{2,}([\-.][[:alnum:]]+)*\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?)/
+     NOT_URL = /(?:^(?!#{DOMAIN_PREFIX.source})([[:print:]]*))/
+     HASHTAG_OR_MENTION = /(?:[@#＠＃][[:print:]]+)/
+     HASHTAG = /(?:[#＃][[:print:]]+)/
+     MENTION = /(?:[@＠][[:print:]]+)/
+     HASHTAG_WITH_HYPHEN = /(?:^([#＃][[:digit:]]+)-)/
+     ONE_AS_EXCLAMATION = /(?:\D1+)/
+     ONES_EXCLAMATIONS = /(?:!+(1*+!*+)*+)/
+     MANY_PERIODS = /(?:^\.{2,}$)/
+     COPYRIGHT_TRADEMARK = /(?:[®©™]+)/
+     CONTROL_CHARACTER = /(?:[[:cntrl:]]+)/ # matches any character with hexadecimal value 00 through 1F or 7F.
+     APOSTROPHE_AND_S = /(?:['’`́]s)/
+     ALSO_DECIMALS = /(?:[[:alpha:]]*+[[:digit:]]+)/
+     ACUTE_ACCENT_S = /(?:\s\u0301(?=s))/
+
+     # Regular expressions used to capture items
+     CAPTURE_UNUSUAL_AND_EMOJI = /(#{RANGE_UNUSUAL_AND_EMOJI.source})/
+     QUESTION_MARK_NOT_URL = /#{NOT_URL.source}(\?)/
+     # Should we change specs and also capture "/", just like we capture ":" and "?"
+     SLASH_NOT_URL = /#{NOT_URL.source}\//
+     SHIFT_BOUNDARY_CHARACTERS = /([;^&|…«»„“¿¡≠]+)/
+     MULTIPLE_DOTS = /(\.{2,})/ # we keep all dashes
+     MULTIPLE_DASHES = /(-){2,}/ # we only keep first dash
+     BRACKET = /([{}()\[\]])/
+     EXCLAMATION_BETWEEN_ALPHA = /(?<=[[:alpha:]])(!)(?=[[:alpha:]])/
+     PERCENT_BEFORE_DIGIT = /(%)\d+/
+     COMMA_BEFORE_NON_DIGIT = /(,)(?=\D)/
+     COMMA_AFTER_NON_DIGIT = /(?<=\D)(,)/
+     COLON_IN_URL = /(?<=[(https?|ftp)]):(?=\/\/)/
+     QUOTE_BEFORE_PRINT = /(('')|["“])(?=[[:print:]])/
+     QUOTE = /('')|["”]/
+     HYPHEN_AFTER_NON_WORD = /(?<=\W)(-)/
+     HYPHEN_BEFORE_NON_WORD = /(-)(?=\W)/
+
+     STARTS_WITH_COMMAS = /^#{COMMAS.source}/
+     STARTS_WITH_HTTP = /^#{HTTP.source}/
+     STARTS_WITH_DOMAIN = /^#{DOMAIN_PREFIX.source}/
+     STARTS_WITH_COLON1 = /^#{COLON1.source}/
+     STARTS_WITH_UNDERSCORE = /^#{UNDERSCORE.source}/
+     STARTS_WITH_PUNCTUATION3 = /^#{PUNCTUATION3.source}/
+
+     ENDS_WITH_DOMAIN = /#{DOMAIN_SUFFIX.source}$/
+     ENDS_WITH_PUNCTUATION1 = /#{PUNCTUATION1.source}$/
+     ENDS_WITH_PUNCTUATION2 = /#{PUNCTUATION2.source}$/
+     ENDS_WITH_COLON2 = /#{COLON2.source}$/
+     ENDS_WITH_UNDERSCORE = /#{UNDERSCORE.source}$/
+     ENDS_WITH_ONES_EXCLAMATIONS = /#{ONES_EXCLAMATIONS.source}$/
+     ENDS_WITH_EXCITED_ONE = /#{ONE_AS_EXCLAMATION.source}$/
+     ENDS_WITH_APOSTROPHE_AND_S = /#{APOSTROPHE_AND_S.source}$/
+     ENDS_WITH_ALPHA = /[[:alpha:]]$/
+     ENDS_WITH_DIGIT = /[[:digit:]]$/
+
+     ONLY_DECIMALS = /(?:^[[:digit:]]+$)/
+     NO_DECIMALS = /(?:^\D+$)/
+     ONLY_PUNCTUATION = /^[[[:punct:]]^|+]+$/
+     ONLY_ROMAN_NUMERALS = /^(?=[MDCLXVI])M*(C[MD]|D?C*)(X[CL]|L?X*)(I[XV]|V?I*)$/i
+     ONLY_EMAIL = /^#{EMAIL}$/
+     ONLY_HASHTAG_MENTION = /^#{HASHTAG_OR_MENTION}$/
+     ONLY_HASHTAG = /^#{HASHTAG}$/
+     ONLY_MENTION = /^#{MENTION}$/
+     ONLY_DOMAIN1 = /^#{DOMAIN1}$/
+     ONLY_DOMAIN2 = /^#{DOMAIN2}$/
+     ONLY_DOMAIN3 = Regexp.union(STARTS_WITH_DOMAIN, ENDS_WITH_DOMAIN)
+     DOMAIN_OR_EMAIL = Regexp.union(ONLY_DOMAIN1, ONLY_EMAIL)
+     UNDERSCORES_ASTERISK = Regexp.union(STARTS_WITH_UNDERSCORE, ENDS_WITH_UNDERSCORE, ASTERISK)
+     NO_DECIMALS_NO_NUMERALS = Regexp.union(ALSO_DECIMALS, ONLY_ROMAN_NUMERALS)
+
+     COMMAS_OR_PUNCTUATION = Regexp.union(
+       STARTS_WITH_COMMAS,
+       ENDS_WITH_PUNCTUATION1,
+       ENDS_WITH_PUNCTUATION2
+     )
+
+     # Can this constant name be clarified?
+     VARIOUS = Regexp.union(
+       SLASH_NOT_URL,
+       QUESTION_MARK_NOT_URL,
+       ENCLOSED_PLUS,
+       STARTS_WITH_COLON1,
+       DINGBATS,
+       HASHTAG_WITH_HYPHEN,
+       CAPTURE_UNUSUAL_AND_EMOJI
+     )
+
+     IRRELEVANT_CHARACTERS = Regexp.union(
+       STARTS_WITH_PUNCTUATION3,
+       ENDS_WITH_COLON2,
+       ENDS_WITH_ONES_EXCLAMATIONS,
+       CONTROL_CHARACTER,
+       COPYRIGHT_TRADEMARK,
+       RANGE_ALPHANUMERIC_SUPPLEMENT
+     )
+
+     PRE_PROCESS = Regexp.union(
+       SHIFT_BOUNDARY_CHARACTERS,
+       MULTIPLE_DOTS,
+       BRACKET,
+       MULTIPLE_DASHES,
+       EXCLAMATION_BETWEEN_ALPHA,
+       PERCENT_BEFORE_DIGIT,
+       COMMA_BEFORE_NON_DIGIT,
+       COMMA_AFTER_NON_DIGIT
+     )
+
+   end
+ end
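
The new Regex class above centralizes patterns that 3.0.4 kept scattered across the tokenizer. A minimal sketch of how two of its constants behave (assuming the gem is on the load path; outputs approximate):

    require 'pragmatic_tokenizer/regex'

    # CHUNK_LONG_INPUT_TEXT splits long input into chunks of at most
    # ~10,000 characters that start and end on non-whitespace:
    chunks = ("word " * 5_000).scan(PragmaticTokenizer::Regex::CHUNK_LONG_INPUT_TEXT)
    chunks.all? { |chunk| chunk.length <= 10_001 } # => true

    # The anchored ONLY_* constants classify whole tokens:
    PragmaticTokenizer::Regex::ONLY_EMAIL.match?("user@example.com") # => true
    PragmaticTokenizer::Regex::ONLY_HASHTAG.match?("#ruby")          # => true
    PragmaticTokenizer::Regex::ONLY_DECIMALS.match?("12.34")         # => false, "." is not a digit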
@@ -1,70 +1,22 @@
- # -*- encoding : utf-8 -*-
  require 'set'
  require 'cgi'
+ require 'pragmatic_tokenizer/regex'
  require 'pragmatic_tokenizer/languages'
  require 'pragmatic_tokenizer/pre_processor'
  require 'pragmatic_tokenizer/post_processor'
- require 'pragmatic_tokenizer/full_stop_separator'
  require 'unicode'

  module PragmaticTokenizer
    class Tokenizer

-     PUNCTIATION_OPTIONS = Set.new([:all, :semi, :none, :only]).freeze
-     NUMBERS_OPTIONS = Set.new([:all, :semi, :none, :only]).freeze
-     MENTIONS_OPTIONS = Set.new([:keep_original, :keep_and_clean, :remove]).freeze
+     PUNCTUATION_OPTIONS = Set.new(%i[all semi none only]).freeze
+     NUMBERS_OPTIONS = Set.new(%i[all semi none only]).freeze
+     MENTIONS_OPTIONS = Set.new(%i[keep_original keep_and_clean remove]).freeze
      MAX_TOKEN_LENGTH = 50
-     EMPTY_STRING = ''.freeze
-     DOT_STRING = '.'.freeze
-     SPACE_STRING = ' '.freeze
-     REGEX_DOMAIN = /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix
-     REGEX_URL = /(http|https)(\.|:)/
-     REGEX_HYPHEN = /\-/
-     REGEX_LONG_WORD = /\-|\_/
-     REGEXP_SPLIT_CHECK = /@|＠|(http)/
-     REGEX_CONTRACTIONS = /[‘’‚‛‹›'´`]/
-     REGEX_APOSTROPHE_S = /['’`́]s$/
-     REGEX_EMAIL = /\S+(@|＠)\S+\.\S+/
-     REGEX_HASHTAG_OR_MENTION = /[@＠#|＃]/
-     REGEX_UNDERSCORE_AT_START = /(?<=\A)\_+/
-     REGEX_UNDERSCORE_AT_END = /\_+(?=\z)/
-     REGEX_ASTERISK = /\*+/
-     REGEX_UNIFIED1 = Regexp.union(REGEX_UNDERSCORE_AT_START,
-                                   REGEX_UNDERSCORE_AT_END,
-                                   REGEX_ASTERISK)
-     # https://en.wikipedia.org/wiki/Control_character
-     # matches any character with hexadecimal value 00 through 1F or 7F.
-     # Rubular: http://rubular.com/r/E83fpBoDjI
-     REGEXP_CONTROL = /[[:cntrl:]]/
-     REGEXP_ENDING_COLON = /\:(?=\z)/
-     REGEXP_EXCLAMATION_AT_START = /(?<=\A)!+(?=.+)/
-     REGEXP_EXCLAMATION_AT_END = /!+(1*!*)*(?=\z)/
-     REGEXP_HYPHEN_AT_START = /\A(-|–|\u{00AD})/
-     REGEXP_SPECIAL_SYMBOL = /[®©]/
-     REGEXP_PERCENT_AT_START = /\A\%/
-     # https://codepoints.net/enclosed_alphanumeric_supplement
-     REGEXP_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
-     REGEX_UNIFIED2 = Regexp.union(REGEXP_CONTROL,
-                                   REGEXP_ENDING_COLON,
-                                   REGEXP_EXCLAMATION_AT_START,
-                                   REGEXP_EXCLAMATION_AT_END,
-                                   REGEXP_HYPHEN_AT_START,
-                                   REGEXP_SPECIAL_SYMBOL,
-                                   REGEXP_PERCENT_AT_START,
-                                   REGEXP_ALPHANUMERIC_SUPPLEMENT)
-     REGEXP_ONE_AS_EXCLAMATION = /(?<=\D)1+(?=\z)/
-     REGEXP_HASHTAG_AT_START = /(?<=\A)(#|＃)/
-     REGEXP_AT_SIGN_AT_START = /(?<=\A)(@|＠)/
-     REGEXP_HYPHEN_HASTAG = /\A(#|＃)\S+-/
-     REGEXP_EMOJI_SNOWFLAKE = /\u{2744}[\u{FE0F}|\u{FE0E}]?/
-     REGEX_EMOJI_UNIFIED = Regexp.union(REGEXP_EMOJI_SNOWFLAKE,
-                                        PragmaticTokenizer::Languages::Common::EMOJI_REGEX)
-     REGEXP_PUNCTUATION_ONLY = /\A[[:punct:]]+\z/
-     REGEXP_NUMBER_ONLY = /\A\d+\z/
-     REGEXP_NO_NUMBERS = /\A\D+\z/
-     REGEXP_NUMBER = /\D*\d+\d*/
-     REGEXP_CONSECUTIVE_DOTS = /\A\.{2,}\z/
-     REGEXP_CHUNK_STRING = /.{,10000}(?=\s|\z)/m
+     NOTHING = ''.freeze
+     DOT = '.'.freeze
+     SPACE = ' '.freeze
+     SINGLE_QUOTE = "'".freeze

      # @param [Hash] opts optional arguments

@@ -124,7 +76,7 @@ module PragmaticTokenizer
        @abbreviations = Set.new(opts[:abbreviations])
        @stop_words = Set.new(opts[:stop_words])

-       # TODO: why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
+       # Why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
        @contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
        @abbreviations += @language_module::ABBREVIATIONS if @abbreviations.empty?
        @stop_words += @language_module::STOP_WORDS if @stop_words.empty?
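
Note that the language defaults are merged only when the caller supplies nothing, so passing your own lists replaces them outright. A sketch (hypothetical word list; output approximate):

    t = PragmaticTokenizer::Tokenizer.new(
      language:          :en,
      stop_words:        ['the', 'a', 'an'],  # replaces the English STOP_WORDS set
      remove_stop_words: true
    )
    t.tokenize("The cat sat on a mat")
    # => ["cat", "sat", "on", "mat"]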
@@ -136,34 +88,43 @@ module PragmaticTokenizer
          @stop_words += language::STOP_WORDS
        end

-       raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTIATION_OPTIONS.include?(@punctuation)
+       raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTUATION_OPTIONS.include?(@punctuation)
        raise "Numbers argument can be only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
        raise "Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)
-       raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == Fixnum || @minimum_length.nil?
-       raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == Fixnum || @long_word_split.nil?
+
+       integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer
+
+       raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == integer_class || @minimum_length.nil?
+       raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == integer_class || @long_word_split.nil?
      end

      # @param [String] text to be tokenized

      def tokenize(text)
        return [] unless text
-       raise "In Pragmatic Tokenizer text must be a String" unless text.class == String
+       raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
        CGI.unescapeHTML(text)
-          .scan(REGEXP_CHUNK_STRING)
-          .flat_map { |segment| post_process(pre_process(segment)) }
+          .scan(Regex::CHUNK_LONG_INPUT_TEXT)
+          .flat_map { |segment| process_segment(segment) }
      end

      private

-     def pre_process(text)
-       text
+     def process_segment(segment)
+       pre_processed = pre_process(segment)
+       cased_segment = chosen_case(pre_processed)
+       @tokens = PostProcessor.new(text: cased_segment, abbreviations: @abbreviations, downcase: @downcase).call
+       post_process_tokens
+     end
+
+     def pre_process(segment)
+       segment
          .extend(PragmaticTokenizer::PreProcessor)
          .pre_process(language: @language_module)
      end

-     def post_process(text)
-       @tokens = run_post_processor(text)
-       remove_various!
+     def post_process_tokens
+       remove_by_options!
        process_numbers!
        process_punctuation!
        expand_contractions! if @expand_contractions
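
Ruby 2.4 merged Fixnum and Bignum into Integer and deprecated Fixnum, hence the version-gated integer_class above. Because Fixnum < Integer on older Rubies, a plain is_a? check behaves identically everywhere; a sketch for comparison, not the gem's code:

    def validate_integer_option!(name, value)
      # is_a?(Integer) also covers Fixnum instances on pre-2.4 Rubies
      return if value.nil? || value.is_a?(Integer)
      raise "In Pragmatic Tokenizer #{name} must be an Integer"
    end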
@@ -177,45 +138,45 @@ module PragmaticTokenizer
        @tokens.reject(&:empty?)
      end

-     def run_post_processor(text)
-       PostProcessor.new(
-         text: chosen_case(text),
-         abbreviations: @abbreviations,
-         downcase: @downcase
-       ).post_process
-     end
-
      def expand_contractions!
-       @tokens = @tokens.flat_map { |t| expand_token_contraction(t) }
+       @tokens = @tokens.flat_map { |token| expand_token_contraction(token) }
      end

      def expand_token_contraction(token)
-       normalized = inverse_case(token.gsub(REGEX_CONTRACTIONS, "'".freeze))
+       normalized = inverse_case(token.gsub(Regex::CONTRACTIONS, SINGLE_QUOTE))
        return token unless @contractions.key?(normalized)
-       result = @contractions[normalized].split(SPACE_STRING)
+       result = @contractions[normalized].split(SPACE)
        result[0] = Unicode.capitalize(result[0]) unless @downcase
        result
      end

      def clean!
        @tokens = @tokens
-         .flat_map { |t| t !~ REGEX_HASHTAG_OR_MENTION ? t.split(REGEX_UNIFIED1) : t }
-         .map! { |t| t !~ REGEX_HASHTAG_OR_MENTION ? t.gsub(REGEXP_ONE_AS_EXCLAMATION, EMPTY_STRING) : t }
-         .map! { |t| t.gsub(REGEX_UNIFIED2, EMPTY_STRING) }
-         .delete_if { |t| unclean_token?(t) }
+         .flat_map { |token| split_underscores_asterisk(token) }
+         .map! { |token| remove_irrelevant_characters(token) }
+         .delete_if { |token| many_dots?(token) }
      end

-     def unclean_token?(token)
-       return true if PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(token)
-       return true if token.length > MAX_TOKEN_LENGTH
-       return true if token.include?('\\'.freeze)
-       token =~ REGEXP_CONSECUTIVE_DOTS
+     def split_underscores_asterisk(token)
+       return token if token =~ Regex::ONLY_HASHTAG_MENTION
+       token.split(Regex::UNDERSCORES_ASTERISK)
+     end
+
+     def remove_irrelevant_characters(token)
+       token.gsub!(Regex::IRRELEVANT_CHARACTERS, NOTHING)
+       return token if token =~ Regex::ONLY_HASHTAG_MENTION
+       token.gsub!(Regex::ENDS_WITH_EXCITED_ONE, NOTHING)
+       token
+     end
+
+     def many_dots?(token)
+       token =~ Regex::MANY_PERIODS
      end

      def classic_filter!
        @tokens.map! do |token|
-         token.delete!(DOT_STRING) if @abbreviations.include?(token.chomp(DOT_STRING))
-         token.sub!(REGEX_APOSTROPHE_S, EMPTY_STRING)
+         token.delete!(DOT) if @abbreviations.include?(token.chomp(DOT))
+         token.sub!(Regex::ENDS_WITH_APOSTROPHE_AND_S, NOTHING)
          token
        end
      end
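
Contraction expansion first normalizes curly apostrophes and similar marks (Regex::CONTRACTIONS) to a plain single quote, so both spellings hit the same key in the language's CONTRACTIONS map. A usage sketch (output approximate):

    t = PragmaticTokenizer::Tokenizer.new(expand_contractions: true)
    t.tokenize("Isn't it great?")  # straight apostrophe
    # => ["is", "not", "it", "great", "?"]
    t.tokenize("Isn’t it great?")  # curly apostrophe, same tokens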
@@ -223,26 +184,26 @@ module PragmaticTokenizer
      def process_numbers!
        case @numbers
        when :semi
-         @tokens.delete_if { |t| t =~ REGEXP_NUMBER_ONLY }
+         @tokens.delete_if { |token| token =~ Regex::ONLY_DECIMALS }
        when :none
-         @tokens.delete_if { |t| t =~ REGEXP_NUMBER || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(inverse_case(t)) }
+         @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS_NO_NUMERALS }
        when :only
-         @tokens.delete_if { |t| t =~ REGEXP_NO_NUMBERS }
+         @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS }
        end
      end

      def remove_short_tokens!
-       @tokens.delete_if { |t| t.length < @minimum_length }
+       @tokens.delete_if { |token| token.length < @minimum_length }
      end

      def process_punctuation!
        case @punctuation
        when :semi
-         @tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION.include?(t) }
+         @tokens.delete_if { |token| token =~ Regex::PUNCTUATION4 }
        when :none
-         @tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) || t =~ REGEXP_PUNCTUATION_ONLY }
+         @tokens.delete_if { |token| token =~ Regex::ONLY_PUNCTUATION }
        when :only
-         @tokens.keep_if { |t| PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
+         @tokens.keep_if { |token| token =~ Regex::ONLY_PUNCTUATION }
        end
      end

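With the shared constants, each filter is now a single regex test per token instead of set lookups. Approximate behavior of the number and punctuation modes:

    t = PragmaticTokenizer::Tokenizer.new(numbers: :none)
    t.tokenize("Chapter IV has 20 pages")
    # => ["chapter", "has", "pages"]  # drops digit-bearing tokens and Roman numerals

    t = PragmaticTokenizer::Tokenizer.new(punctuation: :only)
    t.tokenize("Hello, world!")
    # => [",", "!"]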
@@ -253,45 +214,50 @@ module PragmaticTokenizer
      def mentions!
        case @mentions
        when :remove
-         @tokens.delete_if { |t| t =~ REGEXP_AT_SIGN_AT_START }
+         @tokens.delete_if { |token| token =~ Regex::ONLY_MENTION }
        when :keep_and_clean
-         @tokens.map! { |t| t =~ REGEXP_AT_SIGN_AT_START ? t.gsub!(REGEXP_AT_SIGN_AT_START, EMPTY_STRING) : t }
+         @tokens.map! { |token| token =~ Regex::ONLY_MENTION ? token[1..-1] : token }
        end
      end

      def hashtags!
        case @hashtags
        when :remove
-         @tokens.delete_if { |t| t =~ REGEXP_HASHTAG_AT_START }
+         @tokens.delete_if { |token| token =~ Regex::ONLY_HASHTAG }
        when :keep_and_clean
-         @tokens = @tokens
-           .flat_map { |t| t =~ REGEXP_HYPHEN_HASTAG ? t.split(REGEX_HYPHEN) : t }
-           .map { |t| t =~ REGEXP_HASHTAG_AT_START ? t.gsub!(REGEXP_HASHTAG_AT_START, EMPTY_STRING) : t }
+         @tokens.map! { |token| token =~ Regex::ONLY_HASHTAG ? token[1..-1] : token }
        end
      end

-     def remove_various!
-       @tokens.delete_if { |t| t =~ regex_various }
+     def remove_by_options!
+       @tokens.delete_if { |token| token =~ regex_by_options }
      end

-     def regex_various
-       @regex_various ||= begin
+     def regex_by_options
+       @regex_by_options ||= begin
          regex_array = []
-         regex_array << REGEX_EMOJI_UNIFIED if @remove_emoji
-         regex_array << REGEX_EMAIL if @remove_emails
-         regex_array << REGEX_URL if @remove_urls
-         regex_array << REGEX_DOMAIN if @remove_domains
+         regex_array << Regex::RANGE_UNUSUAL_AND_EMOJI if @remove_emoji
+         regex_array << Regex::ONLY_EMAIL if @remove_emails
+         regex_array << Regex::STARTS_WITH_HTTP if @remove_urls
+         regex_array << Regex::ONLY_DOMAIN2 if @remove_domains
          Regexp.union(regex_array)
        end
      end

      def split_long_words!
-       @tokens = @tokens
-         .flat_map { |t| (t.length > @long_word_split && t !~ REGEXP_SPLIT_CHECK) ? t.split(REGEX_LONG_WORD) : t }
+       @tokens = @tokens.flat_map { |token| split_long_word(token) }
+     end
+
+     def split_long_word(token)
+       return token unless @long_word_split
+       return token if token.length <= @long_word_split
+       return token if token =~ Regex::ONLY_HASHTAG_MENTION
+       return token if token =~ Regex::DOMAIN_OR_EMAIL
+       token.split(Regex::LONG_WORD_SPLIT)
      end

-     def chosen_case(txt)
-       @downcase ? Unicode.downcase(txt) : txt
+     def chosen_case(text)
+       @downcase ? Unicode.downcase(text) : text
      end

      def inverse_case(token)