pragmatic_tokenizer 3.0.4 → 3.2.0

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (36)
  1. checksums.yaml +5 -5
  2. data/lib/pragmatic_tokenizer/languages.rb +26 -26
  3. data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
  4. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
  5. data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
  6. data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
  7. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  8. data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
  9. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  10. data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
  11. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  12. data/lib/pragmatic_tokenizer/languages/french.rb +36 -9
  13. data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  15. data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
  16. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  17. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  19. data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
  20. data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
  21. data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
  22. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  23. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  25. data/lib/pragmatic_tokenizer/post_processor.rb +42 -88
  26. data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
  27. data/lib/pragmatic_tokenizer/regex.rb +150 -0
  28. data/lib/pragmatic_tokenizer/tokenizer.rb +81 -115
  29. data/lib/pragmatic_tokenizer/version.rb +1 -1
  30. data/pragmatic_tokenizer.gemspec +5 -6
  31. data/spec/languages/english_spec.rb +13 -0
  32. data/spec/languages/french_spec.rb +2 -2
  33. data/spec/performance_spec.rb +0 -1
  34. data/spec/spec_helper.rb +1 -1
  35. metadata +12 -12
  36. data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -62
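
Illustration (editor's sketch, not part of the published diff): the changes below reorganize internals — the new regex.rb, renamed private methods — while the gem's documented entry point stays the same. A minimal usage sketch under that assumption; the token output shown is indicative:

    require 'pragmatic_tokenizer'

    # Same call shape on 3.0.4 and 3.2.0: build a tokenizer with options,
    # then pass a String to #tokenize (downcase defaults to true).
    tokenizer = PragmaticTokenizer::Tokenizer.new(language: :en, punctuation: :none)
    tokenizer.tokenize("Hello there, world!")
    # => ["hello", "there", "world"]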
data/lib/pragmatic_tokenizer/regex.rb
@@ -0,0 +1,150 @@
+ module PragmaticTokenizer
+   class Regex
+
+     # Things that can or should be done:
+     # - check where the use of unicode categories helps (\p{Abbreviation})
+     # - use URI.parse and other libraries instead of regexp to identify urls, domains, emails
+     # - check multiple domain regex, we have spec issues when using one or the other
+     # - check multiple punctuation regex
+
+     # Text that needs to be tokenized is initially split into chunks of this length:
+     CHUNK_LONG_INPUT_TEXT = /\S.{1,10000}(?!\S)/m
+
+     # Ranges
+     RANGE_DINGBATS = /[\u2701-\u27BE]/ # e.g. ✁✎✳❄➾
+     RANGE_VARIATION_SELECTORS = /[\uFE00-\uFE0F]/ # alter the previous character
+     RANGE_FULLWIDTH = /[\uFF01-\uFF1F]/ # e.g. !"#'?
+     RANGE_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
+     RANGE_UNUSUAL_AND_EMOJI = /[\u203C-\u3299\u{1F000}-\u{1F644}]/
+
+     # Regular expressions which do not need to capture anything are enclosed in /(?: … )/ to enhance performance
+     COLON1 = /(?:(:)([[:print:]]{2,}))/ # two non-space after colon prevent matching emoticons
+     COLON2 = /(?::)/
+     COMMAS = /(?:([,‚])+)/
+     ENCLOSED_PLUS = /(?:([[:print:]]+)\+([[:print:]]+))/
+     EMAIL = /(?:[[:print:]]+[@@][[:print:]]+\.[[:print:]]+)/
+     DIGIT = /(?:[[:digit:]]+)/
+     ASTERISK = /(?:\*+)/
+     UNDERSCORE = /(?:_+)/
+     HYPHEN_OR_UNDERSCORE = /(?:[-_])/
+     LONG_WORD_SPLIT = /(?:[-_\/—–])/
+     PERIOD_AND_PRIOR = /(?:(.+\.))/
+     PERIOD_ONLY = /(?:(\.))/
+     CONTRACTIONS = /(?:[‘’‚‛‹›'´`])/
+     PUNCTUATION1 = /(?:([\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Ps}])+)/ # all punctuation categories except Pc (Connector) and Po (Other)
+     PUNCTUATION2 = /(?:(?<=\S)([!?#{RANGE_FULLWIDTH.source}]+))/
+     PUNCTUATION3 = /(?:[!%\-–\u00AD]+)/
+     PUNCTUATION4 = /(?:[..。]+)/
+     DINGBATS = /(?:(#{RANGE_DINGBATS.source}#{RANGE_VARIATION_SELECTORS.source}*+))/
+     NO_BREAK_SPACE = /(?:\u00A0+)/
+     HTTP = /(?:https?:\/\/)/
+     TIME_WITH_COLON = /(?:\d:\d)/
+     DOMAIN_PREFIX = /(?:https?:\/\/|www\.|[[:alpha:]]\.)/
+     DOMAIN_SUFFIX = /(?:[[:alpha:]]\.(?:com|net|org|edu|gov|mil|int|[[:alpha:]]{2}))/
+     DOMAIN1 = /(?:((https?:\/\/|)[[:print:]]+\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?))/
+     DOMAIN2 = /(?:[[:alnum:]]{2,}([\-.][[:alnum:]]+)*\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?)/
+     NOT_URL = /(?:^(?!#{DOMAIN_PREFIX.source})([[:print:]]*))/
+     HASHTAG_OR_MENTION = /(?:[@#@#][[:print:]]+)/
+     HASHTAG = /(?:[##][[:print:]]+)/
+     MENTION = /(?:[@@][[:print:]]+)/
+     HASHTAG_WITH_HYPHEN = /(?:^([##][[:digit:]]+)-)/
+     ONE_AS_EXCLAMATION = /(?:\D1+)/
+     ONES_EXCLAMATIONS = /(?:!+(1*+!*+)*+)/
+     MANY_PERIODS = /(?:^\.{2,}$)/
+     COPYRIGHT_TRADEMARK = /(?:[®©™]+)/
+     CONTROL_CHARACTER = /(?:[[:cntrl:]]+)/ # matches any character with hexadecimal value 00 through 1F or 7F
+     APOSTROPHE_AND_S = /(?:['’`́]s)/
+     ALSO_DECIMALS = /(?:[[:alpha:]]*+[[:digit:]]+)/
+     ACUTE_ACCENT_S = /(?:\s\u0301(?=s))/
+
+     # Regular expressions used to capture items
+     CAPTURE_UNUSUAL_AND_EMOJI = /(#{RANGE_UNUSUAL_AND_EMOJI.source})/
+     QUESTION_MARK_NOT_URL = /#{NOT_URL.source}(\?)/
+     # Should we change specs and also capture "/", just like we capture ":" and "?"?
+     SLASH_NOT_URL = /#{NOT_URL.source}\//
+     SHIFT_BOUNDARY_CHARACTERS = /([;^&|…«»„“¿¡≠]+)/
+     MULTIPLE_DOTS = /(\.{2,})/ # we keep all dots
+     MULTIPLE_DASHES = /(-){2,}/ # we only keep first dash
+     BRACKET = /([{}()\[\]])/
+     EXCLAMATION_BETWEEN_ALPHA = /(?<=[[:alpha:]])(!)(?=[[:alpha:]])/
+     PERCENT_BEFORE_DIGIT = /(%)\d+/
+     COMMA_BEFORE_NON_DIGIT = /(,)(?=\D)/
+     COMMA_AFTER_NON_DIGIT = /(?<=\D)(,)/
+     COLON_IN_URL = /(?<=[(https?|ftp)]):(?=\/\/)/
+     QUOTE_BEFORE_PRINT = /(('')|["“])(?=[[:print:]])/
+     QUOTE = /('')|["”]/
+     HYPHEN_AFTER_NON_WORD = /(?<=\W)(-)/
+     HYPHEN_BEFORE_NON_WORD = /(-)(?=\W)/
+
+     STARTS_WITH_COMMAS = /^#{COMMAS.source}/
+     STARTS_WITH_HTTP = /^#{HTTP.source}/
+     STARTS_WITH_DOMAIN = /^#{DOMAIN_PREFIX.source}/
+     STARTS_WITH_COLON1 = /^#{COLON1.source}/
+     STARTS_WITH_UNDERSCORE = /^#{UNDERSCORE.source}/
+     STARTS_WITH_PUNCTUATION3 = /^#{PUNCTUATION3.source}/
+
+     ENDS_WITH_DOMAIN = /#{DOMAIN_SUFFIX.source}$/
+     ENDS_WITH_PUNCTUATION1 = /#{PUNCTUATION1.source}$/
+     ENDS_WITH_PUNCTUATION2 = /#{PUNCTUATION2.source}$/
+     ENDS_WITH_COLON2 = /#{COLON2.source}$/
+     ENDS_WITH_UNDERSCORE = /#{UNDERSCORE.source}$/
+     ENDS_WITH_ONES_EXCLAMATIONS = /#{ONES_EXCLAMATIONS.source}$/
+     ENDS_WITH_EXCITED_ONE = /#{ONE_AS_EXCLAMATION.source}$/
+     ENDS_WITH_APOSTROPHE_AND_S = /#{APOSTROPHE_AND_S.source}$/
+     ENDS_WITH_ALPHA = /[[:alpha:]]$/
+     ENDS_WITH_DIGIT = /[[:digit:]]$/
+
+     ONLY_DECIMALS = /(?:^[[:digit:]]+$)/
+     NO_DECIMALS = /(?:^\D+$)/
+     ONLY_PUNCTUATION = /^[[[:punct:]]^|+]+$/
+     ONLY_ROMAN_NUMERALS = /^(?=[MDCLXVI])M*(C[MD]|D?C*)(X[CL]|L?X*)(I[XV]|V?I*)$/i
+     ONLY_EMAIL = /^#{EMAIL}$/
+     ONLY_HASHTAG_MENTION = /^#{HASHTAG_OR_MENTION}$/
+     ONLY_HASHTAG = /^#{HASHTAG}$/
+     ONLY_MENTION = /^#{MENTION}$/
+     ONLY_DOMAIN1 = /^#{DOMAIN1}$/
+     ONLY_DOMAIN2 = /^#{DOMAIN2}$/
+     ONLY_DOMAIN3 = Regexp.union(STARTS_WITH_DOMAIN, ENDS_WITH_DOMAIN)
+     DOMAIN_OR_EMAIL = Regexp.union(ONLY_DOMAIN1, ONLY_EMAIL)
+     UNDERSCORES_ASTERISK = Regexp.union(STARTS_WITH_UNDERSCORE, ENDS_WITH_UNDERSCORE, ASTERISK)
+     NO_DECIMALS_NO_NUMERALS = Regexp.union(ALSO_DECIMALS, ONLY_ROMAN_NUMERALS)
+
+     COMMAS_OR_PUNCTUATION = Regexp.union(
+         STARTS_WITH_COMMAS,
+         ENDS_WITH_PUNCTUATION1,
+         ENDS_WITH_PUNCTUATION2
+     )
+
+     # Can this constant name be clarified?
+     VARIOUS = Regexp.union(
+         SLASH_NOT_URL,
+         QUESTION_MARK_NOT_URL,
+         ENCLOSED_PLUS,
+         STARTS_WITH_COLON1,
+         DINGBATS,
+         HASHTAG_WITH_HYPHEN,
+         CAPTURE_UNUSUAL_AND_EMOJI
+     )
+
+     IRRELEVANT_CHARACTERS = Regexp.union(
+         STARTS_WITH_PUNCTUATION3,
+         ENDS_WITH_COLON2,
+         ENDS_WITH_ONES_EXCLAMATIONS,
+         CONTROL_CHARACTER,
+         COPYRIGHT_TRADEMARK,
+         RANGE_ALPHANUMERIC_SUPPLEMENT
+     )
+
+     PRE_PROCESS = Regexp.union(
+         SHIFT_BOUNDARY_CHARACTERS,
+         MULTIPLE_DOTS,
+         BRACKET,
+         MULTIPLE_DASHES,
+         EXCLAMATION_BETWEEN_ALPHA,
+         PERCENT_BEFORE_DIGIT,
+         COMMA_BEFORE_NON_DIGIT,
+         COMMA_AFTER_NON_DIGIT
+     )
+
+   end
+ end
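
Illustration (editor's sketch, not part of the published diff): CHUNK_LONG_INPUT_TEXT above drives the initial split in Tokenizer#tokenize. Each match starts at a non-space character, spans at most 10,001 characters, and the trailing lookahead (?!\S) keeps a match from ending in the middle of a token. With illustrative input:

    require 'pragmatic_tokenizer'

    # Long input is cut into whitespace-aligned chunks before tokenization,
    # so no token is ever split across two chunks.
    text   = "aa bb " * 4_000   # 24,000 characters
    chunks = text.scan(PragmaticTokenizer::Regex::CHUNK_LONG_INPUT_TEXT)
    chunks.size                              # => 3
    chunks.all? { |c| c.length <= 10_001 }   # => true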
data/lib/pragmatic_tokenizer/tokenizer.rb
@@ -1,70 +1,22 @@
- # -*- encoding : utf-8 -*-
  require 'set'
  require 'cgi'
+ require 'pragmatic_tokenizer/regex'
  require 'pragmatic_tokenizer/languages'
  require 'pragmatic_tokenizer/pre_processor'
  require 'pragmatic_tokenizer/post_processor'
- require 'pragmatic_tokenizer/full_stop_separator'
  require 'unicode'

  module PragmaticTokenizer
    class Tokenizer

-     PUNCTIATION_OPTIONS = Set.new([:all, :semi, :none, :only]).freeze
-     NUMBERS_OPTIONS = Set.new([:all, :semi, :none, :only]).freeze
-     MENTIONS_OPTIONS = Set.new([:keep_original, :keep_and_clean, :remove]).freeze
+     PUNCTUATION_OPTIONS = Set.new(%i[all semi none only]).freeze
+     NUMBERS_OPTIONS = Set.new(%i[all semi none only]).freeze
+     MENTIONS_OPTIONS = Set.new(%i[keep_original keep_and_clean remove]).freeze
      MAX_TOKEN_LENGTH = 50
-     EMPTY_STRING = ''.freeze
-     DOT_STRING = '.'.freeze
-     SPACE_STRING = ' '.freeze
-     REGEX_DOMAIN = /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix
-     REGEX_URL = /(http|https)(\.|:)/
-     REGEX_HYPHEN = /\-/
-     REGEX_LONG_WORD = /\-|\_/
-     REGEXP_SPLIT_CHECK = /@|@|(http)/
-     REGEX_CONTRACTIONS = /[‘’‚‛‹›'´`]/
-     REGEX_APOSTROPHE_S = /['’`́]s$/
-     REGEX_EMAIL = /\S+(@|@)\S+\.\S+/
-     REGEX_HASHTAG_OR_MENTION = /[@@#|#]/
-     REGEX_UNDERSCORE_AT_START = /(?<=\A)\_+/
-     REGEX_UNDERSCORE_AT_END = /\_+(?=\z)/
-     REGEX_ASTERISK = /\*+/
-     REGEX_UNIFIED1 = Regexp.union(REGEX_UNDERSCORE_AT_START,
-                                   REGEX_UNDERSCORE_AT_END,
-                                   REGEX_ASTERISK)
-     # https://en.wikipedia.org/wiki/Control_character
-     # matches any character with hexadecimal value 00 through 1F or 7F.
-     # Rubular: http://rubular.com/r/E83fpBoDjI
-     REGEXP_CONTROL = /[[:cntrl:]]/
-     REGEXP_ENDING_COLON = /\:(?=\z)/
-     REGEXP_EXCLAMATION_AT_START = /(?<=\A)!+(?=.+)/
-     REGEXP_EXCLAMATION_AT_END = /!+(1*!*)*(?=\z)/
-     REGEXP_HYPHEN_AT_START = /\A(-|–|\u{00AD})/
-     REGEXP_SPECIAL_SYMBOL = /[®©]/
-     REGEXP_PERCENT_AT_START = /\A\%/
-     # https://codepoints.net/enclosed_alphanumeric_supplement
-     REGEXP_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
-     REGEX_UNIFIED2 = Regexp.union(REGEXP_CONTROL,
-                                   REGEXP_ENDING_COLON,
-                                   REGEXP_EXCLAMATION_AT_START,
-                                   REGEXP_EXCLAMATION_AT_END,
-                                   REGEXP_HYPHEN_AT_START,
-                                   REGEXP_SPECIAL_SYMBOL,
-                                   REGEXP_PERCENT_AT_START,
-                                   REGEXP_ALPHANUMERIC_SUPPLEMENT)
-     REGEXP_ONE_AS_EXCLAMATION = /(?<=\D)1+(?=\z)/
-     REGEXP_HASHTAG_AT_START = /(?<=\A)(#|#)/
-     REGEXP_AT_SIGN_AT_START = /(?<=\A)(@|@)/
-     REGEXP_HYPHEN_HASTAG = /\A(#|#)\S+-/
-     REGEXP_EMOJI_SNOWFLAKE = /\u{2744}[\u{FE0F}|\u{FE0E}]?/
-     REGEX_EMOJI_UNIFIED = Regexp.union(REGEXP_EMOJI_SNOWFLAKE,
-                                        PragmaticTokenizer::Languages::Common::EMOJI_REGEX)
-     REGEXP_PUNCTUATION_ONLY = /\A[[:punct:]]+\z/
-     REGEXP_NUMBER_ONLY = /\A\d+\z/
-     REGEXP_NO_NUMBERS = /\A\D+\z/
-     REGEXP_NUMBER = /\D*\d+\d*/
-     REGEXP_CONSECUTIVE_DOTS = /\A\.{2,}\z/
-     REGEXP_CHUNK_STRING = /.{,10000}(?=\s|\z)/m
+     NOTHING = ''.freeze
+     DOT = '.'.freeze
+     SPACE = ' '.freeze
+     SINGLE_QUOTE = "'".freeze

      # @param [Hash] opts optional arguments

@@ -124,7 +76,7 @@ module PragmaticTokenizer
        @abbreviations = Set.new(opts[:abbreviations])
        @stop_words = Set.new(opts[:stop_words])

-       # TODO: why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
+       # Why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
        @contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
        @abbreviations += @language_module::ABBREVIATIONS if @abbreviations.empty?
        @stop_words += @language_module::STOP_WORDS if @stop_words.empty?
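
Illustration (editor's sketch, not part of the published diff): the merge logic above means a user-supplied list overrides the language default only when non-empty. Using the gem's documented remove_stop_words option; token output is indicative:

    require 'pragmatic_tokenizer'

    # An explicit stop_words list takes precedence over the language default;
    # with remove_stop_words: true, matching tokens are dropped.
    t = PragmaticTokenizer::Tokenizer.new(remove_stop_words: true, stop_words: ["is", "the"])
    t.tokenize("This is the way.")
    # => ["this", "way", "."]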
@@ -136,34 +88,43 @@ module PragmaticTokenizer
          @stop_words += language::STOP_WORDS
        end

-       raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTIATION_OPTIONS.include?(@punctuation)
+       raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTUATION_OPTIONS.include?(@punctuation)
        raise "Numbers argument can be only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
        raise "Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)
-       raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == Fixnum || @minimum_length.nil?
-       raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == Fixnum || @long_word_split.nil?
+
+       integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer
+
+       raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == integer_class || @minimum_length.nil?
+       raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == integer_class || @long_word_split.nil?
      end

      # @param [String] text to be tokenized

      def tokenize(text)
        return [] unless text
-       raise "In Pragmatic Tokenizer text must be a String" unless text.class == String
+       raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
        CGI.unescapeHTML(text)
-          .scan(REGEXP_CHUNK_STRING)
-          .flat_map { |segment| post_process(pre_process(segment)) }
+          .scan(Regex::CHUNK_LONG_INPUT_TEXT)
+          .flat_map { |segment| process_segment(segment) }
      end

      private

-     def pre_process(text)
-       text
+     def process_segment(segment)
+       pre_processed = pre_process(segment)
+       cased_segment = chosen_case(pre_processed)
+       @tokens = PostProcessor.new(text: cased_segment, abbreviations: @abbreviations, downcase: @downcase).call
+       post_process_tokens
+     end
+
+     def pre_process(segment)
+       segment
          .extend(PragmaticTokenizer::PreProcessor)
          .pre_process(language: @language_module)
      end

-     def post_process(text)
-       @tokens = run_post_processor(text)
-       remove_various!
+     def post_process_tokens
+       remove_by_options!
        process_numbers!
        process_punctuation!
        expand_contractions! if @expand_contractions
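
Background on the integer_class guard above: Ruby 2.4 unified Fixnum and Bignum into Integer and deprecated the old constants, so the class to compare against must be chosen at runtime. A short sketch of the same idea (illustrative, not from the diff):

    # On Ruby < 2.4, 5.class is Fixnum; from 2.4 on it is Integer.
    integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer
    5.class == integer_class   # => true on either side of the 2.4 boundary
    5.is_a?(Integer)           # also true everywhere, since Fixnum < Integer

An is_a?(Integer) test would have avoided the version check entirely; the gem instead keeps the exact-class comparison of the old code.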
@@ -177,45 +138,45 @@ module PragmaticTokenizer
        @tokens.reject(&:empty?)
      end

-     def run_post_processor(text)
-       PostProcessor.new(
-           text: chosen_case(text),
-           abbreviations: @abbreviations,
-           downcase: @downcase
-       ).post_process
-     end
-
      def expand_contractions!
-       @tokens = @tokens.flat_map { |t| expand_token_contraction(t) }
+       @tokens = @tokens.flat_map { |token| expand_token_contraction(token) }
      end

      def expand_token_contraction(token)
-       normalized = inverse_case(token.gsub(REGEX_CONTRACTIONS, "'".freeze))
+       normalized = inverse_case(token.gsub(Regex::CONTRACTIONS, SINGLE_QUOTE))
        return token unless @contractions.key?(normalized)
-       result = @contractions[normalized].split(SPACE_STRING)
+       result = @contractions[normalized].split(SPACE)
        result[0] = Unicode.capitalize(result[0]) unless @downcase
        result
      end

      def clean!
        @tokens = @tokens
-           .flat_map { |t| t !~ REGEX_HASHTAG_OR_MENTION ? t.split(REGEX_UNIFIED1) : t }
-           .map! { |t| t !~ REGEX_HASHTAG_OR_MENTION ? t.gsub(REGEXP_ONE_AS_EXCLAMATION, EMPTY_STRING) : t }
-           .map! { |t| t.gsub(REGEX_UNIFIED2, EMPTY_STRING) }
-           .delete_if { |t| unclean_token?(t) }
+           .flat_map { |token| split_underscores_asterisk(token) }
+           .map! { |token| remove_irrelevant_characters(token) }
+           .delete_if { |token| many_dots?(token) }
      end

-     def unclean_token?(token)
-       return true if PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(token)
-       return true if token.length > MAX_TOKEN_LENGTH
-       return true if token.include?('\\'.freeze)
-       token =~ REGEXP_CONSECUTIVE_DOTS
+     def split_underscores_asterisk(token)
+       return token if token =~ Regex::ONLY_HASHTAG_MENTION
+       token.split(Regex::UNDERSCORES_ASTERISK)
+     end
+
+     def remove_irrelevant_characters(token)
+       token.gsub!(Regex::IRRELEVANT_CHARACTERS, NOTHING)
+       return token if token =~ Regex::ONLY_HASHTAG_MENTION
+       token.gsub!(Regex::ENDS_WITH_EXCITED_ONE, NOTHING)
+       token
+     end
+
+     def many_dots?(token)
+       token =~ Regex::MANY_PERIODS
      end

      def classic_filter!
        @tokens.map! do |token|
-         token.delete!(DOT_STRING) if @abbreviations.include?(token.chomp(DOT_STRING))
-         token.sub!(REGEX_APOSTROPHE_S, EMPTY_STRING)
+         token.delete!(DOT) if @abbreviations.include?(token.chomp(DOT))
+         token.sub!(Regex::ENDS_WITH_APOSTROPHE_AND_S, NOTHING)
          token
        end
      end
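
Illustration (editor's sketch, not part of the published diff): the renamed contraction path above normalizes curly apostrophes to a straight quote, then looks the token up in the language's CONTRACTIONS map ("isn't" => "is not" in English). Token output is indicative, assuming default options otherwise:

    require 'pragmatic_tokenizer'

    # expand_contractions: true routes every token through
    # expand_token_contraction; unmatched tokens pass through untouched.
    t = PragmaticTokenizer::Tokenizer.new(expand_contractions: true)
    t.tokenize("Isn't it?")
    # => ["is", "not", "it", "?"]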
@@ -223,26 +184,26 @@ module PragmaticTokenizer
      def process_numbers!
        case @numbers
        when :semi
-         @tokens.delete_if { |t| t =~ REGEXP_NUMBER_ONLY }
+         @tokens.delete_if { |token| token =~ Regex::ONLY_DECIMALS }
        when :none
-         @tokens.delete_if { |t| t =~ REGEXP_NUMBER || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(inverse_case(t)) }
+         @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS_NO_NUMERALS }
        when :only
-         @tokens.delete_if { |t| t =~ REGEXP_NO_NUMBERS }
+         @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS }
        end
      end

      def remove_short_tokens!
-       @tokens.delete_if { |t| t.length < @minimum_length }
+       @tokens.delete_if { |token| token.length < @minimum_length }
      end

      def process_punctuation!
        case @punctuation
        when :semi
-         @tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION.include?(t) }
+         @tokens.delete_if { |token| token =~ Regex::PUNCTUATION4 }
        when :none
-         @tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) || t =~ REGEXP_PUNCTUATION_ONLY }
+         @tokens.delete_if { |token| token =~ Regex::ONLY_PUNCTUATION }
        when :only
-         @tokens.keep_if { |t| PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
+         @tokens.keep_if { |token| token =~ Regex::ONLY_PUNCTUATION }
        end
      end

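Illustration (editor's sketch, not part of the published diff): the two case statements above implement the :semi/:none/:only filters, now expressed through Regex constants instead of set lookups. A combined sketch; output is indicative:

    require 'pragmatic_tokenizer'

    # numbers: :none drops any token containing digits (and Roman numerals);
    # punctuation: :none drops punctuation-only tokens.
    t = PragmaticTokenizer::Tokenizer.new(punctuation: :none, numbers: :none)
    t.tokenize("Chapter 3: Results!")
    # => ["chapter", "results"]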
@@ -253,45 +214,50 @@ module PragmaticTokenizer
      def mentions!
        case @mentions
        when :remove
-         @tokens.delete_if { |t| t =~ REGEXP_AT_SIGN_AT_START }
+         @tokens.delete_if { |token| token =~ Regex::ONLY_MENTION }
        when :keep_and_clean
-         @tokens.map! { |t| t =~ REGEXP_AT_SIGN_AT_START ? t.gsub!(REGEXP_AT_SIGN_AT_START, EMPTY_STRING) : t }
+         @tokens.map! { |token| token =~ Regex::ONLY_MENTION ? token[1..-1] : token }
        end
      end

      def hashtags!
        case @hashtags
        when :remove
-         @tokens.delete_if { |t| t =~ REGEXP_HASHTAG_AT_START }
+         @tokens.delete_if { |token| token =~ Regex::ONLY_HASHTAG }
        when :keep_and_clean
-         @tokens = @tokens
-             .flat_map { |t| t =~ REGEXP_HYPHEN_HASTAG ? t.split(REGEX_HYPHEN) : t }
-             .map { |t| t =~ REGEXP_HASHTAG_AT_START ? t.gsub!(REGEXP_HASHTAG_AT_START, EMPTY_STRING) : t }
+         @tokens.map! { |token| token =~ Regex::ONLY_HASHTAG ? token[1..-1] : token }
        end
      end

-     def remove_various!
-       @tokens.delete_if { |t| t =~ regex_various }
+     def remove_by_options!
+       @tokens.delete_if { |token| token =~ regex_by_options }
      end

-     def regex_various
-       @regex_various ||= begin
+     def regex_by_options
+       @regex_by_options ||= begin
          regex_array = []
-         regex_array << REGEX_EMOJI_UNIFIED if @remove_emoji
-         regex_array << REGEX_EMAIL if @remove_emails
-         regex_array << REGEX_URL if @remove_urls
-         regex_array << REGEX_DOMAIN if @remove_domains
+         regex_array << Regex::RANGE_UNUSUAL_AND_EMOJI if @remove_emoji
+         regex_array << Regex::ONLY_EMAIL if @remove_emails
+         regex_array << Regex::STARTS_WITH_HTTP if @remove_urls
+         regex_array << Regex::ONLY_DOMAIN2 if @remove_domains
          Regexp.union(regex_array)
        end
      end

      def split_long_words!
-       @tokens = @tokens
-           .flat_map { |t| (t.length > @long_word_split && t !~ REGEXP_SPLIT_CHECK) ? t.split(REGEX_LONG_WORD) : t }
+       @tokens = @tokens.flat_map { |token| split_long_word(token) }
+     end
+
+     def split_long_word(token)
+       return token unless @long_word_split
+       return token if token.length <= @long_word_split
+       return token if token =~ Regex::ONLY_HASHTAG_MENTION
+       return token if token =~ Regex::DOMAIN_OR_EMAIL
+       token.split(Regex::LONG_WORD_SPLIT)
      end

-     def chosen_case(txt)
-       @downcase ? Unicode.downcase(txt) : txt
+     def chosen_case(text)
+       @downcase ? Unicode.downcase(text) : text
      end

      def inverse_case(token)