pragmatic_tokenizer 3.0.3 → 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/README.md +1 -1
- data/lib/pragmatic_tokenizer/languages.rb +26 -26
- data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
- data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +36 -9
- data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/post_processor.rb +42 -88
- data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
- data/lib/pragmatic_tokenizer/regex.rb +149 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +82 -116
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +5 -6
- data/spec/languages/deutsch_spec.rb +1 -1
- data/spec/languages/english_spec.rb +52 -0
- data/spec/languages/french_spec.rb +2 -2
- data/spec/performance_spec.rb +1 -1
- data/spec/spec_helper.rb +1 -1
- metadata +8 -8
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -63
@@ -0,0 +1,149 @@
|
|
1
|
+
module PragmaticTokenizer
|
2
|
+
class Regex
|
3
|
+
|
4
|
+
# Things that can or should be done:
|
5
|
+
# - check where the use of unicode categories helps (\p{Abbreviation})
|
6
|
+
# - use URI.parse and other libraries instead of regexp to identify urls, domains, emails
|
7
|
+
# - check multiple domain regex, we have spec issues when using one or the other
|
8
|
+
# - check multiple punctuation regex
|
9
|
+
|
10
|
+
# Text that needs to be tokenized is initially split into chunks of this length:
|
11
|
+
# A chunk starts at a non-whitespace character, extends over up to 10000 further
# characters (newlines included, because of /m) and must end at a whitespace boundary.
# The lower bound is 0 (not 1) so that a lone trailing non-whitespace character,
# e.g. the input "a", still forms a chunk instead of being silently dropped.
CHUNK_LONG_INPUT_TEXT = /\S.{0,10000}(?!\S)/m
|
12
|
+
|
13
|
+
# Ranges
|
14
|
+
RANGE_DINGBATS = /[\u2701-\u27BE]/ # e.g. ✁✎✳❄➾
|
15
|
+
RANGE_VARIATION_SELECTORS = /[\uFE00-\uFE0F]/ # alter the previous character
|
16
|
+
RANGE_FULLWIDTH = /[\uFF01-\ufF1F]/ # e.g. !"#'?
|
17
|
+
RANGE_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
|
18
|
+
RANGE_UNUSUAL_AND_EMOJI = /[\u203C-\u3299\u{1F000}-\u{1F644}]/
|
19
|
+
|
20
|
+
# Regular expressions which do not need to capture anything are enclosed in /(?: … )/ to enhance performance
|
21
|
+
COLON1 = /(?:(:)([[:print:]]{2,}))/ # two non-space after colon prevent matching emoticons
|
22
|
+
COLON2 = /(?::)/
|
23
|
+
COMMAS = /(?:([,‚])+)/
|
24
|
+
ENCLOSED_PLUS = /(?:([[:print:]]+)\+([[:print:]]+))/
|
25
|
+
EMAIL = /(?:[[:print:]]+[@@][[:print:]]+\.[[:print:]]+)/
|
26
|
+
DIGIT = /(?:[[:digit:]]+)/
|
27
|
+
ASTERISK = /(?:\*+)/
|
28
|
+
UNDERSCORE = /(?:_+)/
|
29
|
+
HYPHEN_OR_UNDERSCORE = /(?:[-_])/
|
30
|
+
PERIOD_AND_PRIOR = /(?:(.+\.))/
|
31
|
+
PERIOD_ONLY = /(?:(\.))/
|
32
|
+
CONTRACTIONS = /(?:[‘’‚‛‹›'´`])/
|
33
|
+
PUNCTUATION1 = /(?:([\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Ps}])+)/ # all punctuation categories except Pc (Connector) and Po (other)
|
34
|
+
PUNCTUATION2 = /(?:(?<=\S)([!?#{RANGE_FULLWIDTH.source}]+))/
|
35
|
+
PUNCTUATION3 = /(?:[!%\-–\u00AD]+)/
|
36
|
+
PUNCTUATION4 = /(?:[..。]+)/
|
37
|
+
DINGBATS = /(?:(#{RANGE_DINGBATS.source}#{RANGE_VARIATION_SELECTORS.source}*+))/
|
38
|
+
NO_BREAK_SPACE = /(?:\u00A0+)/
|
39
|
+
HTTP = /(?:https?:\/\/)/
|
40
|
+
TIME_WITH_COLON = /(?:\d:\d)/
|
41
|
+
DOMAIN_PREFIX = /(?:https?:\/\/|www\.|[[:alpha:]]\.)/
|
42
|
+
DOMAIN_SUFFIX = /(?:[[:alpha:]]\.(?:com|net|org|edu|gov|mil|int|[[:alpha:]]{2}))/
|
43
|
+
DOMAIN1 = /(?:((https?:\/\/|)[[:print:]]+\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?))/
|
44
|
+
DOMAIN2 = /(?:[[:alnum:]]{2,}([\-.][[:alnum:]]+)*\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?)/
|
45
|
+
NOT_URL = /(?:^(?!#{DOMAIN_PREFIX.source})([[:print:]]*))/
|
46
|
+
HASHTAG_OR_MENTION = /(?:[@#@#][[:print:]]+)/
|
47
|
+
HASHTAG = /(?:[##][[:print:]]+)/
|
48
|
+
MENTION = /(?:[@@][[:print:]]+)/
|
49
|
+
HASHTAG_WITH_HYPHEN = /(?:^([##][[:digit:]]+)-)/
|
50
|
+
ONE_AS_EXCLAMATION = /(?:\D1+)/
|
51
|
+
ONES_EXCLAMATIONS = /(?:!+(1*+!*+)*+)/
|
52
|
+
MANY_PERIODS = /(?:^\.{2,}$)/
|
53
|
+
COPYRIGHT_TRADEMARK = /(?:[®©™]+)/
|
54
|
+
CONTROL_CHARACTER = /(?:[[:cntrl:]]+)/ # matches any character with hexadecimal value 00 through 1F or 7F.
|
55
|
+
APOSTROPHE_AND_S = /(?:['’`́]s)/
|
56
|
+
ALSO_DECIMALS = /(?:[[:alpha:]]*+[[:digit:]]+)/
|
57
|
+
ACUTE_ACCENT_S = /(?:\s\u0301(?=s))/
|
58
|
+
|
59
|
+
# Regular expressions used to capture items
|
60
|
+
CAPTURE_UNUSUAL_AND_EMOJI = /(#{RANGE_UNUSUAL_AND_EMOJI.source})/
|
61
|
+
QUESTION_MARK_NOT_URL = /#{NOT_URL.source}(\?)/
|
62
|
+
# Should we change specs and also capture "/", just like we capture ":" and "?"?
|
63
|
+
SLASH_NOT_URL = /#{NOT_URL.source}\//
|
64
|
+
SHIFT_BOUNDARY_CHARACTERS = /([;^&|…«»„“¿¡≠]+)/
|
65
|
+
MULTIPLE_DOTS = /(\.{2,})/ # we keep all dots
|
66
|
+
MULTIPLE_DASHES = /(-){2,}/ # we only keep first dash
|
67
|
+
BRACKET = /([{}()\[\]])/
|
68
|
+
EXCLAMATION_BETWEEN_ALPHA = /(?<=[[:alpha:]])(!)(?=[[:alpha:]])/
|
69
|
+
PERCENT_BEFORE_DIGIT = /(%)\d+/
|
70
|
+
COMMA_BEFORE_NON_DIGIT = /(,)(?=\D)/
|
71
|
+
COMMA_AFTER_NON_DIGIT = /(?<=\D)(,)/
|
72
|
+
# Matches the ":" of a URL scheme separator, i.e. a colon directly preceded by
# http, https or ftp and directly followed by "//".
# The previous pattern used a character class ([(https?|ftp)]), which only checked that
# the single preceding character was one of "(", ")", "h", "t", "p", "s", "?", "|", "f".
# Lookbehind alternatives are spelled out because Ruby does not allow "?" inside a lookbehind.
COLON_IN_URL = /(?<=https|http|ftp):(?=\/\/)/
|
73
|
+
QUOTE_BEFORE_PRINT = /(('')|["“])(?=[[:print:]])/
|
74
|
+
QUOTE = /('')|["”]/
|
75
|
+
HYPHEN_AFTER_NON_WORD = /(?<=\W)(-)/
|
76
|
+
HYPHEN_BEFORE_NON_WORD = /(-)(?=\W)/
|
77
|
+
|
78
|
+
STARTS_WITH_COMMAS = /^#{COMMAS.source}/
|
79
|
+
STARTS_WITH_HTTP = /^#{HTTP.source}/
|
80
|
+
STARTS_WITH_DOMAIN = /^#{DOMAIN_PREFIX.source}/
|
81
|
+
STARTS_WITH_COLON1 = /^#{COLON1.source}/
|
82
|
+
STARTS_WITH_UNDERSCORE = /^#{UNDERSCORE.source}/
|
83
|
+
STARTS_WITH_PUNCTUATION3 = /^#{PUNCTUATION3.source}/
|
84
|
+
|
85
|
+
ENDS_WITH_DOMAIN = /#{DOMAIN_SUFFIX.source}$/
|
86
|
+
ENDS_WITH_PUNCTUATION1 = /#{PUNCTUATION1.source}$/
|
87
|
+
ENDS_WITH_PUNCTUATION2 = /#{PUNCTUATION2.source}$/
|
88
|
+
ENDS_WITH_COLON2 = /#{COLON2.source}$/
|
89
|
+
ENDS_WITH_UNDERSCORE = /#{UNDERSCORE.source}$/
|
90
|
+
ENDS_WITH_ONES_EXCLAMATIONS = /#{ONES_EXCLAMATIONS.source}$/
|
91
|
+
ENDS_WITH_EXCITED_ONE = /#{ONE_AS_EXCLAMATION.source}$/
|
92
|
+
ENDS_WITH_APOSTROPHE_AND_S = /#{APOSTROPHE_AND_S.source}$/
|
93
|
+
ENDS_WITH_ALPHA = /[[:alpha:]]$/
|
94
|
+
ENDS_WITH_DIGIT = /[[:digit:]]$/
|
95
|
+
|
96
|
+
ONLY_DECIMALS = /(?:^[[:digit:]]+$)/
|
97
|
+
NO_DECIMALS = /(?:^\D+$)/
|
98
|
+
ONLY_PUNCTUATION = /^[[[:punct:]]^|+]+$/
|
99
|
+
ONLY_ROMAN_NUMERALS = /^(?=[MDCLXVI])M*(C[MD]|D?C*)(X[CL]|L?X*)(I[XV]|V?I*)$/i
|
100
|
+
ONLY_EMAIL = /^#{EMAIL}$/
|
101
|
+
ONLY_HASHTAG_MENTION = /^#{HASHTAG_OR_MENTION}$/
|
102
|
+
ONLY_HASHTAG = /^#{HASHTAG}$/
|
103
|
+
ONLY_MENTION = /^#{MENTION}$/
|
104
|
+
ONLY_DOMAIN1 = /^#{DOMAIN1}$/
|
105
|
+
ONLY_DOMAIN2 = /^#{DOMAIN2}$/
|
106
|
+
ONLY_DOMAIN3 = Regexp.union(STARTS_WITH_DOMAIN, ENDS_WITH_DOMAIN)
|
107
|
+
DOMAIN_OR_EMAIL = Regexp.union(ONLY_DOMAIN1, ONLY_EMAIL)
|
108
|
+
UNDERSCORES_ASTERISK = Regexp.union(STARTS_WITH_UNDERSCORE, ENDS_WITH_UNDERSCORE, ASTERISK)
|
109
|
+
NO_DECIMALS_NO_NUMERALS = Regexp.union(ALSO_DECIMALS, ONLY_ROMAN_NUMERALS)
|
110
|
+
|
111
|
+
COMMAS_OR_PUNCTUATION = Regexp.union(
|
112
|
+
STARTS_WITH_COMMAS,
|
113
|
+
ENDS_WITH_PUNCTUATION1,
|
114
|
+
ENDS_WITH_PUNCTUATION2
|
115
|
+
)
|
116
|
+
|
117
|
+
# Can this constant name be clarified?
|
118
|
+
VARIOUS = Regexp.union(
|
119
|
+
SLASH_NOT_URL,
|
120
|
+
QUESTION_MARK_NOT_URL,
|
121
|
+
ENCLOSED_PLUS,
|
122
|
+
STARTS_WITH_COLON1,
|
123
|
+
DINGBATS,
|
124
|
+
HASHTAG_WITH_HYPHEN,
|
125
|
+
CAPTURE_UNUSUAL_AND_EMOJI
|
126
|
+
)
|
127
|
+
|
128
|
+
IRRELEVANT_CHARACTERS = Regexp.union(
|
129
|
+
STARTS_WITH_PUNCTUATION3,
|
130
|
+
ENDS_WITH_COLON2,
|
131
|
+
ENDS_WITH_ONES_EXCLAMATIONS,
|
132
|
+
CONTROL_CHARACTER,
|
133
|
+
COPYRIGHT_TRADEMARK,
|
134
|
+
RANGE_ALPHANUMERIC_SUPPLEMENT
|
135
|
+
)
|
136
|
+
|
137
|
+
PRE_PROCESS = Regexp.union(
|
138
|
+
SHIFT_BOUNDARY_CHARACTERS,
|
139
|
+
MULTIPLE_DOTS,
|
140
|
+
BRACKET,
|
141
|
+
MULTIPLE_DASHES,
|
142
|
+
EXCLAMATION_BETWEEN_ALPHA,
|
143
|
+
PERCENT_BEFORE_DIGIT,
|
144
|
+
COMMA_BEFORE_NON_DIGIT,
|
145
|
+
COMMA_AFTER_NON_DIGIT
|
146
|
+
)
|
147
|
+
|
148
|
+
end
|
149
|
+
end
|
@@ -1,69 +1,22 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
1
|
require 'set'
|
3
2
|
require 'cgi'
|
3
|
+
require 'pragmatic_tokenizer/regex'
|
4
4
|
require 'pragmatic_tokenizer/languages'
|
5
5
|
require 'pragmatic_tokenizer/pre_processor'
|
6
6
|
require 'pragmatic_tokenizer/post_processor'
|
7
|
-
require 'pragmatic_tokenizer/full_stop_separator'
|
8
7
|
require 'unicode'
|
9
8
|
|
10
9
|
module PragmaticTokenizer
|
11
10
|
class Tokenizer
|
12
11
|
|
13
|
-
|
14
|
-
NUMBERS_OPTIONS = Set.new([
|
15
|
-
MENTIONS_OPTIONS = Set.new([
|
12
|
+
PUNCTUATION_OPTIONS = Set.new(%i[all semi none only]).freeze
|
13
|
+
NUMBERS_OPTIONS = Set.new(%i[all semi none only]).freeze
|
14
|
+
MENTIONS_OPTIONS = Set.new(%i[keep_original keep_and_clean remove]).freeze
|
16
15
|
MAX_TOKEN_LENGTH = 50
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
REGEX_URL = /(http|https)(\.|:)/
|
22
|
-
REGEX_HYPHEN = /\-/
|
23
|
-
REGEX_UNDERSCORE = /\_/
|
24
|
-
REGEX_CONTRACTIONS = /[‘’‚‛‹›'´`]/
|
25
|
-
REGEX_APOSTROPHE_S = /['’`́]s$/
|
26
|
-
REGEX_EMAIL = /\S+(@|@)\S+\.\S+/
|
27
|
-
REGEX_HASHTAG_OR_MENTION = /[@@#|#]/
|
28
|
-
REGEX_UNDERSCORE_AT_START = /(?<=\A)\_+/
|
29
|
-
REGEX_UNDERSCORE_AT_END = /\_+(?=\z)/
|
30
|
-
REGEX_ASTERISK = /\*+/
|
31
|
-
REGEX_UNIFIED1 = Regexp.union(REGEX_UNDERSCORE_AT_START,
|
32
|
-
REGEX_UNDERSCORE_AT_END,
|
33
|
-
REGEX_ASTERISK)
|
34
|
-
# https://en.wikipedia.org/wiki/Control_character
|
35
|
-
# matches any character with hexadecimal value 00 through 1F or 7F.
|
36
|
-
# Rubular: http://rubular.com/r/E83fpBoDjI
|
37
|
-
REGEXP_CONTROL = /[[:cntrl:]]/
|
38
|
-
REGEXP_ENDING_COLON = /\:(?=\z)/
|
39
|
-
REGEXP_EXCLAMATION_AT_START = /(?<=\A)!+(?=.+)/
|
40
|
-
REGEXP_EXCLAMATION_AT_END = /!+(1*!*)*(?=\z)/
|
41
|
-
REGEXP_HYPHEN_AT_START = /\A(-|–|\u{00AD})/
|
42
|
-
REGEXP_SPECIAL_SYMBOL = /[®©]/
|
43
|
-
REGEXP_PERCENT_AT_START = /\A\%/
|
44
|
-
# https://codepoints.net/enclosed_alphanumeric_supplement
|
45
|
-
REGEXP_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
|
46
|
-
REGEX_UNIFIED2 = Regexp.union(REGEXP_CONTROL,
|
47
|
-
REGEXP_ENDING_COLON,
|
48
|
-
REGEXP_EXCLAMATION_AT_START,
|
49
|
-
REGEXP_EXCLAMATION_AT_END,
|
50
|
-
REGEXP_HYPHEN_AT_START,
|
51
|
-
REGEXP_SPECIAL_SYMBOL,
|
52
|
-
REGEXP_PERCENT_AT_START,
|
53
|
-
REGEXP_ALPHANUMERIC_SUPPLEMENT)
|
54
|
-
REGEXP_ONE_AS_EXCLAMATION = /(?<=\D)1+(?=\z)/
|
55
|
-
REGEXP_HASHTAG_AT_START = /(?<=\A)(#|#)/
|
56
|
-
REGEXP_AT_SIGN_AT_START = /(?<=\A)(@|@)/
|
57
|
-
REGEXP_HYPHEN_HASTAG = /\A(#|#)\S+-/
|
58
|
-
REGEXP_EMOJI_SNOWFLAKE = /\u{2744}[\u{FE0F}|\u{FE0E}]?/
|
59
|
-
REGEX_EMOJI_UNIFIED = Regexp.union(REGEXP_EMOJI_SNOWFLAKE,
|
60
|
-
PragmaticTokenizer::Languages::Common::EMOJI_REGEX)
|
61
|
-
REGEXP_PUNCTUATION_ONLY = /\A[[:punct:]]+\z/
|
62
|
-
REGEXP_NUMBER_ONLY = /\A\d+\z/
|
63
|
-
REGEXP_NO_NUMBERS = /\A\D+\z/
|
64
|
-
REGEXP_NUMBER = /\D*\d+\d*/
|
65
|
-
REGEXP_CONSECUTIVE_DOTS = /\A\.{2,}\z/
|
66
|
-
REGEXP_CHUNK_STRING = /.{,10000}(?=\s|\z)/m
|
16
|
+
NOTHING = ''.freeze
|
17
|
+
DOT = '.'.freeze
|
18
|
+
SPACE = ' '.freeze
|
19
|
+
SINGLE_QUOTE = "'".freeze
|
67
20
|
|
68
21
|
# @param [Hash] opts optional arguments
|
69
22
|
|
@@ -123,10 +76,10 @@ module PragmaticTokenizer
|
|
123
76
|
@abbreviations = Set.new(opts[:abbreviations])
|
124
77
|
@stop_words = Set.new(opts[:stop_words])
|
125
78
|
|
126
|
-
#
|
79
|
+
# Why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
|
127
80
|
@contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
|
128
81
|
@abbreviations += @language_module::ABBREVIATIONS if @abbreviations.empty?
|
129
|
-
@stop_words += @language_module::STOP_WORDS if @stop_words.empty?
|
82
|
+
@stop_words += @language_module::STOP_WORDS if @stop_words.empty?
|
130
83
|
|
131
84
|
@filter_languages.each do |lang|
|
132
85
|
language = Languages.get_language_by_code(lang)
|
@@ -135,34 +88,43 @@ module PragmaticTokenizer
|
|
135
88
|
@stop_words += language::STOP_WORDS
|
136
89
|
end
|
137
90
|
|
138
|
-
raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless
|
91
|
+
raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTUATION_OPTIONS.include?(@punctuation)
|
139
92
|
raise "Numbers argument can be only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
|
140
93
|
raise "Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)
|
141
|
-
|
142
|
-
|
94
|
+
|
95
|
+
integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer
|
96
|
+
|
97
|
+
raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == integer_class || @minimum_length.nil?
|
98
|
+
raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == integer_class || @long_word_split.nil?
|
143
99
|
end
|
144
100
|
|
145
101
|
# @param [String] text to be tokenized
|
146
102
|
|
147
103
|
def tokenize(text)
|
148
104
|
return [] unless text
|
149
|
-
raise "In
|
105
|
+
raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
|
150
106
|
CGI.unescapeHTML(text)
|
151
|
-
.scan(
|
152
|
-
.flat_map { |segment|
|
107
|
+
.scan(Regex::CHUNK_LONG_INPUT_TEXT)
|
108
|
+
.flat_map { |segment| process_segment(segment) }
|
153
109
|
end
|
154
110
|
|
155
111
|
private
|
156
112
|
|
157
|
-
def
|
158
|
-
|
113
|
+
def process_segment(segment)
|
114
|
+
pre_processed = pre_process(segment)
|
115
|
+
cased_segment = chosen_case(pre_processed)
|
116
|
+
@tokens = PostProcessor.new(text: cased_segment, abbreviations: @abbreviations, downcase: @downcase).call
|
117
|
+
post_process_tokens
|
118
|
+
end
|
119
|
+
|
120
|
+
def pre_process(segment)
|
121
|
+
segment
|
159
122
|
.extend(PragmaticTokenizer::PreProcessor)
|
160
123
|
.pre_process(language: @language_module)
|
161
124
|
end
|
162
125
|
|
163
|
-
def
|
164
|
-
|
165
|
-
remove_various!
|
126
|
+
def post_process_tokens
|
127
|
+
remove_by_options!
|
166
128
|
process_numbers!
|
167
129
|
process_punctuation!
|
168
130
|
expand_contractions! if @expand_contractions
|
@@ -176,45 +138,45 @@ module PragmaticTokenizer
|
|
176
138
|
@tokens.reject(&:empty?)
|
177
139
|
end
|
178
140
|
|
179
|
-
def run_post_processor(text)
|
180
|
-
PostProcessor.new(
|
181
|
-
text: chosen_case(text),
|
182
|
-
abbreviations: @abbreviations,
|
183
|
-
downcase: @downcase
|
184
|
-
).post_process
|
185
|
-
end
|
186
|
-
|
187
141
|
def expand_contractions!
|
188
|
-
@tokens = @tokens.flat_map { |
|
142
|
+
@tokens = @tokens.flat_map { |token| expand_token_contraction(token) }
|
189
143
|
end
|
190
144
|
|
191
145
|
def expand_token_contraction(token)
|
192
|
-
normalized = inverse_case(token.gsub(
|
146
|
+
normalized = inverse_case(token.gsub(Regex::CONTRACTIONS, SINGLE_QUOTE))
|
193
147
|
return token unless @contractions.key?(normalized)
|
194
|
-
result = @contractions[normalized].split(
|
148
|
+
result = @contractions[normalized].split(SPACE)
|
195
149
|
result[0] = Unicode.capitalize(result[0]) unless @downcase
|
196
150
|
result
|
197
151
|
end
|
198
152
|
|
199
153
|
def clean!
|
200
154
|
@tokens = @tokens
|
201
|
-
.flat_map
|
202
|
-
.map!
|
203
|
-
.
|
204
|
-
.delete_if { |t| unclean_token?(t) }
|
155
|
+
.flat_map { |token| split_underscores_asterisk(token) }
|
156
|
+
.map! { |token| remove_irrelevant_characters(token) }
|
157
|
+
.delete_if { |token| many_dots?(token) }
|
205
158
|
end
|
206
159
|
|
207
|
-
def
|
208
|
-
return
|
209
|
-
|
210
|
-
|
211
|
-
|
160
|
+
def split_underscores_asterisk(token)
|
161
|
+
return token if token =~ Regex::ONLY_HASHTAG_MENTION
|
162
|
+
token.split(Regex::UNDERSCORES_ASTERISK)
|
163
|
+
end
|
164
|
+
|
165
|
+
def remove_irrelevant_characters(token)
|
166
|
+
token.gsub!(Regex::IRRELEVANT_CHARACTERS, NOTHING)
|
167
|
+
return token if token =~ Regex::ONLY_HASHTAG_MENTION
|
168
|
+
token.gsub!(Regex::ENDS_WITH_EXCITED_ONE, NOTHING)
|
169
|
+
token
|
170
|
+
end
|
171
|
+
|
172
|
+
def many_dots?(token)
|
173
|
+
token =~ Regex::MANY_PERIODS
|
212
174
|
end
|
213
175
|
|
214
176
|
def classic_filter!
|
215
177
|
@tokens.map! do |token|
|
216
|
-
token.delete!(
|
217
|
-
token.sub!(
|
178
|
+
token.delete!(DOT) if @abbreviations.include?(token.chomp(DOT))
|
179
|
+
token.sub!(Regex::ENDS_WITH_APOSTROPHE_AND_S, NOTHING)
|
218
180
|
token
|
219
181
|
end
|
220
182
|
end
|
@@ -222,26 +184,26 @@ module PragmaticTokenizer
|
|
222
184
|
def process_numbers!
|
223
185
|
case @numbers
|
224
186
|
when :semi
|
225
|
-
@tokens.delete_if { |
|
187
|
+
@tokens.delete_if { |token| token =~ Regex::ONLY_DECIMALS }
|
226
188
|
when :none
|
227
|
-
@tokens.delete_if { |
|
189
|
+
@tokens.delete_if { |token| token =~ Regex::NO_DECIMALS_NO_NUMERALS }
|
228
190
|
when :only
|
229
|
-
@tokens.delete_if { |
|
191
|
+
@tokens.delete_if { |token| token =~ Regex::NO_DECIMALS }
|
230
192
|
end
|
231
193
|
end
|
232
194
|
|
233
195
|
def remove_short_tokens!
|
234
|
-
@tokens.delete_if { |
|
196
|
+
@tokens.delete_if { |token| token.length < @minimum_length }
|
235
197
|
end
|
236
198
|
|
237
199
|
def process_punctuation!
|
238
200
|
case @punctuation
|
239
201
|
when :semi
|
240
|
-
@tokens.delete_if { |
|
202
|
+
@tokens.delete_if { |token| token =~ Regex::PUNCTUATION4 }
|
241
203
|
when :none
|
242
|
-
@tokens.delete_if { |
|
204
|
+
@tokens.delete_if { |token| token =~ Regex::ONLY_PUNCTUATION }
|
243
205
|
when :only
|
244
|
-
@tokens.keep_if
|
206
|
+
@tokens.keep_if { |token| token =~ Regex::ONLY_PUNCTUATION }
|
245
207
|
end
|
246
208
|
end
|
247
209
|
|
@@ -252,46 +214,50 @@ module PragmaticTokenizer
|
|
252
214
|
def mentions!
|
253
215
|
case @mentions
|
254
216
|
when :remove
|
255
|
-
@tokens.delete_if { |
|
217
|
+
@tokens.delete_if { |token| token =~ Regex::ONLY_MENTION }
|
256
218
|
when :keep_and_clean
|
257
|
-
@tokens.map!
|
219
|
+
@tokens.map! { |token| token =~ Regex::ONLY_MENTION ? token[1..-1] : token }
|
258
220
|
end
|
259
221
|
end
|
260
222
|
|
261
223
|
def hashtags!
|
262
224
|
case @hashtags
|
263
225
|
when :remove
|
264
|
-
@tokens.delete_if { |
|
226
|
+
@tokens.delete_if { |token| token =~ Regex::ONLY_HASHTAG }
|
265
227
|
when :keep_and_clean
|
266
|
-
@tokens
|
267
|
-
.flat_map { |t| t =~ REGEXP_HYPHEN_HASTAG ? t.split(REGEX_HYPHEN) : t }
|
268
|
-
.map { |t| t =~ REGEXP_HASHTAG_AT_START ? t.gsub!(REGEXP_HASHTAG_AT_START, EMPTY_STRING) : t }
|
228
|
+
@tokens.map! { |token| token =~ Regex::ONLY_HASHTAG ? token[1..-1] : token }
|
269
229
|
end
|
270
230
|
end
|
271
231
|
|
272
|
-
def
|
273
|
-
@tokens.delete_if { |
|
232
|
+
def remove_by_options!
|
233
|
+
@tokens.delete_if { |token| token =~ regex_by_options }
|
274
234
|
end
|
275
235
|
|
276
|
-
def
|
277
|
-
@
|
236
|
+
def regex_by_options
|
237
|
+
@regex_by_options ||= begin
|
278
238
|
regex_array = []
|
279
|
-
regex_array <<
|
280
|
-
regex_array <<
|
281
|
-
regex_array <<
|
282
|
-
regex_array <<
|
239
|
+
regex_array << Regex::RANGE_UNUSUAL_AND_EMOJI if @remove_emoji
|
240
|
+
regex_array << Regex::ONLY_EMAIL if @remove_emails
|
241
|
+
regex_array << Regex::STARTS_WITH_HTTP if @remove_urls
|
242
|
+
regex_array << Regex::ONLY_DOMAIN2 if @remove_domains
|
283
243
|
Regexp.union(regex_array)
|
284
244
|
end
|
285
245
|
end
|
286
246
|
|
287
247
|
def split_long_words!
|
288
|
-
@tokens = @tokens
|
289
|
-
|
290
|
-
|
248
|
+
@tokens = @tokens.flat_map { |token| split_long_word(token) }
|
249
|
+
end
|
250
|
+
|
251
|
+
def split_long_word(token)
|
252
|
+
return token unless @long_word_split
|
253
|
+
return token if token.length <= @long_word_split
|
254
|
+
return token if token =~ Regex::ONLY_HASHTAG_MENTION
|
255
|
+
return token if token =~ Regex::DOMAIN_OR_EMAIL
|
256
|
+
token.split(Regex::HYPHEN_OR_UNDERSCORE)
|
291
257
|
end
|
292
258
|
|
293
|
-
def chosen_case(
|
294
|
-
@downcase ? Unicode.downcase(
|
259
|
+
def chosen_case(text)
|
260
|
+
@downcase ? Unicode.downcase(text) : text
|
295
261
|
end
|
296
262
|
|
297
263
|
def inverse_case(token)
|