pragmatic_tokenizer 3.0.4 → 3.2.0
- checksums.yaml +5 -5
- data/lib/pragmatic_tokenizer/languages.rb +26 -26
- data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
- data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +36 -9
- data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/post_processor.rb +42 -88
- data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
- data/lib/pragmatic_tokenizer/regex.rb +150 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +81 -115
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +5 -6
- data/spec/languages/english_spec.rb +13 -0
- data/spec/languages/french_spec.rb +2 -2
- data/spec/performance_spec.rb +0 -1
- data/spec/spec_helper.rb +1 -1
- metadata +12 -12
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -62
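The headline change is the new data/lib/pragmatic_tokenizer/regex.rb, which centralizes the regular expressions previously scattered across the tokenizer, pre-processor, and post-processor, plus the removal of full_stop_separator.rb. The public interface is unchanged; a quick smoke test against the documented API (output assumes the default downcase: true and punctuation: :all):

    require 'pragmatic_tokenizer'

    PragmaticTokenizer::Tokenizer.new.tokenize("Hello world.")
    # => ["hello", "world", "."]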
@@ -0,0 +1,150 @@
+module PragmaticTokenizer
+  class Regex
+
+    # Things that can or should be done:
+    # - check where the use of unicode categories helps (\p{Abbreviation})
+    # - use URI.parse and other libraries instead of regexp to identify urls, domains, emails
+    # - check multiple domain regex, we have spec issues when using one or the other
+    # - check multiple punctuation regex
+
+    # Text that needs to be tokenized is initially split into chunks of this length:
+    CHUNK_LONG_INPUT_TEXT = /\S.{1,10000}(?!\S)/m
+
+    # Ranges
+    RANGE_DINGBATS = /[\u2701-\u27BE]/ # e.g. ✁✎✳❄➾
+    RANGE_VARIATION_SELECTORS = /[\uFE00-\uFE0F]/ # alter the previous character
+    RANGE_FULLWIDTH = /[\uFF01-\uFF1F]/ # e.g. !"#'?
+    RANGE_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
+    RANGE_UNUSUAL_AND_EMOJI = /[\u203C-\u3299\u{1F000}-\u{1F644}]/
+
+    # Regular expressions which do not need to capture anything are enclosed in /(?: … )/ to enhance performance
+    COLON1 = /(?:(:)([[:print:]]{2,}))/ # two non-space after colon prevent matching emoticons
+    COLON2 = /(?::)/
+    COMMAS = /(?:([,‚])+)/
+    ENCLOSED_PLUS = /(?:([[:print:]]+)\+([[:print:]]+))/
+    EMAIL = /(?:[[:print:]]+[@@][[:print:]]+\.[[:print:]]+)/
+    DIGIT = /(?:[[:digit:]]+)/
+    ASTERISK = /(?:\*+)/
+    UNDERSCORE = /(?:_+)/
+    HYPHEN_OR_UNDERSCORE = /(?:[-_])/
+    LONG_WORD_SPLIT = /(?:[-_\/—–])/
+    PERIOD_AND_PRIOR = /(?:(.+\.))/
+    PERIOD_ONLY = /(?:(\.))/
+    CONTRACTIONS = /(?:[‘’‚‛‹›'´`])/
+    PUNCTUATION1 = /(?:([\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Ps}])+)/ # all punctuation categories except Pc (Connector) and Po (other)
+    PUNCTUATION2 = /(?:(?<=\S)([!?#{RANGE_FULLWIDTH.source}]+))/
+    PUNCTUATION3 = /(?:[!%\-–\u00AD]+)/
+    PUNCTUATION4 = /(?:[..。]+)/
+    DINGBATS = /(?:(#{RANGE_DINGBATS.source}#{RANGE_VARIATION_SELECTORS.source}*+))/
+    NO_BREAK_SPACE = /(?:\u00A0+)/
+    HTTP = /(?:https?:\/\/)/
+    TIME_WITH_COLON = /(?:\d:\d)/
+    DOMAIN_PREFIX = /(?:https?:\/\/|www\.|[[:alpha:]]\.)/
+    DOMAIN_SUFFIX = /(?:[[:alpha:]]\.(?:com|net|org|edu|gov|mil|int|[[:alpha:]]{2}))/
+    DOMAIN1 = /(?:((https?:\/\/|)[[:print:]]+\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?))/
+    DOMAIN2 = /(?:[[:alnum:]]{2,}([\-.][[:alnum:]]+)*\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?)/
+    NOT_URL = /(?:^(?!#{DOMAIN_PREFIX.source})([[:print:]]*))/
+    HASHTAG_OR_MENTION = /(?:[@#@#][[:print:]]+)/
+    HASHTAG = /(?:[##][[:print:]]+)/
+    MENTION = /(?:[@@][[:print:]]+)/
+    HASHTAG_WITH_HYPHEN = /(?:^([##][[:digit:]]+)-)/
+    ONE_AS_EXCLAMATION = /(?:\D1+)/
+    ONES_EXCLAMATIONS = /(?:!+(1*+!*+)*+)/
+    MANY_PERIODS = /(?:^\.{2,}$)/
+    COPYRIGHT_TRADEMARK = /(?:[®©™]+)/
+    CONTROL_CHARACTER = /(?:[[:cntrl:]]+)/ # matches any character with hexadecimal value 00 through 1F or 7F.
+    APOSTROPHE_AND_S = /(?:['’`́]s)/
+    ALSO_DECIMALS = /(?:[[:alpha:]]*+[[:digit:]]+)/
+    ACUTE_ACCENT_S = /(?:\s\u0301(?=s))/
+
+    # Regular expressions used to capture items
+    CAPTURE_UNUSUAL_AND_EMOJI = /(#{RANGE_UNUSUAL_AND_EMOJI.source})/
+    QUESTION_MARK_NOT_URL = /#{NOT_URL.source}(\?)/
+    # Should we change specs and also capture "/", just like we capture ":" and "?"
+    SLASH_NOT_URL = /#{NOT_URL.source}\//
+    SHIFT_BOUNDARY_CHARACTERS = /([;^&|…«»„“¿¡≠]+)/
+    MULTIPLE_DOTS = /(\.{2,})/ # we keep all dashes
+    MULTIPLE_DASHES = /(-){2,}/ # we only keep first dash
+    BRACKET = /([{}()\[\]])/
+    EXCLAMATION_BETWEEN_ALPHA = /(?<=[[:alpha:]])(!)(?=[[:alpha:]])/
+    PERCENT_BEFORE_DIGIT = /(%)\d+/
+    COMMA_BEFORE_NON_DIGIT = /(,)(?=\D)/
+    COMMA_AFTER_NON_DIGIT = /(?<=\D)(,)/
+    COLON_IN_URL = /(?<=[(https?|ftp)]):(?=\/\/)/
+    QUOTE_BEFORE_PRINT = /(('')|["“])(?=[[:print:]])/
+    QUOTE = /('')|["”]/
+    HYPHEN_AFTER_NON_WORD = /(?<=\W)(-)/
+    HYPHEN_BEFORE_NON_WORD = /(-)(?=\W)/
+
+    STARTS_WITH_COMMAS = /^#{COMMAS.source}/
+    STARTS_WITH_HTTP = /^#{HTTP.source}/
+    STARTS_WITH_DOMAIN = /^#{DOMAIN_PREFIX.source}/
+    STARTS_WITH_COLON1 = /^#{COLON1.source}/
+    STARTS_WITH_UNDERSCORE = /^#{UNDERSCORE.source}/
+    STARTS_WITH_PUNCTUATION3 = /^#{PUNCTUATION3.source}/
+
+    ENDS_WITH_DOMAIN = /#{DOMAIN_SUFFIX.source}$/
+    ENDS_WITH_PUNCTUATION1 = /#{PUNCTUATION1.source}$/
+    ENDS_WITH_PUNCTUATION2 = /#{PUNCTUATION2.source}$/
+    ENDS_WITH_COLON2 = /#{COLON2.source}$/
+    ENDS_WITH_UNDERSCORE = /#{UNDERSCORE.source}$/
+    ENDS_WITH_ONES_EXCLAMATIONS = /#{ONES_EXCLAMATIONS.source}$/
+    ENDS_WITH_EXCITED_ONE = /#{ONE_AS_EXCLAMATION.source}$/
+    ENDS_WITH_APOSTROPHE_AND_S = /#{APOSTROPHE_AND_S.source}$/
+    ENDS_WITH_ALPHA = /[[:alpha:]]$/
+    ENDS_WITH_DIGIT = /[[:digit:]]$/
+
+    ONLY_DECIMALS = /(?:^[[:digit:]]+$)/
+    NO_DECIMALS = /(?:^\D+$)/
+    ONLY_PUNCTUATION = /^[[[:punct:]]^|+]+$/
+    ONLY_ROMAN_NUMERALS = /^(?=[MDCLXVI])M*(C[MD]|D?C*)(X[CL]|L?X*)(I[XV]|V?I*)$/i
+    ONLY_EMAIL = /^#{EMAIL}$/
+    ONLY_HASHTAG_MENTION = /^#{HASHTAG_OR_MENTION}$/
+    ONLY_HASHTAG = /^#{HASHTAG}$/
+    ONLY_MENTION = /^#{MENTION}$/
+    ONLY_DOMAIN1 = /^#{DOMAIN1}$/
+    ONLY_DOMAIN2 = /^#{DOMAIN2}$/
+    ONLY_DOMAIN3 = Regexp.union(STARTS_WITH_DOMAIN, ENDS_WITH_DOMAIN)
+    DOMAIN_OR_EMAIL = Regexp.union(ONLY_DOMAIN1, ONLY_EMAIL)
+    UNDERSCORES_ASTERISK = Regexp.union(STARTS_WITH_UNDERSCORE, ENDS_WITH_UNDERSCORE, ASTERISK)
+    NO_DECIMALS_NO_NUMERALS = Regexp.union(ALSO_DECIMALS, ONLY_ROMAN_NUMERALS)
+
+    COMMAS_OR_PUNCTUATION = Regexp.union(
+      STARTS_WITH_COMMAS,
+      ENDS_WITH_PUNCTUATION1,
+      ENDS_WITH_PUNCTUATION2
+    )
+
+    # Can this constant name be clarified?
+    VARIOUS = Regexp.union(
+      SLASH_NOT_URL,
+      QUESTION_MARK_NOT_URL,
+      ENCLOSED_PLUS,
+      STARTS_WITH_COLON1,
+      DINGBATS,
+      HASHTAG_WITH_HYPHEN,
+      CAPTURE_UNUSUAL_AND_EMOJI
+    )
+
+    IRRELEVANT_CHARACTERS = Regexp.union(
+      STARTS_WITH_PUNCTUATION3,
+      ENDS_WITH_COLON2,
+      ENDS_WITH_ONES_EXCLAMATIONS,
+      CONTROL_CHARACTER,
+      COPYRIGHT_TRADEMARK,
+      RANGE_ALPHANUMERIC_SUPPLEMENT
+    )
+
+    PRE_PROCESS = Regexp.union(
+      SHIFT_BOUNDARY_CHARACTERS,
+      MULTIPLE_DOTS,
+      BRACKET,
+      MULTIPLE_DASHES,
+      EXCLAMATION_BETWEEN_ALPHA,
+      PERCENT_BEFORE_DIGIT,
+      COMMA_BEFORE_NON_DIGIT,
+      COMMA_AFTER_NON_DIGIT
+    )
+
+  end
+end
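The new Regex class relies on two composition idioms: interpolating a sub-pattern's #source (so its mode flags are not re-embedded as an inline (?-mix:…) group) and Regexp.union for alternation. A minimal sketch of both, reusing names from the constants above:

    UNDERSCORE = /(?:_+)/
    ASTERISK   = /(?:\*+)/

    /#{UNDERSCORE.source}$/  # => /(?:_+)$/          clean embedding
    /#{UNDERSCORE}$/         # => /(?-mix:(?:_+))$/  flags leak in

    UNDERSCORES_ASTERISK = Regexp.union(/^#{UNDERSCORE.source}/, /#{UNDERSCORE.source}$/, ASTERISK)
    "__token**".split(UNDERSCORES_ASTERISK).reject(&:empty?)  # => ["token"]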
@@ -1,70 +1,22 @@
-# -*- encoding : utf-8 -*-
 require 'set'
 require 'cgi'
+require 'pragmatic_tokenizer/regex'
 require 'pragmatic_tokenizer/languages'
 require 'pragmatic_tokenizer/pre_processor'
 require 'pragmatic_tokenizer/post_processor'
-require 'pragmatic_tokenizer/full_stop_separator'
 require 'unicode'
 
 module PragmaticTokenizer
   class Tokenizer
 
-
-    NUMBERS_OPTIONS = Set.new([
-    MENTIONS_OPTIONS = Set.new([
+    PUNCTUATION_OPTIONS = Set.new(%i[all semi none only]).freeze
+    NUMBERS_OPTIONS = Set.new(%i[all semi none only]).freeze
+    MENTIONS_OPTIONS = Set.new(%i[keep_original keep_and_clean remove]).freeze
     MAX_TOKEN_LENGTH = 50
-
-
-
-
-    REGEX_URL = /(http|https)(\.|:)/
-    REGEX_HYPHEN = /\-/
-    REGEX_LONG_WORD = /\-|\_/
-    REGEXP_SPLIT_CHECK = /@|@|(http)/
-    REGEX_CONTRACTIONS = /[‘’‚‛‹›'´`]/
-    REGEX_APOSTROPHE_S = /['’`́]s$/
-    REGEX_EMAIL = /\S+(@|@)\S+\.\S+/
-    REGEX_HASHTAG_OR_MENTION = /[@@#|#]/
-    REGEX_UNDERSCORE_AT_START = /(?<=\A)\_+/
-    REGEX_UNDERSCORE_AT_END = /\_+(?=\z)/
-    REGEX_ASTERISK = /\*+/
-    REGEX_UNIFIED1 = Regexp.union(REGEX_UNDERSCORE_AT_START,
-                                  REGEX_UNDERSCORE_AT_END,
-                                  REGEX_ASTERISK)
-    # https://en.wikipedia.org/wiki/Control_character
-    # matches any character with hexadecimal value 00 through 1F or 7F.
-    # Rubular: http://rubular.com/r/E83fpBoDjI
-    REGEXP_CONTROL = /[[:cntrl:]]/
-    REGEXP_ENDING_COLON = /\:(?=\z)/
-    REGEXP_EXCLAMATION_AT_START = /(?<=\A)!+(?=.+)/
-    REGEXP_EXCLAMATION_AT_END = /!+(1*!*)*(?=\z)/
-    REGEXP_HYPHEN_AT_START = /\A(-|–|\u{00AD})/
-    REGEXP_SPECIAL_SYMBOL = /[®©]/
-    REGEXP_PERCENT_AT_START = /\A\%/
-    # https://codepoints.net/enclosed_alphanumeric_supplement
-    REGEXP_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
-    REGEX_UNIFIED2 = Regexp.union(REGEXP_CONTROL,
-                                  REGEXP_ENDING_COLON,
-                                  REGEXP_EXCLAMATION_AT_START,
-                                  REGEXP_EXCLAMATION_AT_END,
-                                  REGEXP_HYPHEN_AT_START,
-                                  REGEXP_SPECIAL_SYMBOL,
-                                  REGEXP_PERCENT_AT_START,
-                                  REGEXP_ALPHANUMERIC_SUPPLEMENT)
-    REGEXP_ONE_AS_EXCLAMATION = /(?<=\D)1+(?=\z)/
-    REGEXP_HASHTAG_AT_START = /(?<=\A)(#|#)/
-    REGEXP_AT_SIGN_AT_START = /(?<=\A)(@|@)/
-    REGEXP_HYPHEN_HASTAG = /\A(#|#)\S+-/
-    REGEXP_EMOJI_SNOWFLAKE = /\u{2744}[\u{FE0F}|\u{FE0E}]?/
-    REGEX_EMOJI_UNIFIED = Regexp.union(REGEXP_EMOJI_SNOWFLAKE,
-                                       PragmaticTokenizer::Languages::Common::EMOJI_REGEX)
-    REGEXP_PUNCTUATION_ONLY = /\A[[:punct:]]+\z/
-    REGEXP_NUMBER_ONLY = /\A\d+\z/
-    REGEXP_NO_NUMBERS = /\A\D+\z/
-    REGEXP_NUMBER = /\D*\d+\d*/
-    REGEXP_CONSECUTIVE_DOTS = /\A\.{2,}\z/
-    REGEXP_CHUNK_STRING = /.{,10000}(?=\s|\z)/m
+    NOTHING = ''.freeze
+    DOT = '.'.freeze
+    SPACE = ' '.freeze
+    SINGLE_QUOTE = "'".freeze
 
     # @param [Hash] opts optional arguments
 
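The old REGEXP_CHUNK_STRING gives way to Regex::CHUNK_LONG_INPUT_TEXT from the new file. Both slice long input at whitespace so that no token straddles a chunk boundary; a sketch of the new pattern in isolation:

    CHUNK_LONG_INPUT_TEXT = /\S.{1,10000}(?!\S)/m

    text   = "word " * 4_000                  # 20,000 characters
    chunks = text.scan(CHUNK_LONG_INPUT_TEXT)
    chunks.size                               # => 2
    chunks.all? { |c| c.length <= 10_001 }    # => true; chunks end at whitespace, never mid-word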
@@ -124,7 +76,7 @@ module PragmaticTokenizer
       @abbreviations = Set.new(opts[:abbreviations])
       @stop_words = Set.new(opts[:stop_words])
 
-      #
+      # Why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
       @contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
       @abbreviations += @language_module::ABBREVIATIONS if @abbreviations.empty?
       @stop_words += @language_module::STOP_WORDS if @stop_words.empty?
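For context on the new comment: user-supplied lists always win, and the language defaults only fill in when the corresponding option was left empty. A sketch with the documented options (token output assumes the default downcase: true):

    require 'pragmatic_tokenizer'

    tokenizer = PragmaticTokenizer::Tokenizer.new(
      stop_words:        ["the"],    # overrides the English default list
      remove_stop_words: true
    )
    tokenizer.tokenize("The dog ran.")  # => ["dog", "ran", "."]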
@@ -136,34 +88,43 @@ module PragmaticTokenizer
         @stop_words += language::STOP_WORDS
       end
 
-      raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless
+      raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTUATION_OPTIONS.include?(@punctuation)
       raise "Numbers argument can be only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
       raise "Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)
-
-
+
+      integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer
+
+      raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == integer_class || @minimum_length.nil?
+      raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == integer_class || @long_word_split.nil?
     end
 
     # @param [String] text to be tokenized
 
     def tokenize(text)
       return [] unless text
-      raise "In
+      raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
       CGI.unescapeHTML(text)
-        .scan(
-        .flat_map { |segment|
+        .scan(Regex::CHUNK_LONG_INPUT_TEXT)
+        .flat_map { |segment| process_segment(segment) }
     end
 
     private
 
-    def
-
+    def process_segment(segment)
+      pre_processed = pre_process(segment)
+      cased_segment = chosen_case(pre_processed)
+      @tokens = PostProcessor.new(text: cased_segment, abbreviations: @abbreviations, downcase: @downcase).call
+      post_process_tokens
+    end
+
+    def pre_process(segment)
+      segment
         .extend(PragmaticTokenizer::PreProcessor)
         .pre_process(language: @language_module)
     end
 
-    def
-
-      remove_various!
+    def post_process_tokens
+      remove_by_options!
       process_numbers!
       process_punctuation!
       expand_contractions! if @expand_contractions
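The added checks turn bad option values and non-String input into early, explicit failures, while nil input still short-circuits to an empty array. Straight from the code above:

    PragmaticTokenizer::Tokenizer.new(punctuation: :everything)
    # => RuntimeError: Punctuation argument can be only be nil, :all, :semi, :none, or :only

    PragmaticTokenizer::Tokenizer.new(minimum_length: "3")
    # => RuntimeError: In Pragmatic Tokenizer minimum_length must be an Integer

    PragmaticTokenizer::Tokenizer.new.tokenize(nil)  # => []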
@@ -177,45 +138,45 @@ module PragmaticTokenizer
       @tokens.reject(&:empty?)
     end
 
-    def run_post_processor(text)
-      PostProcessor.new(
-        text: chosen_case(text),
-        abbreviations: @abbreviations,
-        downcase: @downcase
-      ).post_process
-    end
-
     def expand_contractions!
-      @tokens = @tokens.flat_map { |
+      @tokens = @tokens.flat_map { |token| expand_token_contraction(token) }
     end
 
     def expand_token_contraction(token)
-      normalized = inverse_case(token.gsub(
+      normalized = inverse_case(token.gsub(Regex::CONTRACTIONS, SINGLE_QUOTE))
       return token unless @contractions.key?(normalized)
-      result = @contractions[normalized].split(
+      result = @contractions[normalized].split(SPACE)
       result[0] = Unicode.capitalize(result[0]) unless @downcase
       result
     end
 
     def clean!
       @tokens = @tokens
-        .flat_map
-        .map!
-        .
-        .delete_if { |t| unclean_token?(t) }
+        .flat_map { |token| split_underscores_asterisk(token) }
+        .map! { |token| remove_irrelevant_characters(token) }
+        .delete_if { |token| many_dots?(token) }
     end
 
-    def
-      return
-
-
-
+    def split_underscores_asterisk(token)
+      return token if token =~ Regex::ONLY_HASHTAG_MENTION
+      token.split(Regex::UNDERSCORES_ASTERISK)
+    end
+
+    def remove_irrelevant_characters(token)
+      token.gsub!(Regex::IRRELEVANT_CHARACTERS, NOTHING)
+      return token if token =~ Regex::ONLY_HASHTAG_MENTION
+      token.gsub!(Regex::ENDS_WITH_EXCITED_ONE, NOTHING)
+      token
+    end
+
+    def many_dots?(token)
+      token =~ Regex::MANY_PERIODS
     end
 
     def classic_filter!
       @tokens.map! do |token|
-        token.delete!(
-        token.sub!(
+        token.delete!(DOT) if @abbreviations.include?(token.chomp(DOT))
+        token.sub!(Regex::ENDS_WITH_APOSTROPHE_AND_S, NOTHING)
         token
       end
     end
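expand_token_contraction normalizes curly apostrophes to a plain single quote (Regex::CONTRACTIONS, SINGLE_QUOTE) before the lookup, so both spellings expand. A sketch assuming the English contraction mapping shipped with the gem:

    t = PragmaticTokenizer::Tokenizer.new(expand_contractions: true)
    t.tokenize("Don't you know?")   # => ["do", "not", "you", "know", "?"]
    t.tokenize("Don’t you know?")   # same tokens: the curly apostrophe is normalized first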
@@ -223,26 +184,26 @@ module PragmaticTokenizer
     def process_numbers!
       case @numbers
       when :semi
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::ONLY_DECIMALS }
       when :none
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS_NO_NUMERALS }
       when :only
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS }
       end
     end
 
     def remove_short_tokens!
-      @tokens.delete_if { |
+      @tokens.delete_if { |token| token.length < @minimum_length }
     end
 
     def process_punctuation!
       case @punctuation
       when :semi
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::PUNCTUATION4 }
       when :none
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::ONLY_PUNCTUATION }
       when :only
-        @tokens.keep_if
+        @tokens.keep_if { |token| token =~ Regex::ONLY_PUNCTUATION }
       end
     end
 
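The :semi/:none/:only branches now pair each option value with a single Regex constant. A sketch of the combined effect under the documented semantics (exact output assumes the default downcase: true):

    t = PragmaticTokenizer::Tokenizer.new(punctuation: :none, numbers: :none)
    t.tokenize("Release 3.2.0 ships today!")
    # => ["release", "ships", "today"]  digit-bearing and punctuation-only tokens dropped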
@@ -253,45 +214,50 @@ module PragmaticTokenizer
     def mentions!
       case @mentions
       when :remove
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::ONLY_MENTION }
       when :keep_and_clean
-        @tokens.map!
+        @tokens.map! { |token| token =~ Regex::ONLY_MENTION ? token[1..-1] : token }
       end
     end
 
     def hashtags!
       case @hashtags
       when :remove
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token =~ Regex::ONLY_HASHTAG }
       when :keep_and_clean
-        @tokens
-          .flat_map { |t| t =~ REGEXP_HYPHEN_HASTAG ? t.split(REGEX_HYPHEN) : t }
-          .map { |t| t =~ REGEXP_HASHTAG_AT_START ? t.gsub!(REGEXP_HASHTAG_AT_START, EMPTY_STRING) : t }
+        @tokens.map! { |token| token =~ Regex::ONLY_HASHTAG ? token[1..-1] : token }
       end
     end
 
-    def
-      @tokens.delete_if { |
+    def remove_by_options!
+      @tokens.delete_if { |token| token =~ regex_by_options }
     end
 
-    def
-      @
+    def regex_by_options
+      @regex_by_options ||= begin
         regex_array = []
-        regex_array <<
-        regex_array <<
-        regex_array <<
-        regex_array <<
+        regex_array << Regex::RANGE_UNUSUAL_AND_EMOJI if @remove_emoji
+        regex_array << Regex::ONLY_EMAIL if @remove_emails
+        regex_array << Regex::STARTS_WITH_HTTP if @remove_urls
+        regex_array << Regex::ONLY_DOMAIN2 if @remove_domains
         Regexp.union(regex_array)
       end
     end
 
     def split_long_words!
-      @tokens = @tokens
-
+      @tokens = @tokens.flat_map { |token| split_long_word(token) }
+    end
+
+    def split_long_word(token)
+      return token unless @long_word_split
+      return token if token.length <= @long_word_split
+      return token if token =~ Regex::ONLY_HASHTAG_MENTION
+      return token if token =~ Regex::DOMAIN_OR_EMAIL
+      token.split(Regex::LONG_WORD_SPLIT)
     end
 
-    def chosen_case(
-      @downcase ? Unicode.downcase(
+    def chosen_case(text)
+      @downcase ? Unicode.downcase(text) : text
     end
 
     def inverse_case(token)
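The same pattern extends to mentions, hashtags, and long-word splitting; note that split_long_word deliberately leaves hashtags, mentions, domains, and emails intact. A sketch under the documented option semantics:

    t = PragmaticTokenizer::Tokenizer.new(mentions: :keep_and_clean, hashtags: :keep_and_clean)
    t.tokenize("@user loves #ruby")  # => ["user", "loves", "ruby"]

    PragmaticTokenizer::Tokenizer.new(long_word_split: 10)
      .tokenize("state-of-the-art example.com")
    # => ["state", "of", "the", "art", "example.com"]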