pragmatic_tokenizer 3.0.4 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/pragmatic_tokenizer/languages.rb +26 -26
- data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
- data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +36 -9
- data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/post_processor.rb +42 -88
- data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
- data/lib/pragmatic_tokenizer/regex.rb +150 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +81 -115
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +5 -6
- data/spec/languages/english_spec.rb +13 -0
- data/spec/languages/french_spec.rb +2 -2
- data/spec/performance_spec.rb +0 -1
- data/spec/spec_helper.rb +1 -1
- metadata +12 -12
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -62
@@ -0,0 +1,150 @@
|
|
1
|
+
module PragmaticTokenizer
|
2
|
+
class Regex
|
3
|
+
|
4
|
+
# Things that can or should be done:
|
5
|
+
# - check where the use of unicode categories helps (\p{Abbreviation})
|
6
|
+
# - use URI.parse and other libraries instead of regexp to identify urls, domains, emails
|
7
|
+
# - check multiple domain regex, we have spec issues when using one or the other
|
8
|
+
# - check multiple punctuation regex
|
9
|
+
|
10
|
+
# Text that needs to be tokenized is initially split into chunks of this length:
|
11
|
+
CHUNK_LONG_INPUT_TEXT = /\S.{1,10000}(?!\S)/m
|
12
|
+
|
13
|
+
# Ranges
|
14
|
+
RANGE_DINGBATS = /[\u2701-\u27BE]/ # e.g. ✁✎✳❄➾
|
15
|
+
RANGE_VARIATION_SELECTORS = /[\uFE00-\uFE0F]/ # alter the previous character
|
16
|
+
    RANGE_FULLWIDTH               = /[\uFF01-\uFF1F]/ # e.g. !"#'?
|
17
|
+
RANGE_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
|
18
|
+
RANGE_UNUSUAL_AND_EMOJI = /[\u203C-\u3299\u{1F000}-\u{1F644}]/
|
19
|
+
|
20
|
+
# Regular expressions which do not need to capture anything are enclosed in /(?: … )/ to enhance performance
|
21
|
+
COLON1 = /(?:(:)([[:print:]]{2,}))/ # two non-space after colon prevent matching emoticons
|
22
|
+
COLON2 = /(?::)/
|
23
|
+
COMMAS = /(?:([,‚])+)/
|
24
|
+
ENCLOSED_PLUS = /(?:([[:print:]]+)\+([[:print:]]+))/
|
25
|
+
EMAIL = /(?:[[:print:]]+[@@][[:print:]]+\.[[:print:]]+)/
|
26
|
+
DIGIT = /(?:[[:digit:]]+)/
|
27
|
+
ASTERISK = /(?:\*+)/
|
28
|
+
UNDERSCORE = /(?:_+)/
|
29
|
+
HYPHEN_OR_UNDERSCORE = /(?:[-_])/
|
30
|
+
LONG_WORD_SPLIT = /(?:[-_\/—–])/
|
31
|
+
PERIOD_AND_PRIOR = /(?:(.+\.))/
|
32
|
+
PERIOD_ONLY = /(?:(\.))/
|
33
|
+
CONTRACTIONS = /(?:[‘’‚‛‹›'´`])/
|
34
|
+
PUNCTUATION1 = /(?:([\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Ps}])+)/ # all punctuation categories except Pc (Connector) and Po (other)
|
35
|
+
PUNCTUATION2 = /(?:(?<=\S)([!?#{RANGE_FULLWIDTH.source}]+))/
|
36
|
+
PUNCTUATION3 = /(?:[!%\-–\u00AD]+)/
|
37
|
+
PUNCTUATION4 = /(?:[..。]+)/
|
38
|
+
DINGBATS = /(?:(#{RANGE_DINGBATS.source}#{RANGE_VARIATION_SELECTORS.source}*+))/
|
39
|
+
NO_BREAK_SPACE = /(?:\u00A0+)/
|
40
|
+
HTTP = /(?:https?:\/\/)/
|
41
|
+
TIME_WITH_COLON = /(?:\d:\d)/
|
42
|
+
DOMAIN_PREFIX = /(?:https?:\/\/|www\.|[[:alpha:]]\.)/
|
43
|
+
DOMAIN_SUFFIX = /(?:[[:alpha:]]\.(?:com|net|org|edu|gov|mil|int|[[:alpha:]]{2}))/
|
44
|
+
DOMAIN1 = /(?:((https?:\/\/|)[[:print:]]+\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?))/
|
45
|
+
DOMAIN2 = /(?:[[:alnum:]]{2,}([\-.][[:alnum:]]+)*\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?)/
|
46
|
+
NOT_URL = /(?:^(?!#{DOMAIN_PREFIX.source})([[:print:]]*))/
|
47
|
+
HASHTAG_OR_MENTION = /(?:[@#@#][[:print:]]+)/
|
48
|
+
HASHTAG = /(?:[##][[:print:]]+)/
|
49
|
+
MENTION = /(?:[@@][[:print:]]+)/
|
50
|
+
HASHTAG_WITH_HYPHEN = /(?:^([##][[:digit:]]+)-)/
|
51
|
+
ONE_AS_EXCLAMATION = /(?:\D1+)/
|
52
|
+
ONES_EXCLAMATIONS = /(?:!+(1*+!*+)*+)/
|
53
|
+
MANY_PERIODS = /(?:^\.{2,}$)/
|
54
|
+
COPYRIGHT_TRADEMARK = /(?:[®©™]+)/
|
55
|
+
CONTROL_CHARACTER = /(?:[[:cntrl:]]+)/ # matches any character with hexadecimal value 00 through 1F or 7F.
|
56
|
+
APOSTROPHE_AND_S = /(?:['’`́]s)/
|
57
|
+
ALSO_DECIMALS = /(?:[[:alpha:]]*+[[:digit:]]+)/
|
58
|
+
ACUTE_ACCENT_S = /(?:\s\u0301(?=s))/
|
59
|
+
|
60
|
+
# Regular expressions used to capture items
|
61
|
+
CAPTURE_UNUSUAL_AND_EMOJI = /(#{RANGE_UNUSUAL_AND_EMOJI.source})/
|
62
|
+
QUESTION_MARK_NOT_URL = /#{NOT_URL.source}(\?)/
|
63
|
+
# Should we change specs and also capture "/", just like we capture ":" and "?"
|
64
|
+
SLASH_NOT_URL = /#{NOT_URL.source}\//
|
65
|
+
SHIFT_BOUNDARY_CHARACTERS = /([;^&|…«»„“¿¡≠]+)/
|
66
|
+
MULTIPLE_DOTS = /(\.{2,})/ # we keep all dashes
|
67
|
+
MULTIPLE_DASHES = /(-){2,}/ # we only keep first dash
|
68
|
+
BRACKET = /([{}()\[\]])/
|
69
|
+
EXCLAMATION_BETWEEN_ALPHA = /(?<=[[:alpha:]])(!)(?=[[:alpha:]])/
|
70
|
+
PERCENT_BEFORE_DIGIT = /(%)\d+/
|
71
|
+
COMMA_BEFORE_NON_DIGIT = /(,)(?=\D)/
|
72
|
+
COMMA_AFTER_NON_DIGIT = /(?<=\D)(,)/
|
73
|
+
COLON_IN_URL = /(?<=[(https?|ftp)]):(?=\/\/)/
|
74
|
+
QUOTE_BEFORE_PRINT = /(('')|["“])(?=[[:print:]])/
|
75
|
+
QUOTE = /('')|["”]/
|
76
|
+
HYPHEN_AFTER_NON_WORD = /(?<=\W)(-)/
|
77
|
+
HYPHEN_BEFORE_NON_WORD = /(-)(?=\W)/
|
78
|
+
|
79
|
+
STARTS_WITH_COMMAS = /^#{COMMAS.source}/
|
80
|
+
STARTS_WITH_HTTP = /^#{HTTP.source}/
|
81
|
+
STARTS_WITH_DOMAIN = /^#{DOMAIN_PREFIX.source}/
|
82
|
+
STARTS_WITH_COLON1 = /^#{COLON1.source}/
|
83
|
+
STARTS_WITH_UNDERSCORE = /^#{UNDERSCORE.source}/
|
84
|
+
STARTS_WITH_PUNCTUATION3 = /^#{PUNCTUATION3.source}/
|
85
|
+
|
86
|
+
ENDS_WITH_DOMAIN = /#{DOMAIN_SUFFIX.source}$/
|
87
|
+
ENDS_WITH_PUNCTUATION1 = /#{PUNCTUATION1.source}$/
|
88
|
+
ENDS_WITH_PUNCTUATION2 = /#{PUNCTUATION2.source}$/
|
89
|
+
ENDS_WITH_COLON2 = /#{COLON2.source}$/
|
90
|
+
ENDS_WITH_UNDERSCORE = /#{UNDERSCORE.source}$/
|
91
|
+
ENDS_WITH_ONES_EXCLAMATIONS = /#{ONES_EXCLAMATIONS.source}$/
|
92
|
+
ENDS_WITH_EXCITED_ONE = /#{ONE_AS_EXCLAMATION.source}$/
|
93
|
+
ENDS_WITH_APOSTROPHE_AND_S = /#{APOSTROPHE_AND_S.source}$/
|
94
|
+
ENDS_WITH_ALPHA = /[[:alpha:]]$/
|
95
|
+
ENDS_WITH_DIGIT = /[[:digit:]]$/
|
96
|
+
|
97
|
+
ONLY_DECIMALS = /(?:^[[:digit:]]+$)/
|
98
|
+
NO_DECIMALS = /(?:^\D+$)/
|
99
|
+
ONLY_PUNCTUATION = /^[[[:punct:]]^|+]+$/
|
100
|
+
ONLY_ROMAN_NUMERALS = /^(?=[MDCLXVI])M*(C[MD]|D?C*)(X[CL]|L?X*)(I[XV]|V?I*)$/i
|
101
|
+
ONLY_EMAIL = /^#{EMAIL}$/
|
102
|
+
ONLY_HASHTAG_MENTION = /^#{HASHTAG_OR_MENTION}$/
|
103
|
+
ONLY_HASHTAG = /^#{HASHTAG}$/
|
104
|
+
ONLY_MENTION = /^#{MENTION}$/
|
105
|
+
ONLY_DOMAIN1 = /^#{DOMAIN1}$/
|
106
|
+
ONLY_DOMAIN2 = /^#{DOMAIN2}$/
|
107
|
+
ONLY_DOMAIN3 = Regexp.union(STARTS_WITH_DOMAIN, ENDS_WITH_DOMAIN)
|
108
|
+
DOMAIN_OR_EMAIL = Regexp.union(ONLY_DOMAIN1, ONLY_EMAIL)
|
109
|
+
UNDERSCORES_ASTERISK = Regexp.union(STARTS_WITH_UNDERSCORE, ENDS_WITH_UNDERSCORE, ASTERISK)
|
110
|
+
NO_DECIMALS_NO_NUMERALS = Regexp.union(ALSO_DECIMALS, ONLY_ROMAN_NUMERALS)
|
111
|
+
|
112
|
+
COMMAS_OR_PUNCTUATION = Regexp.union(
|
113
|
+
STARTS_WITH_COMMAS,
|
114
|
+
ENDS_WITH_PUNCTUATION1,
|
115
|
+
ENDS_WITH_PUNCTUATION2
|
116
|
+
)
|
117
|
+
|
118
|
+
# Can this constant name be clarified?
|
119
|
+
VARIOUS = Regexp.union(
|
120
|
+
SLASH_NOT_URL,
|
121
|
+
QUESTION_MARK_NOT_URL,
|
122
|
+
ENCLOSED_PLUS,
|
123
|
+
STARTS_WITH_COLON1,
|
124
|
+
DINGBATS,
|
125
|
+
HASHTAG_WITH_HYPHEN,
|
126
|
+
CAPTURE_UNUSUAL_AND_EMOJI
|
127
|
+
)
|
128
|
+
|
129
|
+
IRRELEVANT_CHARACTERS = Regexp.union(
|
130
|
+
STARTS_WITH_PUNCTUATION3,
|
131
|
+
ENDS_WITH_COLON2,
|
132
|
+
ENDS_WITH_ONES_EXCLAMATIONS,
|
133
|
+
CONTROL_CHARACTER,
|
134
|
+
COPYRIGHT_TRADEMARK,
|
135
|
+
RANGE_ALPHANUMERIC_SUPPLEMENT
|
136
|
+
)
|
137
|
+
|
138
|
+
PRE_PROCESS = Regexp.union(
|
139
|
+
SHIFT_BOUNDARY_CHARACTERS,
|
140
|
+
MULTIPLE_DOTS,
|
141
|
+
BRACKET,
|
142
|
+
MULTIPLE_DASHES,
|
143
|
+
EXCLAMATION_BETWEEN_ALPHA,
|
144
|
+
PERCENT_BEFORE_DIGIT,
|
145
|
+
COMMA_BEFORE_NON_DIGIT,
|
146
|
+
COMMA_AFTER_NON_DIGIT
|
147
|
+
)
|
148
|
+
|
149
|
+
end
|
150
|
+
end
|
@@ -1,70 +1,22 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
1
|
require 'set'
|
3
2
|
require 'cgi'
|
3
|
+
require 'pragmatic_tokenizer/regex'
|
4
4
|
require 'pragmatic_tokenizer/languages'
|
5
5
|
require 'pragmatic_tokenizer/pre_processor'
|
6
6
|
require 'pragmatic_tokenizer/post_processor'
|
7
|
-
require 'pragmatic_tokenizer/full_stop_separator'
|
8
7
|
require 'unicode'
|
9
8
|
|
10
9
|
module PragmaticTokenizer
|
11
10
|
class Tokenizer
|
12
11
|
|
13
|
-
|
14
|
-
NUMBERS_OPTIONS = Set.new([
|
15
|
-
MENTIONS_OPTIONS = Set.new([
|
12
|
+
PUNCTUATION_OPTIONS = Set.new(%i[all semi none only]).freeze
|
13
|
+
NUMBERS_OPTIONS = Set.new(%i[all semi none only]).freeze
|
14
|
+
MENTIONS_OPTIONS = Set.new(%i[keep_original keep_and_clean remove]).freeze
|
16
15
|
MAX_TOKEN_LENGTH = 50
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
REGEX_URL = /(http|https)(\.|:)/
|
22
|
-
REGEX_HYPHEN = /\-/
|
23
|
-
REGEX_LONG_WORD = /\-|\_/
|
24
|
-
REGEXP_SPLIT_CHECK = /@|@|(http)/
|
25
|
-
REGEX_CONTRACTIONS = /[‘’‚‛‹›'´`]/
|
26
|
-
REGEX_APOSTROPHE_S = /['’`́]s$/
|
27
|
-
REGEX_EMAIL = /\S+(@|@)\S+\.\S+/
|
28
|
-
REGEX_HASHTAG_OR_MENTION = /[@@#|#]/
|
29
|
-
REGEX_UNDERSCORE_AT_START = /(?<=\A)\_+/
|
30
|
-
REGEX_UNDERSCORE_AT_END = /\_+(?=\z)/
|
31
|
-
REGEX_ASTERISK = /\*+/
|
32
|
-
REGEX_UNIFIED1 = Regexp.union(REGEX_UNDERSCORE_AT_START,
|
33
|
-
REGEX_UNDERSCORE_AT_END,
|
34
|
-
REGEX_ASTERISK)
|
35
|
-
# https://en.wikipedia.org/wiki/Control_character
|
36
|
-
# matches any character with hexadecimal value 00 through 1F or 7F.
|
37
|
-
# Rubular: http://rubular.com/r/E83fpBoDjI
|
38
|
-
REGEXP_CONTROL = /[[:cntrl:]]/
|
39
|
-
REGEXP_ENDING_COLON = /\:(?=\z)/
|
40
|
-
REGEXP_EXCLAMATION_AT_START = /(?<=\A)!+(?=.+)/
|
41
|
-
REGEXP_EXCLAMATION_AT_END = /!+(1*!*)*(?=\z)/
|
42
|
-
REGEXP_HYPHEN_AT_START = /\A(-|–|\u{00AD})/
|
43
|
-
REGEXP_SPECIAL_SYMBOL = /[®©]/
|
44
|
-
REGEXP_PERCENT_AT_START = /\A\%/
|
45
|
-
# https://codepoints.net/enclosed_alphanumeric_supplement
|
46
|
-
REGEXP_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
|
47
|
-
REGEX_UNIFIED2 = Regexp.union(REGEXP_CONTROL,
|
48
|
-
REGEXP_ENDING_COLON,
|
49
|
-
REGEXP_EXCLAMATION_AT_START,
|
50
|
-
REGEXP_EXCLAMATION_AT_END,
|
51
|
-
REGEXP_HYPHEN_AT_START,
|
52
|
-
REGEXP_SPECIAL_SYMBOL,
|
53
|
-
REGEXP_PERCENT_AT_START,
|
54
|
-
REGEXP_ALPHANUMERIC_SUPPLEMENT)
|
55
|
-
REGEXP_ONE_AS_EXCLAMATION = /(?<=\D)1+(?=\z)/
|
56
|
-
REGEXP_HASHTAG_AT_START = /(?<=\A)(#|#)/
|
57
|
-
REGEXP_AT_SIGN_AT_START = /(?<=\A)(@|@)/
|
58
|
-
REGEXP_HYPHEN_HASTAG = /\A(#|#)\S+-/
|
59
|
-
REGEXP_EMOJI_SNOWFLAKE = /\u{2744}[\u{FE0F}|\u{FE0E}]?/
|
60
|
-
REGEX_EMOJI_UNIFIED = Regexp.union(REGEXP_EMOJI_SNOWFLAKE,
|
61
|
-
PragmaticTokenizer::Languages::Common::EMOJI_REGEX)
|
62
|
-
REGEXP_PUNCTUATION_ONLY = /\A[[:punct:]]+\z/
|
63
|
-
REGEXP_NUMBER_ONLY = /\A\d+\z/
|
64
|
-
REGEXP_NO_NUMBERS = /\A\D+\z/
|
65
|
-
REGEXP_NUMBER = /\D*\d+\d*/
|
66
|
-
REGEXP_CONSECUTIVE_DOTS = /\A\.{2,}\z/
|
67
|
-
REGEXP_CHUNK_STRING = /.{,10000}(?=\s|\z)/m
|
16
|
+
NOTHING = ''.freeze
|
17
|
+
DOT = '.'.freeze
|
18
|
+
SPACE = ' '.freeze
|
19
|
+
SINGLE_QUOTE = "'".freeze
|
68
20
|
|
69
21
|
# @param [Hash] opts optional arguments
|
70
22
|
|
@@ -124,7 +76,7 @@ module PragmaticTokenizer
|
|
124
76
|
@abbreviations = Set.new(opts[:abbreviations])
|
125
77
|
@stop_words = Set.new(opts[:stop_words])
|
126
78
|
|
127
|
-
#
|
79
|
+
# Why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
|
128
80
|
@contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
|
129
81
|
@abbreviations += @language_module::ABBREVIATIONS if @abbreviations.empty?
|
130
82
|
@stop_words += @language_module::STOP_WORDS if @stop_words.empty?
|
@@ -136,34 +88,43 @@ module PragmaticTokenizer
|
|
136
88
|
@stop_words += language::STOP_WORDS
|
137
89
|
end
|
138
90
|
|
139
|
-
raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless
|
91
|
+
raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTUATION_OPTIONS.include?(@punctuation)
|
140
92
|
raise "Numbers argument can be only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
|
141
93
|
raise "Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)
|
142
|
-
|
143
|
-
|
94
|
+
|
95
|
+
integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer
|
96
|
+
|
97
|
+
raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == integer_class || @minimum_length.nil?
|
98
|
+
raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == integer_class || @long_word_split.nil?
|
144
99
|
end
|
145
100
|
|
146
101
|
# @param [String] text to be tokenized
|
147
102
|
|
148
103
|
def tokenize(text)
|
149
104
|
return [] unless text
|
150
|
-
raise "In
|
105
|
+
raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
|
151
106
|
CGI.unescapeHTML(text)
|
152
|
-
.scan(
|
153
|
-
.flat_map { |segment|
|
107
|
+
.scan(Regex::CHUNK_LONG_INPUT_TEXT)
|
108
|
+
.flat_map { |segment| process_segment(segment) }
|
154
109
|
end
|
155
110
|
|
156
111
|
private
|
157
112
|
|
158
|
-
def
|
159
|
-
|
113
|
+
def process_segment(segment)
|
114
|
+
pre_processed = pre_process(segment)
|
115
|
+
cased_segment = chosen_case(pre_processed)
|
116
|
+
@tokens = PostProcessor.new(text: cased_segment, abbreviations: @abbreviations, downcase: @downcase).call
|
117
|
+
post_process_tokens
|
118
|
+
end
|
119
|
+
|
120
|
+
def pre_process(segment)
|
121
|
+
segment
|
160
122
|
.extend(PragmaticTokenizer::PreProcessor)
|
161
123
|
.pre_process(language: @language_module)
|
162
124
|
end
|
163
125
|
|
164
|
-
def
|
165
|
-
|
166
|
-
remove_various!
|
126
|
+
def post_process_tokens
|
127
|
+
remove_by_options!
|
167
128
|
process_numbers!
|
168
129
|
process_punctuation!
|
169
130
|
expand_contractions! if @expand_contractions
|
@@ -177,45 +138,45 @@ module PragmaticTokenizer
|
|
177
138
|
@tokens.reject(&:empty?)
|
178
139
|
end
|
179
140
|
|
180
|
-
def run_post_processor(text)
|
181
|
-
PostProcessor.new(
|
182
|
-
text: chosen_case(text),
|
183
|
-
abbreviations: @abbreviations,
|
184
|
-
downcase: @downcase
|
185
|
-
).post_process
|
186
|
-
end
|
187
|
-
|
188
141
|
def expand_contractions!
|
189
|
-
@tokens = @tokens.flat_map { |
|
142
|
+
@tokens = @tokens.flat_map { |token| expand_token_contraction(token) }
|
190
143
|
end
|
191
144
|
|
192
145
|
def expand_token_contraction(token)
|
193
|
-
normalized = inverse_case(token.gsub(
|
146
|
+
normalized = inverse_case(token.gsub(Regex::CONTRACTIONS, SINGLE_QUOTE))
|
194
147
|
return token unless @contractions.key?(normalized)
|
195
|
-
result = @contractions[normalized].split(
|
148
|
+
result = @contractions[normalized].split(SPACE)
|
196
149
|
result[0] = Unicode.capitalize(result[0]) unless @downcase
|
197
150
|
result
|
198
151
|
end
|
199
152
|
|
200
153
|
def clean!
|
201
154
|
@tokens = @tokens
|
202
|
-
.flat_map
|
203
|
-
.map!
|
204
|
-
.
|
205
|
-
.delete_if { |t| unclean_token?(t) }
|
155
|
+
.flat_map { |token| split_underscores_asterisk(token) }
|
156
|
+
.map! { |token| remove_irrelevant_characters(token) }
|
157
|
+
.delete_if { |token| many_dots?(token) }
|
206
158
|
end
|
207
159
|
|
208
|
-
def
|
209
|
-
return
|
210
|
-
|
211
|
-
|
212
|
-
|
160
|
+
def split_underscores_asterisk(token)
|
161
|
+
return token if token =~ Regex::ONLY_HASHTAG_MENTION
|
162
|
+
token.split(Regex::UNDERSCORES_ASTERISK)
|
163
|
+
end
|
164
|
+
|
165
|
+
def remove_irrelevant_characters(token)
|
166
|
+
token.gsub!(Regex::IRRELEVANT_CHARACTERS, NOTHING)
|
167
|
+
return token if token =~ Regex::ONLY_HASHTAG_MENTION
|
168
|
+
token.gsub!(Regex::ENDS_WITH_EXCITED_ONE, NOTHING)
|
169
|
+
token
|
170
|
+
end
|
171
|
+
|
172
|
+
def many_dots?(token)
|
173
|
+
token =~ Regex::MANY_PERIODS
|
213
174
|
end
|
214
175
|
|
215
176
|
def classic_filter!
|
216
177
|
@tokens.map! do |token|
|
217
|
-
token.delete!(
|
218
|
-
token.sub!(
|
178
|
+
token.delete!(DOT) if @abbreviations.include?(token.chomp(DOT))
|
179
|
+
token.sub!(Regex::ENDS_WITH_APOSTROPHE_AND_S, NOTHING)
|
219
180
|
token
|
220
181
|
end
|
221
182
|
end
|
@@ -223,26 +184,26 @@ module PragmaticTokenizer
|
|
223
184
|
def process_numbers!
|
224
185
|
case @numbers
|
225
186
|
when :semi
|
226
|
-
@tokens.delete_if { |
|
187
|
+
@tokens.delete_if { |token| token =~ Regex::ONLY_DECIMALS }
|
227
188
|
when :none
|
228
|
-
@tokens.delete_if { |
|
189
|
+
@tokens.delete_if { |token| token =~ Regex::NO_DECIMALS_NO_NUMERALS }
|
229
190
|
when :only
|
230
|
-
@tokens.delete_if { |
|
191
|
+
@tokens.delete_if { |token| token =~ Regex::NO_DECIMALS }
|
231
192
|
end
|
232
193
|
end
|
233
194
|
|
234
195
|
def remove_short_tokens!
|
235
|
-
@tokens.delete_if { |
|
196
|
+
@tokens.delete_if { |token| token.length < @minimum_length }
|
236
197
|
end
|
237
198
|
|
238
199
|
def process_punctuation!
|
239
200
|
case @punctuation
|
240
201
|
when :semi
|
241
|
-
@tokens.delete_if { |
|
202
|
+
@tokens.delete_if { |token| token =~ Regex::PUNCTUATION4 }
|
242
203
|
when :none
|
243
|
-
@tokens.delete_if { |
|
204
|
+
@tokens.delete_if { |token| token =~ Regex::ONLY_PUNCTUATION }
|
244
205
|
when :only
|
245
|
-
@tokens.keep_if
|
206
|
+
@tokens.keep_if { |token| token =~ Regex::ONLY_PUNCTUATION }
|
246
207
|
end
|
247
208
|
end
|
248
209
|
|
@@ -253,45 +214,50 @@ module PragmaticTokenizer
|
|
253
214
|
def mentions!
|
254
215
|
case @mentions
|
255
216
|
when :remove
|
256
|
-
@tokens.delete_if { |
|
217
|
+
@tokens.delete_if { |token| token =~ Regex::ONLY_MENTION }
|
257
218
|
when :keep_and_clean
|
258
|
-
@tokens.map!
|
219
|
+
@tokens.map! { |token| token =~ Regex::ONLY_MENTION ? token[1..-1] : token }
|
259
220
|
end
|
260
221
|
end
|
261
222
|
|
262
223
|
def hashtags!
|
263
224
|
case @hashtags
|
264
225
|
when :remove
|
265
|
-
@tokens.delete_if { |
|
226
|
+
@tokens.delete_if { |token| token =~ Regex::ONLY_HASHTAG }
|
266
227
|
when :keep_and_clean
|
267
|
-
@tokens
|
268
|
-
.flat_map { |t| t =~ REGEXP_HYPHEN_HASTAG ? t.split(REGEX_HYPHEN) : t }
|
269
|
-
.map { |t| t =~ REGEXP_HASHTAG_AT_START ? t.gsub!(REGEXP_HASHTAG_AT_START, EMPTY_STRING) : t }
|
228
|
+
@tokens.map! { |token| token =~ Regex::ONLY_HASHTAG ? token[1..-1] : token }
|
270
229
|
end
|
271
230
|
end
|
272
231
|
|
273
|
-
def
|
274
|
-
@tokens.delete_if { |
|
232
|
+
def remove_by_options!
|
233
|
+
@tokens.delete_if { |token| token =~ regex_by_options }
|
275
234
|
end
|
276
235
|
|
277
|
-
def
|
278
|
-
@
|
236
|
+
def regex_by_options
|
237
|
+
@regex_by_options ||= begin
|
279
238
|
regex_array = []
|
280
|
-
regex_array <<
|
281
|
-
regex_array <<
|
282
|
-
regex_array <<
|
283
|
-
regex_array <<
|
239
|
+
regex_array << Regex::RANGE_UNUSUAL_AND_EMOJI if @remove_emoji
|
240
|
+
regex_array << Regex::ONLY_EMAIL if @remove_emails
|
241
|
+
regex_array << Regex::STARTS_WITH_HTTP if @remove_urls
|
242
|
+
regex_array << Regex::ONLY_DOMAIN2 if @remove_domains
|
284
243
|
Regexp.union(regex_array)
|
285
244
|
end
|
286
245
|
end
|
287
246
|
|
288
247
|
def split_long_words!
|
289
|
-
@tokens = @tokens
|
290
|
-
|
248
|
+
@tokens = @tokens.flat_map { |token| split_long_word(token) }
|
249
|
+
end
|
250
|
+
|
251
|
+
def split_long_word(token)
|
252
|
+
return token unless @long_word_split
|
253
|
+
return token if token.length <= @long_word_split
|
254
|
+
return token if token =~ Regex::ONLY_HASHTAG_MENTION
|
255
|
+
return token if token =~ Regex::DOMAIN_OR_EMAIL
|
256
|
+
token.split(Regex::LONG_WORD_SPLIT)
|
291
257
|
end
|
292
258
|
|
293
|
-
def chosen_case(
|
294
|
-
@downcase ? Unicode.downcase(
|
259
|
+
def chosen_case(text)
|
260
|
+
@downcase ? Unicode.downcase(text) : text
|
295
261
|
end
|
296
262
|
|
297
263
|
def inverse_case(token)
|