pragmatic_tokenizer 3.0.3 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +1 -1
- data/lib/pragmatic_tokenizer/languages.rb +26 -26
- data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
- data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +36 -9
- data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/post_processor.rb +42 -88
- data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
- data/lib/pragmatic_tokenizer/regex.rb +149 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +82 -116
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +5 -6
- data/spec/languages/deutsch_spec.rb +1 -1
- data/spec/languages/english_spec.rb +52 -0
- data/spec/languages/french_spec.rb +2 -2
- data/spec/performance_spec.rb +1 -1
- data/spec/spec_helper.rb +1 -1
- metadata +8 -8
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -63
@@ -0,0 +1,149 @@
|
|
1
|
+
module PragmaticTokenizer
|
2
|
+
class Regex
|
3
|
+
|
4
|
+
# Things that can or should be done:
|
5
|
+
# - check where the use of unicode categories helps (\p{Abbreviation})
|
6
|
+
# - use URI.parse and other libraries instead of regexp to identify urls, domains, emails
|
7
|
+
# - check multiple domain regex, we have spec issues when using one or the other
|
8
|
+
# - check multiple punctuation regex
|
9
|
+
|
10
|
+
# Text that needs to be tokenized is initially split into chunks of this length:
|
11
|
+
CHUNK_LONG_INPUT_TEXT = /\S.{0,10000}(?!\S)/m # starts on a non-space, ends before whitespace/end; {0,…} so a one-character text still yields a chunk
|
12
|
+
|
13
|
+
# Ranges
|
14
|
+
RANGE_DINGBATS = /[\u2701-\u27BE]/ # e.g. ✁✎✳❄➾
|
15
|
+
RANGE_VARIATION_SELECTORS = /[\uFE00-\uFE0F]/ # alter the previous character
|
16
|
+
RANGE_FULLWIDTH = /[\uFF01-\ufF1F]/ # e.g. ！＂＃＇？
|
17
|
+
RANGE_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
|
18
|
+
RANGE_UNUSUAL_AND_EMOJI = /[\u203C-\u3299\u{1F000}-\u{1F644}]/
|
19
|
+
|
20
|
+
# Regular expressions which do not need to capture anything are enclosed in /(?: … )/ to enhance performance
|
21
|
+
COLON1 = /(?:(:)([[:print:]]{2,}))/ # two non-space after colon prevent matching emoticons
|
22
|
+
COLON2 = /(?::)/
|
23
|
+
COMMAS = /(?:([,‚])+)/
|
24
|
+
ENCLOSED_PLUS = /(?:([[:print:]]+)\+([[:print:]]+))/
|
25
|
+
EMAIL = /(?:[[:print:]]+[@＠][[:print:]]+\.[[:print:]]+)/ # the at sign may be ASCII "@" or fullwidth "＠" (U+FF20)
|
26
|
+
DIGIT = /(?:[[:digit:]]+)/
|
27
|
+
ASTERISK = /(?:\*+)/
|
28
|
+
UNDERSCORE = /(?:_+)/
|
29
|
+
HYPHEN_OR_UNDERSCORE = /(?:[-_])/
|
30
|
+
PERIOD_AND_PRIOR = /(?:(.+\.))/
|
31
|
+
PERIOD_ONLY = /(?:(\.))/
|
32
|
+
CONTRACTIONS = /(?:[‘’‚‛‹›'´`])/
|
33
|
+
PUNCTUATION1 = /(?:([\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Ps}])+)/ # all punctuation categories except Pc (Connector) and Po (other)
|
34
|
+
PUNCTUATION2 = /(?:(?<=\S)([!?#{RANGE_FULLWIDTH.source}]+))/
|
35
|
+
PUNCTUATION3 = /(?:[!%\-–\u00AD]+)/
|
36
|
+
PUNCTUATION4 = /(?:[.．。]+)/ # ASCII full stop, fullwidth full stop (U+FF0E) and ideographic full stop (U+3002)
|
37
|
+
DINGBATS = /(?:(#{RANGE_DINGBATS.source}#{RANGE_VARIATION_SELECTORS.source}*+))/
|
38
|
+
NO_BREAK_SPACE = /(?:\u00A0+)/
|
39
|
+
HTTP = /(?:https?:\/\/)/
|
40
|
+
TIME_WITH_COLON = /(?:\d:\d)/
|
41
|
+
DOMAIN_PREFIX = /(?:https?:\/\/|www\.|[[:alpha:]]\.)/
|
42
|
+
DOMAIN_SUFFIX = /(?:[[:alpha:]]\.(?:com|net|org|edu|gov|mil|int|[[:alpha:]]{2}))/
|
43
|
+
DOMAIN1 = /(?:((https?:\/\/|)[[:print:]]+\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?))/
|
44
|
+
DOMAIN2 = /(?:[[:alnum:]]{2,}([\-.][[:alnum:]]+)*\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?)/
|
45
|
+
NOT_URL = /(?:^(?!#{DOMAIN_PREFIX.source})([[:print:]]*))/
|
46
|
+
HASHTAG_OR_MENTION = /(?:[@＠#＃][[:print:]]+)/ # ASCII or fullwidth "@" / "#" followed by at least one printable character
|
47
|
+
HASHTAG = /(?:[#＃][[:print:]]+)/ # ASCII "#" or fullwidth "＃" (U+FF03) followed by at least one printable character
|
48
|
+
MENTION = /(?:[@＠][[:print:]]+)/ # ASCII "@" or fullwidth "＠" (U+FF20) followed by at least one printable character
|
49
|
+
HASHTAG_WITH_HYPHEN = /(?:^([#＃][[:digit:]]+)-)/ # captures a leading numeric hashtag (ASCII or fullwidth "#") that is followed by a hyphen
|
50
|
+
ONE_AS_EXCLAMATION = /(?:(?<=\D)1+)/ # run of "1" after a non-digit; lookbehind keeps that character out of the match so removing the match does not eat it
|
51
|
+
ONES_EXCLAMATIONS = /(?:!+(1*+!*+)*+)/
|
52
|
+
MANY_PERIODS = /(?:^\.{2,}$)/
|
53
|
+
COPYRIGHT_TRADEMARK = /(?:[®©™]+)/
|
54
|
+
CONTROL_CHARACTER = /(?:[[:cntrl:]]+)/ # matches any character with hexadecimal value 00 through 1F or 7F.
|
55
|
+
APOSTROPHE_AND_S = /(?:['’`́]s)/
|
56
|
+
ALSO_DECIMALS = /(?:[[:alpha:]]*+[[:digit:]]+)/
|
57
|
+
ACUTE_ACCENT_S = /(?:\s\u0301(?=s))/
|
58
|
+
|
59
|
+
# Regular expressions used to capture items
|
60
|
+
CAPTURE_UNUSUAL_AND_EMOJI = /(#{RANGE_UNUSUAL_AND_EMOJI.source})/
|
61
|
+
QUESTION_MARK_NOT_URL = /#{NOT_URL.source}(\?)/
|
62
|
+
# Should we change specs and also capture "/", just like we capture ":" and "?"?
|
63
|
+
SLASH_NOT_URL = /#{NOT_URL.source}\//
|
64
|
+
SHIFT_BOUNDARY_CHARACTERS = /([;^&|…«»„“¿¡≠]+)/
|
65
|
+
MULTIPLE_DOTS = /(\.{2,})/ # we keep all dots
|
66
|
+
MULTIPLE_DASHES = /(-){2,}/ # we only keep first dash
|
67
|
+
BRACKET = /([{}()\[\]])/
|
68
|
+
EXCLAMATION_BETWEEN_ALPHA = /(?<=[[:alpha:]])(!)(?=[[:alpha:]])/
|
69
|
+
PERCENT_BEFORE_DIGIT = /(%)\d+/
|
70
|
+
COMMA_BEFORE_NON_DIGIT = /(,)(?=\D)/
|
71
|
+
COMMA_AFTER_NON_DIGIT = /(?<=\D)(,)/
|
72
|
+
COLON_IN_URL = /(?<=https|http|ftp):(?=\/\/)/ # colon of a URL scheme; top-level alternation (not a character class) so only http, https and ftp qualify
|
73
|
+
QUOTE_BEFORE_PRINT = /(('')|["“])(?=[[:print:]])/
|
74
|
+
QUOTE = /('')|["”]/
|
75
|
+
HYPHEN_AFTER_NON_WORD = /(?<=\W)(-)/
|
76
|
+
HYPHEN_BEFORE_NON_WORD = /(-)(?=\W)/
|
77
|
+
|
78
|
+
STARTS_WITH_COMMAS = /^#{COMMAS.source}/
|
79
|
+
STARTS_WITH_HTTP = /^#{HTTP.source}/
|
80
|
+
STARTS_WITH_DOMAIN = /^#{DOMAIN_PREFIX.source}/
|
81
|
+
STARTS_WITH_COLON1 = /^#{COLON1.source}/
|
82
|
+
STARTS_WITH_UNDERSCORE = /^#{UNDERSCORE.source}/
|
83
|
+
STARTS_WITH_PUNCTUATION3 = /^#{PUNCTUATION3.source}/
|
84
|
+
|
85
|
+
ENDS_WITH_DOMAIN = /#{DOMAIN_SUFFIX.source}$/
|
86
|
+
ENDS_WITH_PUNCTUATION1 = /#{PUNCTUATION1.source}$/
|
87
|
+
ENDS_WITH_PUNCTUATION2 = /#{PUNCTUATION2.source}$/
|
88
|
+
ENDS_WITH_COLON2 = /#{COLON2.source}$/
|
89
|
+
ENDS_WITH_UNDERSCORE = /#{UNDERSCORE.source}$/
|
90
|
+
ENDS_WITH_ONES_EXCLAMATIONS = /#{ONES_EXCLAMATIONS.source}$/
|
91
|
+
ENDS_WITH_EXCITED_ONE = /#{ONE_AS_EXCLAMATION.source}$/
|
92
|
+
ENDS_WITH_APOSTROPHE_AND_S = /#{APOSTROPHE_AND_S.source}$/
|
93
|
+
ENDS_WITH_ALPHA = /[[:alpha:]]$/
|
94
|
+
ENDS_WITH_DIGIT = /[[:digit:]]$/
|
95
|
+
|
96
|
+
ONLY_DECIMALS = /(?:^[[:digit:]]+$)/
|
97
|
+
NO_DECIMALS = /(?:^\D+$)/
|
98
|
+
ONLY_PUNCTUATION = /^[[[:punct:]]^|+]+$/
|
99
|
+
ONLY_ROMAN_NUMERALS = /^(?=[MDCLXVI])M*(C[MD]|D?C*)(X[CL]|L?X*)(I[XV]|V?I*)$/i
|
100
|
+
ONLY_EMAIL = /^#{EMAIL}$/
|
101
|
+
ONLY_HASHTAG_MENTION = /^#{HASHTAG_OR_MENTION}$/
|
102
|
+
ONLY_HASHTAG = /^#{HASHTAG}$/
|
103
|
+
ONLY_MENTION = /^#{MENTION}$/
|
104
|
+
ONLY_DOMAIN1 = /^#{DOMAIN1}$/
|
105
|
+
ONLY_DOMAIN2 = /^#{DOMAIN2}$/
|
106
|
+
ONLY_DOMAIN3 = Regexp.union(STARTS_WITH_DOMAIN, ENDS_WITH_DOMAIN)
|
107
|
+
DOMAIN_OR_EMAIL = Regexp.union(ONLY_DOMAIN1, ONLY_EMAIL)
|
108
|
+
UNDERSCORES_ASTERISK = Regexp.union(STARTS_WITH_UNDERSCORE, ENDS_WITH_UNDERSCORE, ASTERISK)
|
109
|
+
NO_DECIMALS_NO_NUMERALS = Regexp.union(ALSO_DECIMALS, ONLY_ROMAN_NUMERALS)
|
110
|
+
|
111
|
+
COMMAS_OR_PUNCTUATION = Regexp.union(
|
112
|
+
STARTS_WITH_COMMAS,
|
113
|
+
ENDS_WITH_PUNCTUATION1,
|
114
|
+
ENDS_WITH_PUNCTUATION2
|
115
|
+
)
|
116
|
+
|
117
|
+
# Can this constant name be clarified?
|
118
|
+
VARIOUS = Regexp.union(
|
119
|
+
SLASH_NOT_URL,
|
120
|
+
QUESTION_MARK_NOT_URL,
|
121
|
+
ENCLOSED_PLUS,
|
122
|
+
STARTS_WITH_COLON1,
|
123
|
+
DINGBATS,
|
124
|
+
HASHTAG_WITH_HYPHEN,
|
125
|
+
CAPTURE_UNUSUAL_AND_EMOJI
|
126
|
+
)
|
127
|
+
|
128
|
+
IRRELEVANT_CHARACTERS = Regexp.union(
|
129
|
+
STARTS_WITH_PUNCTUATION3,
|
130
|
+
ENDS_WITH_COLON2,
|
131
|
+
ENDS_WITH_ONES_EXCLAMATIONS,
|
132
|
+
CONTROL_CHARACTER,
|
133
|
+
COPYRIGHT_TRADEMARK,
|
134
|
+
RANGE_ALPHANUMERIC_SUPPLEMENT
|
135
|
+
)
|
136
|
+
|
137
|
+
PRE_PROCESS = Regexp.union(
|
138
|
+
SHIFT_BOUNDARY_CHARACTERS,
|
139
|
+
MULTIPLE_DOTS,
|
140
|
+
BRACKET,
|
141
|
+
MULTIPLE_DASHES,
|
142
|
+
EXCLAMATION_BETWEEN_ALPHA,
|
143
|
+
PERCENT_BEFORE_DIGIT,
|
144
|
+
COMMA_BEFORE_NON_DIGIT,
|
145
|
+
COMMA_AFTER_NON_DIGIT
|
146
|
+
)
|
147
|
+
|
148
|
+
end
|
149
|
+
end
|
@@ -1,69 +1,22 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
1
|
require 'set'
|
3
2
|
require 'cgi'
|
3
|
+
require 'pragmatic_tokenizer/regex'
|
4
4
|
require 'pragmatic_tokenizer/languages'
|
5
5
|
require 'pragmatic_tokenizer/pre_processor'
|
6
6
|
require 'pragmatic_tokenizer/post_processor'
|
7
|
-
require 'pragmatic_tokenizer/full_stop_separator'
|
8
7
|
require 'unicode'
|
9
8
|
|
10
9
|
module PragmaticTokenizer
|
11
10
|
class Tokenizer
|
12
11
|
|
13
|
-
|
14
|
-
NUMBERS_OPTIONS = Set.new([
|
15
|
-
MENTIONS_OPTIONS = Set.new([
|
12
|
+
PUNCTUATION_OPTIONS = Set.new(%i[all semi none only]).freeze
|
13
|
+
NUMBERS_OPTIONS = Set.new(%i[all semi none only]).freeze
|
14
|
+
MENTIONS_OPTIONS = Set.new(%i[keep_original keep_and_clean remove]).freeze
|
16
15
|
MAX_TOKEN_LENGTH = 50
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
REGEX_URL = /(http|https)(\.|:)/
|
22
|
-
REGEX_HYPHEN = /\-/
|
23
|
-
REGEX_UNDERSCORE = /\_/
|
24
|
-
REGEX_CONTRACTIONS = /[‘’‚‛‹›'´`]/
|
25
|
-
REGEX_APOSTROPHE_S = /['’`́]s$/
|
26
|
-
REGEX_EMAIL = /\S+(@|@)\S+\.\S+/
|
27
|
-
REGEX_HASHTAG_OR_MENTION = /[@@#|#]/
|
28
|
-
REGEX_UNDERSCORE_AT_START = /(?<=\A)\_+/
|
29
|
-
REGEX_UNDERSCORE_AT_END = /\_+(?=\z)/
|
30
|
-
REGEX_ASTERISK = /\*+/
|
31
|
-
REGEX_UNIFIED1 = Regexp.union(REGEX_UNDERSCORE_AT_START,
|
32
|
-
REGEX_UNDERSCORE_AT_END,
|
33
|
-
REGEX_ASTERISK)
|
34
|
-
# https://en.wikipedia.org/wiki/Control_character
|
35
|
-
# matches any character with hexadecimal value 00 through 1F or 7F.
|
36
|
-
# Rubular: http://rubular.com/r/E83fpBoDjI
|
37
|
-
REGEXP_CONTROL = /[[:cntrl:]]/
|
38
|
-
REGEXP_ENDING_COLON = /\:(?=\z)/
|
39
|
-
REGEXP_EXCLAMATION_AT_START = /(?<=\A)!+(?=.+)/
|
40
|
-
REGEXP_EXCLAMATION_AT_END = /!+(1*!*)*(?=\z)/
|
41
|
-
REGEXP_HYPHEN_AT_START = /\A(-|–|\u{00AD})/
|
42
|
-
REGEXP_SPECIAL_SYMBOL = /[®©]/
|
43
|
-
REGEXP_PERCENT_AT_START = /\A\%/
|
44
|
-
# https://codepoints.net/enclosed_alphanumeric_supplement
|
45
|
-
REGEXP_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
|
46
|
-
REGEX_UNIFIED2 = Regexp.union(REGEXP_CONTROL,
|
47
|
-
REGEXP_ENDING_COLON,
|
48
|
-
REGEXP_EXCLAMATION_AT_START,
|
49
|
-
REGEXP_EXCLAMATION_AT_END,
|
50
|
-
REGEXP_HYPHEN_AT_START,
|
51
|
-
REGEXP_SPECIAL_SYMBOL,
|
52
|
-
REGEXP_PERCENT_AT_START,
|
53
|
-
REGEXP_ALPHANUMERIC_SUPPLEMENT)
|
54
|
-
REGEXP_ONE_AS_EXCLAMATION = /(?<=\D)1+(?=\z)/
|
55
|
-
REGEXP_HASHTAG_AT_START = /(?<=\A)(#|#)/
|
56
|
-
REGEXP_AT_SIGN_AT_START = /(?<=\A)(@|@)/
|
57
|
-
REGEXP_HYPHEN_HASTAG = /\A(#|#)\S+-/
|
58
|
-
REGEXP_EMOJI_SNOWFLAKE = /\u{2744}[\u{FE0F}|\u{FE0E}]?/
|
59
|
-
REGEX_EMOJI_UNIFIED = Regexp.union(REGEXP_EMOJI_SNOWFLAKE,
|
60
|
-
PragmaticTokenizer::Languages::Common::EMOJI_REGEX)
|
61
|
-
REGEXP_PUNCTUATION_ONLY = /\A[[:punct:]]+\z/
|
62
|
-
REGEXP_NUMBER_ONLY = /\A\d+\z/
|
63
|
-
REGEXP_NO_NUMBERS = /\A\D+\z/
|
64
|
-
REGEXP_NUMBER = /\D*\d+\d*/
|
65
|
-
REGEXP_CONSECUTIVE_DOTS = /\A\.{2,}\z/
|
66
|
-
REGEXP_CHUNK_STRING = /.{,10000}(?=\s|\z)/m
|
16
|
+
NOTHING = ''.freeze
|
17
|
+
DOT = '.'.freeze
|
18
|
+
SPACE = ' '.freeze
|
19
|
+
SINGLE_QUOTE = "'".freeze
|
67
20
|
|
68
21
|
# @param [Hash] opts optional arguments
|
69
22
|
|
@@ -123,10 +76,10 @@ module PragmaticTokenizer
|
|
123
76
|
@abbreviations = Set.new(opts[:abbreviations])
|
124
77
|
@stop_words = Set.new(opts[:stop_words])
|
125
78
|
|
126
|
-
#
|
79
|
+
# Why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
|
127
80
|
@contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
|
128
81
|
@abbreviations += @language_module::ABBREVIATIONS if @abbreviations.empty?
|
129
|
-
@stop_words += @language_module::STOP_WORDS if @stop_words.empty?
|
82
|
+
@stop_words += @language_module::STOP_WORDS if @stop_words.empty?
|
130
83
|
|
131
84
|
@filter_languages.each do |lang|
|
132
85
|
language = Languages.get_language_by_code(lang)
|
@@ -135,34 +88,43 @@ module PragmaticTokenizer
|
|
135
88
|
@stop_words += language::STOP_WORDS
|
136
89
|
end
|
137
90
|
|
138
|
-
raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless
|
91
|
+
raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTUATION_OPTIONS.include?(@punctuation)
|
139
92
|
raise "Numbers argument can be only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
|
140
93
|
raise "Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)
|
141
|
-
|
142
|
-
|
94
|
+
|
95
|
+
integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer
|
96
|
+
|
97
|
+
raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.class == integer_class || @minimum_length.nil?
|
98
|
+
raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == integer_class || @long_word_split.nil?
|
143
99
|
end
|
144
100
|
|
145
101
|
# @param [String] text to be tokenized
|
146
102
|
|
147
103
|
def tokenize(text)
|
148
104
|
return [] unless text
|
149
|
-
raise "In
|
105
|
+
raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
|
150
106
|
CGI.unescapeHTML(text)
|
151
|
-
.scan(
|
152
|
-
.flat_map { |segment|
|
107
|
+
.scan(Regex::CHUNK_LONG_INPUT_TEXT)
|
108
|
+
.flat_map { |segment| process_segment(segment) }
|
153
109
|
end
|
154
110
|
|
155
111
|
private
|
156
112
|
|
157
|
-
def
|
158
|
-
|
113
|
+
def process_segment(segment)
|
114
|
+
pre_processed = pre_process(segment)
|
115
|
+
cased_segment = chosen_case(pre_processed)
|
116
|
+
@tokens = PostProcessor.new(text: cased_segment, abbreviations: @abbreviations, downcase: @downcase).call
|
117
|
+
post_process_tokens
|
118
|
+
end
|
119
|
+
|
120
|
+
def pre_process(segment)
|
121
|
+
segment
|
159
122
|
.extend(PragmaticTokenizer::PreProcessor)
|
160
123
|
.pre_process(language: @language_module)
|
161
124
|
end
|
162
125
|
|
163
|
-
def
|
164
|
-
|
165
|
-
remove_various!
|
126
|
+
def post_process_tokens
|
127
|
+
remove_by_options!
|
166
128
|
process_numbers!
|
167
129
|
process_punctuation!
|
168
130
|
expand_contractions! if @expand_contractions
|
@@ -176,45 +138,45 @@ module PragmaticTokenizer
|
|
176
138
|
@tokens.reject(&:empty?)
|
177
139
|
end
|
178
140
|
|
179
|
-
def run_post_processor(text)
|
180
|
-
PostProcessor.new(
|
181
|
-
text: chosen_case(text),
|
182
|
-
abbreviations: @abbreviations,
|
183
|
-
downcase: @downcase
|
184
|
-
).post_process
|
185
|
-
end
|
186
|
-
|
187
141
|
def expand_contractions!
|
188
|
-
@tokens = @tokens.flat_map { |
|
142
|
+
@tokens = @tokens.flat_map { |token| expand_token_contraction(token) }
|
189
143
|
end
|
190
144
|
|
191
145
|
def expand_token_contraction(token)
|
192
|
-
normalized = inverse_case(token.gsub(
|
146
|
+
normalized = inverse_case(token.gsub(Regex::CONTRACTIONS, SINGLE_QUOTE))
|
193
147
|
return token unless @contractions.key?(normalized)
|
194
|
-
result = @contractions[normalized].split(
|
148
|
+
result = @contractions[normalized].split(SPACE)
|
195
149
|
result[0] = Unicode.capitalize(result[0]) unless @downcase
|
196
150
|
result
|
197
151
|
end
|
198
152
|
|
199
153
|
def clean!
|
200
154
|
@tokens = @tokens
|
201
|
-
.flat_map
|
202
|
-
.map!
|
203
|
-
.
|
204
|
-
.delete_if { |t| unclean_token?(t) }
|
155
|
+
.flat_map { |token| split_underscores_asterisk(token) }
|
156
|
+
.map! { |token| remove_irrelevant_characters(token) }
|
157
|
+
.delete_if { |token| many_dots?(token) }
|
205
158
|
end
|
206
159
|
|
207
|
-
def
|
208
|
-
return
|
209
|
-
|
210
|
-
|
211
|
-
|
160
|
+
def split_underscores_asterisk(token)
|
161
|
+
return token if token =~ Regex::ONLY_HASHTAG_MENTION
|
162
|
+
token.split(Regex::UNDERSCORES_ASTERISK)
|
163
|
+
end
|
164
|
+
|
165
|
+
def remove_irrelevant_characters(token)
|
166
|
+
token.gsub!(Regex::IRRELEVANT_CHARACTERS, NOTHING)
|
167
|
+
return token if token =~ Regex::ONLY_HASHTAG_MENTION
|
168
|
+
token.gsub!(Regex::ENDS_WITH_EXCITED_ONE, NOTHING)
|
169
|
+
token
|
170
|
+
end
|
171
|
+
|
172
|
+
def many_dots?(token)
|
173
|
+
token =~ Regex::MANY_PERIODS
|
212
174
|
end
|
213
175
|
|
214
176
|
def classic_filter!
|
215
177
|
@tokens.map! do |token|
|
216
|
-
token.delete!(
|
217
|
-
token.sub!(
|
178
|
+
token.delete!(DOT) if @abbreviations.include?(token.chomp(DOT))
|
179
|
+
token.sub!(Regex::ENDS_WITH_APOSTROPHE_AND_S, NOTHING)
|
218
180
|
token
|
219
181
|
end
|
220
182
|
end
|
@@ -222,26 +184,26 @@ module PragmaticTokenizer
|
|
222
184
|
def process_numbers!
|
223
185
|
case @numbers
|
224
186
|
when :semi
|
225
|
-
@tokens.delete_if { |
|
187
|
+
@tokens.delete_if { |token| token =~ Regex::ONLY_DECIMALS }
|
226
188
|
when :none
|
227
|
-
@tokens.delete_if { |
|
189
|
+
@tokens.delete_if { |token| token =~ Regex::NO_DECIMALS_NO_NUMERALS }
|
228
190
|
when :only
|
229
|
-
@tokens.delete_if { |
|
191
|
+
@tokens.delete_if { |token| token =~ Regex::NO_DECIMALS }
|
230
192
|
end
|
231
193
|
end
|
232
194
|
|
233
195
|
def remove_short_tokens!
|
234
|
-
@tokens.delete_if { |
|
196
|
+
@tokens.delete_if { |token| token.length < @minimum_length }
|
235
197
|
end
|
236
198
|
|
237
199
|
def process_punctuation!
|
238
200
|
case @punctuation
|
239
201
|
when :semi
|
240
|
-
@tokens.delete_if { |
|
202
|
+
@tokens.delete_if { |token| token =~ Regex::PUNCTUATION4 }
|
241
203
|
when :none
|
242
|
-
@tokens.delete_if { |
|
204
|
+
@tokens.delete_if { |token| token =~ Regex::ONLY_PUNCTUATION }
|
243
205
|
when :only
|
244
|
-
@tokens.keep_if
|
206
|
+
@tokens.keep_if { |token| token =~ Regex::ONLY_PUNCTUATION }
|
245
207
|
end
|
246
208
|
end
|
247
209
|
|
@@ -252,46 +214,50 @@ module PragmaticTokenizer
|
|
252
214
|
def mentions!
|
253
215
|
case @mentions
|
254
216
|
when :remove
|
255
|
-
@tokens.delete_if { |
|
217
|
+
@tokens.delete_if { |token| token =~ Regex::ONLY_MENTION }
|
256
218
|
when :keep_and_clean
|
257
|
-
@tokens.map!
|
219
|
+
@tokens.map! { |token| token =~ Regex::ONLY_MENTION ? token[1..-1] : token }
|
258
220
|
end
|
259
221
|
end
|
260
222
|
|
261
223
|
def hashtags!
|
262
224
|
case @hashtags
|
263
225
|
when :remove
|
264
|
-
@tokens.delete_if { |
|
226
|
+
@tokens.delete_if { |token| token =~ Regex::ONLY_HASHTAG }
|
265
227
|
when :keep_and_clean
|
266
|
-
@tokens
|
267
|
-
.flat_map { |t| t =~ REGEXP_HYPHEN_HASTAG ? t.split(REGEX_HYPHEN) : t }
|
268
|
-
.map { |t| t =~ REGEXP_HASHTAG_AT_START ? t.gsub!(REGEXP_HASHTAG_AT_START, EMPTY_STRING) : t }
|
228
|
+
@tokens.map! { |token| token =~ Regex::ONLY_HASHTAG ? token[1..-1] : token }
|
269
229
|
end
|
270
230
|
end
|
271
231
|
|
272
|
-
def
|
273
|
-
@tokens.delete_if { |
|
232
|
+
def remove_by_options!
|
233
|
+
@tokens.delete_if { |token| token =~ regex_by_options }
|
274
234
|
end
|
275
235
|
|
276
|
-
def
|
277
|
-
@
|
236
|
+
def regex_by_options
|
237
|
+
@regex_by_options ||= begin
|
278
238
|
regex_array = []
|
279
|
-
regex_array <<
|
280
|
-
regex_array <<
|
281
|
-
regex_array <<
|
282
|
-
regex_array <<
|
239
|
+
regex_array << Regex::RANGE_UNUSUAL_AND_EMOJI if @remove_emoji
|
240
|
+
regex_array << Regex::ONLY_EMAIL if @remove_emails
|
241
|
+
regex_array << Regex::STARTS_WITH_HTTP if @remove_urls
|
242
|
+
regex_array << Regex::ONLY_DOMAIN2 if @remove_domains
|
283
243
|
Regexp.union(regex_array)
|
284
244
|
end
|
285
245
|
end
|
286
246
|
|
287
247
|
def split_long_words!
|
288
|
-
@tokens = @tokens
|
289
|
-
|
290
|
-
|
248
|
+
@tokens = @tokens.flat_map { |token| split_long_word(token) }
|
249
|
+
end
|
250
|
+
|
251
|
+
def split_long_word(token)
|
252
|
+
return token unless @long_word_split
|
253
|
+
return token if token.length <= @long_word_split
|
254
|
+
return token if token =~ Regex::ONLY_HASHTAG_MENTION
|
255
|
+
return token if token =~ Regex::DOMAIN_OR_EMAIL
|
256
|
+
token.split(Regex::HYPHEN_OR_UNDERSCORE)
|
291
257
|
end
|
292
258
|
|
293
|
-
def chosen_case(
|
294
|
-
@downcase ? Unicode.downcase(
|
259
|
+
def chosen_case(text)
|
260
|
+
@downcase ? Unicode.downcase(text) : text
|
295
261
|
end
|
296
262
|
|
297
263
|
def inverse_case(token)
|