pragmatic_tokenizer 1.0.0 → 1.0.1
- checksums.yaml +4 -4
- data/README.md +11 -11
- data/lib/pragmatic_tokenizer/languages/common.rb +2 -1
- data/lib/pragmatic_tokenizer/languages/english.rb +1 -0
- data/lib/pragmatic_tokenizer/post_processor.rb +2 -2
- data/lib/pragmatic_tokenizer/pre_processor.rb +6 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +10 -3
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8cba2ce060ad1d9ffc74953a9e3a9504b1c8ed13
+  data.tar.gz: 3d96486358f974ce30165199381b27c3e01f7625
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8cc83be7dc5d9db9dd03d8895ea0a0d7bb1856b0f7b82698362dfb97f8e6cd32c3cbaccbb320ce04cb193594c765ae4cbbce5cb24634f3fe0aced37298ce75c5
+  data.tar.gz: f7edb187f8cc2f60aad58d79eac454b556dec53d77aa86a90d55d054d009783be8b4619da22550a79a01ac91540a7b1e3d5ea509e4c482821e32517f976c6208
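The new digests can be checked against a downloaded gem. A minimal verification sketch, assuming the `.gem` archive has already been unpacked into the current directory (the unpack step itself is not part of this release):

```ruby
require 'digest'

# A .gem file is a tar archive containing metadata.gz and data.tar.gz.
# After `tar -xf pragmatic_tokenizer-1.0.1.gem`, these digests should
# match the + lines in checksums.yaml above.
%w[metadata.gz data.tar.gz].each do |name|
  puts "#{name} SHA1:   #{Digest::SHA1.file(name).hexdigest}"
  puts "#{name} SHA512: #{Digest::SHA512.file(name).hexdigest}"
end
```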
data/README.md CHANGED
@@ -85,7 +85,7 @@ options = {
 <hr>
 
 ##### `remove_stop_words`
-**default** = `
+**default** = `false`
 - `true`
 Removes all stop words.
 - `false`
@@ -94,7 +94,7 @@ options = {
 <hr>
 
 ##### `expand_contractions`
-**default** = `
+**default** = `false`
 - `true`
 Expands contractions (i.e. i'll -> i will).
 - `false`
@@ -135,7 +135,7 @@ options = {
 <hr>
 
 ##### `remove_emoji`
-**default** = `
+**default** = `false`
 - `true`
 Removes any token that contains an emoji.
 - `false`
@@ -144,7 +144,7 @@ options = {
 <hr>
 
 ##### `remove_urls`
-**default** = `
+**default** = `false`
 - `true`
 Removes any token that contains a URL.
 - `false`
@@ -153,7 +153,7 @@ options = {
 <hr>
 
 ##### `remove_domains`
-**default** = `
+**default** = `false`
 - `true`
 Removes any token that contains a domain.
 - `false`
@@ -162,7 +162,7 @@ options = {
 <hr>
 
 ##### `remove_domains`
-**default** = `
+**default** = `false`
 - `true`
 Removes any token that contains a domain.
 - `false`
@@ -171,7 +171,7 @@ options = {
 <hr>
 
 ##### `clean`
-**default** = `
+**default** = `false`
 - `true`
 Removes tokens consisting of only hyphens, underscores, or periods as well as some special characters (®, ©, ™). Also removes long tokens or tokens with a backslash.
 - `false`
@@ -180,7 +180,7 @@ options = {
 <hr>
 
 ##### `hashtags`
-**default** =
+**default** = `:keep_original`
 - `:keep_original`
 Does not alter the token at all.
 - `:keep_and_clean`
@@ -191,7 +191,7 @@ options = {
 <hr>
 
 ##### `mentions`
-**default** =
+**default** = `:keep_original`
 - `:keep_original`
 Does not alter the token at all.
 - `:keep_and_clean`
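Both token-level options default to `:keep_original`, which passes tokens through untouched. A sketch of the two documented modes on a mention, under the same assumed 1.0.x API (the `:keep_and_clean` behavior here mirrors the hashtag-stripping code shown later in tokenizer.rb):

```ruby
require 'pragmatic_tokenizer'

text = "@some_user loves #ruby"
PragmaticTokenizer::Tokenizer.new(text, mentions: :keep_original).tokenize
# the "@some_user" token is not altered at all
PragmaticTokenizer::Tokenizer.new(text, mentions: :keep_and_clean).tokenize
# the leading @ is stripped and the rest of the token is kept
```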
@@ -202,7 +202,7 @@ options = {
 <hr>
 
 ##### `classic_filter`
-**default** = `
+**default** = `false`
 - `true`
 Removes dots from acronyms and 's from the end of tokens.
 - `false`
@@ -211,7 +211,7 @@ options = {
 <hr>
 
 ##### `downcase`
-**default** = `
+**default** = `true`
 
 <hr>
 
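`downcase` is the one option whose default is `true`. A short sketch of these last two options, same assumed 1.0.x API as above:

```ruby
require 'pragmatic_tokenizer'

PragmaticTokenizer::Tokenizer.new("The U.S.A.'s Tokenizer",
  classic_filter: true    # drops the dots of "U.S.A." and the trailing 's
).tokenize
# tokens come back downcased by default; pass downcase: false to keep case
```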
data/lib/pragmatic_tokenizer/languages/common.rb CHANGED
@@ -2,7 +2,7 @@ module PragmaticTokenizer
   module Languages
     module Common
       PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^', '”'].freeze
-      PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘" }.freeze
+      PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘", "‘" => "⚭" }.freeze
       SEMI_PUNCTUATION = ['。', '.', '.'].freeze
       ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix'].freeze
       SPECIAL_CHARACTERS = ['®', '©', '™'].freeze
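The map gains a placeholder for the typographic left quote. Punctuation that must survive splitting is masked with these rare glyphs and restored afterwards by `convert_sym_to_punct`; a minimal round-trip sketch using just the new entry:

```ruby
# Subset of PUNCTUATION_MAP above, including the new entry.
map = { "‘" => "⚭" }

token  = "‘quoted"
masked = token.gsub("‘", map["‘"])  # => "⚭quoted" (safe to split around)
masked.gsub("⚭", map.key("⚭"))      # => "‘quoted" (restored afterwards)
```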
@@ -18,6 +18,7 @@ module PragmaticTokenizer
       def handle_single_quotes(text)
         # Convert left quotes to special character except for 'Twas or 'twas
         text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
+        text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' } || text
         text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
         # Separate right single quotes
         text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
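The added line extends the `'Twas`/`'twas` exception to the curly left quote (U+2018). Its match behavior in isolation (the constant name is just for illustration):

```ruby
LEFT_QUOTE = /(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/

"he said ‘hello"[LEFT_QUOTE]   # => " ‘" (masked, as with the ASCII quote)
"‘twas the night"[LEFT_QUOTE]  # => nil (the contraction stays intact)
```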
data/lib/pragmatic_tokenizer/languages/english.rb CHANGED
@@ -99,6 +99,7 @@ module PragmaticTokenizer
       def handle_single_quotes(text)
         # Convert left quotes to special character except for 'Twas or 'twas
         text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
+        text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' } || text
         text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
         # Separate right single quotes
         text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
data/lib/pragmatic_tokenizer/post_processor.rb CHANGED
@@ -17,6 +17,7 @@ module PragmaticTokenizer
         .flat_map { |t| t.include?("?") && t !~ /(http|https|www)(\.|:)/ && t.length > 1 ? t.gsub(/\?/, '\1 \2').split(' ').flatten : t }
         .flat_map { |t| t.include?("+") ? t.gsub!(/\+/, '\1 \2').split(' ').flatten : t }
         .flat_map { |t| t =~ /\A\.[^\.]/ && t.length > 1 ? t.gsub(/\./, '\1 ').split(' ').flatten : t }
+        .flat_map { |t| t =~ /\A\:\S{2,}/ ? t.gsub(/\:/, ': ').split(' ').flatten : t }
         .flat_map { |t| t.include?(".") &&
                         t !~ /(http|https|www)(\.|:)/ &&
                         t !~ /\.(com|net|org|edu|gov|mil|int)/ &&
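The new colon rule in isolation: a colon glued to a longer token is shifted off as its own token, while a short emoticon like `:)` has only one character after the colon and is left alone. A sketch:

```ruby
# Mirrors the added flat_map step above.
split_colon = ->(t) { t =~ /\A\:\S{2,}/ ? t.gsub(/\:/, ': ').split(' ') : [t] }

split_colon.call(":note")  # => [":", "note"]
split_colon.call(":)")     # => [":)"]
```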
@@ -35,14 +36,13 @@ module PragmaticTokenizer
                         abbreviations.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) ? t.gsub(/\./, '\1. \2').split(' ').flatten : t }
         .flat_map { |t| t =~ PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX ? t.gsub(PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX, '\1 \2').split(' ').flatten : t }
         .flat_map { |t| t =~ PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX ? t.gsub(PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX, '\1 \2').split(' ').flatten : t }
-        .flat_map { |t| t =~ /\A(#|#)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
       ).separate
     end
 
     private
 
     def convert_sym_to_punct(token)
-      symbol_matches = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘]/.match(token)
+      symbol_matches = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘ ⚭]/.match(token)
       symbol_matches.nil? ? token : token.gsub!(symbol_matches[0], PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(symbol_matches[0]))
     end
   end
data/lib/pragmatic_tokenizer/pre_processor.rb CHANGED
@@ -16,6 +16,7 @@ module PragmaticTokenizer
       shift_bracket(text)
       shift_semicolon(text)
       shift_caret(text)
+      shift_hashtag(text)
       shift_vertical_bar(text)
       convert_dbl_quotes(text)
       convert_sgl_quotes(text)
@@ -29,6 +30,7 @@ module PragmaticTokenizer
     def shift_comma(text)
       # Shift commas off everything but numbers
       text.gsub!(/,(?!\d)/o, ' , ') || text
+      text.gsub!(/(?<=\D),(?=\S+)/, ' , ') || text
     end
 
     def shift_multiple_dash(text)
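The existing rule only shifts a comma not followed by a digit; the added lookbehind rule also catches a comma between a letter and a digit, while thousands separators still pass through. A sketch of the two passes together:

```ruby
# Hypothetical input exercising both comma rules.
text = "a,1 and 1,000"
text.gsub!(/,(?!\d)/o, ' , ') || text        # no-op: both commas precede digits
text.gsub!(/(?<=\D),(?=\S+)/, ' , ') || text # => "a , 1 and 1,000"
```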
@@ -78,6 +80,10 @@ module PragmaticTokenizer
       text.gsub!(/\^/, ' ^ ') || text
     end
 
+    def shift_hashtag(text)
+      text.gsub!(/(?<=\S)(#|#)(?=\S)/, ' \1\2') || text
+    end
+
     def shift_vertical_bar(text)
       text.gsub!(/\|/, ' | ') || text
     end
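The new `shift_hashtag` splits a hashtag (ASCII `#` or fullwidth `#`) that is glued to the tail of another word, so it survives as its own token; a standalone hashtag is untouched because the lookbehind requires a non-space before it. In isolation:

```ruby
# The added rule applied to a hypothetical input.
text = "ruby#tokenizer #keep"
text.gsub!(/(?<=\S)(#|#)(?=\S)/, ' \1\2') || text
# => "ruby #tokenizer #keep"
```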
data/lib/pragmatic_tokenizer/tokenizer.rb CHANGED
@@ -160,11 +160,17 @@ module PragmaticTokenizer
         .flat_map { |t| t =~ /\_+(?=\z)/ ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
         .flat_map { |t| t =~ /\*+/ ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
         .map { |t| t.gsub(/[[:cntrl:]]/, '') }
+        .map { |t| t.gsub(/(?<=\A)\:(?=.+)/, '') }
+        .map { |t| t.gsub(/(?<=\A)!+(?=.+)/, '') }
+        .map { |t| t.gsub(/1+(?=\z)/, '') }
+        .map { |t| t.gsub(/!+(?=\z)/, '') }
+        .map { |t| t.gsub(/!+(1*!*)*(?=\z)/, '') }
         .delete_if { |t| t =~ /\A-+\z/ ||
                          PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
                          t =~ /\A\.{2,}\z/ || t.include?("\\") ||
                          t.length > 50 ||
-                         (t.length > 1 && t =~ /[&*+<=>^|~]/i)
+                         (t.length > 1 && t =~ /[&*+<=>^|~]/i) ||
+                         (t.length == 1 && t =~ /\:/)
       }
     end
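The five new cleanup passes strip leading colons and bangs and trailing "!!!111"-style runs from each token, and the new `delete_if` clause then drops any token reduced to a bare `:`. A sketch of the individual substitutions:

```ruby
# Each gsub mirrors one of the added map steps above.
"hello!!!111".gsub(/!+(1*!*)*(?=\z)/, '')  # => "hello"
":note".gsub(/(?<=\A)\:(?=.+)/, '')        # => "note"
"wow!!".gsub(/!+(?=\z)/, '')               # => "wow"
```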
@@ -211,7 +217,7 @@ module PragmaticTokenizer
     end
 
     def remove_emails!
-      @tokens.delete_if { |t| t =~ /\S+(@|@)\S+/ }.map { |t| t.chomp('.') }
+      @tokens.delete_if { |t| t =~ /\S+(@|@)\S+\.\S+/ }.map { |t| t.chomp('.') }
     end
 
     def mentions!
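The tightened email pattern now requires a dot after the `@` part, so tokens that merely contain an `@` are no longer swallowed:

```ruby
email = /\S+(@|@)\S+\.\S+/   # the new pattern

"user@example.com" =~ email  # => 0   (still removed)
"user@localhost"   =~ email  # => nil (now kept)
```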
@@ -228,6 +234,7 @@ module PragmaticTokenizer
       when 'remove'
         @tokens.delete_if { |t| t =~ /\A(#|#)/ }
       when 'keep_and_clean'
+        @tokens = @tokens.flat_map { |t| t =~ /\A(#|#)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
        @tokens.map! { |t| t =~ /\A(#|#)/ ? t.gsub!(/(?<=\A)(#|#)/, '') : t }
       end
     end
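This is the hyphen-splitting step removed from the post-processor above: it now runs only in `:keep_and_clean` mode, splitting a hyphenated hashtag before the leading `#` is stripped. A sketch of the two steps on one token:

```ruby
# Hypothetical hyphenated hashtag through the :keep_and_clean branch.
t = "#ruby-tokenizer"
parts = t =~ /\A(#|#)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ') : [t]
# => ["#ruby", "tokenizer"]
parts.map { |x| x.gsub(/(?<=\A)(#|#)/, '') }
# => ["ruby", "tokenizer"]
```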
@@ -237,7 +244,7 @@ module PragmaticTokenizer
     end
 
     def remove_domains!
-      @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]
+      @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
     end
 
     def split_long_words!
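The completed domain pattern expects a host of two or more alphanumerics, optional dot- or hyphen-separated parts, a 2-6 letter TLD, and an optional port and path:

```ruby
domain = /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix

"example.com"      =~ domain  # => 0   (token removed)
"example.com:8080" =~ domain  # => 0   (optional port is covered)
"a.b"              =~ domain  # => nil (host needs at least two characters)
```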
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.0.1
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-01-
+date: 2016-01-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode