pragmatic_tokenizer 1.0.0 → 1.0.1
This diff shows the changes between publicly released versions of this package as they appear in its public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/README.md +11 -11
- data/lib/pragmatic_tokenizer/languages/common.rb +2 -1
- data/lib/pragmatic_tokenizer/languages/english.rb +1 -0
- data/lib/pragmatic_tokenizer/post_processor.rb +2 -2
- data/lib/pragmatic_tokenizer/pre_processor.rb +6 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +10 -3
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8cba2ce060ad1d9ffc74953a9e3a9504b1c8ed13
+  data.tar.gz: 3d96486358f974ce30165199381b27c3e01f7625
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8cc83be7dc5d9db9dd03d8895ea0a0d7bb1856b0f7b82698362dfb97f8e6cd32c3cbaccbb320ce04cb193594c765ae4cbbce5cb24634f3fe0aced37298ce75c5
+  data.tar.gz: f7edb187f8cc2f60aad58d79eac454b556dec53d77aa86a90d55d054d009783be8b4619da22550a79a01ac91540a7b1e3d5ea509e4c482821e32517f976c6208
data/README.md
CHANGED
@@ -85,7 +85,7 @@ options = {
 <hr>

 ##### `remove_stop_words`
-**default** = `
+**default** = `false`
 - `true`
   Removes all stop words.
 - `false`
@@ -94,7 +94,7 @@ options = {
 <hr>

 ##### `expand_contractions`
-**default** = `
+**default** = `false`
 - `true`
   Expands contractions (i.e. i'll -> i will).
 - `false`
@@ -135,7 +135,7 @@ options = {
 <hr>

 ##### `remove_emoji`
-**default** = `
+**default** = `false`
 - `true`
   Removes any token that contains an emoji.
 - `false`
@@ -144,7 +144,7 @@ options = {
 <hr>

 ##### `remove_urls`
-**default** = `
+**default** = `false`
 - `true`
   Removes any token that contains a URL.
 - `false`
@@ -153,7 +153,7 @@ options = {
 <hr>

 ##### `remove_domains`
-**default** = `
+**default** = `false`
 - `true`
   Removes any token that contains a domain.
 - `false`
@@ -162,7 +162,7 @@ options = {
 <hr>

 ##### `remove_domains`
-**default** = `
+**default** = `false`
 - `true`
   Removes any token that contains a domain.
 - `false`
@@ -171,7 +171,7 @@ options = {
 <hr>

 ##### `clean`
-**default** = `
+**default** = `false`
 - `true`
   Removes tokens consisting of only hypens, underscores, or periods as well as some special characters (®, ©, ™). Also removes long tokens or tokens with a backslash.
 - `false`
@@ -180,7 +180,7 @@ options = {
 <hr>

 ##### `hashtags`
-**default** =
+**default** = `:keep_original`
 - `:keep_original`
   Does not alter the token at all.
 - `:keep_and_clean`
@@ -191,7 +191,7 @@ options = {
 <hr>

 ##### `mentions`
-**default** =
+**default** = `:keep_original`
 - `:keep_original`
   Does not alter the token at all.
 - `:keep_and_clean`
@@ -202,7 +202,7 @@ options = {
 <hr>

 ##### `classic_filter`
-**default** = `
+**default** = `false`
 - `true`
   Removes dots from acronyms and 's from the end of tokens.
 - `false`
@@ -211,7 +211,7 @@ options = {
 <hr>

 ##### `downcase`
-**default** = `
+**default** = `true`

 <hr>
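The README changes simply fill in the previously blank default values. As orientation, here is how those options are passed, assuming the 1.0.x API in which the text goes to the constructor; the options hash mirrors the defaults documented above, and the token output is an illustrative assumption rather than something taken from this diff:

    require 'pragmatic_tokenizer'

    # All values below are the defaults now spelled out in the README.
    options = {
      remove_stop_words:   false,
      expand_contractions: false,
      remove_emoji:        false,
      remove_urls:         false,
      remove_domains:      false,
      clean:               false,
      hashtags:            :keep_original,
      mentions:            :keep_original,
      classic_filter:      false,
      downcase:            true
    }

    PragmaticTokenizer::Tokenizer.new("Hello, I'll be there!", options).tokenize
    # => ["hello", ",", "i'll", "be", "there", "!"]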
data/lib/pragmatic_tokenizer/languages/common.rb
CHANGED
@@ -2,7 +2,7 @@ module PragmaticTokenizer
   module Languages
     module Common
      PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^', '”'].freeze
-      PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘" }.freeze
+      PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘", "‘" => "⚭" }.freeze
      SEMI_PUNCTUATION = ['。', '.', '.'].freeze
      ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix'].freeze
      SPECIAL_CHARACTERS = ['®', '©', '™'].freeze
@@ -18,6 +18,7 @@ module PragmaticTokenizer
      def handle_single_quotes(text)
        # Convert left quotes to special character except for 'Twas or 'twas
        text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
+        text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' } || text
        text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
        # Separate right single quotes
        text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
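The functional change here is small: the curly left single quote (‘, U+2018) gets its own reversible placeholder (⚭) in PUNCTUATION_MAP, and handle_single_quotes now shifts it off the following word just as it does the straight quote, still sparing 'twas/'Twas via the lookaheads. A minimal sketch; the output is inferred from the regexes above, not taken from the gem's test suite:

    require 'pragmatic_tokenizer'

    # 1.0.0 left ‘ glued to the next word; 1.0.1 separates it into its own token.
    PragmaticTokenizer::Tokenizer.new("She said ‘hello and left").tokenize
    # => ["she", "said", "‘", "hello", "and", "left"]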
data/lib/pragmatic_tokenizer/languages/english.rb
CHANGED
@@ -99,6 +99,7 @@ module PragmaticTokenizer
      def handle_single_quotes(text)
        # Convert left quotes to special character except for 'Twas or 'twas
        text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
+        text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' } || text
        text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
        # Separate right single quotes
        text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
data/lib/pragmatic_tokenizer/post_processor.rb
CHANGED
@@ -17,6 +17,7 @@ module PragmaticTokenizer
      .flat_map { |t| t.include?("?") && t !~ /(http|https|www)(\.|:)/ && t.length > 1 ? t.gsub(/\?/, '\1 \2').split(' ').flatten : t }
      .flat_map { |t| t.include?("+") ? t.gsub!(/\+/, '\1 \2').split(' ').flatten : t }
      .flat_map { |t| t =~ /\A\.[^\.]/ && t.length > 1 ? t.gsub(/\./, '\1 ').split(' ').flatten : t }
+      .flat_map { |t| t =~ /\A\:\S{2,}/ ? t.gsub(/\:/, ': ').split(' ').flatten : t }
      .flat_map { |t| t.include?(".") &&
                      t !~ /(http|https|www)(\.|:)/ &&
                      t !~ /\.(com|net|org|edu|gov|mil|int)/ &&
@@ -35,14 +36,13 @@ module PragmaticTokenizer
                      abbreviations.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) ? t.gsub(/\./, '\1. \2').split(' ').flatten : t }
      .flat_map { |t| t =~ PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX ? t.gsub(PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX, '\1 \2').split(' ').flatten : t }
      .flat_map { |t| t =~ PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX ? t.gsub(PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX, '\1 \2').split(' ').flatten : t }
-      .flat_map { |t| t =~ /\A(#|#)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
    ).separate
  end

  private

  def convert_sym_to_punct(token)
-    symbol_matches = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘]/.match(token)
+    symbol_matches = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘ ⚭]/.match(token)
    symbol_matches.nil? ? token : token.gsub!(symbol_matches[0], PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(symbol_matches[0]))
  end
end
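Two edits here: tokens beginning with a colon followed by at least two non-space characters are now split after the colon, and the hashtag-with-dash split leaves this generic pipeline for the tokenizer's hashtags handling (see tokenizer.rb below). A sketch of the colon rule run in isolation, on assumed inputs:

    # The new colon rule from the flat_map above, standalone.
    ":results" =~ /\A\:\S{2,}/               # => 0 (matches)
    ":results".gsub(/\:/, ': ').split(' ')   # => [":", "results"]

    # A short emoticon has only one character after the colon, so it
    # fails \S{2,} and passes through untouched.
    ":)" =~ /\A\:\S{2,}/                     # => nil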
data/lib/pragmatic_tokenizer/pre_processor.rb
CHANGED
@@ -16,6 +16,7 @@ module PragmaticTokenizer
      shift_bracket(text)
      shift_semicolon(text)
      shift_caret(text)
+      shift_hashtag(text)
      shift_vertical_bar(text)
      convert_dbl_quotes(text)
      convert_sgl_quotes(text)
@@ -29,6 +30,7 @@ module PragmaticTokenizer
    def shift_comma(text)
      # Shift commas off everything but numbers
      text.gsub!(/,(?!\d)/o, ' , ') || text
+      text.gsub!(/(?<=\D),(?=\S+)/, ' , ') || text
    end

    def shift_multiple_dash(text)
@@ -78,6 +80,10 @@ module PragmaticTokenizer
      text.gsub!(/\^/, ' ^ ') || text
    end

+    def shift_hashtag(text)
+      text.gsub!(/(?<=\S)(#|#)(?=\S)/, ' \1\2') || text
+    end
+
    def shift_vertical_bar(text)
      text.gsub!(/\|/, ' | ') || text
    end
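Both additions are simple shifts. shift_hashtag detaches a hashtag glued to the preceding word, and the second shift_comma rule catches commas that sit between a non-digit and a digit, which the first rule's (?!\d) lookahead skips. Standalone sketches on assumed inputs:

    # shift_hashtag: space before an embedded # so it becomes its own token
    "great#ruby".gsub(/(?<=\S)(#|#)(?=\S)/, ' \1\2')
    # => "great #ruby"

    # shift_comma, second pass: a comma before a digit is now shifted off
    # when preceded by a non-digit, while 1,000 keeps its separator
    "price,900 and 1,000".gsub(/(?<=\D),(?=\S+)/, ' , ')
    # => "price , 900 and 1,000"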
data/lib/pragmatic_tokenizer/tokenizer.rb
CHANGED
@@ -160,11 +160,17 @@ module PragmaticTokenizer
      .flat_map { |t| t =~ /\_+(?=\z)/ ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
      .flat_map { |t| t =~ /\*+/ ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
      .map { |t| t.gsub(/[[:cntrl:]]/, '') }
+      .map { |t| t.gsub(/(?<=\A)\:(?=.+)/, '') }
+      .map { |t| t.gsub(/(?<=\A)!+(?=.+)/, '') }
+      .map { |t| t.gsub(/1+(?=\z)/, '') }
+      .map { |t| t.gsub(/!+(?=\z)/, '') }
+      .map { |t| t.gsub(/!+(1*!*)*(?=\z)/, '') }
      .delete_if { |t| t =~ /\A-+\z/ ||
                       PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
                       t =~ /\A\.{2,}\z/ || t.include?("\\") ||
                       t.length > 50 ||
-                       (t.length > 1 && t =~ /[&*+<=>^|~]/i)
+                       (t.length > 1 && t =~ /[&*+<=>^|~]/i) ||
+                       (t.length == 1 && t =~ /\:/)
      }
  end

@@ -211,7 +217,7 @@ module PragmaticTokenizer
  end

  def remove_emails!
-    @tokens.delete_if { |t| t =~ /\S+(@|@)\S+/ }.map { |t| t.chomp('.') }
+    @tokens.delete_if { |t| t =~ /\S+(@|@)\S+\.\S+/ }.map { |t| t.chomp('.') }
  end

  def mentions!
@@ -228,6 +234,7 @@ module PragmaticTokenizer
    when 'remove'
      @tokens.delete_if { |t| t =~ /\A(#|#)/ }
    when 'keep_and_clean'
+      @tokens = @tokens.flat_map { |t| t =~ /\A(#|#)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
      @tokens.map! { |t| t =~ /\A(#|#)/ ? t.gsub!(/(?<=\A)(#|#)/, '') : t }
    end
  end
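The email filter is tightened: the part after the @ must now contain a dot, so bare name@word strings are no longer silently dropped as emails. (The dash-splitting flat_map removed from post_processor.rb also resurfaces above, scoped to 'keep_and_clean' hashtags only.) A before/after probe of the email regexes, with assumed tokens:

    old = /\S+(@|@)\S+/        # 1.0.0: anything containing an @ counted as an email
    new = /\S+(@|@)\S+\.\S+/   # 1.0.1: requires a dot after the @

    "user@example.com" =~ new  # => 0    still removed by remove_emails!
    "price@9" =~ new           # => nil  kept in 1.0.1
    "price@9" =~ old           # => 0    1.0.0 would have deleted this token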
@@ -237,7 +244,7 @@ module PragmaticTokenizer
  end

  def remove_domains!
-    @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]
+    @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
  end

  def split_long_words!
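The rewritten domain pattern requires a leading label of at least two characters and a 2-6 letter TLD, with an optional port and path. Probing it with assumed inputs:

    domain = /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix

    "example.com"           =~ domain   # => 0    removed under remove_domains: true
    "sub-domain.example.io" =~ domain   # => 0
    "example.com:8080/path" =~ domain   # => 0
    "e.g"                   =~ domain   # => nil  a one-letter label no longer matches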
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.0.1
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-01-
+date: 2016-01-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode