pragmatic_tokenizer 1.0.0 → 1.0.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: c4834da7c6c1b1d6c614226840bb2fd5ef8b48b6
-  data.tar.gz: 395868d67e973b2a6e9e28b4b9883c95d1746fe6
+  metadata.gz: 8cba2ce060ad1d9ffc74953a9e3a9504b1c8ed13
+  data.tar.gz: 3d96486358f974ce30165199381b27c3e01f7625
 SHA512:
-  metadata.gz: cc69a6f19545c9f5755df5c996e0625f0e65883fea81f01a877d10fce5f5b4eba8931529aecff9afb2ce56f8b993350d9bad15a94a5bb718db4eeafbbe611a29
-  data.tar.gz: f08442f148d59d98d3970e50ccc3bab2d59c1728fb06d9fefe4670ff5b4aca688168c81c30da3a49d1d800d8b398d53e76d9482c120450f2edc90b1b3c174617
+  metadata.gz: 8cc83be7dc5d9db9dd03d8895ea0a0d7bb1856b0f7b82698362dfb97f8e6cd32c3cbaccbb320ce04cb193594c765ae4cbbce5cb24634f3fe0aced37298ce75c5
+  data.tar.gz: f7edb187f8cc2f60aad58d79eac454b556dec53d77aa86a90d55d054d009783be8b4619da22550a79a01ac91540a7b1e3d5ea509e4c482821e32517f976c6208
data/README.md CHANGED
@@ -85,7 +85,7 @@ options = {
 <hr>
 
 ##### `remove_stop_words`
-**default** = `'false'`
+**default** = `false`
 - `true`
   Removes all stop words.
 - `false`
@@ -94,7 +94,7 @@ options = {
 <hr>
 
 ##### `expand_contractions`
-**default** = `'false'`
+**default** = `false`
 - `true`
   Expands contractions (i.e. i'll -> i will).
 - `false`
@@ -135,7 +135,7 @@ options = {
 <hr>
 
 ##### `remove_emoji`
-**default** = `'false'`
+**default** = `false`
 - `true`
   Removes any token that contains an emoji.
 - `false`
@@ -144,7 +144,7 @@ options = {
 <hr>
 
 ##### `remove_urls`
-**default** = `'false'`
+**default** = `false`
 - `true`
   Removes any token that contains a URL.
 - `false`
@@ -153,7 +153,7 @@ options = {
 <hr>
 
 ##### `remove_domains`
-**default** = `'false'`
+**default** = `false`
 - `true`
   Removes any token that contains a domain.
 - `false`
@@ -162,7 +162,7 @@ options = {
 <hr>
 
 ##### `remove_domains`
-**default** = `'false'`
+**default** = `false`
 - `true`
   Removes any token that contains a domain.
 - `false`
@@ -171,7 +171,7 @@ options = {
 <hr>
 
 ##### `clean`
-**default** = `'false'`
+**default** = `false`
 - `true`
   Removes tokens consisting of only hyphens, underscores, or periods as well as some special characters (®, ©, ™). Also removes long tokens or tokens with a backslash.
 - `false`
@@ -180,7 +180,7 @@ options = {
 <hr>
 
 ##### `hashtags`
-**default** = `'keep_original'`
+**default** = `:keep_original`
 - `:keep_original`
   Does not alter the token at all.
 - `:keep_and_clean`
@@ -191,7 +191,7 @@ options = {
 <hr>
 
 ##### `mentions`
-**default** = `'keep_original'`
+**default** = `:keep_original`
 - `:keep_original`
   Does not alter the token at all.
 - `:keep_and_clean`
@@ -202,7 +202,7 @@ options = {
 <hr>
 
 ##### `classic_filter`
-**default** = `'false'`
+**default** = `false`
 - `true`
   Removes dots from acronyms and 's from the end of tokens.
 - `false`
@@ -211,7 +211,7 @@ options = {
 <hr>
 
 ##### `downcase`
-**default** = `'true'`
+**default** = `true`
 
 <hr>
 
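The corrected defaults are plain Ruby values rather than strings. A minimal sketch of passing them explicitly, using the constructor form shown elsewhere in the 1.0.x README (text first, then an options hash); the sample text is purely illustrative:

```ruby
require 'pragmatic_tokenizer'

# Passing the documented defaults explicitly: booleans for the on/off
# options, a symbol for the hashtags/mentions handling.
tokens = PragmaticTokenizer::Tokenizer.new(
  "Hello, @someone! Check out #ruby.",
  remove_stop_words:   false,          # boolean, not 'false'
  expand_contractions: false,
  downcase:            true,
  hashtags:            :keep_original  # symbol, not 'keep_original'
).tokenize
```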
@@ -2,7 +2,7 @@ module PragmaticTokenizer
   module Languages
     module Common
       PUNCTUATION = ['。', '．', '.', '！', '!', '?', '？', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^', '”'].freeze
-      PUNCTUATION_MAP = { "。" => "♳", "．" => "♴", "." => "♵", "！" => "♶", "!" => "♷", "?" => "♸", "？" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘" }.freeze
+      PUNCTUATION_MAP = { "。" => "♳", "．" => "♴", "." => "♵", "！" => "♶", "!" => "♷", "?" => "♸", "？" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘", "‘" => "⚭" }.freeze
       SEMI_PUNCTUATION = ['。', '．', '.'].freeze
       ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix'].freeze
       SPECIAL_CHARACTERS = ['®', '©', '™'].freeze
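The map gains a placeholder for the left single curly quote. A small sketch of the lookup and of the reverse lookup the restoration step relies on:

```ruby
require 'pragmatic_tokenizer'

map = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP
map["‘"]     # => "⚭"  (new in 1.0.1)
map.key("⚭") # => "‘"  (Hash#key, used when converting placeholders back)
```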
@@ -18,6 +18,7 @@ module PragmaticTokenizer
       def handle_single_quotes(text)
         # Convert left quotes to special character except for 'Twas or 'twas
         text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
+        text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' } || text
         text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
         # Separate right single quotes
         text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
@@ -99,6 +99,7 @@ module PragmaticTokenizer
       def handle_single_quotes(text)
         # Convert left quotes to special character except for 'Twas or 'twas
         text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
+        text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' } || text
         text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
         # Separate right single quotes
         text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
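The added line mirrors the straight-quote rule for the curly ‘. A standalone sketch of its effect, with the placeholder inlined instead of the PUNCTUATION_MAP lookup:

```ruby
placeholder = "⚭" # stands in for PUNCTUATION_MAP["‘"]

text = "She said ‘hello to everyone"
text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/) do
  $1 ? $1 + ' ' + placeholder + ' ' : ' ' + placeholder + ' '
end
text # => "She said  ⚭ hello to everyone" (quote detached as its own token)
```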
@@ -17,6 +17,7 @@ module PragmaticTokenizer
         .flat_map { |t| t.include?("?") && t !~ /(http|https|www)(\.|:)/ && t.length > 1 ? t.gsub(/\?/, '\1 \2').split(' ').flatten : t }
         .flat_map { |t| t.include?("+") ? t.gsub!(/\+/, '\1 \2').split(' ').flatten : t }
         .flat_map { |t| t =~ /\A\.[^\.]/ && t.length > 1 ? t.gsub(/\./, '\1 ').split(' ').flatten : t }
+        .flat_map { |t| t =~ /\A\:\S{2,}/ ? t.gsub(/\:/, ': ').split(' ').flatten : t }
         .flat_map { |t| t.include?(".") &&
                         t !~ /(http|https|www)(\.|:)/ &&
                         t !~ /\.(com|net|org|edu|gov|mil|int)/ &&
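The new flat_map splits tokens that begin with a colon followed by at least two non-space characters. A sketch of the rule in isolation:

```ruby
token = ":hello"
if token =~ /\A\:\S{2,}/
  token.gsub(/\:/, ': ').split(' ') # => [":", "hello"]
end
```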
@@ -35,14 +36,13 @@ module PragmaticTokenizer
                         abbreviations.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) ? t.gsub(/\./, '\1. \2').split(' ').flatten : t }
         .flat_map { |t| t =~ PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX ? t.gsub(PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX, '\1 \2').split(' ').flatten : t }
         .flat_map { |t| t =~ PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX ? t.gsub(PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX, '\1 \2').split(' ').flatten : t }
-        .flat_map { |t| t =~ /\A(#|＃)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
       ).separate
     end

     private

     def convert_sym_to_punct(token)
-      symbol_matches = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘]/.match(token)
+      symbol_matches = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘ ⚭]/.match(token)
       symbol_matches.nil? ? token : token.gsub!(symbol_matches[0], PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(symbol_matches[0]))
     end
   end
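convert_sym_to_punct can only restore placeholders its character class matches, so the class grows along with PUNCTUATION_MAP. A sketch of the restoration with the class spaces and the map lookup inlined (the `restore` hash stands in for `PUNCTUATION_MAP.key`):

```ruby
restore = { "⚭" => "‘" }

token = "⚭"
m = /[♳♴♵♶♷♸♹♺⚀⚁⚂⚃⚄⚅☇☈☉☊☋☌☍☠☢☣☤☥☦☧☀☁☂☃☄☮♔♕♖♗♘♙♚⚘⚭]/.match(token)
m.nil? ? token : token.gsub(m[0], restore[m[0]]) # => "‘"
```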
@@ -16,6 +16,7 @@ module PragmaticTokenizer
       shift_bracket(text)
       shift_semicolon(text)
       shift_caret(text)
+      shift_hashtag(text)
       shift_vertical_bar(text)
       convert_dbl_quotes(text)
       convert_sgl_quotes(text)
@@ -29,6 +30,7 @@ module PragmaticTokenizer
     def shift_comma(text)
       # Shift commas off everything but numbers
       text.gsub!(/,(?!\d)/o, ' , ') || text
+      text.gsub!(/(?<=\D),(?=\S+)/, ' , ') || text
     end

     def shift_multiple_dash(text)
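The first rule skips a comma that is followed by a digit; the added lookbehind rule still shifts it when a non-digit precedes it, so only digit,digit commas (thousands separators) stay attached. A sketch:

```ruby
text = "price,20 and 1,000"
text.gsub!(/,(?!\d)/, ' , ')         || text # no change here: digits follow both commas
text.gsub!(/(?<=\D),(?=\S+)/, ' , ') || text # shifts "price,20" but not "1,000"
text # => "price , 20 and 1,000"
```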
@@ -78,6 +80,10 @@ module PragmaticTokenizer
       text.gsub!(/\^/, ' ^ ') || text
     end

+    def shift_hashtag(text)
+      text.gsub!(/(?<=\S)(#|＃)(?=\S)/, ' \1\2') || text
+    end
+
     def shift_vertical_bar(text)
       text.gsub!(/\|/, ' | ') || text
     end
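A sketch of the new step: a hash character sandwiched between non-space characters gets a space pushed in front of it, so the hashtag becomes its own token downstream (＃ is the fullwidth variant):

```ruby
text = "ruby#nlp"
text.gsub!(/(?<=\S)(#|＃)(?=\S)/, ' \1\2') || text
text # => "ruby #nlp"
```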
@@ -160,11 +160,17 @@ module PragmaticTokenizer
         .flat_map { |t| t =~ /\_+(?=\z)/ ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
         .flat_map { |t| t =~ /\*+/ ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
         .map { |t| t.gsub(/[[:cntrl:]]/, '') }
+        .map { |t| t.gsub(/(?<=\A)\:(?=.+)/, '') }
+        .map { |t| t.gsub(/(?<=\A)!+(?=.+)/, '') }
+        .map { |t| t.gsub(/1+(?=\z)/, '') }
+        .map { |t| t.gsub(/!+(?=\z)/, '') }
+        .map { |t| t.gsub(/!+(1*!*)*(?=\z)/, '') }
         .delete_if { |t| t =~ /\A-+\z/ ||
                          PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
                          t =~ /\A\.{2,}\z/ || t.include?("\\") ||
                          t.length > 50 ||
-                         (t.length > 1 && t =~ /[&*+<=>^|~]/i)
+                         (t.length > 1 && t =~ /[&*+<=>^|~]/i) ||
+                         (t.length == 1 && t =~ /\:/)
       }
     end
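A sketch replaying three of the added clean-up passes on sample tokens: leading ":"/"!" runs and trailing "!!!111"-style runs are stripped, and a lone ":" token is now deleted:

```ruby
tokens = [":grinning", "!!!warning", "cool!!!111", ":"]
tokens
  .map { |t| t.gsub(/(?<=\A)\:(?=.+)/, '') }  # leading colon
  .map { |t| t.gsub(/(?<=\A)!+(?=.+)/, '') }  # leading bangs
  .map { |t| t.gsub(/!+(1*!*)*(?=\z)/, '') }  # trailing mixed "!"/"1" runs
  .delete_if { |t| t.length == 1 && t =~ /\:/ }
# => ["grinning", "warning", "cool"]
```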
@@ -211,7 +217,7 @@ module PragmaticTokenizer
     end

     def remove_emails!
-      @tokens.delete_if { |t| t =~ /\S+(@|＠)\S+/ }.map { |t| t.chomp('.') }
+      @tokens.delete_if { |t| t =~ /\S+(@|＠)\S+\.\S+/ }.map { |t| t.chomp('.') }
     end

     def mentions!
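The email pattern now requires a dot after the "@" part, so @-containing tokens without a domain suffix are no longer treated as addresses. A sketch comparing the two patterns:

```ruby
old_re = /\S+(@|＠)\S+/
new_re = /\S+(@|＠)\S+\.\S+/

"user@example.com" =~ new_re # => 0   (still removed)
"user@localhost"   =~ old_re # => 0   (removed by 1.0.0)
"user@localhost"   =~ new_re # => nil (kept by 1.0.1)
```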
@@ -228,6 +234,7 @@ module PragmaticTokenizer
       when 'remove'
         @tokens.delete_if { |t| t =~ /\A(#|＃)/ }
       when 'keep_and_clean'
+        @tokens = @tokens.flat_map { |t| t =~ /\A(#|＃)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
         @tokens.map! { |t| t =~ /\A(#|＃)/ ? t.gsub!(/(?<=\A)(#|＃)/, '') : t }
       end
     end
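Under keep_and_clean, hyphenated hashtags are now split before the leading hash is stripped (this takes over from the flat_map removed from the main token pipeline above). A sketch with the no-op backreferences simplified and gsub used instead of gsub!:

```ruby
tokens = ["#ruby-nlp", "#tokenizer"]
tokens = tokens.flat_map { |t| t =~ /\A(#|＃)\S+-/ ? t.gsub(/\-/, ' ').split(' ') : t }
tokens.map! { |t| t.gsub(/\A(#|＃)/, '') }
tokens # => ["ruby", "nlp", "tokenizer"]
```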
@@ -237,7 +244,7 @@ module PragmaticTokenizer
     end

     def remove_domains!
-      @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
+      @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
    end

     def split_long_words!
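The first domain label now needs at least two characters, which spares short tokens such as "j.crew" (an illustrative example) while ordinary domains are still dropped. A sketch comparing the two patterns:

```ruby
old_re = /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix
new_re = /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix

"example.com" =~ new_re # => 0   (still removed)
"j.crew"      =~ old_re # => 0   (removed by 1.0.0)
"j.crew"      =~ new_re # => nil (kept by 1.0.1)
```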
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "1.0.0"
+  VERSION = "1.0.1"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.0.1
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-01-18 00:00:00.000000000 Z
+date: 2016-01-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode