pragmatic_tokenizer 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: c4834da7c6c1b1d6c614226840bb2fd5ef8b48b6
-  data.tar.gz: 395868d67e973b2a6e9e28b4b9883c95d1746fe6
+  metadata.gz: 8cba2ce060ad1d9ffc74953a9e3a9504b1c8ed13
+  data.tar.gz: 3d96486358f974ce30165199381b27c3e01f7625
 SHA512:
-  metadata.gz: cc69a6f19545c9f5755df5c996e0625f0e65883fea81f01a877d10fce5f5b4eba8931529aecff9afb2ce56f8b993350d9bad15a94a5bb718db4eeafbbe611a29
-  data.tar.gz: f08442f148d59d98d3970e50ccc3bab2d59c1728fb06d9fefe4670ff5b4aca688168c81c30da3a49d1d800d8b398d53e76d9482c120450f2edc90b1b3c174617
+  metadata.gz: 8cc83be7dc5d9db9dd03d8895ea0a0d7bb1856b0f7b82698362dfb97f8e6cd32c3cbaccbb320ce04cb193594c765ae4cbbce5cb24634f3fe0aced37298ce75c5
+  data.tar.gz: f7edb187f8cc2f60aad58d79eac454b556dec53d77aa86a90d55d054d009783be8b4619da22550a79a01ac91540a7b1e3d5ea509e4c482821e32517f976c6208
data/README.md CHANGED
@@ -85,7 +85,7 @@ options = {
 <hr>
 
 ##### `remove_stop_words`
-**default** = `'false'`
+**default** = `false`
 - `true`
   Removes all stop words.
 - `false`
@@ -94,7 +94,7 @@ options = {
 <hr>
 
 ##### `expand_contractions`
-**default** = `'false'`
+**default** = `false`
 - `true`
   Expands contractions (i.e. i'll -> i will).
 - `false`
@@ -135,7 +135,7 @@ options = {
 <hr>
 
 ##### `remove_emoji`
-**default** = `'false'`
+**default** = `false`
 - `true`
   Removes any token that contains an emoji.
 - `false`
@@ -144,7 +144,7 @@ options = {
 <hr>
 
 ##### `remove_urls`
-**default** = `'false'`
+**default** = `false`
 - `true`
   Removes any token that contains a URL.
 - `false`
@@ -153,7 +153,7 @@ options = {
 <hr>
 
 ##### `remove_domains`
-**default** = `'false'`
+**default** = `false`
 - `true`
   Removes any token that contains a domain.
 - `false`
@@ -162,7 +162,7 @@ options = {
 <hr>
 
 ##### `remove_domains`
-**default** = `'false'`
+**default** = `false`
 - `true`
   Removes any token that contains a domain.
 - `false`
@@ -171,7 +171,7 @@ options = {
 <hr>
 
 ##### `clean`
-**default** = `'false'`
+**default** = `false`
 - `true`
   Removes tokens consisting of only hypens, underscores, or periods as well as some special characters (®, ©, ™). Also removes long tokens or tokens with a backslash.
 - `false`
@@ -180,7 +180,7 @@ options = {
 <hr>
 
 ##### `hashtags`
-**default** = `'keep_original'`
+**default** = `:keep_original`
 - `:keep_original`
   Does not alter the token at all.
 - `:keep_and_clean`
@@ -191,7 +191,7 @@ options = {
 <hr>
 
 ##### `mentions`
-**default** = `'keep_original'`
+**default** = `:keep_original`
 - `:keep_original`
   Does not alter the token at all.
 - `:keep_and_clean`
@@ -202,7 +202,7 @@ options = {
 <hr>
 
 ##### `classic_filter`
-**default** = `'false'`
+**default** = `false`
 - `true`
   Removes dots from acronyms and 's from the end of tokens.
 - `false`
@@ -211,7 +211,7 @@ options = {
 <hr>
 
 ##### `downcase`
-**default** = `'true'`
+**default** = `true`
 
 <hr>
 
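For reference, here are the corrected defaults collected into the `options = { ... }` form the README's own example uses (the hunk headers above anchor on it). The substance of these fixes: the defaults are boolean and symbol literals, not the quoted strings `'false'` and `'keep_original'` the README previously showed.

```ruby
# All keys and defaults below are taken directly from the README hunks above.
options = {
  remove_stop_words:   false,          # was documented as 'false'
  expand_contractions: false,
  remove_emoji:        false,
  remove_urls:         false,
  remove_domains:      false,
  clean:               false,
  hashtags:            :keep_original, # was documented as 'keep_original'
  mentions:            :keep_original,
  classic_filter:      false,
  downcase:            true            # was documented as 'true'
}
```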
@@ -2,7 +2,7 @@ module PragmaticTokenizer
   module Languages
     module Common
       PUNCTUATION = ['。', '．', '.', '！', '!', '？', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^', '”'].freeze
-      PUNCTUATION_MAP = { "。" => "♳", "．" => "♴", "." => "♵", "！" => "♶", "!" => "♷", "？" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘" }.freeze
+      PUNCTUATION_MAP = { "。" => "♳", "．" => "♴", "." => "♵", "！" => "♶", "!" => "♷", "？" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘", "‘" => "⚭" }.freeze
       SEMI_PUNCTUATION = ['。', '．', '.'].freeze
       ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix'].freeze
       SPECIAL_CHARACTERS = ['®', '©', '™'].freeze
@@ -18,6 +18,7 @@ module PragmaticTokenizer
       def handle_single_quotes(text)
         # Convert left quotes to special character except for 'Twas or 'twas
         text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
+        text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' } || text
         text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
         # Separate right single quotes
         text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
@@ -99,6 +99,7 @@ module PragmaticTokenizer
       def handle_single_quotes(text)
         # Convert left quotes to special character except for 'Twas or 'twas
         text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
+        text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' } || text
         text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
         # Separate right single quotes
         text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
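The same line is added in two handlers (this hunk and the identical one before it): `handle_single_quotes` now also shifts a curly left single quote (‘) off the following word using the new `"‘" => "⚭"` placeholder, mirroring the straight-quote rule above it. A minimal standalone sketch, with a one-entry hash standing in for `PUNCTUATION_MAP`:

```ruby
map  = { "‘" => "⚭" }  # the entry added to PUNCTUATION_MAP in this release
text = "She said ‘hello’ twice"
text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/) { "#{$1} #{map['‘']} " }
puts text  # => "She said  ⚭ hello’ twice" (the quote is now its own token)
```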
@@ -17,6 +17,7 @@ module PragmaticTokenizer
       .flat_map { |t| t.include?("?") && t !~ /(http|https|www)(\.|:)/ && t.length > 1 ? t.gsub(/\?/, '\1 \2').split(' ').flatten : t }
       .flat_map { |t| t.include?("+") ? t.gsub!(/\+/, '\1 \2').split(' ').flatten : t }
       .flat_map { |t| t =~ /\A\.[^\.]/ && t.length > 1 ? t.gsub(/\./, '\1 ').split(' ').flatten : t }
+      .flat_map { |t| t =~ /\A\:\S{2,}/ ? t.gsub(/\:/, ': ').split(' ').flatten : t }
       .flat_map { |t| t.include?(".") &&
                       t !~ /(http|https|www)(\.|:)/ &&
                       t !~ /\.(com|net|org|edu|gov|mil|int)/ &&
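In isolation, the new step splits a leading colon off any token in which at least two non-space characters follow it:

```ruby
# The regex and gsub are taken verbatim from the added line above.
tokens = [":results", "10:30", "plain"]
tokens = tokens.flat_map do |t|
  t =~ /\A\:\S{2,}/ ? t.gsub(/\:/, ': ').split(' ') : t
end
p tokens  # => [":", "results", "10:30", "plain"]
```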
@@ -35,14 +36,13 @@ module PragmaticTokenizer
                       abbreviations.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) ? t.gsub(/\./, '\1. \2').split(' ').flatten : t }
       .flat_map { |t| t =~ PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX ? t.gsub(PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX, '\1 \2').split(' ').flatten : t }
       .flat_map { |t| t =~ PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX ? t.gsub(PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX, '\1 \2').split(' ').flatten : t }
-      .flat_map { |t| t =~ /\A(#|＃)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
       ).separate
     end
 
     private
 
     def convert_sym_to_punct(token)
-      symbol_matches = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘]/.match(token)
+      symbol_matches = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘ ⚭]/.match(token)
       symbol_matches.nil? ? token : token.gsub!(symbol_matches[0], PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(symbol_matches[0]))
     end
   end
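The ⚭ placeholder also has to be listed in the character class above, because `convert_sym_to_punct` maps placeholders back to real punctuation via `PUNCTUATION_MAP.key`; any placeholder missing from the class would leak into the final tokens. A toy round trip, again with a one-entry stand-in map:

```ruby
punctuation_map = { "‘" => "⚭" }   # stand-in for PUNCTUATION_MAP
token = "⚭"
match = /[⚘ ⚭]/.match(token)       # the class must list ⚭ to catch it
p match.nil? ? token : token.gsub(match[0], punctuation_map.key(match[0]))
# => "‘"
```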
@@ -16,6 +16,7 @@ module PragmaticTokenizer
       shift_bracket(text)
       shift_semicolon(text)
       shift_caret(text)
+      shift_hashtag(text)
       shift_vertical_bar(text)
       convert_dbl_quotes(text)
       convert_sgl_quotes(text)
@@ -29,6 +30,7 @@ module PragmaticTokenizer
     def shift_comma(text)
       # Shift commas off everything but numbers
       text.gsub!(/,(?!\d)/o, ' , ') || text
+      text.gsub!(/(?<=\D),(?=\S+)/, ' , ') || text
     end
 
     def shift_multiple_dash(text)
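The first `gsub` already protects numeric commas ("10,000") by skipping any comma followed by a digit; the added second pass still separates a comma that follows a non-digit, even when a digit comes next. A non-destructive sketch:

```ruby
def shift_comma(text)
  text = text.gsub(/,(?!\d)/, ' , ')    # existing rule: spare "10,000"
  text.gsub(/(?<=\D),(?=\S+)/, ' , ')   # new rule: still split "one,2"
end

p shift_comma("10,000 fans")  # => "10,000 fans"
p shift_comma("one,2 three")  # => "one , 2 three"
```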
@@ -78,6 +80,10 @@ module PragmaticTokenizer
       text.gsub!(/\^/, ' ^ ') || text
     end
 
+    def shift_hashtag(text)
+      text.gsub!(/(?<=\S)(#|＃)(?=\S)/, ' \1\2') || text
+    end
+
     def shift_vertical_bar(text)
       text.gsub!(/\|/, ' | ') || text
     end
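The new `shift_hashtag` inserts a space before a hash sign that appears mid-token, so the hashtag survives tokenization as its own token (＃ is the fullwidth variant). Non-destructive sketch:

```ruby
def shift_hashtag(text)
  text.gsub(/(?<=\S)(#|＃)(?=\S)/, ' \1')
end

p shift_hashtag("swing#dance")  # => "swing #dance"
p shift_hashtag("#dance")       # => "#dance" (a leading hashtag is untouched)
```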
@@ -160,11 +160,17 @@ module PragmaticTokenizer
       .flat_map { |t| t =~ /\_+(?=\z)/ ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
       .flat_map { |t| t =~ /\*+/ ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
       .map { |t| t.gsub(/[[:cntrl:]]/, '') }
+      .map { |t| t.gsub(/(?<=\A)\:(?=.+)/, '') }
+      .map { |t| t.gsub(/(?<=\A)!+(?=.+)/, '') }
+      .map { |t| t.gsub(/1+(?=\z)/, '') }
+      .map { |t| t.gsub(/!+(?=\z)/, '') }
+      .map { |t| t.gsub(/!+(1*!*)*(?=\z)/, '') }
       .delete_if { |t| t =~ /\A-+\z/ ||
                        PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
                        t =~ /\A\.{2,}\z/ || t.include?("\\") ||
                        t.length > 50 ||
-                       (t.length > 1 && t =~ /[&*+<=>^|~]/i)
+                       (t.length > 1 && t =~ /[&*+<=>^|~]/i) ||
+                       (t.length == 1 && t =~ /\:/)
       }
     end
 
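Applied in isolation, the five new `map` steps strip a leading colon or run of exclamation marks and trailing runs of "1"s and "!"s (the "!!!111" pattern), while the extra `delete_if` clause drops a token that is nothing but a colon:

```ruby
tokens  = [":tagged", "!!wow", "great!!!111", ":"]
cleaned = tokens
          .map { |t| t.gsub(/(?<=\A)\:(?=.+)/, '') }
          .map { |t| t.gsub(/(?<=\A)!+(?=.+)/, '') }
          .map { |t| t.gsub(/1+(?=\z)/, '') }
          .map { |t| t.gsub(/!+(?=\z)/, '') }
          .map { |t| t.gsub(/!+(1*!*)*(?=\z)/, '') }
          .delete_if { |t| t.length == 1 && t =~ /\:/ }
p cleaned  # => ["tagged", "wow", "great"]
```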
@@ -211,7 +217,7 @@ module PragmaticTokenizer
     end
 
     def remove_emails!
-      @tokens.delete_if { |t| t =~ /\S+(@|＠)\S+/ }.map { |t| t.chomp('.') }
+      @tokens.delete_if { |t| t =~ /\S+(@|＠)\S+\.\S+/ }.map { |t| t.chomp('.') }
     end
 
     def mentions!
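The tightened pattern requires a dot after the "@" part, so only tokens that look like full addresses are dropped; handle-like tokens without a dotted host now survive (＠ is the fullwidth at sign):

```ruby
tokens = ["hello", "jane@example.com", "user@localhost"]
tokens.delete_if { |t| t =~ /\S+(@|＠)\S+\.\S+/ }
p tokens  # => ["hello", "user@localhost"]
```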
@@ -228,6 +234,7 @@ module PragmaticTokenizer
       when 'remove'
         @tokens.delete_if { |t| t =~ /\A(#|＃)/ }
       when 'keep_and_clean'
+        @tokens = @tokens.flat_map { |t| t =~ /\A(#|＃)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
         @tokens.map! { |t| t =~ /\A(#|＃)/ ? t.gsub!(/(?<=\A)(#|＃)/, '') : t }
       end
     end
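The hyphen split that 1.0.0 ran for every input (the `flat_map` removed from the main pipeline earlier in this diff) now only fires under `hashtags: :keep_and_clean`, right before the leading hash sign is stripped. In isolation:

```ruby
tokens = ["#swing-dance", "#tango"]
tokens = tokens.flat_map { |t| t =~ /\A(#|＃)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ') : t }
tokens.map! { |t| t.gsub(/\A(#|＃)/, '') }
p tokens  # => ["swing", "dance", "tango"]
```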
@@ -237,7 +244,7 @@ module PragmaticTokenizer
     end
 
     def remove_domains!
-      @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
+      @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
    end
 
    def split_long_words!
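The only change is `[a-z0-9]+` to `[a-z0-9]{2,}`: the first label of a candidate domain now needs at least two characters, so dotted tokens with a one-character first label are no longer discarded as domains:

```ruby
old_re = %r{(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(/.*)?}ix
new_re = %r{(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(/.*)?}ix

p !!("t.co" =~ old_re)         # => true  (1.0.0 deleted it)
p !!("t.co" =~ new_re)         # => false (1.0.1 keeps it)
p !!("example.com" =~ new_re)  # => true  (longer domains are still deleted)
```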
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "1.0.0"
+  VERSION = "1.0.1"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.0.1
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-01-18 00:00:00.000000000 Z
+date: 2016-01-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode