pragmatic_tokenizer 0.2.0 → 0.2.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: 4d4f4efd555a719dda2fd957aa2683a8846140f5
-   data.tar.gz: 7328a978a04c6a7754aab8d5b013d1174744bea5
+   metadata.gz: 3b2988beef12450f5a7f653ec5c2e8db5e11efce
+   data.tar.gz: 56d01b9c9ffef57ae3365f3519214318f7828a20
  SHA512:
-   metadata.gz: 7f0c65f7c717424af8fcc2e00ede126803ac5a4980e819d18020acb91b04e722f0e44e7be4067aeee0d35f165443ad1dd5534af5fe8f85af2ae63f4f83ece756
-   data.tar.gz: be8292abc8548194d660baa35947e4f5b7fbc365dad518a199f8741224bf1c864b214604b67d53e3265b7bda9f5ea561a1832f726c4dcbd04d40cbd1d8db7370
+   metadata.gz: 938592c183b4bd1f2fca41554d74e17b85f608549cfb218476c99c84829fec15e7ac6b2dbc4ce69b60a5780de90e0878203b5bfd54cdfbd71bfd0daed57d15f9
+   data.tar.gz: d767954ebcfd5003f1a58b0023307b93b22de30e08f491954511d1bcdafab4c3f23f5edb710f1d40515df98510bd63bcd9c427e4d2315f33b0d1641b726e998f
data/README.md CHANGED
@@ -146,32 +146,66 @@ PragmaticTokenizer::Tokenizer.new(text, minimum_length: 6).tokenize
  #### `#urls`
  Extract only valid URL tokens
 
- *Not Yet Implemented*
+ **Example Usage**
+ ```ruby
+ text = "Go to http://www.example.com"
+
+ PragmaticTokenizer::Tokenizer.new(text).urls
+ # => ["http://www.example.com"]
+ ```
 
  <hr>
 
  #### `#emails`
  Extract only valid email tokens
 
- *Not Yet Implemented*
+ **Example Usage**
+ ```ruby
+ text = "Please email example@example.com for more info."
+
+ PragmaticTokenizer::Tokenizer.new(text).emails
+ # => ["example@example.com"]
+ ```
 
  <hr>
 
  #### `#hashtags`
  Extract only valid hashtag tokens
 
- *Not Yet Implemented*
+ **Example Usage**
+ ```ruby
+ text = "Find me all the #fun #hashtags and give me #backallofthem."
+
+ PragmaticTokenizer::Tokenizer.new(text).hashtags
+ # => ["#fun", "#hashtags", "#backallofthem"]
+ ```
 
  <hr>
 
  #### `#mentions`
  Extract only valid @ mention tokens
 
+ **Example Usage**
+ ```ruby
+ text = "Find me all the @awesome mentions."
+
+ PragmaticTokenizer::Tokenizer.new(text).mentions
+ # => ["@awesome"]
+ ```
+
  <hr>
 
  #### `#emoticons`
  Extract only simple emoticon tokens
 
+ **Example Usage**
+ ```ruby
+ text = "Hello ;-) :) 😄"
+
+ PragmaticTokenizer::Tokenizer.new(text).emoticons
+ # => [";-)", ":)"]
+ ```
+
  <hr>
 
  #### `#emoji`
@@ -179,6 +213,14 @@ Extract only valid† emoji tokens
 
  *† matches all 1012 single-character Unicode Emoji (all except for two-character flags)*
 
+ **Example Usage**
+ ```ruby
+ text = "Return the emoji 👿😍😱🐔🌚."
+
+ PragmaticTokenizer::Tokenizer.new(text).emoji
+ # => ["👿", "😍", "😱", "🐔", "🌚"]
+ ```
+
  ## Language Support
 
  The following lists the current level of support for different languages. Pull requests or help for any languages that are not fully supported would be greatly appreciated.
@@ -44,19 +44,19 @@ module PragmaticTokenizer
  end
 
  def urls
- []
+ text.split(' ').delete_if { |t| t !~ /(http|https|www)(\.|:)/ }.map { |t| t.chomp('.') }
  end
 
  def emails
- []
+ text.split(' ').delete_if { |t| t !~ /\S+(＠|@)\S+/ }.map { |t| t.chomp('.') }
  end
 
  def hashtags
- []
+ text.split(' ').delete_if { |t| t !~ /(#|＃)/ }.map { |t| t.chomp('.') }
  end
 
  def mentions
- text.split(' ').delete_if { |t| t !~ /\A(@|＠)/ }
+ text.split(' ').delete_if { |t| t !~ /(@|＠)/ }.map { |t| t.chomp('.') }
  end
 
  def emoticons
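The new extractors are simple whitespace-split filters: each method splits the text on spaces, keeps only tokens matching its pattern (including the fullwidth ＠/＃ variants), and chomps a single trailing period. A minimal sketch of the resulting behavior, assuming the gem is required as `pragmatic_tokenizer` and the tokenizer is constructed as in the README examples above; the sample strings and expected outputs are illustrative, not part of the diff:

```ruby
require 'pragmatic_tokenizer'

# The mentions regex no longer anchors at the start of the token (\A was dropped),
# and chomp('.') strips a trailing period from every extracted token.
PragmaticTokenizer::Tokenizer.new("Say hi to @awesome.").mentions
# => ["@awesome"]

# Only a trailing "." is stripped; other surrounding punctuation stays attached,
# since the filter works on plain space-separated tokens.
PragmaticTokenizer::Tokenizer.new("Visit http://www.example.com today.").urls
# => ["http://www.example.com"]
```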
@@ -1,3 +1,3 @@
  module PragmaticTokenizer
- VERSION = "0.2.0"
+ VERSION = "0.2.1"
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: pragmatic_tokenizer
  version: !ruby/object:Gem::Version
- version: 0.2.0
+ version: 0.2.1
  platform: ruby
  authors:
  - Kevin S. Dias