pragmatic_tokenizer 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +45 -3
- data/lib/pragmatic_tokenizer/tokenizer.rb +4 -4
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3b2988beef12450f5a7f653ec5c2e8db5e11efce
|
4
|
+
data.tar.gz: 56d01b9c9ffef57ae3365f3519214318f7828a20
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 938592c183b4bd1f2fca41554d74e17b85f608549cfb218476c99c84829fec15e7ac6b2dbc4ce69b60a5780de90e0878203b5bfd54cdfbd71bfd0daed57d15f9
|
7
|
+
data.tar.gz: d767954ebcfd5003f1a58b0023307b93b22de30e08f491954511d1bcdafab4c3f23f5edb710f1d40515df98510bd63bcd9c427e4d2315f33b0d1641b726e998f
|
data/README.md
CHANGED
@@ -146,32 +146,66 @@ PragmaticTokenizer::Tokenizer.new(text, minimum_length: 6).tokenize
|
|
146
146
|
#### `#urls`
|
147
147
|
Extract only valid URL tokens
|
148
148
|
|
149
|
-
|
149
|
+
**Example Usage**
|
150
|
+
```ruby
|
151
|
+
text = "Go to http://www.example.com"
|
152
|
+
|
153
|
+
PragmaticTokenizer::Tokenizer.new(text).urls
|
154
|
+
# => ["http://www.example.com"]
|
155
|
+
```
|
150
156
|
|
151
157
|
<hr>
|
152
158
|
|
153
159
|
#### `#emails`
|
154
160
|
Extract only valid email tokens
|
155
161
|
|
156
|
-
|
162
|
+
**Example Usage**
|
163
|
+
```ruby
|
164
|
+
text = "Please email example@example.com for more info."
|
165
|
+
|
166
|
+
PragmaticTokenizer::Tokenizer.new(text).emails
|
167
|
+
# => ["example@example.com"]
|
168
|
+
```
|
157
169
|
|
158
170
|
<hr>
|
159
171
|
|
160
172
|
#### `#hashtags`
|
161
173
|
Extract only valid hashtag tokens
|
162
174
|
|
163
|
-
|
175
|
+
**Example Usage**
|
176
|
+
```ruby
|
177
|
+
text = "Find me all the #fun #hashtags and give me #backallofthem."
|
178
|
+
|
179
|
+
PragmaticTokenizer::Tokenizer.new(text).hashtags
|
180
|
+
# => ["#fun", "#hashtags", "#backallofthem"]
|
181
|
+
```
|
164
182
|
|
165
183
|
<hr>
|
166
184
|
|
167
185
|
#### `#mentions`
|
168
186
|
Extract only valid @ mention tokens
|
169
187
|
|
188
|
+
**Example Usage**
|
189
|
+
```ruby
|
190
|
+
text = "Find me all the @awesome mentions."
|
191
|
+
|
192
|
+
PragmaticTokenizer::Tokenizer.new(text).mentions
|
193
|
+
# => ["@awesome"]
|
194
|
+
```
|
195
|
+
|
170
196
|
<hr>
|
171
197
|
|
172
198
|
#### `#emoticons`
|
173
199
|
Extract only simple emoticon tokens
|
174
200
|
|
201
|
+
**Example Usage**
|
202
|
+
```ruby
|
203
|
+
text = "Hello ;-) :) 😄"
|
204
|
+
|
205
|
+
PragmaticTokenizer::Tokenizer.new(text).emoticons
|
206
|
+
# => [";-)", ":)"]
|
207
|
+
```
|
208
|
+
|
175
209
|
<hr>
|
176
210
|
|
177
211
|
#### `#emoji`
|
@@ -179,6 +213,14 @@ Extract only valid† emoji tokens
|
|
179
213
|
|
180
214
|
*† matches all 1012 single-character Unicode Emoji (all except for two-character flags)*
|
181
215
|
|
216
|
+
**Example Usage**
|
217
|
+
```ruby
|
218
|
+
text = "Return the emoji 👿😍😱🐔🌚."
|
219
|
+
|
220
|
+
PragmaticTokenizer::Tokenizer.new(text).emoji
|
221
|
+
# => ["👿", "😍", "😱", "🐔", "🌚"]
|
222
|
+
```
|
223
|
+
|
182
224
|
## Language Support
|
183
225
|
|
184
226
|
The following lists the current level of support for different languages. Pull requests or help for any languages that are not fully supported would be greatly appreciated.
|
@@ -44,19 +44,19 @@ module PragmaticTokenizer
|
|
44
44
|
end
|
45
45
|
|
46
46
|
def urls
|
47
|
-
|
47
|
+
text.split(' ').delete_if { |t| t !~ /(http|https|www)(\.|:)/ }.map { |t| t.chomp('.') }
|
48
48
|
end
|
49
49
|
|
50
50
|
def emails
|
51
|
-
|
51
|
+
text.split(' ').delete_if { |t| t !~ /\S+(＠|@)\S+/ }.map { |t| t.chomp('.') }
|
52
52
|
end
|
53
53
|
|
54
54
|
def hashtags
|
55
|
-
|
55
|
+
text.split(' ').delete_if { |t| t !~ /(#|＃)/ }.map { |t| t.chomp('.') }
|
56
56
|
end
|
57
57
|
|
58
58
|
def mentions
|
59
|
-
text.split(' ').delete_if { |t| t !~
|
59
|
+
text.split(' ').delete_if { |t| t !~ /(@|＠)/ }.map { |t| t.chomp('.') }
|
60
60
|
end
|
61
61
|
|
62
62
|
def emoticons
|