pragmatic_tokenizer 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +45 -3
- data/lib/pragmatic_tokenizer/tokenizer.rb +4 -4
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3b2988beef12450f5a7f653ec5c2e8db5e11efce
|
4
|
+
data.tar.gz: 56d01b9c9ffef57ae3365f3519214318f7828a20
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 938592c183b4bd1f2fca41554d74e17b85f608549cfb218476c99c84829fec15e7ac6b2dbc4ce69b60a5780de90e0878203b5bfd54cdfbd71bfd0daed57d15f9
|
7
|
+
data.tar.gz: d767954ebcfd5003f1a58b0023307b93b22de30e08f491954511d1bcdafab4c3f23f5edb710f1d40515df98510bd63bcd9c427e4d2315f33b0d1641b726e998f
|
data/README.md
CHANGED
@@ -146,32 +146,66 @@ PragmaticTokenizer::Tokenizer.new(text, minimum_length: 6).tokenize
|
|
146
146
|
#### `#urls`
|
147
147
|
Extract only valid URL tokens
|
148
148
|
|
149
|
-
|
149
|
+
**Example Usage**
|
150
|
+
```ruby
|
151
|
+
text = "Go to http://www.example.com"
|
152
|
+
|
153
|
+
PragmaticTokenizer::Tokenizer.new(text).urls
|
154
|
+
# => ["http://www.example.com"]
|
155
|
+
```
|
150
156
|
|
151
157
|
<hr>
|
152
158
|
|
153
159
|
#### `#emails`
|
154
160
|
Extract only valid email tokens
|
155
161
|
|
156
|
-
|
162
|
+
**Example Usage**
|
163
|
+
```ruby
|
164
|
+
text = "Please email example@example.com for more info."
|
165
|
+
|
166
|
+
PragmaticTokenizer::Tokenizer.new(text).emails
|
167
|
+
# => ["example@example.com"]
|
168
|
+
```
|
157
169
|
|
158
170
|
<hr>
|
159
171
|
|
160
172
|
#### `#hashtags`
|
161
173
|
Extract only valid hashtag tokens
|
162
174
|
|
163
|
-
|
175
|
+
**Example Usage**
|
176
|
+
```ruby
|
177
|
+
text = "Find me all the #fun #hashtags and give me #backallofthem."
|
178
|
+
|
179
|
+
PragmaticTokenizer::Tokenizer.new(text).hashtags
|
180
|
+
# => ["#fun", "#hashtags", "#backallofthem"]
|
181
|
+
```
|
164
182
|
|
165
183
|
<hr>
|
166
184
|
|
167
185
|
#### `#mentions`
|
168
186
|
Extract only valid @ mention tokens
|
169
187
|
|
188
|
+
**Example Usage**
|
189
|
+
```ruby
|
190
|
+
text = "Find me all the @awesome mentions."
|
191
|
+
|
192
|
+
PragmaticTokenizer::Tokenizer.new(text).mentions
|
193
|
+
# => ["@awesome"]
|
194
|
+
```
|
195
|
+
|
170
196
|
<hr>
|
171
197
|
|
172
198
|
#### `#emoticons`
|
173
199
|
Extract only simple emoticon tokens
|
174
200
|
|
201
|
+
**Example Usage**
|
202
|
+
```ruby
|
203
|
+
text = "Hello ;-) :) ๐"
|
204
|
+
|
205
|
+
PragmaticTokenizer::Tokenizer.new(text).emoticons
|
206
|
+
# => [";-)", ":)"]
|
207
|
+
```
|
208
|
+
|
175
209
|
<hr>
|
176
210
|
|
177
211
|
#### `#emoji`
|
@@ -179,6 +213,14 @@ Extract only valid† emoji tokens
|
|
179
213
|
|
180
214
|
*† matches all 1012 single-character Unicode Emoji (all except for two-character flags)*
|
181
215
|
|
216
|
+
**Example Usage**
|
217
|
+
```ruby
|
218
|
+
text = "Return the emoji ๐ฟ๐๐ฑ๐๐."
|
219
|
+
|
220
|
+
PragmaticTokenizer::Tokenizer.new(text).emoji
|
221
|
+
# => ["๐ฟ", "๐", "๐ฑ", "๐", "๐"]
|
222
|
+
```
|
223
|
+
|
182
224
|
## Language Support
|
183
225
|
|
184
226
|
The following lists the current level of support for different languages. Pull requests or help for any languages that are not fully supported would be greatly appreciated.
|
@@ -44,19 +44,19 @@ module PragmaticTokenizer
|
|
44
44
|
end
|
45
45
|
|
46
46
|
def urls
|
47
|
-
|
47
|
+
text.split(' ').delete_if { |t| t !~ /(http|https|www)(\.|:)/ }.map { |t| t.chomp('.') }
|
48
48
|
end
|
49
49
|
|
50
50
|
def emails
|
51
|
-
|
51
|
+
text.split(' ').delete_if { |t| t !~ /\S+(＠|@)\S+/ }.map { |t| t.chomp('.') }
|
52
52
|
end
|
53
53
|
|
54
54
|
def hashtags
|
55
|
-
|
55
|
+
text.split(' ').delete_if { |t| t !~ /(#|＃)/ }.map { |t| t.chomp('.') }
|
56
56
|
end
|
57
57
|
|
58
58
|
def mentions
|
59
|
-
text.split(' ').delete_if { |t| t !~
|
59
|
+
text.split(' ').delete_if { |t| t !~ /(@|＠)/ }.map { |t| t.chomp('.') }
|
60
60
|
end
|
61
61
|
|
62
62
|
def emoticons
|