pragmatic_tokenizer 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +13 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +4 -0
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e9c65d84e7930d904363cc2ee8cf646e62838197
|
4
|
+
data.tar.gz: 6cd30081a14c5da1e47732ca5855d528e5846589
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 02feda33c9f6d8caa28b90054826abfdfdd2e6479180f8cf1ee2139bbf78853fe1a2dd17286403b3bd363f73afe7cf74bf9432d0f08a78083558eff91d2ec286
|
7
|
+
data.tar.gz: f510f5f2eedc28efd22cebfab3d28fd5f415fe6dae42c71c97b57c64ab2f6a6b2d99a6e84925e908822fa10e0789a8b041bd8a9b433e430ad9b03043f6963e4f
|
data/README.md
CHANGED
@@ -156,6 +156,19 @@ PragmaticTokenizer::Tokenizer.new(text).urls
|
|
156
156
|
|
157
157
|
<hr>
|
158
158
|
|
159
|
+
#### `#domains`
|
160
|
+
Extract only valid domain tokens
|
161
|
+
|
162
|
+
**Example Usage**
|
163
|
+
```ruby
|
164
|
+
text = "See the breaking news stories about X on cnn.com/europe and english.alarabiya.net, here’s a screenshot: https://t.co/s83k28f29d31s83"
|
165
|
+
|
166
|
+
PragmaticTokenizer::Tokenizer.new(text).domains
|
167
|
+
# => ["cnn.com/europe", "english.alarabiya.net"]
|
168
|
+
```
|
169
|
+
|
170
|
+
<hr>
|
171
|
+
|
159
172
|
#### `#emails`
|
160
173
|
Extract only valid email tokens
|
161
174
|
|
data/lib/pragmatic_tokenizer/tokenizer.rb
CHANGED
@@ -43,6 +43,10 @@ module PragmaticTokenizer
|
|
43
43
|
downcase_tokens(cleaner(remove_short_tokens(delete_numbers(delete_roman_numerals(find_contractions(delete_stop_words(remove_punctuation(processor.new(language: language_module).process(text: text))))))))).reject { |t| t.empty? }
|
44
44
|
end
|
45
45
|
|
46
|
+
def domains
|
47
|
+
text.split(' ').delete_if { |t| t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }.map { |t| t.chomp('.').chomp(',').chomp(';').chomp(':') }
|
48
|
+
end
|
49
|
+
|
46
50
|
def urls
|
47
51
|
text.split(' ').delete_if { |t| t !~ /(http|https|www)(\.|:)/ }.map { |t| t.chomp('.') }
|
48
52
|
end
|