pragmatic_tokenizer 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +13 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +4 -0
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e9c65d84e7930d904363cc2ee8cf646e62838197
|
4
|
+
data.tar.gz: 6cd30081a14c5da1e47732ca5855d528e5846589
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 02feda33c9f6d8caa28b90054826abfdfdd2e6479180f8cf1ee2139bbf78853fe1a2dd17286403b3bd363f73afe7cf74bf9432d0f08a78083558eff91d2ec286
|
7
|
+
data.tar.gz: f510f5f2eedc28efd22cebfab3d28fd5f415fe6dae42c71c97b57c64ab2f6a6b2d99a6e84925e908822fa10e0789a8b041bd8a9b433e430ad9b03043f6963e4f
|
data/README.md
CHANGED
@@ -156,6 +156,19 @@ PragmaticTokenizer::Tokenizer.new(text).urls
|
|
156
156
|
|
157
157
|
<hr>
|
158
158
|
|
159
|
+
#### `#domains`
|
160
|
+
Extract only valid domain tokens
|
161
|
+
|
162
|
+
**Example Usage**
|
163
|
+
```ruby
|
164
|
+
text = "See the breaking news stories about X on cnn.com/europe and english.alarabiya.net, here’s a screenshot: https://t.co/s83k28f29d31s83"
|
165
|
+
|
166
|
+
PragmaticTokenizer::Tokenizer.new(text).domains
|
167
|
+
# => ["cnn.com/europe", "english.alarabiya.net"]
|
168
|
+
```
|
169
|
+
|
170
|
+
<hr>
|
171
|
+
|
159
172
|
#### `#emails`
|
160
173
|
Extract only valid email tokens
|
161
174
|
|
@@ -43,6 +43,10 @@ module PragmaticTokenizer
|
|
43
43
|
downcase_tokens(cleaner(remove_short_tokens(delete_numbers(delete_roman_numerals(find_contractions(delete_stop_words(remove_punctuation(processor.new(language: language_module).process(text: text))))))))).reject { |t| t.empty? }
|
44
44
|
end
|
45
45
|
|
46
|
+
# Returns the whitespace-separated tokens of +text+ that look like bare
# domains, optionally followed by a port and/or a path
# (e.g. "cnn.com/europe", "english.alarabiya.net").
#
# Tokens contain no whitespace after the split, so the pattern effectively
# has to match from the start of the token; scheme-prefixed tokens such as
# "https://t.co/abc" are therefore not returned.
#
# @return [Array<String>] matching tokens with any run of trailing
#   sentence punctuation (. , ; :) removed
def domains
  text.split(' ')
      .select { |token| token =~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
      # Strip the whole trailing run at once: chained chomps remove at most one
      # character of each kind in a fixed order and leave residue on "foo.com,.".
      .map { |token| token.sub(/[.,;:]+\z/, '') }
end
|
49
|
+
|
46
50
|
# Returns the whitespace-separated tokens of +text+ that contain
# "http", "https" or "www" immediately followed by "." or ":".
#
# @return [Array<String>] matching tokens with any run of trailing
#   sentence punctuation (. , ; :) removed
def urls
  text.split(' ')
      .select { |token| token =~ /(http|https|www)(\.|:)/ }
      # A URL followed by "," ";" or ":" in running text should not keep that
      # character; previously only a single trailing "." was removed.
      .map { |token| token.sub(/[.,;:]+\z/, '') }
end
|