pragmatic_tokenizer 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f19baff424f25d8ba1fcf08001dfd6fc5ac4c54b
4
- data.tar.gz: 645aaae817dd1a77c7fbfe506e5febe9746afbf7
3
+ metadata.gz: e9c65d84e7930d904363cc2ee8cf646e62838197
4
+ data.tar.gz: 6cd30081a14c5da1e47732ca5855d528e5846589
5
5
  SHA512:
6
- metadata.gz: c4a725af2141a66f0390258b36bc66a50994cc6f44e2db6b528b28e592df50baf1865d4445012dead0bf95a1347585acd2b93b22ba968ddf3a5fc08d4bcdaf79
7
- data.tar.gz: 0a820f57952cdc0adb8973607b083bc0e14ba5006a7cf4ab5515f874c1a6f3bc5d1ce532b895c3011203ae74555480a7b81431906607ae26a376dcc5dd4a7d66
6
+ metadata.gz: 02feda33c9f6d8caa28b90054826abfdfdd2e6479180f8cf1ee2139bbf78853fe1a2dd17286403b3bd363f73afe7cf74bf9432d0f08a78083558eff91d2ec286
7
+ data.tar.gz: f510f5f2eedc28efd22cebfab3d28fd5f415fe6dae42c71c97b57c64ab2f6a6b2d99a6e84925e908822fa10e0789a8b041bd8a9b433e430ad9b03043f6963e4f
data/README.md CHANGED
@@ -156,6 +156,19 @@ PragmaticTokenizer::Tokenizer.new(text).urls
156
156
 
157
157
  <hr>
158
158
 
159
+ #### `#domains`
160
+ Extract only valid domain tokens
161
+
162
+ **Example Usage**
163
+ ```ruby
164
+ text = "See the breaking news stories about X on cnn.com/europe and english.alarabiya.net, here’s a screenshot: https://t.co/s83k28f29d31s83"
165
+
166
+ PragmaticTokenizer::Tokenizer.new(text).domains
167
+ # => ["cnn.com/europe", "english.alarabiya.net"]
168
+ ```
169
+
170
+ <hr>
171
+
159
172
  #### `#emails`
160
173
  Extract only valid email tokens
161
174
 
@@ -43,6 +43,10 @@ module PragmaticTokenizer
43
43
  downcase_tokens(cleaner(remove_short_tokens(delete_numbers(delete_roman_numerals(find_contractions(delete_stop_words(remove_punctuation(processor.new(language: language_module).process(text: text))))))))).reject { |t| t.empty? }
44
44
  end
45
45
 
46
+ def domains
47
+ text.split(' ').delete_if { |t| t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }.map { |t| t.chomp('.').chomp(',').chomp(';').chomp(':') }
48
+ end
49
+
46
50
  def urls
47
51
  text.split(' ').delete_if { |t| t !~ /(http|https|www)(\.|:)/ }.map { |t| t.chomp('.') }
48
52
  end
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "0.2.2"
2
+ VERSION = "0.2.3"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias