pragmatic_tokenizer 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f19baff424f25d8ba1fcf08001dfd6fc5ac4c54b
4
- data.tar.gz: 645aaae817dd1a77c7fbfe506e5febe9746afbf7
3
+ metadata.gz: e9c65d84e7930d904363cc2ee8cf646e62838197
4
+ data.tar.gz: 6cd30081a14c5da1e47732ca5855d528e5846589
5
5
  SHA512:
6
- metadata.gz: c4a725af2141a66f0390258b36bc66a50994cc6f44e2db6b528b28e592df50baf1865d4445012dead0bf95a1347585acd2b93b22ba968ddf3a5fc08d4bcdaf79
7
- data.tar.gz: 0a820f57952cdc0adb8973607b083bc0e14ba5006a7cf4ab5515f874c1a6f3bc5d1ce532b895c3011203ae74555480a7b81431906607ae26a376dcc5dd4a7d66
6
+ metadata.gz: 02feda33c9f6d8caa28b90054826abfdfdd2e6479180f8cf1ee2139bbf78853fe1a2dd17286403b3bd363f73afe7cf74bf9432d0f08a78083558eff91d2ec286
7
+ data.tar.gz: f510f5f2eedc28efd22cebfab3d28fd5f415fe6dae42c71c97b57c64ab2f6a6b2d99a6e84925e908822fa10e0789a8b041bd8a9b433e430ad9b03043f6963e4f
data/README.md CHANGED
@@ -156,6 +156,19 @@ PragmaticTokenizer::Tokenizer.new(text).urls
156
156
 
157
157
  <hr>
158
158
 
159
+ #### `#domains`
160
+ Extract only valid domain tokens
161
+
162
+ **Example Usage**
163
+ ```ruby
164
+ text = "See the breaking news stories about X on cnn.com/europe and english.alarabiya.net, here’s a screenshot: https://t.co/s83k28f29d31s83"
165
+
166
+ PragmaticTokenizer::Tokenizer.new(text).domains
167
+ # => ["cnn.com/europe", "english.alarabiya.net"]
168
+ ```
169
+
170
+ <hr>
171
+
159
172
  #### `#emails`
160
173
  Extract only valid email tokens
161
174
 
@@ -43,6 +43,10 @@ module PragmaticTokenizer
43
43
  downcase_tokens(cleaner(remove_short_tokens(delete_numbers(delete_roman_numerals(find_contractions(delete_stop_words(remove_punctuation(processor.new(language: language_module).process(text: text))))))))).reject { |t| t.empty? }
44
44
  end
45
45
 
46
+ def domains
47
+ text.split(' ').delete_if { |t| t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }.map { |t| t.chomp('.').chomp(',').chomp(';').chomp(':') }
48
+ end
49
+
46
50
  def urls
47
51
  text.split(' ').delete_if { |t| t !~ /(http|https|www)(\.|:)/ }.map { |t| t.chomp('.') }
48
52
  end
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "0.2.2"
2
+ VERSION = "0.2.3"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias