pragmatic_tokenizer 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d459dd4e0d75f945fe329caccbb8b08a6fa74257
4
- data.tar.gz: 99c56dcf62fb9321198adb2ec07d97e259597af8
3
+ metadata.gz: 802aa9ef0922e27bb0838c65d661e0918b3f48c7
4
+ data.tar.gz: e4ecd8525874a49fdfbe37494d48aec07fb285ab
5
5
  SHA512:
6
- metadata.gz: 1fffa5e8e4c298bad2d0cdf6a08aed43d0a28f9e2d46d778bfe438e38b1deaa38f24a95b4a5206872828ade08a593d7e30737e68c0067c241d136c06e07df750
7
- data.tar.gz: fe4eed5ad06c8bea365e799bd9b0dd9132faa5457cd76ad2b5674f47cef200766fe8f1b68855a68e554d1f295aa906bce0bdf94a68d64df4df96807c4d346642
6
+ metadata.gz: 7999dd167559f7d49c2707dcf6d963ee63269bbb8d118236c59d9d67b1fb009ef54b1b8db872013f1c4ba73b9ddfae2482bff1b49d2227d197b73036695049bc
7
+ data.tar.gz: 7872a5e27d7aeb1332f4a00bffc5e3ea66fb09ece0b34419cc8f10f47c4b504928ded18eedcde9fa022bab9597e2ca8b3ba4ca28c89574425174a8f316d694fc
data/README.md CHANGED
@@ -56,6 +56,24 @@ Or install it yourself as:
56
56
  - `false`
57
57
  Leaves contractions as is.
58
58
 
59
+ <hr>
60
+
61
+ ##### `clean`
62
+ **default** = `'false'`
63
+ - `true`
64
+ Removes tokens consisting of only hyphens or underscores.
65
+ - `false`
66
+ Leaves tokens as is.
67
+
68
+ <hr>
69
+
70
+ ##### `remove_numbers`
71
+ **default** = `'false'`
72
+ - `true`
73
+ Removes any token that contains a number.
74
+ - `false`
75
+ Leaves tokens as is.
76
+
59
77
  **Example Usage**
60
78
  ```ruby
61
79
  text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
@@ -84,6 +102,14 @@ PragmaticTokenizer::Tokenizer.new(text,
84
102
  punctuation: 'none'
85
103
  ).tokenize
86
104
  # => ["crazy", "sandowsky", "afford"]
105
+
106
+ text = "The price is $5.50 and it works for 5 hours."
107
+ PragmaticTokenizer::Tokenizer.new(text, remove_numbers: true).tokenize
108
+ # => ["the", "price", "is", "and", "it", "works", "for", "hours", "."]
109
+
110
+ text = "Hello ______ ."
111
+ PragmaticTokenizer::Tokenizer.new(text, clean: true).tokenize
112
+ # => ["hello", "."]
87
113
  ```
88
114
 
89
115
  ## Development
@@ -4,8 +4,8 @@ require 'pragmatic_tokenizer/languages'
4
4
  module PragmaticTokenizer
5
5
  class Tokenizer
6
6
 
7
- attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module
8
- def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false)
7
+ attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers
8
+ def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false)
9
9
  unless punctuation.eql?('all') ||
10
10
  punctuation.eql?('semi') ||
11
11
  punctuation.eql?('none') ||
@@ -30,11 +30,13 @@ module PragmaticTokenizer
30
30
  @punctuation = punctuation
31
31
  @remove_stop_words = remove_stop_words
32
32
  @expand_contractions = expand_contractions
33
+ @clean = clean
34
+ @remove_numbers = remove_numbers
33
35
  end
34
36
 
35
37
  def tokenize
36
38
  return [] unless text
37
- delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text))))
39
+ delete_numbers(cleaner(delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text))))))
38
40
  end
39
41
 
40
42
  private
@@ -45,6 +47,16 @@ module PragmaticTokenizer
45
47
  Processor
46
48
  end
47
49
 
50
+ def delete_numbers(tokens)
51
+ return tokens unless remove_numbers
52
+ tokens.delete_if { |t| t =~ /\D*\d+\d*/ }
53
+ end
54
+
55
+ def cleaner(tokens)
56
+ return tokens unless clean
57
+ tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ }
58
+ end
59
+
48
60
  def remove_punctuation(tokens)
49
61
  case punctuation
50
62
  when 'all'
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias