pragmatic_tokenizer 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d459dd4e0d75f945fe329caccbb8b08a6fa74257
4
- data.tar.gz: 99c56dcf62fb9321198adb2ec07d97e259597af8
3
+ metadata.gz: 802aa9ef0922e27bb0838c65d661e0918b3f48c7
4
+ data.tar.gz: e4ecd8525874a49fdfbe37494d48aec07fb285ab
5
5
  SHA512:
6
- metadata.gz: 1fffa5e8e4c298bad2d0cdf6a08aed43d0a28f9e2d46d778bfe438e38b1deaa38f24a95b4a5206872828ade08a593d7e30737e68c0067c241d136c06e07df750
7
- data.tar.gz: fe4eed5ad06c8bea365e799bd9b0dd9132faa5457cd76ad2b5674f47cef200766fe8f1b68855a68e554d1f295aa906bce0bdf94a68d64df4df96807c4d346642
6
+ metadata.gz: 7999dd167559f7d49c2707dcf6d963ee63269bbb8d118236c59d9d67b1fb009ef54b1b8db872013f1c4ba73b9ddfae2482bff1b49d2227d197b73036695049bc
7
+ data.tar.gz: 7872a5e27d7aeb1332f4a00bffc5e3ea66fb09ece0b34419cc8f10f47c4b504928ded18eedcde9fa022bab9597e2ca8b3ba4ca28c89574425174a8f316d694fc
data/README.md CHANGED
@@ -56,6 +56,24 @@ Or install it yourself as:
56
56
  - `false`
57
57
  Leaves contractions as is.
58
58
 
59
+ <hr>
60
+
61
+ ##### `clean`
62
+ **default** = `false`
63
+ - `true`
64
+ Removes tokens consisting of only hyphens or underscores.
65
+ - `false`
66
+ Leaves tokens as is.
67
+
68
+ <hr>
69
+
70
+ ##### `remove_numbers`
71
+ **default** = `false`
72
+ - `true`
73
+ Removes any token that contains a number.
74
+ - `false`
75
+ Leaves tokens as is.
76
+
59
77
  **Example Usage**
60
78
  ```ruby
61
79
  text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
@@ -84,6 +102,14 @@ PragmaticTokenizer::Tokenizer.new(text,
84
102
  punctuation: 'none'
85
103
  ).tokenize
86
104
  # => ["crazy", "sandowsky", "afford"]
105
+
106
+ text = "The price is $5.50 and it works for 5 hours."
107
+ PragmaticTokenizer::Tokenizer.new(text, remove_numbers: true).tokenize
108
+ # => ["the", "price", "is", "and", "it", "works", "for", "hours", "."]
109
+
110
+ text = "Hello ______ ."
111
+ PragmaticTokenizer::Tokenizer.new(text, clean: true).tokenize
112
+ # => ["hello", "."]
87
113
  ```
88
114
 
89
115
  ## Development
@@ -4,8 +4,8 @@ require 'pragmatic_tokenizer/languages'
4
4
  module PragmaticTokenizer
5
5
  class Tokenizer
6
6
 
7
- attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module
8
- def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false)
7
+ attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers
8
+ def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false)
9
9
  unless punctuation.eql?('all') ||
10
10
  punctuation.eql?('semi') ||
11
11
  punctuation.eql?('none') ||
@@ -30,11 +30,13 @@ module PragmaticTokenizer
30
30
  @punctuation = punctuation
31
31
  @remove_stop_words = remove_stop_words
32
32
  @expand_contractions = expand_contractions
33
+ @clean = clean
34
+ @remove_numbers = remove_numbers
33
35
  end
34
36
 
35
37
  def tokenize
36
38
  return [] unless text
37
- delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text))))
39
+ delete_numbers(cleaner(delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text))))))
38
40
  end
39
41
 
40
42
  private
@@ -45,6 +47,16 @@ module PragmaticTokenizer
45
47
  Processor
46
48
  end
47
49
 
50
+ def delete_numbers(tokens)
51
+ return tokens unless remove_numbers
52
+ tokens.delete_if { |t| t =~ /\D*\d+\d*/ }
53
+ end
54
+
55
+ def cleaner(tokens)
56
+ return tokens unless clean
57
+ tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ }
58
+ end
59
+
48
60
  def remove_punctuation(tokens)
49
61
  case punctuation
50
62
  when 'all'
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias