pragmatic_tokenizer 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +26 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +15 -3
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 802aa9ef0922e27bb0838c65d661e0918b3f48c7
|
4
|
+
data.tar.gz: e4ecd8525874a49fdfbe37494d48aec07fb285ab
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7999dd167559f7d49c2707dcf6d963ee63269bbb8d118236c59d9d67b1fb009ef54b1b8db872013f1c4ba73b9ddfae2482bff1b49d2227d197b73036695049bc
|
7
|
+
data.tar.gz: 7872a5e27d7aeb1332f4a00bffc5e3ea66fb09ece0b34419cc8f10f47c4b504928ded18eedcde9fa022bab9597e2ca8b3ba4ca28c89574425174a8f316d694fc
|
data/README.md
CHANGED
@@ -56,6 +56,24 @@ Or install it yourself as:
|
|
56
56
|
- `false`
|
57
57
|
Leaves contractions as is.
|
58
58
|
|
59
|
+
<hr>
|
60
|
+
|
61
|
+
##### `clean`
|
62
|
+
**default** = `false`
|
63
|
+
- `true`
|
64
|
+
Removes tokens consisting of only hyphens or underscores.
|
65
|
+
- `false`
|
66
|
+
Leaves tokens as is.
|
67
|
+
|
68
|
+
<hr>
|
69
|
+
|
70
|
+
##### `remove_numbers`
|
71
|
+
**default** = `false`
|
72
|
+
- `true`
|
73
|
+
Removes any token that contains a number.
|
74
|
+
- `false`
|
75
|
+
Leaves tokens as is.
|
76
|
+
|
59
77
|
**Example Usage**
|
60
78
|
```ruby
|
61
79
|
text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
|
@@ -84,6 +102,14 @@ PragmaticTokenizer::Tokenizer.new(text,
|
|
84
102
|
punctuation: 'none'
|
85
103
|
).tokenize
|
86
104
|
# => ["crazy", "sandowsky", "afford"]
|
105
|
+
|
106
|
+
text = "The price is $5.50 and it works for 5 hours."
|
107
|
+
PragmaticTokenizer::Tokenizer.new(text, remove_numbers: true).tokenize
|
108
|
+
# => ["the", "price", "is", "and", "it", "works", "for", "hours", "."]
|
109
|
+
|
110
|
+
text = "Hello ______ ."
|
111
|
+
PragmaticTokenizer::Tokenizer.new(text, clean: true).tokenize
|
112
|
+
# => ["hello", "."]
|
87
113
|
```
|
88
114
|
|
89
115
|
## Development
|
@@ -4,8 +4,8 @@ require 'pragmatic_tokenizer/languages'
|
|
4
4
|
module PragmaticTokenizer
|
5
5
|
class Tokenizer
|
6
6
|
|
7
|
-
attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module
|
8
|
-
def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false)
|
7
|
+
attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers
|
8
|
+
def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false)
|
9
9
|
unless punctuation.eql?('all') ||
|
10
10
|
punctuation.eql?('semi') ||
|
11
11
|
punctuation.eql?('none') ||
|
@@ -30,11 +30,13 @@ module PragmaticTokenizer
|
|
30
30
|
@punctuation = punctuation
|
31
31
|
@remove_stop_words = remove_stop_words
|
32
32
|
@expand_contractions = expand_contractions
|
33
|
+
@clean = clean
|
34
|
+
@remove_numbers = remove_numbers
|
33
35
|
end
|
34
36
|
|
35
37
|
def tokenize
|
36
38
|
return [] unless text
|
37
|
-
delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text))))
|
39
|
+
delete_numbers(cleaner(delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text))))))
|
38
40
|
end
|
39
41
|
|
40
42
|
private
|
@@ -45,6 +47,16 @@ module PragmaticTokenizer
|
|
45
47
|
Processor
|
46
48
|
end
|
47
49
|
|
50
|
+
def delete_numbers(tokens)
|
51
|
+
return tokens unless remove_numbers
|
52
|
+
tokens.delete_if { |t| t =~ /\D*\d+\d*/ }
|
53
|
+
end
|
54
|
+
|
55
|
+
def cleaner(tokens)
|
56
|
+
return tokens unless clean
|
57
|
+
tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ }
|
58
|
+
end
|
59
|
+
|
48
60
|
def remove_punctuation(tokens)
|
49
61
|
case punctuation
|
50
62
|
when 'all'
|