pragmatic_tokenizer 0.1.3 → 0.1.4
- checksums.yaml +4 -4
- data/README.md +12 -2
- data/lib/pragmatic_tokenizer/languages/common.rb +5 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +10 -5
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 77bf20ef9b491ecf72bd541712fda8338e4f318e
+  data.tar.gz: 6a8c8464d224532a05f80832142ef9a6b0b1c26b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 93ad61db52cb42f5ec17cfec971bb1505e7b8d2d9e2c40713be01f46fc0eebc2c8de302ad36c90bc9864696ba4b895050f9041d2bd0d2622b8b4f5d9d94118f5
+  data.tar.gz: f6ef9861babfc3e28ca90624e396f821cbeaa09da189f3ecec60a0724b15538d39afe920b1808cbb6a1f527044fe761745947af551d321cab79bdfe11b418f80
data/README.md
CHANGED
@@ -61,7 +61,7 @@ Or install it yourself as:
 ##### `clean`
 **default** = `'false'`
 - `true`
-  Removes tokens consisting of only hypens or underscores.
+  Removes tokens consisting of only hypens or underscores as well as some special characters (®, ©, ™).
 - `false`
   Leaves tokens as is.
 
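For illustration, a minimal sketch of the expanded `clean` behaviour described above. The sample text and expected output are made up for this note and assume the special-character token is separated by whitespace:

```ruby
require 'pragmatic_tokenizer'

text = "Hello ™ ---- world"
PragmaticTokenizer::Tokenizer.new(text, clean: true).tokenize
# Tokens consisting only of hyphens/underscores, or matching ®/©/™, are dropped.
# Expected (illustrative): ["hello", "world"]
```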
@@ -70,10 +70,16 @@ Or install it yourself as:
 ##### `remove_numbers`
 **default** = `'false'`
 - `true`
-  Removes any token that contains a number.
+  Removes any token that contains a number or Roman numeral.
 - `false`
   Leaves tokens as is.
 
+<hr>
+
+##### `minimum_length`
+**default** = `0`
+The minimum number of characters a token should be.
+
 **Example Usage**
 ```ruby
 text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
@@ -110,6 +116,10 @@ PragmaticTokenizer::Tokenizer.new(text, remove_numbers: true).tokenize
 text = "Hello ______ ."
 PragmaticTokenizer::Tokenizer.new(text, clean: true).tokenize
 # => ["hello", "."]
+
+text = "Let's test the minimum length."
+PragmaticTokenizer::Tokenizer.new(text, minimum_length: 6).tokenize
+# => ["minimum", "length"]
 ```
 
 ## Development
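A hedged sketch of the new Roman-numeral behaviour of `remove_numbers`, based on the `delete_numbers` change shown further down in tokenizer.rb. The input text and expected output are illustrative, not taken from the gem's test suite:

```ruby
require 'pragmatic_tokenizer'

text = "Chapter iv starts on page 20"
PragmaticTokenizer::Tokenizer.new(text, remove_numbers: true).tokenize
# Tokens containing a digit ("20") or listed in ROMAN_NUMERALS ("iv") are removed.
# Expected (illustrative): ["chapter", "starts", "on", "page"]
```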
data/lib/pragmatic_tokenizer/languages/common.rb
CHANGED
@@ -4,6 +4,11 @@ module PragmaticTokenizer
       PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»']
       PUNCTUATION_MAP = ['♳', '♴', '♵', '♶', '♷', '♸', '♹', '♺', '⚀', '⚁', '⚂', '⚃', '⚄', '⚅', '☇', '☈', '☉', '☊', '☋', '☌', '☍', '☠', '☢', '☣', '☤', '☥', '☦', '☧', '☀', '☁', '☂', '☃', '☄', "☮", '♔', '♕', '♖', '♗', '♘', '♙', '♚']
       SEMI_PUNCTUATION = ['。', '.', '.']
+      ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix']
+      SPECIAL_CHARACTERS = ['®', '©', '™']
+      ABBREVIATIONS = []
+      STOP_WORDS = []
+      CONTRACTIONS = {}
     end
   end
 end
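The new constants can be referenced directly once the gem is loaded; a small sketch whose return values follow from the lists defined in this diff (assuming `require 'pragmatic_tokenizer'` loads the language modules):

```ruby
require 'pragmatic_tokenizer'

PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS
# => ["®", "©", "™"]

PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?('xiv')
# => true (the list covers the lowercase numerals i through xcix)
```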
data/lib/pragmatic_tokenizer/tokenizer.rb
CHANGED
@@ -4,8 +4,8 @@ require 'pragmatic_tokenizer/languages'
 module PragmaticTokenizer
   class Tokenizer
 
-    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers
-    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false)
+    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length
+    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0)
       unless punctuation.eql?('all') ||
         punctuation.eql?('semi') ||
         punctuation.eql?('none') ||
@@ -32,11 +32,12 @@ module PragmaticTokenizer
       @expand_contractions = expand_contractions
       @clean = clean
       @remove_numbers = remove_numbers
+      @minimum_length = minimum_length
     end
 
     def tokenize
       return [] unless text
-      delete_numbers(cleaner(delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text))))))
+      remove_short_tokens(delete_numbers(cleaner(delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text)))))))
     end
 
     private
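For readability, the nested call in `tokenize` above is equivalent to applying the filters in this order; a conceptual unrolling written for this note, not code that appears in the gem:

```ruby
# Conceptual unrolling of the one-line pipeline in Tokenizer#tokenize:
tokens = processor.new(language: language_module).process(text: text)
tokens = remove_punctuation(tokens)
tokens = find_contractions(tokens)
tokens = delete_stop_words(tokens)
tokens = cleaner(tokens)
tokens = delete_numbers(tokens)
remove_short_tokens(tokens) # new in 0.1.4: drops tokens shorter than minimum_length
```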
@@ -47,14 +48,18 @@ module PragmaticTokenizer
       Processor
     end
 
+    def remove_short_tokens(tokens)
+      tokens.delete_if { |t| t.length < minimum_length }
+    end
+
     def delete_numbers(tokens)
       return tokens unless remove_numbers
-      tokens.delete_if { |t| t =~ /\D*\d+\d*/ }
+      tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(t) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{t}.") }
     end
 
     def cleaner(tokens)
       return tokens unless clean
-      tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ }
+      tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ || PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) }
     end
 
     def remove_punctuation(tokens)
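Putting the three filters together, a hedged end-to-end sketch. The sample text and expected tokens are illustrative and assume the default processor splits the special character and the period into their own tokens:

```ruby
require 'pragmatic_tokenizer'

text = "Chapter xi ends in 1899 ™ ."
PragmaticTokenizer::Tokenizer.new(text,
                                  clean:          true, # drops "™"
                                  remove_numbers: true, # drops "1899" and the Roman numeral "xi"
                                  minimum_length: 3     # drops "in" and "."
                                 ).tokenize
# Expected (illustrative): ["chapter", "ends"]
```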