pragmatic_tokenizer 0.1.3 → 0.1.4
- checksums.yaml +4 -4
- data/README.md +12 -2
- data/lib/pragmatic_tokenizer/languages/common.rb +5 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +10 -5
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 77bf20ef9b491ecf72bd541712fda8338e4f318e
+  data.tar.gz: 6a8c8464d224532a05f80832142ef9a6b0b1c26b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 93ad61db52cb42f5ec17cfec971bb1505e7b8d2d9e2c40713be01f46fc0eebc2c8de302ad36c90bc9864696ba4b895050f9041d2bd0d2622b8b4f5d9d94118f5
+  data.tar.gz: f6ef9861babfc3e28ca90624e396f821cbeaa09da189f3ecec60a0724b15538d39afe920b1808cbb6a1f527044fe761745947af551d321cab79bdfe11b418f80
```
data/README.md
CHANGED

````diff
@@ -61,7 +61,7 @@ Or install it yourself as:
 ##### `clean`
 **default** = `'false'`
 - `true`
-  Removes tokens consisting of only hyphens or underscores.
+  Removes tokens consisting of only hyphens or underscores as well as some special characters (®, ©, ™).
 - `false`
   Leaves tokens as is.
 
@@ -70,10 +70,16 @@
 ##### `remove_numbers`
 **default** = `'false'`
 - `true`
-  Removes any token that contains a number.
+  Removes any token that contains a number or Roman numeral.
 - `false`
   Leaves tokens as is.
 
+<hr>
+
+##### `minimum_length`
+**default** = `0`
+The minimum number of characters a token should be.
+
 **Example Usage**
 ```ruby
 text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
@@ -110,6 +116,10 @@ PragmaticTokenizer::Tokenizer.new(text, remove_numbers: true).tokenize
 text = "Hello ______ ."
 PragmaticTokenizer::Tokenizer.new(text, clean: true).tokenize
 # => ["hello", "."]
+
+text = "Let's test the minimum length."
+PragmaticTokenizer::Tokenizer.new(text, minimum_length: 6).tokenize
+# => ["minimum", "length"]
 ```
 
 ## Development
````
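Taken together, the README changes describe options that compose in a single pass: `clean` now also drops the special-character tokens (®, ©, ™), `remove_numbers` also catches Roman numerals, and the new `minimum_length` drops any token shorter than the threshold. A quick sketch of the combined behavior; the expected output is inferred from the documented examples above, not taken from the gem's test suite:

```ruby
require 'pragmatic_tokenizer'

text = "Chapter ii ® of a saga"
tokens = PragmaticTokenizer::Tokenizer.new(
  text,
  clean:          true, # drops ® (and hyphen/underscore-only tokens)
  remove_numbers: true, # drops digit tokens and Roman numerals such as "ii"
  minimum_length: 3     # drops "of" and "a"
).tokenize
# expected => ["chapter", "saga"]
```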
data/lib/pragmatic_tokenizer/languages/common.rb
CHANGED

```diff
@@ -4,6 +4,11 @@ module PragmaticTokenizer
       PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»']
       PUNCTUATION_MAP = ['♳', '♴', '♵', '♶', '♷', '♸', '♹', '♺', '⚀', '⚁', '⚂', '⚃', '⚄', '⚅', '☇', '☈', '☉', '☊', '☋', '☌', '☍', '☠', '☢', '☣', '☤', '☥', '☦', '☧', '☀', '☁', '☂', '☃', '☄', "☮", '♔', '♕', '♖', '♗', '♘', '♙', '♚']
       SEMI_PUNCTUATION = ['。', '.', '.']
+      ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix']
+      SPECIAL_CHARACTERS = ['®', '©', '™']
+      ABBREVIATIONS = []
+      STOP_WORDS = []
+      CONTRACTIONS = {}
     end
   end
 end
```
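The new `ROMAN_NUMERALS` constant is a hard-coded lowercase list covering i through xcix (1–99). The same table could be generated rather than spelled out; a minimal sketch (the `to_roman` helper is illustrative, not part of the gem):

```ruby
# Subtractive-notation Roman numeral converter, sufficient for 1..99.
PAIRS = { 90 => 'xc', 50 => 'l', 40 => 'xl', 10 => 'x',
          9 => 'ix', 5 => 'v', 4 => 'iv', 1 => 'i' }.freeze

def to_roman(n)
  PAIRS.each_with_object(String.new) do |(value, glyph), roman|
    while n >= value
      roman << glyph
      n -= value
    end
  end
end

ROMAN_NUMERALS = (1..99).map { |n| to_roman(n) }
# => ["i", "ii", "iii", "iv", "v", ...]
```

The hard-coded list has the advantage of being greppable and dependency-free, at the cost of length.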
data/lib/pragmatic_tokenizer/tokenizer.rb
CHANGED

```diff
@@ -4,8 +4,8 @@ require 'pragmatic_tokenizer/languages'
 module PragmaticTokenizer
   class Tokenizer
 
-    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers
-    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false)
+    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length
+    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0)
       unless punctuation.eql?('all') ||
              punctuation.eql?('semi') ||
              punctuation.eql?('none') ||
@@ -32,11 +32,12 @@ module PragmaticTokenizer
       @expand_contractions = expand_contractions
       @clean = clean
       @remove_numbers = remove_numbers
+      @minimum_length = minimum_length
     end
 
     def tokenize
       return [] unless text
-      delete_numbers(cleaner(delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text))))))
+      remove_short_tokens(delete_numbers(cleaner(delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text)))))))
     end
 
     private
@@ -47,14 +48,18 @@ module PragmaticTokenizer
       Processor
     end
 
+    def remove_short_tokens(tokens)
+      tokens.delete_if { |t| t.length < minimum_length }
+    end
+
     def delete_numbers(tokens)
       return tokens unless remove_numbers
-      tokens.delete_if { |t| t =~ /\D*\d+\d*/ }
+      tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(t) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{t}.") }
     end
 
     def cleaner(tokens)
       return tokens unless clean
-      tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ }
+      tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ || PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) }
     end
 
     def remove_punctuation(tokens)
```
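Because `remove_short_tokens` is the outermost call in the new `tokenize` chain, length filtering applies only to tokens that survive the earlier passes. The Roman numeral lookup compares against lowercase list entries, which works because tokens are downcased earlier in the pipeline (as the README examples show), and the extra `"#{t}."` lookup appears intended to catch numerals written with a trailing period. A standalone sketch of the three filters' combined effect (plain Ruby, no gem required; the two constants are abbreviated stand-ins for the ones in `Languages::Common`):

```ruby
ROMAN_NUMERALS     = ['i', 'ii', 'iii', 'iv', 'v'].freeze # abbreviated stand-in
SPECIAL_CHARACTERS = ['®', '©', '™'].freeze

tokens = ['chapter', 'iv', '®', '42', '___', 'ok', 'words']

# delete_numbers: tokens containing digits, plus exact Roman numeral matches
tokens.delete_if { |t| t =~ /\D*\d+\d*/ || ROMAN_NUMERALS.include?(t) }

# cleaner: underscore- or hyphen-only runs, plus the special characters
tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ || SPECIAL_CHARACTERS.include?(t) }

# remove_short_tokens: note that delete_if mutates the array in place
tokens.delete_if { |t| t.length < 3 }

p tokens # => ["chapter", "words"]
```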