pragmatic_tokenizer 0.4.2 → 0.5.0
checksums.yaml CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e86a121879d806b58f855e311c14be249ba6ce95
+  data.tar.gz: 9be42b0a437ddaa0e03630d0fd6eee64f242bc9a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9f8fbf0b2de1674c557144568dc771fadcb882b892318eae04c7aae3f1ec53743f29dd208c971c45707ded50d88304aaa824ca2f5364fc65b96bd0b72d93e0d6
+  data.tar.gz: 39ee1f3e32cd243ef28c4f6b0823aa4cc523ca8a2adbb90cf6feca33fb966ccd452b70166f9ab7d2b64d859165b799c6ba3d45a410a2d5ff7116210170e77d02
```
data/README.md CHANGED

```diff
@@ -50,6 +50,15 @@ Or install it yourself as:
 
 <hr>
 
+##### `remove_en_stop_words`
+**default** = `'false'`
+- `true`
+  Removes all English stop words (sometimes foreign-language strings have English mixed in).
+- `false`
+  Does not remove English stop words.
+
+<hr>
+
 ##### `expand_contractions`
 **default** = `'false'`
 - `true`
```
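The README addition documents the new `remove_en_stop_words` option. A minimal usage sketch, based on the constructor signature shown in the tokenizer diff further down; the sample sentence and result comments are illustrative:

```ruby
require 'pragmatic_tokenizer'

# German text with English stop words mixed in.
text = "Das Malen war ihm a big hobby"

PragmaticTokenizer::Tokenizer.new(
  text,
  language:             'de',
  remove_en_stop_words: true
).tokenize
# English stop words such as "a" are dropped from the result;
# the German tokens pass through untouched.
```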
```diff
@@ -26,6 +26,7 @@ module PragmaticTokenizer
       "'tis" => "it is",
       "it'll" => "it will",
       "it'd" => "it would",
+      "let's" => "let us",
       "we're" => "we are",
       "we'll" => "we will",
       "we'd" => "we would",
@@ -34,6 +35,11 @@ module PragmaticTokenizer
       "they'll" => "they will",
       "they'd" => "they would",
       "they've" => "they have",
+      "there'd" => "there would",
+      "there'll" => "there will",
+      "there're" => "there are",
+      "there's" => "there has",
+      "there've" => "there have",
       "that's" => "that is",
       "that'll" => "that will",
       "that'd" => "that would",
```
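These entries extend the contraction map consulted by `expand_contractions`. A quick sketch of the effect; the output comments are illustrative, and the exact tokens also depend on the other options in force (such as downcasing):

```ruby
require 'pragmatic_tokenizer'

PragmaticTokenizer::Tokenizer.new(
  "let's see if there're any left",
  expand_contractions: true
).tokenize
# "let's"    -> "let us"    (new mapping)
# "there're" -> "there are" (new mapping)
```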
```diff
@@ -24,7 +24,7 @@ module PragmaticTokenizer
     tokens = separate_full_stop(text.squeeze(' ')
       .split
       .flat_map { |t| (t[0] == '‚' || t[0] == ',') && t.length > 1 ? t.split(/(,|‚)/).flatten : t }
-      .flat_map { |t| (t[-1] == '’' || t[-1] == "'") && t.length > 1 ? t.split(/(’|')/).flatten : t }
+      .flat_map { |t| (t[-1] == '’' || t[-1] == "'" || t[-1] == '‘' || t[-1] == '`') && t.length > 1 ? t.split(/(’|'|‘|`)/).flatten : t }
       .map { |t| convert_sym_to_punct(t) })
     separate_other_ending_punc(tokens)
   end
```
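This change widens the trailing-quote check: besides `’` and `'`, a token ending in `‘` or a backtick is now split into the word and the quote character. In isolation, the updated branch behaves like this standalone sketch of the expression from the diff:

```ruby
token = "hello‘"

# Same test and split as the updated flat_map branch above.
if (token[-1] == '’' || token[-1] == "'" ||
    token[-1] == '‘' || token[-1] == '`') && token.length > 1
  token.split(/(’|'|‘|`)/)
end
# => ["hello", "‘"]  (the capture group keeps the quote as its own token)
```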
```diff
@@ -5,8 +5,8 @@ require 'unicode'
 module PragmaticTokenizer
   class Tokenizer
 
-    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase
-    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true)
+    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase, :remove_en_stop_words
+    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true, remove_en_stop_words: false)
       unless punctuation.to_s.eql?('all') ||
              punctuation.to_s.eql?('semi') ||
              punctuation.to_s.eql?('none') ||
```
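Because the new keyword argument defaults to `false`, existing callers are unaffected and the behavior is purely opt-in. A sketch with an illustrative sentence:

```ruby
# Unchanged behavior for existing code:
PragmaticTokenizer::Tokenizer.new("Hello, the world!").tokenize

# Opting in to English stop word removal:
PragmaticTokenizer::Tokenizer.new(
  "Hello, the world!",
  remove_en_stop_words: true
).tokenize
# "the" is removed from the token list.
```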
```diff
@@ -37,13 +37,15 @@ module PragmaticTokenizer
       @minimum_length = minimum_length
       @remove_roman_numerals = remove_roman_numerals
       @downcase = downcase
+      @remove_en_stop_words = remove_en_stop_words
     end
 
     def tokenize
       return [] unless text
       tokens = []
       text.scan(/.{,10000}(?=\s|\z)/m).each do |segment|
-        tokens << delete_stop_words(
+        tokens << delete_en_stop_words(
+          delete_stop_words(
           downcase_tokens(
           cleaner(
           remove_short_tokens(
@@ -58,7 +60,7 @@ module PragmaticTokenizer
           shift_no_spaces_between_sentences(
           split_at_forward_slash(
           processor.new(language: language_module).process(text: segment)
-          )))))))))))))).reject { |t| t.empty? }
+          ))))))))))))))).reject { |t| t.empty? }
       end
       tokens.flatten
     end
```
```diff
@@ -190,6 +192,15 @@ module PragmaticTokenizer
       end
     end
 
+    def delete_en_stop_words(tokens)
+      return tokens unless remove_en_stop_words
+      if downcase
+        tokens.map { |t| Unicode::downcase(t) } - PragmaticTokenizer::Languages::English::STOP_WORDS
+      else
+        tokens.delete_if { |t| PragmaticTokenizer::Languages::English::STOP_WORDS.include?(Unicode::downcase(t)) }
+      end
+    end
+
     def split_at_forward_slash(tokens)
       tokens.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
     end
```
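The two branches of `delete_en_stop_words` differ in how they treat casing: with `downcase` enabled the method lowercases every token and removes stop words via `Array` difference, while otherwise it keeps the original casing and drops a token only when its lowercased form is a stop word. A standalone sketch of the same logic, where `STOP_WORDS` stands in for `PragmaticTokenizer::Languages::English::STOP_WORDS`, plain `String#downcase` stands in for `Unicode::downcase`, and the non-mutating `reject` stands in for `delete_if`:

```ruby
STOP_WORDS = ["the", "is"]          # stand-in for the English stop word list
tokens     = ["The", "Sky", "is", "Blue"]

# downcase branch: lowercase first, then use Array difference.
tokens.map(&:downcase) - STOP_WORDS
# => ["sky", "blue"]

# non-downcase branch: preserve casing, compare lowercased forms.
tokens.reject { |t| STOP_WORDS.include?(t.downcase) }
# => ["Sky", "Blue"]
```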