pragmatic_tokenizer 0.4.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
```diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e86a121879d806b58f855e311c14be249ba6ce95
+  data.tar.gz: 9be42b0a437ddaa0e03630d0fd6eee64f242bc9a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9f8fbf0b2de1674c557144568dc771fadcb882b892318eae04c7aae3f1ec53743f29dd208c971c45707ded50d88304aaa824ca2f5364fc65b96bd0b72d93e0d6
+  data.tar.gz: 39ee1f3e32cd243ef28c4f6b0823aa4cc523ca8a2adbb90cf6feca33fb966ccd452b70166f9ab7d2b64d859165b799c6ba3d45a410a2d5ff7116210170e77d02
```
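The 0.4.2 release shipped checksums.yaml with empty values; 0.5.0 records the SHA1 and SHA512 digests of the two archives inside the .gem package. A minimal sketch for checking a downloaded copy against the SHA512 values above (it assumes the .gem, which is a plain tar archive, has already been unpacked so that metadata.gz and data.tar.gz sit in the current directory):

```ruby
require 'digest'

# Print the SHA512 of each unpacked archive for comparison against
# the values recorded in checksums.yaml. The file paths are an
# assumption for illustration.
%w[metadata.gz data.tar.gz].each do |name|
  puts "#{name}: #{Digest::SHA512.file(name).hexdigest}"
end
```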
data/README.md CHANGED
```diff
@@ -50,6 +50,15 @@ Or install it yourself as:
 
 <hr>
 
+##### `remove_en_stop_words`
+**default** = `'false'`
+- `true`
+  Removes all English stop words (sometimes foreign language strings have English mixed in).
+- `false`
+  Does not remove English stop words.
+
+<hr>
+
 ##### `expand_contractions`
 **default** = `'false'`
 - `true`
```
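The new option plugs straight into the constructor shown later in this diff. A minimal usage sketch (the output comment is illustrative, inferred from the documented behaviour rather than a verified run):

```ruby
require 'pragmatic_tokenizer'

text = "this is the Dutch word gezellig"
PragmaticTokenizer::Tokenizer.new(text, remove_en_stop_words: true).tokenize
# Illustrative: English stop words such as "this", "is", and "the" are
# dropped; the exact result depends on the gem's English stop-word list.
```

The remaining hunks come from the gem's Ruby sources: the English contractions map, the trailing-punctuation splitter, and the `Tokenizer` class itself.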
```diff
@@ -26,6 +26,7 @@ module PragmaticTokenizer
       "'tis" => "it is",
       "it'll" => "it will",
       "it'd" => "it would",
+      "let's" => "let us",
       "we're" => "we are",
       "we'll" => "we will",
       "we'd" => "we would",
@@ -34,6 +35,11 @@ module PragmaticTokenizer
       "they'll" => "they will",
       "they'd" => "they would",
       "they've" => "they have",
+      "there'd" => "there would",
+      "there'll" => "there will",
+      "there're" => "there are",
+      "there's" => "there has",
+      "there've" => "there have",
       "that's" => "that is",
       "that'll" => "that will",
       "that'd" => "that would",
```
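Both hunks extend the English contractions map used by the existing `expand_contractions` option (note that `there's` is mapped to `there has`, not `there is`). A small sketch of the effect (the expected tokens are inferred from the mapping above, not a verified run):

```ruby
require 'pragmatic_tokenizer'

PragmaticTokenizer::Tokenizer.new("Let's hope there'll be time",
  expand_contractions: true
).tokenize
# Inferred from the mapping (with default downcasing):
# ["let", "us", "hope", "there", "will", "be", "time"]
```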
```diff
@@ -24,7 +24,7 @@ module PragmaticTokenizer
       tokens = separate_full_stop(text.squeeze(' ')
         .split
         .flat_map { |t| (t[0] == '‚' || t[0] == ',') && t.length > 1 ? t.split(/(,|‚)/).flatten : t }
-        .flat_map { |t| (t[-1] == '’' || t[-1] == "'") && t.length > 1 ? t.split(/(’|')/).flatten : t }
+        .flat_map { |t| (t[-1] == '’' || t[-1] == "'" || t[-1] == '‘' || t[-1] == '`') && t.length > 1 ? t.split(/(’|'|‘|`)/).flatten : t }
         .map { |t| convert_sym_to_punct(t) })
       separate_other_ending_punc(tokens)
     end
```
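This widens the trailing-quote check: previously only `’` and `'` were split off the end of a token; now `‘` and `` ` `` are handled as well, while the `t.length > 1` guard still leaves a bare quote token intact. A standalone reproduction of the same `flat_map` step, with a hypothetical token list:

```ruby
# Mimics the new trailing-quote split outside the gem (illustrative input).
tokens = ["word’", "word‘", "word`", "don't"]
split = tokens.flat_map do |t|
  if (t[-1] == '’' || t[-1] == "'" || t[-1] == '‘' || t[-1] == '`') && t.length > 1
    # split with a capture group keeps the quote as its own token
    t.split(/(’|'|‘|`)/).flatten
  else
    t
  end
end
# => ["word", "’", "word", "‘", "word", "`", "don't"]
# "don't" is untouched: only token-final quotes are split off.
```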
```diff
@@ -5,8 +5,8 @@ require 'unicode'
 module PragmaticTokenizer
   class Tokenizer
 
-    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase
-    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true)
+    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase, :remove_en_stop_words
+    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true, remove_en_stop_words: false)
       unless punctuation.to_s.eql?('all') ||
              punctuation.to_s.eql?('semi') ||
              punctuation.to_s.eql?('none') ||
@@ -37,13 +37,15 @@ module PragmaticTokenizer
       @minimum_length = minimum_length
       @remove_roman_numerals = remove_roman_numerals
       @downcase = downcase
+      @remove_en_stop_words = remove_en_stop_words
     end
 
     def tokenize
       return [] unless text
       tokens = []
       text.scan(/.{,10000}(?=\s|\z)/m).each do |segment|
-        tokens << delete_stop_words(
+        tokens << delete_en_stop_words(
+          delete_stop_words(
           downcase_tokens(
           cleaner(
           remove_short_tokens(
@@ -58,7 +60,7 @@ module PragmaticTokenizer
           shift_no_spaces_between_sentences(
           split_at_forward_slash(
           processor.new(language: language_module).process(text: segment)
-          )))))))))))))).reject { |t| t.empty? }
+          ))))))))))))))).reject { |t| t.empty? }
         end
         tokens.flatten
       end
```
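Two things change in `tokenize`: the new `delete_en_stop_words` call wraps the whole pipeline (hence the extra closing parenthesis), and it sits outside the existing language-specific `delete_stop_words`. Because the calls nest, the innermost step runs first and English stop-word removal is applied last, after cleaning and downcasing. A toy illustration of that ordering, with stand-in lambdas rather than the gem's private methods:

```ruby
# Toy stand-ins (not the gem's code) showing that nested calls run
# innermost-first, so the new outermost wrapper is the *last* step.
downcase_tokens      = ->(ts) { ts.map(&:downcase) }
delete_stop_words    = ->(ts) { ts }               # language-specific, no-op here
delete_en_stop_words = ->(ts) { ts - %w[the is] }  # stand-in stop-word list

delete_en_stop_words.(delete_stop_words.(downcase_tokens.(%w[The Sky IS blue])))
# => ["sky", "blue"]  — downcasing happens before stop words are removed
```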
```diff
@@ -190,6 +192,15 @@ module PragmaticTokenizer
       end
     end
 
+    def delete_en_stop_words(tokens)
+      return tokens unless remove_en_stop_words
+      if downcase
+        tokens.map { |t| Unicode::downcase(t) } - PragmaticTokenizer::Languages::English::STOP_WORDS
+      else
+        tokens.delete_if { |t| PragmaticTokenizer::Languages::English::STOP_WORDS.include?(Unicode::downcase(t)) }
+      end
+    end
+
     def split_at_forward_slash(tokens)
       tokens.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
     end
```
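The new method picks a strategy based on the `downcase` option: when downcasing is on, it can lowercase everything and use plain array difference; when the original case must be preserved, it lowercases only for the membership test and removes matches with `delete_if`. A self-contained sketch of the same two strategies, using `String#downcase` in place of the gem's `Unicode::downcase` and a stand-in stop-word list:

```ruby
STOP = %w[the is].freeze  # stand-in for Languages::English::STOP_WORDS

def drop_stops(tokens, downcase:)
  if downcase
    # Lowercase first, then subtract: simple, but returns lowercased tokens.
    tokens.map(&:downcase) - STOP
  else
    # Preserve original case; lowercase only for the membership test
    # (a non-mutating reject, where the gem uses delete_if in place).
    tokens.reject { |t| STOP.include?(t.downcase) }
  end
end

drop_stops(%w[The Sky IS Blue], downcase: true)   # => ["sky", "blue"]
drop_stops(%w[The Sky IS Blue], downcase: false)  # => ["Sky", "Blue"]
```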