pragmatic_tokenizer 0.2.4 → 0.3.0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 414905d0126493ffc7224055dd2f79010061662c
+  data.tar.gz: 2b0995cc2b16cef7f7a521a65f90011118fd70f7
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 62772587ff880bd192c504f9f319e1dd9e11c89bac1e37e67ef95d3f42f451fb3a770f5c57eb7152919c301e49e4d5c06a71001b27781e8ca5228bf4ab29c082
+  data.tar.gz: 977cc2c5fd69d0bca8e619860ee78e1161cb509da734ad0290b7dd54401822f399315d7e10606f8f3750a53314167568af6beccd688870bdacc04832c150d606
@@ -1,8 +1,8 @@
 module PragmaticTokenizer
   module Languages
     module Common
-      PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^']
-      PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚" }
+      PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^', '”']
+      PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘" }
       SEMI_PUNCTUATION = ['。', '.', '.']
       ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix']
       SPECIAL_CHARACTERS = ['®', '©', '™']
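Note: 0.3.0 adds the right double quotation mark ” to PUNCTUATION and pairs it with ⚘ in PUNCTUATION_MAP. The map exists so the tokenizer can mask punctuation with rare symbols while it splits text, then restore it afterwards (see convert_sym_to_punct further down). A minimal round-trip sketch, illustrative only, using just the new entry:

    # Mirrors the "”" => "⚘" pair added above; not the gem's actual code path.
    map = { "”" => "⚘" }
    masked   = "he said ”hi”".gsub("”", map["”"])   # => "he said ⚘hi⚘"
    restored = masked.gsub("⚘", map.invert["⚘"])    # => "he said ”hi”"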
@@ -15,9 +15,8 @@ module PragmaticTokenizer
         shift_colon(text)
         shift_bracket(text)
         shift_semicolon(text)
-        shift_underscore(text)
-        shift_asterisk(text)
-        shift_at_symbol(text)
+        shift_caret(text)
+        shift_vertical_bar(text)
         convert_dbl_quotes(text)
         convert_sgl_quotes(text)
         shift_beginning_hyphen(text)
@@ -35,8 +34,10 @@ module PragmaticTokenizer
       def convert_dbl_quotes(text)
         # Convert left double quotes to special character
         text.gsub!(/"(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
+        text.gsub!(/“(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['“'] + ' ') || text
         # Convert remaining quotes to special character
         text.gsub!(/"/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
+        text.gsub!(/”/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['”'] + ' ') || text
       end
 
       def convert_sgl_quotes(text)
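With the two added gsub! calls, curly double quotes are now masked the same way straight quotes already were: the opening “ only when a word character follows, the closing ” unconditionally. A rough sketch of the effect, with the map lookups inlined (⚃ and ⚘ are the PUNCTUATION_MAP entries for “ and ”):

    text = 'She said “hi” twice'
    text.gsub!(/“(?=.*\w)/, ' ⚃ ')
    text.gsub!(/”/, ' ⚘ ')
    text  # => "She said  ⚃ hi ⚘  twice" (extra padding disappears at the whitespace split)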
@@ -51,6 +52,10 @@ module PragmaticTokenizer
         text.gsub!(/--+/o, ' - ') || text
       end
 
+      def shift_vertical_bar(text)
+        text.gsub!(/\|/, ' | ') || text
+      end
+
       def shift_comma(text)
         # Shift commas off everything but numbers
         text.gsub!(/,(?!\d)/o, ' , ') || text
@@ -83,34 +88,26 @@ module PragmaticTokenizer
         text.gsub!(/([\(\[\{\}\]\)])/o) { ' ' + $1 + ' ' } || text
       end
 
-      def shift_underscore(text)
-        text.gsub!(/(?<=\s)\_+/, ' \1') || text
-        text.gsub!(/\_+(?=\s)/, ' \1') || text
-        text.gsub!(/(?<=\A)\_+/, '\1 ') || text
-        text.gsub!(/\_+(?=\z)/, ' \1') || text
-      end
-
-      def shift_asterisk(text)
-        text.gsub!(/\*+/, ' \1 ') || text
-      end
-
-      def shift_at_symbol(text)
-        text.gsub!(/(\A|\s)\@/, '\1 ') || text
-      end
-
       def shift_colon(text)
+        puts "Text: #{text}"
         return text unless text.include?(':') &&
-          text.partition(':').last[0] !~ /\A\d+/ &&
-          text.partition(':').first[-1] !~ /\A\d+/
+          text.partition(':').last[0] !~ /\A\d+/ &&
+          text.partition(':').first[-1] !~ /\A\d+/
+        puts "YOYOYO"
         # Ignore web addresses
         text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
         text.gsub!(/:/o, ' :') || text
+        text.gsub!(/(?<=\s):(?=\#)/, ': ') || text
       end
 
       def shift_semicolon(text)
         text.gsub!(/([;])/o) { ' ' + $1 + ' ' } || text
       end
 
+      def shift_caret(text)
+        text.gsub!(/\^/, ' ^ ') || text
+      end
+
       def shift_ellipse(text)
         text.gsub!(/(\.\.\.+)/o) { ' ' + $1 + ' ' } || text
         text.gsub!(/(\.\.+)/o) { ' ' + $1 + ' ' } || text
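One caveat worth noting in shift_colon: (?<=[http|https]) is a lookbehind containing a character class, not an alternation, so it matches any single preceding character among h, t, p, s, and |. It still protects URLs in practice because a colon followed by // is normally preceded by p or s. A quick demonstration:

    regex = /(?<=[http|https]):(?=\/\/)/
    "https://x".gsub(regex, '♴')  # => "https♴//x"
    "ftp://x".gsub(regex, '♴')    # => "ftp♴//x" (also matches, via the preceding 'p')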
@@ -167,7 +164,7 @@ module PragmaticTokenizer
       end
 
       def convert_sym_to_punct(token)
-        symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚]/.match(token)
+        symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘]/.match(token)
         if symbol.nil?
           return token
         else
@@ -7,10 +7,10 @@ module PragmaticTokenizer
 
     attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase
     def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true)
-      unless punctuation.eql?('all') ||
-        punctuation.eql?('semi') ||
-        punctuation.eql?('none') ||
-        punctuation.eql?('only')
+      unless punctuation.to_s.eql?('all') ||
+        punctuation.to_s.eql?('semi') ||
+        punctuation.to_s.eql?('none') ||
+        punctuation.to_s.eql?('only')
         raise "Punctuation argument can be only be nil, 'all', 'semi', 'none', or 'only'"
         # Punctuation 'all': Does not remove any punctuation from the result
 
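The .to_s coercion relaxes the guard so the punctuation option can be given as a symbol or a string. A minimal sketch of the new behavior:

    :none.to_s.eql?('none')   # => true; punctuation: :none no longer raises
    'none'.to_s.eql?('none')  # => true; strings keep working unchanged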
@@ -25,10 +25,10 @@ module PragmaticTokenizer
       # Punctuation 'only': Removes everything except punctuation. The
       # returned result is an array of only the punctuation.
       end
-      @text = CGI.unescapeHTML(text)
-      @language = language
-      @language_module = Languages.get_language_by_code(language)
-      @punctuation = punctuation
+      @text = CGI.unescapeHTML(text.to_s)
+      @language = language.to_s
+      @language_module = Languages.get_language_by_code(language.to_s)
+      @punctuation = punctuation.to_s
       @remove_stop_words = remove_stop_words
       @expand_contractions = expand_contractions
       @clean = clean
@@ -40,7 +40,21 @@ module PragmaticTokenizer
 
     def tokenize
       return [] unless text
-      downcase_tokens(
+      downcase_tokens(
+        cleaner(
+          remove_short_tokens(
+            delete_numbers(
+              delete_roman_numerals(
+                find_contractions(
+                  delete_stop_words(
+                    remove_punctuation(
+                      split_at_middle_period_1(
+                        split_at_middle_period_2(
+                          split_beginning_period(
+                            shift_no_spaces_between_sentences(
+                              split_at_forward_slash(
+                                processor.new(language: language_module).process(text: text)
+      ))))))))))))).reject { |t| t.empty? }
     end
 
     def domains
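The rewritten tokenize threads the processor output through the whole pipeline (period splitting, slashes, punctuation, stop words, contractions, cleaning) before rejecting empty tokens. A hedged usage sketch, relying only on the constructor signature shown in this diff; the sample input is invented and the exact output depends on the gem's language data:

    require 'pragmatic_tokenizer'

    tokenizer = PragmaticTokenizer::Tokenizer.new(
      'Hello ”world”. What a day!',
      language:    'en',
      punctuation: :none   # a symbol is accepted as of this version
    )
    tokenizer.tokenize     # => array of lowercased tokens with punctuation stripped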
@@ -80,6 +94,35 @@ module PragmaticTokenizer
       Processor
     end
 
+    def split_at_middle_period_1(tokens)
+      tokens.flat_map { |t| t.include?(".") &&
+        t !~ /(http|https|www)(\.|:)/ &&
+        t.length > 1 &&
+        t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix &&
+        t !~ /\S+(@|@)\S+/ &&
+        language_module::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) ? t.gsub!(/\./, '\1. \2').split(' ').flatten : t }
+    end
+
+    def split_at_middle_period_2(tokens)
+      tokens.flat_map { |t| t.include?(".") &&
+        t !~ /(http|https|www)(\.|:)/ &&
+        t !~ /\.(com|net|org|edu|gov|mil|int)/ &&
+        t !~ /\.[a-z]{2}/ &&
+        t.length > 2 &&
+        t.count(".") == 1 &&
+        t !~ /\d+/ &&
+        !language_module::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) &&
+        t !~ /\S+(@|@)\S+/ ? t.gsub!(/\./, '\1 . \2').split(' ').flatten : t }
+    end
+
+    def split_beginning_period(tokens)
+      tokens.flat_map { |t| t =~ /\A\.[^\.]/ && t.length > 1 ? t.gsub!(/\./, '\1 ').split(' ').flatten : t }
+    end
+
+    def shift_no_spaces_between_sentences(tokens)
+      tokens.flat_map { |t| t.include?("?") && t !~ /(http|https|www)(\.|:)/ && t.length > 1 ? t.gsub!(/\?/, '\1 \2').split(' ').flatten : t }
+    end
+
     def downcase_tokens(tokens)
      return tokens unless downcase
      tokens.map { |t| Unicode::downcase(t) }
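Note the replacement strings '\1. \2' and '\1 . \2' reference capture groups that the pattern /\./ never defines; Ruby expands unmatched group references in a replacement to empty strings, so these substitutions effectively just pad the period with spaces before the split. A quick demonstration:

    "etc.next".gsub(/\./, '\1 . \2')  # => "etc . next" ('\1' and '\2' expand to "")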
@@ -101,7 +144,13 @@ module PragmaticTokenizer
 
     def cleaner(tokens)
       return tokens unless clean
-      tokens.
+      tokens.flat_map { |t| t =~ /(\A|\s)\@/ ? t.gsub!(/\@/, '\1 ').split(' ').flatten : t }
+        .flat_map { |t| t =~ /(?<=\s)\_+/ ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
+        .flat_map { |t| t =~ /\_+(?=\s)/ ? t.gsub!(/\_+(?=\s)/, ' \1').split(' ').flatten : t }
+        .flat_map { |t| t =~ /(?<=\A)\_+/ ? t.gsub!(/(?<=\A)\_+/, '\1 ').split(' ').flatten : t }
+        .flat_map { |t| t =~ /\_+(?=\z)/ ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
+        .flat_map { |t| t =~ /\*+/ ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
+        .delete_if { |t| t =~ /\A-+\z/ ||
         PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
         t =~ /\A\.{2,}\z/ || t.include?("\\") ||
         t.length > 50 ||
@@ -135,14 +184,16 @@ module PragmaticTokenizer
         end
       end
 
+    def split_at_forward_slash(tokens)
+      tokens.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
+    end
+
     def find_contractions(tokens)
       return tokens unless expand_contractions && language_module::CONTRACTIONS
       if downcase
         tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(Unicode::downcase(t)) ? language_module::CONTRACTIONS[Unicode::downcase(t)].split(' ').flatten : t }
-          .flat_map { |t| t.include?("/") ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
       else
         tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(Unicode::downcase(t)) ? language_module::CONTRACTIONS[Unicode::downcase(t)].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
-          .flat_map { |t| t.include?("/") ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
       end
     end
   end
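With the slash handling moved out to split_at_forward_slash, find_contractions reduces to a lookup against the language module's CONTRACTIONS hash. A simplified sketch of the downcased path; the "don't" entry is assumed for illustration and is not shown in this diff:

    contractions = { "don't" => "do not" }   # stand-in for language_module::CONTRACTIONS
    tokens = ["don't", "stop"]
    tokens.flat_map { |t| contractions.key?(t) ? contractions[t].split(' ') : t }
    # => ["do", "not", "stop"]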
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.2.4
+  version: 0.3.0
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-01-
+date: 2016-01-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode