pragmatic_tokenizer 0.2.4 → 0.3.0
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 414905d0126493ffc7224055dd2f79010061662c
+  data.tar.gz: 2b0995cc2b16cef7f7a521a65f90011118fd70f7
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 62772587ff880bd192c504f9f319e1dd9e11c89bac1e37e67ef95d3f42f451fb3a770f5c57eb7152919c301e49e4d5c06a71001b27781e8ca5228bf4ab29c082
+  data.tar.gz: 977cc2c5fd69d0bca8e619860ee78e1161cb509da734ad0290b7dd54401822f399315d7e10606f8f3750a53314167568af6beccd688870bdacc04832c150d606
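The digests above can be checked against a downloaded copy. A minimal sketch, assuming the .gem archive has been unpacked with tar -xf pragmatic_tokenizer-0.3.0.gem so that metadata.gz and data.tar.gz sit in the current directory:

    require 'digest'

    # Print digests of the unpacked gem archives for comparison with checksums.yaml.
    %w[metadata.gz data.tar.gz].each do |file|
      puts "#{file} SHA1:   #{Digest::SHA1.file(file).hexdigest}"
      puts "#{file} SHA512: #{Digest::SHA512.file(file).hexdigest}"
    end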
@@ -1,8 +1,8 @@
 module PragmaticTokenizer
   module Languages
     module Common
-      PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^']
-      PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚" }
+      PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^', '”']
+      PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘" }
       SEMI_PUNCTUATION = ['。', '.', '.']
       ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix']
       SPECIAL_CHARACTERS = ['®', '©', '™']
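The new entries extend the map's existing scheme to the right typographic quote: each protected punctuation mark is swapped to an improbable symbol (here ” becomes ⚘) before the text is split, then swapped back once tokens exist. A standalone sketch of that round trip, not the gem's API:

    # Swap protected punctuation to a placeholder, split on spaces, then restore.
    placeholder = { '”' => '⚘' }
    text   = 'She said ”no”'.gsub('”', placeholder['”'])
    tokens = text.split(' ').map { |t| t.gsub('⚘', '”') }
    tokens # => ["She", "said", "”no”"]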
@@ -15,9 +15,8 @@ module PragmaticTokenizer
       shift_colon(text)
       shift_bracket(text)
       shift_semicolon(text)
-      shift_underscore(text)
-      shift_asterisk(text)
-      shift_at_symbol(text)
+      shift_caret(text)
+      shift_vertical_bar(text)
       convert_dbl_quotes(text)
       convert_sgl_quotes(text)
       shift_beginning_hyphen(text)
@@ -35,8 +34,10 @@ module PragmaticTokenizer
     def convert_dbl_quotes(text)
       # Convert left double quotes to special character
       text.gsub!(/"(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
+      text.gsub!(/“(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['“'] + ' ') || text
       # Convert remaining quotes to special character
       text.gsub!(/"/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
+      text.gsub!(/”/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['”'] + ' ') || text
     end

     def convert_sgl_quotes(text)
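With the two added gsub! calls, curly double quotes now get the same padding treatment as their ASCII counterpart instead of staying glued to the neighboring word. A hedged usage example; the exact token list depends on the rest of the pipeline:

    require 'pragmatic_tokenizer'

    PragmaticTokenizer::Tokenizer.new('She said, “Hello world.”').tokenize
    # => something like ["she", "said", ",", "“", "hello", "world", ".", "”"]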
@@ -51,6 +52,10 @@ module PragmaticTokenizer
       text.gsub!(/--+/o, ' - ') || text
     end

+    def shift_vertical_bar(text)
+      text.gsub!(/\|/, ' | ') || text
+    end
+
     def shift_comma(text)
       # Shift commas off everything but numbers
       text.gsub!(/,(?!\d)/o, ' , ') || text
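shift_vertical_bar follows the convention of the other shift_* helpers: pad the character with spaces so that a later whitespace split emits it as a token of its own. The effect in isolation:

    text = 'either|or'.dup
    text.gsub!(/\|/, ' | ')
    text # => "either | or"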
@@ -83,34 +88,26 @@ module PragmaticTokenizer
       text.gsub!(/([\(\[\{\}\]\)])/o) { ' ' + $1 + ' ' } || text
     end

-    def shift_underscore(text)
-      text.gsub!(/(?<=\s)\_+/, ' \1') || text
-      text.gsub!(/\_+(?=\s)/, ' \1') || text
-      text.gsub!(/(?<=\A)\_+/, '\1 ') || text
-      text.gsub!(/\_+(?=\z)/, ' \1') || text
-    end
-
-    def shift_asterisk(text)
-      text.gsub!(/\*+/, ' \1 ') || text
-    end
-
-    def shift_at_symbol(text)
-      text.gsub!(/(\A|\s)\@/, '\1 ') || text
-    end
-
     def shift_colon(text)
+      puts "Text: #{text}"
       return text unless text.include?(':') &&
-
-
+        text.partition(':').last[0] !~ /\A\d+/ &&
+        text.partition(':').first[-1] !~ /\A\d+/
+      puts "YOYOYO"
       # Ignore web addresses
       text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
       text.gsub!(/:/o, ' :') || text
+      text.gsub!(/(?<=\s):(?=\#)/, ': ') || text
     end

     def shift_semicolon(text)
       text.gsub!(/([;])/o) { ' ' + $1 + ' ' } || text
     end

+    def shift_caret(text)
+      text.gsub!(/\^/, ' ^ ') || text
+    end
+
     def shift_ellipse(text)
       text.gsub!(/(\.\.\.+)/o) { ' ' + $1 + ' ' } || text
       text.gsub!(/(\.\.+)/o) { ' ' + $1 + ' ' } || text
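The reworked shift_colon guard bails out early when the first colon is flanked by digits, which leaves clock times untouched; the two puts calls are debug output that shipped with this version. A trimmed, standalone sketch of just the guard, with the URL handling and debug lines omitted:

    def shift_colon(text)
      return text unless text.include?(':') &&
        text.partition(':').last[0] !~ /\A\d+/ &&
        text.partition(':').first[-1] !~ /\A\d+/
      text.gsub!(/:/, ' :') || text
    end

    shift_colon('6:30'.dup)      # => "6:30" (time left intact)
    shift_colon('Note: yes'.dup) # => "Note : yes"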
@@ -167,7 +164,7 @@ module PragmaticTokenizer
     end

     def convert_sym_to_punct(token)
-      symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚]/.match(token)
+      symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘]/.match(token)
       if symbol.nil?
         return token
       else
@@ -7,10 +7,10 @@ module PragmaticTokenizer

   attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase
   def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true)
-    unless punctuation.eql?('all') ||
-           punctuation.eql?('semi') ||
-           punctuation.eql?('none') ||
-           punctuation.eql?('only')
+    unless punctuation.to_s.eql?('all') ||
+           punctuation.to_s.eql?('semi') ||
+           punctuation.to_s.eql?('none') ||
+           punctuation.to_s.eql?('only')
       raise "Punctuation argument can be only be nil, 'all', 'semi', 'none', or 'only'"
       # Punctuation 'all': Does not remove any punctuation from the result
@@ -25,10 +25,10 @@ module PragmaticTokenizer
       # Punctuation 'only': Removes everything except punctuation. The
       # returned result is an array of only the punctuation.
     end
-    @text = CGI.unescapeHTML(text)
-    @language = language
-    @language_module = Languages.get_language_by_code(language)
-    @punctuation = punctuation
+    @text = CGI.unescapeHTML(text.to_s)
+    @language = language.to_s
+    @language_module = Languages.get_language_by_code(language.to_s)
+    @punctuation = punctuation.to_s
     @remove_stop_words = remove_stop_words
     @expand_contractions = expand_contractions
     @clean = clean
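The added to_s calls let the options arrive as symbols rather than strings; punctuation: :none now passes the guard above instead of raising. A hedged example:

    require 'pragmatic_tokenizer'

    PragmaticTokenizer::Tokenizer.new('Hello, world!', punctuation: :none).tokenize
    # => something like ["hello", "world"]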
@@ -40,7 +40,21 @@ module PragmaticTokenizer

   def tokenize
     return [] unless text
-    downcase_tokens(
+    downcase_tokens(
+      cleaner(
+        remove_short_tokens(
+          delete_numbers(
+            delete_roman_numerals(
+              find_contractions(
+                delete_stop_words(
+                  remove_punctuation(
+                    split_at_middle_period_1(
+                      split_at_middle_period_2(
+                        split_beginning_period(
+                          shift_no_spaces_between_sentences(
+                            split_at_forward_slash(
+                              processor.new(language: language_module).process(text: text)
+    ))))))))))))).reject { |t| t.empty? }
   end

   def domains
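The one-line chain becomes a deeply nested call; reading it inside-out, the processor output flows through the new split/shift passes first and is downcased last. The same order written flat, as a sketch (method names taken from the diff; the reduce line is illustrative, not standalone):

    # Passes in execution order, innermost call first.
    PASS_ORDER = %i[
      split_at_forward_slash shift_no_spaces_between_sentences
      split_beginning_period split_at_middle_period_2 split_at_middle_period_1
      remove_punctuation delete_stop_words find_contractions
      delete_roman_numerals delete_numbers remove_short_tokens
      cleaner downcase_tokens
    ].freeze

    # tokens = PASS_ORDER.reduce(raw_tokens) { |ts, pass| send(pass, ts) }.reject(&:empty?)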
@@ -80,6 +94,35 @@ module PragmaticTokenizer
     Processor
   end

+  def split_at_middle_period_1(tokens)
+    tokens.flat_map { |t| t.include?(".") &&
+      t !~ /(http|https|www)(\.|:)/ &&
+      t.length > 1 &&
+      t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix &&
+      t !~ /\S+(@|@)\S+/ &&
+      language_module::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) ? t.gsub!(/\./, '\1. \2').split(' ').flatten : t }
+  end
+
+  def split_at_middle_period_2(tokens)
+    tokens.flat_map { |t| t.include?(".") &&
+      t !~ /(http|https|www)(\.|:)/ &&
+      t !~ /\.(com|net|org|edu|gov|mil|int)/ &&
+      t !~ /\.[a-z]{2}/ &&
+      t.length > 2 &&
+      t.count(".") == 1 &&
+      t !~ /\d+/ &&
+      !language_module::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) &&
+      t !~ /\S+(@|@)\S+/ ? t.gsub!(/\./, '\1 . \2').split(' ').flatten : t }
+  end
+
+  def split_beginning_period(tokens)
+    tokens.flat_map { |t| t =~ /\A\.[^\.]/ && t.length > 1 ? t.gsub!(/\./, '\1 ').split(' ').flatten : t }
+  end
+
+  def shift_no_spaces_between_sentences(tokens)
+    tokens.flat_map { |t| t.include?("?") && t !~ /(http|https|www)(\.|:)/ && t.length > 1 ? t.gsub!(/\?/, '\1 \2').split(' ').flatten : t }
+  end
+
   def downcase_tokens(tokens)
     return tokens unless downcase
     tokens.map { |t| Unicode::downcase(t) }
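A quirk worth noting in these passes: the replacement strings '\1. \2', '\1 . \2', and '\1 \2' reference capture groups the patterns never define, so the backreferences expand to empty strings; the net effect is padding (or, for /\?/ and /\//, outright removal) followed by a split. In isolation:

    t = 'etc.next'.dup
    t.gsub!(/\./, '\1 . \2').split(' ') # => ["etc", ".", "next"]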
@@ -101,7 +144,13 @@ module PragmaticTokenizer

   def cleaner(tokens)
     return tokens unless clean
-    tokens.
+    tokens.flat_map { |t| t =~ /(\A|\s)\@/ ? t.gsub!(/\@/, '\1 ').split(' ').flatten : t }
+      .flat_map { |t| t =~ /(?<=\s)\_+/ ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
+      .flat_map { |t| t =~ /\_+(?=\s)/ ? t.gsub!(/\_+(?=\s)/, ' \1').split(' ').flatten : t }
+      .flat_map { |t| t =~ /(?<=\A)\_+/ ? t.gsub!(/(?<=\A)\_+/, '\1 ').split(' ').flatten : t }
+      .flat_map { |t| t =~ /\_+(?=\z)/ ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
+      .flat_map { |t| t =~ /\*+/ ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
+      .delete_if { |t| t =~ /\A-+\z/ ||
       PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
       t =~ /\A\.{2,}\z/ || t.include?("\\") ||
       t.length > 50 ||
@@ -135,14 +184,16 @@ module PragmaticTokenizer
     end
   end

+  def split_at_forward_slash(tokens)
+    tokens.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
+  end
+
   def find_contractions(tokens)
     return tokens unless expand_contractions && language_module::CONTRACTIONS
     if downcase
       tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(Unicode::downcase(t)) ? language_module::CONTRACTIONS[Unicode::downcase(t)].split(' ').flatten : t }
-        .flat_map { |t| t.include?("/") ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
     else
       tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(Unicode::downcase(t)) ? language_module::CONTRACTIONS[Unicode::downcase(t)].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
-        .flat_map { |t| t.include?("/") ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
     end
   end
 end
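The slash split that used to be chained onto find_contractions is now its own split_at_forward_slash pass, so slashes are handled even when expand_contractions is off. Hedged examples of both paths (outputs depend on the language data shipped with the gem):

    require 'pragmatic_tokenizer'

    PragmaticTokenizer::Tokenizer.new('and/or').tokenize
    # => something like ["and", "or"]

    PragmaticTokenizer::Tokenizer.new("I can't go", expand_contractions: true).tokenize
    # => something like ["i", "cannot", "go"]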
metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.2.4
+  version: 0.3.0
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-01-
+date: 2016-01-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode