pragmatic_tokenizer 1.4.0 → 1.5.0
- checksums.yaml +4 -4
- data/.rubocop.yml +184 -0
- data/.rubocop_todo.yml +66 -0
- data/README.md +0 -7
- data/Rakefile +1 -1
- data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +2 -2
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +6 -6
- data/lib/pragmatic_tokenizer/languages/arabic.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/catalan.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/common.rb +4 -4
- data/lib/pragmatic_tokenizer/languages/czech.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +94 -23
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +91 -91
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/greek.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/latvian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/romanian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages.rb +28 -28
- data/lib/pragmatic_tokenizer/post_processor.rb +38 -24
- data/lib/pragmatic_tokenizer/pre_processor.rb +148 -118
- data/lib/pragmatic_tokenizer/tokenizer.rb +160 -135
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +1 -0
- data/spec/languages/bulgarian_spec.rb +17 -13
- data/spec/languages/deutsch_spec.rb +110 -86
- data/spec/languages/english_spec.rb +465 -342
- data/spec/languages/french_spec.rb +3 -2
- data/spec/performance_spec.rb +7 -7
- data/spec/pragmatic_tokenizer_spec.rb +8 -8
- metadata +18 -2
data/lib/pragmatic_tokenizer/tokenizer.rb
CHANGED
@@ -47,7 +47,7 @@ module PragmaticTokenizer
     # @option opts [Boolean] :remove_urls - (default: false)
     # @option opts [Boolean] :remove_domains - (default: false)

-    def initialize(text, opts = {})
+    def initialize(text, opts={})
       @text = CGI.unescapeHTML(text)
       @filter_languages = opts[:filter_languages] || []
       @language = opts[:language] || 'en'
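The `:remove_urls` and `:remove_domains` options documented in this hunk are plain constructor flags that default to false. A minimal usage sketch (the input string and option values are illustrative, not taken from the diff):

require 'pragmatic_tokenizer'

# Passing the flags documented above drops URL and bare-domain tokens.
pt = PragmaticTokenizer::Tokenizer.new(
  "Docs live at https://example.com (see also example.org)",
  remove_urls:    true,
  remove_domains: true
)
pt.tokenize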
@@ -62,17 +62,17 @@ module PragmaticTokenizer
         merged_abbreviations = []
         @filter_languages.map { |l| merged_abbreviations << Languages.get_language_by_code(l.to_s)::ABBREVIATIONS.flatten }
         merged_abbreviations << opts[:abbreviations].flatten unless opts[:abbreviations].nil?
-        @abbreviations
+        @abbreviations = merged_abbreviations.flatten

         merged_contractions = {}
         @filter_languages.map { |l| merged_contractions = merged_contractions.merge(Languages.get_language_by_code(l.to_s)::CONTRACTIONS) }
         merged_contractions = merged_contractions.merge(opts[:contractions]) unless opts[:contractions].nil?
-        @contractions
+        @contractions = merged_contractions

         merged_stop_words = []
         @filter_languages.map { |l| merged_stop_words << Languages.get_language_by_code(l.to_s)::STOP_WORDS.flatten }
         merged_stop_words << opts[:stop_words].flatten unless opts[:stop_words].nil?
-        @stop_words
+        @stop_words = merged_stop_words.flatten
       end
       @punctuation = opts[:punctuation] || 'all'
       @numbers = opts[:numbers] || 'all'
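These three blocks are the `:filter_languages` merging: each listed language contributes its built-in ABBREVIATIONS, CONTRACTIONS and STOP_WORDS, and any user-supplied `:abbreviations`, `:contractions` or `:stop_words` are appended on top. A sketch of how that surfaces in the public API (input and values illustrative):

# English and German resources are merged; the custom stop word joins
# the combined STOP_WORDS list before filtering is applied.
pt = PragmaticTokenizer::Tokenizer.new(
  "Don't forget the meeting on Monday",
  filter_languages:  [:en, :de],
  stop_words:        ["monday"],
  remove_stop_words: true
)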
@@ -89,20 +89,20 @@ module PragmaticTokenizer
       @remove_domains = opts[:remove_domains] || false

       unless punctuation.to_s.eql?('all') ||
-        punctuation.to_s.eql?('semi') ||
-        punctuation.to_s.eql?('none') ||
-        punctuation.to_s.eql?('only')
+             punctuation.to_s.eql?('semi') ||
+             punctuation.to_s.eql?('none') ||
+             punctuation.to_s.eql?('only')
         raise "Punctuation argument can be only be nil, 'all', 'semi', 'none', or 'only'"
       end
       unless numbers.to_s.eql?('all') ||
-        numbers.to_s.eql?('semi') ||
-        numbers.to_s.eql?('none') ||
-        numbers.to_s.eql?('only')
+             numbers.to_s.eql?('semi') ||
+             numbers.to_s.eql?('none') ||
+             numbers.to_s.eql?('only')
         raise "Numbers argument can be only be nil, 'all', 'semi', 'none', or 'only'"
       end
       unless mentions.to_s.eql?('keep_original') ||
-        mentions.to_s.eql?('keep_and_clean') ||
-        mentions.to_s.eql?('remove')
+             mentions.to_s.eql?('keep_and_clean') ||
+             mentions.to_s.eql?('remove')
         raise "Mentions argument can be only be nil, 'keep_original', 'keep_and_clean', or 'remove'"
       end
       raise "In Pragmatic Tokenizer text must be a String" unless text.class == String
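The three guards above whitelist the accepted option values and raise a RuntimeError at construction time for anything else, for example (the inputs are illustrative):

PragmaticTokenizer::Tokenizer.new("Hello, world!", punctuation: 'none')  # fine
PragmaticTokenizer::Tokenizer.new("Hello, world!", punctuation: 'most')
# => RuntimeError: Punctuation argument can be only be nil, 'all', 'semi', 'none', or 'only'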
@@ -112,153 +112,178 @@ module PragmaticTokenizer

     def tokenize
       return [] unless text
-
-
-
-
-      tokens.flatten
+      text
+          .scan(/.{,10000}(?=\s|\z)/m)
+          .map { |segment| post_process(pre_process(segment)) }
+          .flatten
     end

     private

-
-
-
-
-
-      classic_filter! if classic_filter
-      process_numbers!
-      remove_short_tokens! if minimum_length > 0
-      process_punctuation!
-      remove_stop_words!(stop_words) if remove_stop_words
-      remove_emoji! if remove_emoji
-      remove_emails! if remove_emails
-      mentions! if mentions
-      hashtags! if hashtags
-      remove_urls! if remove_urls
-      remove_domains! if remove_domains
-      split_long_words! if long_word_split
-      @tokens.reject { |t| t.empty? }
-    end
+    def pre_process(text)
+      text
+          .extend(PragmaticTokenizer::PreProcessor)
+          .pre_process(language: language_module)
+    end

-
-
-
+    def post_process(text)
+      @tokens = PostProcessor.new(text: text, abbreviations: abbreviations).post_process
+      downcase! if downcase
+      expand_contractions!(contractions) if expand_contractions
+      clean! if clean
+      classic_filter! if classic_filter
+      process_numbers!
+      remove_short_tokens! if minimum_length > 0
+      process_punctuation!
+      remove_stop_words!(stop_words) if remove_stop_words
+      remove_emoji! if remove_emoji
+      remove_emails! if remove_emails
+      mentions! if mentions
+      hashtags! if hashtags
+      remove_urls! if remove_urls
+      remove_domains! if remove_domains
+      split_long_words! if long_word_split
+      @tokens.reject(&:empty?)
+    end

-
-
-      @tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').flatten : t }
-      else
-      @tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
+    def downcase!
+      @tokens.map! { |t| Unicode.downcase(t) }
     end
-    end

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def expand_contractions!(contractions)
+      @tokens = if downcase
+                  @tokens.flat_map do |t|
+                    if contractions.key?(Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'")))
+                      contractions[Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))]
+                          .split(' ')
+                          .flatten
+                    else
+                      t
+                    end
+                  end
+                else
+                  @tokens.flat_map do |t|
+                    if contractions.key?(Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'")))
+                      contractions[Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))]
+                          .split(' ')
+                          .each_with_index
+                          .map { |token, i| i.eql?(0) ? Unicode.capitalize(token) : token }
+                          .flatten
+                    else
+                      t
+                    end
+                  end
+                end
+    end

-
-
-
+    def clean!
+      @tokens = @tokens.flat_map { |t| (t !~ /[@@#|#]/ && t =~ /(?<=\s)\_+/) ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
+          .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\_+(?=\s)/) ? t.gsub!(/\_+(?=\s)/, ' \1').split(' ').flatten : t }
+          .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /(?<=\A)\_+/) ? t.gsub!(/(?<=\A)\_+/, '\1 ').split(' ').flatten : t }
+          .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\_+(?=\z)/) ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
+          .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\*+/) ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
+          .map { |t| t.gsub(/[[:cntrl:]]/, '') }
+          .map { |t| t.gsub(/(?<=\A)\:(?=.+)/, '') }
+          .map { |t| t.gsub(/\:(?=\z)/, '') }
+          .map { |t| t.gsub(/(?<=\A)!+(?=.+)/, '') }
+          .map { |t| t !~ /[@@#|#]/ ? t.gsub(/(?<=\D)1+(?=\z)/, '') : t }
+          .map { |t| t.gsub(/!+(?=\z)/, '') }
+          .map { |t| t.gsub(/!+(1*!*)*(?=\z)/, '') }
+          .map { |t| t.gsub(/\u{00AD}/, '') }
+          .map { |t| t.gsub(/\A(-|–)/, '') }
+          .map { |t| t.gsub(/[®©]/, '') }
+          .map { |t| t.gsub(/[\u{1F100}-\u{1F1FF}]/, '') }
+          .delete_if do |t|
+            t =~ /\A-+\z/ ||
+                PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
+                t =~ /\A\.{2,}\z/ || t.include?("\\") ||
+                t.length > 50 ||
+                (t.length > 1 && t =~ /[&*+<=>^|~]/i) ||
+                (t.length == 1 && t =~ /\:/)
+          end
+    end

-
-
-      when 'semi'
-        @tokens.delete_if { |t| t =~ /\A\d+\z/ }
-      when 'none'
-        @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode::downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode::downcase(t)}.") }
-      when 'only'
-        @tokens.delete_if { |t| t =~ /\A\D+\z/ }
+    def classic_filter!
+      @tokens.map! { |t| abbreviations.include?(t.chomp(".")) ? t.delete('.').chomp("'s").chomp("’s").chomp("`s").chomp("́s") : t.chomp("'s").chomp("’s").chomp("`s").chomp("́s") }
     end
-    end

-
-
-
+    def process_numbers!
+      case numbers.to_s
+      when 'semi'
+        @tokens.delete_if { |t| t =~ /\A\d+\z/ }
+      when 'none'
+        @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode.downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode.downcase(t)}.") }
+      when 'only'
+        @tokens.delete_if { |t| t =~ /\A\D+\z/ }
+      end
+    end

-
-
-      when 'semi'
-        @tokens = @tokens - PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
-      when 'none'
-        @tokens = @tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
-      when 'only'
-        @tokens.delete_if { |t| !PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
+    def remove_short_tokens!
+      @tokens.delete_if { |t| t.length < minimum_length }
     end
-    end

-
-
-
-
-
+    def process_punctuation!
+      case punctuation.to_s
+      when 'semi'
+        @tokens -= PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
+      when 'none'
+        @tokens = @tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
+      when 'only'
+        @tokens.delete_if { |t| !PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
+      end
     end
-    end

-
-
-
-
-
-
-
+    def remove_stop_words!(stop_words)
+      if downcase
+        @tokens -= stop_words
+      else
+        @tokens.delete_if { |t| stop_words.include?(Unicode.downcase(t)) }
+      end
+    end

-
-
-
+    def remove_emoji!
+      @tokens.delete_if do |t|
+        t =~ PragmaticTokenizer::Languages::Common::EMOJI_REGEX ||
+            t =~ /\u{2744}\u{FE0F}/ ||
+            t =~ /\u{2744}\u{FE0E}/ ||
+            t =~ /\u{2744}/
+      end
+    end

-
-
-      when 'remove'
-        @tokens.delete_if { |t| t =~ /\A(@|@)/ }
-      when 'keep_and_clean'
-        @tokens.map! { |t| t =~ /\A(@|@)/ ? t.gsub!(/(?<=\A)(@|@)/, '') : t }
+    def remove_emails!
+      @tokens.delete_if { |t| t =~ /\S+(@|@)\S+\.\S+/ }.map { |t| t.chomp('.') }
     end
-    end

-
-
-
-
-
-
-
+    def mentions!
+      case mentions.to_s
+      when 'remove'
+        @tokens.delete_if { |t| t =~ /\A(@|@)/ }
+      when 'keep_and_clean'
+        @tokens.map! { |t| t =~ /\A(@|@)/ ? t.gsub!(/(?<=\A)(@|@)/, '') : t }
+      end
     end
-    end

-
-
-
+    def hashtags!
+      case hashtags.to_s
+      when 'remove'
+        @tokens.delete_if { |t| t =~ /\A(#|#)/ }
+      when 'keep_and_clean'
+        @tokens = @tokens.flat_map { |t| t =~ /\A(#|#)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
+        @tokens.map! { |t| t =~ /\A(#|#)/ ? t.gsub!(/(?<=\A)(#|#)/, '') : t }
+      end
+    end

-
-
-
+    def remove_urls!
+      @tokens.delete_if { |t| t =~ /(http|https)(\.|:)/ }
+    end

-
-
-
-
+    def remove_domains!
+      @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
+    end
+
+    def split_long_words!
+      @tokens.map! { |t| t.length > long_word_split ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
+          .map! { |t| t.length > long_word_split ? t.gsub(/\_/, '\1 \2').split(' ').flatten : t }
+    end
   end
 end
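Two things stand out in this hunk: `tokenize` now scans the input into segments of at most 10,000 characters (split at whitespace) before pre- and post-processing, and each post-processing step has moved into its own small private method. A sketch of how the new mention/hashtag hooks are driven from the public API (input is illustrative; the expected output described in the comments is indicative, not copied from a test run):

pt = PragmaticTokenizer::Tokenizer.new(
  "@alice loves #ruby-gems",
  mentions: 'keep_and_clean',  # mentions! strips the leading "@"
  hashtags: 'keep_and_clean'   # hashtags! strips "#" and splits hyphenated tags
)
pt.tokenize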
data/spec/languages/bulgarian_spec.rb
CHANGED
@@ -4,38 +4,42 @@ describe PragmaticTokenizer do
   context 'Language: Bulgarian (bg)' do
     it 'tokenizes a string #001' do
       text = 'Стойностни, вкл. български и руски'
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        language: 'bg'
+      pt = PragmaticTokenizer::Tokenizer.new(
+          text,
+          language: 'bg'
       )
       expect(pt.tokenize).to eq(["стойностни", ",", "вкл.", "български", "и", "руски"])
     end

     it 'tokenizes a string #002' do
       text = 'Той поставя началото на могъща династия, която управлява в продължение на 150 г. Саргон надделява в двубой с владетеля на град Ур и разширява териториите на държавата си по долното течение на Тигър и Ефрат.'
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        language: 'bg',
-        remove_stop_words: true
+      pt = PragmaticTokenizer::Tokenizer.new(
+          text,
+          language: 'bg',
+          remove_stop_words: true
       )
       expect(pt.tokenize).to eq(["поставя", "началото", "могъща", "династия", ",", "управлява", "продължение", "150", "саргон", "надделява", "двубой", "владетеля", "град", "ур", "разширява", "териториите", "държавата", "долното", "течение", "тигър", "ефрат", "."])
     end

     it 'tokenizes a string #003' do
       text = 'Без български жертви в Париж.'
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        language: 'bg',
-        remove_stop_words: true
+      pt = PragmaticTokenizer::Tokenizer.new(
+          text,
+          language: 'bg',
+          remove_stop_words: true
      )
       expect(pt.tokenize).to eq(["български", "жертви", "париж", "."])
     end

     it 'tokenizes a string #004' do
       text = 'Без български жертви в Париж.'
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        language: 'bg',
-        remove_stop_words: true,
-        downcase: false
+      pt = PragmaticTokenizer::Tokenizer.new(
+          text,
+          language: 'bg',
+          remove_stop_words: true,
+          downcase: false
       )
       expect(pt.tokenize).to eq(["български", "жертви", "Париж", "."])
     end
   end
-end
+end