pragmatic_tokenizer 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +184 -0
- data/.rubocop_todo.yml +66 -0
- data/README.md +0 -7
- data/Rakefile +1 -1
- data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +2 -2
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +6 -6
- data/lib/pragmatic_tokenizer/languages/arabic.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/catalan.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/common.rb +4 -4
- data/lib/pragmatic_tokenizer/languages/czech.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +94 -23
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +91 -91
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/greek.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/latvian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/romanian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages.rb +28 -28
- data/lib/pragmatic_tokenizer/post_processor.rb +38 -24
- data/lib/pragmatic_tokenizer/pre_processor.rb +148 -118
- data/lib/pragmatic_tokenizer/tokenizer.rb +160 -135
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +1 -0
- data/spec/languages/bulgarian_spec.rb +17 -13
- data/spec/languages/deutsch_spec.rb +110 -86
- data/spec/languages/english_spec.rb +465 -342
- data/spec/languages/french_spec.rb +3 -2
- data/spec/performance_spec.rb +7 -7
- data/spec/pragmatic_tokenizer_spec.rb +8 -8
- metadata +18 -2
data/lib/pragmatic_tokenizer/tokenizer.rb
CHANGED
```diff
@@ -47,7 +47,7 @@ module PragmaticTokenizer
     # @option opts [Boolean] :remove_urls - (default: false)
     # @option opts [Boolean] :remove_domains - (default: false)

-    def initialize(text, opts
+    def initialize(text, opts={})
       @text = CGI.unescapeHTML(text)
       @filter_languages = opts[:filter_languages] || []
       @language = opts[:language] || 'en'
```
```diff
@@ -62,17 +62,17 @@ module PragmaticTokenizer
         merged_abbreviations = []
         @filter_languages.map { |l| merged_abbreviations << Languages.get_language_by_code(l.to_s)::ABBREVIATIONS.flatten }
         merged_abbreviations << opts[:abbreviations].flatten unless opts[:abbreviations].nil?
-        @abbreviations
+        @abbreviations = merged_abbreviations.flatten

         merged_contractions = {}
         @filter_languages.map { |l| merged_contractions = merged_contractions.merge(Languages.get_language_by_code(l.to_s)::CONTRACTIONS) }
         merged_contractions = merged_contractions.merge(opts[:contractions]) unless opts[:contractions].nil?
-        @contractions
+        @contractions = merged_contractions

         merged_stop_words = []
         @filter_languages.map { |l| merged_stop_words << Languages.get_language_by_code(l.to_s)::STOP_WORDS.flatten }
         merged_stop_words << opts[:stop_words].flatten unless opts[:stop_words].nil?
-        @stop_words
+        @stop_words = merged_stop_words.flatten
       end
       @punctuation = opts[:punctuation] || 'all'
       @numbers = opts[:numbers] || 'all'
```
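The assignments restored above merge the per-language abbreviation, contraction, and stop-word lists for every language in `:filter_languages` with any user-supplied lists before tokenization. A minimal usage sketch, not part of the diff: the sample text and values are illustrative, and the exact option keys (`:contractions`, `:stop_words`, `:expand_contractions`) are assumed from the readers used elsewhere in this file.

```ruby
require 'pragmatic_tokenizer'

# Sketch of the merging behaviour shown above: English and German lists are
# combined with the user-supplied contractions and stop words.
pt = PragmaticTokenizer::Tokenizer.new(
  "Can't wait – z.B. heute bleiben wir zu Hause.",
  language:            'en',
  filter_languages:    [:en, :de],
  contractions:        { "can't" => "can not" },
  stop_words:          ["heute"],
  expand_contractions: true,
  remove_stop_words:   true
)
tokens = pt.tokenize
```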
```diff
@@ -89,20 +89,20 @@ module PragmaticTokenizer
       @remove_domains = opts[:remove_domains] || false

       unless punctuation.to_s.eql?('all') ||
-
-
-
+             punctuation.to_s.eql?('semi') ||
+             punctuation.to_s.eql?('none') ||
+             punctuation.to_s.eql?('only')
         raise "Punctuation argument can be only be nil, 'all', 'semi', 'none', or 'only'"
       end
       unless numbers.to_s.eql?('all') ||
-
-
-
+             numbers.to_s.eql?('semi') ||
+             numbers.to_s.eql?('none') ||
+             numbers.to_s.eql?('only')
         raise "Numbers argument can be only be nil, 'all', 'semi', 'none', or 'only'"
       end
       unless mentions.to_s.eql?('keep_original') ||
-
-
+             mentions.to_s.eql?('keep_and_clean') ||
+             mentions.to_s.eql?('remove')
         raise "Mentions argument can be only be nil, 'keep_original', 'keep_and_clean', or 'remove'"
       end
       raise "In Pragmatic Tokenizer text must be a String" unless text.class == String
```
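The validation above limits `:punctuation` and `:numbers` to `'all'`, `'semi'`, `'none'`, or `'only'`, and `:mentions` to `'keep_original'`, `'keep_and_clean'`, or `'remove'`; any other value raises at construction time. A minimal constructor sketch, not taken from the diff, using only the option values the checks accept (sample text and combination are illustrative).

```ruby
require 'pragmatic_tokenizer'

# Illustrative constructor call; an unsupported option value fails fast with
# a RuntimeError carrying the message shown in the diff above.
pt = PragmaticTokenizer::Tokenizer.new(
  "Thanks @support!!! Order 42 shipped – see http://example.com.",
  language:    'en',
  punctuation: 'none',    # 'all' | 'semi' | 'none' | 'only'
  numbers:     'semi',    # 'all' | 'semi' | 'none' | 'only'
  mentions:    'remove',  # 'keep_original' | 'keep_and_clean' | 'remove'
  remove_urls: true
)
tokens = pt.tokenize

# PragmaticTokenizer::Tokenizer.new("text", punctuation: 'most')
# # => RuntimeError: Punctuation argument can be only be nil, 'all', 'semi', 'none', or 'only'
```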
```diff
@@ -112,153 +112,178 @@ module PragmaticTokenizer

     def tokenize
       return [] unless text
-
-
-
-
-      tokens.flatten
+      text
+        .scan(/.{,10000}(?=\s|\z)/m)
+        .map { |segment| post_process(pre_process(segment)) }
+        .flatten
     end

     private

-
-
-
-
-
-      classic_filter! if classic_filter
-      process_numbers!
-      remove_short_tokens! if minimum_length > 0
-      process_punctuation!
-      remove_stop_words!(stop_words) if remove_stop_words
-      remove_emoji! if remove_emoji
-      remove_emails! if remove_emails
-      mentions! if mentions
-      hashtags! if hashtags
-      remove_urls! if remove_urls
-      remove_domains! if remove_domains
-      split_long_words! if long_word_split
-      @tokens.reject { |t| t.empty? }
-    end
+    def pre_process(text)
+      text
+        .extend(PragmaticTokenizer::PreProcessor)
+        .pre_process(language: language_module)
+    end

-
-
-
+    def post_process(text)
+      @tokens = PostProcessor.new(text: text, abbreviations: abbreviations).post_process
+      downcase! if downcase
+      expand_contractions!(contractions) if expand_contractions
+      clean! if clean
+      classic_filter! if classic_filter
+      process_numbers!
+      remove_short_tokens! if minimum_length > 0
+      process_punctuation!
+      remove_stop_words!(stop_words) if remove_stop_words
+      remove_emoji! if remove_emoji
+      remove_emails! if remove_emails
+      mentions! if mentions
+      hashtags! if hashtags
+      remove_urls! if remove_urls
+      remove_domains! if remove_domains
+      split_long_words! if long_word_split
+      @tokens.reject(&:empty?)
+    end

-
-
-      @tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').flatten : t }
-      else
-        @tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
+    def downcase!
+      @tokens.map! { |t| Unicode.downcase(t) }
     end
-    end

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def expand_contractions!(contractions)
+      @tokens = if downcase
+                  @tokens.flat_map do |t|
+                    if contractions.key?(Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'")))
+                      contractions[Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))]
+                        .split(' ')
+                        .flatten
+                    else
+                      t
+                    end
+                  end
+                else
+                  @tokens.flat_map do |t|
+                    if contractions.key?(Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'")))
+                      contractions[Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))]
+                        .split(' ')
+                        .each_with_index
+                        .map { |token, i| i.eql?(0) ? Unicode.capitalize(token) : token }
+                        .flatten
+                    else
+                      t
+                    end
+                  end
+                end
+    end

-
-
-
+    def clean!
+      @tokens = @tokens.flat_map { |t| (t !~ /[@@#|#]/ && t =~ /(?<=\s)\_+/) ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
+                       .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\_+(?=\s)/) ? t.gsub!(/\_+(?=\s)/, ' \1').split(' ').flatten : t }
+                       .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /(?<=\A)\_+/) ? t.gsub!(/(?<=\A)\_+/, '\1 ').split(' ').flatten : t }
+                       .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\_+(?=\z)/) ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
+                       .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\*+/) ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
+                       .map { |t| t.gsub(/[[:cntrl:]]/, '') }
+                       .map { |t| t.gsub(/(?<=\A)\:(?=.+)/, '') }
+                       .map { |t| t.gsub(/\:(?=\z)/, '') }
+                       .map { |t| t.gsub(/(?<=\A)!+(?=.+)/, '') }
+                       .map { |t| t !~ /[@@#|#]/ ? t.gsub(/(?<=\D)1+(?=\z)/, '') : t }
+                       .map { |t| t.gsub(/!+(?=\z)/, '') }
+                       .map { |t| t.gsub(/!+(1*!*)*(?=\z)/, '') }
+                       .map { |t| t.gsub(/\u{00AD}/, '') }
+                       .map { |t| t.gsub(/\A(-|–)/, '') }
+                       .map { |t| t.gsub(/[®©]/, '') }
+                       .map { |t| t.gsub(/[\u{1F100}-\u{1F1FF}]/, '') }
+                       .delete_if do |t|
+                         t =~ /\A-+\z/ ||
+                             PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
+                             t =~ /\A\.{2,}\z/ || t.include?("\\") ||
+                             t.length > 50 ||
+                             (t.length > 1 && t =~ /[&*+<=>^|~]/i) ||
+                             (t.length == 1 && t =~ /\:/)
+                       end
+    end

-
-
-      when 'semi'
-        @tokens.delete_if { |t| t =~ /\A\d+\z/ }
-      when 'none'
-        @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode::downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode::downcase(t)}.") }
-      when 'only'
-        @tokens.delete_if { |t| t =~ /\A\D+\z/ }
+    def classic_filter!
+      @tokens.map! { |t| abbreviations.include?(t.chomp(".")) ? t.delete('.').chomp("'s").chomp("’s").chomp("`s").chomp("́s") : t.chomp("'s").chomp("’s").chomp("`s").chomp("́s") }
     end
-    end

-
-
-
+    def process_numbers!
+      case numbers.to_s
+      when 'semi'
+        @tokens.delete_if { |t| t =~ /\A\d+\z/ }
+      when 'none'
+        @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode.downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode.downcase(t)}.") }
+      when 'only'
+        @tokens.delete_if { |t| t =~ /\A\D+\z/ }
+      end
+    end

-
-
-      when 'semi'
-        @tokens = @tokens - PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
-      when 'none'
-        @tokens = @tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
-      when 'only'
-        @tokens.delete_if { |t| !PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
+    def remove_short_tokens!
+      @tokens.delete_if { |t| t.length < minimum_length }
     end
-    end

-
-
-
-
-
+    def process_punctuation!
+      case punctuation.to_s
+      when 'semi'
+        @tokens -= PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
+      when 'none'
+        @tokens = @tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
+      when 'only'
+        @tokens.delete_if { |t| !PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
+      end
     end
-    end

-
-
-
-
-
-
-
+    def remove_stop_words!(stop_words)
+      if downcase
+        @tokens -= stop_words
+      else
+        @tokens.delete_if { |t| stop_words.include?(Unicode.downcase(t)) }
+      end
+    end

-
-
-
+    def remove_emoji!
+      @tokens.delete_if do |t|
+        t =~ PragmaticTokenizer::Languages::Common::EMOJI_REGEX ||
+            t =~ /\u{2744}\u{FE0F}/ ||
+            t =~ /\u{2744}\u{FE0E}/ ||
+            t =~ /\u{2744}/
+      end
+    end

-
-
-      when 'remove'
-        @tokens.delete_if { |t| t =~ /\A(@|@)/ }
-      when 'keep_and_clean'
-        @tokens.map! { |t| t =~ /\A(@|@)/ ? t.gsub!(/(?<=\A)(@|@)/, '') : t }
+    def remove_emails!
+      @tokens.delete_if { |t| t =~ /\S+(@|@)\S+\.\S+/ }.map { |t| t.chomp('.') }
     end
-    end

-
-
-
-
-
-
-
+    def mentions!
+      case mentions.to_s
+      when 'remove'
+        @tokens.delete_if { |t| t =~ /\A(@|@)/ }
+      when 'keep_and_clean'
+        @tokens.map! { |t| t =~ /\A(@|@)/ ? t.gsub!(/(?<=\A)(@|@)/, '') : t }
+      end
     end
-    end

-
-
-
+    def hashtags!
+      case hashtags.to_s
+      when 'remove'
+        @tokens.delete_if { |t| t =~ /\A(#|#)/ }
+      when 'keep_and_clean'
+        @tokens = @tokens.flat_map { |t| t =~ /\A(#|#)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
+        @tokens.map! { |t| t =~ /\A(#|#)/ ? t.gsub!(/(?<=\A)(#|#)/, '') : t }
+      end
+    end

-
-
-
+    def remove_urls!
+      @tokens.delete_if { |t| t =~ /(http|https)(\.|:)/ }
+    end

-
-
-
-
+    def remove_domains!
+      @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
+    end
+
+    def split_long_words!
+      @tokens.map! { |t| t.length > long_word_split ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
+             .map! { |t| t.length > long_word_split ? t.gsub(/\_/, '\1 \2').split(' ').flatten : t }
+    end
   end
 end
```
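The rewritten `#tokenize` above no longer pre- and post-processes the whole input at once: it first splits the text into segments of at most 10,000 characters that end at a whitespace boundary (or at the end of the string), then runs each segment through the pipeline and flattens the results. A small sketch of that `scan` pattern, not from the diff itself; the limit of 9 and the sample text are illustrative so the effect is visible.

```ruby
# Same pattern as in #tokenize above, with a smaller limit for illustration:
# take up to `limit` characters, but only cut where whitespace or the end of
# the string follows, so segments end on word boundaries.
text  = "one two three four five"
limit = 9
segments = text.scan(/.{,#{limit}}(?=\s|\z)/m).reject(&:empty?)
# reject(&:empty?) drops the zero-length match scan yields at the very end.
# => ["one two", " three", " four", " five"]
```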
data/spec/languages/bulgarian_spec.rb
CHANGED
```diff
@@ -4,38 +4,42 @@ describe PragmaticTokenizer do
   context 'Language: Bulgarian (bg)' do
     it 'tokenizes a string #001' do
       text = 'Стойностни, вкл. български и руски'
-      pt = PragmaticTokenizer::Tokenizer.new(
-
+      pt = PragmaticTokenizer::Tokenizer.new(
+          text,
+          language: 'bg'
       )
       expect(pt.tokenize).to eq(["стойностни", ",", "вкл.", "български", "и", "руски"])
     end

     it 'tokenizes a string #002' do
       text = 'Той поставя началото на могъща династия, която управлява в продължение на 150 г. Саргон надделява в двубой с владетеля на град Ур и разширява териториите на държавата си по долното течение на Тигър и Ефрат.'
-      pt = PragmaticTokenizer::Tokenizer.new(
-
-
+      pt = PragmaticTokenizer::Tokenizer.new(
+          text,
+          language: 'bg',
+          remove_stop_words: true
       )
       expect(pt.tokenize).to eq(["поставя", "началото", "могъща", "династия", ",", "управлява", "продължение", "150", "саргон", "надделява", "двубой", "владетеля", "град", "ур", "разширява", "териториите", "държавата", "долното", "течение", "тигър", "ефрат", "."])
     end

     it 'tokenizes a string #003' do
       text = 'Без български жертви в Париж.'
-      pt = PragmaticTokenizer::Tokenizer.new(
-
-
+      pt = PragmaticTokenizer::Tokenizer.new(
+          text,
+          language: 'bg',
+          remove_stop_words: true
       )
       expect(pt.tokenize).to eq(["български", "жертви", "париж", "."])
     end

     it 'tokenizes a string #004' do
       text = 'Без български жертви в Париж.'
-      pt = PragmaticTokenizer::Tokenizer.new(
-
-
-
+      pt = PragmaticTokenizer::Tokenizer.new(
+          text,
+          language: 'bg',
+          remove_stop_words: true,
+          downcase: false
       )
       expect(pt.tokenize).to eq(["български", "жертви", "Париж", "."])
     end
   end
-end
+end
```