pragmatic_tokenizer 1.4.0 → 1.5.0

Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +184 -0
  3. data/.rubocop_todo.yml +66 -0
  4. data/README.md +0 -7
  5. data/Rakefile +1 -1
  6. data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +2 -2
  7. data/lib/pragmatic_tokenizer/full_stop_separator.rb +6 -6
  8. data/lib/pragmatic_tokenizer/languages/arabic.rb +1 -1
  9. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
  10. data/lib/pragmatic_tokenizer/languages/catalan.rb +1 -1
  11. data/lib/pragmatic_tokenizer/languages/common.rb +4 -4
  12. data/lib/pragmatic_tokenizer/languages/czech.rb +1 -1
  13. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/deutsch.rb +94 -23
  15. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  16. data/lib/pragmatic_tokenizer/languages/english.rb +91 -91
  17. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/french.rb +1 -1
  19. data/lib/pragmatic_tokenizer/languages/greek.rb +1 -1
  20. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  21. data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
  22. data/lib/pragmatic_tokenizer/languages/latvian.rb +1 -1
  23. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  25. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  26. data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
  27. data/lib/pragmatic_tokenizer/languages/romanian.rb +1 -1
  28. data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
  29. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  30. data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
  31. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  32. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  33. data/lib/pragmatic_tokenizer/languages.rb +28 -28
  34. data/lib/pragmatic_tokenizer/post_processor.rb +38 -24
  35. data/lib/pragmatic_tokenizer/pre_processor.rb +148 -118
  36. data/lib/pragmatic_tokenizer/tokenizer.rb +160 -135
  37. data/lib/pragmatic_tokenizer/version.rb +1 -1
  38. data/pragmatic_tokenizer.gemspec +1 -0
  39. data/spec/languages/bulgarian_spec.rb +17 -13
  40. data/spec/languages/deutsch_spec.rb +110 -86
  41. data/spec/languages/english_spec.rb +465 -342
  42. data/spec/languages/french_spec.rb +3 -2
  43. data/spec/performance_spec.rb +7 -7
  44. data/spec/pragmatic_tokenizer_spec.rb +8 -8
  45. metadata +18 -2
data/lib/pragmatic_tokenizer/tokenizer.rb
@@ -47,7 +47,7 @@ module PragmaticTokenizer
  # @option opts [Boolean] :remove_urls - (default: false)
  # @option opts [Boolean] :remove_domains - (default: false)

- def initialize(text, opts = {})
+ def initialize(text, opts={})
  @text = CGI.unescapeHTML(text)
  @filter_languages = opts[:filter_languages] || []
  @language = opts[:language] || 'en'
@@ -62,17 +62,17 @@ module PragmaticTokenizer
  merged_abbreviations = []
  @filter_languages.map { |l| merged_abbreviations << Languages.get_language_by_code(l.to_s)::ABBREVIATIONS.flatten }
  merged_abbreviations << opts[:abbreviations].flatten unless opts[:abbreviations].nil?
- @abbreviations = merged_abbreviations.flatten
+ @abbreviations = merged_abbreviations.flatten

  merged_contractions = {}
  @filter_languages.map { |l| merged_contractions = merged_contractions.merge(Languages.get_language_by_code(l.to_s)::CONTRACTIONS) }
  merged_contractions = merged_contractions.merge(opts[:contractions]) unless opts[:contractions].nil?
- @contractions = merged_contractions
+ @contractions = merged_contractions

  merged_stop_words = []
  @filter_languages.map { |l| merged_stop_words << Languages.get_language_by_code(l.to_s)::STOP_WORDS.flatten }
  merged_stop_words << opts[:stop_words].flatten unless opts[:stop_words].nil?
- @stop_words = merged_stop_words.flatten
+ @stop_words = merged_stop_words.flatten
  end
  @punctuation = opts[:punctuation] || 'all'
  @numbers = opts[:numbers] || 'all'
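For context, the hunk above merges user-supplied abbreviations, contractions, and stop words with those of every language listed in filter_languages. A minimal usage sketch of that behaviour follows; the sample text, the custom stop word, and the chosen languages are illustrative assumptions, not taken from the gem's specs.

require 'pragmatic_tokenizer'

# Stop words from English and German are merged with the custom list below,
# so matching tokens are dropped when remove_stop_words is enabled.
pt = PragmaticTokenizer::Tokenizer.new(
  "Das Meeting was quite productive today.",
  language:          'en',
  filter_languages:  [:en, :de],
  stop_words:        ["productive"],
  remove_stop_words: true
)
pt.tokenize
# => word tokens with English/German stop words and "productive" filtered out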
@@ -89,20 +89,20 @@ module PragmaticTokenizer
  @remove_domains = opts[:remove_domains] || false

  unless punctuation.to_s.eql?('all') ||
- punctuation.to_s.eql?('semi') ||
- punctuation.to_s.eql?('none') ||
- punctuation.to_s.eql?('only')
+ punctuation.to_s.eql?('semi') ||
+ punctuation.to_s.eql?('none') ||
+ punctuation.to_s.eql?('only')
  raise "Punctuation argument can be only be nil, 'all', 'semi', 'none', or 'only'"
  end
  unless numbers.to_s.eql?('all') ||
- numbers.to_s.eql?('semi') ||
- numbers.to_s.eql?('none') ||
- numbers.to_s.eql?('only')
+ numbers.to_s.eql?('semi') ||
+ numbers.to_s.eql?('none') ||
+ numbers.to_s.eql?('only')
  raise "Numbers argument can be only be nil, 'all', 'semi', 'none', or 'only'"
  end
  unless mentions.to_s.eql?('keep_original') ||
- mentions.to_s.eql?('keep_and_clean') ||
- mentions.to_s.eql?('remove')
+ mentions.to_s.eql?('keep_and_clean') ||
+ mentions.to_s.eql?('remove')
  raise "Mentions argument can be only be nil, 'keep_original', 'keep_and_clean', or 'remove'"
  end
  raise "In Pragmatic Tokenizer text must be a String" unless text.class == String
@@ -112,153 +112,178 @@ module PragmaticTokenizer

  def tokenize
  return [] unless text
- tokens = []
- text.scan(/.{,10000}(?=\s|\z)/m).each do |segment|
- tokens << post_process(PreProcessor.new(language: language_module).pre_process(text: segment))
- end
- tokens.flatten
+ text
+ .scan(/.{,10000}(?=\s|\z)/m)
+ .map { |segment| post_process(pre_process(segment)) }
+ .flatten
  end

  private

- def post_process(text)
- @tokens = PostProcessor.new(text: text, abbreviations: abbreviations).post_process
- downcase! if downcase
- expand_contractions!(contractions) if expand_contractions
- clean! if clean
- classic_filter! if classic_filter
- process_numbers!
- remove_short_tokens! if minimum_length > 0
- process_punctuation!
- remove_stop_words!(stop_words) if remove_stop_words
- remove_emoji! if remove_emoji
- remove_emails! if remove_emails
- mentions! if mentions
- hashtags! if hashtags
- remove_urls! if remove_urls
- remove_domains! if remove_domains
- split_long_words! if long_word_split
- @tokens.reject { |t| t.empty? }
- end
+ def pre_process(text)
+ text
+ .extend(PragmaticTokenizer::PreProcessor)
+ .pre_process(language: language_module)
+ end

- def downcase!
- @tokens.map! { |t| Unicode::downcase(t) }
- end
+ def post_process(text)
+ @tokens = PostProcessor.new(text: text, abbreviations: abbreviations).post_process
+ downcase! if downcase
+ expand_contractions!(contractions) if expand_contractions
+ clean! if clean
+ classic_filter! if classic_filter
+ process_numbers!
+ remove_short_tokens! if minimum_length > 0
+ process_punctuation!
+ remove_stop_words!(stop_words) if remove_stop_words
+ remove_emoji! if remove_emoji
+ remove_emails! if remove_emails
+ mentions! if mentions
+ hashtags! if hashtags
+ remove_urls! if remove_urls
+ remove_domains! if remove_domains
+ split_long_words! if long_word_split
+ @tokens.reject(&:empty?)
+ end

- def expand_contractions!(contractions)
- if downcase
- @tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').flatten : t }
- else
- @tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
+ def downcase!
+ @tokens.map! { |t| Unicode.downcase(t) }
  end
- end

- def clean!
- @tokens = @tokens.flat_map { |t| (t !~ /[@@#|#]/ && t =~ /(?<=\s)\_+/) ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
- .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\_+(?=\s)/) ? t.gsub!(/\_+(?=\s)/, ' \1').split(' ').flatten : t }
- .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /(?<=\A)\_+/) ? t.gsub!(/(?<=\A)\_+/, '\1 ').split(' ').flatten : t }
- .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\_+(?=\z)/) ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
- .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\*+/) ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
- .map { |t| t.gsub(/[[:cntrl:]]/, '') }
- .map { |t| t.gsub(/(?<=\A)\:(?=.+)/, '') }
- .map { |t| t.gsub(/\:(?=\z)/, '') }
- .map { |t| t.gsub(/(?<=\A)!+(?=.+)/, '') }
- .map { |t| t !~ /[@@#|#]/ ? t.gsub(/(?<=\D)1+(?=\z)/, '') : t }
- .map { |t| t.gsub(/!+(?=\z)/, '') }
- .map { |t| t.gsub(/!+(1*!*)*(?=\z)/, '') }
- .map { |t| t.gsub(/\u{00AD}/, '') }
- .map { |t| t.gsub(/\A(-|–)/, '') }
- .map { |t| t.gsub(/[®©]/, '') }
- .map { |t| t.gsub(/[\u{1F100}-\u{1F1FF}]/, '') }
- .delete_if { |t| t =~ /\A-+\z/ ||
- PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
- t =~ /\A\.{2,}\z/ || t.include?("\\") ||
- t.length > 50 ||
- (t.length > 1 && t =~ /[&*+<=>^|~]/i) ||
- (t.length == 1 && t =~ /\:/)
- }
- end
+ def expand_contractions!(contractions)
+ @tokens = if downcase
+ @tokens.flat_map do |t|
+ if contractions.key?(Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'")))
+ contractions[Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))]
+ .split(' ')
+ .flatten
+ else
+ t
+ end
+ end
+ else
+ @tokens.flat_map do |t|
+ if contractions.key?(Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'")))
+ contractions[Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))]
+ .split(' ')
+ .each_with_index
+ .map { |token, i| i.eql?(0) ? Unicode.capitalize(token) : token }
+ .flatten
+ else
+ t
+ end
+ end
+ end
+ end

- def classic_filter!
- @tokens.map! { |t| abbreviations.include?(t.chomp(".")) ? t.gsub('.', '').chomp("'s").chomp("’s").chomp("`s").chomp("́s") : t.chomp("'s").chomp("’s").chomp("`s").chomp("́s") }
- end
+ def clean!
+ @tokens = @tokens.flat_map { |t| (t !~ /[@@#|#]/ && t =~ /(?<=\s)\_+/) ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
+ .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\_+(?=\s)/) ? t.gsub!(/\_+(?=\s)/, ' \1').split(' ').flatten : t }
+ .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /(?<=\A)\_+/) ? t.gsub!(/(?<=\A)\_+/, '\1 ').split(' ').flatten : t }
+ .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\_+(?=\z)/) ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
+ .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\*+/) ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
+ .map { |t| t.gsub(/[[:cntrl:]]/, '') }
+ .map { |t| t.gsub(/(?<=\A)\:(?=.+)/, '') }
+ .map { |t| t.gsub(/\:(?=\z)/, '') }
+ .map { |t| t.gsub(/(?<=\A)!+(?=.+)/, '') }
+ .map { |t| t !~ /[@@#|#]/ ? t.gsub(/(?<=\D)1+(?=\z)/, '') : t }
+ .map { |t| t.gsub(/!+(?=\z)/, '') }
+ .map { |t| t.gsub(/!+(1*!*)*(?=\z)/, '') }
+ .map { |t| t.gsub(/\u{00AD}/, '') }
+ .map { |t| t.gsub(/\A(-|–)/, '') }
+ .map { |t| t.gsub(/[®©]/, '') }
+ .map { |t| t.gsub(/[\u{1F100}-\u{1F1FF}]/, '') }
+ .delete_if do |t|
+ t =~ /\A-+\z/ ||
+ PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
+ t =~ /\A\.{2,}\z/ || t.include?("\\") ||
+ t.length > 50 ||
+ (t.length > 1 && t =~ /[&*+<=>^|~]/i) ||
+ (t.length == 1 && t =~ /\:/)
+ end
+ end

- def process_numbers!
- case numbers.to_s
- when 'semi'
- @tokens.delete_if { |t| t =~ /\A\d+\z/ }
- when 'none'
- @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode::downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode::downcase(t)}.") }
- when 'only'
- @tokens.delete_if { |t| t =~ /\A\D+\z/ }
+ def classic_filter!
+ @tokens.map! { |t| abbreviations.include?(t.chomp(".")) ? t.delete('.').chomp("'s").chomp("’s").chomp("`s").chomp("́s") : t.chomp("'s").chomp("’s").chomp("`s").chomp("́s") }
  end
- end

- def remove_short_tokens!
- @tokens.delete_if { |t| t.length < minimum_length }
- end
+ def process_numbers!
+ case numbers.to_s
+ when 'semi'
+ @tokens.delete_if { |t| t =~ /\A\d+\z/ }
+ when 'none'
+ @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode.downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode.downcase(t)}.") }
+ when 'only'
+ @tokens.delete_if { |t| t =~ /\A\D+\z/ }
+ end
+ end

- def process_punctuation!
- case punctuation.to_s
- when 'semi'
- @tokens = @tokens - PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
- when 'none'
- @tokens = @tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
- when 'only'
- @tokens.delete_if { |t| !PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
+ def remove_short_tokens!
+ @tokens.delete_if { |t| t.length < minimum_length }
  end
- end

- def remove_stop_words!(stop_words)
- if downcase
- @tokens = @tokens - stop_words
- else
- @tokens.delete_if { |t| stop_words.include?(Unicode::downcase(t)) }
+ def process_punctuation!
+ case punctuation.to_s
+ when 'semi'
+ @tokens -= PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
+ when 'none'
+ @tokens = @tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
+ when 'only'
+ @tokens.delete_if { |t| !PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
+ end
  end
- end

- def remove_emoji!
- @tokens.delete_if { |t| t =~ PragmaticTokenizer::Languages::Common::EMOJI_REGEX ||
- t =~ /\u{2744}\u{FE0F}/ ||
- t =~ /\u{2744}\u{FE0E}/ ||
- t =~ /\u{2744}/
- }
- end
+ def remove_stop_words!(stop_words)
+ if downcase
+ @tokens -= stop_words
+ else
+ @tokens.delete_if { |t| stop_words.include?(Unicode.downcase(t)) }
+ end
+ end

- def remove_emails!
- @tokens.delete_if { |t| t =~ /\S+(@|@)\S+\.\S+/ }.map { |t| t.chomp('.') }
- end
+ def remove_emoji!
+ @tokens.delete_if do |t|
+ t =~ PragmaticTokenizer::Languages::Common::EMOJI_REGEX ||
+ t =~ /\u{2744}\u{FE0F}/ ||
+ t =~ /\u{2744}\u{FE0E}/ ||
+ t =~ /\u{2744}/
+ end
+ end

- def mentions!
- case mentions.to_s
- when 'remove'
- @tokens.delete_if { |t| t =~ /\A(@|@)/ }
- when 'keep_and_clean'
- @tokens.map! { |t| t =~ /\A(@|@)/ ? t.gsub!(/(?<=\A)(@|@)/, '') : t }
+ def remove_emails!
+ @tokens.delete_if { |t| t =~ /\S+(@|@)\S+\.\S+/ }.map { |t| t.chomp('.') }
  end
- end

- def hashtags!
- case hashtags.to_s
- when 'remove'
- @tokens.delete_if { |t| t =~ /\A(#|#)/ }
- when 'keep_and_clean'
- @tokens = @tokens.flat_map { |t| t =~ /\A(#|#)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
- @tokens.map! { |t| t =~ /\A(#|#)/ ? t.gsub!(/(?<=\A)(#|#)/, '') : t }
+ def mentions!
+ case mentions.to_s
+ when 'remove'
+ @tokens.delete_if { |t| t =~ /\A(@|@)/ }
+ when 'keep_and_clean'
+ @tokens.map! { |t| t =~ /\A(@|@)/ ? t.gsub!(/(?<=\A)(@|@)/, '') : t }
+ end
  end
- end

- def remove_urls!
- @tokens.delete_if { |t| t =~ /(http|https)(\.|:)/ }
- end
+ def hashtags!
+ case hashtags.to_s
+ when 'remove'
+ @tokens.delete_if { |t| t =~ /\A(#|#)/ }
+ when 'keep_and_clean'
+ @tokens = @tokens.flat_map { |t| t =~ /\A(#|#)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
+ @tokens.map! { |t| t =~ /\A(#|#)/ ? t.gsub!(/(?<=\A)(#|#)/, '') : t }
+ end
+ end

- def remove_domains!
- @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
- end
+ def remove_urls!
+ @tokens.delete_if { |t| t =~ /(http|https)(\.|:)/ }
+ end

- def split_long_words!
- @tokens.map! { |t| t.length > long_word_split ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
- .map! { |t| t.length > long_word_split ? t.gsub(/\_/, '\1 \2').split(' ').flatten : t }
- end
+ def remove_domains!
+ @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
+ end
+
+ def split_long_words!
+ @tokens.map! { |t| t.length > long_word_split ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
+ .map! { |t| t.length > long_word_split ? t.gsub(/\_/, '\1 \2').split(' ').flatten : t }
+ end
  end
  end
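Taken together, the tokenizer's public interface is unchanged by this refactor: build with Tokenizer.new(text, opts) and call tokenize. A minimal usage sketch with a few of the options validated above follows; the sample text is an illustrative assumption and the exact token list depends on the post-processing rules shown in the diff.

require 'pragmatic_tokenizer'

pt = PragmaticTokenizer::Tokenizer.new(
  "Check out https://example.com @someone #GreatNews!",
  language:    'en',
  punctuation: 'none',          # must be 'all', 'semi', 'none', or 'only'
  mentions:    'remove',        # 'keep_original', 'keep_and_clean', or 'remove'
  hashtags:    'keep_and_clean',
  remove_urls: true
)
pt.tokenize
# => roughly ["check", "out", "greatnews"]: the URL, the mention, and
#    standalone punctuation are dropped, and the hashtag loses its '#'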
data/lib/pragmatic_tokenizer/version.rb
@@ -1,3 +1,3 @@
  module PragmaticTokenizer
- VERSION = "1.4.0"
+ VERSION = "1.5.0".freeze
  end
data/pragmatic_tokenizer.gemspec
@@ -23,4 +23,5 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec"
  spec.add_development_dependency "stackprof"
+ spec.add_development_dependency "rubocop"
  end
data/spec/languages/bulgarian_spec.rb
@@ -4,38 +4,42 @@ describe PragmaticTokenizer do
  context 'Language: Bulgarian (bg)' do
  it 'tokenizes a string #001' do
  text = 'Стойностни, вкл. български и руски'
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'bg'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'bg'
  )
  expect(pt.tokenize).to eq(["стойностни", ",", "вкл.", "български", "и", "руски"])
  end

  it 'tokenizes a string #002' do
  text = 'Той поставя началото на могъща династия, която управлява в продължение на 150 г. Саргон надделява в двубой с владетеля на град Ур и разширява териториите на държавата си по долното течение на Тигър и Ефрат.'
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'bg',
- remove_stop_words: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'bg',
+ remove_stop_words: true
  )
  expect(pt.tokenize).to eq(["поставя", "началото", "могъща", "династия", ",", "управлява", "продължение", "150", "саргон", "надделява", "двубой", "владетеля", "град", "ур", "разширява", "териториите", "държавата", "долното", "течение", "тигър", "ефрат", "."])
  end

  it 'tokenizes a string #003' do
  text = 'Без български жертви в Париж.'
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'bg',
- remove_stop_words: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'bg',
+ remove_stop_words: true
  )
  expect(pt.tokenize).to eq(["български", "жертви", "париж", "."])
  end

  it 'tokenizes a string #004' do
  text = 'Без български жертви в Париж.'
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'bg',
- remove_stop_words: true,
- downcase: false
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'bg',
+ remove_stop_words: true,
+ downcase: false
  )
  expect(pt.tokenize).to eq(["български", "жертви", "Париж", "."])
  end
  end
- end
+ end