pragmatic_tokenizer 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +184 -0
  3. data/.rubocop_todo.yml +66 -0
  4. data/README.md +0 -7
  5. data/Rakefile +1 -1
  6. data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +2 -2
  7. data/lib/pragmatic_tokenizer/full_stop_separator.rb +6 -6
  8. data/lib/pragmatic_tokenizer/languages/arabic.rb +1 -1
  9. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
  10. data/lib/pragmatic_tokenizer/languages/catalan.rb +1 -1
  11. data/lib/pragmatic_tokenizer/languages/common.rb +4 -4
  12. data/lib/pragmatic_tokenizer/languages/czech.rb +1 -1
  13. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/deutsch.rb +94 -23
  15. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  16. data/lib/pragmatic_tokenizer/languages/english.rb +91 -91
  17. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/french.rb +1 -1
  19. data/lib/pragmatic_tokenizer/languages/greek.rb +1 -1
  20. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  21. data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
  22. data/lib/pragmatic_tokenizer/languages/latvian.rb +1 -1
  23. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  25. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  26. data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
  27. data/lib/pragmatic_tokenizer/languages/romanian.rb +1 -1
  28. data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
  29. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  30. data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
  31. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  32. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  33. data/lib/pragmatic_tokenizer/languages.rb +28 -28
  34. data/lib/pragmatic_tokenizer/post_processor.rb +38 -24
  35. data/lib/pragmatic_tokenizer/pre_processor.rb +148 -118
  36. data/lib/pragmatic_tokenizer/tokenizer.rb +160 -135
  37. data/lib/pragmatic_tokenizer/version.rb +1 -1
  38. data/pragmatic_tokenizer.gemspec +1 -0
  39. data/spec/languages/bulgarian_spec.rb +17 -13
  40. data/spec/languages/deutsch_spec.rb +110 -86
  41. data/spec/languages/english_spec.rb +465 -342
  42. data/spec/languages/french_spec.rb +3 -2
  43. data/spec/performance_spec.rb +7 -7
  44. data/spec/pragmatic_tokenizer_spec.rb +8 -8
  45. metadata +18 -2
data/lib/pragmatic_tokenizer/tokenizer.rb
@@ -47,7 +47,7 @@ module PragmaticTokenizer
  # @option opts [Boolean] :remove_urls - (default: false)
  # @option opts [Boolean] :remove_domains - (default: false)

- def initialize(text, opts = {})
+ def initialize(text, opts={})
  @text = CGI.unescapeHTML(text)
  @filter_languages = opts[:filter_languages] || []
  @language = opts[:language] || 'en'
@@ -62,17 +62,17 @@ module PragmaticTokenizer
  merged_abbreviations = []
  @filter_languages.map { |l| merged_abbreviations << Languages.get_language_by_code(l.to_s)::ABBREVIATIONS.flatten }
  merged_abbreviations << opts[:abbreviations].flatten unless opts[:abbreviations].nil?
- @abbreviations = merged_abbreviations.flatten
+ @abbreviations = merged_abbreviations.flatten

  merged_contractions = {}
  @filter_languages.map { |l| merged_contractions = merged_contractions.merge(Languages.get_language_by_code(l.to_s)::CONTRACTIONS) }
  merged_contractions = merged_contractions.merge(opts[:contractions]) unless opts[:contractions].nil?
- @contractions = merged_contractions
+ @contractions = merged_contractions

  merged_stop_words = []
  @filter_languages.map { |l| merged_stop_words << Languages.get_language_by_code(l.to_s)::STOP_WORDS.flatten }
  merged_stop_words << opts[:stop_words].flatten unless opts[:stop_words].nil?
- @stop_words = merged_stop_words.flatten
+ @stop_words = merged_stop_words.flatten
  end
  @punctuation = opts[:punctuation] || 'all'
  @numbers = opts[:numbers] || 'all'
@@ -89,20 +89,20 @@ module PragmaticTokenizer
  @remove_domains = opts[:remove_domains] || false

  unless punctuation.to_s.eql?('all') ||
- punctuation.to_s.eql?('semi') ||
- punctuation.to_s.eql?('none') ||
- punctuation.to_s.eql?('only')
+ punctuation.to_s.eql?('semi') ||
+ punctuation.to_s.eql?('none') ||
+ punctuation.to_s.eql?('only')
  raise "Punctuation argument can be only be nil, 'all', 'semi', 'none', or 'only'"
  end
  unless numbers.to_s.eql?('all') ||
- numbers.to_s.eql?('semi') ||
- numbers.to_s.eql?('none') ||
- numbers.to_s.eql?('only')
+ numbers.to_s.eql?('semi') ||
+ numbers.to_s.eql?('none') ||
+ numbers.to_s.eql?('only')
  raise "Numbers argument can be only be nil, 'all', 'semi', 'none', or 'only'"
  end
  unless mentions.to_s.eql?('keep_original') ||
- mentions.to_s.eql?('keep_and_clean') ||
- mentions.to_s.eql?('remove')
+ mentions.to_s.eql?('keep_and_clean') ||
+ mentions.to_s.eql?('remove')
  raise "Mentions argument can be only be nil, 'keep_original', 'keep_and_clean', or 'remove'"
  end
  raise "In Pragmatic Tokenizer text must be a String" unless text.class == String
@@ -112,153 +112,178 @@ module PragmaticTokenizer

  def tokenize
  return [] unless text
- tokens = []
- text.scan(/.{,10000}(?=\s|\z)/m).each do |segment|
- tokens << post_process(PreProcessor.new(language: language_module).pre_process(text: segment))
- end
- tokens.flatten
+ text
+ .scan(/.{,10000}(?=\s|\z)/m)
+ .map { |segment| post_process(pre_process(segment)) }
+ .flatten
  end

  private

- def post_process(text)
- @tokens = PostProcessor.new(text: text, abbreviations: abbreviations).post_process
- downcase! if downcase
- expand_contractions!(contractions) if expand_contractions
- clean! if clean
- classic_filter! if classic_filter
- process_numbers!
- remove_short_tokens! if minimum_length > 0
- process_punctuation!
- remove_stop_words!(stop_words) if remove_stop_words
- remove_emoji! if remove_emoji
- remove_emails! if remove_emails
- mentions! if mentions
- hashtags! if hashtags
- remove_urls! if remove_urls
- remove_domains! if remove_domains
- split_long_words! if long_word_split
- @tokens.reject { |t| t.empty? }
- end
+ def pre_process(text)
+ text
+ .extend(PragmaticTokenizer::PreProcessor)
+ .pre_process(language: language_module)
+ end

- def downcase!
- @tokens.map! { |t| Unicode::downcase(t) }
- end
+ def post_process(text)
+ @tokens = PostProcessor.new(text: text, abbreviations: abbreviations).post_process
+ downcase! if downcase
+ expand_contractions!(contractions) if expand_contractions
+ clean! if clean
+ classic_filter! if classic_filter
+ process_numbers!
+ remove_short_tokens! if minimum_length > 0
+ process_punctuation!
+ remove_stop_words!(stop_words) if remove_stop_words
+ remove_emoji! if remove_emoji
+ remove_emails! if remove_emails
+ mentions! if mentions
+ hashtags! if hashtags
+ remove_urls! if remove_urls
+ remove_domains! if remove_domains
+ split_long_words! if long_word_split
+ @tokens.reject(&:empty?)
+ end

- def expand_contractions!(contractions)
- if downcase
- @tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').flatten : t }
- else
- @tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
+ def downcase!
+ @tokens.map! { |t| Unicode.downcase(t) }
  end
- end

- def clean!
- @tokens = @tokens.flat_map { |t| (t !~ /[@@#|#]/ && t =~ /(?<=\s)\_+/) ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
- .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\_+(?=\s)/) ? t.gsub!(/\_+(?=\s)/, ' \1').split(' ').flatten : t }
- .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /(?<=\A)\_+/) ? t.gsub!(/(?<=\A)\_+/, '\1 ').split(' ').flatten : t }
- .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\_+(?=\z)/) ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
- .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\*+/) ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
- .map { |t| t.gsub(/[[:cntrl:]]/, '') }
- .map { |t| t.gsub(/(?<=\A)\:(?=.+)/, '') }
- .map { |t| t.gsub(/\:(?=\z)/, '') }
- .map { |t| t.gsub(/(?<=\A)!+(?=.+)/, '') }
- .map { |t| t !~ /[@@#|#]/ ? t.gsub(/(?<=\D)1+(?=\z)/, '') : t }
- .map { |t| t.gsub(/!+(?=\z)/, '') }
- .map { |t| t.gsub(/!+(1*!*)*(?=\z)/, '') }
- .map { |t| t.gsub(/\u{00AD}/, '') }
- .map { |t| t.gsub(/\A(-|–)/, '') }
- .map { |t| t.gsub(/[®©]/, '') }
- .map { |t| t.gsub(/[\u{1F100}-\u{1F1FF}]/, '') }
- .delete_if { |t| t =~ /\A-+\z/ ||
- PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
- t =~ /\A\.{2,}\z/ || t.include?("\\") ||
- t.length > 50 ||
- (t.length > 1 && t =~ /[&*+<=>^|~]/i) ||
- (t.length == 1 && t =~ /\:/)
- }
- end
+ def expand_contractions!(contractions)
+ @tokens = if downcase
+ @tokens.flat_map do |t|
+ if contractions.key?(Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'")))
+ contractions[Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))]
+ .split(' ')
+ .flatten
+ else
+ t
+ end
+ end
+ else
+ @tokens.flat_map do |t|
+ if contractions.key?(Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'")))
+ contractions[Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))]
+ .split(' ')
+ .each_with_index
+ .map { |token, i| i.eql?(0) ? Unicode.capitalize(token) : token }
+ .flatten
+ else
+ t
+ end
+ end
+ end
+ end

- def classic_filter!
- @tokens.map! { |t| abbreviations.include?(t.chomp(".")) ? t.gsub('.', '').chomp("'s").chomp("’s").chomp("`s").chomp("́s") : t.chomp("'s").chomp("’s").chomp("`s").chomp("́s") }
- end
+ def clean!
+ @tokens = @tokens.flat_map { |t| (t !~ /[@@#|#]/ && t =~ /(?<=\s)\_+/) ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
+ .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\_+(?=\s)/) ? t.gsub!(/\_+(?=\s)/, ' \1').split(' ').flatten : t }
+ .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /(?<=\A)\_+/) ? t.gsub!(/(?<=\A)\_+/, '\1 ').split(' ').flatten : t }
+ .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\_+(?=\z)/) ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
+ .flat_map { |t| (t !~ /[@@#|#]/ && t =~ /\*+/) ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
+ .map { |t| t.gsub(/[[:cntrl:]]/, '') }
+ .map { |t| t.gsub(/(?<=\A)\:(?=.+)/, '') }
+ .map { |t| t.gsub(/\:(?=\z)/, '') }
+ .map { |t| t.gsub(/(?<=\A)!+(?=.+)/, '') }
+ .map { |t| t !~ /[@@#|#]/ ? t.gsub(/(?<=\D)1+(?=\z)/, '') : t }
+ .map { |t| t.gsub(/!+(?=\z)/, '') }
+ .map { |t| t.gsub(/!+(1*!*)*(?=\z)/, '') }
+ .map { |t| t.gsub(/\u{00AD}/, '') }
+ .map { |t| t.gsub(/\A(-|–)/, '') }
+ .map { |t| t.gsub(/[®©]/, '') }
+ .map { |t| t.gsub(/[\u{1F100}-\u{1F1FF}]/, '') }
+ .delete_if do |t|
+ t =~ /\A-+\z/ ||
+ PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
+ t =~ /\A\.{2,}\z/ || t.include?("\\") ||
+ t.length > 50 ||
+ (t.length > 1 && t =~ /[&*+<=>^|~]/i) ||
+ (t.length == 1 && t =~ /\:/)
+ end
+ end

- def process_numbers!
- case numbers.to_s
- when 'semi'
- @tokens.delete_if { |t| t =~ /\A\d+\z/ }
- when 'none'
- @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode::downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode::downcase(t)}.") }
- when 'only'
- @tokens.delete_if { |t| t =~ /\A\D+\z/ }
+ def classic_filter!
+ @tokens.map! { |t| abbreviations.include?(t.chomp(".")) ? t.delete('.').chomp("'s").chomp("’s").chomp("`s").chomp("́s") : t.chomp("'s").chomp("’s").chomp("`s").chomp("́s") }
  end
- end

- def remove_short_tokens!
- @tokens.delete_if { |t| t.length < minimum_length }
- end
+ def process_numbers!
+ case numbers.to_s
+ when 'semi'
+ @tokens.delete_if { |t| t =~ /\A\d+\z/ }
+ when 'none'
+ @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode.downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode.downcase(t)}.") }
+ when 'only'
+ @tokens.delete_if { |t| t =~ /\A\D+\z/ }
+ end
+ end

- def process_punctuation!
- case punctuation.to_s
- when 'semi'
- @tokens = @tokens - PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
- when 'none'
- @tokens = @tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
- when 'only'
- @tokens.delete_if { |t| !PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
+ def remove_short_tokens!
+ @tokens.delete_if { |t| t.length < minimum_length }
  end
- end

- def remove_stop_words!(stop_words)
- if downcase
- @tokens = @tokens - stop_words
- else
- @tokens.delete_if { |t| stop_words.include?(Unicode::downcase(t)) }
+ def process_punctuation!
+ case punctuation.to_s
+ when 'semi'
+ @tokens -= PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
+ when 'none'
+ @tokens = @tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
+ when 'only'
+ @tokens.delete_if { |t| !PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
+ end
  end
- end

- def remove_emoji!
- @tokens.delete_if { |t| t =~ PragmaticTokenizer::Languages::Common::EMOJI_REGEX ||
- t =~ /\u{2744}\u{FE0F}/ ||
- t =~ /\u{2744}\u{FE0E}/ ||
- t =~ /\u{2744}/
- }
- end
+ def remove_stop_words!(stop_words)
+ if downcase
+ @tokens -= stop_words
+ else
+ @tokens.delete_if { |t| stop_words.include?(Unicode.downcase(t)) }
+ end
+ end

- def remove_emails!
- @tokens.delete_if { |t| t =~ /\S+(@|@)\S+\.\S+/ }.map { |t| t.chomp('.') }
- end
+ def remove_emoji!
+ @tokens.delete_if do |t|
+ t =~ PragmaticTokenizer::Languages::Common::EMOJI_REGEX ||
+ t =~ /\u{2744}\u{FE0F}/ ||
+ t =~ /\u{2744}\u{FE0E}/ ||
+ t =~ /\u{2744}/
+ end
+ end

- def mentions!
- case mentions.to_s
- when 'remove'
- @tokens.delete_if { |t| t =~ /\A(@|@)/ }
- when 'keep_and_clean'
- @tokens.map! { |t| t =~ /\A(@|@)/ ? t.gsub!(/(?<=\A)(@|@)/, '') : t }
+ def remove_emails!
+ @tokens.delete_if { |t| t =~ /\S+(@|@)\S+\.\S+/ }.map { |t| t.chomp('.') }
  end
- end

- def hashtags!
- case hashtags.to_s
- when 'remove'
- @tokens.delete_if { |t| t =~ /\A(#|#)/ }
- when 'keep_and_clean'
- @tokens = @tokens.flat_map { |t| t =~ /\A(#|#)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
- @tokens.map! { |t| t =~ /\A(#|#)/ ? t.gsub!(/(?<=\A)(#|#)/, '') : t }
+ def mentions!
+ case mentions.to_s
+ when 'remove'
+ @tokens.delete_if { |t| t =~ /\A(@|@)/ }
+ when 'keep_and_clean'
+ @tokens.map! { |t| t =~ /\A(@|@)/ ? t.gsub!(/(?<=\A)(@|@)/, '') : t }
+ end
  end
- end

- def remove_urls!
- @tokens.delete_if { |t| t =~ /(http|https)(\.|:)/ }
- end
+ def hashtags!
+ case hashtags.to_s
+ when 'remove'
+ @tokens.delete_if { |t| t =~ /\A(#|#)/ }
+ when 'keep_and_clean'
+ @tokens = @tokens.flat_map { |t| t =~ /\A(#|#)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
+ @tokens.map! { |t| t =~ /\A(#|#)/ ? t.gsub!(/(?<=\A)(#|#)/, '') : t }
+ end
+ end

- def remove_domains!
- @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
- end
+ def remove_urls!
+ @tokens.delete_if { |t| t =~ /(http|https)(\.|:)/ }
+ end

- def split_long_words!
- @tokens.map! { |t| t.length > long_word_split ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
- .map! { |t| t.length > long_word_split ? t.gsub(/\_/, '\1 \2').split(' ').flatten : t }
- end
+ def remove_domains!
+ @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
+ end
+
+ def split_long_words!
+ @tokens.map! { |t| t.length > long_word_split ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
+ .map! { |t| t.length > long_word_split ? t.gsub(/\_/, '\1 \2').split(' ').flatten : t }
+ end
  end
  end
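For orientation, here is a minimal usage sketch of the Tokenizer API touched above. The call shape and the option names and values (language, punctuation, remove_stop_words, downcase) are taken from the validations and specs in this diff; the input string is illustrative and the resulting tokens are not verified here.

```ruby
require 'pragmatic_tokenizer'

# Constructor shape as in this release: positional text plus an options hash.
# punctuation: 'none' drops punctuation tokens, remove_stop_words filters the
# language's stop-word list, downcase lowercases tokens via Unicode.downcase.
pt = PragmaticTokenizer::Tokenizer.new(
  "Hello big world. Visit us!",
  language:          'en',
  punctuation:       'none',
  remove_stop_words: true,
  downcase:          true
)
tokens = pt.tokenize # => Array of String tokens
```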
data/lib/pragmatic_tokenizer/version.rb
@@ -1,3 +1,3 @@
  module PragmaticTokenizer
- VERSION = "1.4.0"
+ VERSION = "1.5.0".freeze
  end
data/pragmatic_tokenizer.gemspec
@@ -23,4 +23,5 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec"
  spec.add_development_dependency "stackprof"
+ spec.add_development_dependency "rubocop"
  end
data/spec/languages/bulgarian_spec.rb
@@ -4,38 +4,42 @@ describe PragmaticTokenizer do
  context 'Language: Bulgarian (bg)' do
  it 'tokenizes a string #001' do
  text = 'Стойностни, вкл. български и руски'
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'bg'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'bg'
  )
  expect(pt.tokenize).to eq(["стойностни", ",", "вкл.", "български", "и", "руски"])
  end

  it 'tokenizes a string #002' do
  text = 'Той поставя началото на могъща династия, която управлява в продължение на 150 г. Саргон надделява в двубой с владетеля на град Ур и разширява териториите на държавата си по долното течение на Тигър и Ефрат.'
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'bg',
- remove_stop_words: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'bg',
+ remove_stop_words: true
  )
  expect(pt.tokenize).to eq(["поставя", "началото", "могъща", "династия", ",", "управлява", "продължение", "150", "саргон", "надделява", "двубой", "владетеля", "град", "ур", "разширява", "териториите", "държавата", "долното", "течение", "тигър", "ефрат", "."])
  end

  it 'tokenizes a string #003' do
  text = 'Без български жертви в Париж.'
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'bg',
- remove_stop_words: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'bg',
+ remove_stop_words: true
  )
  expect(pt.tokenize).to eq(["български", "жертви", "париж", "."])
  end

  it 'tokenizes a string #004' do
  text = 'Без български жертви в Париж.'
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'bg',
- remove_stop_words: true,
- downcase: false
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'bg',
+ remove_stop_words: true,
+ downcase: false
  )
  expect(pt.tokenize).to eq(["български", "жертви", "Париж", "."])
  end
  end
- end
+ end