fk_str 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
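
For orientation, a minimal usage sketch of the gem's public helpers (not part of the published files; the expected return values are copied verbatim from the bundled test suite shown further below):

    require 'fk_str'

    FkStr.to_slug('São Paulo/SP')                             # => "sao-paulo-sp"
    FkStr.remove_accents('São José do Rio Preto - SP')        # => "Sao Jose do Rio Preto - SP"
    FkStr.upcasewords('CHARLIE BROWN JR.')                    # => "Charlie Brown Jr."
    FkStr.is_eq('Armagedon + Atos de Vingança', 'Armagedom')  # => true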
data/LICENSE ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2012 Guilherme Baptista
+ https://github.com/gbaptista/fk_str
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,8 @@
+ require 'rake/testtask'
+
+ Rake::TestTask.new do |t|
+ t.libs << 'test'
+ end
+
+ desc 'Run tests'
+ task :default => :test
data/lib/fk_str.rb ADDED
@@ -0,0 +1,497 @@
+ # encoding: utf-8
+
+ require 'date'
+ require 'fk_str/dictionary'
+
+ module FkStr
+
+ def self.treat_encoding str, debug=false
+ str_r = ''
+ str.lines.each_with_index { |l, i| str_r += ' ' + self.treat_encoding_s(l, debug) if !debug or (i > -1 and i < 1) }
+ return str_r.strip
+ end
+
+ def self.is_eq str, str_b, pct=1
+
+ str = self.to_term str, true
+ str_b = self.to_term str_b, true
+
+ str_c = str.concat str_b
+
+ return true if (100-(100*str_c.uniq.size/str_c.size)) >= pct
+
+ return false
+
+ end
+
+ def self.to_slug str
+
+ return str if str.to_s == ''
+
+ return self.remove_accents(str).gsub(/\s{1,}| {1,}/, ' ').gsub(/[\+\/_\-|:@#\\,]/, ' ').gsub('&', 'e').gsub(/[^a-zA-Z0-9 ]/, '').downcase.gsub(/\s{1,}| {1,}/, ' ').strip.gsub(' ', '-')
+
+ end
+
+ def self.to_term str, ar=false
+
+ return str if str.to_s == ''
+
+ str_ar = []
+
+ self.to_slug(str).split('-').each do |s|
+ s.split('').uniq.each { |r| s = s.gsub /#{r}{2,}/, r }
+ @@simple_downcase_consonants.each { |c| s = s.gsub /#{c}(h|r|l|u)/, c }
+ if !s.empty? and !@@countries_acronyms.include? s and !@@articles_and_others.include? s
+ s = s.gsub /m/, 'n'
+ s = s.gsub /l/, 'r'
+ s = s.gsub /z/, 's'
+ s = s.gsub /g/, 'j'
+ s = s.gsub /e|y/, 'i'
+ s = s.gsub /o|w/, 'u'
+ s = s.gsub /c|q/, 'k'
+ s.split('').uniq.each { |r| s = s.gsub /#{r}{2,}/, r }
+ s = s.gsub /(r|s|n)$/, ''
+ str_ar << s if !s.empty?
+ end
+ end
+
+ return str_ar.uniq if ar
+
+ return str_ar.uniq.join
+
+ end
+
+ def self.remove_accents str
+
+ return '' if str.to_s == ''
+ str = str.gsub(/[ÁÃÂÀÄĂĀÅÆ]/, 'A').gsub(/[áãâàäăāåæ]/, 'a')
+ str = str.gsub(/[ÉẼÊÈËĔĒ]/, 'E').gsub(/[éẽêèëĕē]/, 'e')
+ str = str.gsub(/[ÍĨÎÌÏĬĪ]/, 'I').gsub(/[íĩîìïĭī]/, 'i')
+ str = str.gsub(/[ÓÕÔÒÖŎŌŐÐ]/, 'O').gsub(/[óõôòöŏōőð]/, 'o')
+ str = str.gsub(/[ÚŨÛÙÜŬŪǕ]/, 'U').gsub(/[úũûùüŭūǖ]/, 'u')
+ str = str.gsub(/[ÇČ]/, 'C').gsub(/[çč]/, 'c').gsub(/Ğ/, 'G').gsub(/ğ/, 'g').gsub(/Ñ/, 'N').gsub(/ñ/, 'n').gsub(/Š/, 'S').gsub(/š/, 's')
+ str = str.gsub(/[ȲŸÝỲ]/, 'Y').gsub(/[ȳÿýỳ]/, 'y').gsub(/Ž/, 'Z').gsub(/ž/, 'z')
+
+ return str
+
+ end
+
+ # 35 seconds
+ # 18 seconds
+ # 16 seconds
+ def self.upcasewords str
+
+ return str if str.to_s == ''
+
+ # Normalize duplicated or invalid whitespace.
+ str = str.gsub(/\s{1,}| {1,}/, ' ').strip
+
+ rstr = []
+ str.split(' ').each { |w| rstr << upcaseword(w) }
+ str = rstr.join(' ')
+
+ # Normalize duplicated or invalid whitespace.
+ str = str.gsub(/\s{1,}| {1,}/, ' ')
+
+ # Uppercase the first letter
+ fl = @@letters_by_letter[remove_accents(str[0]).downcase]
+ fl.each { |l| str[0] = str[0].gsub(l[0], l[1]) } if fl
+
+ return str
+
+ end
+
+ def self.remove_if_ends_with str, texts, not_change_if_returns_with=nil, if_not_change_returns_with_last_removed=0
+
+ return str if str.split(' ').size == 1
+
+ texts.each_with_index { |t, i| texts.delete_at i if t == '' }
+
+ str_o = str
+
+ str = str.strip
+
+ str_t = self.remove_accents(str).downcase
+
+ texts = texts.uniq
+
+ texts.each_with_index { |v, i| texts[i] = self.remove_accents(v).downcase }
+
+ not_change_if_returns_with.each_with_index { |v, i| not_change_if_returns_with[i] = self.remove_accents(v).downcase } if !not_change_if_returns_with.nil?
+
+ removed = []
+
+ continue = true
+ while continue
+ continue = false
+ texts.each do |t|
+
+ # If the end of the string matches the term...
+ if t == str_t[str_t.size-t.size..str_t.size].to_s
+
+ # If what comes right before the final term is not ' de ' or ' da '...
+ if ![' de ', ' da '].include? str_t[str_t.size-t.size-4].to_s + str_t[str_t.size-t.size-3..str_t.size-t.size-2].to_s + str_t[str_t.size-t.size-1].to_s
+
+ # If the first char of the term is not a letter, or the char right before the term is not a letter...
+ if (!@@simple_downcase_letters.include? t[0] or !@@simple_downcase_letters.include? str_t[str_t.size-t.size-1]) and str_t.size > 1
+
+ str_l = str
+
+ str = str[0..str.size-t.size-1].strip
+ str_t = self.remove_accents(str).downcase
+
+ removed << str_l[str.size..str_l.size]
+
+ continue = true
+
+ end
+
+ end
+
+ end
+
+ end
+ end
+
+ # If the result matches a condition that must not be returned...
+ if !not_change_if_returns_with.nil?
+ if not_change_if_returns_with.include?(self.remove_accents(str).downcase)
+ # If asked to return with only the last x removed terms appended back...
+ if if_not_change_returns_with_last_removed > 0
+ removed = removed.reverse
+ (1..if_not_change_returns_with_last_removed).each { |n| str += removed[n-1].to_s }
+ return str.strip
+ end
+ return str_o
+ end
+ end
+
+ return str
+
+ end
+
+ def self.extract_dates str, reference_date=Time.now, reverse_month_day=false
+
+ return [] if str.nil?
+
+ return [Time.new(str.year, str.month, str.day)] if str.kind_of?(Time) or str.kind_of?(Date) or str.kind_of?(DateTime)
+
+ o_str = str
+
+ years = []
+ (-30..20).each { |y| years << reference_date.year+y }
+
+ begin
+
+ str = str.gsub /[0-9]{1,}(º|ª)/, ' '
+
+ str = self.remove_accents str
+
+ str = str.downcase
+
+ str = str.gsub /[0-9]{1,}+[a-z]{1,}+[0-9]{1,}/, ''
+ str = str.gsub /[0-9]{1,}+[a-z]{1,}/, ' '
+ str = str.gsub /[a-z]{1,}+[0-9]{1,}/, ' '
+
+ str = str.gsub(/[^a-z|^0-9|^\/|^\-|^\.|^:]/i, ' ')
+
+ str = str.gsub(/[0-9]{1,}:[0-9]{1,}|:[0-9]{1,}|[0-9]{1,}h[0-9]{1,}|[0-9]{1,}%|[0-9]{1,}h |[0-9]{1,}h$|palco [0-9]{1,}/i, '')
+
+ str.scan(/[0-9]{1,}+.+[0-9]{1,}/).each { |d| str = str.gsub(d, d.gsub('.', '/')) }
+
+ if reverse_month_day
+ str.scan(/[0-9]{1,}\/[0-9]{1,}/).each do |d|
+ str = str.gsub(d, d.split('/')[1] + '/' + d.split('/')[0])
+ end
+ end
+
+ @@months_strs.each do |mc|
+ str.scan(/#{mc.first}.*[0-9]{1,2}+[1-9]{2,4}/).each do |md|
+ if md.scan(/[0-9]{1,2}/).size < 4 and md.scan(/[0-9]{4,}/).size < (md.scan(/[0-9]{2,2}/).size-1)
+
+ continue = true
+
+ @@months_strs.each do |smc|
+ md.scan(/[0-9].*#{smc.first}/).each do |d|
+ continue = false
+ end
+ end
+ if continue
+ m = md.scan(/[0-9]{1,2}/).first
+ str = str.gsub(/#{mc.first}.+#{m}/, "#{m} #{mc.first}").gsub(',', '')
+ end
+ end
+ end
+ end
+
+ str.scan(/[0-9]{4,4}-[0-9]{1,2}-[0-9]{1,2}/).each do |y|
+ str = str.gsub(y, y.split('-')[2] + '/' + y.split('-')[1] + '/' + y.split('-')[0])
+ end
+
+ str.scan(/[0-9]{4,4}\/[0-9]{1,2}\/[0-9]{1,2}/).each do |y|
+ str = str.gsub(y, y.split('/')[2] + '/' + y.split('/')[1] + '/' + y.split('/')[0])
+ end
+
+ str.scan(/[0-9]{1,2}-[0-9]{1,2}-[0-9]{2,4}/).each do |y|
+ str = str.gsub(y, y.split('-')[0] + '/' + y.split('-')[1] + '/' + y.split('-')[2])
+ end
+
+ str.scan(/[0-9]{1,2}\/[0-9]{1,2}\/[0-9]{1,}/).each do |y|
+ if y.split('/')[2].size < 4
+ sr = y.split('/').first + '/' + y.split('/')[1]
+ sy = y.split('/')[2]
+ if sy.size < 3
+ sy = '0' + sy if sy.size == 1
+ if years.include? (reference_date.year.to_s[0..1]+sy).to_i
+ sr += '/' + reference_date.year.to_s[0..1]+sy
+ elsif years.include? ((reference_date.year-100).to_s[0..1]+sy).to_i
+ sr += '/' + (reference_date.year-100).to_s[0..1]+sy
+ end
+ end
+ str = str.gsub(y, sr)
+ end
+ end
+
+ str = str.gsub(/[0-9]{5,}/, '')
+
+ dates = []
+ continue = true
+ while continue
+
+ @@months_strs.each do |m|
+
+ str.scan(/([0-9].*#{m.first})+([^0-9]|$)/).each do |d|
+ days = d.first.split(/(#{m.first})+([^0-9]|$)/).first
+ jump=false
+ @@months_strs.each do |mc|
+ if days.scan(/([0-9].*#{mc.first})+([^0-9]|$)/).size > 0
+ jump = true
+ end
+ end
+ if !jump
+
+ year = nil
+ str.scan(/#{days}#{m.first}.*[0-9]{4,4}/).each do |sc|
+ sy = sc.gsub(/(#{days}#{m.first})+([^0-9]|$)/, '')
+
+ # [lorem 9/jan/2012] = false
+ # [2012 e 07/05/2012] = true
+ # [2012] = true
+
+ if sy.scan(/[0-9]{4,4}/).size > 1 or (sy.scan(/[0-9]{4,4}/).size == 1 and !sy.gsub(/[0-9]{4,4}/, '').match(/[0,9]/))
+ sy.scan(/[0-9]{4,4}/).each { |y| year=y.to_i if years.include? y.to_i; break; }
+ end
+ end
+
+ #puts '[' + str + '] => ' + year.inspect
+ str = str.gsub(/(#{days}#{m.first})+([^0-9]|$)/, '')
+ #puts '[' + str + "\n\n"
+
+ days.gsub(/[0-9]{4,4}/, '').scan(/[0-9]{1,2}/).each do |day|
+ day = day.to_i
+ if day > 0 and day < 32
+ if year
+ dates<<Time.new(year, m[1], day)
+ elsif m[1]<(reference_date.month-3)
+ dates<<Time.new(reference_date.year+1, m[1], day)
+ else
+ dates<<Time.new(reference_date.year, m[1], day)
+ end
+ end
+ end
+ end
+ end
+ end
+ continue = false
+ @@months_strs.each do |mt|
+ if str.scan(/([0-9].*#{mt.first})+([^0-9]|$)/).size > 0
+ continue = true
+ end
+ end
+ end
+
+ return dates.uniq.sort
+
+ rescue => exc
+ return []
+ end
+
+ end
+
+ def self.extract_time str, date=nil, reference_time=Time.now
+
+ return nil if date.nil?
+
+ return Time.new(date.year, date.month, date.day, reference_time.hour, reference_time.min) if str.nil? or !str.match /[0-9]{1,2}:[0-9]{1,2}/
+
+ begin
+ time = str.scan(/[0-9]{1,2}:[0-9]{1,2}/).first.split(':')
+ return Time.new(date.year, date.month, date.day, time[0], time[1])
+ rescue => exp
+ return Time.new(date.year, date.month, date.day, reference_time.hour, reference_time.min)
+ end
+
+ end
+
+ private
+
+ def self.treat_encoding_s str, debug=false
+ begin
+ str_r = ''
+ ws = str.split(' ').each_slice(20)
+ ws.each_with_index do |w, i|
+ if i == 0
+ str_r += self.treat_encoding_i w.join(' '), 0, debug
+ else
+ str_r += ' ' + self.treat_encoding_i(w.join(' '), 0, debug)
+ end
+ end
+ rescue => exp
+ str_r = ''
+ str.chars.each_slice(200).each { |w| str_r += self.treat_encoding_i w.join, 0, debug }
+ end
+
+ return str_r
+
+ end
+
+ def self.valid_encoding str, tolerance=0, debug=false
+ str_v = str
+ begin
+ str_v.match 'á'
+ str_v = str_v.gsub /\s{1,}|\n{1,}|\r{1,}/, ''
+ @@legal_chars.each { |lc| str_v = str_v.gsub lc, '' }
+ @@invalid_sequences.each { |is| raise 'invalid sequence: ' + is if str.match is }
+ puts '[' + str_v + ']' if debug and str_v.size > 0
+ return false if str_v.size > tolerance
+ str_v.split('').each { |c| str = str.gsub c, '' } if str_v.size > 0
+ return str
+ rescue => exp
+ #puts '[error] ' + exp.message if debug or !exp.message.match /incompatible encoding|invalid byte sequence|invalid sequence/i
+ return false
+ end
+ end
+
+ def self.treat_encoding_i str, tolerance=0, debug=false
+
+ str_t = str
+
+ str_v = self.valid_encoding str_t, tolerance, debug
+ if !str_v
+ puts '[try force_encoding UTF-8]' if debug
+ begin
+ str_t = str.force_encoding 'UTF-8'
+ rescue => exp
+ end
+ else
+ return str_v
+ end
+
+ str_v = self.valid_encoding str_t, tolerance, debug
+ if !str_v
+ puts '[try WINDOWS-1252]' if debug
+ begin
+ str_t = str.encode 'UTF-8', 'WINDOWS-1252'
+ rescue => exp
+ end
+ else
+ return str_v
+ end
+
+ str_v = self.valid_encoding str_t, tolerance, debug
+ if !str_v
+ puts '[try UTF-8]' if debug
+ begin
+ str_t = str.encode 'UTF-8', 'UTF-8'
+ rescue => exp
+ end
+ else
+ return str_v
+ end
+
+ str_v = self.valid_encoding str_t, tolerance, debug
+ if !str_v
+ puts '[try ISO-8859-2]' if debug
+ begin
+ str_t = str.encode 'UTF-8', 'ISO-8859-2'
+ rescue => exp
+ end
+ else
+ return str_v
+ end
+
+ str_v = self.valid_encoding str_t, tolerance, debug
+ if !str_v
+ puts '[try ISO-8859-3]' if debug
+ begin
+ str_t = str.encode 'UTF-8', 'ISO-8859-3'
+ rescue => exp
+ end
+ else
+ return str_v
+ end
+
+ str_v = self.valid_encoding str_t, tolerance, debug
+ if tolerance == 0 and !str_v
+ str_t = self.treat_encoding_i str, 1, debug
+ end
+
+ return str_t
+
+ end
+
+ def self.upcaseword w
+
+ return w if w.to_s == ''
+
+ if w.scan(/#{@@separators_regex.join('|')}/).size == 0
+
+ # Build an Array with only the characters that are needed, for performance reasons.
+ letters = []
+ clean_word = self.remove_accents(w).downcase.gsub(/[^a-z]/, '')
+ clean_word.split('').uniq.each { |lt| @@letters_by_letter[lt].each { |l| letters << l } }
+
+ trf = 'tm'
+ trf = 'tfu' if w.size > 5 or !@@articles_and_others.include? clean_word
+ trf = 'tau' if !w.match(/^mr$|^jr$|^mr.$|^jr.$|^sr$|^sr.$/i) and ((w.size < 6 and clean_word.match(/[^aeiouwy]{4,}|[aeiouwy]{4,}|^[^aeiouwy]{2,3}$/)) or w.scan('.').size > 2)
+
+ letters.each do |l|
+
+ # Downcase everything.
+ w = w.gsub l[1], l[0] if trf == 'tm' || trf == 'tfu'
+
+ # Uppercase the first letter unless the word is an article or something similar.
+ w = w.gsub /^#{l[0]}/, l[1] if trf == 'tfu'
+
+ # Uppercase the whole word when it has:
+ # * A sequence of 4 or more consonants.
+ # * A sequence of 4 or more vowels.
+ # * An exact sequence of 2 or 3 vowels.
+
+ w = w.gsub l[0], l[1] if trf == 'tau'
+
+ end
+
+ else
+
+ # Split terms on separator characters such as "'", "(", etc.
+ @@separators.each do |l|
+ sw = w.split(l)
+ if sw.size > 1
+ # Handle the term on its own unless it is a single letter before "'"
+ sw.each_with_index { |v, i| sw[i] = upcaseword v if !(["'"].include? l and v.size == 1 and i == 0) }
+ if w[w.size-1] == l
+ w = sw.join(l) + l
+ else
+ w = sw.join(l)
+ end
+ end
+ end
+
+ end
+
+ return w
+
+ end
+
+ end
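
The two date helpers above are meant to be composed: extract_dates finds candidate dates relative to a reference date, and extract_time attaches a time of day to one of them. A minimal sketch, not part of the published files, mirroring calls from the test suite below (the reference date Time.new(2012, 5, 28) comes from those tests):

    require 'fk_str'

    raw  = 'Thu, 14 Jul 2011 22:18:49 +0000'
    date = FkStr.extract_dates(raw, Time.new(2012, 5, 28)).first  # => 2011-07-14
    FkStr.extract_time(raw, date, Time.new(2012, 5, 28))          # => Time.new(2011, 7, 14, 22, 18, 0)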
data/lib/fk_str/dictionary.rb ADDED
@@ -0,0 +1,106 @@
+ # encoding: utf-8
+
+ module FkStr
+
+ @@months_strs = {
+ 'jan' => 1, 'fev' => 2, 'mar' => 3, 'abr' => 4, 'mai' => 5, 'jun' => 6,
+ 'jul' => 7, 'ago' => 8, 'set' => 9, 'out' => 10, 'nov' => 11, 'dez' => 12,
+ '/1' => 1, '/2' => 2, '/3' => 3, '/4' => 4,
+ '/5' => 5, '/6' => 6, '/7' => 7, '/8' => 8, '/9' => 9,
+ '/01' => 1, '/02' => 2, '/03' => 3, '/04' => 4, '/05' => 5, '/06' => 6,
+ '/07' => 7, '/08' => 8, '/09' => 9, '/10' => 10, '/11' => 11, '/12' => 12,
+ 'feb' => 2, 'apr' => 4, 'may' => 5, 'aug' => 8, 'sep' => 9, 'oct' => 10, 'dec' => 12
+ }
+
+ @@letters_by_letter = {
+ 'a' => { 'a' => 'A', 'á' => 'Á', 'ã' => 'Ã', 'â' => 'Â', 'à' => 'À', 'ä' => 'Ä', 'ă' => 'Ă', 'ā' => 'Ā', 'å' => 'Å', 'æ' => 'Æ' },
+ 'b' => { 'b' => 'B' },
+ 'c' => { 'c' => 'C', 'ç' => 'Ç', 'č' => 'Č' },
+ 'd' => { 'd' => 'D' },
+ 'e' => { 'e' => 'E', 'é' => 'É', 'ẽ' => 'Ẽ', 'ê' => 'Ê', 'è' => 'È', 'ë' => 'Ë', 'ĕ' => 'Ĕ', 'ē' => 'Ē' },
+ 'f' => { 'f' => 'F' },
+ 'g' => { 'g' => 'G', 'ğ' => 'Ğ' },
+ 'h' => { 'h' => 'H' },
+ 'i' => { 'i' => 'I', 'í' => 'Í', 'ĩ' => 'Ĩ', 'î' => 'Î', 'ì' => 'Ì', 'ï' => 'Ï', 'ĭ' => 'Ĭ', 'ī' => 'Ī' },
+ 'j' => { 'j' => 'J' },
+ 'k' => { 'k' => 'K' },
+ 'l' => { 'l' => 'L' },
+ 'm' => { 'm' => 'M' },
+ 'n' => { 'n' => 'N', 'ñ' => 'Ñ' },
+ 'o' => { 'o' => 'O', 'ó' => 'Ó', 'õ' => 'Õ', 'ô' => 'Ô', 'ò' => 'Ò', 'ö' => 'Ö', 'ŏ' => 'Ŏ', 'ō' => 'Ō', 'ő' => 'Ő', 'ð' => 'Ð' },
+ 'p' => { 'p' => 'P' },
+ 'q' => { 'q' => 'Q' },
+ 'r' => { 'r' => 'R' },
+ 's' => { 's' => 'S', 'š' => 'Š' },
+ 't' => { 't' => 'T' },
+ 'u' => { 'u' => 'U', 'ú' => 'Ú', 'ũ' => 'Ũ', 'û' => 'Û', 'ù' => 'Ù', 'ü' => 'Ü', 'ŭ' => 'Ŭ', 'ū' => 'Ū', 'ǖ' => 'Ǖ' },
+ 'v' => { 'v' => 'V' },
+ 'w' => { 'w' => 'W' },
+ 'x' => { 'x' => 'X' },
+ 'y' => { 'y' => 'Y', 'ȳ' => 'Ȳ', 'ÿ' => 'Ÿ', 'ý' => 'Ý', 'ỳ' => 'Ỳ' },
+ 'z' => { 'z' => 'Z', 'ž' => 'Ž' }
+ }
+
+ @@articles_and_others = [
+ # Portuguese
+ 'a', 'ao', 'aos', 'as',
+ 'co', 'coa', 'coas', 'com', 'cos',
+ 'da', 'das', 'de', 'do', 'dos', 'dum', 'duma', 'dumas', 'duns',
+ 'e', 'em',
+ 'na', 'nas', 'no', 'nos', 'num', 'numa', 'numas', 'nuns',
+ 'o', 'os', 'ou',
+ 'pela', 'pelas', 'pelo', 'pelos', 'per', 'por',
+ 'um', 'uma', 'umas', 'uns',
+ # English
+ 'an', 'and', 'at', 'by', 'in', 'of', 'or', 'on', 's', 'the'
+ ]
+ def FkStr.articles_and_others
+ return @@articles_and_others
+ end
+
+ @@countries_acronyms = [
+ # Brazil
+ 'br',
+ 'ac', 'al', 'ap', 'am', 'ba', 'ce', 'df', 'es', 'go', 'ma', 'mt','ms', 'mg', 'pa', 'pb',
+ 'pr', 'pe', 'pi', 'rj', 'rn', 'rs', 'ro', 'rr', 'sc', 'sp', 'se', 'to',
+ # USA
+ 'us',
+ 'ak', 'al', 'ar', 'az', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga', 'hi', 'ia', 'id', 'il',
+ 'in', 'ks', 'ky', 'la', 'ma', 'md', 'me', 'mi', 'mn', 'mo', 'ms', 'mt', 'nc', 'nd', 'ne',
+ 'nh', 'nj', 'nm', 'nv', 'ny', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx', 'ut',
+ 'va', 'vt', 'wa', 'wi', 'wv', 'wy'
+ ]
+ def FkStr.countries_acronyms
+ return @@countries_acronyms
+ end
+
+ @@simple_downcase_letters = [
+ 'a', 'b', 'c', 'd', 'e',
+ 'f', 'g', 'h', 'i', 'j',
+ 'k', 'l', 'm', 'n', 'o',
+ 'p', 'q', 'r', 's', 't',
+ 'u', 'v', 'w', 'x', 'y',
+ 'z'
+ ]
+
+ @@simple_downcase_consonants = [
+ 'b', 'c', 'd',
+ 'f', 'g', 'h', 'j',
+ 'k', 'l', 'm', 'n',
+ 'p', 'q', 'r', 's', 't',
+ 'v', 'w', 'x', 'y',
+ 'z'
+ ]
+
+ @@separators = ['/', '-', '_', ',', '.', "'", '"', '(', ')', '[', ']', '{', '}', '|', '\\', ';']
+ def FkStr.separators
+ return @@separators
+ end
+
+ @@separators_regex = ['\/', '\-', "\'", '\"', '\(', '\)', '\[', '\]', '\{', '\}']
+
+ @@legal_chars = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz _-!@#$%&*+=?^~´`,.:;\'"()[]{}|/\\<>ÁÃÂÀÄĂĀÅÆáãâàäăāåæÉẼÊÈËĔĒéẽêèëĕēÍĨÎÌÏĬĪíĩîìïĭīÓÕÔÒÖŎŌŐÐóõôòöŏōőðšŠÚŨÛÙÜŬŪǕúũûùüŭūǖÇçČčĞğÑñȲȳŸÿÝýỲỳŽž¹²³ºª – ’©®℗¿¡±“”•«»‘°'.split('')
+
+ @@invalid_sequences = ['é']
+
+ end
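
The dictionaries above feed FkStr.to_term, which reduces each word to a rough phonetic key after skipping articles and state acronyms. A short sketch, not part of the published files, with return values copied from the test suite below:

    require 'fk_str'

    FkStr.to_term('São Paulo-SP')                     # => "saupauru"  ('sp' is filtered out as a state acronym)
    FkStr.to_term('casa & dog and cachorro e lorem')  # => "kasadujkakurururi"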
data/test/test_fk_str.rb ADDED
@@ -0,0 +1,255 @@
+ # encoding: utf-8
+
+ require 'test/unit'
+ require 'fk_str'
+
+ class FkStrTest < Test::Unit::TestCase
+
+ def test_treat_encoding
+
+ assert_equal(
+ 'çUTF-8',
+ FkStr.treat_encoding("\xE7")+FkStr.treat_encoding("\xE7").encoding.to_s.upcase
+ )
+
+ assert_equal(
+ '©UTF-8',
+ FkStr.treat_encoding("\xC2\xA9")+FkStr.treat_encoding("\xC2\xA9").encoding.to_s.upcase
+ )
+
+ assert_equal(
+ 'caçaUTF-8',
+ FkStr.treat_encoding("ca\xE7a")+FkStr.treat_encoding("ca\xE7a").encoding.to_s.upcase
+ )
+
+ assert_equal(
+ 'casaUTF-8',
+ FkStr.treat_encoding('casa')+FkStr.treat_encoding('casa').encoding.to_s.upcase
+ )
+
+ end
+
+ def test_is_eq
+
+ assert_equal(
+ true,
+ FkStr.is_eq('Hangar 110', 'Hangar 110', 40)
+ )
+
+ assert_equal(
+ true,
+ FkStr.is_eq('Armagedon + Atos de Vingança', 'Armagedom')
+ )
+
+ assert_equal(
+ false,
+ FkStr.is_eq('Gato Cat', 'Cachorro Dog')
+ )
+
+ assert_equal(
+ true,
+ FkStr.is_eq('Creedence Clearwater Revisited', 'Creedence Clearwater')
+ )
+
+ end
+
+ def test_to_slug
+
+ assert_equal(
+ 'teste-dog',
+ FkStr.to_slug('teste:dog')
+ )
+
+ assert_equal(
+ 'centro-rio-de-janeiro-rj',
+ FkStr.to_slug('Centro - Rio de Janeiro [RJ]')
+ )
+
+ assert_equal(
+ 'sao-paulo-sp',
+ FkStr.to_slug('São Paulo/SP')
+ )
+
+ assert_equal(
+ 'sao-paulo-sp',
+ FkStr.to_slug('São Paulo_SP')
+ )
+
+ end
+
+ def test_to_term
+
+ assert_equal(
+ 'kasadujkakurururi',
+ FkStr.to_term('casa & dog and cachorro e lorem')
+ )
+
+ assert_equal(
+ 'tistiduj',
+ FkStr.to_term('teste:de\dog')
+ )
+
+ assert_equal(
+ 'saupauru',
+ FkStr.to_term('São Paulo-SP')
+ )
+
+ assert_equal(
+ 'tistiduj',
+ FkStr.to_term('teste:de:dog')
+ )
+
+ end
+
+ def test_remove_accents
+
+ assert_equal(
+ 'Sao Jose do Rio Preto - SP',
+ FkStr.remove_accents('São José do Rio Preto - SP')
+ )
+
+ assert_equal(
+ 'Sao Paulo',
+ FkStr.remove_accents('São Paulo')
+ )
+
+ assert_equal(
+ 'Acougue',
+ FkStr.remove_accents('Açougue')
+ )
+
+ assert_equal(
+ 'Lorem Ipsum',
+ FkStr.remove_accents('Lôrém Ipsum')
+ )
+
+ end
+
+ def test_upcasewords
+
+ assert_equal(
+ 'Charlie Brown Jr.',
+ FkStr.upcasewords('CHARLIE BROWN JR.')
+ )
+
+ assert_equal(
+ 'Coldplay',
+ FkStr.upcasewords('COLDPLAY')
+ )
+
+ assert_equal(
+ 'Queensrÿche',
+ FkStr.upcasewords('QUEENSRŸCHE')
+ )
+
+ assert_equal(
+ 'Mindflow',
+ FkStr.upcasewords('MINDFLOW')
+ )
+
+ end
+
+ def test_remove_if_ends_with
+
+ assert_equal(
+ 'Natal La Barra',
+ FkStr.remove_if_ends_with(
+ 'Natal La Barra - Caxias do Sul / RS',
+ ['La Barra', 'Caxias do Sul', 'RS', '/', '-'],
+ ['Natal'],
+ 1
+ )
+ )
+
+ assert_equal(
+ 'Natal La Barra -',
+ FkStr.remove_if_ends_with(
+ 'Natal La Barra - Caxias do Sul / RS',
+ ['La Barra', 'Caxias do Sul', 'RS', '/', '-'],
+ ['Natal'],
+ 2
+ )
+ )
+
+ assert_equal(
+ 'Natal La Barra - Caxias do Sul',
+ FkStr.remove_if_ends_with(
+ 'Natal La Barra - Caxias do Sul / RS',
+ ['La Barra', 'Caxias do Sul', 'RS', '/', '-'],
+ ['Natal'],
+ 3
+ )
+ )
+
+ assert_equal(
+ 'Masp',
+ FkStr.remove_if_ends_with('Masp São Paulo/SP', ['São Paulo', 'SP', '/'])
+ )
+
+ end
+
+ def test_extract_dates
+
+ assert_equal(
+ [Time.new(2012, 12, 6)].uniq.sort,
+ FkStr.extract_dates('December 06, 2012', Time.new(2012, 9, 12))
+ )
+
+ assert_equal(
+ [Time.new(2012, 9, 14)].uniq.sort,
+ FkStr.extract_dates('FRI 09.14.2012', Time.new(2012, 9, 12), true)
+ )
+
+ assert_equal(
+ [Time.new(2011, 12, 8), Time.new(2012, 1, 9)].uniq.sort,
+ FkStr.extract_dates('8/dez lorem 9/jan/2012', Time.new(2011, 10, 8))
+ )
+
+ assert_equal(
+ [Time.new(2012, 1, 2)].uniq.sort,
+ FkStr.extract_dates('2 de janeiro', Time.new(2011, 10, 8))
+ )
+
+ end
+
+ def test_extract_time
+
+ assert_equal(
+ Time.new(2011, 07, 14, 22, 18, 0),
+ FkStr.extract_time(
+ 'Thu, 14 Jul 2011 22:18:49 +0000',
+ FkStr.extract_dates('Thu, 14 Jul 2011 22:18:49 +0000',Time.new(2012, 5, 28)).first,
+ Time.new(2012, 5, 28)
+ )
+ )
+
+ assert_equal(
+ Time.new(2011, 07, 14, 16, 15, 0),
+ FkStr.extract_time(
+ '14 Jul 2011 16:15',
+ FkStr.extract_dates('14 Jul 2011 16:15',Time.new(2012, 5, 28)).first,
+ Time.new(2012, 5, 28)
+ )
+ )
+
+ assert_equal(
+ Time.new(2011, 07, 14, 9, 0, 0),
+ FkStr.extract_time(
+ '14 Jul 2011 9:00',
+ FkStr.extract_dates('14 Jul 2011 9:00',Time.new(2012, 5, 28)).first,
+ Time.new(2012, 5, 28)
+ )
+ )
+
+ assert_equal(
+ Time.new(2011, 07, 14, 7, 35, 0),
+ FkStr.extract_time(
+ '14 Jul 2011 07:35',
+ FkStr.extract_dates('14 Jul 2011 07:35',Time.new(2012, 5, 28)).first,
+ Time.new(2012, 5, 28)
+ )
+ )
+
+ end
+
+ end
metadata ADDED
@@ -0,0 +1,49 @@
+ --- !ruby/object:Gem::Specification
+ name: fk_str
+ version: !ruby/object:Gem::Version
+ version: 0.0.1
+ prerelease:
+ platform: ruby
+ authors:
+ - Guilherme Baptista
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-03-13 00:00:00.000000000 Z
+ dependencies: []
+ description: String manipulation.
+ email: guilhermebaptistasilva@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - LICENSE
+ - Rakefile
+ - lib/fk_str.rb
+ - lib/fk_str/dictionary.rb
+ - test/test_fk_str.rb
+ homepage: https://github.com/gbaptista/fk_str
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.24
+ signing_key:
+ specification_version: 3
+ summary: FkStr
+ test_files: []