fk_str 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012 Guilherme Baptista
2
+ https://github.com/gbaptista/fk_str
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining a copy
5
+ of this software and associated documentation files (the "Software"), to deal
6
+ in the Software without restriction, including without limitation the rights
7
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the Software is
9
+ furnished to do so, subject to the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be included in
12
+ all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
+ THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
require 'rake/testtask'

# Register the stock Rake test task, adding the 'test' directory to the
# load path used when the tests run.
Rake::TestTask.new { |task| task.libs << 'test' }

# `rake` with no arguments runs the test task.
desc 'Run tests'
task default: :test
data/lib/fk_str.rb ADDED
@@ -0,0 +1,497 @@
1
+ # encoding: utf-8
2
+
3
+ require 'date'
4
+ require 'fk_str/dictionary'
5
+
6
+ module FkStr
7
+
8
# Runs every line of +str+ through treat_encoding_s and glues the results
# together with single spaces, stripping the outer whitespace.
#
# str   - text to treat; must respond to #lines.
# debug - when truthy, only the first line is processed (and the helpers
#         print diagnostics).
#
# Returns the treated String.
def self.treat_encoding(str, debug = false)
  treated = ''
  str.lines.each_with_index do |line, index|
    # Debug mode restricts the work to the very first line.
    next if debug && index != 0

    treated += ' ' + self.treat_encoding_s(line, debug)
  end
  treated.strip
end
13
+
14
# Fuzzy comparison of two strings based on their normalized terms.
#
# Both inputs are reduced to term arrays with to_term; the arrays are
# concatenated and the (integer) percentage of duplicated terms in the
# combined list is compared against +pct+.
#
# str   - first String.
# str_b - second String.
# pct   - minimum duplicated-term percentage required (default 1).
#
# Returns true when the percentage reaches +pct+, false otherwise. Blank
# inputs, or inputs that produce no terms at all, are never considered
# equal (previously they raised NoMethodError/TypeError/ZeroDivisionError).
def self.is_eq(str, str_b, pct = 1)
  terms_a = self.to_term(str, true)
  terms_b = self.to_term(str_b, true)

  # to_term hands back the raw input (nil or '') when there is nothing to
  # normalize; treat that as "no terms".
  terms_a = [] unless terms_a.is_a?(Array)
  terms_b = [] unless terms_b.is_a?(Array)

  # Build a new array instead of concat-ing into terms_a.
  combined = terms_a + terms_b
  return false if combined.empty?

  (100 - (100 * combined.uniq.size / combined.size)) >= pct
end
26
+
27
# Builds a lowercase, dash-separated ASCII slug from +str+.
#
# Returns +str+ itself when it is nil or empty.
def self.to_slug(str)
  return str if str.to_s == ''

  slug = self.remove_accents(str)
  # NOTE(review): the second alternative looks like a plain space here; it
  # may be a non-breaking space in the upstream file - confirm.
  slug = slug.gsub(/\s{1,}| {1,}/, ' ')
  # Punctuation that separates words becomes a space.
  slug = slug.gsub(/[\+\/_\-|:@#\\,]/, ' ')
  # '&' is replaced by the letter 'e'.
  slug = slug.gsub('&', 'e')
  # Everything that is not an ASCII letter, digit or space is dropped.
  slug = slug.gsub(/[^a-zA-Z0-9 ]/, '')
  slug = slug.downcase
  slug = slug.gsub(/\s{1,}| {1,}/, ' ')
  slug.strip.gsub(' ', '-')
end
34
+
35
# Reduces +str+ to a rough comparison key.
#
# Each slug word has repeated letters collapsed and the second letter of a
# consonant + h/r/l/u pair removed. Words found in the acronym or article
# lists are discarded. The remaining words get similar letters merged
# (m->n, l->r, z->s, g->j, e/y->i, o/w->u, c/q->k) and a trailing r, s or
# n stripped.
#
# str - the String to normalize.
# ar  - when truthy, return the unique terms as an Array instead of a
#       joined String.
#
# Returns +str+ itself when it is nil or empty.
def self.to_term(str, ar = false)
  return str if str.to_s == ''

  terms = []

  self.to_slug(str).split('-').each do |word|
    # Collapse runs of the same character.
    word.split('').uniq.each { |ch| word = word.gsub(/#{ch}{2,}/, ch) }
    # Drop the h/r/l/u that directly follows a consonant.
    @@simple_downcase_consonants.each { |cons| word = word.gsub(/#{cons}(h|r|l|u)/, cons) }

    next if word.empty?
    next if @@countries_acronyms.include?(word) || @@articles_and_others.include?(word)

    # Merge similar letters; no replacement letter is itself a source
    # letter, so one tr equals the chain of single-letter substitutions.
    word = word.tr('mlzgeyowcq', 'nrsjiiuukk')
    word.split('').uniq.each { |ch| word = word.gsub(/#{ch}{2,}/, ch) }
    word = word.gsub(/(r|s|n)$/, '')

    terms << word unless word.empty?
  end

  unique_terms = terms.uniq
  ar ? unique_terms : unique_terms.join
end
63
+
64
# Replaces accented Latin letters in +str+ with their plain ASCII base
# letter, preserving case.
#
# Returns '' when +str+ is nil or empty, otherwise a new String.
def self.remove_accents(str)
  return '' if str.to_s == ''

  # [pattern, replacement] pairs, applied in order.
  substitutions = [
    [/[ÁÃÂÀÄĂĀÅÆ]/, 'A'], [/[áãâàäăāåæ]/, 'a'],
    [/[ÉẼÊÈËĔĒ]/, 'E'], [/[éẽêèëĕē]/, 'e'],
    [/[ÍĨÎÌÏĬĪ]/, 'I'], [/[íĩîìïĭī]/, 'i'],
    [/[ÓÕÔÒÖŎŌŐÐ]/, 'O'], [/[óõôòöŏōőð]/, 'o'],
    [/[ÚŨÛÙÜŬŪǕ]/, 'U'], [/[úũûùüŭūǖ]/, 'u'],
    [/[ÇČ]/, 'C'], [/[çč]/, 'c'],
    [/Ğ/, 'G'], [/ğ/, 'g'],
    [/Ñ/, 'N'], [/ñ/, 'n'],
    [/Š/, 'S'], [/š/, 's'],
    [/[ȲŸÝỲ]/, 'Y'], [/[ȳÿýỳ]/, 'y'],
    [/Ž/, 'Z'], [/ž/, 'z']
  ]

  substitutions.inject(str) { |text, (pattern, plain)| text.gsub(pattern, plain) }
end
78
+
79
+ # 35 seconds
80
+ # 18 seconds
81
+ # 16 seconds
82
# Normalizes the letter case of every word in +str+ via upcaseword, then
# upper-cases the first character of the result.
#
# Returns +str+ itself when it is nil or empty.
def self.upcasewords(str)
  return str if str.to_s == ''

  # Collapse duplicated or invalid spacing before splitting into words.
  # NOTE(review): the second regex alternative may be a non-breaking space
  # in the upstream file - confirm.
  squeezed = str.gsub(/\s{1,}| {1,}/, ' ').strip

  result = squeezed.split(' ').map { |word| upcaseword(word) }.join(' ')

  # Collapse spacing once more after the per-word pass.
  result = result.gsub(/\s{1,}| {1,}/, ' ')

  # Upper-case the first letter, using the variant table of its base letter.
  variants = @@letters_by_letter[remove_accents(result[0]).downcase]
  variants.each { |lower, upper| result[0] = result[0].gsub(lower, upper) } if variants

  result
end
103
+
104
# Repeatedly strips any of +texts+ from the end of +str+.
#
# The comparison ignores accents and case. A suffix is removed only when:
#   * it is not directly preceded by ' de ' or ' da ', and
#   * its first character, or the character right before it, is not a
#     plain lowercase letter (so a suffix is not cut out of a word), and
#   * something is left of the string after the removal.
#
# str    - the String to trim; returned untouched when it is a single word.
# texts  - Array of suffix Strings. Empty strings are ignored. The caller's
#          array is no longer modified.
# not_change_if_returns_with - optional Array of results considered "too
#          short" (compared without accents/case). The caller's array is no
#          longer modified.
# if_not_change_returns_with_last_removed - when the trimmed result is one
#          of the values above: if > 0, re-append that many of the most
#          recently removed pieces; otherwise return the original string.
#
# Returns the trimmed String.
def self.remove_if_ends_with(str, texts, not_change_if_returns_with = nil, if_not_change_returns_with_last_removed = 0)
  return str if str.split(' ').size == 1

  str_o = str
  str = str.strip
  str_t = self.remove_accents(str).downcase

  # Work on normalized copies. Every empty suffix must go: an empty suffix
  # always "matches" and would keep the loop below spinning forever.
  texts = texts.reject { |t| t == '' }.uniq.map { |t| self.remove_accents(t).downcase }
  unless not_change_if_returns_with.nil?
    not_change_if_returns_with = not_change_if_returns_with.map { |v| self.remove_accents(v).downcase }
  end

  removed = []

  continue = true
  while continue
    continue = false
    texts.each do |t|
      # Does the string end with this suffix?
      next unless t == str_t[str_t.size-t.size..str_t.size].to_s

      # Leave it alone when it is preceded by ' de ' or ' da '.
      preceding = str_t[str_t.size-t.size-4].to_s +
                  str_t[str_t.size-t.size-3..str_t.size-t.size-2].to_s +
                  str_t[str_t.size-t.size-1].to_s
      next if [' de ', ' da '].include?(preceding)

      # Either the suffix starts with a non-letter or the character before
      # it is a non-letter.
      starts_word = !@@simple_downcase_letters.include?(t[0]) ||
                    !@@simple_downcase_letters.include?(str_t[str_t.size-t.size-1])
      # The string must be longer than the suffix. Otherwise the slice
      # below is str[0..-1], nothing is removed and the loop never ends.
      next unless starts_word && str_t.size > t.size

      str_l = str
      str = str[0..str.size-t.size-1].strip
      str_t = self.remove_accents(str).downcase
      removed << str_l[str.size..str_l.size]
      continue = true
    end
  end

  # The result is one of the values that must not be returned on its own.
  if !not_change_if_returns_with.nil? && not_change_if_returns_with.include?(self.remove_accents(str).downcase)
    # Give back only the requested number of removed pieces, latest first.
    if if_not_change_returns_with_last_removed > 0
      removed = removed.reverse
      (1..if_not_change_returns_with_last_removed).each { |n| str += removed[n-1].to_s }
      return str.strip
    end
    return str_o
  end

  str
end
172
+
173
# Extracts the calendar dates mentioned in free text.
#
# str               - a String to scan, or a Time/Date/DateTime (returned
#                     as a one-element Array truncated to the day).
# reference_date    - object responding to #year and #month (default
#                     Time.now). It gives the current century, the range of
#                     accepted years (-30..+20 around its year) and the
#                     year used for dates written without one.
# reverse_month_day - when truthy, "A/B" pairs are swapped before parsing
#                     (month/day input).
#
# Returns a sorted Array of unique Time objects (midnight, local zone).
# Returns [] for nil input or when any StandardError is raised while
# parsing (deliberate best-effort behaviour).
def self.extract_dates(str, reference_date = Time.now, reverse_month_day = false)
  return [] if str.nil?

  return [Time.new(str.year, str.month, str.day)] if str.kind_of?(Time) or str.kind_of?(Date) or str.kind_of?(DateTime)

  years = []
  (-30..20).each { |y| years << reference_date.year + y }

  begin
    # Drop ordinals such as "1º" / "2ª".
    str = str.gsub(/[0-9]{1,}(º|ª)/, ' ')
    str = self.remove_accents(str)
    str = str.downcase

    # Drop tokens that glue digits and letters together.
    str = str.gsub(/[0-9]{1,}+[a-z]{1,}+[0-9]{1,}/, '')
    str = str.gsub(/[0-9]{1,}+[a-z]{1,}/, ' ')
    str = str.gsub(/[a-z]{1,}+[0-9]{1,}/, ' ')

    str = str.gsub(/[^a-z|^0-9|^\/|^\-|^\.|^:]/i, ' ')

    # Drop times ("22:18", "20h30", "20h"), percentages and "palco N".
    str = str.gsub(/[0-9]{1,}:[0-9]{1,}|:[0-9]{1,}|[0-9]{1,}h[0-9]{1,}|[0-9]{1,}%|[0-9]{1,}h |[0-9]{1,}h$|palco [0-9]{1,}/i, '')

    # Dots inside digit...digit spans become slashes.
    str.scan(/[0-9]{1,}+.+[0-9]{1,}/).each { |d| str = str.gsub(d, d.gsub('.', '/')) }

    if reverse_month_day
      str.scan(/[0-9]{1,}\/[0-9]{1,}/).each do |d|
        str = str.gsub(d, d.split('/')[1] + '/' + d.split('/')[0])
      end
    end

    # "<month> <day> ... <year>" is rewritten as "<day> <month>", unless
    # the span already has a digit followed by a month token.
    @@months_strs.each do |mc|
      str.scan(/#{mc.first}.*[0-9]{1,2}+[1-9]{2,4}/).each do |md|
        if md.scan(/[0-9]{1,2}/).size < 4 and md.scan(/[0-9]{4,}/).size < (md.scan(/[0-9]{2,2}/).size-1)
          month_first = true
          @@months_strs.each do |smc|
            month_first = false unless md.scan(/[0-9].*#{smc.first}/).empty?
          end
          if month_first
            m = md.scan(/[0-9]{1,2}/).first
            str = str.gsub(/#{mc.first}.+#{m}/, "#{m} #{mc.first}").gsub(',', '')
          end
        end
      end
    end

    # yyyy-mm-dd -> dd/mm/yyyy
    str.scan(/[0-9]{4,4}-[0-9]{1,2}-[0-9]{1,2}/).each do |y|
      str = str.gsub(y, y.split('-')[2] + '/' + y.split('-')[1] + '/' + y.split('-')[0])
    end

    # yyyy/mm/dd -> dd/mm/yyyy
    str.scan(/[0-9]{4,4}\/[0-9]{1,2}\/[0-9]{1,2}/).each do |y|
      str = str.gsub(y, y.split('/')[2] + '/' + y.split('/')[1] + '/' + y.split('/')[0])
    end

    # dd-mm-yy(yy) -> dd/mm/yy(yy)
    str.scan(/[0-9]{1,2}-[0-9]{1,2}-[0-9]{2,4}/).each do |y|
      str = str.gsub(y, y.split('-')[0] + '/' + y.split('-')[1] + '/' + y.split('-')[2])
    end

    # One/two-digit years get the reference century (or the previous one)
    # when that lands in the accepted range; otherwise the year is dropped.
    str.scan(/[0-9]{1,2}\/[0-9]{1,2}\/[0-9]{1,}/).each do |y|
      if y.split('/')[2].size < 4
        sr = y.split('/').first + '/' + y.split('/')[1]
        sy = y.split('/')[2]
        if sy.size < 3
          sy = '0' + sy if sy.size == 1
          this_century = reference_date.year.to_s[0..1] + sy
          last_century = (reference_date.year - 100).to_s[0..1] + sy
          if years.include?(this_century.to_i)
            sr += '/' + this_century
          elsif years.include?(last_century.to_i)
            sr += '/' + last_century
          end
        end
        str = str.gsub(y, sr)
      end
    end

    str = str.gsub(/[0-9]{5,}/, '')

    dates = []
    continue = true
    while continue
      str_before_pass = str

      @@months_strs.each do |m|
        str.scan(/([0-9].*#{m.first})+([^0-9]|$)/).each do |d|
          days = d.first.split(/(#{m.first})+([^0-9]|$)/).first

          # Skip spans whose day part still holds another "<digit>...<month>"
          # group; that inner group is handled when its own month comes up.
          jump = false
          @@months_strs.each do |mc|
            jump = true if days.scan(/([0-9].*#{mc.first})+([^0-9]|$)/).size > 0
          end
          next if jump

          year = nil
          str.scan(/#{days}#{m.first}.*[0-9]{4,4}/).each do |sc|
            sy = sc.gsub(/(#{days}#{m.first})+([^0-9]|$)/, '')

            # What follows the day/month decides whether a year applies:
            #   "lorem 9/jan/2012"   -> no  (the year belongs to a later date)
            #   "2012 e 07/05/2012"  -> yes
            #   "2012"               -> yes
            # Fixed: the "no other digits" test used /[0,9]/, which only
            # noticed the digits 0 and 9 (and a comma).
            four_digit = sy.scan(/[0-9]{4,4}/)
            if four_digit.size > 1 or (four_digit.size == 1 and !sy.gsub(/[0-9]{4,4}/, '').match(/[0-9]/))
              # Only the first four-digit group is considered.
              candidate = four_digit.first.to_i
              year = candidate if years.include?(candidate)
            end
          end

          str = str.gsub(/(#{days}#{m.first})+([^0-9]|$)/, '')

          days.gsub(/[0-9]{4,4}/, '').scan(/[0-9]{1,2}/).each do |day|
            day = day.to_i
            if day > 0 and day < 32
              if year
                dates << Time.new(year, m[1], day)
              elsif m[1] < (reference_date.month - 3)
                # More than three months before the reference month: next year.
                dates << Time.new(reference_date.year + 1, m[1], day)
              else
                dates << Time.new(reference_date.year, m[1], day)
              end
            end
          end
        end
      end

      # A pass that consumed nothing can never make progress; stop rather
      # than repeat it forever.
      break if str == str_before_pass

      continue = false
      @@months_strs.each do |mt|
        continue = true if str.scan(/([0-9].*#{mt.first})+([^0-9]|$)/).size > 0
      end
    end

    return dates.uniq.sort
  rescue StandardError
    return []
  end
end
320
+
321
# Combines +date+ with the first "H:MM"-like time found in +str+.
#
# str            - text that may contain a time such as "22:18".
# date           - object responding to #year, #month and #day; nil makes
#                  the method return nil.
# reference_time - supplies hour and minute when +str+ is nil, has no time,
#                  or the time cannot be turned into a Time (default
#                  Time.now).
#
# Returns a Time, or nil when +date+ is nil.
def self.extract_time(str, date = nil, reference_time = Time.now)
  return nil if date.nil?

  fallback = lambda do
    Time.new(date.year, date.month, date.day, reference_time.hour, reference_time.min)
  end

  return fallback.call if str.nil? || !str.match(/[0-9]{1,2}:[0-9]{1,2}/)

  begin
    hour, minute = str.scan(/[0-9]{1,2}:[0-9]{1,2}/).first.split(':')
    Time.new(date.year, date.month, date.day, hour, minute)
  rescue StandardError
    # e.g. "99:99" is out of range for Time.new
    fallback.call
  end
end
335
+
336
+ private
337
+
338
# Treats one line of text in chunks so a bad chunk cannot spoil the rest.
#
# The line is split on spaces and handled 20 words at a time; results are
# joined with single spaces. If that raises a StandardError, the work
# restarts from scratch in slices of 200 characters, concatenated with no
# separator.
#
# Returns the treated String.
def self.treat_encoding_s(str, debug = false)
  begin
    treated = ''
    str.split(' ').each_slice(20).each_with_index do |words, index|
      chunk = self.treat_encoding_i(words.join(' '), 0, debug)
      treated += (index == 0 ? chunk : ' ' + chunk)
    end
  rescue StandardError
    treated = ''
    str.chars.each_slice(200).each do |chars|
      treated += self.treat_encoding_i(chars.join, 0, debug)
    end
  end

  treated
end
357
+
358
# Checks whether +str+ only holds characters listed in @@legal_chars.
#
# str       - the String to check.
# tolerance - how many illegal characters are accepted; accepted ones are
#             removed from the returned string.
# debug     - when truthy, prints the illegal characters found.
#
# Returns the (possibly cleaned) String when it is acceptable, or false
# when it has too many illegal characters, contains one of
# @@invalid_sequences, or any StandardError is raised while checking.
def self.valid_encoding(str, tolerance = 0, debug = false)
  # Presumably a probe: matching against a non-ASCII literal raises for
  # strings with an incompatible or broken encoding - confirm.
  str.match 'á'

  leftovers = str.gsub(/\s{1,}|\n{1,}|\r{1,}/, '')
  leftovers = @@legal_chars.inject(leftovers) { |rest, legal| rest.gsub(legal, '') }

  @@invalid_sequences.each do |sequence|
    raise 'invalid sequence: ' + sequence if str.match sequence
  end

  puts '[' + leftovers + ']' if debug && leftovers.size > 0
  return false if leftovers.size > tolerance

  # Within tolerance: strip the illegal characters from the result.
  leftovers.split('').each { |illegal| str = str.gsub(illegal, '') } if leftovers.size > 0
  str
rescue StandardError
  false
end
374
+
375
# Tries a fixed series of encoding repairs on +str+ until valid_encoding
# accepts the result.
#
# Order of attempts: the string as given, force_encoding UTF-8, then
# transcoding to UTF-8 from WINDOWS-1252, UTF-8, ISO-8859-2 and
# ISO-8859-3. A failing conversion is ignored and the previous candidate
# is checked again. If nothing validates with tolerance 0, the whole
# series is retried once with tolerance 1.
#
# Returns the validated String from the first attempt that passes, or the
# last candidate (or the tolerance-1 retry result) otherwise.
def self.treat_encoding_i(str, tolerance = 0, debug = false)
  attempts = [['[try force_encoding UTF-8]', lambda { str.force_encoding 'UTF-8' }]]
  ['WINDOWS-1252', 'UTF-8', 'ISO-8859-2', 'ISO-8859-3'].each do |source|
    attempts << ["[try #{source}]", lambda { str.encode 'UTF-8', source }]
  end

  candidate = str

  attempts.each do |label, convert|
    checked = self.valid_encoding(candidate, tolerance, debug)
    return checked if checked

    puts label if debug
    begin
      candidate = convert.call
    rescue StandardError
      # keep the previous candidate
    end
  end

  checked = self.valid_encoding(candidate, tolerance, debug)
  candidate = self.treat_encoding_i(str, 1, debug) if tolerance == 0 && !checked

  candidate
end
442
+
443
# Normalizes the letter case of a single word.
#
# A word without separator characters is either:
#   * fully lower-cased (short words listed in @@articles_and_others),
#   * lower-cased with its first letter upper-cased (the general case), or
#   * fully upper-cased (likely acronyms - see below).
# A word containing separators is split on each of them and every piece is
# treated recursively.
#
# Returns +w+ itself when it is nil or empty.
def self.upcaseword(w)
  return w if w.to_s == ''

  if w.scan(/#{@@separators_regex.join('|')}/).size == 0

    # Collect only the letter pairs this word needs, for performance.
    clean_word = self.remove_accents(w).downcase.gsub(/[^a-z]/, '')
    letters = clean_word.split('').uniq.flat_map { |base| @@letters_by_letter[base].to_a }

    mode = :lower
    mode = :capitalize if w.size > 5 || !@@articles_and_others.include?(clean_word)

    # Fully upper-case, except for mr/jr/sr forms, when the word is short
    # and has 4+ consecutive non-vowels, 4+ consecutive vowels, or is
    # exactly 2-3 non-vowels; or when it has more than two dots.
    title_abbrev = w.match(/^mr$|^jr$|^mr.$|^jr.$|^sr$|^sr.$/i)
    acronym_like = (w.size < 6 && clean_word.match(/[^aeiouwy]{4,}|[aeiouwy]{4,}|^[^aeiouwy]{2,3}$/)) || w.scan('.').size > 2
    mode = :upper if !title_abbrev && acronym_like

    letters.each do |lower, upper|
      case mode
      when :lower
        w = w.gsub(upper, lower)
      when :capitalize
        w = w.gsub(upper, lower)
        # Upper-case only the first letter.
        w = w.gsub(/^#{lower}/, upper)
      when :upper
        w = w.gsub(lower, upper)
      end
    end

  else

    # Break the word at separators such as "'", "(", etc.
    @@separators.each do |sep|
      parts = w.split(sep)
      next unless parts.size > 1

      parts.each_with_index do |part, index|
        # A single letter before an apostrophe is left as written.
        next if sep == "'" && part.size == 1 && index == 0

        parts[index] = upcaseword(part)
      end

      # split drops a trailing separator; put it back.
      w = (w[w.size-1] == sep) ? parts.join(sep) + sep : parts.join(sep)
    end

  end

  w
end
496
+
497
+ end
@@ -0,0 +1,106 @@
1
+ # encoding: utf-8
2
+
3
+ module FkStr
4
+
5
# Month token => month number (1-12).
# Keys are three-letter month-name prefixes (Portuguese and English) plus
# the numeric forms '/N' and '/NN'. extract_dates interpolates each key
# into its regular expressions and uses the value as the month passed to
# Time.new.
+ @@months_strs = {
6
+ 'jan' => 1, 'fev' => 2, 'mar' => 3, 'abr' => 4, 'mai' => 5, 'jun' => 6,
7
+ 'jul' => 7, 'ago' => 8, 'set' => 9, 'out' => 10, 'nov' => 11, 'dez' => 12,
8
+ '/1' => 1, '/2' => 2, '/3' => 3, '/4' => 4,
9
+ '/5' => 5, '/6' => 6, '/7' => 7, '/8' => 8, '/9' => 9,
10
+ '/01' => 1, '/02' => 2, '/03' => 3, '/04' => 4, '/05' => 5, '/06' => 6,
11
+ '/07' => 7, '/08' => 8, '/09' => 9, '/10' => 10, '/11' => 11, '/12' => 12,
12
+ 'feb' => 2, 'apr' => 4, 'may' => 5, 'aug' => 8, 'sep' => 9, 'oct' => 10, 'dec' => 12
13
+ }
14
+
15
+ @@letters_by_letter = {
16
+ 'a' => { 'a' => 'A', 'á' => 'Á', 'ã' => 'Ã', 'â' => 'Â', 'à' => 'À', 'ä' => 'Ä', 'ă' => 'Ă', 'ā' => 'Ā', 'å' => 'Å', 'æ' => 'Æ' },
17
+ 'b' => { 'b' => 'B' },
18
+ 'c' => { 'c' => 'C', 'ç' => 'Ç', 'č' => 'Č' },
19
+ 'd' => { 'd' => 'D' },
20
+ 'e' => { 'e' => 'E', 'é' => 'É', 'ẽ' => 'Ẽ', 'ê' => 'Ê', 'è' => 'È', 'ë' => 'Ë', 'ĕ' => 'Ĕ', 'ē' => 'Ē' },
21
+ 'f' => { 'f' => 'F' },
22
+ 'g' => { 'g' => 'G', 'ğ' => 'Ğ' },
23
+ 'h' => { 'h' => 'H' },
24
+ 'i' => { 'i' => 'I', 'í' => 'Í', 'ĩ' => 'Ĩ', 'î' => 'Î', 'ì' => 'Ì', 'ï' => 'Ï', 'ĭ' => 'Ĭ', 'ī' => 'Ī' },
25
+ 'j' => { 'j' => 'J' },
26
+ 'k' => { 'k' => 'K' },
27
+ 'l' => { 'l' => 'L' },
28
+ 'm' => { 'm' => 'M' },
29
+ 'n' => { 'n' => 'N', 'ñ' => 'Ñ' },
30
+ 'o' => { 'o' => 'O', 'ó' => 'Ó', 'õ' => 'Õ', 'ô' => 'Ô', 'ò' => 'Ò', 'ö' => 'Ö', 'ŏ' => 'Ŏ', 'ō' => 'Ō', 'ő' => 'Ő', 'ð' => 'Ð' },
31
+ 'p' => { 'p' => 'P' },
32
+ 'q' => { 'q' => 'Q' },
33
+ 'r' => { 'r' => 'R' },
34
+ 's' => { 's' => 'S', 'š' => 'Š' },
35
+ 't' => { 't' => 'T' },
36
+ 'u' => { 'u' => 'U', 'ú' => 'Ú', 'ũ' => 'Ũ', 'û' => 'Û', 'ù' => 'Ù', 'ü' => 'Ü', 'ŭ' => 'Ŭ', 'ū' => 'Ū', 'ǖ' => 'Ǖ' },
37
+ 'v' => { 'v' => 'V' },
38
+ 'w' => { 'w' => 'W' },
39
+ 'x' => { 'x' => 'X' },
40
+ 'y' => { 'y' => 'Y', 'ȳ' => 'Ȳ', 'ÿ' => 'Ÿ', 'ý' => 'Ý', 'ỳ' => 'Ỳ' },
41
+ 'z' => { 'z' => 'Z', 'ž' => 'Ž' }
42
+ }
43
+
44
+ @@articles_and_others = [
45
+ # Português
46
+ 'a', 'ao', 'aos', 'as',
47
+ 'co', 'coa', 'coas', 'com', 'cos',
48
+ 'da', 'das', 'de', 'do', 'dos', 'dum', 'duma', 'dumas', 'duns',
49
+ 'e', 'em',
50
+ 'na', 'nas', 'no', 'nos', 'num', 'numa', 'numas', 'nuns',
51
+ 'o', 'os', 'ou',
52
+ 'pela', 'pelas', 'pelo', 'pelos', 'per', 'por',
53
+ 'um', 'uma', 'umas', 'uns',
54
+ # English
55
+ 'an', 'and', 'at', 'by', 'in', 'of', 'or', 'on', 's', 'the'
56
+ ]
57
# Public reader for the list of articles, prepositions and similar short
# words (the same Array object the module uses internally).
def self.articles_and_others
  @@articles_and_others
end
60
+
61
+ @@countries_acronyms = [
62
+ # Brasil
63
+ 'br',
64
+ 'ac', 'al', 'ap', 'am', 'ba', 'ce', 'df', 'es', 'go', 'ma', 'mt','ms', 'mg', 'pa', 'pb',
65
+ 'pr', 'pe', 'pi', 'rj', 'rn', 'rs', 'ro', 'rr', 'sc', 'sp', 'se', 'to',
66
+ # USA
67
+ 'us',
68
+ 'ak', 'al', 'ar', 'az', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga', 'hi', 'ia', 'id', 'il',
69
+ 'in', 'ks', 'ky', 'la', 'ma', 'md', 'me', 'mi', 'mn', 'mo', 'ms', 'mt', 'nc', 'nd', 'ne',
70
+ 'nh', 'nj', 'nm', 'nv', 'ny', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx', 'ut',
71
+ 'va', 'vt', 'wa', 'wi', 'wv', 'wy'
72
+ ]
73
# Public reader for the list of country/state acronyms (the same Array
# object the module uses internally).
def self.countries_acronyms
  @@countries_acronyms
end
76
+
77
+ @@simple_downcase_letters = [
78
+ 'a', 'b', 'c', 'd', 'e',
79
+ 'f', 'g', 'h', 'i', 'j',
80
+ 'k', 'l', 'm', 'n', 'o',
81
+ 'p', 'q', 'r', 's', 't',
82
+ 'u', 'v', 'w', 'x', 'y',
83
+ 'z'
84
+ ]
85
+
86
+ @@simple_downcase_consonants = [
87
+ 'b', 'c', 'd',
88
+ 'f', 'g', 'h', 'j',
89
+ 'k', 'l', 'm', 'n',
90
+ 'p', 'q', 'r', 's', 't',
91
+ 'v', 'w', 'x', 'y',
92
+ 'z'
93
+ ]
94
+
95
+ @@separators = ['/', '-', '_', ',', '.', "'", '"', '(', ')', '[', ']', '{', '}', '|', '\\', ';']
96
# Public reader for the list of separator characters (the same Array
# object the module uses internally).
def self.separators
  @@separators
end
99
+
100
+ @@separators_regex = ['\/', '\-', "\'", '\"', '\(', '\)', '\[', '\]', '\{', '\}']
101
+
102
+ @@legal_chars = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz _-!@#$%&*+=?^~´`,.:;\'"()[]{}|/\\<>ÁÃÂÀÄĂĀÅÆáãâàäăāåæÉẼÊÈËĔĒéẽêèëĕēÍĨÎÌÏĬĪíĩîìïĭīÓÕÔÒÖŎŌŐÐóõôòöŏōőðšŠÚŨÛÙÜŬŪǕúũûùüŭūǖÇçČčĞğÑñȲȳŸÿÝýỲỳŽž¹²³ºª – ’©®℗¿¡±“”•«»‘°'.split('')
103
+
104
+ @@invalid_sequences = ['é']
105
+
106
+ end
@@ -0,0 +1,255 @@
1
+ # encoding: utf-8
2
+
3
+ require 'test/unit'
4
+ require 'fk_str'
5
+
6
+ class FkStrTest < Test::Unit::TestCase
7
+
8
# treat_encoding must return the repaired text tagged as UTF-8.
def test_treat_encoding
  [
    ["\xE7", 'çUTF-8'],
    ["\xC2\xA9", '©UTF-8'],
    ["ca\xE7a", 'caçaUTF-8'],
    ['casa', 'casaUTF-8']
  ].each do |raw, expected|
    assert_equal(
      expected,
      FkStr.treat_encoding(raw) + FkStr.treat_encoding(raw).encoding.to_s.upcase
    )
  end
end
31
+
32
# is_eq: fuzzy equality on normalized terms.
def test_is_eq
  same_with_threshold = FkStr.is_eq('Hangar 110', 'Hangar 110', 40)
  assert_equal(true, same_with_threshold)

  similar_spelling = FkStr.is_eq('Armagedon + Atos de Vingança', 'Armagedom')
  assert_equal(true, similar_spelling)

  unrelated = FkStr.is_eq('Gato Cat', 'Cachorro Dog')
  assert_equal(false, unrelated)

  shared_prefix = FkStr.is_eq('Creedence Clearwater Revisited', 'Creedence Clearwater')
  assert_equal(true, shared_prefix)
end
55
+
56
# to_slug: separators become dashes, accents and brackets disappear.
def test_to_slug
  [
    ['teste:dog', 'teste-dog'],
    ['Centro - Rio de Janeiro [RJ]', 'centro-rio-de-janeiro-rj'],
    ['São Paulo/SP', 'sao-paulo-sp'],
    ['São Paulo_SP', 'sao-paulo-sp']
  ].each do |input, expected|
    assert_equal(expected, FkStr.to_slug(input))
  end
end
79
+
80
# to_term: articles and acronyms are dropped, similar letters are merged.
def test_to_term
  [
    ['casa & dog and cachorro e lorem', 'kasadujkakurururi'],
    ['teste:de\dog', 'tistiduj'],
    ['São Paulo-SP', 'saupauru'],
    ['teste:de:dog', 'tistiduj']
  ].each do |input, expected|
    assert_equal(expected, FkStr.to_term(input))
  end
end
103
+
104
# remove_accents: accented letters map to their plain base letter.
def test_remove_accents
  [
    ['São José do Rio Preto - SP', 'Sao Jose do Rio Preto - SP'],
    ['São Paulo', 'Sao Paulo'],
    ['Açougue', 'Acougue'],
    ['Lôrém Ipsum', 'Lorem Ipsum']
  ].each do |input, expected|
    assert_equal(expected, FkStr.remove_accents(input))
  end
end
127
+
128
# upcasewords: shouting input is turned into capitalized words.
def test_upcasewords
  [
    ['CHARLIE BROWN JR.', 'Charlie Brown Jr.'],
    ['COLDPLAY', 'Coldplay'],
    ['QUEENSRŸCHE', 'Queensrÿche'],
    ['MINDFLOW', 'Mindflow']
  ].each do |input, expected|
    assert_equal(expected, FkStr.upcasewords(input))
  end
end
151
+
152
# remove_if_ends_with: trailing location pieces are stripped; when only
# 'Natal' would remain, the requested number of removed pieces comes back.
def test_remove_if_ends_with
  [
    [1, 'Natal La Barra'],
    [2, 'Natal La Barra -'],
    [3, 'Natal La Barra - Caxias do Sul']
  ].each do |restore_count, expected|
    # Fresh argument arrays for every call.
    actual = FkStr.remove_if_ends_with(
      'Natal La Barra - Caxias do Sul / RS',
      ['La Barra', 'Caxias do Sul', 'RS', '/', '-'],
      ['Natal'],
      restore_count
    )
    assert_equal(expected, actual)
  end

  assert_equal(
    'Masp',
    FkStr.remove_if_ends_with('Masp São Paulo/SP', ['São Paulo', 'SP', '/'])
  )
end
190
+
191
# extract_dates: English month-first, dotted month/day, Portuguese
# abbreviations and year inference from the reference date.
def test_extract_dates
  english = FkStr.extract_dates('December 06, 2012', Time.new(2012, 9, 12))
  assert_equal([Time.new(2012, 12, 6)], english)

  month_day = FkStr.extract_dates('FRI 09.14.2012', Time.new(2012, 9, 12), true)
  assert_equal([Time.new(2012, 9, 14)], month_day)

  two_dates = FkStr.extract_dates('8/dez lorem 9/jan/2012', Time.new(2011, 10, 8))
  assert_equal([Time.new(2011, 12, 8), Time.new(2012, 1, 9)], two_dates)

  next_year = FkStr.extract_dates('2 de janeiro', Time.new(2011, 10, 8))
  assert_equal([Time.new(2012, 1, 2)], next_year)
end
214
+
215
# extract_time: the first H:MM in the text is combined with the date that
# extract_dates finds in the same text.
def test_extract_time
  reference = Time.new(2012, 5, 28)

  [
    ['Thu, 14 Jul 2011 22:18:49 +0000', Time.new(2011, 7, 14, 22, 18, 0)],
    ['14 Jul 2011 16:15', Time.new(2011, 7, 14, 16, 15, 0)],
    ['14 Jul 2011 9:00', Time.new(2011, 7, 14, 9, 0, 0)],
    ['14 Jul 2011 07:35', Time.new(2011, 7, 14, 7, 35, 0)]
  ].each do |text, expected|
    date = FkStr.extract_dates(text, Time.new(2012, 5, 28)).first
    assert_equal(expected, FkStr.extract_time(text, date, reference))
  end
end
254
+
255
+ end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fk_str
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Guilherme Baptista
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-03-13 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: String manipulation.
15
+ email: guilhermebaptistasilva@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - LICENSE
21
+ - Rakefile
22
+ - lib/fk_str.rb
23
+ - lib/fk_str/dictionary.rb
24
+ - test/test_fk_str.rb
25
+ homepage: https://github.com/gbaptista/fk_str
26
+ licenses: []
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ required_ruby_version: !ruby/object:Gem::Requirement
32
+ none: false
33
+ requirements:
34
+ - - ! '>='
35
+ - !ruby/object:Gem::Version
36
+ version: '0'
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ! '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ requirements: []
44
+ rubyforge_project:
45
+ rubygems_version: 1.8.24
46
+ signing_key:
47
+ specification_version: 3
48
+ summary: FkStr
49
+ test_files: []