fk_str 0.0.1
- data/LICENSE +20 -0
- data/Rakefile +8 -0
- data/lib/fk_str.rb +497 -0
- data/lib/fk_str/dictionary.rb +106 -0
- data/test/test_fk_str.rb +255 -0
- metadata +49 -0
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
Copyright (c) 2012 Guilherme Baptista
https://github.com/gbaptista/fk_str

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
data/Rakefile
ADDED
data/lib/fk_str.rb
ADDED
@@ -0,0 +1,497 @@
# encoding: utf-8

require 'date'
require 'fk_str/dictionary'

module FkStr

  def self.treat_encoding str, debug=false
    str_r = ''
    str.lines.each_with_index { |l, i| str_r += ' ' + self.treat_encoding_s(l, debug) if !debug or (i > -1 and i < 1) }
    return str_r.strip
  end

  def self.is_eq str, str_b, pct=1

    str = self.to_term str, true
    str_b = self.to_term str_b, true

    str_c = str.concat str_b

    return true if (100-(100*str_c.uniq.size/str_c.size)) >= pct

    return false

  end

  def self.to_slug str

    return str if str.to_s == ''

    return self.remove_accents(str).gsub(/\s{1,}| {1,}/, ' ').gsub(/[\+\/_\-|:@#\\,]/, ' ').gsub('&', 'e').gsub(/[^a-zA-Z0-9 ]/, '').downcase.gsub(/\s{1,}| {1,}/, ' ').strip.gsub(' ', '-')

  end

  def self.to_term str, ar=false

    return str if str.to_s == ''

    str_ar = []

    self.to_slug(str).split('-').each do |s|
      s.split('').uniq.each { |r| s = s.gsub /#{r}{2,}/, r }
      @@simple_downcase_consonants.each { |c| s = s.gsub /#{c}(h|r|l|u)/, c }
      if !s.empty? and !@@countries_acronyms.include? s and !@@articles_and_others.include? s
        s = s.gsub /m/, 'n'
        s = s.gsub /l/, 'r'
        s = s.gsub /z/, 's'
        s = s.gsub /g/, 'j'
        s = s.gsub /e|y/, 'i'
        s = s.gsub /o|w/, 'u'
        s = s.gsub /c|q/, 'k'
        s.split('').uniq.each { |r| s = s.gsub /#{r}{2,}/, r }
        s = s.gsub /(r|s|n)$/, ''
        str_ar << s if !s.empty?
      end
    end

    return str_ar.uniq if ar

    return str_ar.uniq.join

  end

  def self.remove_accents str

    return '' if str.to_s == ''
    str = str.gsub(/[ÁÃÂÀÄĂĀÅÆ]/, 'A').gsub(/[áãâàäăāåæ]/, 'a')
    str = str.gsub(/[ÉẼÊÈËĔĒ]/, 'E').gsub(/[éẽêèëĕē]/, 'e')
    str = str.gsub(/[ÍĨÎÌÏĬĪ]/, 'I').gsub(/[íĩîìïĭī]/, 'i')
    str = str.gsub(/[ÓÕÔÒÖŎŌŐÐ]/, 'O').gsub(/[óõôòöŏōőð]/, 'o')
    str = str.gsub(/[ÚŨÛÙÜŬŪǕ]/, 'U').gsub(/[úũûùüŭūǖ]/, 'u')
    str = str.gsub(/[ÇČ]/, 'C').gsub(/[çč]/, 'c').gsub(/Ğ/, 'G').gsub(/ğ/, 'g').gsub(/Ñ/, 'N').gsub(/ñ/, 'n').gsub(/Š/, 'S').gsub(/š/, 's')
    str = str.gsub(/[ȲŸÝỲ]/, 'Y').gsub(/[ȳÿýỳ]/, 'y').gsub(/Ž/, 'Z').gsub(/ž/, 'z')

    return str

  end

  # 35 seconds
  # 18 seconds
  # 16 seconds
  def self.upcasewords str

    return str if str.to_s == ''

    # Normalize duplicated or invalid spacing.
    str = str.gsub(/\s{1,}| {1,}/, ' ').strip

    rstr = []
    str.split(' ').each { |w| rstr << upcaseword(w) }
    str = rstr.join(' ')

    # Normalize duplicated or invalid spacing.
    str = str.gsub(/\s{1,}| {1,}/, ' ')

    # Upcase the first letter.
    fl = @@letters_by_letter[remove_accents(str[0]).downcase]
    fl.each { |l| str[0] = str[0].gsub(l[0], l[1]) } if fl

    return str

  end

  def self.remove_if_ends_with str, texts, not_change_if_returns_with=nil, if_not_change_returns_with_last_removed=0

    return str if str.split(' ').size == 1

    texts.each_with_index { |t, i| texts.delete_at i if t == '' }

    str_o = str

    str = str.strip

    str_t = self.remove_accents(str).downcase

    texts = texts.uniq

    texts.each_with_index { |v, i| texts[i] = self.remove_accents(v).downcase }

    not_change_if_returns_with.each_with_index { |v, i| not_change_if_returns_with[i] = self.remove_accents(v).downcase } if !not_change_if_returns_with.nil?

    removed = []

    continue = true
    while continue
      continue = false
      texts.each do |t|

        # If the end of the string matches the term...
        if t == str_t[str_t.size-t.size..str_t.size].to_s

          # If the text right before the final term is not ' de ' or ' da '...
          if ![' de ', ' da '].include? str_t[str_t.size-t.size-4].to_s + str_t[str_t.size-t.size-3..str_t.size-t.size-2].to_s + str_t[str_t.size-t.size-1].to_s

            # If the first char of the term is not a letter, or the char right before the term is not a letter...
            if (!@@simple_downcase_letters.include? t[0] or !@@simple_downcase_letters.include? str_t[str_t.size-t.size-1]) and str_t.size > 1

              str_l = str

              str = str[0..str.size-t.size-1].strip
              str_t = self.remove_accents(str).downcase

              removed << str_l[str.size..str_l.size]

              continue = true

            end

          end

        end
      end
    end

    # If the result matches a condition that must not be returned...
    if !not_change_if_returns_with.nil?
      if not_change_if_returns_with.include?(self.remove_accents(str).downcase)
        # If asked to return with the last x removed terms appended back...
        if if_not_change_returns_with_last_removed > 0
          removed = removed.reverse
          (1..if_not_change_returns_with_last_removed).each { |n| str += removed[n-1].to_s }
          return str.strip
        end
        return str_o
      end
    end

    return str

  end

  def self.extract_dates str, reference_date=Time.now, reverse_month_day=false

    return [] if str.nil?

    return [Time.new(str.year, str.month, str.day)] if str.kind_of?(Time) or str.kind_of?(Date) or str.kind_of?(DateTime)

    o_str = str

    years = []
    (-30..20).each { |y| years << reference_date.year+y }

    begin

      str = str.gsub /[0-9]{1,}(º|ª)/, ' '

      str = self.remove_accents str

      str = str.downcase

      str = str.gsub /[0-9]{1,}+[a-z]{1,}+[0-9]{1,}/, ''
      str = str.gsub /[0-9]{1,}+[a-z]{1,}/, ' '
      str = str.gsub /[a-z]{1,}+[0-9]{1,}/, ' '

      str = str.gsub(/[^a-z|^0-9|^\/|^\-|^\.|^:]/i, ' ')

      str = str.gsub(/[0-9]{1,}:[0-9]{1,}|:[0-9]{1,}|[0-9]{1,}h[0-9]{1,}|[0-9]{1,}%|[0-9]{1,}h |[0-9]{1,}h$|palco [0-9]{1,}/i, '')

      str.scan(/[0-9]{1,}+.+[0-9]{1,}/).each { |d| str = str.gsub(d, d.gsub('.', '/')) }

      if reverse_month_day
        str.scan(/[0-9]{1,}\/[0-9]{1,}/).each do |d|
          str = str.gsub(d, d.split('/')[1] + '/' + d.split('/')[0])
        end
      end

      @@months_strs.each do |mc|
        str.scan(/#{mc.first}.*[0-9]{1,2}+[1-9]{2,4}/).each do |md|
          if md.scan(/[0-9]{1,2}/).size < 4 and md.scan(/[0-9]{4,}/).size < (md.scan(/[0-9]{2,2}/).size-1)

            continue = true

            @@months_strs.each do |smc|
              md.scan(/[0-9].*#{smc.first}/).each do |d|
                continue = false
              end
            end
            if continue
              m = md.scan(/[0-9]{1,2}/).first
              str = str.gsub(/#{mc.first}.+#{m}/, "#{m} #{mc.first}").gsub(',', '')
            end
          end
        end
      end

      str.scan(/[0-9]{4,4}-[0-9]{1,2}-[0-9]{1,2}/).each do |y|
        str = str.gsub(y, y.split('-')[2] + '/' + y.split('-')[1] + '/' + y.split('-')[0])
      end

      str.scan(/[0-9]{4,4}\/[0-9]{1,2}\/[0-9]{1,2}/).each do |y|
        str = str.gsub(y, y.split('/')[2] + '/' + y.split('/')[1] + '/' + y.split('/')[0])
      end

      str.scan(/[0-9]{1,2}-[0-9]{1,2}-[0-9]{2,4}/).each do |y|
        str = str.gsub(y, y.split('-')[0] + '/' + y.split('-')[1] + '/' + y.split('-')[2])
      end

      str.scan(/[0-9]{1,2}\/[0-9]{1,2}\/[0-9]{1,}/).each do |y|
        if y.split('/')[2].size < 4
          sr = y.split('/').first + '/' + y.split('/')[1]
          sy = y.split('/')[2]
          if sy.size < 3
            sy = '0' + sy if sy.size == 1
            if years.include? (reference_date.year.to_s[0..1]+sy).to_i
              sr += '/' + reference_date.year.to_s[0..1]+sy
            elsif years.include? ((reference_date.year-100).to_s[0..1]+sy).to_i
              sr += '/' + (reference_date.year-100).to_s[0..1]+sy
            end
          end
          str = str.gsub(y, sr)
        end
      end

      str = str.gsub(/[0-9]{5,}/, '')

      dates = []
      continue = true
      while continue

        @@months_strs.each do |m|

          str.scan(/([0-9].*#{m.first})+([^0-9]|$)/).each do |d|
            days = d.first.split(/(#{m.first})+([^0-9]|$)/).first
            jump = false
            @@months_strs.each do |mc|
              if days.scan(/([0-9].*#{mc.first})+([^0-9]|$)/).size > 0
                jump = true
              end
            end
            if !jump

              year = nil
              str.scan(/#{days}#{m.first}.*[0-9]{4,4}/).each do |sc|
                sy = sc.gsub(/(#{days}#{m.first})+([^0-9]|$)/, '')

                # [lorem 9/jan/2012] = false
                # [2012 e 07/05/2012] = true
                # [2012] = true

                if sy.scan(/[0-9]{4,4}/).size > 1 or (sy.scan(/[0-9]{4,4}/).size == 1 and !sy.gsub(/[0-9]{4,4}/, '').match(/[0,9]/))
                  sy.scan(/[0-9]{4,4}/).each { |y| year=y.to_i if years.include? y.to_i; break; }
                end
              end

              #puts '[' + str + '] => ' + year.inspect
              str = str.gsub(/(#{days}#{m.first})+([^0-9]|$)/, '')
              #puts '[' + str + "\n\n"

              days.gsub(/[0-9]{4,4}/, '').scan(/[0-9]{1,2}/).each do |day|
                day = day.to_i
                if day > 0 and day < 32
                  if year
                    dates << Time.new(year, m[1], day)
                  elsif m[1] < (reference_date.month-3)
                    dates << Time.new(reference_date.year+1, m[1], day)
                  else
                    dates << Time.new(reference_date.year, m[1], day)
                  end
                end
              end
            end
          end
        end
        continue = false
        @@months_strs.each do |mt|
          if str.scan(/([0-9].*#{mt.first})+([^0-9]|$)/).size > 0
            continue = true
          end
        end
      end

      return dates.uniq.sort

    rescue => exc
      return []
    end

  end

  def self.extract_time str, date=nil, reference_time=Time.now

    return nil if date.nil?

    return Time.new(date.year, date.month, date.day, reference_time.hour, reference_time.min) if str.nil? or !str.match /[0-9]{1,2}:[0-9]{1,2}/

    begin
      time = str.scan(/[0-9]{1,2}:[0-9]{1,2}/).first.split(':')
      return Time.new(date.year, date.month, date.day, time[0], time[1])
    rescue => exp
      return Time.new(date.year, date.month, date.day, reference_time.hour, reference_time.min)
    end

  end

  private

  def self.treat_encoding_s str, debug=false
    begin
      str_r = ''
      ws = str.split(' ').each_slice(20)
      ws.each_with_index do |w, i|
        if i == 0
          str_r += self.treat_encoding_i w.join(' '), 0, debug
        else
          str_r += ' ' + self.treat_encoding_i(w.join(' '), 0, debug)
        end
      end
    rescue => exp
      str_r = ''
      str.chars.each_slice(200).each { |w| str_r += self.treat_encoding_i w.join, 0, debug }
    end

    return str_r

  end

  def self.valid_encoding str, tolerance=0, debug=false
    str_v = str
    begin
      str_v.match 'á'
      str_v = str_v.gsub /\s{1,}|\n{1,}|\r{1,}/, ''
      @@legal_chars.each { |lc| str_v = str_v.gsub lc, '' }
      @@invalid_sequences.each { |is| raise 'invalid sequence: ' + is if str.match is }
      puts '[' + str_v + ']' if debug and str_v.size > 0
      return false if str_v.size > tolerance
      str_v.split('').each { |c| str = str.gsub c, '' } if str_v.size > 0
      return str
    rescue => exp
      #puts '[error] ' + exp.message if debug or !exp.message.match /incompatible encoding|invalid byte sequence|invalid sequence/i
      return false
    end
  end

  def self.treat_encoding_i str, tolerance=0, debug=false

    str_t = str

    str_v = self.valid_encoding str_t, tolerance, debug
    if !str_v
      puts '[try force_encoding UTF-8]' if debug
      begin
        str_t = str.force_encoding 'UTF-8'
      rescue => exp
      end
    else
      return str_v
    end

    str_v = self.valid_encoding str_t, tolerance, debug
    if !str_v
      puts '[try WINDOWS-1252]' if debug
      begin
        str_t = str.encode 'UTF-8', 'WINDOWS-1252'
      rescue => exp
      end
    else
      return str_v
    end

    str_v = self.valid_encoding str_t, tolerance, debug
    if !str_v
      puts '[try UTF-8]' if debug
      begin
        str_t = str.encode 'UTF-8', 'UTF-8'
      rescue => exp
      end
    else
      return str_v
    end

    str_v = self.valid_encoding str_t, tolerance, debug
    if !str_v
      puts '[try ISO-8859-2]' if debug
      begin
        str_t = str.encode 'UTF-8', 'ISO-8859-2'
      rescue => exp
      end
    else
      return str_v
    end

    str_v = self.valid_encoding str_t, tolerance, debug
    if !str_v
      puts '[try ISO-8859-3]' if debug
      begin
        str_t = str.encode 'UTF-8', 'ISO-8859-3'
      rescue => exp
      end
    else
      return str_v
    end

    str_v = self.valid_encoding str_t, tolerance, debug
    if tolerance == 0 and !str_v
      str_t = self.treat_encoding_i str, 1, debug
    end

    return str_t

  end

  def self.upcaseword w

    return w if w.to_s == ''

    if w.scan(/#{@@separators_regex.join('|')}/).size == 0

      # Build an Array with only the characters that are needed, for performance reasons.
      letters = []
      clean_word = self.remove_accents(w).downcase.gsub(/[^a-z]/, '')
      clean_word.split('').uniq.each { |lt| @@letters_by_letter[lt].each { |l| letters << l } }

      trf = 'tm'
      trf = 'tfu' if w.size > 5 or !@@articles_and_others.include? clean_word
      trf = 'tau' if !w.match(/^mr$|^jr$|^mr.$|^jr.$|^sr$|^sr.$/i) and ((w.size < 6 and clean_word.match(/[^aeiouwy]{4,}|[aeiouwy]{4,}|^[^aeiouwy]{2,3}$/)) or w.scan('.').size > 2)

      letters.each do |l|

        # Downcase everything.
        w = w.gsub l[1], l[0] if trf == 'tm' || trf == 'tfu'

        # Upcase the first letter unless the word is an article or something similar.
        w = w.gsub /^#{l[0]}/, l[1] if trf == 'tfu'

        # Upcase the whole word when it has:
        # * A sequence of 4 or more consonants.
        # * A sequence of 4 or more vowels.
        # * An exact sequence of 2 or 3 vowels.

        w = w.gsub l[0], l[1] if trf == 'tau'

      end

    else

      # Split terms on separator characters such as "'", "(", etc.
      @@separators.each do |l|
        sw = w.split(l)
        if sw.size > 1
          # Handle the term on its own unless it is a single letter before "'"
          sw.each_with_index { |v, i| sw[i] = upcaseword v if !(["'"].include? l and v.size == 1 and i == 0) }
          if w[w.size-1] == l
            w = sw.join(l) + l
          else
            w = sw.join(l)
          end
        end
      end

    end

    return w

  end

end
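For orientation, a short usage sketch of the public helpers defined above; it is not part of the gem, and the expected values are copied from data/test/test_fk_str.rb further down:

    require 'fk_str'

    FkStr.to_slug('Centro - Rio de Janeiro [RJ]')         # => "centro-rio-de-janeiro-rj"
    FkStr.to_term('São Paulo-SP')                         # => "saupauru"
    FkStr.is_eq('Creedence Clearwater Revisited', 'Creedence Clearwater')   # => true
    FkStr.remove_accents('São José do Rio Preto - SP')    # => "Sao Jose do Rio Preto - SP"
    FkStr.upcasewords('CHARLIE BROWN JR.')                # => "Charlie Brown Jr."
    FkStr.extract_dates('8/dez lorem 9/jan/2012', Time.new(2011, 10, 8))
    # => [Time.new(2011, 12, 8), Time.new(2012, 1, 9)]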
data/lib/fk_str/dictionary.rb
ADDED
@@ -0,0 +1,106 @@
# encoding: utf-8

module FkStr

  @@months_strs = {
    'jan' => 1, 'fev' => 2, 'mar' => 3, 'abr' => 4, 'mai' => 5, 'jun' => 6,
    'jul' => 7, 'ago' => 8, 'set' => 9, 'out' => 10, 'nov' => 11, 'dez' => 12,
    '/1' => 1, '/2' => 2, '/3' => 3, '/4' => 4,
    '/5' => 5, '/6' => 6, '/7' => 7, '/8' => 8, '/9' => 9,
    '/01' => 1, '/02' => 2, '/03' => 3, '/04' => 4, '/05' => 5, '/06' => 6,
    '/07' => 7, '/08' => 8, '/09' => 9, '/10' => 10, '/11' => 11, '/12' => 12,
    'feb' => 2, 'apr' => 4, 'may' => 5, 'aug' => 8, 'sep' => 9, 'oct' => 10, 'dec' => 12
  }

  @@letters_by_letter = {
    'a' => { 'a' => 'A', 'á' => 'Á', 'ã' => 'Ã', 'â' => 'Â', 'à' => 'À', 'ä' => 'Ä', 'ă' => 'Ă', 'ā' => 'Ā', 'å' => 'Å', 'æ' => 'Æ' },
    'b' => { 'b' => 'B' },
    'c' => { 'c' => 'C', 'ç' => 'Ç', 'č' => 'Č' },
    'd' => { 'd' => 'D' },
    'e' => { 'e' => 'E', 'é' => 'É', 'ẽ' => 'Ẽ', 'ê' => 'Ê', 'è' => 'È', 'ë' => 'Ë', 'ĕ' => 'Ĕ', 'ē' => 'Ē' },
    'f' => { 'f' => 'F' },
    'g' => { 'g' => 'G', 'ğ' => 'Ğ' },
    'h' => { 'h' => 'H' },
    'i' => { 'i' => 'I', 'í' => 'Í', 'ĩ' => 'Ĩ', 'î' => 'Î', 'ì' => 'Ì', 'ï' => 'Ï', 'ĭ' => 'Ĭ', 'ī' => 'Ī' },
    'j' => { 'j' => 'J' },
    'k' => { 'k' => 'K' },
    'l' => { 'l' => 'L' },
    'm' => { 'm' => 'M' },
    'n' => { 'n' => 'N', 'ñ' => 'Ñ' },
    'o' => { 'o' => 'O', 'ó' => 'Ó', 'õ' => 'Õ', 'ô' => 'Ô', 'ò' => 'Ò', 'ö' => 'Ö', 'ŏ' => 'Ŏ', 'ō' => 'Ō', 'ő' => 'Ő', 'ð' => 'Ð' },
    'p' => { 'p' => 'P' },
    'q' => { 'q' => 'Q' },
    'r' => { 'r' => 'R' },
    's' => { 's' => 'S', 'š' => 'Š' },
    't' => { 't' => 'T' },
    'u' => { 'u' => 'U', 'ú' => 'Ú', 'ũ' => 'Ũ', 'û' => 'Û', 'ù' => 'Ù', 'ü' => 'Ü', 'ŭ' => 'Ŭ', 'ū' => 'Ū', 'ǖ' => 'Ǖ' },
    'v' => { 'v' => 'V' },
    'w' => { 'w' => 'W' },
    'x' => { 'x' => 'X' },
    'y' => { 'y' => 'Y', 'ȳ' => 'Ȳ', 'ÿ' => 'Ÿ', 'ý' => 'Ý', 'ỳ' => 'Ỳ' },
    'z' => { 'z' => 'Z', 'ž' => 'Ž' }
  }

  @@articles_and_others = [
    # Portuguese
    'a', 'ao', 'aos', 'as',
    'co', 'coa', 'coas', 'com', 'cos',
    'da', 'das', 'de', 'do', 'dos', 'dum', 'duma', 'dumas', 'duns',
    'e', 'em',
    'na', 'nas', 'no', 'nos', 'num', 'numa', 'numas', 'nuns',
    'o', 'os', 'ou',
    'pela', 'pelas', 'pelo', 'pelos', 'per', 'por',
    'um', 'uma', 'umas', 'uns',
    # English
    'an', 'and', 'at', 'by', 'in', 'of', 'or', 'on', 's', 'the'
  ]
  def FkStr.articles_and_others
    return @@articles_and_others
  end

  @@countries_acronyms = [
    # Brazil
    'br',
    'ac', 'al', 'ap', 'am', 'ba', 'ce', 'df', 'es', 'go', 'ma', 'mt', 'ms', 'mg', 'pa', 'pb',
    'pr', 'pe', 'pi', 'rj', 'rn', 'rs', 'ro', 'rr', 'sc', 'sp', 'se', 'to',
    # USA
    'us',
    'ak', 'al', 'ar', 'az', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga', 'hi', 'ia', 'id', 'il',
    'in', 'ks', 'ky', 'la', 'ma', 'md', 'me', 'mi', 'mn', 'mo', 'ms', 'mt', 'nc', 'nd', 'ne',
    'nh', 'nj', 'nm', 'nv', 'ny', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx', 'ut',
    'va', 'vt', 'wa', 'wi', 'wv', 'wy'
  ]
  def FkStr.countries_acronyms
    return @@countries_acronyms
  end

  @@simple_downcase_letters = [
    'a', 'b', 'c', 'd', 'e',
    'f', 'g', 'h', 'i', 'j',
    'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't',
    'u', 'v', 'w', 'x', 'y',
    'z'
  ]

  @@simple_downcase_consonants = [
    'b', 'c', 'd',
    'f', 'g', 'h', 'j',
    'k', 'l', 'm', 'n',
    'p', 'q', 'r', 's', 't',
    'v', 'w', 'x', 'y',
    'z'
  ]

  @@separators = ['/', '-', '_', ',', '.', "'", '"', '(', ')', '[', ']', '{', '}', '|', '\\', ';']
  def FkStr.separators
    return @@separators
  end

  @@separators_regex = ['\/', '\-', "\'", '\"', '\(', '\)', '\[', '\]', '\{', '\}']

  @@legal_chars = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz _-!@#$%&*+=?^~´`,.:;\'"()[]{}|/\\<>ÁÃÂÀÄĂĀÅÆáãâàäăāåæÉẼÊÈËĔĒéẽêèëĕēÍĨÎÌÏĬĪíĩîìïĭīÓÕÔÒÖŎŌŐÐóõôòöŏōőðšŠÚŨÛÙÜŬŪǕúũûùüŭūǖÇçČčĞğÑñȲȳŸÿÝýỲỳŽž¹²³ºª – ’©®℗¿¡±“”•«»‘°'.split('')

  @@invalid_sequences = ['é']

end
data/test/test_fk_str.rb
ADDED
@@ -0,0 +1,255 @@
# encoding: utf-8

require 'test/unit'
require 'fk_str'

class FkStrTest < Test::Unit::TestCase

  def test_treat_encoding

    assert_equal(
      'çUTF-8',
      FkStr.treat_encoding("\xE7")+FkStr.treat_encoding("\xE7").encoding.to_s.upcase
    )

    assert_equal(
      '©UTF-8',
      FkStr.treat_encoding("\xC2\xA9")+FkStr.treat_encoding("\xC2\xA9").encoding.to_s.upcase
    )

    assert_equal(
      'caçaUTF-8',
      FkStr.treat_encoding("ca\xE7a")+FkStr.treat_encoding("ca\xE7a").encoding.to_s.upcase
    )

    assert_equal(
      'casaUTF-8',
      FkStr.treat_encoding('casa')+FkStr.treat_encoding('casa').encoding.to_s.upcase
    )

  end

  def test_is_eq

    assert_equal(
      true,
      FkStr.is_eq('Hangar 110', 'Hangar 110', 40)
    )

    assert_equal(
      true,
      FkStr.is_eq('Armagedon + Atos de Vingança', 'Armagedom')
    )

    assert_equal(
      false,
      FkStr.is_eq('Gato Cat', 'Cachorro Dog')
    )

    assert_equal(
      true,
      FkStr.is_eq('Creedence Clearwater Revisited', 'Creedence Clearwater')
    )

  end

  def test_to_slug

    assert_equal(
      'teste-dog',
      FkStr.to_slug('teste:dog')
    )

    assert_equal(
      'centro-rio-de-janeiro-rj',
      FkStr.to_slug('Centro - Rio de Janeiro [RJ]')
    )

    assert_equal(
      'sao-paulo-sp',
      FkStr.to_slug('São Paulo/SP')
    )

    assert_equal(
      'sao-paulo-sp',
      FkStr.to_slug('São Paulo_SP')
    )

  end

  def test_to_term

    assert_equal(
      'kasadujkakurururi',
      FkStr.to_term('casa & dog and cachorro e lorem')
    )

    assert_equal(
      'tistiduj',
      FkStr.to_term('teste:de\dog')
    )

    assert_equal(
      'saupauru',
      FkStr.to_term('São Paulo-SP')
    )

    assert_equal(
      'tistiduj',
      FkStr.to_term('teste:de:dog')
    )

  end

  def test_remove_accents

    assert_equal(
      'Sao Jose do Rio Preto - SP',
      FkStr.remove_accents('São José do Rio Preto - SP')
    )

    assert_equal(
      'Sao Paulo',
      FkStr.remove_accents('São Paulo')
    )

    assert_equal(
      'Acougue',
      FkStr.remove_accents('Açougue')
    )

    assert_equal(
      'Lorem Ipsum',
      FkStr.remove_accents('Lôrém Ipsum')
    )

  end

  def test_upcasewords

    assert_equal(
      'Charlie Brown Jr.',
      FkStr.upcasewords('CHARLIE BROWN JR.')
    )

    assert_equal(
      'Coldplay',
      FkStr.upcasewords('COLDPLAY')
    )

    assert_equal(
      'Queensrÿche',
      FkStr.upcasewords('QUEENSRŸCHE')
    )

    assert_equal(
      'Mindflow',
      FkStr.upcasewords('MINDFLOW')
    )

  end

  def test_remove_if_ends_with

    assert_equal(
      'Natal La Barra',
      FkStr.remove_if_ends_with(
        'Natal La Barra - Caxias do Sul / RS',
        ['La Barra', 'Caxias do Sul', 'RS', '/', '-'],
        ['Natal'],
        1
      )
    )

    assert_equal(
      'Natal La Barra -',
      FkStr.remove_if_ends_with(
        'Natal La Barra - Caxias do Sul / RS',
        ['La Barra', 'Caxias do Sul', 'RS', '/', '-'],
        ['Natal'],
        2
      )
    )

    assert_equal(
      'Natal La Barra - Caxias do Sul',
      FkStr.remove_if_ends_with(
        'Natal La Barra - Caxias do Sul / RS',
        ['La Barra', 'Caxias do Sul', 'RS', '/', '-'],
        ['Natal'],
        3
      )
    )

    assert_equal(
      'Masp',
      FkStr.remove_if_ends_with('Masp São Paulo/SP', ['São Paulo', 'SP', '/'])
    )

  end

  def test_extract_dates

    assert_equal(
      [Time.new(2012, 12, 6)].uniq.sort,
      FkStr.extract_dates('December 06, 2012', Time.new(2012, 9, 12))
    )

    assert_equal(
      [Time.new(2012, 9, 14)].uniq.sort,
      FkStr.extract_dates('FRI 09.14.2012', Time.new(2012, 9, 12), true)
    )

    assert_equal(
      [Time.new(2011, 12, 8), Time.new(2012, 1, 9)].uniq.sort,
      FkStr.extract_dates('8/dez lorem 9/jan/2012', Time.new(2011, 10, 8))
    )

    assert_equal(
      [Time.new(2012, 1, 2)].uniq.sort,
      FkStr.extract_dates('2 de janeiro', Time.new(2011, 10, 8))
    )

  end

  def test_extract_time

    assert_equal(
      Time.new(2011, 07, 14, 22, 18, 0),
      FkStr.extract_time(
        'Thu, 14 Jul 2011 22:18:49 +0000',
        FkStr.extract_dates('Thu, 14 Jul 2011 22:18:49 +0000', Time.new(2012, 5, 28)).first,
        Time.new(2012, 5, 28)
      )
    )

    assert_equal(
      Time.new(2011, 07, 14, 16, 15, 0),
      FkStr.extract_time(
        '14 Jul 2011 16:15',
        FkStr.extract_dates('14 Jul 2011 16:15', Time.new(2012, 5, 28)).first,
        Time.new(2012, 5, 28)
      )
    )

    assert_equal(
      Time.new(2011, 07, 14, 9, 0, 0),
      FkStr.extract_time(
        '14 Jul 2011 9:00',
        FkStr.extract_dates('14 Jul 2011 9:00', Time.new(2012, 5, 28)).first,
        Time.new(2012, 5, 28)
      )
    )

    assert_equal(
      Time.new(2011, 07, 14, 7, 35, 0),
      FkStr.extract_time(
        '14 Jul 2011 07:35',
        FkStr.extract_dates('14 Jul 2011 07:35', Time.new(2012, 5, 28)).first,
        Time.new(2012, 5, 28)
      )
    )

  end

end
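Assuming a checkout of the gem source and a Ruby with test/unit available, the suite above can be run from the gem root with: ruby -Ilib test/test_fk_str.rb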
metadata
ADDED
@@ -0,0 +1,49 @@
--- !ruby/object:Gem::Specification
name: fk_str
version: !ruby/object:Gem::Version
  version: 0.0.1
  prerelease:
platform: ruby
authors:
- Guilherme Baptista
autorequire:
bindir: bin
cert_chain: []
date: 2012-03-13 00:00:00.000000000 Z
dependencies: []
description: String manipulation.
email: guilhermebaptistasilva@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- LICENSE
- Rakefile
- lib/fk_str.rb
- lib/fk_str/dictionary.rb
- test/test_fk_str.rb
homepage: https://github.com/gbaptista/fk_str
licenses: []
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 1.8.24
signing_key:
specification_version: 3
summary: FkStr
test_files: []