banklink_lv 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,730 @@
1
# Adds a small demo helper to String.
String.class_eval do
  # Returns the receiver prefixed with "squawk! "; leading and trailing
  # whitespace of the combined text is stripped.
  def to_squawk
    squawked = "squawk! #{self}"
    squawked.strip
  end
end
6
+
7
+ class String
8
+
9
+ require 'iconv'
10
+ require 'open-uri' # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
11
+
12
# Matches a string that consists only of well-formed UTF-8 byte sequences:
# TAB/LF/CR plus printable ASCII, and 2- to 4-byte sequences without overlong
# forms or surrogates. Pattern taken from:
# http://www.w3.org/International/questions/qa-forms-utf-8
UTF8REGEX = /\A(?: # ?: non-capturing group (grouping with no back references)
  [\x09\x0A\x0D\x20-\x7E] # ASCII
  | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
  | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
  | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
  | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
  | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
  | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
  | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
  )*\z/mnx
23
+
24
+
25
+ # create UTF-8 character arrays (as class instance variables)
26
+ #
27
+ # mapping tables: - http://www.unicode.org/Public/UCA/latest/allkeys.txt
28
+ # - http://unicode.org/Public/UNIDATA/UnicodeData.txt
29
+ # - http://unicode.org/Public/UNIDATA/CaseFolding.txt
30
+ # - http://www.decodeunicode.org
31
+ # - ftp://ftp.mars.org/pub/ruby/Unicode.tar.bz2
32
+ # - http://camomile.sourceforge.net
33
+ # - Character Palette (Mac OS X)
34
+
35
+
36
# test data
#
# Turns "U+XXXX" code point labels into UTF-8 encoded strings. An entry
# becomes nil when its encoding is not accepted by UTF8REGEX.
utf8_chars = lambda do |labels|
  labels.map { |x| u = [x[2..-1].hex].pack("U*"); u =~ UTF8REGEX ? u : nil }
end

@small_letters_utf8 = utf8_chars.call(["U+00F1", "U+00F4", "U+00E6", "U+00F8", "U+00E0", "U+00E1", "U+00E2", "U+00E4", "U+00E5", "U+00E7", "U+00E8", "U+00E9", "U+00EA", "U+00EB", "U+0153"])

@capital_letters_utf8 = utf8_chars.call(["U+00D1", "U+00D4", "U+00C6", "U+00D8", "U+00C0", "U+00C1", "U+00C2", "U+00C4", "U+00C5", "U+00C7", "U+00C8", "U+00C9", "U+00CA", "U+00CB", "U+0152"])

@other_letters_utf8 = utf8_chars.call(["U+03A3", "U+0639", "U+0041", "U+F8D0", "U+F8FF", "U+4E2D", "U+F4EE", "U+00FE", "U+10FFFF", "U+00A9", "U+20AC", "U+221E", "U+20AC", "U+FEFF", "U+FFFD", "U+00FF", "U+00FE", "U+FFFE", "U+FEFF"])

# A nil entry marks a rejected character. Array#nitems (non-nil count) was
# removed after Ruby 1.8, so the check uses compact.size instead.
if @small_letters_utf8.size != @small_letters_utf8.compact.size then raise "Invalid UTF-8 char in @small_letters_utf8!" end
if @capital_letters_utf8.size != @capital_letters_utf8.compact.size then raise "Invalid UTF-8 char in @capital_letters_utf8!" end
if @other_letters_utf8.size != @other_letters_utf8.compact.size then raise "Invalid UTF-8 char in @other_letters_utf8!" end
48
+
49
+
50
@unicode_array = []
#open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f| f.each(nil) { |line| line.scan(/^[^;]+/) { |u| @unicode_array << u } } end
#open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f|
# f.each do |line| line =~ /LATIN|GREEK|CYRILLIC/ ? ( line.scan(/^[^;]+/) { |u| @unicode_array << u } ) : next end
#end

#@letters_utf8 = @unicode_array.map { |x| u = [x.hex].pack("U*"); u =~ UTF8REGEX ? u : nil }.compact # code points from UnicodeData.txt
# test data only: all three sample arrays concatenated in order
@letters_utf8 = [@small_letters_utf8, @capital_letters_utf8, @other_letters_utf8].inject([]) { |all, group| all + group }

# Case-mapping tables: the capital and small arrays are paired position by
# position (capital => small for downcasing, small => capital for upcasing).
@downcase_table_utf8 = {}
@capital_letters_utf8.zip(@small_letters_utf8).each { |capital, small| @downcase_table_utf8[capital] = small }
@upcase_table_utf8 = {}
@small_letters_utf8.zip(@capital_letters_utf8).each { |small, capital| @upcase_table_utf8[small] = capital }
# Every known letter becomes a key with a nil value.
@letters_utf8_hash = {}
@letters_utf8.each { |letter| @letters_utf8_hash[letter] = nil } #=> ... "\341\272\242"=>nil ...
63
+
64
# Readers and writers for the class-level character tables.
class << self
  attr_accessor :small_letters_utf8, :capital_letters_utf8, :other_letters_utf8,
                :letters_utf8, :letters_utf8_hash, :unicode_array,
                :downcase_table_utf8, :upcase_table_utf8
end
74
+
75
+
76
# Yields every character of the receiver in order (one /./mu match per
# character, so newlines are yielded too).
def each_utf8_char
  scan(/./mu) do |char|
    yield char
  end
end
79
+
80
# Yields every character together with its zero-based character position.
def each_utf8_char_with_index
  position = 0
  scan(/./mu) do |char|
    current = position
    position += 1
    yield(char, current)
  end
end
84
+
85
# Returns the number of characters (not bytes) in the receiver.
def length_utf8
  scan(/./mu).size
end
alias :size_utf8 :length_utf8
92
+
93
# Returns a new string with the receiver's characters in reverse order.
def reverse_utf8
  characters = split(//mu)
  characters.reverse.join
end
96
+
97
# Reverses the receiver's characters in place and returns the receiver.
#
# The previous body only reversed a temporary array and returned a new
# string, leaving the receiver untouched despite the bang name.
def reverse_utf8!
  replace(split(//mu).reverse.join)
end
100
+
101
# Returns a copy with the case of every character swapped. Characters found
# in String.downcase_table_utf8 or String.upcase_table_utf8 (checked in that
# order) use the table value; all others fall back to String#swapcase.
def swapcase_utf8
  gsub(/./mu) do |char|
    lowered = String.downcase_table_utf8[char]
    next lowered unless lowered.nil?
    raised = String.upcase_table_utf8[char]
    raised.nil? ? char.swapcase : raised
  end
end
109
+
110
# In-place variant of the table-driven case swap. Returns the result of
# gsub!, i.e. nil when no character was substituted.
def swapcase_utf8!
  gsub!(/./mu) do |char|
    lowered = String.downcase_table_utf8[char]
    next lowered unless lowered.nil?
    raised = String.upcase_table_utf8[char]
    raised.nil? ? char.swapcase : raised
  end
end
117
+
118
# Returns a lower-cased copy. A character listed in
# String.downcase_table_utf8 is replaced by its table value; any other
# character goes through String#downcase.
def downcase_utf8
  gsub(/./mu) do |char|
    mapped = String.downcase_table_utf8[char]
    if mapped.nil? then char.downcase else mapped end
  end
end
124
+
125
# In-place variant of downcase_utf8. Returns the result of gsub!, i.e. nil
# when no character was substituted.
def downcase_utf8!
  gsub!(/./mu) do |char|
    mapped = String.downcase_table_utf8[char]
    if mapped.nil? then char.downcase else mapped end
  end
end
131
+
132
# Returns an upper-cased copy. A character listed in
# String.upcase_table_utf8 is replaced by its table value; any other
# character goes through String#upcase.
def upcase_utf8
  gsub(/./mu) do |char|
    mapped = String.upcase_table_utf8[char]
    if mapped.nil? then char.upcase else mapped end
  end
end
138
+
139
# In-place variant of upcase_utf8. Returns the result of gsub!, i.e. nil
# when no character was substituted.
def upcase_utf8!
  gsub!(/./mu) do |char|
    mapped = String.upcase_table_utf8[char]
    if mapped.nil? then char.upcase else mapped end
  end
end
145
+
146
# Counts the characters of the receiver that match the character class
# built from c (c is placed verbatim between [ and ]).
# Returns nil when c is empty.
def count_utf8(c)
  c.empty? ? nil : scan(%r{[#{c}]}mu).size
end
151
+
152
# Returns a copy without the characters matching the character class built
# from c (c is placed verbatim between [ and ]). An empty c returns the
# receiver itself.
def delete_utf8(c)
  if c.empty?
    self
  else
    gsub(%r{[#{c}]}mu, '')
  end
end
157
+
158
# In-place variant of delete_utf8. Returns the receiver for an empty c,
# otherwise the result of gsub! (nil when nothing was removed).
def delete_utf8!(c)
  c.empty? ? self : gsub!(%r{[#{c}]}mu, '')
end
163
+
164
# Returns the first character of the receiver, or nil for an empty string.
def first_utf8
  slice(/\A./mu)
end
167
+
168
# Returns the last character of the receiver, or nil for an empty string.
def last_utf8
  slice(/.\z/mu)
end
171
+
172
# Returns a copy in which every space-separated word has its first character
# upper-cased (upcase_utf8) and the remaining characters lower-cased
# (downcase_utf8). A blank receiver is returned as is.
#
# The previous body ran a non-destructive gsub on each word and threw the
# result away, so the words were appended unchanged.
def capitalize_utf8
  return self if self =~ /\A[[:space:]]*\z/m
  words = split(/\x20/).map do |word|
    position = 0
    word.gsub(/./mu) do |char|
      position += 1
      position == 1 ? char.upcase_utf8 : char.downcase_utf8
    end
  end
  # Words are re-joined with single spaces; split already dropped trailing ones.
  words.join(' ')
end
190
+
191
# In-place word capitalisation: every space-separated word gets its first
# character upper-cased (upcase_utf8) and the rest lower-cased
# (downcase_utf8). Returns the receiver.
#
# The previous body built the capitalised text in a separate string and
# returned it without ever modifying the receiver.
def capitalize_utf8!
  return self if self =~ /\A[[:space:]]*\z/m
  words = split(/\x20/).map do |word|
    position = 0
    word.gsub(/./mu) do |char|
      position += 1
      position == 1 ? char.upcase_utf8 : char.downcase_utf8
    end
  end
  replace(words.join(' '))
end
209
+
210
+
211
# Returns the character index of the first match of s in the receiver, or
# nil when there is no match.
#
# s - a Regexp (matched in UTF-8 mode) or a String (matched literally).
#     Any other argument type, an empty receiver, an empty String or a
#     Regexp with empty source yields nil.
#
# Fixes over the previous version:
# * a match at position 0 now returns 0 (it used to return nil because an
#   empty prefix was treated as "no match");
# * String arguments are escaped, so "." only matches a literal dot;
# * the Regexp is no longer rebuilt via eval of its source. Interpolating a
#   Regexp into a literal embeds it as (?flags:source), which keeps the
#   caller's i/m/x options while the outer /u selects UTF-8.
def index_utf8(s)
  return nil if empty?
  pattern =
    case s
    when Regexp then s.source.empty? ? nil : %r{#{s}}u
    when String then s.empty? ? nil : %r{#{Regexp.escape(s)}}mu
    end
  return nil if pattern.nil?
  found = pattern.match(self)
  found.nil? ? nil : found.pre_match.length_utf8
end
264
+
265
+
266
# Returns the character index of the last (non-overlapping, left-to-right
# scanned) match of s in the receiver, or nil when there is no match.
#
# s - a Regexp (matched in UTF-8 mode) or a String (matched literally).
#     Any other argument type, an empty receiver, an empty String or a
#     Regexp with empty source yields nil.
#
# Fixes over the previous version:
# * a sole match at position 0 now returns 0 instead of nil;
# * String arguments are escaped before being used as a pattern;
# * the Regexp is interpolated (keeping its i/m/x options) instead of being
#   rebuilt through eval of its source.
def rindex_utf8(s)
  return nil if empty?
  pattern =
    case s
    when Regexp then s.source.empty? ? nil : %r{#{s}}u
    when String then s.empty? ? nil : %r{#{Regexp.escape(s)}}mu
    end
  return nil if pattern.nil?
  prefix = nil
  # Remember the text left of each match; the last one wins.
  scan(pattern) { prefix = $~.pre_match }
  prefix.nil? ? nil : prefix.length_utf8
end
292
+
293
+
294
# note that the i option does not work in special cases with back references
# example: "àÀ".slice_utf8(/(.).*?\1/i) returns nil whereas "aA".slice(/(.).*?\1/i) returns "aA"
#
# Returns the first part of the receiver matched by regex in UTF-8 mode, or
# nil when nothing matches.
#
# regex - a Regexp. It is interpolated into a /u literal, which embeds it as
#         (?flags:source) and so keeps its i/m/x options. The previous
#         version rebuilt it with eval of the source text, which executed
#         arbitrary code for crafted sources and failed on an unbalanced "}".
def slice_utf8(regex)
  slice(%r{#{regex}}u)
end
304
+
305
# Removes the first part of the receiver matched by regex in UTF-8 mode and
# returns the removed text (nil when nothing matches).
#
# regex - a Regexp. It is interpolated into a /u literal (keeping its i/m/x
#         options) instead of being rebuilt with eval of its source, which
#         executed arbitrary code for crafted sources and failed on an
#         unbalanced "}".
def slice_utf8!(regex)
  slice!(%r{#{regex}}u)
end
313
+
314
# Returns up to l characters of the receiver starting at character index p.
#
# p - start position; a negative value counts from the end, and one that
#     reaches past the beginning is clamped to 0.
# l - number of characters to take.
#
# Returns nil when l exceeds the receiver's length or p lies behind the
# last character. Raises ArgumentError unless both arguments are Integers.
#
# The type check used to compare against Fixnum, a constant that was removed
# in Ruby 3.2 and that also rejected large (Bignum) integers.
def cut_utf8(p,l) # (index) position, length
  raise(ArgumentError, "Error: argument is not Fixnum", caller) unless p.is_a?(Integer) && l.is_a?(Integer)
  s = length_utf8
  #if p < 0 then p = s - p.abs end
  if p < 0 then p = p.abs > s ? 0 : s - p.abs end # or: ... p.abs > s ? (return nil) : ...
  return nil if l > s or p > (s - 1)
  ret = ""
  count = 0
  each_utf8_char_with_index do |c, i|
    break if count >= l
    next if i < p
    count += 1
    ret << c
  end
  ret
end
328
+
329
# True when the receiver's leading characters equal s. Returns nil if either
# the receiver or s is empty.
def starts_with_utf8?(s)
  return nil if self.empty? or s.empty?
  prefix = cut_utf8(0, s.size_utf8)
  prefix == s
end
333
+
334
# True when the receiver's trailing characters equal s. Returns nil if
# either the receiver or s is empty.
def ends_with_utf8?(s)
  return nil if self.empty? or s.empty?
  wanted = s.size_utf8
  cut_utf8(-wanted, s.size_utf8) == s
end
338
+
339
# Returns a new string with s inserted before character index i.
#
# i - insertion index; a negative value counts from the end (clamped to 0).
#     An index at or past the end pads with spaces so that s starts at i
#     (i equal to the length simply appends).
# s - text to insert. An empty s returns the receiver; an empty receiver
#     returns s.
#
# The padding used to be appended with `self << spaces`, which silently
# modified the receiver of this non-bang method; a copy is padded instead.
def insert_utf8(i,s) # insert_utf8(index, string)
  return self if s.empty?
  l = length_utf8
  return s if l == 0
  if i < 0 then i = i.abs > l ? 0 : l - i.abs end # or: ... i.abs > l ? (return nil) : ...
  #return nil if i > (l - 1) # return nil ...
  padding = i > (l - 1) ? " " * (i - (l - 1)) : "" # ... or add spaces
  str = self + padding
  s1 = str.cut_utf8(0, i)
  s2 = str.cut_utf8(i, l - s1.length_utf8)
  s1 << s << s2
end
352
+
353
# Splits the receiver on regex in UTF-8 mode and returns the pieces.
#
# regex - a Regexp. It is interpolated into a /u literal (keeping its i/m/x
#         options) instead of being rebuilt with eval of its source, which
#         executed arbitrary code for crafted sources and failed on an
#         unbalanced "}".
def split_utf8(regex)
  split(%r{#{regex}}u)
end
361
+
362
# Scans the receiver with regex in UTF-8 mode. With a block, each match (and
# its captures, if any) is yielded as separate block arguments; without a
# block the matches are returned as String#scan returns them.
#
# regex - a Regexp. It is interpolated into a /u literal (keeping its i/m/x
#         options) instead of being rebuilt with eval of its source, which
#         executed arbitrary code for crafted sources and failed on an
#         unbalanced "}".
def scan_utf8(regex)
  r = %r{#{regex}}u
  if block_given? then scan(r) { |a, *m| yield(a, *m) } else scan(r) end
end
370
+
371
# Returns the characters of the receiver selected by the Range r.
#
# The bounds are read from r.to_s: a leading and a trailing (optionally
# signed) integer plus the number of dots (2 = inclusive, 3 = exclusive).
# Negative bounds count from the end. Returns nil when r is not a Range,
# when a bound's magnitude exceeds the length, or when start > end.
def range_utf8(r)
  return nil if r.class != Range
  #raise(ArgumentError, "No Range object given!", caller) if r.class != Range

  text = r.to_s
  a = text[/^[\+\-]?\d+/].to_i
  b = text[/[\+\-]?\d+$/].to_i
  d = text[/\.+/].size

  l = length_utf8
  return nil if b.abs > l || a.abs > l || d < 2 || d > 3

  a = l - a.abs if a < 0
  b = l - b.abs if b < 0
  return nil if a > b

  # Last index to include: b itself for "..", one less for "...".
  last = d == 2 ? b : b - 1
  str = ""
  each_utf8_char_with_index do |c, i|
    break if i > b
    str << c if i >= a && i <= last
  end
  str
end
405
+
406
# Tests the receiver against UTF8REGEX. Returns the match position (0) when
# the whole string is accepted, nil otherwise.
def utf8?
  UTF8REGEX =~ self
end
409
+
410
# Returns a new string made of only those characters of the receiver that
# individually match UTF8REGEX.
def clean_utf8
  scan(/./um).inject("") do |kept, c|
    c =~ UTF8REGEX ? kept << c : kept
  end
end
415
+
416
+
417
+ def utf8_encoded_file? # check (or rather guess) if (HTML) file encoding is UTF-8 (experimental, so use at your own risk!)
418
+ # The receiver is the location to inspect: an http:// URL or a local file path. Each branch below returns true or false (or raises).
419
+ file = self
420
+ str = ""
421
+
422
+ if file =~ /^http:\/\//
423
+
424
+ url = file
425
+
426
+ if RUBY_PLATFORM =~ /darwin/i # Mac OS X 10.4.10
427
+
428
+ seconds = 30
429
+ # NOTE(review): url is interpolated unescaped into the shell commands below -- confirm callers never pass untrusted input
430
+ # check if web site is reachable
431
+ # on Windows try to use curb, http://curb.rubyforge.org (sudo gem install curb)
432
+ var = %x{ /usr/bin/curl -I -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url}; /bin/echo -n $? }.to_i
433
+
434
+ #return false unless var == 0
435
+ raise "Failed to create connection to web site: #{url} -- curl error code: #{var} -- " unless var == 0
436
+ # fetch the page and keep the value of its first charset=/encoding= declaration
437
+ str = %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} | \
438
+ /usr/bin/grep -Eo -m 1 \"(charset|encoding)=[\\"']?[^\\"'>]+\" | /usr/bin/grep -Eo \"[^=\\"'>]+$\" }
439
+ p str
440
+ return true if str =~ /utf-?8/i
441
+ return false if !str.empty? && str !~ /utf-?8/i
442
+ # only reached when no charset/encoding declaration was found (str is empty)
443
+ # solutions with downloaded file
444
+
445
+ # download HTML file
446
+ #downloaded_file = "/tmp/html"
447
+ downloaded_file = "~/Desktop/html"
448
+ downloaded_file = File.expand_path(downloaded_file)
449
+ %x{ /usr/bin/touch #{downloaded_file} 2>/dev/null }
450
+ raise "No valid HTML download file (path) specified!" unless File.file?(downloaded_file)
451
+ %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} -o #{downloaded_file} #{url} }
452
+
453
+ simple_test = %x{ /usr/bin/file -ik #{downloaded_file} } # cf. man file
454
+ p simple_test
455
+
456
+ # read entire file into a string
457
+ File.open(downloaded_file).read.each(nil) do |str|
458
+ #return true if str =~ /(charset|encoding) *= *["']? *utf-?8/i
459
+ str.utf8? ? (return true) : (return false)
460
+ end
461
+ # NOTE(review): the block above returns on its first invocation, so the code below only runs if that block is never called
462
+ #check each line of the downloaded file
463
+ #count_lines = 0
464
+ #count_utf8 = 0
465
+ #File.foreach(downloaded_file) { |line| return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i; count_lines += 1; count_utf8 += 1 if line.clean_utf8.utf8?; break if count_lines != count_utf8 }
466
+ #count_lines == count_utf8 ? (return true) : (return false)
467
+
468
+
469
+ # in-memory solutions
470
+
471
+ #html_file_cleaned_utf8 = %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.clean_utf8
472
+ #p html_file_cleaned_utf8.utf8?
473
+ # NOTE(review): the '\n' separator below is single-quoted (a backslash followed by n, not a newline) -- confirm this is intended
474
+ count_lines = 0
475
+ count_utf8 = 0
476
+ #%x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.each(nil) do |line| # read entire file into string
477
+ %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.each('\n') do |line|
478
+ #return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i
479
+ count_lines += 1
480
+ count_utf8 += 1 if line.utf8?
481
+ break if count_lines != count_utf8
482
+ end
483
+ count_lines == count_utf8 ? (return true) : (return false)
484
+
485
+ else
486
+ # other platforms: read the URL through open (open-uri is required at the top of the class)
487
+ # check each line of the HTML file (or the entire HTML file at once)
488
+ # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
489
+ count_lines = 0
490
+ count_utf8 = 0
491
+ open(url) do |f|
492
+ # p f.meta, f.content_encoding, f.content_type
493
+ cs = f.charset
494
+ return true if cs =~ /utf-?8/i
495
+ #f.each(nil) do |str| str.utf8? ? (return true) : (return false) end # read entire file into string
496
+ f.each_line do |line|
497
+ count_lines += 1
498
+ count_utf8 += 1 if line.utf8?
499
+ break unless count_lines == count_utf8
500
+ end
501
+ end
502
+ count_lines == count_utf8 ? (return true) : (return false)
503
+
504
+ end
505
+
506
+ else
507
+ # local path: anything that is not an existing regular file yields false
508
+ return false unless File.file?(file)
509
+
510
+ if RUBY_PLATFORM =~ /darwin/i then str = %x{ /usr/bin/file -ik #{file} }; return true if str =~ /utf-?8/i end
511
+
512
+ # read entire file into a string
513
+ #File.open(file).read.each(nil) do |str| return true if str =~ /(charset|encoding) *= *["']? *utf-?8/i; str.utf8? ? (return true) : (return false) end
514
+
515
+ # check each line of the file
516
+ count_lines = 0
517
+ count_utf8 = 0
518
+ File.foreach(file) do |line|
519
+ return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i
520
+ count_lines += 1;
521
+ count_utf8 += 1 if line.utf8?;
522
+ break if count_lines != count_utf8
523
+ end
524
+
525
+ count_lines == count_utf8 ? (return true) : (return false)
526
+
527
+ end
528
+ # NOTE(review): both branches above always return (or raise), so the line below looks unreachable
529
+ str =~ /utf-?8/i ? true : false
530
+
531
+ end
532
+
533
+
534
# cf. Paul Battley, http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
#
# Converts the receiver from UTF-8 to UTF-8//IGNORE through Iconv and returns
# the result. A space is appended before the conversion and the last
# character of the converted text is dropped again afterwards.
def validate_utf8
  padded = self + ' '
  converted = Iconv.iconv('UTF-8//IGNORE', 'UTF-8', padded).first
  converted[0..-2]
end
538
+
539
# cf. Paul Battley, http://www.ruby-forum.com/topic/70357
#
# Transliterates the receiver to US-ASCII through Iconv
# (US-ASCII//IGNORE//TRANSLIT) and then strips punctuation inside words.
# Returns nil unless the receiver passes utf8?.
def asciify_utf8
  return nil unless utf8?
  #Iconv.iconv('US-ASCII//IGNORE//TRANSLIT', 'UTF-8', (self + ' ') ).first[0..-2]
  ascii = Iconv.iconv('US-ASCII//IGNORE//TRANSLIT', 'UTF-8', self + ' ').first[0..-2]
  # delete all punctuation characters inside words except "-" in words such as up-to-date
  ascii.gsub(/(?!-.*)\b[[:punct:]]+\b/, '')
end
546
+
547
# Converts the receiver from ISO-8859-1 to UTF-8 through Iconv. Returns the
# converted string, or nil if it does not pass utf8?.
def latin1_to_utf8 # ISO-8859-1 to UTF-8
  converted = Iconv.iconv("UTF-8//IGNORE", "ISO-8859-1", self + "\x20").first
  ret = converted[0..-2] # drop the space appended before converting
  ret.utf8? ? ret : nil
end
551
+
552
# Converts the receiver from CP1252 to UTF-8 through Iconv. Returns the
# converted string, or nil if it does not pass utf8?.
def cp1252_to_utf8 # CP1252 (WINDOWS-1252) to UTF-8
  converted = Iconv.iconv("UTF-8//IGNORE", "CP1252", self + "\x20").first
  ret = converted[0..-2] # drop the space appended before converting
  ret.utf8? ? ret : nil
end
556
+
557
# cf. Paul Battley, http://www.ruby-forum.com/topic/70357
#
# Converts the receiver from UTF-16LE to UTF-8 through Iconv. The input is
# first cut to an even length and padded with two NUL bytes; the padding is
# removed from the result. Returns nil if the result does not pass utf8?.
def utf16le_to_utf8
  even_part = self[0, (self.length / 2 * 2)]
  ret = Iconv.iconv('UTF-8//IGNORE', 'UTF-16LE', even_part + "\000\000").first[0..-2]
  ret.sub!(/\x00\z/, '') if ret =~ /\x00\z/
  ret.utf8? ? ret : nil
end
563
+
564
# Converts the receiver from UTF-8 to UTF-16LE through Iconv.
# Returns nil unless the receiver passes utf8?.
def utf8_to_utf16le
  return nil unless utf8?
  Iconv.iconv('UTF-16LE//IGNORE', 'UTF-8', self).first
end
568
+
569
# Returns the receiver written as a run of "U+XXXX" code point labels (at
# least four upper-case hex digits each). Returns nil unless the receiver
# passes utf8?.
def utf8_to_unicode
  return nil unless utf8?
  scan(/./mu).inject("") do |labels, c|
    labels << "U+" << sprintf("%04X", c.unpack("U*").first)
  end
end
575
+
576
# Replaces every "U+XXXX" label in the receiver by the character it names and
# keeps all other characters, so mixed strings such as
# "U+00bfHabla espaU+00f1ol?" work. A blank receiver is returned as is;
# nil is returned if the result does not pass utf8?.
def unicode_to_utf8
  return self if self =~ /\A[[:space:]]*\z/m
  str = ""
  #scan(/U\+([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})/) { |u| str << [u.first.hex].pack("U*") }
  #scan(/U\+([[:digit:][:xdigit:]]{4,5}|10[[:digit:][:xdigit:]]{4})/) { |u| str << [u.first.hex].pack("U*") }
  scan(/(U\+(?:[[:digit:][:xdigit:]]{4,5}|10[[:digit:][:xdigit:]]{4})|.)/mu) do |groups|
    token = groups.first
    str << (token =~ /^U\+/ ? [token[2..-1].hex].pack("U*") : token)
  end
  str.utf8? ? str : nil
end
591
+
592
+
593
+ # dec, hex, oct conversions (experimental!)
594
+
595
# Encodes the receiver as its decimal code points separated by "\x00"
# (a NUL character itself is written as "aaa"). Returns nil unless the
# receiver passes utf8?.
def utf8_to_dec
  return nil unless utf8?
  encoded = ""
  scan(/./mu) do |c|
    piece = c =~ /^\x00$/ ? "aaa" : c.unpack("U*").first.to_s # encode \x00 as "aaa"
    encoded << piece << "\x00"
  end
  encoded[0..-2] # drop the trailing separator
end
607
+
608
# Decodes "\x00"-separated decimal code points (the format written by
# utf8_to_dec) back into a string; the token "aaa" stands for NUL.
#
# Returns the receiver when it is empty and nil when the input contains
# anything but digits, "a" and NUL, or does not start with a token.
#
# The validation used to demand a NUL separator after the first number, so
# the encoding of a single character (e.g. "97") was rejected; a lone token
# and a leading "aaa" are now accepted as well.
def dec_to_utf8 # \x00 is encoded as "aaa"
  return self if self.empty?
  return nil unless self =~ /\A(?:[[:digit:]]+|aaa)(?:\x00|\z)/ && self =~ /\A[a[:digit:]\x00]+\z/
  str = ""
  split(/\x00/).each do |c|
    str << (c.eql?("aaa") ? "\x00" : [c.to_i].pack("U*"))
  end
  str
end
621
+
622
+
623
# Encodes the receiver byte-wise: each byte as a decimal number, bytes of
# one character separated by "\x00" and characters separated by "\x00\x00"
# (a NUL character is written as "aaa"). Returns nil unless the receiver
# passes utf8?.
def utf8_to_dec_2
  return nil unless utf8?
  separator = "\x00"
  encoded = ""
  scan(/./mu) do |c|
    if c =~ /^\x00$/
      encoded << "aaa\x00\x00" # encode \x00 as "aaa"
    else
      c.each_byte { |byte| encoded << byte.to_s << separator } # convert to decimal
      encoded << separator
    end
  end
  encoded[0..-3] # drop the trailing character separator
end
639
+
640
# Decodes the byte-wise decimal format written by utf8_to_dec_2: characters
# are separated by "\x00\x00", the bytes of one character by "\x00", and
# "aaa" stands for NUL.
#
# Returns the receiver when it is empty and nil when the input contains
# anything but digits, "a" and NUL, or does not start with a token.
#
# The validation used to require both a NUL after the first number and a
# "\x00\x00" character separator, so the encoding of any single character
# (e.g. "97" or "195\x00177") was rejected; such input is now decoded.
def dec_to_utf8_2 # \x00 is encoded as "aaa"
  return self if self.empty?
  return nil unless self =~ /\A(?:[[:digit:]]+|aaa)(?:\x00|\z)/ && self =~ /\A[a[:digit:]\x00]+\z/
  str = ""
  split(/\x00\x00/).each do |c|
    if c =~ /\x00/
      # multi-byte character: one number per byte
      c.split(/\x00/).each { |x| str << x.to_i.chr }
    elsif c.eql?("aaa")
      str << "\x00"
    else
      str << c.to_i.chr
    end
  end
  str
end
655
+
656
+
657
# Encodes the receiver byte-wise: each byte as an upper-case hexadecimal
# number, bytes of one character separated by "\x00" and characters
# separated by "\x00\x00" (a NUL character is written as "aaa").
# Returns nil unless the receiver passes utf8?.
def utf8_to_hex
  return nil unless utf8?
  separator = "\x00"
  encoded = ""
  scan(/./mu) do |c|
    if c =~ /^\x00$/
      encoded << "aaa\x00\x00" # encode \x00 as "aaa"
    else
      c.each_byte { |byte| encoded << sprintf("%X", byte) << separator } # convert to hexadecimal
      encoded << separator
    end
  end
  encoded[0..-3] # drop the trailing character separator
end
673
+
674
# Decodes the byte-wise hexadecimal format written by utf8_to_hex:
# characters are separated by "\x00\x00", the bytes of one character by
# "\x00", and "aaa" stands for NUL.
#
# Returns the receiver when it is empty and nil when the input contains
# anything but hex digits and NUL, or does not start with a hex token.
#
# The validation used to require both a NUL after the first number and a
# "\x00\x00" character separator, so the encoding of any single character
# (e.g. "61" or "C3\x00B1") was rejected; such input is now decoded.
def hex_to_utf8 # \x00 is encoded as "aaa"
  return self if self.empty?
  return nil unless self =~ /\A[[:xdigit:]]+(?:\x00|\z)/ && self =~ /\A[a[:xdigit:]\x00]+\z/
  str = ""
  split(/\x00\x00/).each do |c|
    if c =~ /\x00/
      # multi-byte character: one number per byte
      c.split(/\x00/).each { |x| str << x.hex.chr }
    elsif c.eql?("aaa")
      str << "\x00"
    else
      str << c.hex.chr
    end
  end
  str
end
689
+
690
+
691
# Encodes the receiver byte-wise: each byte as an octal number, bytes of one
# character separated by "\x00" and characters separated by "\x00\x00"
# (a NUL character is written as "aaa"). Returns nil unless the receiver
# passes utf8?.
def utf8_to_oct
  return nil unless utf8?
  separator = "\x00"
  encoded = ""
  scan(/./mu) do |c|
    if c =~ /^\x00$/
      encoded << "aaa\x00\x00" # encode \x00 as "aaa"
    else
      c.each_byte { |byte| encoded << sprintf("%o", byte) << separator } # convert to octal
      encoded << separator
    end
  end
  encoded[0..-3] # drop the trailing character separator
end
707
+
708
# Decodes the byte-wise octal format written by utf8_to_oct: characters are
# separated by "\x00\x00", the bytes of one character by "\x00", and "aaa"
# stands for NUL.
#
# Returns the receiver when it is empty and nil when the input contains
# anything but digits, "a" and NUL, or does not start with a token.
#
# The validation used to require both a NUL after the first number and a
# "\x00\x00" character separator, so the encoding of any single character
# (e.g. "141" or "303\x00261") was rejected; such input is now decoded.
def oct_to_utf8 # \x00 is encoded as "aaa"
  return self if self.empty?
  return nil unless self =~ /\A(?:[[:digit:]]+|aaa)(?:\x00|\z)/ && self =~ /\A[a[:digit:]\x00]+\z/
  str = ""
  split(/\x00\x00/).each do |c|
    if c =~ /\x00/
      # multi-byte character: one number per byte
      c.split(/\x00/).each { |x| str << x.oct.chr }
    elsif c.eql?("aaa")
      str << "\x00"
    else
      str << c.oct.chr
    end
  end
  str
end
723
+
724
# cf. http://node-0.mneisen.org/2007/03/13/email-subjects-in-utf-8-mit-ruby-kodieren/
#
# Wraps the Base64 encoding of the receiver (newlines removed) in
# "=?utf-8?b?...?=". Returns nil unless the receiver passes utf8?.
def email_subject_utf8
  return nil unless utf8?
  base64 = [self].pack("m").delete("\n")
  "=?utf-8?b?#{base64}?="
end
729
+
730
+ end