banklink_lv 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,730 @@
1
# Demo extension: gives every String a #to_squawk helper.
String.class_eval do
  # Returns the receiver prefixed with "squawk! "; leading and trailing
  # whitespace of the combined text is stripped.
  def to_squawk
    squawked = "squawk! #{self}"
    squawked.strip
  end
end
6
+
7
+ class String
8
+
9
+ require 'iconv'
10
+ require 'open-uri' # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
11
+
12
# Byte-level pattern for well-formed UTF-8 text, taken from:
# http://www.w3.org/International/questions/qa-forms-utf-8
# Note that the ASCII branch only admits tab, LF, CR and printable characters.
# The /n flag makes this a binary regexp, so on Ruby >= 1.9 it must be matched
# against a binary copy of the string (see the helper lambda below).
UTF8REGEX = /\A(?: # ?: non-capturing group (grouping with no back references)
  [\x09\x0A\x0D\x20-\x7E] # ASCII
  | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
  | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
  | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
  | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
  | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
  | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
  | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
)*\z/mnx


# create UTF-8 character arrays (as class instance variables)
#
# mapping tables: - http://www.unicode.org/Public/UCA/latest/allkeys.txt
#                 - http://unicode.org/Public/UNIDATA/UnicodeData.txt
#                 - http://unicode.org/Public/UNIDATA/CaseFolding.txt
#                 - http://www.decodeunicode.org
#                 - ftp://ftp.mars.org/pub/ruby/Unicode.tar.bz2
#                 - http://camomile.sourceforge.net
#                 - Character Palette (Mac OS X)

# Converts a "U+XXXX" label into its UTF-8 character, or nil when the encoded
# bytes do not satisfy UTF8REGEX. The match runs on a binary view of the
# character where String#b exists, because matching a binary regexp against a
# non-ASCII UTF-8 string raises Encoding::CompatibilityError on Ruby >= 1.9.
utf8_char_for = lambda do |label|
  char = [label[2..-1].hex].pack("U*")
  raw = char.respond_to?(:b) ? char.b : char
  raw =~ UTF8REGEX ? char : nil
end

# test data
@small_letters_utf8 = ["U+00F1", "U+00F4", "U+00E6", "U+00F8", "U+00E0", "U+00E1", "U+00E2", "U+00E4", "U+00E5", "U+00E7", "U+00E8", "U+00E9", "U+00EA", "U+00EB", "U+0153"].map(&utf8_char_for)

@capital_letters_utf8 = ["U+00D1", "U+00D4", "U+00C6", "U+00D8", "U+00C0", "U+00C1", "U+00C2", "U+00C4", "U+00C5", "U+00C7", "U+00C8", "U+00C9", "U+00CA", "U+00CB", "U+0152"].map(&utf8_char_for)

@other_letters_utf8 = ["U+03A3", "U+0639", "U+0041", "U+F8D0", "U+F8FF", "U+4E2D", "U+F4EE", "U+00FE", "U+10FFFF", "U+00A9", "U+20AC", "U+221E", "U+20AC", "U+FEFF", "U+FFFD", "U+00FF", "U+00FE", "U+FFFE", "U+FEFF"].map(&utf8_char_for)

# A nil entry means a code point failed validation. Array#nitems was removed
# in Ruby 1.9, so the non-nil count is taken with compact.size instead.
if @small_letters_utf8.size != @small_letters_utf8.compact.size then raise "Invalid UTF-8 char in @small_letters_utf8!" end
if @capital_letters_utf8.size != @capital_letters_utf8.compact.size then raise "Invalid UTF-8 char in @capital_letters_utf8!" end
if @other_letters_utf8.size != @other_letters_utf8.compact.size then raise "Invalid UTF-8 char in @other_letters_utf8!" end
48
+
49
+
50
# Code points read from UnicodeData.txt would be collected here; the download
# below is commented out, so the list stays empty.
@unicode_array = []
#open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f| f.each(nil) { |line| line.scan(/^[^;]+/) { |u| @unicode_array << u } } end
#open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f|
#  f.each do |line| line =~ /LATIN|GREEK|CYRILLIC/ ? ( line.scan(/^[^;]+/) { |u| @unicode_array << u } ) : next end
#end

#@letters_utf8 = @unicode_array.map { |x| u = [x.hex].pack("U*"); u =~ UTF8REGEX ? u : nil }.compact # code points from UnicodeData.txt
# test data only: the three hand-written groups, concatenated in order
@letters_utf8 = [@small_letters_utf8, @capital_letters_utf8, @other_letters_utf8].inject([]) { |all, group| all + group }

# capital letter => small letter, paired by position in the two arrays
@downcase_table_utf8 = {}
@capital_letters_utf8.each_with_index do |capital, idx|
  @downcase_table_utf8[capital] = @small_letters_utf8[idx]
end

# small letter => capital letter, paired by position in the two arrays
@upcase_table_utf8 = {}
@small_letters_utf8.each_with_index do |small, idx|
  @upcase_table_utf8[small] = @capital_letters_utf8[idx]
end

# every known letter as a key with a nil value #=> ... "\341\272\242"=>nil ...
@letters_utf8_hash = {}
@letters_utf8.each { |letter| @letters_utf8_hash[letter] = nil }
63
+
64
# Singleton-level readers and writers for the class instance variables that
# hold the letter arrays and the case-mapping tables.
class << self
  attr_accessor :small_letters_utf8, :capital_letters_utf8, :other_letters_utf8,
                :letters_utf8, :letters_utf8_hash, :unicode_array,
                :downcase_table_utf8, :upcase_table_utf8
end
74
+
75
+
76
# Yields every character of the receiver (newlines included) to the block,
# one at a time, using a UTF-8 aware regexp.
def each_utf8_char
  scan(/./mu) do |char|
    yield char
  end
end
79
+
80
# Yields each character together with its zero-based character position.
def each_utf8_char_with_index
  position = 0
  scan(/./mu) do |char|
    yield(char, position)
    position += 1
  end
end
84
+
85
# Returns the number of characters (not bytes) in the receiver.
# Counts inside the scan block rather than building an array of characters.
def length_utf8
  chars_seen = 0
  scan(/./mu) { |_char| chars_seen = chars_seen.succ }
  chars_seen
end
alias size_utf8 length_utf8
92
+
93
# Returns a new string with the characters (not the bytes) in reverse order.
def reverse_utf8
  chars = split(//mu)
  chars.reverse.join
end
96
+
97
# Reverses the receiver's characters in place and returns the receiver.
# The earlier implementation reversed a temporary array and returned a new
# string, leaving the receiver untouched despite the bang name.
def reverse_utf8!
  replace(split(//mu).reverse.join)
end
100
+
101
# Returns a copy with the case of every character swapped. Characters found
# in String.downcase_table_utf8 or String.upcase_table_utf8 use the table
# entry; everything else falls back to String#swapcase.
def swapcase_utf8
  gsub(/./mu) do |char|
    lowered = String.downcase_table_utf8[char]
    next lowered unless lowered.nil?

    raised = String.upcase_table_utf8[char]
    raised.nil? ? char.swapcase : raised
  end
end
109
+
110
# In-place variant of swapcase_utf8. Returns the result of gsub!, which is
# nil when no substitution was performed (i.e. for an empty receiver).
def swapcase_utf8!
  gsub!(/./mu) do |char|
    lowered = String.downcase_table_utf8[char]
    next lowered unless lowered.nil?

    raised = String.upcase_table_utf8[char]
    next raised unless raised.nil?

    char.swapcase
  end
end
117
+
118
# Returns a lower-cased copy. Characters listed in String.downcase_table_utf8
# are mapped through the table; all others go through String#downcase.
def downcase_utf8
  gsub(/./mu) do |char|
    mapped = String.downcase_table_utf8[char]
    if mapped.nil? then char.downcase else mapped end
  end
end
124
+
125
# In-place variant of downcase_utf8. Returns the result of gsub!, which is
# nil when the receiver is empty.
def downcase_utf8!
  gsub!(/./mu) do |char|
    mapped = String.downcase_table_utf8[char]
    if mapped.nil? then char.downcase else mapped end
  end
end
131
+
132
# Returns an upper-cased copy. Characters listed in String.upcase_table_utf8
# are mapped through the table; all others go through String#upcase.
def upcase_utf8
  gsub(/./mu) do |char|
    mapped = String.upcase_table_utf8[char]
    if mapped.nil? then char.upcase else mapped end
  end
end
138
+
139
# In-place variant of upcase_utf8. Returns the result of gsub!, which is nil
# when the receiver is empty.
def upcase_utf8!
  gsub!(/./mu) do |char|
    mapped = String.upcase_table_utf8[char]
    if mapped.nil? then char.upcase else mapped end
  end
end
145
+
146
# Counts the characters of the receiver that belong to the set +c+.
# +c+ is placed verbatim inside a regexp character class, so ranges ("a-z")
# and negation ("^a") behave as they do in a regexp. Returns nil for an
# empty set.
def count_utf8(c)
  if c.empty?
    nil
  else
    scan(/[#{c}]/mu).size
  end
end
151
+
152
# Returns a copy with every character belonging to the set +c+ removed.
# +c+ is placed verbatim inside a regexp character class. An empty set
# returns the receiver itself.
def delete_utf8(c)
  if c.empty?
    self
  else
    gsub(/[#{c}]/mu, '')
  end
end
157
+
158
# Removes, in place, every character belonging to the set +c+.
# Returns the receiver for an empty set; otherwise the result of gsub!,
# which is nil when nothing was removed.
def delete_utf8!(c)
  if c.empty?
    self
  else
    gsub!(/[#{c}]/mu, '')
  end
end
163
+
164
# Returns the first character of the receiver, or nil when it is empty.
def first_utf8
  slice(/\A./mu)
end
167
+
168
# Returns the last character of the receiver, or nil when it is empty.
def last_utf8
  slice(/.\z/mu)
end
171
+
172
# Returns a copy in which every space-separated word has its first character
# upper-cased and its remaining characters lower-cased.
#
# A receiver consisting only of whitespace is returned as is. Words are split
# on single spaces (\x20), so runs of spaces inside the text are preserved
# while trailing spaces are dropped (String#split discards trailing empties).
#
# Fixes: the previous version threw away the result of the non-destructive
# gsub and therefore returned the text unchanged; it also decided how to
# lower-case a character by looking it up in the *upcase* table, so capital
# non-ASCII letters after the first position were never lower-cased.
def capitalize_utf8
  return self if self =~ /\A[[:space:]]*\z/m

  words = split(/\x20/).map do |word|
    position = 0
    word.gsub(/./mu) do |char|
      position += 1
      # upcase_utf8/downcase_utf8 already fall back to upcase/downcase
      # for characters that are missing from the tables
      position == 1 ? char.upcase_utf8 : char.downcase_utf8
    end
  end
  words.join(' ')
end
190
+
191
# In-place variant: capitalizes every space-separated word of the receiver
# (first character upper-cased, the rest lower-cased) and returns the
# receiver. A whitespace-only receiver is returned untouched.
#
# Fixes: the previous version built the capitalized text but never wrote it
# back to the receiver, and it consulted the upcase table when deciding how to
# lower-case, so capital non-ASCII letters after the first position stayed
# capital.
def capitalize_utf8!
  return self if self =~ /\A[[:space:]]*\z/m

  words = split(/\x20/).map do |word|
    position = 0
    word.gsub(/./mu) do |char|
      position += 1
      position == 1 ? char.upcase_utf8 : char.downcase_utf8
    end
  end
  replace(words.join(' '))
end
209
+
210
+
211
# Returns the character index (not the byte offset) of the first match of +s+
# in the receiver, or nil when there is no match.
#
# s - a Regexp, or a String that is searched for literally.
#
# Returns nil for an empty receiver, an empty pattern, or any other
# argument type.
#
# Fixes:
# * a match at the very start used to yield nil instead of 0, because an
#   empty pre-match string was mistaken for "no match";
# * the regexp is no longer rebuilt through eval of a "%r{...}" string (which
#   broke on unbalanced braces); interpolating the Regexp keeps its own
#   options and the outer /u adds UTF-8 awareness;
# * a String argument is escaped, so "." finds a dot rather than any character.
def index_utf8(s)
  return nil if empty?

  pattern =
    if s.is_a?(Regexp)
      s.source.empty? ? nil : /#{s}/u
    elsif s.is_a?(String)
      s.empty? ? nil : /#{Regexp.escape(s)}/u
    end
  return nil if pattern.nil?

  found = pattern.match(self)
  # count the characters in front of the match to get a character index
  found ? found.pre_match.scan(/./mu).size : nil
end
264
+
265
+
266
# Returns the character index (not the byte offset) of the last match of +s+
# in the receiver, or nil when there is no match.
#
# s - a Regexp, or a String that is searched for literally.
#
# Matches are enumerated left to right without overlap (String#scan), and the
# position of the final one is reported.
#
# Fixes:
# * when the last match sat at the very start, nil was returned instead of 0,
#   because an empty pre-match string was mistaken for "no match";
# * the regexp is no longer rebuilt through eval of a "%r{...}" string;
# * a String argument is escaped instead of being used as a raw pattern.
def rindex_utf8(s)
  return nil if empty?

  pattern =
    if s.is_a?(Regexp)
      s.source.empty? ? nil : /#{s}/u
    elsif s.is_a?(String)
      s.empty? ? nil : /#{Regexp.escape(s)}/u
    end
  return nil if pattern.nil?

  prefix = nil # stays nil until at least one match has been seen
  scan(pattern) { prefix = Regexp.last_match.pre_match }
  prefix.nil? ? nil : prefix.scan(/./mu).size
end
292
+
293
+
294
+ # note that the i option does not work in special cases with back references
295
+ # example: "àÀ".slice_utf8(/(.).*?\1/i) returns nil whereas "aA".slice(/(.).*?\1/i) returns "aA"
296
# UTF-8 aware String#slice for regexps: returns the first match of +regex+,
# or nil when nothing matches.
#
# The pattern is wrapped in a /u regexp. Interpolating the Regexp keeps its
# own options (i, m, x) and does not add a capture group, so back references
# keep their numbering. This replaces the earlier eval of a "%r{...}" string,
# which raised SyntaxError for sources containing unbalanced braces.
def slice_utf8(regex)
  slice(/#{regex}/u)
end
304
+
305
# UTF-8 aware String#slice! for regexps: removes the first match of +regex+
# from the receiver and returns the removed text (nil when nothing matches).
#
# The pattern is wrapped in a /u regexp by interpolation, which keeps the
# original options and group numbering. This replaces the earlier eval of a
# "%r{...}" string, which raised SyntaxError for sources containing
# unbalanced braces.
def slice_utf8!(regex)
  slice!(/#{regex}/u)
end
313
+
314
# Returns up to +l+ characters starting at character position +p+.
#
# p - Integer start index; a negative value counts from the end, and a
#     negative value reaching beyond the start is clamped to 0.
# l - Integer number of characters wanted.
#
# Returns nil when +l+ exceeds the receiver's character length or +p+ lies
# past the last character (so an empty receiver always gives nil), and ""
# when +l+ is zero or negative.
# Raises ArgumentError when either argument is not an Integer.
#
# The type check used to compare against Fixnum, a constant that was removed
# in Ruby 3.2; Integer covers the same values on every Ruby version.
def cut_utf8(p,l) # (index) position, length
  unless p.is_a?(Integer) && l.is_a?(Integer)
    raise(ArgumentError, "Error: argument is not Fixnum", caller)
  end

  chars = scan(/./mu)
  s = chars.size
  if p < 0 then p = p.abs > s ? 0 : s - p.abs end
  return nil if l > s || p > (s - 1)
  return "" if l <= 0

  chars[p, l].join
end
328
+
329
# Tells whether the receiver begins with +s+, compared character-wise.
# Returns nil when either string is empty.
def starts_with_utf8?(s)
  if self.empty? || s.empty?
    nil
  else
    prefix = cut_utf8(0, s.size_utf8)
    prefix == s
  end
end
333
+
334
# Tells whether the receiver ends with +s+, compared character-wise.
# Returns nil when either string is empty.
def ends_with_utf8?(s)
  if self.empty? || s.empty?
    nil
  else
    wanted = s.size_utf8
    suffix = cut_utf8(-wanted, wanted)
    suffix == s
  end
end
338
+
339
# Returns a new string with +s+ inserted in front of character position +i+.
#
# i - Integer character index; a negative value counts from the end (clamped
#     to 0 when it reaches beyond the start). An index past the last
#     character pads the text with spaces up to that index before appending.
# s - String to insert.
#
# Returns the receiver itself when +s+ is empty, and +s+ itself when the
# receiver is empty.
#
# Fix: padding used to be appended with `self << spaces`, which silently
# modified the receiver whenever the index lay past the end. The receiver is
# now left untouched.
def insert_utf8(i,s) # insert_utf8(index, string)
  return self if s.empty?

  chars = scan(/./mu)
  l = chars.size
  return s if l == 0

  if i < 0 then i = i.abs > l ? 0 : l - i.abs end

  if i > (l - 1)
    # at or past the end: pad with spaces up to index i, then append
    chars.join + (" " * (i - l)) + s
  else
    chars[0, i].join + s + chars[i..-1].join
  end
end
352
+
353
# UTF-8 aware String#split: splits the receiver on +regex+ and returns the
# pieces as an array.
#
# The pattern is wrapped in a /u regexp by interpolation, which keeps the
# original options and group numbering. This replaces the earlier eval of a
# "%r{...}" string, which raised SyntaxError for sources containing
# unbalanced braces.
def split_utf8(regex)
  split(/#{regex}/u)
end
361
+
362
# UTF-8 aware String#scan. Without a block it returns the array of matches;
# with a block each match is yielded (capture groups arrive as separate
# block arguments) and the value of String#scan is returned.
#
# The pattern is wrapped in a /u regexp by interpolation, which keeps the
# original options and group numbering. This replaces the earlier eval of a
# "%r{...}" string, which raised SyntaxError for sources containing
# unbalanced braces.
def scan_utf8(regex)
  pattern = /#{regex}/u
  if block_given?
    scan(pattern) { |a, *m| yield(a, *m) }
  else
    scan(pattern)
  end
end
370
+
371
# Returns the characters of the receiver selected by the Range +r+, counted in
# characters rather than bytes. Negative bounds count from the end.
#
# The bounds and the number of dots are read from the range's string form.
# Returns nil when +r+ is not a Range, when a bound's magnitude exceeds the
# character length, when the separator is not ".." or "...", or when the
# (normalized) start lies after the end.
def range_utf8(r)
  return nil if r.class != Range

  text = r.to_s
  a = text[/^[\+\-]?\d+/].to_i
  b = text[/[\+\-]?\d+$/].to_i
  d = text[/\.+/].size # 2 for an inclusive range, 3 for an exclusive one

  chars = scan(/./mu)
  l = chars.size

  return nil if b.abs > l || a.abs > l || d < 2 || d > 3

  a = l - a.abs if a < 0
  b = l - b.abs if b < 0

  return nil if a > b

  last_wanted = d == 2 ? b : b - 1
  str = ""
  chars.each_with_index do |c, i|
    str << c if i >= a && i <= last_wanted
  end
  str
end
405
+
406
# Checks the receiver against UTF8REGEX. Returns 0 (the match position, which
# is truthy) when every byte sequence is accepted by the pattern, nil
# otherwise.
#
# UTF8REGEX is a binary (/n) regexp. On Ruby >= 1.9 matching it against a
# UTF-8 tagged string with non-ASCII characters raises
# Encoding::CompatibilityError, so the match is done on a binary copy where
# String#b is available; older Rubies match the raw bytes directly.
def utf8?
  bytes = respond_to?(:b) ? b : self
  bytes =~ UTF8REGEX
end
409
+
410
# Returns a new string containing only those characters of the receiver that
# satisfy UTF8REGEX; everything else is dropped.
#
# Each character is matched through a binary copy where String#b exists,
# because UTF8REGEX is a binary (/n) regexp and matching it against a UTF-8
# tagged non-ASCII character raises Encoding::CompatibilityError on
# Ruby >= 1.9.
# NOTE(review): on Ruby >= 1.9 the scan itself rejects receivers containing
# malformed byte sequences; confirm whether callers rely on cleaning those.
def clean_utf8
  t = ""
  scan(/./um) do |c|
    bytes = c.respond_to?(:b) ? c.b : c
    t << c if bytes =~ UTF8REGEX
  end
  t
end
415
+
416
+
417
# Checks (or rather guesses) whether the file or web page named by the
# receiver is UTF-8 encoded. Experimental, so use at your own risk!
#
# The receiver is either an http(s) URL or a local file path.
#
# * URL on Mac OS X: probes the page with curl, trusts a charset/encoding
#   declaration found in the page if there is one, and otherwise downloads
#   the page to ~/Desktop/html and validates its bytes with utf8?.
# * URL elsewhere: reads the page through open-uri, trusts the charset it
#   reports if that is UTF-8, and otherwise validates the body line by line.
# * Local path: false unless it is a regular file; on Mac OS X `file -ik` is
#   consulted first; then the file is read line by line, returning true at
#   the first UTF-8 charset declaration and false at the first line that
#   fails utf8?.
#
# Returns true or false. Raises RuntimeError when the site cannot be reached
# or the download file cannot be created (Mac OS X URL branch).
#
# Changes from the previous version:
# * the URL/path is quoted with Shellwords.escape before being placed in a
#   shell command (it used to be interpolated verbatim);
# * https URLs are handled like http ones instead of being treated as paths;
# * the connection probe uses curl's real exit status;
# * debug output via `p` and unreachable code after early returns are gone;
# * String#each (removed in Ruby 1.9) is no longer used, and the downloaded
#   file is read with File.read so the handle is closed;
# * URI.open is used where available, since Kernel#open no longer accepts
#   URLs on Ruby >= 3.0.
def utf8_encoded_file?
  require 'shellwords' # stdlib; only needed by this method

  file = self

  unless file =~ /^https?:\/\//
    # ---- local file ----
    return false unless File.file?(file)

    if RUBY_PLATFORM =~ /darwin/i
      return true if %x{ /usr/bin/file -ik #{Shellwords.escape(file)} } =~ /utf-?8/i
    end

    # check each line of the file
    File.foreach(file) do |line|
      return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i
      return false unless line.utf8?
    end
    return true
  end

  if RUBY_PLATFORM =~ /darwin/i # Mac OS X 10.4.10
    seconds = 30
    url = Shellwords.escape(file)
    curl = "/usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds + 10}"

    # check if web site is reachable
    %x{ #{curl} -I #{url} }
    status = $?.exitstatus
    raise "Failed to create connection to web site: #{file} -- curl error code: #{status} -- " unless status == 0

    # first charset/encoding declaration found in the page, if any
    declared = %x{ #{curl} #{url} | /usr/bin/grep -Eo -m 1 "(charset|encoding)=[\\"']?[^\\"'>]+" | /usr/bin/grep -Eo "[^=\\"'>]+$" }
    return true if declared =~ /utf-?8/i
    return false unless declared.empty?

    # no declaration: download the page and validate its bytes
    downloaded_file = File.expand_path("~/Desktop/html")
    target = Shellwords.escape(downloaded_file)
    %x{ /usr/bin/touch #{target} 2>/dev/null }
    raise "No valid HTML download file (path) specified!" unless File.file?(downloaded_file)
    %x{ #{curl} -o #{target} #{url} }

    File.read(downloaded_file).utf8? ? true : false
  else
    # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
    fetch = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
    fetch.call(file) do |f|
      return true if f.charset =~ /utf-?8/i
      f.each_line { |line| return false unless line.utf8? }
    end
    true
  end
end
532
+
533
+
534
+ # cf. Paul Battley, http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
535
# Runs the receiver through a UTF-8 to UTF-8 Iconv conversion with //IGNORE.
# A space is appended before converting and removed again afterwards.
def validate_utf8
  padded = self + ' '
  converted = Iconv.iconv('UTF-8//IGNORE', 'UTF-8', padded).first
  converted[0..-2]
end
538
+
539
+ # cf. Paul Battley, http://www.ruby-forum.com/topic/70357
540
# Transliterates the receiver to US-ASCII through Iconv (//IGNORE//TRANSLIT)
# and then removes punctuation found inside words, except "-" in words such
# as up-to-date. Returns nil when the receiver fails utf8?.
def asciify_utf8
  return nil unless self.utf8?
  #Iconv.iconv('US-ASCII//IGNORE//TRANSLIT', 'UTF-8', (self + ' ') ).first[0..-2]
  padded = self + ' '
  ascii = Iconv.iconv('US-ASCII//IGNORE//TRANSLIT', 'UTF-8', padded).first[0..-2]
  # delete all punctuation characters inside words except "-"
  ascii.gsub(/(?!-.*)\b[[:punct:]]+\b/, '')
end
546
+
547
# Converts the receiver from ISO-8859-1 to UTF-8 through Iconv.
# Returns the converted string, or nil when it fails utf8?.
def latin1_to_utf8 # ISO-8859-1 to UTF-8
  padded = self + "\x20"
  converted = Iconv.iconv("UTF-8//IGNORE", "ISO-8859-1", padded).first[0..-2]
  if converted.utf8? then converted else nil end
end
551
+
552
# Converts the receiver from CP1252 (WINDOWS-1252) to UTF-8 through Iconv.
# Returns the converted string, or nil when it fails utf8?.
def cp1252_to_utf8 # CP1252 (WINDOWS-1252) to UTF-8
  padded = self + "\x20"
  converted = Iconv.iconv("UTF-8//IGNORE", "CP1252", padded).first[0..-2]
  if converted.utf8? then converted else nil end
end
556
+
557
+ # cf. Paul Battley, http://www.ruby-forum.com/topic/70357
558
# Converts the receiver from UTF-16LE to UTF-8 through Iconv. Only an even
# number of leading bytes is used, and a two-byte NUL pad is appended before
# converting. Returns the converted string, or nil when it fails utf8?.
def utf16le_to_utf8
  even_bytes = self[0, (self.length / 2 * 2)]
  converted = Iconv.iconv('UTF-8//IGNORE', 'UTF-16LE', even_bytes + "\000\000").first[0..-2]
  # drop a NUL left over from the pad, if there is one
  converted.sub!(/\x00\z/, '')
  converted.utf8? ? converted : nil
end
563
+
564
# Converts the receiver from UTF-8 to UTF-16LE through Iconv.
# Returns nil when the receiver fails utf8?.
def utf8_to_utf16le
  return nil unless self.utf8?
  Iconv.iconv('UTF-16LE//IGNORE', 'UTF-8', self).first
end
568
+
569
# Renders every character as "U+XXXX" (upper-case hex code point, at least
# four digits) and concatenates the results.
# Returns nil when the receiver fails utf8?.
def utf8_to_unicode
  return nil unless self.utf8?
  labels = scan(/./mu).map do |char|
    "U+" + sprintf("%04X", char.unpack("U*").first)
  end
  labels.join
end
575
+
576
# Replaces every "U+XXXX" label in the receiver with the character it names
# and copies all other characters through, so mixed strings such as
# "U+00bfHabla espaU+00f1ol?" are supported.
# A whitespace-only receiver is returned as is. Returns nil when the decoded
# text fails utf8?.
def unicode_to_utf8
  return self if self =~ /\A[[:space:]]*\z/m
  decoded = ""
  #scan(/U\+([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})/) { |u| str << [u.first.hex].pack("U*") }
  #scan(/U\+([[:digit:][:xdigit:]]{4,5}|10[[:digit:][:xdigit:]]{4})/) { |u| str << [u.first.hex].pack("U*") }
  scan(/(U\+(?:[[:digit:][:xdigit:]]{4,5}|10[[:digit:][:xdigit:]]{4})|.)/mu) do |groups|
    token = groups.first # either a "U+..." label or a single character
    decoded << (token =~ /^U\+/ ? [token[2..-1].hex].pack("U*") : token)
  end
  decoded.utf8? ? decoded : nil
end
591
+
592
+
593
+ # dec, hex, oct conversions (experimental!)
594
+
595
# Encodes the receiver as decimal code points separated by "\x00".
# A NUL character is written as the marker "aaa".
# Returns nil when the receiver fails utf8?.
def utf8_to_dec
  return nil unless self.utf8?
  codes = scan(/./mu).map do |char|
    if char =~ /^\x00$/
      "aaa" # encode \x00 as "aaa"
    else
      char.unpack("U*").first.to_s # decimal code point
    end
  end
  codes.join("\x00")
end
607
+
608
# Decodes a string of decimal code points separated by "\x00" (the format
# written by utf8_to_dec) back into UTF-8 text. The marker "aaa" stands for
# a NUL character.
#
# Returns the receiver itself when it is empty, and nil when the text does
# not start with a code (or the "aaa" marker) or contains characters other
# than digits, "a" and "\x00".
#
# Fix: the format check used to demand a "\x00" separator after the first
# code, so the encoding of a single character ("97") was rejected and
# utf8_to_dec could not be round-tripped; a leading "aaa" marker was rejected
# as well. Every input accepted before is still accepted.
def dec_to_utf8 # \x00 is encoded as "aaa"
  return self if empty?
  return nil unless self =~ /\A(?:aaa|[[:digit:]]+)(?:\x00|\z)/ && self =~ /\A[a[:digit:]\x00]+\z/

  chars = split(/\x00/).map do |code|
    code.eql?("aaa") ? "\x00" : [code.to_i].pack("U*")
  end
  chars.join
end
621
+
622
+
623
# Encodes the receiver byte-wise in decimal: the bytes of one character are
# separated by "\x00" and characters are separated by "\x00\x00".
# A NUL character is written as the marker "aaa".
# Returns nil when the receiver fails utf8?.
def utf8_to_dec_2
  return nil unless self.utf8?
  encoded_chars = scan(/./mu).map do |char|
    next "aaa" if char =~ /^\x00$/ # encode \x00 as "aaa"

    char.unpack("C*").map { |byte| byte.to_s }.join("\x00") # decimal bytes
  end
  encoded_chars.join("\x00\x00")
end
639
+
640
# Decodes the byte-wise decimal format written by utf8_to_dec_2: bytes of one
# character are separated by "\x00", characters by "\x00\x00", and the marker
# "aaa" stands for a NUL character.
#
# Returns the receiver itself when it is empty, and nil when the text does
# not start with a code (or the "aaa" marker) or contains characters other
# than digits, "a" and "\x00".
#
# Fixes:
# * the format check used to demand both a "\x00" after the first code and a
#   "\x00\x00" separator somewhere, so the encoding of any single character
#   was rejected and utf8_to_dec_2 could not be round-tripped;
# * on Ruby >= 1.9 the bytes produced by Integer#chr are binary, so the
#   result is re-tagged as UTF-8 where String#force_encoding exists.
def dec_to_utf8_2 # \x00 is encoded as "aaa"
  return self if empty?
  return nil unless self =~ /\A(?:aaa|[[:digit:]]+)(?:\x00|\z)/ && self =~ /\A[a[:digit:]\x00]+\z/

  decoded = split(/\x00\x00/).map do |char_code|
    if char_code.eql?("aaa")
      "\x00"
    else
      byte_codes = char_code =~ /\x00/ ? char_code.split(/\x00/) : [char_code]
      byte_codes.map { |byte| byte.to_i.chr }.join
    end
  end.join
  decoded.respond_to?(:force_encoding) ? decoded.force_encoding("UTF-8") : decoded
end
655
+
656
+
657
# Encodes the receiver byte-wise in upper-case hexadecimal: the bytes of one
# character are separated by "\x00" and characters by "\x00\x00".
# A NUL character is written as the marker "aaa".
# Returns nil when the receiver fails utf8?.
def utf8_to_hex
  return nil unless self.utf8?
  encoded_chars = scan(/./mu).map do |char|
    next "aaa" if char =~ /^\x00$/ # encode \x00 as "aaa"

    char.unpack("C*").map { |byte| sprintf("%X", byte) }.join("\x00") # hexadecimal bytes
  end
  encoded_chars.join("\x00\x00")
end
673
+
674
# Decodes the byte-wise hexadecimal format written by utf8_to_hex: bytes of
# one character are separated by "\x00", characters by "\x00\x00", and the
# marker "aaa" stands for a NUL character.
#
# Returns the receiver itself when it is empty, and nil when the text does
# not start with a hex code or contains characters other than hex digits and
# "\x00".
#
# Fixes:
# * the format check used to demand both a "\x00" after the first code and a
#   "\x00\x00" separator somewhere, so the encoding of any single character
#   was rejected and utf8_to_hex could not be round-tripped;
# * on Ruby >= 1.9 the bytes produced by Integer#chr are binary, so the
#   result is re-tagged as UTF-8 where String#force_encoding exists.
def hex_to_utf8 # \x00 is encoded as "aaa"
  return self if empty?
  return nil unless self =~ /\A[[:xdigit:]]+(?:\x00|\z)/ && self =~ /\A[a[:xdigit:]\x00]+\z/

  decoded = split(/\x00\x00/).map do |char_code|
    if char_code.eql?("aaa")
      "\x00"
    else
      byte_codes = char_code =~ /\x00/ ? char_code.split(/\x00/) : [char_code]
      byte_codes.map { |byte| byte.hex.chr }.join
    end
  end.join
  decoded.respond_to?(:force_encoding) ? decoded.force_encoding("UTF-8") : decoded
end
689
+
690
+
691
# Encodes the receiver byte-wise in octal: the bytes of one character are
# separated by "\x00" and characters by "\x00\x00".
# A NUL character is written as the marker "aaa".
# Returns nil when the receiver fails utf8?.
def utf8_to_oct
  return nil unless self.utf8?
  encoded_chars = scan(/./mu).map do |char|
    next "aaa" if char =~ /^\x00$/ # encode \x00 as "aaa"

    char.unpack("C*").map { |byte| sprintf("%o", byte) }.join("\x00") # octal bytes
  end
  encoded_chars.join("\x00\x00")
end
707
+
708
# Decodes the byte-wise octal format written by utf8_to_oct: bytes of one
# character are separated by "\x00", characters by "\x00\x00", and the marker
# "aaa" stands for a NUL character.
#
# Returns the receiver itself when it is empty, and nil when the text does
# not start with a code (or the "aaa" marker) or contains characters other
# than digits, "a" and "\x00".
#
# Fixes:
# * the format check used to demand both a "\x00" after the first code and a
#   "\x00\x00" separator somewhere, so the encoding of any single character
#   was rejected and utf8_to_oct could not be round-tripped;
# * on Ruby >= 1.9 the bytes produced by Integer#chr are binary, so the
#   result is re-tagged as UTF-8 where String#force_encoding exists.
def oct_to_utf8 # \x00 is encoded as "aaa"
  return self if empty?
  return nil unless self =~ /\A(?:aaa|[[:digit:]]+)(?:\x00|\z)/ && self =~ /\A[a[:digit:]\x00]+\z/

  decoded = split(/\x00\x00/).map do |char_code|
    if char_code.eql?("aaa")
      "\x00"
    else
      byte_codes = char_code =~ /\x00/ ? char_code.split(/\x00/) : [char_code]
      byte_codes.map { |byte| byte.oct.chr }.join
    end
  end.join
  decoded.respond_to?(:force_encoding) ? decoded.force_encoding("UTF-8") : decoded
end
723
+
724
+ # cf. http://node-0.mneisen.org/2007/03/13/email-subjects-in-utf-8-mit-ruby-kodieren/
725
# Wraps the receiver as a Base64 "encoded-word" for a mail subject:
# "=?utf-8?b?<base64>?=", with the line breaks that pack("m") inserts removed.
# Returns nil when the receiver fails utf8?.
def email_subject_utf8
  return nil unless self.utf8?
  base64 = [self].pack("m").delete("\n")
  "=?utf-8?b?" + base64 + "?="
end
729
+
730
+ end