prawn-arabic 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: prawn-arabic
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dynamix Solutions
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-04-19 00:00:00.000000000 Z
11
+ date: 2017-04-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: iconv
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: pry
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
27
41
  description: Gem which improve workflow with arabic text
28
42
  email: ahmed.nasser@dynamix-systems.com
29
43
  executables: []
@@ -36,7 +50,6 @@ files:
36
50
  - README.md
37
51
  - Rakefile
38
52
  - lib/prawn-arabic.rb
39
- - lib/string_utf_support.rb
40
53
  homepage: https://github.com/ozeron/arabic-prawn
41
54
  licenses:
42
55
  - MIT
@@ -1,730 +0,0 @@
1
- # encoding: ascii-8bit
2
- class String
3
-
4
- require 'iconv'
5
- require 'open-uri' # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
6
-
7
- # taken from: http://www.w3.org/International/questions/qa-forms-utf-8
8
- # UTF8REGEX = /\A(?: # ?: non-capturing group (grouping with no back references)
9
- # [\x09\x0A\x0D\x20-\x7E] # ASCII
10
- # | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
11
- # | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
12
- # | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
13
- # | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
14
- # | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
15
- # | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
16
- # | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
17
- # )*\z/mxn
18
-
19
-
20
- # create UTF-8 character arrays (as class instance variables)
21
- #
22
- # mapping tables: - http://www.unicode.org/Public/UCA/latest/allkeys.txt
23
- # - http://unicode.org/Public/UNIDATA/UnicodeData.txt
24
- # - http://unicode.org/Public/UNIDATA/CaseFolding.txt
25
- # - http://www.decodeunicode.org
26
- # - ftp://ftp.mars.org/pub/ruby/Unicode.tar.bz2
27
- # - http://camomile.sourceforge.net
28
- # - Character Palette (Mac OS X)
29
-
30
-
31
- # test data
32
- # @small_letters_utf8 = ["U+00F1", "U+00F4", "U+00E6", "U+00F8", "U+00E0", "U+00E1", "U+00E2", "U+00E4", "U+00E5", "U+00E7", "U+00E8", "U+00E9", "U+00EA", "U+00EB", "U+0153"].map { |x| u = [x[2..-1].hex].pack("U*"); u =~ UTF8REGEX ? u : nil }
33
- @small_letters_utf8 = ["U+00F1", "U+00F4", "U+00E6", "U+00F8", "U+00E0", "U+00E1", "U+00E2", "U+00E4", "U+00E5", "U+00E7", "U+00E8", "U+00E9", "U+00EA", "U+00EB", "U+0153"].map { |x| u = [x[2..-1].hex].pack("U*"); u.valid_encoding? ? u : nil}
34
-
35
-
36
- # @capital_letters_utf8 = ["U+00D1", "U+00D4", "U+00C6", "U+00D8", "U+00C0", "U+00C1", "U+00C2", "U+00C4", "U+00C5", "U+00C7", "U+00C8", "U+00C9", "U+00CA", "U+00CB", "U+0152"].map { |x| u = [x[2..-1].hex].pack("U*"); u =~ UTF8REGEX ? u : nil }
37
- @capital_letters_utf8 = ["U+00D1", "U+00D4", "U+00C6", "U+00D8", "U+00C0", "U+00C1", "U+00C2", "U+00C4", "U+00C5", "U+00C7", "U+00C8", "U+00C9", "U+00CA", "U+00CB", "U+0152"].map { |x| u = [x[2..-1].hex].pack("U*"); u.valid_encoding? ? u : nil }
38
-
39
-
40
- # @other_letters_utf8 = ["U+03A3", "U+0639", "U+0041", "U+F8D0", "U+F8FF", "U+4E2D", "U+F4EE", "U+00FE", "U+10FFFF", "U+00A9", "U+20AC", "U+221E", "U+20AC", "U+FEFF", "U+FFFD", "U+00FF", "U+00FE", "U+FFFE", "U+FEFF"].map { |x| u = [x[2..-1].hex].pack("U*"); u =~ UTF8REGEX ? u : nil }
41
- @other_letters_utf8 = ["U+03A3", "U+0639", "U+0041", "U+F8D0", "U+F8FF", "U+4E2D", "U+F4EE", "U+00FE", "U+10FFFF", "U+00A9", "U+20AC", "U+221E", "U+20AC", "U+FEFF", "U+FFFD", "U+00FF", "U+00FE", "U+FFFE", "U+FEFF"].map { |x| u = [x[2..-1].hex].pack("U*"); u.valid_encoding? ? u : nil }
42
-
43
- if @small_letters_utf8.size != @small_letters_utf8.count{|x| !x.nil?} then raise "Invalid UTF-8 char in @small_letters_utf8!" end
44
- if @capital_letters_utf8.size != @capital_letters_utf8.count{|x| !x.nil?} then raise "Invalid UTF-8 char in @capital_letters_utf8!" end
45
- if @other_letters_utf8.size != @other_letters_utf8.count{|x| !x.nil?} then raise "Invalid UTF-8 char in @other_letters_utf8!" end
46
-
47
-
48
- @unicode_array = []
49
- #open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f| f.each(nil) { |line| line.scan(/^[^;]+/) { |u| @unicode_array << u } } end
50
- #open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f|
51
- # f.each do |line| line =~ /LATIN|GREEK|CYRILLIC/ ? ( line.scan(/^[^;]+/) { |u| @unicode_array << u } ) : next end
52
- #end
53
-
54
- #@letters_utf8 = @unicode_array.map { |x| u = [x.hex].pack("U*"); u =~ UTF8REGEX ? u : nil }.compact # code points from UnicodeData.txt
55
- @letters_utf8 = @small_letters_utf8 + @capital_letters_utf8 + @other_letters_utf8 # test data only
56
-
57
- # Hash[*array_with_keys.zip(array_with_values).flatten]
58
- @downcase_table_utf8 = Hash[*@capital_letters_utf8.zip(@small_letters_utf8).flatten]
59
- @upcase_table_utf8 = Hash[*@small_letters_utf8.zip(@capital_letters_utf8).flatten]
60
- @letters_utf8_hash = Hash[*@letters_utf8.zip([]).flatten] #=> ... "\341\272\242"=>nil ...
61
-
62
- class << self
63
- attr_accessor :small_letters_utf8
64
- attr_accessor :capital_letters_utf8
65
- attr_accessor :other_letters_utf8
66
- attr_accessor :letters_utf8
67
- attr_accessor :letters_utf8_hash
68
- attr_accessor :unicode_array
69
- attr_accessor :downcase_table_utf8
70
- attr_accessor :upcase_table_utf8
71
- end
72
-
73
-
74
- def each_utf8_char
75
- scan(/./mu) { |c| yield c }
76
- end
77
-
78
- def each_utf8_char_with_index
79
- i = -1
80
- scan(/./mu) { |c| i+=1; yield(c, i) }
81
- end
82
-
83
- def length_utf8
84
- #scan(/./mu).size
85
- count = 0
86
- scan(/./mu) { count += 1 }
87
- count
88
- end
89
- alias :size_utf8 :length_utf8
90
-
91
- def reverse_utf8
92
- split(//mu).reverse.join
93
- end
94
-
95
- def reverse_utf8!
96
- split(//mu).reverse!.join
97
- end
98
-
99
- def swapcase_utf8
100
- gsub(/./mu) do |char|
101
- if !String.downcase_table_utf8[char].nil? then String.downcase_table_utf8[char]
102
- elsif !String.upcase_table_utf8[char].nil? then String.upcase_table_utf8[char]
103
- else char.swapcase
104
- end
105
- end
106
- end
107
-
108
- def swapcase_utf8!
109
- gsub!(/./mu) do |char|
110
- if !String.downcase_table_utf8[char].nil? then String.downcase_table_utf8[char]
111
- elsif !String.upcase_table_utf8[char].nil? then String.upcase_table_utf8[char]
112
- else ret = char.swapcase end
113
- end
114
- end
115
-
116
- def downcase_utf8
117
- gsub(/./mu) do |char|
118
- small_char = String.downcase_table_utf8[char]
119
- small_char.nil? ? char.downcase : small_char
120
- end
121
- end
122
-
123
- def downcase_utf8!
124
- gsub!(/./mu) do |char|
125
- small_char = String.downcase_table_utf8[char]
126
- small_char.nil? ? char.downcase : small_char
127
- end
128
- end
129
-
130
- def upcase_utf8
131
- gsub(/./mu) do |char|
132
- capital_char = String.upcase_table_utf8[char]
133
- capital_char.nil? ? char.upcase : capital_char
134
- end
135
- end
136
-
137
- def upcase_utf8!
138
- gsub!(/./mu) do |char|
139
- capital_char = String.upcase_table_utf8[char]
140
- capital_char.nil? ? char.upcase : capital_char
141
- end
142
- end
143
-
144
- def count_utf8(c)
145
- return nil if c.empty?
146
- r = %r{[#{c}]}mu
147
- scan(r).size
148
- end
149
-
150
- def delete_utf8(c)
151
- return self if c.empty?
152
- r = %r{[#{c}]}mu
153
- gsub(r, '')
154
- end
155
-
156
- def delete_utf8!(c)
157
- return self if c.empty?
158
- r = %r{[#{c}]}mu
159
- gsub!(r, '')
160
- end
161
-
162
- def first_utf8
163
- self[/\A./mu]
164
- end
165
-
166
- def last_utf8
167
- self[/.\z/mu]
168
- end
169
-
170
- def capitalize_utf8
171
- return self if self =~ /\A[[:space:]]*\z/m
172
- ret = ""
173
- split(/\x20/).each do |w|
174
- count = 0
175
- w.gsub(/./mu) do |char|
176
- count += 1
177
- capital_char = String.upcase_table_utf8[char]
178
- if count == 1 then
179
- capital_char.nil? ? char.upcase : char.upcase_utf8
180
- else
181
- capital_char.nil? ? char.downcase : char.downcase_utf8
182
- end
183
- end
184
- ret << w + ' '
185
- end
186
- ret =~ /\x20\z/ ? ret.sub!(/\x20\z/, '') : ret
187
- end
188
-
189
- def capitalize_utf8!
190
- return self if self =~ /\A[[:space:]]*\z/m
191
- ret = ""
192
- split(/\x20/).each do |w|
193
- count = 0
194
- w.gsub!(/./mu) do |char|
195
- count += 1
196
- capital_char = String.upcase_table_utf8[char]
197
- if count == 1 then
198
- capital_char.nil? ? char.upcase : char.upcase_utf8
199
- else
200
- capital_char.nil? ? char.downcase : char.downcase_utf8
201
- end
202
- end
203
- ret << w + ' '
204
- end
205
- ret =~ /\x20\z/ ? ret.sub!(/\x20\z/, '') : ret
206
- end
207
-
208
-
209
- def index_utf8(s)
210
-
211
- return nil unless !self.empty? && (s.class == Regexp || s.class == String)
212
- #raise(ArgumentError, "Wrong argument for method index_utf8!", caller) unless !self.empty? && (s.class == Regexp || s.class == String)
213
-
214
- if s.class == Regexp
215
- opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
216
- if opts.count('u') == 0 then opts = opts + "u" end
217
- str = s.source
218
- return nil if str.empty?
219
- str = "%r{#{str}}" + opts
220
- r = eval(str)
221
- l = ""
222
- sub(r) { l << $`; " " } # $`: The string to the left of the last successful match (cf. http://www.zenspider.com/Languages/Ruby/QuickRef.html)
223
- l.empty? ? nil : l.length_utf8
224
-
225
- else
226
-
227
- return nil if s.empty?
228
- r = %r{#{s}}mu
229
- l = ""
230
- sub(r) { l << $`; " " }
231
- l.empty? ? nil : l.length_utf8
232
-
233
- # this would be a non-regex solution
234
- =begin
235
- return nil if s.empty?
236
- return nil unless self =~ %r{#{s}}mu
237
- indices = []
238
- s.split(//mu).each do |x|
239
- ar = []
240
- self.each_utf8_char_with_index { |c,i| if c == x then ar << i end } # first get all matching indices c == x
241
- indices << ar unless ar.empty?
242
- end
243
- if indices.empty?
244
- return nil
245
- elsif indices.size == 1
246
- indices.first.first
247
- else
248
- #p indices
249
- ret = []
250
- a0 = indices.shift
251
- a0.each do |i|
252
- ret << i
253
- indices.each { |a| if a.include?(i+1) then i += 1; ret << i else ret = []; break end }
254
- return ret.first unless ret.empty?
255
- end
256
- ret.empty? ? nil : ret.first
257
- end
258
- =end
259
-
260
- end
261
- end
262
-
263
-
264
- def rindex_utf8(s)
265
-
266
- return nil unless !self.empty? && (s.class == Regexp || s.class == String)
267
- #raise(ArgumentError, "Wrong argument for method index_utf8!", caller) unless !self.empty? && (s.class == Regexp || s.class == String)
268
-
269
- if s.class == Regexp
270
- opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
271
- if opts.count('u') == 0 then opts = opts + "u" end
272
- str = s.source
273
- return nil if str.empty?
274
- str = "%r{#{str}}" + opts
275
- r = eval(str)
276
- l = ""
277
- scan(r) { l = $` }
278
- #gsub(r) { l = $`; " " }
279
- l.empty? ? nil : l.length_utf8
280
- else
281
- return nil if s.empty?
282
- r = %r{#{s}}mu
283
- l = ""
284
- scan(r) { l = $` }
285
- #gsub(r) { l = $`; " " }
286
- l.empty? ? nil : l.length_utf8
287
- end
288
-
289
- end
290
-
291
-
292
- # note that the i option does not work in special cases with back references
293
- # example: "��".slice_utf8(/(.).*?\1/i) returns nil whereas "aA".slice(/(.).*?\1/i) returns "aA"
294
- def slice_utf8(regex)
295
- opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
296
- if opts.count('u') == 0 then opts = opts + "u" end
297
- s = regex.source
298
- str = "%r{#{s}}" + opts
299
- r = eval(str)
300
- slice(r)
301
- end
302
-
303
- def slice_utf8!(regex)
304
- opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
305
- if opts.count('u') == 0 then opts = opts + "u" end
306
- s = regex.source
307
- str = "%r{#{s}}" + opts
308
- r = eval(str)
309
- slice!(r)
310
- end
311
-
312
- def cut_utf8(p,l) # (index) position, length
313
- raise(ArgumentError, "Error: argument is not Fixnum", caller) if p.class != Fixnum or l.class != Fixnum
314
- s = self.length_utf8
315
- #if p < 0 then p = s - p.abs end
316
- if p < 0 then p.abs > s ? (p = 0) : (p = s - p.abs) end # or: ... p.abs > s ? (return nil) : ...
317
- return nil if l > s or p > (s - 1)
318
- ret = ""
319
- count = 0
320
- each_utf8_char_with_index do |c,i|
321
- break if count >= l
322
- if i >= p && count < l then count += 1; ret << c; end
323
- end
324
- ret
325
- end
326
-
327
- def starts_with_utf8?(s)
328
- return nil if self.empty? or s.empty?
329
- cut_utf8(0, s.size_utf8) == s
330
- end
331
-
332
- def ends_with_utf8?(s)
333
- return nil if self.empty? or s.empty?
334
- cut_utf8(-(s.size_utf8), s.size_utf8) == s
335
- end
336
-
337
- def insert_utf8(i,s) # insert_utf8(index, string)
338
- return self if s.empty?
339
- l = self.length_utf8
340
- if l == 0 then return s end
341
- if i < 0 then i.abs > l ? (i = 0) : (i = l - i.abs) end # or: ... i.abs > l ? (return nil) : ...
342
- #return nil if i > (l - 1) # return nil ...
343
- spaces = ""
344
- if i > (l-1) then spaces = " " * (i - (l-1)) end # ... or add spaces
345
- str = self << spaces
346
- s1 = str.cut_utf8(0, i)
347
- s2 = str.cut_utf8(i, l - s1.length_utf8)
348
- s1 << s << s2
349
- end
350
-
351
- def split_utf8(regex)
352
- opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
353
- if opts.count('u') == 0 then opts = opts + "u" end
354
- s = regex.source
355
- str = "%r{#{s}}" + opts
356
- r = eval(str)
357
- split(r)
358
- end
359
-
360
- def scan_utf8(regex)
361
- opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
362
- if opts.count('u') == 0 then opts = opts + "u" end
363
- s = regex.source
364
- str = "%r{#{s}}" + opts
365
- r = eval(str)
366
- if block_given? then scan(r) { |a,*m| yield(a,*m) } else scan(r) end
367
- end
368
-
369
- def range_utf8(r)
370
-
371
- return nil if r.class != Range
372
- #raise(ArgumentError, "No Range object given!", caller) if r.class != Range
373
-
374
- a = r.to_s[/^[\+\-]?\d+/].to_i
375
- b = r.to_s[/[\+\-]?\d+$/].to_i
376
- d = r.to_s[/\.+/]
377
-
378
- if d.size == 2 then d = 2 else d = d.size end
379
-
380
- l = self.length_utf8
381
-
382
- return nil if b.abs > l || a.abs > l || d < 2 || d > 3
383
-
384
- if a < 0 then a = l - a.abs end
385
- if b < 0 then b = l - b.abs end
386
-
387
- return nil if a > b
388
-
389
- str = ""
390
-
391
- each_utf8_char_with_index do |c,i|
392
- break if i > b
393
- if d == 2
394
- (i >= a && i <= b) ? str << c : next
395
- else
396
- (i >= a && i < b) ? str << c : next
397
- end
398
- end
399
-
400
- str
401
-
402
- end
403
-
404
- def utf8?
405
- # self =~ UTF8REGEX
406
- encoding == Encoding.find("UTF-8") && valid_encoding?
407
- end
408
-
409
- def a
410
- t = ""
411
- # self.scan(/./um) { |c| t << c if c =~ UTF8REGEX }
412
- chars.each { |c| t << c if c.utf8? }
413
- t
414
- end
415
-
416
-
417
- def utf8_encoded_file? # check (or rather guess) if (HTML) file encoding is UTF-8 (experimental, so use at your own risk!)
418
-
419
- file = self
420
- str = ""
421
-
422
- if file =~ /^http:\/\//
423
-
424
- url = file
425
-
426
- if RUBY_PLATFORM =~ /darwin/i # Mac OS X 10.4.10
427
-
428
- seconds = 30
429
-
430
- # check if web site is reachable
431
- # on Windows try to use curb, http://curb.rubyforge.org (sudo gem install curb)
432
- var = %x{ /usr/bin/curl -I -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url}; /bin/echo -n $? }.to_i
433
-
434
- #return false unless var == 0
435
- raise "Failed to create connection to web site: #{url} -- curl error code: #{var} -- " unless var == 0
436
-
437
- str = %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} | \
438
- /usr/bin/grep -Eo -m 1 \"(charset|encoding)=[\\"']?[^\\"'>]+\" | /usr/bin/grep -Eo \"[^=\\"'>]+$\" }
439
- p str
440
- return true if str =~ /utf-?8/i
441
- return false if !str.empty? && str !~ /utf-?8/i
442
-
443
- # solutions with downloaded file
444
-
445
- # download HTML file
446
- #downloaded_file = "/tmp/html"
447
- downloaded_file = "~/Desktop/html"
448
- downloaded_file = File.expand_path(downloaded_file)
449
- %x{ /usr/bin/touch #{downloaded_file} 2>/dev/null }
450
- raise "No valid HTML download file (path) specified!" unless File.file?(downloaded_file)
451
- %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} -o #{downloaded_file} #{url} }
452
-
453
- simple_test = %x{ /usr/bin/file -ik #{downloaded_file} } # cf. man file
454
- p simple_test
455
-
456
- # read entire file into a string
457
- File.open(downloaded_file).read.each(nil) do |str|
458
- #return true if str =~ /(charset|encoding) *= *["']? *utf-?8/i
459
- str.utf8? ? (return true) : (return false)
460
- end
461
-
462
- #check each line of the downloaded file
463
- #count_lines = 0
464
- #count_utf8 = 0
465
- #File.foreach(downloaded_file) { |line| return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i; count_lines += 1; count_utf8 += 1 if line.clean_utf8.utf8?; break if count_lines != count_utf8 }
466
- #count_lines == count_utf8 ? (return true) : (return false)
467
-
468
-
469
- # in-memory solutions
470
-
471
- #html_file_cleaned_utf8 = %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.clean_utf8
472
- #p html_file_cleaned_utf8.utf8?
473
-
474
- count_lines = 0
475
- count_utf8 = 0
476
- #%x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.each(nil) do |line| # read entire file into string
477
- %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.each('\n') do |line|
478
- #return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i
479
- count_lines += 1
480
- count_utf8 += 1 if line.utf8?
481
- break if count_lines != count_utf8
482
- end
483
- count_lines == count_utf8 ? (return true) : (return false)
484
-
485
- else
486
-
487
- # check each line of the HTML file (or the entire HTML file at once)
488
- # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
489
- count_lines = 0
490
- count_utf8 = 0
491
- open(url) do |f|
492
- # p f.meta, f.content_encoding, f.content_type
493
- cs = f.charset
494
- return true if cs =~ /utf-?8/i
495
- #f.each(nil) do |str| str.utf8? ? (return true) : (return false) end # read entire file into string
496
- f.each_line do |line|
497
- count_lines += 1
498
- count_utf8 += 1 if line.utf8?
499
- break unless count_lines == count_utf8
500
- end
501
- end
502
- count_lines == count_utf8 ? (return true) : (return false)
503
-
504
- end
505
-
506
- else
507
-
508
- return false unless File.file?(file)
509
-
510
- if RUBY_PLATFORM =~ /darwin/i then str = %x{ /usr/bin/file -ik #{file} }; return true if str =~ /utf-?8/i end
511
-
512
- # read entire file into a string
513
- #File.open(file).read.each(nil) do |str| return true if str =~ /(charset|encoding) *= *["']? *utf-?8/i; str.utf8? ? (return true) : (return false) end
514
-
515
- # check each line of the file
516
- count_lines = 0
517
- count_utf8 = 0
518
- File.foreach(file) do |line|
519
- return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i
520
- count_lines += 1;
521
- count_utf8 += 1 if line.utf8?;
522
- break if count_lines != count_utf8
523
- end
524
-
525
- count_lines == count_utf8 ? (return true) : (return false)
526
-
527
- end
528
-
529
- str =~ /utf-?8/i ? true : false
530
-
531
- end
532
-
533
-
534
- # cf. Paul Battley, http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
535
- def validate_utf8
536
- Iconv.iconv('UTF-8//IGNORE', 'UTF-8', (self + ' ') ).first[0..-2]
537
- end
538
-
539
- # cf. Paul Battley, http://www.ruby-forum.com/topic/70357
540
- def asciify_utf8
541
- return nil unless self.utf8?
542
- #Iconv.iconv('US-ASCII//IGNORE//TRANSLIT', 'UTF-8', (self + ' ') ).first[0..-2]
543
- # delete all punctuation characters inside words except "-" in words such as up-to-date
544
- Iconv.iconv('US-ASCII//IGNORE//TRANSLIT', 'UTF-8', (self + ' ') ).first[0..-2].gsub(/(?!-.*)\b[[:punct:]]+\b/, '')
545
- end
546
-
547
- def latin1_to_utf8 # ISO-8859-1 to UTF-8
548
- ret = Iconv.iconv("UTF-8//IGNORE", "ISO-8859-1", (self + "\x20") ).first[0..-2]
549
- ret.utf8? ? ret : nil
550
- end
551
-
552
- def cp1252_to_utf8 # CP1252 (WINDOWS-1252) to UTF-8
553
- ret = Iconv.iconv("UTF-8//IGNORE", "CP1252", (self + "\x20") ).first[0..-2]
554
- ret.utf8? ? ret : nil
555
- end
556
-
557
- # cf. Paul Battley, http://www.ruby-forum.com/topic/70357
558
- def utf16le_to_utf8
559
- ret = Iconv.iconv('UTF-8//IGNORE', 'UTF-16LE', (self[0,(self.length/2*2)] + "\000\000") ).first[0..-2]
560
- ret =~ /\x00\z/ ? ret.sub!(/\x00\z/, '') : ret
561
- ret.utf8? ? ret : nil
562
- end
563
-
564
- def utf8_to_utf16le
565
- return nil unless self.utf8?
566
- ret = Iconv.iconv('UTF-16LE//IGNORE', 'UTF-8', self ).first
567
- end
568
-
569
- def utf8_to_unicode
570
- return nil unless self.utf8?
571
- str = ""
572
- scan(/./mu) { |c| str << "U+" << sprintf("%04X", c.unpack("U*").first) }
573
- str
574
- end
575
-
576
- def unicode_to_utf8
577
- return self if self =~ /\A[[:space:]]*\z/m
578
- str = ""
579
- #scan(/U\+([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})/) { |u| str << [u.first.hex].pack("U*") }
580
- #scan(/U\+([[:digit:][:xdigit:]]{4,5}|10[[:digit:][:xdigit:]]{4})/) { |u| str << [u.first.hex].pack("U*") }
581
- scan(/(U\+(?:[[:digit:][:xdigit:]]{4,5}|10[[:digit:][:xdigit:]]{4})|.)/mu) do # for mixed strings such as "U+00bfHabla espaU+00f1ol?"
582
- c = $1
583
- if c =~ /^U\+/
584
- str << [c[2..-1].hex].pack("U*")
585
- else
586
- str << c
587
- end
588
- end
589
- str.utf8? ? str : nil
590
- end
591
-
592
-
593
- # dec, hex, oct conversions (experimental!)
594
-
595
- def utf8_to_dec
596
- return nil unless self.utf8?
597
- str = ""
598
- scan(/./mu) do |c|
599
- if c =~ /^\x00$/
600
- str << "aaa\x00" # encode \x00 as "aaa"
601
- else
602
- str << sprintf("%04X", c.unpack("U*").first).hex.to_s << "\x00" # convert to decimal
603
- end
604
- end
605
- str[0..-2]
606
- end
607
-
608
- def dec_to_utf8 # \x00 is encoded as "aaa"
609
- return self if self.empty?
610
- return nil unless self =~ /\A[[:digit:]]+\x00/ && self =~ /\A[a[:digit:]\x00]+\z/
611
- str = ""
612
- split(/\x00/).each do |c|
613
- if c.eql?("aaa")
614
- str << "\x00"
615
- else
616
- str << [c.to_i].pack("U*")
617
- end
618
- end
619
- str
620
- end
621
-
622
-
623
- def utf8_to_dec_2
624
- return nil unless self.utf8?
625
- str = ""
626
- tmpstr = ""
627
- null_str = "\x00"
628
- scan(/./mu) do |c|
629
- if c =~ /^\x00$/
630
- str << "aaa\x00\x00" # encode \x00 as "aaa"
631
- else
632
- tmpstr = ""
633
- c.each_byte { |x| tmpstr << x.to_s << null_str } # convert to decimal
634
- str << tmpstr << null_str
635
- end
636
- end
637
- str[0..-3]
638
- end
639
-
640
- def dec_to_utf8_2 # \x00 is encoded as "aaa"
641
- return self if self.empty?
642
- return nil unless self =~ /\A[[:digit:]]+\x00/ && self =~ /[[:digit:]]+\x00\x00/ && self =~ /\A[a[:digit:]\x00]+\z/
643
- str = ""
644
- split(/\x00\x00/).each do |c|
645
- if c =~ /\x00/
646
- c.split(/\x00/).each { |x| str << x.to_i.chr }
647
- elsif c.eql?("aaa")
648
- str << "\x00"
649
- else
650
- str << c.to_i.chr
651
- end
652
- end
653
- str
654
- end
655
-
656
-
657
- def utf8_to_hex
658
- return nil unless self.utf8?
659
- str = ""
660
- tmpstr = ""
661
- null_str = "\x00"
662
- scan(/./mu) do |c|
663
- if c =~ /^\x00$/
664
- str << "aaa\x00\x00" # encode \x00 as "aaa"
665
- else
666
- tmpstr = ""
667
- c.each_byte { |x| tmpstr << sprintf("%X", x) << null_str } # convert to hexadecimal
668
- str << tmpstr << null_str
669
- end
670
- end
671
- str[0..-3]
672
- end
673
-
674
- def hex_to_utf8 # \x00 is encoded as "aaa"
675
- return self if self.empty?
676
- return nil unless self =~ /\A[[:xdigit:]]+\x00/ && self =~ /[[:xdigit:]]+\x00\x00/ && self =~ /\A[a[:xdigit:]\x00]+\z/
677
- str = ""
678
- split(/\x00\x00/).each do |c|
679
- if c =~ /\x00/
680
- c.split(/\x00/).each { |x| str << x.hex.chr }
681
- elsif c.eql?("aaa")
682
- str << "\x00"
683
- else
684
- str << c.hex.chr
685
- end
686
- end
687
- str
688
- end
689
-
690
-
691
- def utf8_to_oct
692
- return nil unless self.utf8?
693
- str = ""
694
- tmpstr = ""
695
- null_str = "\x00"
696
- scan(/./mu) do |c|
697
- if c =~ /^\x00$/
698
- str << "aaa\x00\x00" # encode \x00 as "aaa"
699
- else
700
- tmpstr = ""
701
- c.each_byte { |x| tmpstr << sprintf("%o", x) << null_str } # convert to octal
702
- str << tmpstr << null_str
703
- end
704
- end
705
- str[0..-3]
706
- end
707
-
708
- def oct_to_utf8 # \x00 is encoded as "aaa"
709
- return self if self.empty?
710
- return nil unless self =~ /\A[[:digit:]]+\x00/ && self =~ /[[:digit:]]+\x00\x00/ && self =~ /\A[a[:digit:]\x00]+\z/
711
- str = ""
712
- split(/\x00\x00/).each do |c|
713
- if c =~ /\x00/
714
- c.split(/\x00/).each { |x| str << x.oct.chr }
715
- elsif c.eql?("aaa")
716
- str << "\x00"
717
- else
718
- str << c.oct.chr
719
- end
720
- end
721
- str
722
- end
723
-
724
- # cf. http://node-0.mneisen.org/2007/03/13/email-subjects-in-utf-8-mit-ruby-kodieren/
725
- def email_subject_utf8
726
- return nil unless self.utf8?
727
- "=?utf-8?b?#{[self].pack("m").delete("\n")}?="
728
- end
729
-
730
- end