prawn-arabic 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: prawn-arabic
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dynamix Solutions
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-04-19 00:00:00.000000000 Z
11
+ date: 2017-04-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: iconv
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: pry
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
27
41
  description: Gem which improve workflow with arabic text
28
42
  email: ahmed.nasser@dynamix-systems.com
29
43
  executables: []
@@ -36,7 +50,6 @@ files:
36
50
  - README.md
37
51
  - Rakefile
38
52
  - lib/prawn-arabic.rb
39
- - lib/string_utf_support.rb
40
53
  homepage: https://github.com/ozeron/arabic-prawn
41
54
  licenses:
42
55
  - MIT
@@ -1,730 +0,0 @@
1
- # encoding: ascii-8bit
2
- class String
3
-
4
- require 'iconv'
5
- require 'open-uri' # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
6
-
7
- # taken from: http://www.w3.org/International/questions/qa-forms-utf-8
8
- # UTF8REGEX = /\A(?: # ?: non-capturing group (grouping with no back references)
9
- # [\x09\x0A\x0D\x20-\x7E] # ASCII
10
- # | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
11
- # | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
12
- # | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
13
- # | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
14
- # | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
15
- # | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
16
- # | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
17
- # )*\z/mxn
18
-
19
-
20
- # create UTF-8 character arrays (as class instance variables)
21
- #
22
- # mapping tables: - http://www.unicode.org/Public/UCA/latest/allkeys.txt
23
- # - http://unicode.org/Public/UNIDATA/UnicodeData.txt
24
- # - http://unicode.org/Public/UNIDATA/CaseFolding.txt
25
- # - http://www.decodeunicode.org
26
- # - ftp://ftp.mars.org/pub/ruby/Unicode.tar.bz2
27
- # - http://camomile.sourceforge.net
28
- # - Character Palette (Mac OS X)
29
-
30
-
31
- # test data
32
- # @small_letters_utf8 = ["U+00F1", "U+00F4", "U+00E6", "U+00F8", "U+00E0", "U+00E1", "U+00E2", "U+00E4", "U+00E5", "U+00E7", "U+00E8", "U+00E9", "U+00EA", "U+00EB", "U+0153"].map { |x| u = [x[2..-1].hex].pack("U*"); u =~ UTF8REGEX ? u : nil }
33
- @small_letters_utf8 = ["U+00F1", "U+00F4", "U+00E6", "U+00F8", "U+00E0", "U+00E1", "U+00E2", "U+00E4", "U+00E5", "U+00E7", "U+00E8", "U+00E9", "U+00EA", "U+00EB", "U+0153"].map { |x| u = [x[2..-1].hex].pack("U*"); u.valid_encoding? ? u : nil}
34
-
35
-
36
- # @capital_letters_utf8 = ["U+00D1", "U+00D4", "U+00C6", "U+00D8", "U+00C0", "U+00C1", "U+00C2", "U+00C4", "U+00C5", "U+00C7", "U+00C8", "U+00C9", "U+00CA", "U+00CB", "U+0152"].map { |x| u = [x[2..-1].hex].pack("U*"); u =~ UTF8REGEX ? u : nil }
37
- @capital_letters_utf8 = ["U+00D1", "U+00D4", "U+00C6", "U+00D8", "U+00C0", "U+00C1", "U+00C2", "U+00C4", "U+00C5", "U+00C7", "U+00C8", "U+00C9", "U+00CA", "U+00CB", "U+0152"].map { |x| u = [x[2..-1].hex].pack("U*"); u.valid_encoding? ? u : nil }
38
-
39
-
40
- # @other_letters_utf8 = ["U+03A3", "U+0639", "U+0041", "U+F8D0", "U+F8FF", "U+4E2D", "U+F4EE", "U+00FE", "U+10FFFF", "U+00A9", "U+20AC", "U+221E", "U+20AC", "U+FEFF", "U+FFFD", "U+00FF", "U+00FE", "U+FFFE", "U+FEFF"].map { |x| u = [x[2..-1].hex].pack("U*"); u =~ UTF8REGEX ? u : nil }
41
- @other_letters_utf8 = ["U+03A3", "U+0639", "U+0041", "U+F8D0", "U+F8FF", "U+4E2D", "U+F4EE", "U+00FE", "U+10FFFF", "U+00A9", "U+20AC", "U+221E", "U+20AC", "U+FEFF", "U+FFFD", "U+00FF", "U+00FE", "U+FFFE", "U+FEFF"].map { |x| u = [x[2..-1].hex].pack("U*"); u.valid_encoding? ? u : nil }
42
-
43
- if @small_letters_utf8.size != @small_letters_utf8.count{|x| !x.nil?} then raise "Invalid UTF-8 char in @small_letters_utf8!" end
44
- if @capital_letters_utf8.size != @capital_letters_utf8.count{|x| !x.nil?} then raise "Invalid UTF-8 char in @capital_letters_utf8!" end
45
- if @other_letters_utf8.size != @other_letters_utf8.count{|x| !x.nil?} then raise "Invalid UTF-8 char in @other_letters_utf8!" end
46
-
47
-
48
- @unicode_array = []
49
- #open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f| f.each(nil) { |line| line.scan(/^[^;]+/) { |u| @unicode_array << u } } end
50
- #open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f|
51
- # f.each do |line| line =~ /LATIN|GREEK|CYRILLIC/ ? ( line.scan(/^[^;]+/) { |u| @unicode_array << u } ) : next end
52
- #end
53
-
54
- #@letters_utf8 = @unicode_array.map { |x| u = [x.hex].pack("U*"); u =~ UTF8REGEX ? u : nil }.compact # code points from UnicodeData.txt
55
- @letters_utf8 = @small_letters_utf8 + @capital_letters_utf8 + @other_letters_utf8 # test data only
56
-
57
- # Hash[*array_with_keys.zip(array_with_values).flatten]
58
- @downcase_table_utf8 = Hash[*@capital_letters_utf8.zip(@small_letters_utf8).flatten]
59
- @upcase_table_utf8 = Hash[*@small_letters_utf8.zip(@capital_letters_utf8).flatten]
60
- @letters_utf8_hash = Hash[*@letters_utf8.zip([]).flatten] #=> ... "\341\272\242"=>nil ...
61
-
62
- class << self
63
- attr_accessor :small_letters_utf8
64
- attr_accessor :capital_letters_utf8
65
- attr_accessor :other_letters_utf8
66
- attr_accessor :letters_utf8
67
- attr_accessor :letters_utf8_hash
68
- attr_accessor :unicode_array
69
- attr_accessor :downcase_table_utf8
70
- attr_accessor :upcase_table_utf8
71
- end
72
-
73
-
74
- def each_utf8_char
75
- scan(/./mu) { |c| yield c }
76
- end
77
-
78
- def each_utf8_char_with_index
79
- i = -1
80
- scan(/./mu) { |c| i+=1; yield(c, i) }
81
- end
82
-
83
- def length_utf8
84
- #scan(/./mu).size
85
- count = 0
86
- scan(/./mu) { count += 1 }
87
- count
88
- end
89
- alias :size_utf8 :length_utf8
90
-
91
- def reverse_utf8
92
- split(//mu).reverse.join
93
- end
94
-
95
- def reverse_utf8!
96
- split(//mu).reverse!.join
97
- end
98
-
99
- def swapcase_utf8
100
- gsub(/./mu) do |char|
101
- if !String.downcase_table_utf8[char].nil? then String.downcase_table_utf8[char]
102
- elsif !String.upcase_table_utf8[char].nil? then String.upcase_table_utf8[char]
103
- else char.swapcase
104
- end
105
- end
106
- end
107
-
108
- def swapcase_utf8!
109
- gsub!(/./mu) do |char|
110
- if !String.downcase_table_utf8[char].nil? then String.downcase_table_utf8[char]
111
- elsif !String.upcase_table_utf8[char].nil? then String.upcase_table_utf8[char]
112
- else ret = char.swapcase end
113
- end
114
- end
115
-
116
- def downcase_utf8
117
- gsub(/./mu) do |char|
118
- small_char = String.downcase_table_utf8[char]
119
- small_char.nil? ? char.downcase : small_char
120
- end
121
- end
122
-
123
- def downcase_utf8!
124
- gsub!(/./mu) do |char|
125
- small_char = String.downcase_table_utf8[char]
126
- small_char.nil? ? char.downcase : small_char
127
- end
128
- end
129
-
130
- def upcase_utf8
131
- gsub(/./mu) do |char|
132
- capital_char = String.upcase_table_utf8[char]
133
- capital_char.nil? ? char.upcase : capital_char
134
- end
135
- end
136
-
137
- def upcase_utf8!
138
- gsub!(/./mu) do |char|
139
- capital_char = String.upcase_table_utf8[char]
140
- capital_char.nil? ? char.upcase : capital_char
141
- end
142
- end
143
-
144
- def count_utf8(c)
145
- return nil if c.empty?
146
- r = %r{[#{c}]}mu
147
- scan(r).size
148
- end
149
-
150
- def delete_utf8(c)
151
- return self if c.empty?
152
- r = %r{[#{c}]}mu
153
- gsub(r, '')
154
- end
155
-
156
- def delete_utf8!(c)
157
- return self if c.empty?
158
- r = %r{[#{c}]}mu
159
- gsub!(r, '')
160
- end
161
-
162
- def first_utf8
163
- self[/\A./mu]
164
- end
165
-
166
- def last_utf8
167
- self[/.\z/mu]
168
- end
169
-
170
- def capitalize_utf8
171
- return self if self =~ /\A[[:space:]]*\z/m
172
- ret = ""
173
- split(/\x20/).each do |w|
174
- count = 0
175
- w.gsub(/./mu) do |char|
176
- count += 1
177
- capital_char = String.upcase_table_utf8[char]
178
- if count == 1 then
179
- capital_char.nil? ? char.upcase : char.upcase_utf8
180
- else
181
- capital_char.nil? ? char.downcase : char.downcase_utf8
182
- end
183
- end
184
- ret << w + ' '
185
- end
186
- ret =~ /\x20\z/ ? ret.sub!(/\x20\z/, '') : ret
187
- end
188
-
189
- def capitalize_utf8!
190
- return self if self =~ /\A[[:space:]]*\z/m
191
- ret = ""
192
- split(/\x20/).each do |w|
193
- count = 0
194
- w.gsub!(/./mu) do |char|
195
- count += 1
196
- capital_char = String.upcase_table_utf8[char]
197
- if count == 1 then
198
- capital_char.nil? ? char.upcase : char.upcase_utf8
199
- else
200
- capital_char.nil? ? char.downcase : char.downcase_utf8
201
- end
202
- end
203
- ret << w + ' '
204
- end
205
- ret =~ /\x20\z/ ? ret.sub!(/\x20\z/, '') : ret
206
- end
207
-
208
-
209
- def index_utf8(s)
210
-
211
- return nil unless !self.empty? && (s.class == Regexp || s.class == String)
212
- #raise(ArgumentError, "Wrong argument for method index_utf8!", caller) unless !self.empty? && (s.class == Regexp || s.class == String)
213
-
214
- if s.class == Regexp
215
- opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
216
- if opts.count('u') == 0 then opts = opts + "u" end
217
- str = s.source
218
- return nil if str.empty?
219
- str = "%r{#{str}}" + opts
220
- r = eval(str)
221
- l = ""
222
- sub(r) { l << $`; " " } # $`: The string to the left of the last successful match (cf. http://www.zenspider.com/Languages/Ruby/QuickRef.html)
223
- l.empty? ? nil : l.length_utf8
224
-
225
- else
226
-
227
- return nil if s.empty?
228
- r = %r{#{s}}mu
229
- l = ""
230
- sub(r) { l << $`; " " }
231
- l.empty? ? nil : l.length_utf8
232
-
233
- # this would be a non-regex solution
234
- =begin
235
- return nil if s.empty?
236
- return nil unless self =~ %r{#{s}}mu
237
- indices = []
238
- s.split(//mu).each do |x|
239
- ar = []
240
- self.each_utf8_char_with_index { |c,i| if c == x then ar << i end } # first get all matching indices c == x
241
- indices << ar unless ar.empty?
242
- end
243
- if indices.empty?
244
- return nil
245
- elsif indices.size == 1
246
- indices.first.first
247
- else
248
- #p indices
249
- ret = []
250
- a0 = indices.shift
251
- a0.each do |i|
252
- ret << i
253
- indices.each { |a| if a.include?(i+1) then i += 1; ret << i else ret = []; break end }
254
- return ret.first unless ret.empty?
255
- end
256
- ret.empty? ? nil : ret.first
257
- end
258
- =end
259
-
260
- end
261
- end
262
-
263
-
264
- def rindex_utf8(s)
265
-
266
- return nil unless !self.empty? && (s.class == Regexp || s.class == String)
267
- #raise(ArgumentError, "Wrong argument for method index_utf8!", caller) unless !self.empty? && (s.class == Regexp || s.class == String)
268
-
269
- if s.class == Regexp
270
- opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
271
- if opts.count('u') == 0 then opts = opts + "u" end
272
- str = s.source
273
- return nil if str.empty?
274
- str = "%r{#{str}}" + opts
275
- r = eval(str)
276
- l = ""
277
- scan(r) { l = $` }
278
- #gsub(r) { l = $`; " " }
279
- l.empty? ? nil : l.length_utf8
280
- else
281
- return nil if s.empty?
282
- r = %r{#{s}}mu
283
- l = ""
284
- scan(r) { l = $` }
285
- #gsub(r) { l = $`; " " }
286
- l.empty? ? nil : l.length_utf8
287
- end
288
-
289
- end
290
-
291
-
292
- # note that the i option does not work in special cases with back references
293
- # example: "äÄ".slice_utf8(/(.).*?\1/i) returns nil whereas "aA".slice(/(.).*?\1/i) returns "aA"
294
- def slice_utf8(regex)
295
- opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
296
- if opts.count('u') == 0 then opts = opts + "u" end
297
- s = regex.source
298
- str = "%r{#{s}}" + opts
299
- r = eval(str)
300
- slice(r)
301
- end
302
-
303
- def slice_utf8!(regex)
304
- opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
305
- if opts.count('u') == 0 then opts = opts + "u" end
306
- s = regex.source
307
- str = "%r{#{s}}" + opts
308
- r = eval(str)
309
- slice!(r)
310
- end
311
-
312
- def cut_utf8(p,l) # (index) position, length
313
- raise(ArgumentError, "Error: argument is not Fixnum", caller) if p.class != Fixnum or l.class != Fixnum
314
- s = self.length_utf8
315
- #if p < 0 then p = s - p.abs end
316
- if p < 0 then p.abs > s ? (p = 0) : (p = s - p.abs) end # or: ... p.abs > s ? (return nil) : ...
317
- return nil if l > s or p > (s - 1)
318
- ret = ""
319
- count = 0
320
- each_utf8_char_with_index do |c,i|
321
- break if count >= l
322
- if i >= p && count < l then count += 1; ret << c; end
323
- end
324
- ret
325
- end
326
-
327
- def starts_with_utf8?(s)
328
- return nil if self.empty? or s.empty?
329
- cut_utf8(0, s.size_utf8) == s
330
- end
331
-
332
- def ends_with_utf8?(s)
333
- return nil if self.empty? or s.empty?
334
- cut_utf8(-(s.size_utf8), s.size_utf8) == s
335
- end
336
-
337
- def insert_utf8(i,s) # insert_utf8(index, string)
338
- return self if s.empty?
339
- l = self.length_utf8
340
- if l == 0 then return s end
341
- if i < 0 then i.abs > l ? (i = 0) : (i = l - i.abs) end # or: ... i.abs > l ? (return nil) : ...
342
- #return nil if i > (l - 1) # return nil ...
343
- spaces = ""
344
- if i > (l-1) then spaces = " " * (i - (l-1)) end # ... or add spaces
345
- str = self << spaces
346
- s1 = str.cut_utf8(0, i)
347
- s2 = str.cut_utf8(i, l - s1.length_utf8)
348
- s1 << s << s2
349
- end
350
-
351
- def split_utf8(regex)
352
- opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
353
- if opts.count('u') == 0 then opts = opts + "u" end
354
- s = regex.source
355
- str = "%r{#{s}}" + opts
356
- r = eval(str)
357
- split(r)
358
- end
359
-
360
- def scan_utf8(regex)
361
- opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
362
- if opts.count('u') == 0 then opts = opts + "u" end
363
- s = regex.source
364
- str = "%r{#{s}}" + opts
365
- r = eval(str)
366
- if block_given? then scan(r) { |a,*m| yield(a,*m) } else scan(r) end
367
- end
368
-
369
- def range_utf8(r)
370
-
371
- return nil if r.class != Range
372
- #raise(ArgumentError, "No Range object given!", caller) if r.class != Range
373
-
374
- a = r.to_s[/^[\+\-]?\d+/].to_i
375
- b = r.to_s[/[\+\-]?\d+$/].to_i
376
- d = r.to_s[/\.+/]
377
-
378
- if d.size == 2 then d = 2 else d = d.size end
379
-
380
- l = self.length_utf8
381
-
382
- return nil if b.abs > l || a.abs > l || d < 2 || d > 3
383
-
384
- if a < 0 then a = l - a.abs end
385
- if b < 0 then b = l - b.abs end
386
-
387
- return nil if a > b
388
-
389
- str = ""
390
-
391
- each_utf8_char_with_index do |c,i|
392
- break if i > b
393
- if d == 2
394
- (i >= a && i <= b) ? str << c : next
395
- else
396
- (i >= a && i < b) ? str << c : next
397
- end
398
- end
399
-
400
- str
401
-
402
- end
403
-
404
- def utf8?
405
- # self =~ UTF8REGEX
406
- encoding == Encoding.find("UTF-8") && valid_encoding?
407
- end
408
-
409
- def a
410
- t = ""
411
- # self.scan(/./um) { |c| t << c if c =~ UTF8REGEX }
412
- chars.each { |c| t << c if c.utf8? }
413
- t
414
- end
415
-
416
-
417
- def utf8_encoded_file? # check (or rather guess) if (HTML) file encoding is UTF-8 (experimental, so use at your own risk!)
418
-
419
- file = self
420
- str = ""
421
-
422
- if file =~ /^http:\/\//
423
-
424
- url = file
425
-
426
- if RUBY_PLATFORM =~ /darwin/i # Mac OS X 10.4.10
427
-
428
- seconds = 30
429
-
430
- # check if web site is reachable
431
- # on Windows try to use curb, http://curb.rubyforge.org (sudo gem install curb)
432
- var = %x{ /usr/bin/curl -I -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url}; /bin/echo -n $? }.to_i
433
-
434
- #return false unless var == 0
435
- raise "Failed to create connection to web site: #{url} -- curl error code: #{var} -- " unless var == 0
436
-
437
- str = %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} | \
438
- /usr/bin/grep -Eo -m 1 \"(charset|encoding)=[\\"']?[^\\"'>]+\" | /usr/bin/grep -Eo \"[^=\\"'>]+$\" }
439
- p str
440
- return true if str =~ /utf-?8/i
441
- return false if !str.empty? && str !~ /utf-?8/i
442
-
443
- # solutions with downloaded file
444
-
445
- # download HTML file
446
- #downloaded_file = "/tmp/html"
447
- downloaded_file = "~/Desktop/html"
448
- downloaded_file = File.expand_path(downloaded_file)
449
- %x{ /usr/bin/touch #{downloaded_file} 2>/dev/null }
450
- raise "No valid HTML download file (path) specified!" unless File.file?(downloaded_file)
451
- %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} -o #{downloaded_file} #{url} }
452
-
453
- simple_test = %x{ /usr/bin/file -ik #{downloaded_file} } # cf. man file
454
- p simple_test
455
-
456
- # read entire file into a string
457
- File.open(downloaded_file).read.each(nil) do |str|
458
- #return true if str =~ /(charset|encoding) *= *["']? *utf-?8/i
459
- str.utf8? ? (return true) : (return false)
460
- end
461
-
462
- #check each line of the downloaded file
463
- #count_lines = 0
464
- #count_utf8 = 0
465
- #File.foreach(downloaded_file) { |line| return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i; count_lines += 1; count_utf8 += 1 if line.clean_utf8.utf8?; break if count_lines != count_utf8 }
466
- #count_lines == count_utf8 ? (return true) : (return false)
467
-
468
-
469
- # in-memory solutions
470
-
471
- #html_file_cleaned_utf8 = %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.clean_utf8
472
- #p html_file_cleaned_utf8.utf8?
473
-
474
- count_lines = 0
475
- count_utf8 = 0
476
- #%x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.each(nil) do |line| # read entire file into string
477
- %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.each('\n') do |line|
478
- #return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i
479
- count_lines += 1
480
- count_utf8 += 1 if line.utf8?
481
- break if count_lines != count_utf8
482
- end
483
- count_lines == count_utf8 ? (return true) : (return false)
484
-
485
- else
486
-
487
- # check each line of the HTML file (or the entire HTML file at once)
488
- # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
489
- count_lines = 0
490
- count_utf8 = 0
491
- open(url) do |f|
492
- # p f.meta, f.content_encoding, f.content_type
493
- cs = f.charset
494
- return true if cs =~ /utf-?8/i
495
- #f.each(nil) do |str| str.utf8? ? (return true) : (return false) end # read entire file into string
496
- f.each_line do |line|
497
- count_lines += 1
498
- count_utf8 += 1 if line.utf8?
499
- break unless count_lines == count_utf8
500
- end
501
- end
502
- count_lines == count_utf8 ? (return true) : (return false)
503
-
504
- end
505
-
506
- else
507
-
508
- return false unless File.file?(file)
509
-
510
- if RUBY_PLATFORM =~ /darwin/i then str = %x{ /usr/bin/file -ik #{file} }; return true if str =~ /utf-?8/i end
511
-
512
- # read entire file into a string
513
- #File.open(file).read.each(nil) do |str| return true if str =~ /(charset|encoding) *= *["']? *utf-?8/i; str.utf8? ? (return true) : (return false) end
514
-
515
- # check each line of the file
516
- count_lines = 0
517
- count_utf8 = 0
518
- File.foreach(file) do |line|
519
- return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i
520
- count_lines += 1;
521
- count_utf8 += 1 if line.utf8?;
522
- break if count_lines != count_utf8
523
- end
524
-
525
- count_lines == count_utf8 ? (return true) : (return false)
526
-
527
- end
528
-
529
- str =~ /utf-?8/i ? true : false
530
-
531
- end
532
-
533
-
534
- # cf. Paul Battley, http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
535
- def validate_utf8
536
- Iconv.iconv('UTF-8//IGNORE', 'UTF-8', (self + ' ') ).first[0..-2]
537
- end
538
-
539
- # cf. Paul Battley, http://www.ruby-forum.com/topic/70357
540
- def asciify_utf8
541
- return nil unless self.utf8?
542
- #Iconv.iconv('US-ASCII//IGNORE//TRANSLIT', 'UTF-8', (self + ' ') ).first[0..-2]
543
- # delete all punctuation characters inside words except "-" in words such as up-to-date
544
- Iconv.iconv('US-ASCII//IGNORE//TRANSLIT', 'UTF-8', (self + ' ') ).first[0..-2].gsub(/(?!-.*)\b[[:punct:]]+\b/, '')
545
- end
546
-
547
- def latin1_to_utf8 # ISO-8859-1 to UTF-8
548
- ret = Iconv.iconv("UTF-8//IGNORE", "ISO-8859-1", (self + "\x20") ).first[0..-2]
549
- ret.utf8? ? ret : nil
550
- end
551
-
552
- def cp1252_to_utf8 # CP1252 (WINDOWS-1252) to UTF-8
553
- ret = Iconv.iconv("UTF-8//IGNORE", "CP1252", (self + "\x20") ).first[0..-2]
554
- ret.utf8? ? ret : nil
555
- end
556
-
557
- # cf. Paul Battley, http://www.ruby-forum.com/topic/70357
558
- def utf16le_to_utf8
559
- ret = Iconv.iconv('UTF-8//IGNORE', 'UTF-16LE', (self[0,(self.length/2*2)] + "\000\000") ).first[0..-2]
560
- ret =~ /\x00\z/ ? ret.sub!(/\x00\z/, '') : ret
561
- ret.utf8? ? ret : nil
562
- end
563
-
564
- def utf8_to_utf16le
565
- return nil unless self.utf8?
566
- ret = Iconv.iconv('UTF-16LE//IGNORE', 'UTF-8', self ).first
567
- end
568
-
569
- def utf8_to_unicode
570
- return nil unless self.utf8?
571
- str = ""
572
- scan(/./mu) { |c| str << "U+" << sprintf("%04X", c.unpack("U*").first) }
573
- str
574
- end
575
-
576
- def unicode_to_utf8
577
- return self if self =~ /\A[[:space:]]*\z/m
578
- str = ""
579
- #scan(/U\+([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})/) { |u| str << [u.first.hex].pack("U*") }
580
- #scan(/U\+([[:digit:][:xdigit:]]{4,5}|10[[:digit:][:xdigit:]]{4})/) { |u| str << [u.first.hex].pack("U*") }
581
- scan(/(U\+(?:[[:digit:][:xdigit:]]{4,5}|10[[:digit:][:xdigit:]]{4})|.)/mu) do # for mixed strings such as "U+00bfHabla espaU+00f1ol?"
582
- c = $1
583
- if c =~ /^U\+/
584
- str << [c[2..-1].hex].pack("U*")
585
- else
586
- str << c
587
- end
588
- end
589
- str.utf8? ? str : nil
590
- end
591
-
592
-
593
- # dec, hex, oct conversions (experimental!)
594
-
595
- def utf8_to_dec
596
- return nil unless self.utf8?
597
- str = ""
598
- scan(/./mu) do |c|
599
- if c =~ /^\x00$/
600
- str << "aaa\x00" # encode \x00 as "aaa"
601
- else
602
- str << sprintf("%04X", c.unpack("U*").first).hex.to_s << "\x00" # convert to decimal
603
- end
604
- end
605
- str[0..-2]
606
- end
607
-
608
- def dec_to_utf8 # \x00 is encoded as "aaa"
609
- return self if self.empty?
610
- return nil unless self =~ /\A[[:digit:]]+\x00/ && self =~ /\A[a[:digit:]\x00]+\z/
611
- str = ""
612
- split(/\x00/).each do |c|
613
- if c.eql?("aaa")
614
- str << "\x00"
615
- else
616
- str << [c.to_i].pack("U*")
617
- end
618
- end
619
- str
620
- end
621
-
622
-
623
- def utf8_to_dec_2
624
- return nil unless self.utf8?
625
- str = ""
626
- tmpstr = ""
627
- null_str = "\x00"
628
- scan(/./mu) do |c|
629
- if c =~ /^\x00$/
630
- str << "aaa\x00\x00" # encode \x00 as "aaa"
631
- else
632
- tmpstr = ""
633
- c.each_byte { |x| tmpstr << x.to_s << null_str } # convert to decimal
634
- str << tmpstr << null_str
635
- end
636
- end
637
- str[0..-3]
638
- end
639
-
640
- def dec_to_utf8_2 # \x00 is encoded as "aaa"
641
- return self if self.empty?
642
- return nil unless self =~ /\A[[:digit:]]+\x00/ && self =~ /[[:digit:]]+\x00\x00/ && self =~ /\A[a[:digit:]\x00]+\z/
643
- str = ""
644
- split(/\x00\x00/).each do |c|
645
- if c =~ /\x00/
646
- c.split(/\x00/).each { |x| str << x.to_i.chr }
647
- elsif c.eql?("aaa")
648
- str << "\x00"
649
- else
650
- str << c.to_i.chr
651
- end
652
- end
653
- str
654
- end
655
-
656
-
657
- def utf8_to_hex
658
- return nil unless self.utf8?
659
- str = ""
660
- tmpstr = ""
661
- null_str = "\x00"
662
- scan(/./mu) do |c|
663
- if c =~ /^\x00$/
664
- str << "aaa\x00\x00" # encode \x00 as "aaa"
665
- else
666
- tmpstr = ""
667
- c.each_byte { |x| tmpstr << sprintf("%X", x) << null_str } # convert to hexadecimal
668
- str << tmpstr << null_str
669
- end
670
- end
671
- str[0..-3]
672
- end
673
-
674
- def hex_to_utf8 # \x00 is encoded as "aaa"
675
- return self if self.empty?
676
- return nil unless self =~ /\A[[:xdigit:]]+\x00/ && self =~ /[[:xdigit:]]+\x00\x00/ && self =~ /\A[a[:xdigit:]\x00]+\z/
677
- str = ""
678
- split(/\x00\x00/).each do |c|
679
- if c =~ /\x00/
680
- c.split(/\x00/).each { |x| str << x.hex.chr }
681
- elsif c.eql?("aaa")
682
- str << "\x00"
683
- else
684
- str << c.hex.chr
685
- end
686
- end
687
- str
688
- end
689
-
690
-
691
- def utf8_to_oct
692
- return nil unless self.utf8?
693
- str = ""
694
- tmpstr = ""
695
- null_str = "\x00"
696
- scan(/./mu) do |c|
697
- if c =~ /^\x00$/
698
- str << "aaa\x00\x00" # encode \x00 as "aaa"
699
- else
700
- tmpstr = ""
701
- c.each_byte { |x| tmpstr << sprintf("%o", x) << null_str } # convert to octal
702
- str << tmpstr << null_str
703
- end
704
- end
705
- str[0..-3]
706
- end
707
-
708
- def oct_to_utf8 # \x00 is encoded as "aaa"
709
- return self if self.empty?
710
- return nil unless self =~ /\A[[:digit:]]+\x00/ && self =~ /[[:digit:]]+\x00\x00/ && self =~ /\A[a[:digit:]\x00]+\z/
711
- str = ""
712
- split(/\x00\x00/).each do |c|
713
- if c =~ /\x00/
714
- c.split(/\x00/).each { |x| str << x.oct.chr }
715
- elsif c.eql?("aaa")
716
- str << "\x00"
717
- else
718
- str << c.oct.chr
719
- end
720
- end
721
- str
722
- end
723
-
724
- # cf. http://node-0.mneisen.org/2007/03/13/email-subjects-in-utf-8-mit-ruby-kodieren/
725
- def email_subject_utf8
726
- return nil unless self.utf8?
727
- "=?utf-8?b?#{[self].pack("m").delete("\n")}?="
728
- end
729
-
730
- end