prawn-arabic 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +10 -0
- data/lib/prawn-arabic.rb +192 -194
- metadata +16 -3
- data/lib/string_utf_support.rb +0 -730
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: prawn-arabic
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dynamix Solutions
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-04-
|
11
|
+
date: 2017-04-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: iconv
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: pry
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
27
41
|
description: Gem which improve workflow with arabic text
|
28
42
|
email: ahmed.nasser@dynamix-systems.com
|
29
43
|
executables: []
|
@@ -36,7 +50,6 @@ files:
|
|
36
50
|
- README.md
|
37
51
|
- Rakefile
|
38
52
|
- lib/prawn-arabic.rb
|
39
|
-
- lib/string_utf_support.rb
|
40
53
|
homepage: https://github.com/ozeron/arabic-prawn
|
41
54
|
licenses:
|
42
55
|
- MIT
|
data/lib/string_utf_support.rb
DELETED
@@ -1,730 +0,0 @@
|
|
1
|
-
# encoding: ascii-8bit
|
2
|
-
class String
|
3
|
-
|
4
|
-
require 'iconv'
|
5
|
-
require 'open-uri' # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
|
6
|
-
|
7
|
-
# taken from: http://www.w3.org/International/questions/qa-forms-utf-8
|
8
|
-
# UTF8REGEX = /\A(?: # ?: non-capturing group (grouping with no back references)
|
9
|
-
# [\x09\x0A\x0D\x20-\x7E] # ASCII
|
10
|
-
# | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
|
11
|
-
# | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
|
12
|
-
# | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
|
13
|
-
# | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
|
14
|
-
# | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
|
15
|
-
# | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
|
16
|
-
# | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
|
17
|
-
# )*\z/mxn
|
18
|
-
|
19
|
-
|
20
|
-
# create UTF-8 character arrays (as class instance variables)
|
21
|
-
#
|
22
|
-
# mapping tables: - http://www.unicode.org/Public/UCA/latest/allkeys.txt
|
23
|
-
# - http://unicode.org/Public/UNIDATA/UnicodeData.txt
|
24
|
-
# - http://unicode.org/Public/UNIDATA/CaseFolding.txt
|
25
|
-
# - http://www.decodeunicode.org
|
26
|
-
# - ftp://ftp.mars.org/pub/ruby/Unicode.tar.bz2
|
27
|
-
# - http://camomile.sourceforge.net
|
28
|
-
# - Character Palette (Mac OS X)
|
29
|
-
|
30
|
-
|
31
|
-
# test data
|
32
|
-
# @small_letters_utf8 = ["U+00F1", "U+00F4", "U+00E6", "U+00F8", "U+00E0", "U+00E1", "U+00E2", "U+00E4", "U+00E5", "U+00E7", "U+00E8", "U+00E9", "U+00EA", "U+00EB", "U+0153"].map { |x| u = [x[2..-1].hex].pack("U*"); u =~ UTF8REGEX ? u : nil }
|
33
|
-
@small_letters_utf8 = ["U+00F1", "U+00F4", "U+00E6", "U+00F8", "U+00E0", "U+00E1", "U+00E2", "U+00E4", "U+00E5", "U+00E7", "U+00E8", "U+00E9", "U+00EA", "U+00EB", "U+0153"].map { |x| u = [x[2..-1].hex].pack("U*"); u.valid_encoding? ? u : nil}
|
34
|
-
|
35
|
-
|
36
|
-
# @capital_letters_utf8 = ["U+00D1", "U+00D4", "U+00C6", "U+00D8", "U+00C0", "U+00C1", "U+00C2", "U+00C4", "U+00C5", "U+00C7", "U+00C8", "U+00C9", "U+00CA", "U+00CB", "U+0152"].map { |x| u = [x[2..-1].hex].pack("U*"); u =~ UTF8REGEX ? u : nil }
|
37
|
-
@capital_letters_utf8 = ["U+00D1", "U+00D4", "U+00C6", "U+00D8", "U+00C0", "U+00C1", "U+00C2", "U+00C4", "U+00C5", "U+00C7", "U+00C8", "U+00C9", "U+00CA", "U+00CB", "U+0152"].map { |x| u = [x[2..-1].hex].pack("U*"); u.valid_encoding? ? u : nil }
|
38
|
-
|
39
|
-
|
40
|
-
# @other_letters_utf8 = ["U+03A3", "U+0639", "U+0041", "U+F8D0", "U+F8FF", "U+4E2D", "U+F4EE", "U+00FE", "U+10FFFF", "U+00A9", "U+20AC", "U+221E", "U+20AC", "U+FEFF", "U+FFFD", "U+00FF", "U+00FE", "U+FFFE", "U+FEFF"].map { |x| u = [x[2..-1].hex].pack("U*"); u =~ UTF8REGEX ? u : nil }
|
41
|
-
@other_letters_utf8 = ["U+03A3", "U+0639", "U+0041", "U+F8D0", "U+F8FF", "U+4E2D", "U+F4EE", "U+00FE", "U+10FFFF", "U+00A9", "U+20AC", "U+221E", "U+20AC", "U+FEFF", "U+FFFD", "U+00FF", "U+00FE", "U+FFFE", "U+FEFF"].map { |x| u = [x[2..-1].hex].pack("U*"); u.valid_encoding? ? u : nil }
|
42
|
-
|
43
|
-
if @small_letters_utf8.size != @small_letters_utf8.count{|x| !x.nil?} then raise "Invalid UTF-8 char in @small_letters_utf8!" end
|
44
|
-
if @capital_letters_utf8.size != @capital_letters_utf8.count{|x| !x.nil?} then raise "Invalid UTF-8 char in @capital_letters_utf8!" end
|
45
|
-
if @other_letters_utf8.size != @other_letters_utf8.count{|x| !x.nil?} then raise "Invalid UTF-8 char in @other_letters_utf8!" end
|
46
|
-
|
47
|
-
|
48
|
-
@unicode_array = []
|
49
|
-
#open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f| f.each(nil) { |line| line.scan(/^[^;]+/) { |u| @unicode_array << u } } end
|
50
|
-
#open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f|
|
51
|
-
# f.each do |line| line =~ /LATIN|GREEK|CYRILLIC/ ? ( line.scan(/^[^;]+/) { |u| @unicode_array << u } ) : next end
|
52
|
-
#end
|
53
|
-
|
54
|
-
#@letters_utf8 = @unicode_array.map { |x| u = [x.hex].pack("U*"); u =~ UTF8REGEX ? u : nil }.compact # code points from UnicodeData.txt
|
55
|
-
@letters_utf8 = @small_letters_utf8 + @capital_letters_utf8 + @other_letters_utf8 # test data only
|
56
|
-
|
57
|
-
# Hash[*array_with_keys.zip(array_with_values).flatten]
|
58
|
-
@downcase_table_utf8 = Hash[*@capital_letters_utf8.zip(@small_letters_utf8).flatten]
|
59
|
-
@upcase_table_utf8 = Hash[*@small_letters_utf8.zip(@capital_letters_utf8).flatten]
|
60
|
-
@letters_utf8_hash = Hash[*@letters_utf8.zip([]).flatten] #=> ... "\341\272\242"=>nil ...
|
61
|
-
|
62
|
-
class << self
|
63
|
-
attr_accessor :small_letters_utf8
|
64
|
-
attr_accessor :capital_letters_utf8
|
65
|
-
attr_accessor :other_letters_utf8
|
66
|
-
attr_accessor :letters_utf8
|
67
|
-
attr_accessor :letters_utf8_hash
|
68
|
-
attr_accessor :unicode_array
|
69
|
-
attr_accessor :downcase_table_utf8
|
70
|
-
attr_accessor :upcase_table_utf8
|
71
|
-
end
|
72
|
-
|
73
|
-
|
74
|
-
def each_utf8_char
|
75
|
-
scan(/./mu) { |c| yield c }
|
76
|
-
end
|
77
|
-
|
78
|
-
def each_utf8_char_with_index
|
79
|
-
i = -1
|
80
|
-
scan(/./mu) { |c| i+=1; yield(c, i) }
|
81
|
-
end
|
82
|
-
|
83
|
-
def length_utf8
|
84
|
-
#scan(/./mu).size
|
85
|
-
count = 0
|
86
|
-
scan(/./mu) { count += 1 }
|
87
|
-
count
|
88
|
-
end
|
89
|
-
alias :size_utf8 :length_utf8
|
90
|
-
|
91
|
-
def reverse_utf8
|
92
|
-
split(//mu).reverse.join
|
93
|
-
end
|
94
|
-
|
95
|
-
def reverse_utf8!
|
96
|
-
split(//mu).reverse!.join
|
97
|
-
end
|
98
|
-
|
99
|
-
def swapcase_utf8
|
100
|
-
gsub(/./mu) do |char|
|
101
|
-
if !String.downcase_table_utf8[char].nil? then String.downcase_table_utf8[char]
|
102
|
-
elsif !String.upcase_table_utf8[char].nil? then String.upcase_table_utf8[char]
|
103
|
-
else char.swapcase
|
104
|
-
end
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
def swapcase_utf8!
|
109
|
-
gsub!(/./mu) do |char|
|
110
|
-
if !String.downcase_table_utf8[char].nil? then String.downcase_table_utf8[char]
|
111
|
-
elsif !String.upcase_table_utf8[char].nil? then String.upcase_table_utf8[char]
|
112
|
-
else ret = char.swapcase end
|
113
|
-
end
|
114
|
-
end
|
115
|
-
|
116
|
-
def downcase_utf8
|
117
|
-
gsub(/./mu) do |char|
|
118
|
-
small_char = String.downcase_table_utf8[char]
|
119
|
-
small_char.nil? ? char.downcase : small_char
|
120
|
-
end
|
121
|
-
end
|
122
|
-
|
123
|
-
def downcase_utf8!
|
124
|
-
gsub!(/./mu) do |char|
|
125
|
-
small_char = String.downcase_table_utf8[char]
|
126
|
-
small_char.nil? ? char.downcase : small_char
|
127
|
-
end
|
128
|
-
end
|
129
|
-
|
130
|
-
def upcase_utf8
|
131
|
-
gsub(/./mu) do |char|
|
132
|
-
capital_char = String.upcase_table_utf8[char]
|
133
|
-
capital_char.nil? ? char.upcase : capital_char
|
134
|
-
end
|
135
|
-
end
|
136
|
-
|
137
|
-
def upcase_utf8!
|
138
|
-
gsub!(/./mu) do |char|
|
139
|
-
capital_char = String.upcase_table_utf8[char]
|
140
|
-
capital_char.nil? ? char.upcase : capital_char
|
141
|
-
end
|
142
|
-
end
|
143
|
-
|
144
|
-
def count_utf8(c)
|
145
|
-
return nil if c.empty?
|
146
|
-
r = %r{[#{c}]}mu
|
147
|
-
scan(r).size
|
148
|
-
end
|
149
|
-
|
150
|
-
def delete_utf8(c)
|
151
|
-
return self if c.empty?
|
152
|
-
r = %r{[#{c}]}mu
|
153
|
-
gsub(r, '')
|
154
|
-
end
|
155
|
-
|
156
|
-
def delete_utf8!(c)
|
157
|
-
return self if c.empty?
|
158
|
-
r = %r{[#{c}]}mu
|
159
|
-
gsub!(r, '')
|
160
|
-
end
|
161
|
-
|
162
|
-
def first_utf8
|
163
|
-
self[/\A./mu]
|
164
|
-
end
|
165
|
-
|
166
|
-
def last_utf8
|
167
|
-
self[/.\z/mu]
|
168
|
-
end
|
169
|
-
|
170
|
-
def capitalize_utf8
|
171
|
-
return self if self =~ /\A[[:space:]]*\z/m
|
172
|
-
ret = ""
|
173
|
-
split(/\x20/).each do |w|
|
174
|
-
count = 0
|
175
|
-
w.gsub(/./mu) do |char|
|
176
|
-
count += 1
|
177
|
-
capital_char = String.upcase_table_utf8[char]
|
178
|
-
if count == 1 then
|
179
|
-
capital_char.nil? ? char.upcase : char.upcase_utf8
|
180
|
-
else
|
181
|
-
capital_char.nil? ? char.downcase : char.downcase_utf8
|
182
|
-
end
|
183
|
-
end
|
184
|
-
ret << w + ' '
|
185
|
-
end
|
186
|
-
ret =~ /\x20\z/ ? ret.sub!(/\x20\z/, '') : ret
|
187
|
-
end
|
188
|
-
|
189
|
-
def capitalize_utf8!
|
190
|
-
return self if self =~ /\A[[:space:]]*\z/m
|
191
|
-
ret = ""
|
192
|
-
split(/\x20/).each do |w|
|
193
|
-
count = 0
|
194
|
-
w.gsub!(/./mu) do |char|
|
195
|
-
count += 1
|
196
|
-
capital_char = String.upcase_table_utf8[char]
|
197
|
-
if count == 1 then
|
198
|
-
capital_char.nil? ? char.upcase : char.upcase_utf8
|
199
|
-
else
|
200
|
-
capital_char.nil? ? char.downcase : char.downcase_utf8
|
201
|
-
end
|
202
|
-
end
|
203
|
-
ret << w + ' '
|
204
|
-
end
|
205
|
-
ret =~ /\x20\z/ ? ret.sub!(/\x20\z/, '') : ret
|
206
|
-
end
|
207
|
-
|
208
|
-
|
209
|
-
def index_utf8(s)
|
210
|
-
|
211
|
-
return nil unless !self.empty? && (s.class == Regexp || s.class == String)
|
212
|
-
#raise(ArgumentError, "Wrong argument for method index_utf8!", caller) unless !self.empty? && (s.class == Regexp || s.class == String)
|
213
|
-
|
214
|
-
if s.class == Regexp
|
215
|
-
opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
|
216
|
-
if opts.count('u') == 0 then opts = opts + "u" end
|
217
|
-
str = s.source
|
218
|
-
return nil if str.empty?
|
219
|
-
str = "%r{#{str}}" + opts
|
220
|
-
r = eval(str)
|
221
|
-
l = ""
|
222
|
-
sub(r) { l << $`; " " } # $`: The string to the left of the last successful match (cf. http://www.zenspider.com/Languages/Ruby/QuickRef.html)
|
223
|
-
l.empty? ? nil : l.length_utf8
|
224
|
-
|
225
|
-
else
|
226
|
-
|
227
|
-
return nil if s.empty?
|
228
|
-
r = %r{#{s}}mu
|
229
|
-
l = ""
|
230
|
-
sub(r) { l << $`; " " }
|
231
|
-
l.empty? ? nil : l.length_utf8
|
232
|
-
|
233
|
-
# this would be a non-regex solution
|
234
|
-
=begin
|
235
|
-
return nil if s.empty?
|
236
|
-
return nil unless self =~ %r{#{s}}mu
|
237
|
-
indices = []
|
238
|
-
s.split(//mu).each do |x|
|
239
|
-
ar = []
|
240
|
-
self.each_utf8_char_with_index { |c,i| if c == x then ar << i end } # first get all matching indices c == x
|
241
|
-
indices << ar unless ar.empty?
|
242
|
-
end
|
243
|
-
if indices.empty?
|
244
|
-
return nil
|
245
|
-
elsif indices.size == 1
|
246
|
-
indices.first.first
|
247
|
-
else
|
248
|
-
#p indices
|
249
|
-
ret = []
|
250
|
-
a0 = indices.shift
|
251
|
-
a0.each do |i|
|
252
|
-
ret << i
|
253
|
-
indices.each { |a| if a.include?(i+1) then i += 1; ret << i else ret = []; break end }
|
254
|
-
return ret.first unless ret.empty?
|
255
|
-
end
|
256
|
-
ret.empty? ? nil : ret.first
|
257
|
-
end
|
258
|
-
=end
|
259
|
-
|
260
|
-
end
|
261
|
-
end
|
262
|
-
|
263
|
-
|
264
|
-
def rindex_utf8(s)
|
265
|
-
|
266
|
-
return nil unless !self.empty? && (s.class == Regexp || s.class == String)
|
267
|
-
#raise(ArgumentError, "Wrong argument for method index_utf8!", caller) unless !self.empty? && (s.class == Regexp || s.class == String)
|
268
|
-
|
269
|
-
if s.class == Regexp
|
270
|
-
opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
|
271
|
-
if opts.count('u') == 0 then opts = opts + "u" end
|
272
|
-
str = s.source
|
273
|
-
return nil if str.empty?
|
274
|
-
str = "%r{#{str}}" + opts
|
275
|
-
r = eval(str)
|
276
|
-
l = ""
|
277
|
-
scan(r) { l = $` }
|
278
|
-
#gsub(r) { l = $`; " " }
|
279
|
-
l.empty? ? nil : l.length_utf8
|
280
|
-
else
|
281
|
-
return nil if s.empty?
|
282
|
-
r = %r{#{s}}mu
|
283
|
-
l = ""
|
284
|
-
scan(r) { l = $` }
|
285
|
-
#gsub(r) { l = $`; " " }
|
286
|
-
l.empty? ? nil : l.length_utf8
|
287
|
-
end
|
288
|
-
|
289
|
-
end
|
290
|
-
|
291
|
-
|
292
|
-
# note that the i option does not work in special cases with back references
|
293
|
-
# example: "��".slice_utf8(/(.).*?\1/i) returns nil whereas "aA".slice(/(.).*?\1/i) returns "aA"
|
294
|
-
def slice_utf8(regex)
|
295
|
-
opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
|
296
|
-
if opts.count('u') == 0 then opts = opts + "u" end
|
297
|
-
s = regex.source
|
298
|
-
str = "%r{#{s}}" + opts
|
299
|
-
r = eval(str)
|
300
|
-
slice(r)
|
301
|
-
end
|
302
|
-
|
303
|
-
def slice_utf8!(regex)
|
304
|
-
opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
|
305
|
-
if opts.count('u') == 0 then opts = opts + "u" end
|
306
|
-
s = regex.source
|
307
|
-
str = "%r{#{s}}" + opts
|
308
|
-
r = eval(str)
|
309
|
-
slice!(r)
|
310
|
-
end
|
311
|
-
|
312
|
-
def cut_utf8(p,l) # (index) position, length
|
313
|
-
raise(ArgumentError, "Error: argument is not Fixnum", caller) if p.class != Fixnum or l.class != Fixnum
|
314
|
-
s = self.length_utf8
|
315
|
-
#if p < 0 then p = s - p.abs end
|
316
|
-
if p < 0 then p.abs > s ? (p = 0) : (p = s - p.abs) end # or: ... p.abs > s ? (return nil) : ...
|
317
|
-
return nil if l > s or p > (s - 1)
|
318
|
-
ret = ""
|
319
|
-
count = 0
|
320
|
-
each_utf8_char_with_index do |c,i|
|
321
|
-
break if count >= l
|
322
|
-
if i >= p && count < l then count += 1; ret << c; end
|
323
|
-
end
|
324
|
-
ret
|
325
|
-
end
|
326
|
-
|
327
|
-
def starts_with_utf8?(s)
|
328
|
-
return nil if self.empty? or s.empty?
|
329
|
-
cut_utf8(0, s.size_utf8) == s
|
330
|
-
end
|
331
|
-
|
332
|
-
def ends_with_utf8?(s)
|
333
|
-
return nil if self.empty? or s.empty?
|
334
|
-
cut_utf8(-(s.size_utf8), s.size_utf8) == s
|
335
|
-
end
|
336
|
-
|
337
|
-
def insert_utf8(i,s) # insert_utf8(index, string)
|
338
|
-
return self if s.empty?
|
339
|
-
l = self.length_utf8
|
340
|
-
if l == 0 then return s end
|
341
|
-
if i < 0 then i.abs > l ? (i = 0) : (i = l - i.abs) end # or: ... i.abs > l ? (return nil) : ...
|
342
|
-
#return nil if i > (l - 1) # return nil ...
|
343
|
-
spaces = ""
|
344
|
-
if i > (l-1) then spaces = " " * (i - (l-1)) end # ... or add spaces
|
345
|
-
str = self << spaces
|
346
|
-
s1 = str.cut_utf8(0, i)
|
347
|
-
s2 = str.cut_utf8(i, l - s1.length_utf8)
|
348
|
-
s1 << s << s2
|
349
|
-
end
|
350
|
-
|
351
|
-
def split_utf8(regex)
|
352
|
-
opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
|
353
|
-
if opts.count('u') == 0 then opts = opts + "u" end
|
354
|
-
s = regex.source
|
355
|
-
str = "%r{#{s}}" + opts
|
356
|
-
r = eval(str)
|
357
|
-
split(r)
|
358
|
-
end
|
359
|
-
|
360
|
-
def scan_utf8(regex)
|
361
|
-
opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
|
362
|
-
if opts.count('u') == 0 then opts = opts + "u" end
|
363
|
-
s = regex.source
|
364
|
-
str = "%r{#{s}}" + opts
|
365
|
-
r = eval(str)
|
366
|
-
if block_given? then scan(r) { |a,*m| yield(a,*m) } else scan(r) end
|
367
|
-
end
|
368
|
-
|
369
|
-
def range_utf8(r)
|
370
|
-
|
371
|
-
return nil if r.class != Range
|
372
|
-
#raise(ArgumentError, "No Range object given!", caller) if r.class != Range
|
373
|
-
|
374
|
-
a = r.to_s[/^[\+\-]?\d+/].to_i
|
375
|
-
b = r.to_s[/[\+\-]?\d+$/].to_i
|
376
|
-
d = r.to_s[/\.+/]
|
377
|
-
|
378
|
-
if d.size == 2 then d = 2 else d = d.size end
|
379
|
-
|
380
|
-
l = self.length_utf8
|
381
|
-
|
382
|
-
return nil if b.abs > l || a.abs > l || d < 2 || d > 3
|
383
|
-
|
384
|
-
if a < 0 then a = l - a.abs end
|
385
|
-
if b < 0 then b = l - b.abs end
|
386
|
-
|
387
|
-
return nil if a > b
|
388
|
-
|
389
|
-
str = ""
|
390
|
-
|
391
|
-
each_utf8_char_with_index do |c,i|
|
392
|
-
break if i > b
|
393
|
-
if d == 2
|
394
|
-
(i >= a && i <= b) ? str << c : next
|
395
|
-
else
|
396
|
-
(i >= a && i < b) ? str << c : next
|
397
|
-
end
|
398
|
-
end
|
399
|
-
|
400
|
-
str
|
401
|
-
|
402
|
-
end
|
403
|
-
|
404
|
-
def utf8?
|
405
|
-
# self =~ UTF8REGEX
|
406
|
-
encoding == Encoding.find("UTF-8") && valid_encoding?
|
407
|
-
end
|
408
|
-
|
409
|
-
def a
|
410
|
-
t = ""
|
411
|
-
# self.scan(/./um) { |c| t << c if c =~ UTF8REGEX }
|
412
|
-
chars.each { |c| t << c if c.utf8? }
|
413
|
-
t
|
414
|
-
end
|
415
|
-
|
416
|
-
|
417
|
-
def utf8_encoded_file? # check (or rather guess) if (HTML) file encoding is UTF-8 (experimental, so use at your own risk!)
|
418
|
-
|
419
|
-
file = self
|
420
|
-
str = ""
|
421
|
-
|
422
|
-
if file =~ /^http:\/\//
|
423
|
-
|
424
|
-
url = file
|
425
|
-
|
426
|
-
if RUBY_PLATFORM =~ /darwin/i # Mac OS X 10.4.10
|
427
|
-
|
428
|
-
seconds = 30
|
429
|
-
|
430
|
-
# check if web site is reachable
|
431
|
-
# on Windows try to use curb, http://curb.rubyforge.org (sudo gem install curb)
|
432
|
-
var = %x{ /usr/bin/curl -I -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url}; /bin/echo -n $? }.to_i
|
433
|
-
|
434
|
-
#return false unless var == 0
|
435
|
-
raise "Failed to create connection to web site: #{url} -- curl error code: #{var} -- " unless var == 0
|
436
|
-
|
437
|
-
str = %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} | \
|
438
|
-
/usr/bin/grep -Eo -m 1 \"(charset|encoding)=[\\"']?[^\\"'>]+\" | /usr/bin/grep -Eo \"[^=\\"'>]+$\" }
|
439
|
-
p str
|
440
|
-
return true if str =~ /utf-?8/i
|
441
|
-
return false if !str.empty? && str !~ /utf-?8/i
|
442
|
-
|
443
|
-
# solutions with downloaded file
|
444
|
-
|
445
|
-
# download HTML file
|
446
|
-
#downloaded_file = "/tmp/html"
|
447
|
-
downloaded_file = "~/Desktop/html"
|
448
|
-
downloaded_file = File.expand_path(downloaded_file)
|
449
|
-
%x{ /usr/bin/touch #{downloaded_file} 2>/dev/null }
|
450
|
-
raise "No valid HTML download file (path) specified!" unless File.file?(downloaded_file)
|
451
|
-
%x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} -o #{downloaded_file} #{url} }
|
452
|
-
|
453
|
-
simple_test = %x{ /usr/bin/file -ik #{downloaded_file} } # cf. man file
|
454
|
-
p simple_test
|
455
|
-
|
456
|
-
# read entire file into a string
|
457
|
-
File.open(downloaded_file).read.each(nil) do |str|
|
458
|
-
#return true if str =~ /(charset|encoding) *= *["']? *utf-?8/i
|
459
|
-
str.utf8? ? (return true) : (return false)
|
460
|
-
end
|
461
|
-
|
462
|
-
#check each line of the downloaded file
|
463
|
-
#count_lines = 0
|
464
|
-
#count_utf8 = 0
|
465
|
-
#File.foreach(downloaded_file) { |line| return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i; count_lines += 1; count_utf8 += 1 if line.clean_utf8.utf8?; break if count_lines != count_utf8 }
|
466
|
-
#count_lines == count_utf8 ? (return true) : (return false)
|
467
|
-
|
468
|
-
|
469
|
-
# in-memory solutions
|
470
|
-
|
471
|
-
#html_file_cleaned_utf8 = %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.clean_utf8
|
472
|
-
#p html_file_cleaned_utf8.utf8?
|
473
|
-
|
474
|
-
count_lines = 0
|
475
|
-
count_utf8 = 0
|
476
|
-
#%x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.each(nil) do |line| # read entire file into string
|
477
|
-
%x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.each('\n') do |line|
|
478
|
-
#return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i
|
479
|
-
count_lines += 1
|
480
|
-
count_utf8 += 1 if line.utf8?
|
481
|
-
break if count_lines != count_utf8
|
482
|
-
end
|
483
|
-
count_lines == count_utf8 ? (return true) : (return false)
|
484
|
-
|
485
|
-
else
|
486
|
-
|
487
|
-
# check each line of the HTML file (or the entire HTML file at once)
|
488
|
-
# cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
|
489
|
-
count_lines = 0
|
490
|
-
count_utf8 = 0
|
491
|
-
open(url) do |f|
|
492
|
-
# p f.meta, f.content_encoding, f.content_type
|
493
|
-
cs = f.charset
|
494
|
-
return true if cs =~ /utf-?8/i
|
495
|
-
#f.each(nil) do |str| str.utf8? ? (return true) : (return false) end # read entire file into string
|
496
|
-
f.each_line do |line|
|
497
|
-
count_lines += 1
|
498
|
-
count_utf8 += 1 if line.utf8?
|
499
|
-
break unless count_lines == count_utf8
|
500
|
-
end
|
501
|
-
end
|
502
|
-
count_lines == count_utf8 ? (return true) : (return false)
|
503
|
-
|
504
|
-
end
|
505
|
-
|
506
|
-
else
|
507
|
-
|
508
|
-
return false unless File.file?(file)
|
509
|
-
|
510
|
-
if RUBY_PLATFORM =~ /darwin/i then str = %x{ /usr/bin/file -ik #{file} }; return true if str =~ /utf-?8/i end
|
511
|
-
|
512
|
-
# read entire file into a string
|
513
|
-
#File.open(file).read.each(nil) do |str| return true if str =~ /(charset|encoding) *= *["']? *utf-?8/i; str.utf8? ? (return true) : (return false) end
|
514
|
-
|
515
|
-
# check each line of the file
|
516
|
-
count_lines = 0
|
517
|
-
count_utf8 = 0
|
518
|
-
File.foreach(file) do |line|
|
519
|
-
return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i
|
520
|
-
count_lines += 1;
|
521
|
-
count_utf8 += 1 if line.utf8?;
|
522
|
-
break if count_lines != count_utf8
|
523
|
-
end
|
524
|
-
|
525
|
-
count_lines == count_utf8 ? (return true) : (return false)
|
526
|
-
|
527
|
-
end
|
528
|
-
|
529
|
-
str =~ /utf-?8/i ? true : false
|
530
|
-
|
531
|
-
end
|
532
|
-
|
533
|
-
|
534
|
-
# cf. Paul Battley, http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
|
535
|
-
def validate_utf8
|
536
|
-
Iconv.iconv('UTF-8//IGNORE', 'UTF-8', (self + ' ') ).first[0..-2]
|
537
|
-
end
|
538
|
-
|
539
|
-
# cf. Paul Battley, http://www.ruby-forum.com/topic/70357
|
540
|
-
def asciify_utf8
|
541
|
-
return nil unless self.utf8?
|
542
|
-
#Iconv.iconv('US-ASCII//IGNORE//TRANSLIT', 'UTF-8', (self + ' ') ).first[0..-2]
|
543
|
-
# delete all punctuation characters inside words except "-" in words such as up-to-date
|
544
|
-
Iconv.iconv('US-ASCII//IGNORE//TRANSLIT', 'UTF-8', (self + ' ') ).first[0..-2].gsub(/(?!-.*)\b[[:punct:]]+\b/, '')
|
545
|
-
end
|
546
|
-
|
547
|
-
def latin1_to_utf8 # ISO-8859-1 to UTF-8
|
548
|
-
ret = Iconv.iconv("UTF-8//IGNORE", "ISO-8859-1", (self + "\x20") ).first[0..-2]
|
549
|
-
ret.utf8? ? ret : nil
|
550
|
-
end
|
551
|
-
|
552
|
-
def cp1252_to_utf8 # CP1252 (WINDOWS-1252) to UTF-8
|
553
|
-
ret = Iconv.iconv("UTF-8//IGNORE", "CP1252", (self + "\x20") ).first[0..-2]
|
554
|
-
ret.utf8? ? ret : nil
|
555
|
-
end
|
556
|
-
|
557
|
-
# cf. Paul Battley, http://www.ruby-forum.com/topic/70357
|
558
|
-
def utf16le_to_utf8
|
559
|
-
ret = Iconv.iconv('UTF-8//IGNORE', 'UTF-16LE', (self[0,(self.length/2*2)] + "\000\000") ).first[0..-2]
|
560
|
-
ret =~ /\x00\z/ ? ret.sub!(/\x00\z/, '') : ret
|
561
|
-
ret.utf8? ? ret : nil
|
562
|
-
end
|
563
|
-
|
564
|
-
def utf8_to_utf16le
|
565
|
-
return nil unless self.utf8?
|
566
|
-
ret = Iconv.iconv('UTF-16LE//IGNORE', 'UTF-8', self ).first
|
567
|
-
end
|
568
|
-
|
569
|
-
def utf8_to_unicode
|
570
|
-
return nil unless self.utf8?
|
571
|
-
str = ""
|
572
|
-
scan(/./mu) { |c| str << "U+" << sprintf("%04X", c.unpack("U*").first) }
|
573
|
-
str
|
574
|
-
end
|
575
|
-
|
576
|
-
def unicode_to_utf8
|
577
|
-
return self if self =~ /\A[[:space:]]*\z/m
|
578
|
-
str = ""
|
579
|
-
#scan(/U\+([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})/) { |u| str << [u.first.hex].pack("U*") }
|
580
|
-
#scan(/U\+([[:digit:][:xdigit:]]{4,5}|10[[:digit:][:xdigit:]]{4})/) { |u| str << [u.first.hex].pack("U*") }
|
581
|
-
scan(/(U\+(?:[[:digit:][:xdigit:]]{4,5}|10[[:digit:][:xdigit:]]{4})|.)/mu) do # for mixed strings such as "U+00bfHabla espaU+00f1ol?"
|
582
|
-
c = $1
|
583
|
-
if c =~ /^U\+/
|
584
|
-
str << [c[2..-1].hex].pack("U*")
|
585
|
-
else
|
586
|
-
str << c
|
587
|
-
end
|
588
|
-
end
|
589
|
-
str.utf8? ? str : nil
|
590
|
-
end
|
591
|
-
|
592
|
-
|
593
|
-
# dec, hex, oct conversions (experimental!)
|
594
|
-
|
595
|
-
def utf8_to_dec
|
596
|
-
return nil unless self.utf8?
|
597
|
-
str = ""
|
598
|
-
scan(/./mu) do |c|
|
599
|
-
if c =~ /^\x00$/
|
600
|
-
str << "aaa\x00" # encode \x00 as "aaa"
|
601
|
-
else
|
602
|
-
str << sprintf("%04X", c.unpack("U*").first).hex.to_s << "\x00" # convert to decimal
|
603
|
-
end
|
604
|
-
end
|
605
|
-
str[0..-2]
|
606
|
-
end
|
607
|
-
|
608
|
-
def dec_to_utf8 # \x00 is encoded as "aaa"
|
609
|
-
return self if self.empty?
|
610
|
-
return nil unless self =~ /\A[[:digit:]]+\x00/ && self =~ /\A[a[:digit:]\x00]+\z/
|
611
|
-
str = ""
|
612
|
-
split(/\x00/).each do |c|
|
613
|
-
if c.eql?("aaa")
|
614
|
-
str << "\x00"
|
615
|
-
else
|
616
|
-
str << [c.to_i].pack("U*")
|
617
|
-
end
|
618
|
-
end
|
619
|
-
str
|
620
|
-
end
|
621
|
-
|
622
|
-
|
623
|
-
def utf8_to_dec_2
|
624
|
-
return nil unless self.utf8?
|
625
|
-
str = ""
|
626
|
-
tmpstr = ""
|
627
|
-
null_str = "\x00"
|
628
|
-
scan(/./mu) do |c|
|
629
|
-
if c =~ /^\x00$/
|
630
|
-
str << "aaa\x00\x00" # encode \x00 as "aaa"
|
631
|
-
else
|
632
|
-
tmpstr = ""
|
633
|
-
c.each_byte { |x| tmpstr << x.to_s << null_str } # convert to decimal
|
634
|
-
str << tmpstr << null_str
|
635
|
-
end
|
636
|
-
end
|
637
|
-
str[0..-3]
|
638
|
-
end
|
639
|
-
|
640
|
-
def dec_to_utf8_2 # \x00 is encoded as "aaa"
|
641
|
-
return self if self.empty?
|
642
|
-
return nil unless self =~ /\A[[:digit:]]+\x00/ && self =~ /[[:digit:]]+\x00\x00/ && self =~ /\A[a[:digit:]\x00]+\z/
|
643
|
-
str = ""
|
644
|
-
split(/\x00\x00/).each do |c|
|
645
|
-
if c =~ /\x00/
|
646
|
-
c.split(/\x00/).each { |x| str << x.to_i.chr }
|
647
|
-
elsif c.eql?("aaa")
|
648
|
-
str << "\x00"
|
649
|
-
else
|
650
|
-
str << c.to_i.chr
|
651
|
-
end
|
652
|
-
end
|
653
|
-
str
|
654
|
-
end
|
655
|
-
|
656
|
-
|
657
|
-
def utf8_to_hex
|
658
|
-
return nil unless self.utf8?
|
659
|
-
str = ""
|
660
|
-
tmpstr = ""
|
661
|
-
null_str = "\x00"
|
662
|
-
scan(/./mu) do |c|
|
663
|
-
if c =~ /^\x00$/
|
664
|
-
str << "aaa\x00\x00" # encode \x00 as "aaa"
|
665
|
-
else
|
666
|
-
tmpstr = ""
|
667
|
-
c.each_byte { |x| tmpstr << sprintf("%X", x) << null_str } # convert to hexadecimal
|
668
|
-
str << tmpstr << null_str
|
669
|
-
end
|
670
|
-
end
|
671
|
-
str[0..-3]
|
672
|
-
end
|
673
|
-
|
674
|
-
def hex_to_utf8 # \x00 is encoded as "aaa"
|
675
|
-
return self if self.empty?
|
676
|
-
return nil unless self =~ /\A[[:xdigit:]]+\x00/ && self =~ /[[:xdigit:]]+\x00\x00/ && self =~ /\A[a[:xdigit:]\x00]+\z/
|
677
|
-
str = ""
|
678
|
-
split(/\x00\x00/).each do |c|
|
679
|
-
if c =~ /\x00/
|
680
|
-
c.split(/\x00/).each { |x| str << x.hex.chr }
|
681
|
-
elsif c.eql?("aaa")
|
682
|
-
str << "\x00"
|
683
|
-
else
|
684
|
-
str << c.hex.chr
|
685
|
-
end
|
686
|
-
end
|
687
|
-
str
|
688
|
-
end
|
689
|
-
|
690
|
-
|
691
|
-
def utf8_to_oct
|
692
|
-
return nil unless self.utf8?
|
693
|
-
str = ""
|
694
|
-
tmpstr = ""
|
695
|
-
null_str = "\x00"
|
696
|
-
scan(/./mu) do |c|
|
697
|
-
if c =~ /^\x00$/
|
698
|
-
str << "aaa\x00\x00" # encode \x00 as "aaa"
|
699
|
-
else
|
700
|
-
tmpstr = ""
|
701
|
-
c.each_byte { |x| tmpstr << sprintf("%o", x) << null_str } # convert to octal
|
702
|
-
str << tmpstr << null_str
|
703
|
-
end
|
704
|
-
end
|
705
|
-
str[0..-3]
|
706
|
-
end
|
707
|
-
|
708
|
-
def oct_to_utf8 # \x00 is encoded as "aaa"
|
709
|
-
return self if self.empty?
|
710
|
-
return nil unless self =~ /\A[[:digit:]]+\x00/ && self =~ /[[:digit:]]+\x00\x00/ && self =~ /\A[a[:digit:]\x00]+\z/
|
711
|
-
str = ""
|
712
|
-
split(/\x00\x00/).each do |c|
|
713
|
-
if c =~ /\x00/
|
714
|
-
c.split(/\x00/).each { |x| str << x.oct.chr }
|
715
|
-
elsif c.eql?("aaa")
|
716
|
-
str << "\x00"
|
717
|
-
else
|
718
|
-
str << c.oct.chr
|
719
|
-
end
|
720
|
-
end
|
721
|
-
str
|
722
|
-
end
|
723
|
-
|
724
|
-
# cf. http://node-0.mneisen.org/2007/03/13/email-subjects-in-utf-8-mit-ruby-kodieren/
|
725
|
-
def email_subject_utf8
|
726
|
-
return nil unless self.utf8?
|
727
|
-
"=?utf-8?b?#{[self].pack("m").delete("\n")}?="
|
728
|
-
end
|
729
|
-
|
730
|
-
end
|