Arabic-Prawn 0.0.1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of Arabic-Prawn might be problematic. Click here for more details.

data/LICENSE ADDED
@@ -0,0 +1,3 @@
1
+ == Arabic-Prawn
2
+
3
+ Put appropriate LICENSE for your project here.
data/README ADDED
@@ -0,0 +1,3 @@
1
+ == Arabic-Prawn
2
+
3
+ You should document your project here.
@@ -0,0 +1,50 @@
1
# Build script for the Arabic-Prawn gem: declares the gem specification and
# the packaging, documentation, test and spec tasks.
#
# NOTE(review): 'rake/gempackagetask', 'rake/rdoctask' and
# 'spec/rake/spectask' are the task libraries of old Rake / RSpec 1 releases;
# confirm they are available in the toolchain this gem is built with.

require 'rubygems'
require 'rake'
require 'rake/clean'
require 'rake/gempackagetask'
require 'rake/rdoctask'
require 'rake/testtask'
require 'spec/rake/spectask'

# Gem metadata; the package task below builds the gem from this object.
spec = Gem::Specification.new do |s|
  s.name = 'Arabic-Prawn'
  s.version = '0.0.1'
  s.has_rdoc = true
  s.extra_rdoc_files = ['README', 'LICENSE']
  s.summary = 'Your summary here'
  s.description = s.summary
  s.author = 'Dynamix Solutions'
  s.email = 'ahmed.nasser@dynamix-systems.com'
  # s.executables = ['your_executable_here']
  s.files = %w(LICENSE README Rakefile) + Dir.glob("{bin,lib,spec}/**/*")
  s.require_path = "lib"
  s.bindir = "bin"
end

# Packages the gem and additionally produces tar and zip archives.
Rake::GemPackageTask.new(spec) do |p|
  p.gem_spec = spec
  p.need_tar = true
  p.need_zip = true
end

# Generates RDoc for README, LICENSE and everything under lib/.
Rake::RDocTask.new do |rdoc|
  files =['README', 'LICENSE', 'lib/**/*.rb']
  rdoc.rdoc_files.add(files)
  rdoc.main = "README" # page to start on
  rdoc.title = "Arabic-Prawn Docs"
  rdoc.rdoc_dir = 'doc/rdoc' # rdoc output folder
  rdoc.options << '--line-numbers'
end

# Runs every Ruby file under test/.
Rake::TestTask.new do |t|
  t.test_files = FileList['test/**/*.rb']
end

# Runs every Ruby file under spec/.
Spec::Rake::SpecTask.new do |t|
  t.spec_files = FileList['spec/**/*.rb']
end
@@ -0,0 +1,527 @@
1
+ require 'string_utf_support'
2
+
3
# Identifiers for the four contextual glyph forms of an Arabic letter.
# They are used as keys of ArabicCharacterInfo#format_encodings.
class CharacterFormat
  Isolated = 1
  Initial = 2
  Medial = 3
  Final = 4
end
9
+
10
+
11
# Describes one Arabic letter: its base (common) encoding, the encoding of
# each of its four presentation forms, and whether it connects to the letter
# that follows it.
class ArabicCharacterInfo

  # One row per supported letter, in "U+xxxx" notation:
  #   [common, isolated, final, initial, medial, is_connected]
  # The column order matches the parameter order of #initialize. Letters that
  # do not connect forward reuse their isolated/final forms for the
  # initial/medial slots.
  CHARACTER_TABLE = [
    ["U+0627", "U+fe8d", "U+fe8e", "U+fe8d", "U+fe8e", false], # Alef
    ["U+0628", "U+fe8f", "U+fe90", "U+fe91", "U+fe92", true],  # Beh
    ["U+062a", "U+fe95", "U+fe96", "U+fe97", "U+fe98", true],  # Teh
    ["U+062b", "U+fe99", "U+fe9a", "U+fe9b", "U+fe9c", true],  # Theh
    ["U+062c", "U+fe9d", "U+fe9e", "U+fe9f", "U+fea0", true],  # Jeem
    ["U+062d", "U+fea1", "U+fea2", "U+fea3", "U+fea4", true],  # 7ah
    ["U+062e", "U+fea5", "U+fea6", "U+fea7", "U+fea8", true],  # 7'ah
    ["U+062f", "U+fea9", "U+feaa", "U+fea9", "U+feaa", false], # Dal
    ["U+0630", "U+feab", "U+feac", "U+feab", "U+feac", false], # Thal
    ["U+0631", "U+fead", "U+feae", "U+fead", "U+feae", false], # Rah
    ["U+0632", "U+feaf", "U+feb0", "U+feaf", "U+feb0", false], # Zein
    ["U+0633", "U+feb1", "U+feb2", "U+feb3", "U+feb4", true],  # Seen
    ["U+0634", "U+feb5", "U+feb6", "U+feb7", "U+feb8", true],  # Sheen
    ["U+0635", "U+feb9", "U+feba", "U+febb", "U+febc", true],  # Sad
    ["U+0636", "U+febd", "U+febe", "U+febf", "U+fec0", true],  # Dad
    ["U+0637", "U+fec1", "U+fec2", "U+fec3", "U+fec4", true],  # Tah
    ["U+0638", "U+fec5", "U+fec6", "U+fec7", "U+fec8", true],  # Thah
    ["U+0639", "U+fec9", "U+feca", "U+fecb", "U+fecc", true],  # 3ein
    ["U+063a", "U+fecd", "U+fece", "U+fecf", "U+fed0", true],  # 3'ein
    ["U+0641", "U+fed1", "U+fed2", "U+fed3", "U+fed4", true],  # Feh
    ["U+0642", "U+fed5", "U+fed6", "U+fed7", "U+fed8", true],  # Qaf
    ["U+0643", "U+fed9", "U+feda", "U+fedb", "U+fedc", true],  # Kaf
    ["U+0644", "U+fedd", "U+fede", "U+fedf", "U+fee0", true],  # Lam
    ["U+0645", "U+fee1", "U+fee2", "U+fee3", "U+fee4", true],  # Meem
    ["U+0646", "U+fee5", "U+fee6", "U+fee7", "U+fee8", true],  # Noon
    ["U+0647", "U+fee9", "U+feea", "U+feeb", "U+feec", true],  # Heh
    ["U+0648", "U+feed", "U+feee", "U+feed", "U+feee", false], # Waw
    ["U+064a", "U+fef1", "U+fef2", "U+fef3", "U+fef4", true],  # Yeh
    ["U+0621", "U+fe80", "U+fe80", "U+fe80", "U+fe80", false], # Hamza
    ["U+0622", "U+fe81", "U+fe82", "U+fe81", "U+fe82", false], # Alef Madda
    ["U+0623", "U+fe83", "U+fe84", "U+fe83", "U+fe84", false], # Alef Hamza Above
    ["U+0624", "U+fe85", "U+fe86", "U+fe85", "U+fe86", false], # Waw Hamza
    ["U+0625", "U+fe87", "U+fe88", "U+fe87", "U+fe88", false], # Alef Hamza Below
    ["U+0626", "U+fe89", "U+fe8a", "U+fe8b", "U+fe8c", true],  # Yeh Hamza
    ["U+0629", "U+fe93", "U+fe94", "U+fe93", "U+fe94", false], # Teh Marbuta
    ["U+0640", "U+0640", "U+0640", "U+0640", "U+0640", true],  # Tatweel
    ["U+0649", "U+feef", "U+fef0", "U+feef", "U+fef0", false]  # Alef Layyena
  ].freeze

  attr_accessor :common_encoding , :format_encodings, :is_connected

  # Builds the description of one letter. The five encodings are strings in
  # "U+xxxx" notation and are converted with String#unicode_to_utf8 (defined
  # in string_utf_support). +is_connected+ tells whether the letter connects
  # to the letter after it.
  def initialize(common, isolated, final, initial, medial, is_connected)
    @common_encoding = common.unicode_to_utf8
    @format_encodings = {
      CharacterFormat::Isolated => isolated.unicode_to_utf8,
      CharacterFormat::Initial => initial.unicode_to_utf8,
      CharacterFormat::Medial => medial.unicode_to_utf8,
      CharacterFormat::Final => final.unicode_to_utf8
    }
    @is_connected = is_connected
  end

  # Returns a Hash mapping each letter's common encoding to its
  # ArabicCharacterInfo. The map is built on first use and the same Hash is
  # returned on every later call.
  def self.get_arabic_characters_map
    @arabic_characters_map ||= CHARACTER_TABLE.inject({}) do |map, row|
      info = new(*row)
      map[info.common_encoding] = info
      map
    end
  end

end
417
+
418
+
419
+
420
# Arabic shaping helpers added to String. They rely on
# ArabicCharacterInfo.get_arabic_characters_map and on the UTF-8 helpers
# each_utf8_char / reverse_utf8! from string_utf_support.
class String

  # Picks the CharacterFormat for a letter from its neighbours.
  #
  # before_c - the character preceding the letter ('' when there is none)
  # after_c  - the character following the letter ('' when there is none)
  #
  # The letter is joined to the previous character only when that character
  # is in the map and is flagged is_connected; it is joined to the next
  # character whenever that character is in the map.
  def determine_format(before_c, after_c)
    charmap = ArabicCharacterInfo.get_arabic_characters_map

    joined_to_previous = charmap.key?(before_c) && charmap[before_c].is_connected
    joined_to_next = charmap.key?(after_c)

    if joined_to_previous
      joined_to_next ? CharacterFormat::Medial : CharacterFormat::Final
    else
      joined_to_next ? CharacterFormat::Initial : CharacterFormat::Isolated
    end
  end

  # Returns the encoding of character +c+ in the given +format+, or +c+
  # itself when it is not in the Arabic character map.
  def get_letter_in_format(format, c)
    character = ArabicCharacterInfo.get_arabic_characters_map[c]
    return c if character.nil?
    character.format_encodings[format]
  end

  # Replaces every mapped letter of the receiver by its contextual form.
  # When at least one letter was replaced, the result is returned with its
  # characters reversed; otherwise the characters are returned unchanged.
  def fix_word
    letters = []
    each_utf8_char { |c| letters << c }

    is_arabic = false
    connected_arabic = ""

    letters.each_with_index do |letter, i|
      before = i > 0 ? letters[i - 1] : ''
      after = letters[i + 1] || ''
      fixed_character = get_letter_in_format(determine_format(before, after), letter)
      connected_arabic << fixed_character
      is_arabic = true if fixed_character != letter
    end

    is_arabic ? connected_arabic.reverse_utf8! : connected_arabic
  end

  # Shapes every whitespace-separated word with fix_word and reverses the
  # word order (right-to-left is assumed to be the default direction).
  # Consecutive words that fix_word leaves unchanged are kept together as one
  # left-to-right run in their original order.
  #
  # Segments are joined with exactly one space. The previous implementation
  # produced doubled spaces around left-to-right runs and stray leading and
  # trailing spaces.
  def fix_arabic_glyphs
    segments = [] # output order, first element is emitted first
    ltr_run = []  # pending unchanged (left-to-right) words

    split(" ").each do |word|
      fixed_word = word.fix_word
      if fixed_word == word
        # unchanged word: buffer it to see whether more such words follow
        ltr_run << fixed_word
      else
        segments.unshift(ltr_run.join(" ")) unless ltr_run.empty?
        ltr_run = []
        segments.unshift(fixed_word)
      end
    end

    segments.unshift(ltr_run.join(" ")) unless ltr_run.empty?
    segments.join(" ")
  end
end
@@ -0,0 +1,726 @@
1
+ class String
2
+
3
+ require 'iconv'
4
+ require 'open-uri' # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
5
+
6
+ # taken from: http://www.w3.org/International/questions/qa-forms-utf-8
7
+ UTF8REGEX = /\A(?: # ?: non-capturing group (grouping with no back references)
8
+ [\x09\x0A\x0D\x20-\x7E] # ASCII
9
+ | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
10
+ | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
11
+ | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
12
+ | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
13
+ | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
14
+ | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
15
+ | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
16
+ )*\z/mnx
17
+
18
+
19
+ # create UTF-8 character arrays (as class instance variables)
20
+ #
21
+ # mapping tables: - http://www.unicode.org/Public/UCA/latest/allkeys.txt
22
+ # - http://unicode.org/Public/UNIDATA/UnicodeData.txt
23
+ # - http://unicode.org/Public/UNIDATA/CaseFolding.txt
24
+ # - http://www.decodeunicode.org
25
+ # - ftp://ftp.mars.org/pub/ruby/Unicode.tar.bz2
26
+ # - http://camomile.sourceforge.net
27
+ # - Character Palette (Mac OS X)
28
+
29
+
30
+ # test data
31
+ @small_letters_utf8 = ["U+00F1", "U+00F4", "U+00E6", "U+00F8", "U+00E0", "U+00E1", "U+00E2", "U+00E4", "U+00E5", "U+00E7", "U+00E8", "U+00E9", "U+00EA", "U+00EB", "U+0153"].map { |x| u = [x[2..-1].hex].pack("U*"); u =~ UTF8REGEX ? u : nil }
32
+
33
+
34
+ @capital_letters_utf8 = ["U+00D1", "U+00D4", "U+00C6", "U+00D8", "U+00C0", "U+00C1", "U+00C2", "U+00C4", "U+00C5", "U+00C7", "U+00C8", "U+00C9", "U+00CA", "U+00CB", "U+0152"].map { |x| u = [x[2..-1].hex].pack("U*"); u =~ UTF8REGEX ? u : nil }
35
+
36
+
37
+ @other_letters_utf8 = ["U+03A3", "U+0639", "U+0041", "U+F8D0", "U+F8FF", "U+4E2D", "U+F4EE", "U+00FE", "U+10FFFF", "U+00A9", "U+20AC", "U+221E", "U+20AC", "U+FEFF", "U+FFFD", "U+00FF", "U+00FE", "U+FFFE", "U+FEFF"].map { |x| u = [x[2..-1].hex].pack("U*"); u =~ UTF8REGEX ? u : nil }
38
+
39
+ if @small_letters_utf8.size != @small_letters_utf8.nitems then raise "Invalid UTF-8 char in @small_letters_utf8!" end
40
+ if @capital_letters_utf8.size != @capital_letters_utf8.nitems then raise "Invalid UTF-8 char in @capital_letters_utf8!" end
41
+ if @other_letters_utf8.size != @other_letters_utf8.nitems then raise "Invalid UTF-8 char in @other_letters_utf8!" end
42
+
43
+
44
+ @unicode_array = []
45
+ #open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f| f.each(nil) { |line| line.scan(/^[^;]+/) { |u| @unicode_array << u } } end
46
+ #open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f|
47
+ # f.each do |line| line =~ /LATIN|GREEK|CYRILLIC/ ? ( line.scan(/^[^;]+/) { |u| @unicode_array << u } ) : next end
48
+ #end
49
+
50
+ #@letters_utf8 = @unicode_array.map { |x| u = [x.hex].pack("U*"); u =~ UTF8REGEX ? u : nil }.compact # code points from UnicodeData.txt
51
+ @letters_utf8 = @small_letters_utf8 + @capital_letters_utf8 + @other_letters_utf8 # test data only
52
+
53
+ # Hash[*array_with_keys.zip(array_with_values).flatten]
54
+ @downcase_table_utf8 = Hash[*@capital_letters_utf8.zip(@small_letters_utf8).flatten]
55
+ @upcase_table_utf8 = Hash[*@small_letters_utf8.zip(@capital_letters_utf8).flatten]
56
+ @letters_utf8_hash = Hash[*@letters_utf8.zip([]).flatten] #=> ... "\341\272\242"=>nil ...
57
+
58
+ class << self
59
+ attr_accessor :small_letters_utf8
60
+ attr_accessor :capital_letters_utf8
61
+ attr_accessor :other_letters_utf8
62
+ attr_accessor :letters_utf8
63
+ attr_accessor :letters_utf8_hash
64
+ attr_accessor :unicode_array
65
+ attr_accessor :downcase_table_utf8
66
+ attr_accessor :upcase_table_utf8
67
+ end
68
+
69
+
70
+ def each_utf8_char
71
+ scan(/./mu) { |c| yield c }
72
+ end
73
+
74
+ def each_utf8_char_with_index
75
+ i = -1
76
+ scan(/./mu) { |c| i+=1; yield(c, i) }
77
+ end
78
+
79
+ def length_utf8
80
+ #scan(/./mu).size
81
+ count = 0
82
+ scan(/./mu) { count += 1 }
83
+ count
84
+ end
85
+ alias :size_utf8 :length_utf8
86
+
87
+ def reverse_utf8
88
+ split(//mu).reverse.join
89
+ end
90
+
91
+ def reverse_utf8!
92
+ split(//mu).reverse!.join
93
+ end
94
+
95
+ def swapcase_utf8
96
+ gsub(/./mu) do |char|
97
+ if !String.downcase_table_utf8[char].nil? then String.downcase_table_utf8[char]
98
+ elsif !String.upcase_table_utf8[char].nil? then String.upcase_table_utf8[char]
99
+ else char.swapcase
100
+ end
101
+ end
102
+ end
103
+
104
+ def swapcase_utf8!
105
+ gsub!(/./mu) do |char|
106
+ if !String.downcase_table_utf8[char].nil? then String.downcase_table_utf8[char]
107
+ elsif !String.upcase_table_utf8[char].nil? then String.upcase_table_utf8[char]
108
+ else ret = char.swapcase end
109
+ end
110
+ end
111
+
112
+ def downcase_utf8
113
+ gsub(/./mu) do |char|
114
+ small_char = String.downcase_table_utf8[char]
115
+ small_char.nil? ? char.downcase : small_char
116
+ end
117
+ end
118
+
119
+ def downcase_utf8!
120
+ gsub!(/./mu) do |char|
121
+ small_char = String.downcase_table_utf8[char]
122
+ small_char.nil? ? char.downcase : small_char
123
+ end
124
+ end
125
+
126
+ def upcase_utf8
127
+ gsub(/./mu) do |char|
128
+ capital_char = String.upcase_table_utf8[char]
129
+ capital_char.nil? ? char.upcase : capital_char
130
+ end
131
+ end
132
+
133
+ def upcase_utf8!
134
+ gsub!(/./mu) do |char|
135
+ capital_char = String.upcase_table_utf8[char]
136
+ capital_char.nil? ? char.upcase : capital_char
137
+ end
138
+ end
139
+
140
+ def count_utf8(c)
141
+ return nil if c.empty?
142
+ r = %r{[#{c}]}mu
143
+ scan(r).size
144
+ end
145
+
146
+ def delete_utf8(c)
147
+ return self if c.empty?
148
+ r = %r{[#{c}]}mu
149
+ gsub(r, '')
150
+ end
151
+
152
+ def delete_utf8!(c)
153
+ return self if c.empty?
154
+ r = %r{[#{c}]}mu
155
+ gsub!(r, '')
156
+ end
157
+
158
+ def first_utf8
159
+ self[/\A./mu]
160
+ end
161
+
162
+ def last_utf8
163
+ self[/.\z/mu]
164
+ end
165
+
166
+ def capitalize_utf8
167
+ return self if self =~ /\A[[:space:]]*\z/m
168
+ ret = ""
169
+ split(/\x20/).each do |w|
170
+ count = 0
171
+ w.gsub(/./mu) do |char|
172
+ count += 1
173
+ capital_char = String.upcase_table_utf8[char]
174
+ if count == 1 then
175
+ capital_char.nil? ? char.upcase : char.upcase_utf8
176
+ else
177
+ capital_char.nil? ? char.downcase : char.downcase_utf8
178
+ end
179
+ end
180
+ ret << w + ' '
181
+ end
182
+ ret =~ /\x20\z/ ? ret.sub!(/\x20\z/, '') : ret
183
+ end
184
+
185
+ def capitalize_utf8!
186
+ return self if self =~ /\A[[:space:]]*\z/m
187
+ ret = ""
188
+ split(/\x20/).each do |w|
189
+ count = 0
190
+ w.gsub!(/./mu) do |char|
191
+ count += 1
192
+ capital_char = String.upcase_table_utf8[char]
193
+ if count == 1 then
194
+ capital_char.nil? ? char.upcase : char.upcase_utf8
195
+ else
196
+ capital_char.nil? ? char.downcase : char.downcase_utf8
197
+ end
198
+ end
199
+ ret << w + ' '
200
+ end
201
+ ret =~ /\x20\z/ ? ret.sub!(/\x20\z/, '') : ret
202
+ end
203
+
204
+
205
+ def index_utf8(s)
206
+
207
+ return nil unless !self.empty? && (s.class == Regexp || s.class == String)
208
+ #raise(ArgumentError, "Wrong argument for method index_utf8!", caller) unless !self.empty? && (s.class == Regexp || s.class == String)
209
+
210
+ if s.class == Regexp
211
+ opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
212
+ if opts.count('u') == 0 then opts = opts + "u" end
213
+ str = s.source
214
+ return nil if str.empty?
215
+ str = "%r{#{str}}" + opts
216
+ r = eval(str)
217
+ l = ""
218
+ sub(r) { l << $`; " " } # $`: The string to the left of the last successful match (cf. http://www.zenspider.com/Languages/Ruby/QuickRef.html)
219
+ l.empty? ? nil : l.length_utf8
220
+
221
+ else
222
+
223
+ return nil if s.empty?
224
+ r = %r{#{s}}mu
225
+ l = ""
226
+ sub(r) { l << $`; " " }
227
+ l.empty? ? nil : l.length_utf8
228
+
229
+ # this would be a non-regex solution
230
+ =begin
231
+ return nil if s.empty?
232
+ return nil unless self =~ %r{#{s}}mu
233
+ indices = []
234
+ s.split(//mu).each do |x|
235
+ ar = []
236
+ self.each_utf8_char_with_index { |c,i| if c == x then ar << i end } # first get all matching indices c == x
237
+ indices << ar unless ar.empty?
238
+ end
239
+ if indices.empty?
240
+ return nil
241
+ elsif indices.size == 1
242
+ indices.first.first
243
+ else
244
+ #p indices
245
+ ret = []
246
+ a0 = indices.shift
247
+ a0.each do |i|
248
+ ret << i
249
+ indices.each { |a| if a.include?(i+1) then i += 1; ret << i else ret = []; break end }
250
+ return ret.first unless ret.empty?
251
+ end
252
+ ret.empty? ? nil : ret.first
253
+ end
254
+ =end
255
+
256
+ end
257
+ end
258
+
259
+
260
+ def rindex_utf8(s)
261
+
262
+ return nil unless !self.empty? && (s.class == Regexp || s.class == String)
263
+ #raise(ArgumentError, "Wrong argument for method index_utf8!", caller) unless !self.empty? && (s.class == Regexp || s.class == String)
264
+
265
+ if s.class == Regexp
266
+ opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
267
+ if opts.count('u') == 0 then opts = opts + "u" end
268
+ str = s.source
269
+ return nil if str.empty?
270
+ str = "%r{#{str}}" + opts
271
+ r = eval(str)
272
+ l = ""
273
+ scan(r) { l = $` }
274
+ #gsub(r) { l = $`; " " }
275
+ l.empty? ? nil : l.length_utf8
276
+ else
277
+ return nil if s.empty?
278
+ r = %r{#{s}}mu
279
+ l = ""
280
+ scan(r) { l = $` }
281
+ #gsub(r) { l = $`; " " }
282
+ l.empty? ? nil : l.length_utf8
283
+ end
284
+
285
+ end
286
+
287
+
288
+ # note that the i option does not work in special cases with back references
289
+ # example: "��".slice_utf8(/(.).*?\1/i) returns nil whereas "aA".slice(/(.).*?\1/i) returns "aA"
290
+ def slice_utf8(regex)
291
+ opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
292
+ if opts.count('u') == 0 then opts = opts + "u" end
293
+ s = regex.source
294
+ str = "%r{#{s}}" + opts
295
+ r = eval(str)
296
+ slice(r)
297
+ end
298
+
299
+ def slice_utf8!(regex)
300
+ opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
301
+ if opts.count('u') == 0 then opts = opts + "u" end
302
+ s = regex.source
303
+ str = "%r{#{s}}" + opts
304
+ r = eval(str)
305
+ slice!(r)
306
+ end
307
+
308
+ def cut_utf8(p,l) # (index) position, length
309
+ raise(ArgumentError, "Error: argument is not Fixnum", caller) if p.class != Fixnum or l.class != Fixnum
310
+ s = self.length_utf8
311
+ #if p < 0 then p = s - p.abs end
312
+ if p < 0 then p.abs > s ? (p = 0) : (p = s - p.abs) end # or: ... p.abs > s ? (return nil) : ...
313
+ return nil if l > s or p > (s - 1)
314
+ ret = ""
315
+ count = 0
316
+ each_utf8_char_with_index do |c,i|
317
+ break if count >= l
318
+ if i >= p && count < l then count += 1; ret << c; end
319
+ end
320
+ ret
321
+ end
322
+
323
+ def starts_with_utf8?(s)
324
+ return nil if self.empty? or s.empty?
325
+ cut_utf8(0, s.size_utf8) == s
326
+ end
327
+
328
+ def ends_with_utf8?(s)
329
+ return nil if self.empty? or s.empty?
330
+ cut_utf8(-(s.size_utf8), s.size_utf8) == s
331
+ end
332
+
333
+ def insert_utf8(i,s) # insert_utf8(index, string)
334
+ return self if s.empty?
335
+ l = self.length_utf8
336
+ if l == 0 then return s end
337
+ if i < 0 then i.abs > l ? (i = 0) : (i = l - i.abs) end # or: ... i.abs > l ? (return nil) : ...
338
+ #return nil if i > (l - 1) # return nil ...
339
+ spaces = ""
340
+ if i > (l-1) then spaces = " " * (i - (l-1)) end # ... or add spaces
341
+ str = self << spaces
342
+ s1 = str.cut_utf8(0, i)
343
+ s2 = str.cut_utf8(i, l - s1.length_utf8)
344
+ s1 << s << s2
345
+ end
346
+
347
+ def split_utf8(regex)
348
+ opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
349
+ if opts.count('u') == 0 then opts = opts + "u" end
350
+ s = regex.source
351
+ str = "%r{#{s}}" + opts
352
+ r = eval(str)
353
+ split(r)
354
+ end
355
+
356
+ def scan_utf8(regex)
357
+ opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
358
+ if opts.count('u') == 0 then opts = opts + "u" end
359
+ s = regex.source
360
+ str = "%r{#{s}}" + opts
361
+ r = eval(str)
362
+ if block_given? then scan(r) { |a,*m| yield(a,*m) } else scan(r) end
363
+ end
364
+
365
+ def range_utf8(r)
366
+
367
+ return nil if r.class != Range
368
+ #raise(ArgumentError, "No Range object given!", caller) if r.class != Range
369
+
370
+ a = r.to_s[/^[\+\-]?\d+/].to_i
371
+ b = r.to_s[/[\+\-]?\d+$/].to_i
372
+ d = r.to_s[/\.+/]
373
+
374
+ if d.size == 2 then d = 2 else d = d.size end
375
+
376
+ l = self.length_utf8
377
+
378
+ return nil if b.abs > l || a.abs > l || d < 2 || d > 3
379
+
380
+ if a < 0 then a = l - a.abs end
381
+ if b < 0 then b = l - b.abs end
382
+
383
+ return nil if a > b
384
+
385
+ str = ""
386
+
387
+ each_utf8_char_with_index do |c,i|
388
+ break if i > b
389
+ if d == 2
390
+ (i >= a && i <= b) ? str << c : next
391
+ else
392
+ (i >= a && i < b) ? str << c : next
393
+ end
394
+ end
395
+
396
+ str
397
+
398
+ end
399
+
400
+ def utf8?
401
+ self =~ UTF8REGEX
402
+ end
403
+
404
+ def clean_utf8
405
+ t = ""
406
+ self.scan(/./um) { |c| t << c if c =~ UTF8REGEX }
407
+ t
408
+ end
409
+
410
+
411
+ def utf8_encoded_file? # check (or rather guess) if (HTML) file encoding is UTF-8 (experimental, so use at your own risk!)
412
+
413
+ file = self
414
+ str = ""
415
+
416
+ if file =~ /^http:\/\//
417
+
418
+ url = file
419
+
420
+ if RUBY_PLATFORM =~ /darwin/i # Mac OS X 10.4.10
421
+
422
+ seconds = 30
423
+
424
+ # check if web site is reachable
425
+ # on Windows try to use curb, http://curb.rubyforge.org (sudo gem install curb)
426
+ var = %x{ /usr/bin/curl -I -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url}; /bin/echo -n $? }.to_i
427
+
428
+ #return false unless var == 0
429
+ raise "Failed to create connection to web site: #{url} -- curl error code: #{var} -- " unless var == 0
430
+
431
+ str = %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} | \
432
+ /usr/bin/grep -Eo -m 1 \"(charset|encoding)=[\\"']?[^\\"'>]+\" | /usr/bin/grep -Eo \"[^=\\"'>]+$\" }
433
+ p str
434
+ return true if str =~ /utf-?8/i
435
+ return false if !str.empty? && str !~ /utf-?8/i
436
+
437
+ # solutions with downloaded file
438
+
439
+ # download HTML file
440
+ #downloaded_file = "/tmp/html"
441
+ downloaded_file = "~/Desktop/html"
442
+ downloaded_file = File.expand_path(downloaded_file)
443
+ %x{ /usr/bin/touch #{downloaded_file} 2>/dev/null }
444
+ raise "No valid HTML download file (path) specified!" unless File.file?(downloaded_file)
445
+ %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} -o #{downloaded_file} #{url} }
446
+
447
+ simple_test = %x{ /usr/bin/file -ik #{downloaded_file} } # cf. man file
448
+ p simple_test
449
+
450
+ # read entire file into a string
451
+ File.open(downloaded_file).read.each(nil) do |str|
452
+ #return true if str =~ /(charset|encoding) *= *["']? *utf-?8/i
453
+ str.utf8? ? (return true) : (return false)
454
+ end
455
+
456
+ #check each line of the downloaded file
457
+ #count_lines = 0
458
+ #count_utf8 = 0
459
+ #File.foreach(downloaded_file) { |line| return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i; count_lines += 1; count_utf8 += 1 if line.clean_utf8.utf8?; break if count_lines != count_utf8 }
460
+ #count_lines == count_utf8 ? (return true) : (return false)
461
+
462
+
463
+ # in-memory solutions
464
+
465
+ #html_file_cleaned_utf8 = %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.clean_utf8
466
+ #p html_file_cleaned_utf8.utf8?
467
+
468
+ count_lines = 0
469
+ count_utf8 = 0
470
+ #%x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.each(nil) do |line| # read entire file into string
471
+ %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.each('\n') do |line|
472
+ #return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i
473
+ count_lines += 1
474
+ count_utf8 += 1 if line.utf8?
475
+ break if count_lines != count_utf8
476
+ end
477
+ count_lines == count_utf8 ? (return true) : (return false)
478
+
479
+ else
480
+
481
+ # check each line of the HTML file (or the entire HTML file at once)
482
+ # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
483
+ count_lines = 0
484
+ count_utf8 = 0
485
+ open(url) do |f|
486
+ # p f.meta, f.content_encoding, f.content_type
487
+ cs = f.charset
488
+ return true if cs =~ /utf-?8/i
489
+ #f.each(nil) do |str| str.utf8? ? (return true) : (return false) end # read entire file into string
490
+ f.each_line do |line|
491
+ count_lines += 1
492
+ count_utf8 += 1 if line.utf8?
493
+ break unless count_lines == count_utf8
494
+ end
495
+ end
496
+ count_lines == count_utf8 ? (return true) : (return false)
497
+
498
+ end
499
+
500
+ else
501
+
502
+ return false unless File.file?(file)
503
+
504
+ if RUBY_PLATFORM =~ /darwin/i then str = %x{ /usr/bin/file -ik #{file} }; return true if str =~ /utf-?8/i end
505
+
506
+ # read entire file into a string
507
+ #File.open(file).read.each(nil) do |str| return true if str =~ /(charset|encoding) *= *["']? *utf-?8/i; str.utf8? ? (return true) : (return false) end
508
+
509
+ # check each line of the file
510
+ count_lines = 0
511
+ count_utf8 = 0
512
+ File.foreach(file) do |line|
513
+ return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i
514
+ count_lines += 1;
515
+ count_utf8 += 1 if line.utf8?;
516
+ break if count_lines != count_utf8
517
+ end
518
+
519
+ count_lines == count_utf8 ? (return true) : (return false)
520
+
521
+ end
522
+
523
+ str =~ /utf-?8/i ? true : false
524
+
525
+ end
526
+
527
+
528
+ # cf. Paul Battley, http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
529
# Re-encodes the receiver from UTF-8 to UTF-8 through Iconv with the
# //IGNORE flag and returns the converted text.
#
# A single space is appended before the conversion and the last character
# of the result is cut off again afterwards.
# NOTE(review): the padding is presumably the workaround for trailing
# invalid bytes described in the Paul Battley article cited in this
# file -- confirm.
def validate_utf8
  padded = self + ' '
  converted = Iconv.iconv('UTF-8//IGNORE', 'UTF-8', padded).first
  converted[0..-2]
end
532
+
533
+ # cf. Paul Battley, http://www.ruby-forum.com/topic/70357
534
# Transliterates the receiver from UTF-8 to US-ASCII via Iconv
# (//IGNORE//TRANSLIT) and then strips punctuation found inside words.
#
# Returns nil when the receiver is not UTF-8 (per utf8?).
#
# The final gsub deletes runs of punctuation enclosed by word boundaries,
# except runs beginning with "-", so words such as "up-to-date" keep
# their hyphens.
def asciify_utf8
  return nil unless utf8?

  # A space is appended before the conversion and removed again afterwards.
  ascii = Iconv.iconv('US-ASCII//IGNORE//TRANSLIT', 'UTF-8', self + ' ').first[0..-2]
  ascii.gsub(/(?!-.*)\b[[:punct:]]+\b/, '')
end
540
+
541
# Converts the receiver from ISO-8859-1 to UTF-8 using Iconv.
#
# A space is appended before the conversion and the last character of the
# result is removed again. Returns the converted string, or nil when the
# result does not pass utf8?.
def latin1_to_utf8 # ISO-8859-1 to UTF-8
  converted = Iconv.iconv("UTF-8//IGNORE", "ISO-8859-1", self + "\x20").first[0..-2]
  return nil unless converted.utf8?

  converted
end
545
+
546
# Converts the receiver from CP1252 (WINDOWS-1252) to UTF-8 using Iconv.
#
# A space is appended before the conversion and the last character of the
# result is removed again. Returns the converted string, or nil when the
# result does not pass utf8?.
def cp1252_to_utf8 # CP1252 (WINDOWS-1252) to UTF-8
  converted = Iconv.iconv("UTF-8//IGNORE", "CP1252", self + "\x20").first[0..-2]
  return nil unless converted.utf8?

  converted
end
550
+
551
+ # cf. Paul Battley, http://www.ruby-forum.com/topic/70357
552
# Converts the receiver from UTF-16LE to UTF-8 using Iconv.
#
# The input is first cut to an even length (length / 2 * 2) and padded
# with two NUL bytes. After conversion the last character is removed,
# and one further trailing NUL is removed if still present.
#
# Returns the converted string, or nil when it does not pass utf8?.
def utf16le_to_utf8
  even_length = length / 2 * 2
  padded = self[0, even_length] + "\000\000"
  converted = Iconv.iconv('UTF-8//IGNORE', 'UTF-16LE', padded).first[0..-2]

  # sub! changes the string in place and is a no-op when nothing matches.
  converted.sub!(/\x00\z/, '')

  converted.utf8? ? converted : nil
end
557
+
558
# Converts the receiver from UTF-8 to UTF-16LE using Iconv (//IGNORE).
#
# Returns the converted string, or nil when the receiver does not pass
# utf8?.
def utf8_to_utf16le
  return nil unless utf8?

  Iconv.iconv('UTF-16LE//IGNORE', 'UTF-8', self).first
end
562
+
563
# Rewrites every character of the receiver as "U+" followed by its code
# point in upper-case hexadecimal, zero-padded to at least four digits
# (for example "A" becomes "U+0041").
#
# Returns the encoded string, or nil when the receiver does not pass
# utf8?.
def utf8_to_unicode
  return nil unless utf8?

  encoded = scan(/./mu).map do |char|
    "U+" + sprintf("%04X", char.unpack("U*").first)
  end
  encoded.join
end
569
+
570
# Decodes "U+XXXX" escapes in the receiver back into UTF-8 characters.
#
# Escapes and literal characters may be mixed, e.g.
# "U+00bfHabla espaU+00f1ol?". Anything that is not part of an escape is
# copied through unchanged.
#
# Returns the receiver itself when it is empty or whitespace only,
# otherwise the decoded string, or nil when the decoded result does not
# pass utf8?.
def unicode_to_utf8
  return self if self =~ /\A[[:space:]]*\z/m

  # The six-digit "10XXXX" form must be tried before the 4-5 digit form.
  # Regexp alternation is ordered, so with the shorter form first
  # "U+10FFFF" was consumed as U+10FFF plus a literal "F".
  tokens = scan(/(U\+(?:10[[:digit:][:xdigit:]]{4}|[[:digit:][:xdigit:]]{4,5})|.)/mu)

  decoded = tokens.map do |match|
    token = match.first
    if token =~ /\AU\+/
      # Drop the "U+" prefix and turn the hex digits into one character.
      [token[2..-1].hex].pack("U*")
    else
      token
    end
  end.join

  decoded.utf8? ? decoded : nil
end
585
+
586
+
587
+ # dec, hex, oct conversions (experimental!)
588
+
589
# Encodes the receiver as decimal code points separated by NUL bytes.
#
# Each character becomes its code point written in decimal. A NUL
# character in the input is written as the marker "aaa" so that it cannot
# be confused with the separator.
#
# Returns the encoded string ("" for an empty receiver), or nil when the
# receiver does not pass utf8?.
def utf8_to_dec
  return nil unless utf8?

  fields = scan(/./mu).map do |char|
    if char =~ /^\x00$/
      "aaa" # encode \x00 as "aaa"
    else
      char.unpack("U*").first.to_s # code point in decimal
    end
  end
  fields.join("\x00")
end
601
+
602
# Decodes a string of NUL-separated decimal code points (the format
# written by utf8_to_dec) back into characters. The marker "aaa" stands
# for a NUL character.
#
# Returns the receiver itself when it is empty. Returns nil unless the
# input starts with digits followed by a NUL and consists only of digits,
# the letter "a" and NUL bytes.
# NOTE(review): because of the first check, input holding a single field
# (no separator at all) or starting with "aaa" yields nil -- confirm that
# this is intended.
def dec_to_utf8 # \x00 is encoded as "aaa"
  return self if empty?

  well_formed = self =~ /\A[[:digit:]]+\x00/ && self =~ /\A[a[:digit:]\x00]+\z/
  return nil unless well_formed

  chars = split(/\x00/).map do |field|
    field.eql?("aaa") ? "\x00" : [field.to_i].pack("U*")
  end
  chars.join
end
615
+
616
+
617
# Encodes the receiver byte by byte in decimal.
#
# The bytes of one character are written in decimal and joined with a
# single NUL; characters are separated from each other by two NULs. A NUL
# character in the input is written as the marker "aaa".
#
# Returns the encoded string ("" for an empty receiver), or nil when the
# receiver does not pass utf8?.
def utf8_to_dec_2
  return nil unless utf8?

  groups = scan(/./mu).map do |char|
    if char =~ /^\x00$/
      "aaa" # encode \x00 as "aaa"
    else
      char.unpack("C*").map { |byte| byte.to_s }.join("\x00") # bytes in decimal
    end
  end
  groups.join("\x00\x00")
end
633
+
634
# Decodes the byte-wise decimal format written by utf8_to_dec_2.
#
# Groups are separated by two NULs. A group containing single NULs is a
# list of byte values, the group "aaa" stands for a NUL character, and any
# other group is taken as one byte value.
#
# Returns the receiver itself when it is empty. Returns nil unless the
# input starts with digits followed by a NUL, contains digits followed by
# two NULs somewhere, and consists only of digits, "a" and NUL bytes.
# NOTE(review): these checks make input that encodes just one character
# yield nil -- confirm that this is intended.
def dec_to_utf8_2 # \x00 is encoded as "aaa"
  return self if empty?

  starts_with_field = self =~ /\A[[:digit:]]+\x00/
  has_group_break   = self =~ /[[:digit:]]+\x00\x00/
  only_known_chars  = self =~ /\A[a[:digit:]\x00]+\z/
  return nil unless starts_with_field && has_group_break && only_known_chars

  decoded = ""
  split(/\x00\x00/).each do |group|
    case group
    when /\x00/
      group.split(/\x00/).each { |byte| decoded << byte.to_i.chr }
    when "aaa"
      decoded << "\x00"
    else
      decoded << group.to_i.chr
    end
  end
  decoded
end
649
+
650
+
651
# Encodes the receiver byte by byte in upper-case hexadecimal.
#
# The bytes of one character are joined with a single NUL; characters are
# separated from each other by two NULs. A NUL character in the input is
# written as the marker "aaa".
#
# Returns the encoded string ("" for an empty receiver), or nil when the
# receiver does not pass utf8?.
def utf8_to_hex
  return nil unless utf8?

  groups = scan(/./mu).map do |char|
    if char =~ /^\x00$/
      "aaa" # encode \x00 as "aaa"
    else
      char.unpack("C*").map { |byte| sprintf("%X", byte) }.join("\x00") # bytes in hexadecimal
    end
  end
  groups.join("\x00\x00")
end
667
+
668
# Decodes the byte-wise hexadecimal format written by utf8_to_hex.
#
# Groups are separated by two NULs. A group containing single NULs is a
# list of byte values, the group "aaa" stands for a NUL character, and any
# other group is taken as one byte value.
#
# Returns the receiver itself when it is empty. Returns nil unless the
# input starts with hex digits followed by a NUL, contains hex digits
# followed by two NULs somewhere, and consists only of hex digits and NUL
# bytes.
# NOTE(review): these checks make input that encodes just one character
# yield nil -- confirm that this is intended.
def hex_to_utf8 # \x00 is encoded as "aaa"
  return self if empty?

  starts_with_field = self =~ /\A[[:xdigit:]]+\x00/
  has_group_break   = self =~ /[[:xdigit:]]+\x00\x00/
  only_known_chars  = self =~ /\A[a[:xdigit:]\x00]+\z/
  return nil unless starts_with_field && has_group_break && only_known_chars

  decoded = ""
  split(/\x00\x00/).each do |group|
    case group
    when /\x00/
      group.split(/\x00/).each { |byte| decoded << byte.hex.chr }
    when "aaa"
      decoded << "\x00"
    else
      decoded << group.hex.chr
    end
  end
  decoded
end
683
+
684
+
685
# Encodes the receiver byte by byte in octal.
#
# The bytes of one character are joined with a single NUL; characters are
# separated from each other by two NULs. A NUL character in the input is
# written as the marker "aaa".
#
# Returns the encoded string ("" for an empty receiver), or nil when the
# receiver does not pass utf8?.
def utf8_to_oct
  return nil unless utf8?

  groups = scan(/./mu).map do |char|
    if char =~ /^\x00$/
      "aaa" # encode \x00 as "aaa"
    else
      char.unpack("C*").map { |byte| sprintf("%o", byte) }.join("\x00") # bytes in octal
    end
  end
  groups.join("\x00\x00")
end
701
+
702
# Decodes the byte-wise octal format written by utf8_to_oct.
#
# Groups are separated by two NULs. A group containing single NULs is a
# list of byte values, the group "aaa" stands for a NUL character, and any
# other group is taken as one byte value.
#
# Returns the receiver itself when it is empty. Returns nil unless the
# input starts with digits followed by a NUL, contains digits followed by
# two NULs somewhere, and consists only of digits, "a" and NUL bytes.
# NOTE(review): these checks make input that encodes just one character
# yield nil -- confirm that this is intended.
def oct_to_utf8 # \x00 is encoded as "aaa"
  return self if empty?

  starts_with_field = self =~ /\A[[:digit:]]+\x00/
  has_group_break   = self =~ /[[:digit:]]+\x00\x00/
  only_known_chars  = self =~ /\A[a[:digit:]\x00]+\z/
  return nil unless starts_with_field && has_group_break && only_known_chars

  decoded = ""
  split(/\x00\x00/).each do |group|
    case group
    when /\x00/
      group.split(/\x00/).each { |byte| decoded << byte.oct.chr }
    when "aaa"
      decoded << "\x00"
    else
      decoded << group.oct.chr
    end
  end
  decoded
end
717
+
718
+ # cf. http://node-0.mneisen.org/2007/03/13/email-subjects-in-utf-8-mit-ruby-kodieren/
719
# Wraps the receiver in a "=?utf-8?b?...?=" encoded word, with the text
# Base64-encoded, for use as an e-mail subject.
#
# Returns nil when the receiver does not pass utf8?.
def email_subject_utf8
  return nil unless utf8?

  # pack("m") breaks its Base64 output into lines; remove the newlines so
  # the encoded word stays on one line.
  base64 = [self].pack("m").delete("\n")
  "=?utf-8?b?#{base64}?="
end
723
+
724
+ end
725
+
726
+
metadata ADDED
@@ -0,0 +1,67 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: Arabic-Prawn
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Dynamix Solutions
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-02-28 00:00:00 +02:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: Allows printing arabic to PDFs generated by prawn
22
+ email: ahmed.nasser@dynamix-systems.com
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files:
28
+ - README
29
+ - LICENSE
30
+ files:
31
+ - LICENSE
32
+ - README
33
+ - Rakefile
34
+ - lib/arabic-prawn.rb
35
+ - lib/string_utf_support.rb
36
+ has_rdoc: true
37
+ homepage:
38
+ licenses: []
39
+
40
+ post_install_message:
41
+ rdoc_options: []
42
+
43
+ require_paths:
44
+ - lib
45
+ required_ruby_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ segments:
50
+ - 0
51
+ version: "0"
52
+ required_rubygems_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ segments:
57
+ - 0
58
+ version: "0"
59
+ requirements: []
60
+
61
+ rubyforge_project:
62
+ rubygems_version: 1.3.6
63
+ signing_key:
64
+ specification_version: 3
65
+ summary: Allows printing arabic to PDFs generated by prawn
66
+ test_files: []
67
+