prawn-arabic 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1b1e46c30adc95d058af51d71a6c140146023122
4
+ data.tar.gz: 204379e44e33c33f7aa1f242a635a10b57f4d0b4
5
+ SHA512:
6
+ metadata.gz: ef85cd7f4e4faf73246c81b724808b3e03036b3b9b7ccf4ce126052c32adc678c28b668b429687786f1407da22311e68f845f47afcfe1a6ec08eb3769d98a7a9
7
+ data.tar.gz: bd3607dfcd57a3520ee6eedcf777d7059a93d64be0877165ee7407891825f20a3a6ab040a89af5c7d2462eff37d49ba6e5a898201d1deed4a912816731f44200
data/LICENSE ADDED
@@ -0,0 +1,10 @@
1
+ == Arabic-Prawn
2
+
3
+ MIT License
4
+ Copyright (c) 2017 Alex Lapchenko
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
7
+
8
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
9
+
10
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,14 @@
1
+ # Arabic-Prawn
2
+
3
+ Arabic language string helpers for Prawn. [Original gem on rubygems](https://rubygems.org/gems/Arabic-Prawn/versions/0.0.1)
4
+ The source code was copied to GitHub in order to provide open source support and development for this gem.
5
+
6
+ ## What is this
7
+ This gem patches the `String` class and provides a few helpers for Arabic language support.
8
+ - `#determine_format(before_c, after_c)`
9
+ - `#fix_arabic_glyphs`
10
+ - `#fix_word`
11
+ - `#get_letter_in_format(format, c)`
12
+
13
+ ## Thanks
14
+ - Creator [#44017, Dynamix Solutions](https://rubygems.org/profiles/44017)
data/Rakefile ADDED
@@ -0,0 +1,30 @@
1
# Rake tasks for the Arabic-Prawn gem: gem packaging, RDoc generation and
# the test/spec runners.
$LOAD_PATH.unshift File.expand_path("../lib", __FILE__)

require 'rubygems'
require 'rake'
require 'rake/clean'
require 'rubygems/package_task'
require 'rake/testtask'
require 'rdoc/task'
require 'rspec/core/rake_task'

# Build the gem from its gemspec. `sh` raises when the command exits
# non-zero, so a failed build fails the task instead of passing silently.
task :build do
  sh "gem build arabic-prawn.gemspec"
end

RDoc::Task.new do |rdoc|
  # The package ships README.md; there is no bare README file.
  files = ['README.md', 'LICENSE', 'lib/**/*.rb']
  rdoc.rdoc_files.add(files)
  rdoc.main = "README.md" # page to start on
  rdoc.title = "Arabic-Prawn Docs"
  rdoc.rdoc_dir = 'doc/rdoc' # rdoc output folder
  rdoc.options << '--line-numbers'
end

Rake::TestTask.new do |t|
  t.test_files = FileList['test/**/*.rb']
end

# RSpec::Core::RakeTask selects spec files through #pattern; #spec_files=
# belonged to the RSpec 1 task class and does not exist on this one.
RSpec::Core::RakeTask.new do |t|
  t.pattern = 'spec/**/*.rb'
end
@@ -0,0 +1,527 @@
1
+ require 'string_utf_support'
2
+
3
# Integer tags for the four glyph forms used as keys of
# ArabicCharacterInfo#format_encodings.
class CharacterFormat
  Isolated, Initial, Medial, Final = 1, 2, 3, 4
end
9
+
10
+
11
# Describes one Arabic letter: its common code point, the glyph to use for
# each CharacterFormat, and the is_connected flag that String#determine_format
# reads from the *preceding* letter to decide whether the current one joins it.
class ArabicCharacterInfo

  # Memoized lookup table, built on first use by get_arabic_characters_map.
  @@arabic_characters_map = nil
  attr_accessor :common_encoding , :format_encodings, :is_connected

  # All code points are passed as "U+xxxx" strings and converted through
  # String#unicode_to_utf8 (defined outside this class).
  def initialize(common, isolated, final, initial, medial, is_connected)
    @common_encoding = common.unicode_to_utf8
    @format_encodings = {
      CharacterFormat::Isolated => isolated.unicode_to_utf8,
      CharacterFormat::Initial  => initial.unicode_to_utf8,
      CharacterFormat::Medial   => medial.unicode_to_utf8,
      CharacterFormat::Final    => final.unicode_to_utf8
    }
    @is_connected = is_connected
  end

  # Returns the Hash mapping each letter's common encoding to its
  # ArabicCharacterInfo. The table is built once and then reused.
  def ArabicCharacterInfo.get_arabic_characters_map
    return @@arabic_characters_map unless @@arabic_characters_map.nil?

    rows = [
      # common,  isolated, final,    initial,  medial,   is_connected
      ["U+0627", "U+fe8d", "U+fe8e", "U+fe8d", "U+fe8e", false], # Alef
      ["U+0628", "U+fe8f", "U+fe90", "U+fe91", "U+fe92", true],  # Beh
      ["U+062a", "U+fe95", "U+fe96", "U+fe97", "U+fe98", true],  # Teh
      ["U+062b", "U+fe99", "U+fe9a", "U+fe9b", "U+fe9c", true],  # Theh
      ["U+062c", "U+fe9d", "U+fe9e", "U+fe9f", "U+fea0", true],  # Jeem
      ["U+062d", "U+fea1", "U+fea2", "U+fea3", "U+fea4", true],  # 7ah
      ["U+062e", "U+fea5", "U+fea6", "U+fea7", "U+fea8", true],  # 7'ah
      ["U+062f", "U+fea9", "U+feaa", "U+fea9", "U+feaa", false], # Dal
      ["U+0630", "U+feab", "U+feac", "U+feab", "U+feac", false], # Thal
      ["U+0631", "U+fead", "U+feae", "U+fead", "U+feae", false], # Rah
      ["U+0632", "U+feaf", "U+feb0", "U+feaf", "U+feb0", false], # Zein
      ["U+0633", "U+feb1", "U+feb2", "U+feb3", "U+feb4", true],  # Seen
      ["U+0634", "U+feb5", "U+feb6", "U+feb7", "U+feb8", true],  # Sheen
      ["U+0635", "U+feb9", "U+feba", "U+febb", "U+febc", true],  # Sad
      ["U+0636", "U+febd", "U+febe", "U+febf", "U+fec0", true],  # Dad
      ["U+0637", "U+fec1", "U+fec2", "U+fec3", "U+fec4", true],  # Tah
      ["U+0638", "U+fec5", "U+fec6", "U+fec7", "U+fec8", true],  # Thah
      ["U+0639", "U+fec9", "U+feca", "U+fecb", "U+fecc", true],  # 3ein
      ["U+063a", "U+fecd", "U+fece", "U+fecf", "U+fed0", true],  # 3'ein
      ["U+0641", "U+fed1", "U+fed2", "U+fed3", "U+fed4", true],  # Feh
      ["U+0642", "U+fed5", "U+fed6", "U+fed7", "U+fed8", true],  # Qaf
      ["U+0643", "U+fed9", "U+feda", "U+fedb", "U+fedc", true],  # Kaf
      ["U+0644", "U+fedd", "U+fede", "U+fedf", "U+fee0", true],  # Lam
      ["U+0645", "U+fee1", "U+fee2", "U+fee3", "U+fee4", true],  # Meem
      ["U+0646", "U+fee5", "U+fee6", "U+fee7", "U+fee8", true],  # Noon
      ["U+0647", "U+fee9", "U+feea", "U+feeb", "U+feec", true],  # Heh
      ["U+0648", "U+feed", "U+feee", "U+feed", "U+feee", false], # Waw
      ["U+064a", "U+fef1", "U+fef2", "U+fef3", "U+fef4", true],  # Yeh
      ["U+0621", "U+fe80", "U+fe80", "U+fe80", "U+fe80", false], # Hamza
      ["U+0622", "U+fe81", "U+fe82", "U+fe81", "U+fe82", false], # Alef Madda
      ["U+0623", "U+fe83", "U+fe84", "U+fe83", "U+fe84", false], # Alef Hamza Above
      ["U+0624", "U+fe85", "U+fe86", "U+fe85", "U+fe86", false], # Waw Hamza
      ["U+0625", "U+fe87", "U+fe88", "U+fe87", "U+fe88", false], # Alef Hamza Below
      ["U+0626", "U+fe89", "U+fe8a", "U+fe8b", "U+fe8c", true],  # Yeh Hamza
      ["U+0629", "U+fe93", "U+fe94", "U+fe93", "U+fe94", false], # Teh Marbuta
      ["U+0640", "U+0640", "U+0640", "U+0640", "U+0640", true],  # Tatweel
      ["U+0649", "U+feef", "U+fef0", "U+feef", "U+fef0", false]  # Alef Layyena
    ]

    map = Hash.new
    rows.each do |row|
      info = ArabicCharacterInfo.new(*row)
      map[info.common_encoding] = info
    end

    @@arabic_characters_map = map
  end

end
417
+
418
+
419
+
420
# Arabic shaping helpers patched onto String. Letters are looked up in
# ArabicCharacterInfo.get_arabic_characters_map; anything not in that map is
# treated as a non-Arabic character.
class String

  # Picks the CharacterFormat for a letter from its neighbours.
  # before_c:: the character preceding the letter ('' when there is none)
  # after_c::  the character following the letter ('' when there is none)
  # The letter joins backwards only if before_c is in the map and is flagged
  # is_connected; it joins forwards if after_c is in the map at all.
  # Does not read the receiver.
  def determine_format(before_c, after_c)
    charmap = ArabicCharacterInfo.get_arabic_characters_map

    joins_previous = charmap.key?(before_c) && charmap[before_c].is_connected
    joins_next = charmap.key?(after_c)

    if joins_next
      joins_previous ? CharacterFormat::Medial : CharacterFormat::Initial
    else
      joins_previous ? CharacterFormat::Final : CharacterFormat::Isolated
    end
  end

  # Returns the glyph of character c for the given format, or c itself when
  # c is not in the character map. Does not read the receiver.
  def get_letter_in_format(format, c)
    info = ArabicCharacterInfo.get_arabic_characters_map[c]
    info.nil? ? c : info.format_encodings[format]
  end

  # Replaces every mapped letter of the receiver by its contextual glyph.
  # If at least one character changed, the shaped word is returned reversed
  # (via reverse_utf8!); otherwise the word is returned in its original order.
  def fix_word
    letters = []
    each_utf8_char { |c| letters << c }

    shaped = ""
    contains_arabic = false

    letters.each_with_index do |letter, idx|
      before = idx.zero? ? '' : letters[idx - 1]
      after = letters[idx + 1] || ''
      glyph = get_letter_in_format(determine_format(before, after), letter)
      shaped += glyph
      contains_arabic = true unless glyph == letter
    end

    contains_arabic ? shaped.reverse_utf8! : shaped
  end

  # Shapes every space-separated word and lays the words out right-to-left.
  # Runs of consecutive unchanged (non-Arabic) words keep their relative
  # order. Each word placed into the result is followed by a space and each
  # buffered non-Arabic word is preceded by one, so the result carries the
  # corresponding leading/trailing spaces.
  def fix_arabic_glyphs
    output = ""

    # assuming default is rtl: unchanged words are buffered until the next
    # Arabic word (or the end of the input) flushes them as one run
    pending_ltr = ""

    split(" ").each do |word|
      shaped = word.fix_word

      if shaped == word
        pending_ltr = pending_ltr + " " + shaped
        next
      end

      unless pending_ltr.empty?
        output = pending_ltr + " " + output
        pending_ltr = ""
      end
      output = shaped + " " + output
    end

    output = pending_ltr + " " + output unless pending_ltr.empty?
    output
  end
end
@@ -0,0 +1,726 @@
1
+ class String
2
+
3
+ require 'iconv'
4
+ require 'open-uri' # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
5
+
6
# Matches a string that consists only of well-formed UTF-8 byte sequences
# (plus tab, LF, CR and printable ASCII). It is a binary (/n) pattern, so it
# has to be matched against the raw bytes of a string.
# taken from: http://www.w3.org/International/questions/qa-forms-utf-8
UTF8REGEX = /\A(?: # ?: non-capturing group (grouping with no back references)
[\x09\x0A\x0D\x20-\x7E] # ASCII
| [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
)*\z/mnx


# create UTF-8 character arrays (as class instance variables)
#
# mapping tables: - http://www.unicode.org/Public/UCA/latest/allkeys.txt
# - http://unicode.org/Public/UNIDATA/UnicodeData.txt
# - http://unicode.org/Public/UNIDATA/CaseFolding.txt
# - http://www.decodeunicode.org
# - ftp://ftp.mars.org/pub/ruby/Unicode.tar.bz2
# - http://camomile.sourceforge.net
# - Character Palette (Mac OS X)

# Converts "U+XXXX" strings into UTF-8 characters; an entry becomes nil when
# its bytes do not pass UTF8REGEX. The check runs on a binary copy (String#b)
# because matching the binary regexp against a UTF-8 string with non-ASCII
# characters raises Encoding::CompatibilityError.
to_utf8_chars = lambda do |code_points|
  code_points.map do |x|
    u = [x[2..-1].hex].pack("U*")
    u.b =~ UTF8REGEX ? u : nil
  end
end

# test data
@small_letters_utf8 = to_utf8_chars.call(["U+00F1", "U+00F4", "U+00E6", "U+00F8", "U+00E0", "U+00E1", "U+00E2", "U+00E4", "U+00E5", "U+00E7", "U+00E8", "U+00E9", "U+00EA", "U+00EB", "U+0153"])

@capital_letters_utf8 = to_utf8_chars.call(["U+00D1", "U+00D4", "U+00C6", "U+00D8", "U+00C0", "U+00C1", "U+00C2", "U+00C4", "U+00C5", "U+00C7", "U+00C8", "U+00C9", "U+00CA", "U+00CB", "U+0152"])

@other_letters_utf8 = to_utf8_chars.call(["U+03A3", "U+0639", "U+0041", "U+F8D0", "U+F8FF", "U+4E2D", "U+F4EE", "U+00FE", "U+10FFFF", "U+00A9", "U+20AC", "U+221E", "U+20AC", "U+FEFF", "U+FFFD", "U+00FF", "U+00FE", "U+FFFE", "U+FEFF"])

# A nil entry means a code point failed validation. Array#nitems was removed
# in Ruby 1.9, so the non-nil entries are counted with compact.size.
if @small_letters_utf8.size != @small_letters_utf8.compact.size then raise "Invalid UTF-8 char in @small_letters_utf8!" end
if @capital_letters_utf8.size != @capital_letters_utf8.compact.size then raise "Invalid UTF-8 char in @capital_letters_utf8!" end
if @other_letters_utf8.size != @other_letters_utf8.compact.size then raise "Invalid UTF-8 char in @other_letters_utf8!" end


@unicode_array = []
#open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f| f.each(nil) { |line| line.scan(/^[^;]+/) { |u| @unicode_array << u } } end
#open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f|
# f.each do |line| line =~ /LATIN|GREEK|CYRILLIC/ ? ( line.scan(/^[^;]+/) { |u| @unicode_array << u } ) : next end
#end

#@letters_utf8 = @unicode_array.map { |x| u = [x.hex].pack("U*"); u =~ UTF8REGEX ? u : nil }.compact # code points from UnicodeData.txt
@letters_utf8 = @small_letters_utf8 + @capital_letters_utf8 + @other_letters_utf8 # test data only

# Hash[*array_with_keys.zip(array_with_values).flatten]
# The two case tables pair the capital and small arrays position by position.
@downcase_table_utf8 = Hash[*@capital_letters_utf8.zip(@small_letters_utf8).flatten]
@upcase_table_utf8 = Hash[*@small_letters_utf8.zip(@capital_letters_utf8).flatten]
@letters_utf8_hash = Hash[*@letters_utf8.zip([]).flatten] #=> ... "\341\272\242"=>nil ...

# Expose the tables as String.small_letters_utf8, String.downcase_table_utf8, ...
class << self
  attr_accessor :small_letters_utf8
  attr_accessor :capital_letters_utf8
  attr_accessor :other_letters_utf8
  attr_accessor :letters_utf8
  attr_accessor :letters_utf8_hash
  attr_accessor :unicode_array
  attr_accessor :downcase_table_utf8
  attr_accessor :upcase_table_utf8
end
68
+
69
+
70
# Yields every character of the receiver in order (newlines included,
# because of the /m flag).
def each_utf8_char
  scan(/./mu) do |char|
    yield char
  end
end
73
+
74
# Yields every character of the receiver together with its zero-based
# character position.
def each_utf8_char_with_index
  position = 0
  scan(/./mu) do |char|
    yield(char, position)
    position += 1
  end
end
78
+
79
# Number of characters (not bytes) in the receiver.
def length_utf8
  scan(/./mu).size
end
alias :size_utf8 :length_utf8
86
+
87
# Returns a new string with the characters of the receiver in reverse order.
def reverse_utf8
  chars = split(//mu)
  chars.reverse!
  chars.join
end
90
+
91
# Reverses the characters of the receiver in place and returns the receiver.
# The previous implementation returned a reversed copy and left the receiver
# untouched, which made it indistinguishable from reverse_utf8. Callers that
# only use the return value see the same result as before.
def reverse_utf8!
  replace(split(//mu).reverse.join)
end
94
+
95
# Returns a copy with the case of every character swapped. The downcase table
# is consulted first, then the upcase table; characters found in neither go
# through String#swapcase.
def swapcase_utf8
  gsub(/./mu) do |char|
    lower = String.downcase_table_utf8[char]
    next lower unless lower.nil?

    upper = String.upcase_table_utf8[char]
    upper.nil? ? char.swapcase : upper
  end
end
103
+
104
# In-place variant of swapcase_utf8. Returns the receiver, or nil when the
# receiver contains no characters (the gsub! contract).
def swapcase_utf8!
  gsub!(/./mu) do |char|
    lower = String.downcase_table_utf8[char]
    next lower unless lower.nil?

    upper = String.upcase_table_utf8[char]
    upper.nil? ? char.swapcase : upper
  end
end
111
+
112
# Returns a lower-cased copy: characters listed in String.downcase_table_utf8
# are replaced by their table entry, all others go through String#downcase.
def downcase_utf8
  gsub(/./mu) do |char|
    mapped = String.downcase_table_utf8[char]
    if mapped.nil? then char.downcase else mapped end
  end
end
118
+
119
# In-place variant of downcase_utf8. Returns the receiver, or nil when the
# receiver contains no characters (the gsub! contract).
def downcase_utf8!
  gsub!(/./mu) do |char|
    mapped = String.downcase_table_utf8[char]
    if mapped.nil? then char.downcase else mapped end
  end
end
125
+
126
# Returns an upper-cased copy: characters listed in String.upcase_table_utf8
# are replaced by their table entry, all others go through String#upcase.
def upcase_utf8
  gsub(/./mu) do |char|
    mapped = String.upcase_table_utf8[char]
    if mapped.nil? then char.upcase else mapped end
  end
end
132
+
133
# In-place variant of upcase_utf8. Returns the receiver, or nil when the
# receiver contains no characters (the gsub! contract).
def upcase_utf8!
  gsub!(/./mu) do |char|
    mapped = String.upcase_table_utf8[char]
    if mapped.nil? then char.upcase else mapped end
  end
end
139
+
140
# Counts the characters of the receiver that belong to the set c.
# c is interpolated into a regexp character class as-is, so class syntax
# such as "a-z" or a leading "^" is interpreted by the regexp engine.
# Returns nil when c is empty.
def count_utf8(c)
  return nil if c.empty?
  scan(/[#{c}]/mu).length
end
145
+
146
# Returns a copy without the characters that belong to the set c.
# c is interpolated into a regexp character class as-is. An empty c returns
# the receiver itself.
def delete_utf8(c)
  return self if c.empty?
  gsub(/[#{c}]/mu, '')
end
151
+
152
# In-place variant of delete_utf8. Returns the receiver when c is empty or
# something was removed, and nil when nothing matched (the gsub! contract).
def delete_utf8!(c)
  return self if c.empty?
  gsub!(/[#{c}]/mu, '')
end
157
+
158
# First character of the receiver, or nil for an empty string.
def first_utf8
  slice(/\A./mu)
end
161
+
162
# Last character of the receiver, or nil for an empty string.
def last_utf8
  slice(/.\z/mu)
end
165
+
166
# Returns a copy in which every space-separated word has its first character
# upper-cased (upcase_utf8) and all remaining characters lower-cased
# (downcase_utf8). A blank receiver is returned as-is. Trailing spaces are
# not preserved, because String#split drops trailing empty fields.
#
# Fixes two defects of the previous version: the result of the non-bang gsub
# was discarded, so words came back uncapitalized; and non-initial characters
# were looked up in the upcase table, so capitals listed only in the downcase
# table were never mapped through it.
def capitalize_utf8
  return self if self =~ /\A[[:space:]]*\z/m

  words = split(/\x20/).map do |w|
    count = 0
    w.gsub(/./mu) do |char|
      count += 1
      count == 1 ? char.upcase_utf8 : char.downcase_utf8
    end
  end
  words.join(' ')
end
184
+
185
# Capitalizes every space-separated word of the receiver in place: first
# character through upcase_utf8, remaining characters through downcase_utf8.
# Returns the receiver. A blank receiver is returned untouched.
#
# The previous version only modified the temporary words produced by split
# and returned a new string, so the receiver itself never changed. It also
# looked up non-initial characters in the upcase table instead of the
# downcase table.
def capitalize_utf8!
  return self if self =~ /\A[[:space:]]*\z/m

  words = split(/\x20/).map do |w|
    count = 0
    w.gsub(/./mu) do |char|
      count += 1
      count == 1 ? char.upcase_utf8 : char.downcase_utf8
    end
  end
  replace(words.join(' '))
end
203
+
204
+
205
# Character index (not byte index) of the first match of s in the receiver.
# s:: a Regexp, or a String that is interpolated into a regexp as-is
# Returns nil when the receiver is empty, s is neither Regexp nor String,
# s (or the regexp source) is empty, or nothing matches.
#
# The previous version used "pre-match text is empty" as its no-match test
# and therefore returned nil for a match at index 0.
def index_utf8(s)
  return nil unless !self.empty? && (s.class == Regexp || s.class == String)

  if s.class == Regexp
    # Rebuild the regexp with the /u flag added to its original flags.
    opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
    if opts.count('u') == 0 then opts = opts + "u" end
    str = s.source
    return nil if str.empty?
    # NOTE(review): eval of the regexp source executes code if the Regexp was
    # built from untrusted input -- consider Regexp.new here.
    r = eval("%r{#{str}}" + opts)
  else
    return nil if s.empty?
    # NOTE(review): the string is not escaped, so regexp metacharacters in s
    # are interpreted as pattern syntax.
    r = %r{#{s}}mu
  end

  found = r.match(self)
  found ? found.pre_match.length_utf8 : nil
end
258
+
259
+
260
# Character index (not byte index) of the last match of s found by a
# left-to-right, non-overlapping scan of the receiver.
# s:: a Regexp, or a String that is interpolated into a regexp as-is
# Returns nil when the receiver is empty, s is neither Regexp nor String,
# s (or the regexp source) is empty, or nothing matches.
#
# The previous version used "pre-match text is empty" as its no-match test
# and therefore returned nil when the last match started at index 0.
def rindex_utf8(s)
  return nil unless !self.empty? && (s.class == Regexp || s.class == String)

  if s.class == Regexp
    # Rebuild the regexp with the /u flag added to its original flags.
    opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
    if opts.count('u') == 0 then opts = opts + "u" end
    str = s.source
    return nil if str.empty?
    # NOTE(review): eval of the regexp source executes code if the Regexp was
    # built from untrusted input -- consider Regexp.new here.
    r = eval("%r{#{str}}" + opts)
  else
    return nil if s.empty?
    r = %r{#{s}}mu
  end

  # Remember the text before each match; the last one wins. nil = no match.
  prefix = nil
  scan(r) { prefix = $` }
  prefix.nil? ? nil : prefix.length_utf8
end
286
+
287
+
288
# Like String#slice with a Regexp argument, but the regexp is rebuilt with the
# /u flag added to its original flags.
#
# note that the i option does not work in special cases with back references
# example: for a lowercase/uppercase pair of a non-ASCII letter (e.g. "äÄ"),
# slice_utf8(/(.).*?\1/i) returns nil whereas "aA".slice(/(.).*?\1/i) returns "aA"
#
# NOTE(review): the regexp source is passed through eval; a Regexp built from
# untrusted input could execute code here.
def slice_utf8(regex)
  flags = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
  flags = flags + "u" if flags.count('u') == 0
  pattern = eval("%r{#{regex.source}}" + flags)
  slice(pattern)
end
298
+
299
# Like String#slice! with a Regexp argument: removes the first match from the
# receiver and returns it (nil when nothing matches). The regexp is rebuilt
# with the /u flag added to its original flags.
# NOTE(review): the regexp source is passed through eval; a Regexp built from
# untrusted input could execute code here.
def slice_utf8!(regex)
  flags = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
  flags = flags + "u" if flags.count('u') == 0
  pattern = eval("%r{#{regex.source}}" + flags)
  slice!(pattern)
end
307
+
308
# Returns up to l characters of the receiver starting at character index p.
# p:: start index; a negative p counts from the end (clamped to 0 when it
#     reaches before the start)
# l:: number of characters to take
# Returns nil when l exceeds the receiver's length or p is past the last
# character. Raises ArgumentError when p or l is not an integer.
#
# The type check uses Integer: the Fixnum constant is deprecated since
# Ruby 2.4 and removed in 3.2, where referencing it raises NameError.
def cut_utf8(p,l) # (index) position, length
  raise(ArgumentError, "Error: argument is not Fixnum", caller) unless p.is_a?(Integer) && l.is_a?(Integer)
  s = self.length_utf8
  if p < 0 then p = p.abs > s ? 0 : s - p.abs end # or: ... p.abs > s ? (return nil) : ...
  return nil if l > s || p > (s - 1)
  ret = ""
  count = 0
  each_utf8_char_with_index do |c,i|
    break if count >= l
    if i >= p then count += 1; ret << c end
  end
  ret
end
322
+
323
# True when the receiver begins with the characters of s. Returns nil when
# either string is empty.
def starts_with_utf8?(s)
  return nil if self.empty? || s.empty?
  prefix = cut_utf8(0, s.size_utf8)
  prefix == s
end
327
+
328
# True when the receiver ends with the characters of s. Returns nil when
# either string is empty.
def ends_with_utf8?(s)
  return nil if self.empty? || s.empty?
  wanted = s.size_utf8
  suffix = cut_utf8(-wanted, s.size_utf8)
  suffix == s
end
332
+
333
# Returns a new string with s inserted before character index i.
# i:: insertion index; a negative i counts from the end (clamped to 0 when it
#     reaches before the start); an i beyond the end pads the gap with spaces
# s:: string to insert
# Returns the receiver itself when s is empty, and s itself when the receiver
# is empty.
#
# The padding is now added to a copy. The previous version used
# `self << spaces`, which appended the padding to the caller's string
# whenever i was at or beyond its end.
def insert_utf8(i,s) # insert_utf8(index, string)
  return self if s.empty?
  l = self.length_utf8
  if l == 0 then return s end
  if i < 0 then i = i.abs > l ? 0 : l - i.abs end # or: ... i.abs > l ? (return nil) : ...
  spaces = ""
  # one extra space so that index i exists and cut_utf8 accepts it
  if i > (l-1) then spaces = " " * (i - (l-1)) end # ... or add spaces
  str = self + spaces
  s1 = str.cut_utf8(0, i)
  s2 = str.cut_utf8(i, l - s1.length_utf8)
  s1 << s << s2
end
346
+
347
# Like String#split with a Regexp argument; the regexp is rebuilt with the /u
# flag added to its original flags.
# NOTE(review): the regexp source is passed through eval; a Regexp built from
# untrusted input could execute code here.
def split_utf8(regex)
  flags = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
  flags = flags + "u" if flags.count('u') == 0
  pattern = eval("%r{#{regex.source}}" + flags)
  split(pattern)
end
355
+
356
# Like String#scan with a Regexp argument; the regexp is rebuilt with the /u
# flag added to its original flags. Without a block the matches are returned;
# with a block each match (or its capture groups) is yielded.
# NOTE(review): the regexp source is passed through eval; a Regexp built from
# untrusted input could execute code here.
def scan_utf8(regex)
  flags = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
  flags = flags + "u" if flags.count('u') == 0
  r = eval("%r{#{regex.source}}" + flags)
  return scan(r) unless block_given?
  scan(r) { |a,*m| yield(a,*m) }
end
364
+
365
# Returns the characters of the receiver selected by the Range r, using
# character (not byte) positions.
# The bounds are read from the textual form of the range (r.to_s): leading
# integer, trailing integer, and the number of dots (2 = inclusive,
# 3 = exclusive end). Negative bounds count from the end.
# Returns nil when r is not a Range, a bound's magnitude exceeds the length,
# or the start lies after the end.
def range_utf8(r)
  return nil if r.class != Range
  #raise(ArgumentError, "No Range object given!", caller) if r.class != Range

  a = r.to_s[/^[\+\-]?\d+/].to_i
  b = r.to_s[/[\+\-]?\d+$/].to_i
  d = r.to_s[/\.+/].size

  l = self.length_utf8
  return nil if b.abs > l || a.abs > l || d < 2 || d > 3

  a = l - a.abs if a < 0
  b = l - b.abs if b < 0
  return nil if a > b

  str = ""
  each_utf8_char_with_index do |c,i|
    break if i > b
    next if i < a
    next if d != 2 && i >= b # exclusive range: skip the end position itself
    str << c
  end
  str
end
399
+
400
# Tests the receiver against UTF8REGEX. Returns 0 (truthy) when the whole
# string matches and nil otherwise.
# The match runs on a binary copy (String#b): UTF8REGEX is a binary regexp,
# and matching it directly against a UTF-8 string with non-ASCII characters
# raises Encoding::CompatibilityError.
def utf8?
  b =~ UTF8REGEX
end
403
+
404
# Returns a copy that keeps only the characters whose bytes pass UTF8REGEX.
#
# Characters are walked with each_char and each is tested as a binary copy
# (String#b). The previous scan(/./um) raised on a receiver containing invalid
# byte sequences (the input this method exists to clean), and matching the
# binary UTF8REGEX against a non-ASCII UTF-8 character raised
# Encoding::CompatibilityError.
def clean_utf8
  t = ""
  each_char { |c| t << c if c.b =~ UTF8REGEX }
  t
end
409
+
410
+
411
+ def utf8_encoded_file? # check (or rather guess) if (HTML) file encoding is UTF-8 (experimental, so use at your own risk!)
412
+
413
+ file = self
414
+ str = ""
415
+
416
+ if file =~ /^http:\/\//
417
+
418
+ url = file
419
+
420
+ if RUBY_PLATFORM =~ /darwin/i # Mac OS X 10.4.10
421
+
422
+ seconds = 30
423
+
424
+ # check if web site is reachable
425
+ # on Windows try to use curb, http://curb.rubyforge.org (sudo gem install curb)
426
+ var = %x{ /usr/bin/curl -I -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url}; /bin/echo -n $? }.to_i
427
+
428
+ #return false unless var == 0
429
+ raise "Failed to create connection to web site: #{url} -- curl error code: #{var} -- " unless var == 0
430
+
431
+ str = %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} | \
432
+ /usr/bin/grep -Eo -m 1 \"(charset|encoding)=[\\"']?[^\\"'>]+\" | /usr/bin/grep -Eo \"[^=\\"'>]+$\" }
433
+ p str
434
+ return true if str =~ /utf-?8/i
435
+ return false if !str.empty? && str !~ /utf-?8/i
436
+
437
+ # solutions with downloaded file
438
+
439
+ # download HTML file
440
+ #downloaded_file = "/tmp/html"
441
+ downloaded_file = "~/Desktop/html"
442
+ downloaded_file = File.expand_path(downloaded_file)
443
+ %x{ /usr/bin/touch #{downloaded_file} 2>/dev/null }
444
+ raise "No valid HTML download file (path) specified!" unless File.file?(downloaded_file)
445
+ %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} -o #{downloaded_file} #{url} }
446
+
447
+ simple_test = %x{ /usr/bin/file -ik #{downloaded_file} } # cf. man file
448
+ p simple_test
449
+
450
+ # read entire file into a string
451
+ File.open(downloaded_file).read.each(nil) do |str|
452
+ #return true if str =~ /(charset|encoding) *= *["']? *utf-?8/i
453
+ str.utf8? ? (return true) : (return false)
454
+ end
455
+
456
+ #check each line of the downloaded file
457
+ #count_lines = 0
458
+ #count_utf8 = 0
459
+ #File.foreach(downloaded_file) { |line| return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i; count_lines += 1; count_utf8 += 1 if line.clean_utf8.utf8?; break if count_lines != count_utf8 }
460
+ #count_lines == count_utf8 ? (return true) : (return false)
461
+
462
+
463
+ # in-memory solutions
464
+
465
+ #html_file_cleaned_utf8 = %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.clean_utf8
466
+ #p html_file_cleaned_utf8.utf8?
467
+
468
+ count_lines = 0
469
+ count_utf8 = 0
470
+ #%x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.each(nil) do |line| # read entire file into string
471
+ %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{url} }.each('\n') do |line|
472
+ #return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i
473
+ count_lines += 1
474
+ count_utf8 += 1 if line.utf8?
475
+ break if count_lines != count_utf8
476
+ end
477
+ count_lines == count_utf8 ? (return true) : (return false)
478
+
479
+ else
480
+
481
+ # check each line of the HTML file (or the entire HTML file at once)
482
+ # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
483
+ count_lines = 0
484
+ count_utf8 = 0
485
+ open(url) do |f|
486
+ # p f.meta, f.content_encoding, f.content_type
487
+ cs = f.charset
488
+ return true if cs =~ /utf-?8/i
489
+ #f.each(nil) do |str| str.utf8? ? (return true) : (return false) end # read entire file into string
490
+ f.each_line do |line|
491
+ count_lines += 1
492
+ count_utf8 += 1 if line.utf8?
493
+ break unless count_lines == count_utf8
494
+ end
495
+ end
496
+ count_lines == count_utf8 ? (return true) : (return false)
497
+
498
+ end
499
+
500
+ else
501
+
502
+ return false unless File.file?(file)
503
+
504
+ if RUBY_PLATFORM =~ /darwin/i then str = %x{ /usr/bin/file -ik #{file} }; return true if str =~ /utf-?8/i end
505
+
506
+ # read entire file into a string
507
+ #File.open(file).read.each(nil) do |str| return true if str =~ /(charset|encoding) *= *["']? *utf-?8/i; str.utf8? ? (return true) : (return false) end
508
+
509
+ # check each line of the file
510
+ count_lines = 0
511
+ count_utf8 = 0
512
+ File.foreach(file) do |line|
513
+ return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i
514
+ count_lines += 1;
515
+ count_utf8 += 1 if line.utf8?;
516
+ break if count_lines != count_utf8
517
+ end
518
+
519
+ count_lines == count_utf8 ? (return true) : (return false)
520
+
521
+ end
522
+
523
+ str =~ /utf-?8/i ? true : false
524
+
525
+ end
526
+
527
+
528
+ # cf. Paul Battley, http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
529
+ def validate_utf8
530
+ Iconv.iconv('UTF-8//IGNORE', 'UTF-8', (self + ' ') ).first[0..-2]
531
+ end
532
+
533
+ # cf. Paul Battley, http://www.ruby-forum.com/topic/70357
534
+ def asciify_utf8
535
+ return nil unless self.utf8?
536
+ #Iconv.iconv('US-ASCII//IGNORE//TRANSLIT', 'UTF-8', (self + ' ') ).first[0..-2]
537
+ # delete all punctuation characters inside words except "-" in words such as up-to-date
538
+ Iconv.iconv('US-ASCII//IGNORE//TRANSLIT', 'UTF-8', (self + ' ') ).first[0..-2].gsub(/(?!-.*)\b[[:punct:]]+\b/, '')
539
+ end
540
+
541
+ def latin1_to_utf8 # ISO-8859-1 to UTF-8
542
+ ret = Iconv.iconv("UTF-8//IGNORE", "ISO-8859-1", (self + "\x20") ).first[0..-2]
543
+ ret.utf8? ? ret : nil
544
+ end
545
+
546
+ def cp1252_to_utf8 # CP1252 (WINDOWS-1252) to UTF-8
547
+ ret = Iconv.iconv("UTF-8//IGNORE", "CP1252", (self + "\x20") ).first[0..-2]
548
+ ret.utf8? ? ret : nil
549
+ end
550
+
551
+ # cf. Paul Battley, http://www.ruby-forum.com/topic/70357
552
+ def utf16le_to_utf8
553
+ ret = Iconv.iconv('UTF-8//IGNORE', 'UTF-16LE', (self[0,(self.length/2*2)] + "\000\000") ).first[0..-2]
554
+ ret =~ /\x00\z/ ? ret.sub!(/\x00\z/, '') : ret
555
+ ret.utf8? ? ret : nil
556
+ end
557
+
558
+ def utf8_to_utf16le
559
+ return nil unless self.utf8?
560
+ ret = Iconv.iconv('UTF-16LE//IGNORE', 'UTF-8', self ).first
561
+ end
562
+
563
+ def utf8_to_unicode
564
+ return nil unless self.utf8?
565
+ str = ""
566
+ scan(/./mu) { |c| str << "U+" << sprintf("%04X", c.unpack("U*").first) }
567
+ str
568
+ end
569
+
570
+ def unicode_to_utf8
571
+ return self if self =~ /\A[[:space:]]*\z/m
572
+ str = ""
573
+ #scan(/U\+([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})/) { |u| str << [u.first.hex].pack("U*") }
574
+ #scan(/U\+([[:digit:][:xdigit:]]{4,5}|10[[:digit:][:xdigit:]]{4})/) { |u| str << [u.first.hex].pack("U*") }
575
+ scan(/(U\+(?:[[:digit:][:xdigit:]]{4,5}|10[[:digit:][:xdigit:]]{4})|.)/mu) do # for mixed strings such as "U+00bfHabla espaU+00f1ol?"
576
+ c = $1
577
+ if c =~ /^U\+/
578
+ str << [c[2..-1].hex].pack("U*")
579
+ else
580
+ str << c
581
+ end
582
+ end
583
+ str.utf8? ? str : nil
584
+ end
585
+
586
+
587
+ # dec, hex, oct conversions (experimental!)
588
+
589
+ def utf8_to_dec
590
+ return nil unless self.utf8?
591
+ str = ""
592
+ scan(/./mu) do |c|
593
+ if c =~ /^\x00$/
594
+ str << "aaa\x00" # encode \x00 as "aaa"
595
+ else
596
+ str << sprintf("%04X", c.unpack("U*").first).hex.to_s << "\x00" # convert to decimal
597
+ end
598
+ end
599
+ str[0..-2]
600
+ end
601
+
602
+ def dec_to_utf8 # \x00 is encoded as "aaa"
603
+ return self if self.empty?
604
+ return nil unless self =~ /\A[[:digit:]]+\x00/ && self =~ /\A[a[:digit:]\x00]+\z/
605
+ str = ""
606
+ split(/\x00/).each do |c|
607
+ if c.eql?("aaa")
608
+ str << "\x00"
609
+ else
610
+ str << [c.to_i].pack("U*")
611
+ end
612
+ end
613
+ str
614
+ end
615
+
616
+
617
+ def utf8_to_dec_2
618
+ return nil unless self.utf8?
619
+ str = ""
620
+ tmpstr = ""
621
+ null_str = "\x00"
622
+ scan(/./mu) do |c|
623
+ if c =~ /^\x00$/
624
+ str << "aaa\x00\x00" # encode \x00 as "aaa"
625
+ else
626
+ tmpstr = ""
627
+ c.each_byte { |x| tmpstr << x.to_s << null_str } # convert to decimal
628
+ str << tmpstr << null_str
629
+ end
630
+ end
631
+ str[0..-3]
632
+ end
633
+
634
+ def dec_to_utf8_2 # \x00 is encoded as "aaa"
635
+ return self if self.empty?
636
+ return nil unless self =~ /\A[[:digit:]]+\x00/ && self =~ /[[:digit:]]+\x00\x00/ && self =~ /\A[a[:digit:]\x00]+\z/
637
+ str = ""
638
+ split(/\x00\x00/).each do |c|
639
+ if c =~ /\x00/
640
+ c.split(/\x00/).each { |x| str << x.to_i.chr }
641
+ elsif c.eql?("aaa")
642
+ str << "\x00"
643
+ else
644
+ str << c.to_i.chr
645
+ end
646
+ end
647
+ str
648
+ end
649
+
650
+
651
+ def utf8_to_hex
652
+ return nil unless self.utf8?
653
+ str = ""
654
+ tmpstr = ""
655
+ null_str = "\x00"
656
+ scan(/./mu) do |c|
657
+ if c =~ /^\x00$/
658
+ str << "aaa\x00\x00" # encode \x00 as "aaa"
659
+ else
660
+ tmpstr = ""
661
+ c.each_byte { |x| tmpstr << sprintf("%X", x) << null_str } # convert to hexadecimal
662
+ str << tmpstr << null_str
663
+ end
664
+ end
665
+ str[0..-3]
666
+ end
667
+
668
+ def hex_to_utf8 # \x00 is encoded as "aaa"
669
+ return self if self.empty?
670
+ return nil unless self =~ /\A[[:xdigit:]]+\x00/ && self =~ /[[:xdigit:]]+\x00\x00/ && self =~ /\A[a[:xdigit:]\x00]+\z/
671
+ str = ""
672
+ split(/\x00\x00/).each do |c|
673
+ if c =~ /\x00/
674
+ c.split(/\x00/).each { |x| str << x.hex.chr }
675
+ elsif c.eql?("aaa")
676
+ str << "\x00"
677
+ else
678
+ str << c.hex.chr
679
+ end
680
+ end
681
+ str
682
+ end
683
+
684
+
685
+ def utf8_to_oct
686
+ return nil unless self.utf8?
687
+ str = ""
688
+ tmpstr = ""
689
+ null_str = "\x00"
690
+ scan(/./mu) do |c|
691
+ if c =~ /^\x00$/
692
+ str << "aaa\x00\x00" # encode \x00 as "aaa"
693
+ else
694
+ tmpstr = ""
695
+ c.each_byte { |x| tmpstr << sprintf("%o", x) << null_str } # convert to octal
696
+ str << tmpstr << null_str
697
+ end
698
+ end
699
+ str[0..-3]
700
+ end
701
+
702
+ def oct_to_utf8 # \x00 is encoded as "aaa"
703
+ return self if self.empty?
704
+ return nil unless self =~ /\A[[:digit:]]+\x00/ && self =~ /[[:digit:]]+\x00\x00/ && self =~ /\A[a[:digit:]\x00]+\z/
705
+ str = ""
706
+ split(/\x00\x00/).each do |c|
707
+ if c =~ /\x00/
708
+ c.split(/\x00/).each { |x| str << x.oct.chr }
709
+ elsif c.eql?("aaa")
710
+ str << "\x00"
711
+ else
712
+ str << c.oct.chr
713
+ end
714
+ end
715
+ str
716
+ end
717
+
718
+ # cf. http://node-0.mneisen.org/2007/03/13/email-subjects-in-utf-8-mit-ruby-kodieren/
719
+ def email_subject_utf8
720
+ return nil unless self.utf8?
721
+ "=?utf-8?b?#{[self].pack("m").delete("\n")}?="
722
+ end
723
+
724
+ end
725
+
726
+
metadata ADDED
@@ -0,0 +1,50 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: prawn-arabic
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Dynamix Solutions
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-04-19 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Gem which improves the workflow with Arabic text
14
+ email: ahmed.nasser@dynamix-systems.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files:
18
+ - README.md
19
+ - LICENSE
20
+ files:
21
+ - LICENSE
22
+ - README.md
23
+ - Rakefile
24
+ - lib/prawn-arabic.rb
25
+ - lib/string_utf_support.rb
26
+ homepage: https://github.com/ozeron/arabic-prawn
27
+ licenses:
28
+ - MIT
29
+ metadata: {}
30
+ post_install_message:
31
+ rdoc_options: []
32
+ require_paths:
33
+ - lib
34
+ required_ruby_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ requirements: []
45
+ rubyforge_project:
46
+ rubygems_version: 2.5.1
47
+ signing_key:
48
+ specification_version: 4
49
+ summary: Arabic language string helpers
50
+ test_files: []