Arabic-Prawn 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of Arabic-Prawn might be problematic. Click here for more details.
- data/LICENSE +3 -0
- data/README +3 -0
- data/Rakefile +50 -0
- data/lib/arabic-prawn.rb +527 -0
- data/lib/string_utf_support.rb +726 -0
- metadata +67 -0
data/LICENSE
ADDED
data/README
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
#
# To change this template, choose Tools | Templates
# and open the template in the editor.

# Rake tasks for the Arabic-Prawn gem: gem packaging, RDoc generation,
# Test::Unit tests and RSpec specs.
require 'rubygems'
require 'rake'
require 'rake/clean'
require 'rake/gempackagetask'
require 'rake/rdoctask'
require 'rake/testtask'
require 'spec/rake/spectask'

# Gem metadata consumed by the packaging task below.
spec = Gem::Specification.new do |s|
  s.name = 'Arabic-Prawn'
  s.version = '0.0.1'
  s.has_rdoc = true
  s.extra_rdoc_files = ['README', 'LICENSE']
  s.summary = 'Your summary here'
  s.description = s.summary
  s.author = 'Dynamix Solutions'
  s.email = 'ahmed.nasser@dynamix-systems.com'
  # s.executables = ['your_executable_here']
  s.files = %w(LICENSE README Rakefile) + Dir.glob("{bin,lib,spec}/**/*")
  s.require_path = "lib"
  s.bindir = "bin"
end

# Builds the .gem plus tar and zip archives.
Rake::GemPackageTask.new(spec) do |p|
  p.gem_spec = spec
  p.need_tar = true
  p.need_zip = true
end

# Generates API documentation into doc/rdoc.
Rake::RDocTask.new do |rdoc|
  files = ['README', 'LICENSE', 'lib/**/*.rb']
  rdoc.rdoc_files.add(files)
  rdoc.main = "README" # page to start on
  rdoc.title = "Arabic-Prawn Docs"
  rdoc.rdoc_dir = 'doc/rdoc' # rdoc output folder
  rdoc.options << '--line-numbers'
end

# Runs every Ruby file under test/.
Rake::TestTask.new do |t|
  t.test_files = FileList['test/**/*.rb']
end

# Runs every Ruby file under spec/.
Spec::Rake::SpecTask.new do |t|
  t.spec_files = FileList['spec/**/*.rb']
end
|
data/lib/arabic-prawn.rb
ADDED
@@ -0,0 +1,527 @@
|
|
1
|
+
require 'string_utf_support'
|
2
|
+
|
3
|
+
# Identifiers for the four contextual forms a letter can be rendered in.
# The values are used as keys into ArabicCharacterInfo#format_encodings.
class CharacterFormat
  Isolated = 1 # joined to neither neighbour
  Initial = 2  # joined to the following letter only
  Medial = 3   # joined to both neighbours
  Final = 4    # joined to the preceding letter only
end
|
9
|
+
|
10
|
+
|
11
|
+
# Shaping data for a single Arabic letter: its common (logical) encoding,
# the encoding of each contextual presentation form, and whether the letter
# joins to the letter that follows it.
#
# All code points are given as "U+XXXX" labels and converted through
# String#unicode_to_utf8, which is not defined in this file.
# NOTE(review): presumably it turns the label into the UTF-8 string for
# that code point -- confirm against string_utf_support.rb.
class ArabicCharacterInfo

  # Memoized result of get_arabic_characters_map.
  @@arabic_characters_map = nil
  attr_accessor :common_encoding, :format_encodings, :is_connected

  # One row per supported letter, in constructor argument order:
  #   [common, isolated, final, initial, medial, is_connected]
  # Letters that do not join forward reuse their isolated form as "initial"
  # and their final form as "medial".
  LETTER_TABLE = [
    ["U+0627", "U+fe8d", "U+fe8e", "U+fe8d", "U+fe8e", false], # Alef
    ["U+0628", "U+fe8f", "U+fe90", "U+fe91", "U+fe92", true],  # Beh
    ["U+062a", "U+fe95", "U+fe96", "U+fe97", "U+fe98", true],  # Teh
    ["U+062b", "U+fe99", "U+fe9a", "U+fe9b", "U+fe9c", true],  # Theh
    ["U+062c", "U+fe9d", "U+fe9e", "U+fe9f", "U+fea0", true],  # Jeem
    ["U+062d", "U+fea1", "U+fea2", "U+fea3", "U+fea4", true],  # Hah ("7ah")
    ["U+062e", "U+fea5", "U+fea6", "U+fea7", "U+fea8", true],  # Khah ("7'ah")
    ["U+062f", "U+fea9", "U+feaa", "U+fea9", "U+feaa", false], # Dal
    ["U+0630", "U+feab", "U+feac", "U+feab", "U+feac", false], # Thal
    ["U+0631", "U+fead", "U+feae", "U+fead", "U+feae", false], # Rah
    ["U+0632", "U+feaf", "U+feb0", "U+feaf", "U+feb0", false], # Zein
    ["U+0633", "U+feb1", "U+feb2", "U+feb3", "U+feb4", true],  # Seen
    ["U+0634", "U+feb5", "U+feb6", "U+feb7", "U+feb8", true],  # Sheen
    ["U+0635", "U+feb9", "U+feba", "U+febb", "U+febc", true],  # Sad
    ["U+0636", "U+febd", "U+febe", "U+febf", "U+fec0", true],  # Dad
    ["U+0637", "U+fec1", "U+fec2", "U+fec3", "U+fec4", true],  # Tah
    ["U+0638", "U+fec5", "U+fec6", "U+fec7", "U+fec8", true],  # Thah
    ["U+0639", "U+fec9", "U+feca", "U+fecb", "U+fecc", true],  # Ain ("3ein")
    ["U+063a", "U+fecd", "U+fece", "U+fecf", "U+fed0", true],  # Ghain ("3'ein")
    ["U+0641", "U+fed1", "U+fed2", "U+fed3", "U+fed4", true],  # Feh
    ["U+0642", "U+fed5", "U+fed6", "U+fed7", "U+fed8", true],  # Qaf
    ["U+0643", "U+fed9", "U+feda", "U+fedb", "U+fedc", true],  # Kaf
    ["U+0644", "U+fedd", "U+fede", "U+fedf", "U+fee0", true],  # Lam
    ["U+0645", "U+fee1", "U+fee2", "U+fee3", "U+fee4", true],  # Meem
    ["U+0646", "U+fee5", "U+fee6", "U+fee7", "U+fee8", true],  # Noon
    ["U+0647", "U+fee9", "U+feea", "U+feeb", "U+feec", true],  # Heh
    ["U+0648", "U+feed", "U+feee", "U+feed", "U+feee", false], # Waw
    ["U+064a", "U+fef1", "U+fef2", "U+fef3", "U+fef4", true],  # Yeh
    ["U+0621", "U+fe80", "U+fe80", "U+fe80", "U+fe80", false], # Hamza
    ["U+0622", "U+fe81", "U+fe82", "U+fe81", "U+fe82", false], # Alef Madda
    ["U+0623", "U+fe83", "U+fe84", "U+fe83", "U+fe84", false], # Alef Hamza Above
    ["U+0624", "U+fe85", "U+fe86", "U+fe85", "U+fe86", false], # Waw Hamza
    ["U+0625", "U+fe87", "U+fe88", "U+fe87", "U+fe88", false], # Alef Hamza Below
    ["U+0626", "U+fe89", "U+fe8a", "U+fe8b", "U+fe8c", true],  # Yeh Hamza
    ["U+0629", "U+fe93", "U+fe94", "U+fe93", "U+fe94", false], # Teh Marbuta
    ["U+0640", "U+0640", "U+0640", "U+0640", "U+0640", true],  # Tatweel
    ["U+0649", "U+feef", "U+fef0", "U+feef", "U+fef0", false]  # Alef Layyena
  ].freeze

  # common       - "U+XXXX" label of the logical letter
  # isolated, final, initial, medial - "U+XXXX" labels of each form
  # is_connected - true when the letter joins to the letter after it
  def initialize(common, isolated, final, initial, medial, is_connected)
    @common_encoding = common.unicode_to_utf8
    @format_encodings = {
      CharacterFormat::Isolated => isolated.unicode_to_utf8,
      CharacterFormat::Initial => initial.unicode_to_utf8,
      CharacterFormat::Medial => medial.unicode_to_utf8,
      CharacterFormat::Final => final.unicode_to_utf8
    }
    @is_connected = is_connected
  end

  # Returns a Hash mapping each letter's common encoding to its
  # ArabicCharacterInfo. The hash is built on first use and reused afterwards.
  def ArabicCharacterInfo.get_arabic_characters_map
    return @@arabic_characters_map unless @@arabic_characters_map.nil?

    map = {}
    LETTER_TABLE.each do |row|
      info = ArabicCharacterInfo.new(*row)
      map[info.common_encoding] = info
    end

    # Assign only after the whole table was built successfully.
    @@arabic_characters_map = map
  end

end
|
417
|
+
|
418
|
+
|
419
|
+
|
420
|
+
# Arabic shaping helpers added to String. Letters are looked up in
# ArabicCharacterInfo.get_arabic_characters_map; anything not in that map
# is treated as a non-Arabic character.
class String

  # Chooses the CharacterFormat for a letter from its neighbours.
  #
  # before_c - the character preceding the letter ('' if none)
  # after_c  - the character following the letter ('' if none)
  #
  # The letter joins backwards only when the preceding character is a known
  # letter that connects forward. It joins forwards only when the following
  # character is a known letter that can itself join backwards. A letter
  # whose final and isolated forms are identical and that does not connect
  # forward (Hamza in the current table) never joins, so it must not pull
  # the letter before it into an initial/medial form.
  def determine_format(before_c, after_c)
    charmap = ArabicCharacterInfo.get_arabic_characters_map

    before_info = charmap[before_c]
    after_info = charmap[after_c]

    joins_previous = !before_info.nil? && before_info.is_connected
    joins_next = !after_info.nil? &&
      (after_info.is_connected ||
       after_info.format_encodings[CharacterFormat::Final] !=
         after_info.format_encodings[CharacterFormat::Isolated])

    if joins_next
      joins_previous ? CharacterFormat::Medial : CharacterFormat::Initial
    else
      joins_previous ? CharacterFormat::Final : CharacterFormat::Isolated
    end
  end

  # Returns the encoding of character c in the given format, or c itself
  # when c is not a known Arabic letter.
  def get_letter_in_format(format, c)
    info = ArabicCharacterInfo.get_arabic_characters_map[c]
    info.nil? ? c : info.format_encodings[format]
  end

  # Replaces every Arabic letter of this word with its contextual form.
  # If at least one character changed, the word is treated as Arabic and
  # returned with its characters reversed (visual order); otherwise it is
  # returned unchanged.
  def fix_word
    letters = scan(/./mu)
    shaped = []

    letters.each_with_index do |letter, i|
      before = i > 0 ? letters[i - 1] : ''
      after = letters[i + 1] || ''
      shaped << get_letter_in_format(determine_format(before, after), letter)
    end

    # No letter changed: nothing Arabic in this word, keep logical order.
    return shaped.join if shaped == letters

    shaped.reverse.join
  end

  # Shapes every whitespace-separated word and lays the words out for
  # left-to-right rendering of right-to-left text: Arabic words are placed
  # in reverse order, while each run of consecutive non-Arabic words keeps
  # its internal order and moves as one unit. Words are joined with single
  # spaces; no leading or trailing space is produced.
  def fix_arabic_glyphs
    segments = [] # visual order, left to right
    ltr_run = []  # pending non-Arabic words, assuming default is rtl

    split(" ").each do |word|
      shaped = word.fix_word
      if shaped == word
        # a non-arabic word (ltr) so we buffer to see if more ltr words follow
        ltr_run << shaped
      else
        unless ltr_run.empty?
          segments.unshift(ltr_run.join(" "))
          ltr_run = []
        end
        segments.unshift(shaped)
      end
    end

    segments.unshift(ltr_run.join(" ")) unless ltr_run.empty?
    segments.join(" ")
  end
end
|
@@ -0,0 +1,726 @@
|
|
1
|
+
class String
|
2
|
+
|
3
|
+
require 'iconv'
|
4
|
+
require 'open-uri' # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
|
5
|
+
|
6
|
+
# taken from: http://www.w3.org/International/questions/qa-forms-utf-8
|
7
|
+
UTF8REGEX = /\A(?:                        # ?: non-capturing group (grouping with no back references)
    [\x09\x0A\x0D\x20-\x7E]               # ASCII
  | [\xC2-\xDF][\x80-\xBF]                # non-overlong 2-byte
  | \xE0[\xA0-\xBF][\x80-\xBF]            # excluding overlongs
  | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}     # straight 3-byte
  | \xED[\x80-\x9F][\x80-\xBF]            # excluding surrogates
  | \xF0[\x90-\xBF][\x80-\xBF]{2}         # planes 1-3
  | [\xF1-\xF3][\x80-\xBF]{3}             # planes 4-15
  | \xF4[\x80-\x8F][\x80-\xBF]{2}         # plane 16
)*\z/mnx
|
17
|
+
|
18
|
+
|
19
|
+
# create UTF-8 character arrays (as class instance variables)
|
20
|
+
#
|
21
|
+
# mapping tables: - http://www.unicode.org/Public/UCA/latest/allkeys.txt
|
22
|
+
# - http://unicode.org/Public/UNIDATA/UnicodeData.txt
|
23
|
+
# - http://unicode.org/Public/UNIDATA/CaseFolding.txt
|
24
|
+
# - http://www.decodeunicode.org
|
25
|
+
# - ftp://ftp.mars.org/pub/ruby/Unicode.tar.bz2
|
26
|
+
# - http://camomile.sourceforge.net
|
27
|
+
# - Character Palette (Mac OS X)
|
28
|
+
|
29
|
+
|
30
|
+
# test data
|
31
|
+
# Converts a "U+XXXX" label into the UTF-8 string for that code point, or
# nil when the packed bytes do not validate against UTF8REGEX.
to_utf8 = lambda { |x| u = [x[2..-1].hex].pack("U*"); u =~ UTF8REGEX ? u : nil }

@small_letters_utf8 = ["U+00F1", "U+00F4", "U+00E6", "U+00F8", "U+00E0", "U+00E1", "U+00E2", "U+00E4", "U+00E5", "U+00E7", "U+00E8", "U+00E9", "U+00EA", "U+00EB", "U+0153"].map(&to_utf8)

@capital_letters_utf8 = ["U+00D1", "U+00D4", "U+00C6", "U+00D8", "U+00C0", "U+00C1", "U+00C2", "U+00C4", "U+00C5", "U+00C7", "U+00C8", "U+00C9", "U+00CA", "U+00CB", "U+0152"].map(&to_utf8)

@other_letters_utf8 = ["U+03A3", "U+0639", "U+0041", "U+F8D0", "U+F8FF", "U+4E2D", "U+F4EE", "U+00FE", "U+10FFFF", "U+00A9", "U+20AC", "U+221E", "U+20AC", "U+FEFF", "U+FFFD", "U+00FF", "U+00FE", "U+FFFE", "U+FEFF"].map(&to_utf8)

# A nil entry means a code point failed validation above. Array#nitems no
# longer exists after Ruby 1.8, so non-nil entries are counted via compact.
if @small_letters_utf8.size != @small_letters_utf8.compact.size then raise "Invalid UTF-8 char in @small_letters_utf8!" end
if @capital_letters_utf8.size != @capital_letters_utf8.compact.size then raise "Invalid UTF-8 char in @capital_letters_utf8!" end
if @other_letters_utf8.size != @other_letters_utf8.compact.size then raise "Invalid UTF-8 char in @other_letters_utf8!" end


@unicode_array = []
#open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f| f.each(nil) { |line| line.scan(/^[^;]+/) { |u| @unicode_array << u } } end
#open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f|
#  f.each do |line| line =~ /LATIN|GREEK|CYRILLIC/ ? ( line.scan(/^[^;]+/) { |u| @unicode_array << u } ) : next end
#end

#@letters_utf8 = @unicode_array.map { |x| u = [x.hex].pack("U*"); u =~ UTF8REGEX ? u : nil }.compact # code points from UnicodeData.txt
@letters_utf8 = @small_letters_utf8 + @capital_letters_utf8 + @other_letters_utf8 # test data only

# Hash[*array_with_keys.zip(array_with_values).flatten]
# The small and capital arrays are index-aligned, so zipping them yields
# the per-letter case mappings.
@downcase_table_utf8 = Hash[*@capital_letters_utf8.zip(@small_letters_utf8).flatten]
@upcase_table_utf8 = Hash[*@small_letters_utf8.zip(@capital_letters_utf8).flatten]
@letters_utf8_hash = Hash[*@letters_utf8.zip([]).flatten] #=> ... "\341\272\242"=>nil ...
|
57
|
+
|
58
|
+
# Class-level readers/writers for the lookup tables held in the enclosing
# object's instance variables of the same names.
class << self
  attr_accessor :small_letters_utf8
  attr_accessor :capital_letters_utf8
  attr_accessor :other_letters_utf8
  attr_accessor :letters_utf8
  attr_accessor :letters_utf8_hash
  attr_accessor :unicode_array
  attr_accessor :downcase_table_utf8
  attr_accessor :upcase_table_utf8
end
|
68
|
+
|
69
|
+
|
70
|
+
# Yields each character of the string in order. The /m flag lets "." match
# newlines and /u makes the regexp operate on UTF-8 characters rather than
# bytes.
def each_utf8_char
  scan(/./mu) { |char| yield char }
end
|
73
|
+
|
74
|
+
# Yields each character together with its zero-based character index
# (not byte offset).
def each_utf8_char_with_index
  index = 0
  scan(/./mu) do |char|
    yield(char, index)
    index += 1
  end
end
|
78
|
+
|
79
|
+
# Returns the number of characters (not bytes) in the string. Counting
# inside the scan block avoids building an array of every character.
def length_utf8
  total = 0
  scan(/./mu) { total += 1 }
  total
end
alias :size_utf8 :length_utf8
|
86
|
+
|
87
|
+
# Returns a new string with the characters (not bytes) in reverse order.
def reverse_utf8
  split(//mu).reverse.join
end
|
90
|
+
|
91
|
+
# Reverses the characters (not bytes) of the string in place and returns
# the receiver. Previously this built and returned a new string without
# touching the receiver, so it behaved exactly like reverse_utf8.
def reverse_utf8!
  replace(split(//mu).reverse.join)
end
|
94
|
+
|
95
|
+
# Returns a copy with the case of every character swapped. The class-level
# downcase table is consulted first, then the upcase table; characters in
# neither table fall back to String#swapcase.
def swapcase_utf8
  gsub(/./mu) do |char|
    String.downcase_table_utf8[char] || String.upcase_table_utf8[char] || char.swapcase
  end
end
|
103
|
+
|
104
|
+
# In-place variant of swapcase_utf8. Returns the receiver, or nil when
# gsub! performed no substitution (i.e. the string has no characters).
def swapcase_utf8!
  gsub!(/./mu) do |char|
    String.downcase_table_utf8[char] || String.upcase_table_utf8[char] || char.swapcase
  end
end
|
111
|
+
|
112
|
+
# Returns a lower-cased copy. Characters present in the class-level
# downcase table are mapped through it; all others use String#downcase.
def downcase_utf8
  gsub(/./mu) { |char| String.downcase_table_utf8[char] || char.downcase }
end
|
118
|
+
|
119
|
+
# In-place variant of downcase_utf8. Returns the receiver, or nil when
# gsub! performed no substitution (empty string).
def downcase_utf8!
  gsub!(/./mu) { |char| String.downcase_table_utf8[char] || char.downcase }
end
|
125
|
+
|
126
|
+
# Returns an upper-cased copy. Characters present in the class-level
# upcase table are mapped through it; all others use String#upcase.
def upcase_utf8
  gsub(/./mu) { |char| String.upcase_table_utf8[char] || char.upcase }
end
|
132
|
+
|
133
|
+
# In-place variant of upcase_utf8. Returns the receiver, or nil when
# gsub! performed no substitution (empty string).
def upcase_utf8!
  gsub!(/./mu) { |char| String.upcase_table_utf8[char] || char.upcase }
end
|
139
|
+
|
140
|
+
# Counts the characters of the string that belong to the set c.
# Returns nil (not 0) when c is empty.
#
# c is interpolated unescaped into a regexp character class, so characters
# such as "-", "^", "]" and "\" keep their character-class meaning.
def count_utf8(c)
  return nil if c.empty?
  scan(%r{[#{c}]}mu).size
end
|
145
|
+
|
146
|
+
# Returns a copy with every character contained in the set c removed.
# Returns the receiver itself when c is empty.
#
# c is interpolated unescaped into a regexp character class.
def delete_utf8(c)
  return self if c.empty?
  gsub(%r{[#{c}]}mu, '')
end
|
151
|
+
|
152
|
+
# Removes, in place, every character contained in the set c.
# Returns the receiver when c is empty; otherwise the result of gsub!,
# which is nil when nothing was removed.
#
# c is interpolated unescaped into a regexp character class.
def delete_utf8!(c)
  return self if c.empty?
  gsub!(%r{[#{c}]}mu, '')
end
|
157
|
+
|
158
|
+
# Returns the first character of the string, or nil if it is empty.
def first_utf8
  self[/\A./mu]
end
|
161
|
+
|
162
|
+
# Returns the last character of the string, or nil if it is empty.
def last_utf8
  self[/.\z/mu]
end
|
165
|
+
|
166
|
+
# Returns a copy in which every space-separated word has its first
# character upper-cased and the remaining characters lower-cased, using
# upcase_utf8 / downcase_utf8 so the class-level case tables are honoured.
# A blank (empty or whitespace-only) receiver is returned as is.
#
# The previous implementation discarded the result of the non-destructive
# gsub and therefore returned the words unchanged.
def capitalize_utf8
  return self if self =~ /\A[[:space:]]*\z/m

  words = split(/\x20/).map do |word|
    position = 0
    word.gsub(/./mu) do |char|
      position += 1
      position == 1 ? char.upcase_utf8 : char.downcase_utf8
    end
  end
  words.join(' ')
end
|
184
|
+
|
185
|
+
# In-place variant of capitalize_utf8: every space-separated word gets its
# first character upper-cased and the rest lower-cased, and the receiver is
# replaced with the result. Returns the receiver.
#
# The previous implementation built the capitalized text but never wrote
# it back, so the receiver was left untouched.
def capitalize_utf8!
  return self if self =~ /\A[[:space:]]*\z/m

  words = split(/\x20/).map do |word|
    position = 0
    word.gsub(/./mu) do |char|
      position += 1
      position == 1 ? char.upcase_utf8 : char.downcase_utf8
    end
  end
  replace(words.join(' '))
end
|
203
|
+
|
204
|
+
|
205
|
+
# Returns the character index (not byte offset) of the first match of s,
# or nil when there is no match, the receiver is empty, s is empty, or s is
# neither a String nor a Regexp.
#
# A String argument is interpolated into a regexp unescaped, so regexp
# metacharacters in it are interpreted as such.
#
# The previous implementation returned nil for a match at index 0 because
# it could not tell an empty pre-match from "no match".
def index_utf8(s)
  return nil unless !self.empty? && (s.class == Regexp || s.class == String)
  #raise(ArgumentError, "Wrong argument for method index_utf8!", caller) unless !self.empty? && (s.class == Regexp || s.class == String)

  if s.class == Regexp
    # Rebuild the regexp with its original flags plus "u".
    opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
    opts += "u" if opts.count('u') == 0
    source = s.source
    return nil if source.empty?
    # NOTE(review): eval of the regexp source -- never pass a regexp built
    # from untrusted input.
    pattern = eval("%r{#{source}}" + opts)
  else
    return nil if s.empty?
    pattern = %r{#{s}}mu
  end

  found = pattern.match(self)
  # Characters to the left of the match give the character index.
  found ? found.pre_match.scan(/./mu).size : nil
end
|
258
|
+
|
259
|
+
|
260
|
+
# Returns the character index (not byte offset) of the last match of s as
# found by a left-to-right scan, or nil when there is no match, the receiver
# is empty, s is empty, or s is neither a String nor a Regexp.
#
# A String argument is interpolated into a regexp unescaped.
#
# The previous implementation returned nil when the last match was at
# index 0 because it could not tell an empty pre-match from "no match".
def rindex_utf8(s)
  return nil unless !self.empty? && (s.class == Regexp || s.class == String)
  #raise(ArgumentError, "Wrong argument for method index_utf8!", caller) unless !self.empty? && (s.class == Regexp || s.class == String)

  if s.class == Regexp
    # Rebuild the regexp with its original flags plus "u".
    opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
    opts += "u" if opts.count('u') == 0
    source = s.source
    return nil if source.empty?
    # NOTE(review): eval of the regexp source -- never pass a regexp built
    # from untrusted input.
    pattern = eval("%r{#{source}}" + opts)
  else
    return nil if s.empty?
    pattern = %r{#{s}}mu
  end

  last_prefix = nil
  scan(pattern) { last_prefix = $` } # $`: text to the left of the current match
  last_prefix ? last_prefix.scan(/./mu).size : nil
end
|
286
|
+
|
287
|
+
|
288
|
+
# Like String#slice(regex), but with the regexp rebuilt to carry the "u"
# flag in addition to its original flags.
#
# note that the i option does not work in special cases with back references
# example: for a non-ASCII lower/upper-case pair such as "äÄ",
# slice_utf8(/(.).*?\1/i) returns nil whereas "aA".slice(/(.).*?\1/i) returns "aA"
def slice_utf8(regex)
  opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
  opts += "u" if opts.count('u') == 0
  # NOTE(review): eval of the regexp source -- never pass a regexp built
  # from untrusted input.
  slice(eval("%r{#{regex.source}}" + opts))
end
|
298
|
+
|
299
|
+
# Like String#slice!(regex): removes the first match from the receiver and
# returns it (nil if nothing matched). The regexp is rebuilt to carry the
# "u" flag in addition to its original flags.
def slice_utf8!(regex)
  opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
  opts += "u" if opts.count('u') == 0
  # NOTE(review): eval of the regexp source -- never pass a regexp built
  # from untrusted input.
  slice!(eval("%r{#{regex.source}}" + opts))
end
|
307
|
+
|
308
|
+
# Returns the substring of l characters starting at character index p.
#
# p - start index; a negative value counts from the end, and a negative
#     value reaching before the start is clamped to 0
# l - number of characters to take
#
# Returns nil when l exceeds the string length or p lies past the last
# character. Raises ArgumentError unless both arguments are Integers
# (Fixnum no longer exists in current Ruby, so Integer is checked instead).
def cut_utf8(p,l) # (index) position, length
  raise(ArgumentError, "Error: argument is not an Integer", caller) unless p.is_a?(Integer) && l.is_a?(Integer)
  total = self.length_utf8
  #if p < 0 then p = s - p.abs end
  if p < 0 then p = p.abs > total ? 0 : total - p.abs end # or: ... p.abs > total ? (return nil) : ...
  return nil if l > total || p > (total - 1)

  piece = ""
  taken = 0
  each_utf8_char_with_index do |c, i|
    break if taken >= l
    if i >= p
      taken += 1
      piece << c
    end
  end
  piece
end
|
322
|
+
|
323
|
+
# True when the string begins with s, compared character-wise via
# cut_utf8. Returns nil when either the receiver or s is empty.
def starts_with_utf8?(s)
  return nil if self.empty? || s.empty?
  cut_utf8(0, s.size_utf8) == s
end
|
327
|
+
|
328
|
+
# True when the string ends with s, compared character-wise via cut_utf8
# with a negative start index. Returns nil when either the receiver or s is
# empty.
def ends_with_utf8?(s)
  return nil if self.empty? || s.empty?
  suffix_length = s.size_utf8
  cut_utf8(-suffix_length, suffix_length) == s
end
|
332
|
+
|
333
|
+
# Returns a new string with s inserted before character index i.
#
# i - character index; a negative value counts from the end (clamped to 0
#     when it reaches before the start). An index past the end pads the gap
#     with spaces.
# s - the string to insert; when empty the receiver is returned, and when
#     the receiver is empty s is returned.
#
# The receiver is not modified. The previous implementation appended the
# padding to the receiver itself with <<, permanently altering it.
def insert_utf8(i,s) # insert_utf8(index, string)
  return self if s.empty?
  total = self.length_utf8
  return s if total == 0
  if i < 0 then i = i.abs > total ? 0 : total - i.abs end # or: ... i.abs > total ? (return nil) : ...
  #return nil if i > (total - 1) # return nil ...
  padding = i > (total - 1) ? " " * (i - (total - 1)) : "" # ... or add spaces
  padded = self + padding
  head = padded.cut_utf8(0, i)
  tail = padded.cut_utf8(i, total - head.length_utf8)
  head << s << tail
end
|
346
|
+
|
347
|
+
# Like String#split(regex), but with the regexp rebuilt to carry the "u"
# flag in addition to its original flags.
def split_utf8(regex)
  opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
  opts += "u" if opts.count('u') == 0
  # NOTE(review): eval of the regexp source -- never pass a regexp built
  # from untrusted input.
  split(eval("%r{#{regex.source}}" + opts))
end
|
355
|
+
|
356
|
+
# Like String#scan(regex), but with the regexp rebuilt to carry the "u"
# flag in addition to its original flags. With a block, each match (and its
# capture groups, if any) is yielded; without one the matches are returned.
def scan_utf8(regex)
  opts = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
  opts += "u" if opts.count('u') == 0
  # NOTE(review): eval of the regexp source -- never pass a regexp built
  # from untrusted input.
  pattern = eval("%r{#{regex.source}}" + opts)

  return scan(pattern) unless block_given?
  scan(pattern) { |a, *m| yield(a, *m) }
end
|
364
|
+
|
365
|
+
# Returns the characters selected by the Integer range r, counted in
# characters rather than bytes. Negative endpoints count from the end of
# the string; both ".." and "..." ranges are supported.
#
# Returns nil when r is not a Range, when an endpoint is not an Integer,
# when an endpoint's magnitude exceeds the string length, or when the
# (normalized) start lies after the end.
#
# The endpoints are read through the Range API instead of being parsed
# back out of Range#to_s.
def range_utf8(r)

  return nil if r.class != Range
  #raise(ArgumentError, "No Range object given!", caller) if r.class != Range

  first = r.begin
  last = r.end
  return nil unless first.is_a?(Integer) && last.is_a?(Integer)

  chars = scan(/./mu)
  total = chars.size

  return nil if last.abs > total || first.abs > total

  first += total if first < 0
  last += total if last < 0

  return nil if first > last

  # Use a start/length slice so that an empty exclusive range (e.g. 0...0)
  # yields "" rather than wrapping around to index -1.
  wanted = r.exclude_end? ? last - first : last - first + 1
  chars[first, wanted].join

end
|
399
|
+
|
400
|
+
# Matches the string against UTF8REGEX.
#
# Returns the match position (an Integer, truthy even when 0) or nil when
# the pattern does not match.
def utf8?
  UTF8REGEX =~ self
end
|
403
|
+
|
404
|
+
# Builds a copy of the string that keeps only the characters which
# individually match UTF8REGEX; every other character is dropped.
#
# Returns a new String.
def clean_utf8
  scan(/./um).inject("") do |kept, char|
    kept << char if char =~ UTF8REGEX
    kept
  end
end
|
409
|
+
|
410
|
+
|
411
|
+
# Checks (or rather guesses) whether the (HTML) file named by this string is
# UTF-8 encoded. Experimental, so use at your own risk!
#
# The receiver is either an "http://" URL or the path of a local file.
#
# * URL on Mac OS X: curl probes the site, then a declared charset/encoding
#   attribute decides; without a declaration the whole body is validated.
# * URL elsewhere: open-uri's reported charset decides; otherwise every line
#   is validated.
# * Local path: false unless it is a regular file. On Mac OS X file(1) is
#   consulted first. Then the first line declaring a UTF-8 charset/encoding
#   wins; otherwise every line must be valid.
#
# Returns true or false. Raises RuntimeError when curl cannot reach the URL.
#
# Notes for maintainers:
# * Everything handed to the shell is escaped with Shellwords.
# * The curl result is read from the process exit status, not from the text
#   curl prints.
# * The page body is validated in memory; nothing is written to disk.
def utf8_encoded_file?
  require 'shellwords' # local require: only this method shells out

  target = self
  on_darwin = RUBY_PLATFORM =~ /darwin/i # Mac OS X ships curl, grep and file at fixed paths

  if target =~ /\Ahttp:\/\//
    if on_darwin
      seconds = 30
      curl = "/usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds + 10}"
      quoted_url = Shellwords.escape(target)

      # check if web site is reachable
      %x{ #{curl} -I #{quoted_url} }
      status = $?.exitstatus
      raise "Failed to create connection to web site: #{target} -- curl error code: #{status} -- " unless status == 0

      # Value(s) of the charset/encoding attribute on the first line that declares one.
      sniff = %q{/usr/bin/grep -Eo -m 1 "(charset|encoding)=[\"']?[^\"'>]+" | /usr/bin/grep -Eo "[^=\"'>]+$"}
      declared = %x{ #{curl} #{quoted_url} | #{sniff} }
      return true if declared =~ /utf-?8/i
      return false unless declared.empty?

      # No declaration found: validate the whole document at once.
      body = %x{ #{curl} #{quoted_url} }
      body.utf8? ? true : false
    else
      # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
      require 'open-uri'
      # Kernel#open stopped accepting URLs in Ruby 3.0; URI.open exists since 2.5.
      opener = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
      opener.call(target) do |page|
        return true if page.charset =~ /utf-?8/i
        page.each_line { |line| return false unless line.utf8? }
      end
      true
    end
  else
    return false unless File.file?(target)

    if on_darwin
      detected = %x{ /usr/bin/file -ik #{Shellwords.escape(target)} } # cf. man file
      return true if detected =~ /utf-?8/i
    end

    # check each line of the file
    File.foreach(target) do |line|
      return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i
      return false unless line.utf8?
    end
    true
  end
end
|
526
|
+
|
527
|
+
|
528
|
+
# cf. Paul Battley, http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
|
529
|
+
# Runs the string through a UTF-8 to UTF-8 iconv conversion with //IGNORE.
#
# A single space is appended before the conversion and removed again from
# the converted text.
#
# Returns the converted String.
def validate_utf8
  padded = self + ' '
  converted = Iconv.iconv('UTF-8//IGNORE', 'UTF-8', padded).first
  converted[0..-2]
end
|
532
|
+
|
533
|
+
# cf. Paul Battley, http://www.ruby-forum.com/topic/70357
|
534
|
+
# Transliterates the string to US-ASCII via iconv (//IGNORE//TRANSLIT).
#
# A single space is appended before the conversion and removed afterwards.
# Punctuation runs sitting between word boundaries are then deleted; the
# original author's note says "-" is meant to survive in words such as
# up-to-date.
#
# Returns the ASCII String, or nil when the receiver fails utf8?.
def asciify_utf8
  return nil unless utf8?
  padded = self + ' '
  ascii = Iconv.iconv('US-ASCII//IGNORE//TRANSLIT', 'UTF-8', padded).first[0..-2]
  ascii.gsub(/(?!-.*)\b[[:punct:]]+\b/, '')
end
|
540
|
+
|
541
|
+
# Converts the string from ISO-8859-1 (Latin-1) to UTF-8 via iconv.
#
# A single space is appended before the conversion and removed afterwards.
#
# Returns the converted String, or nil when the result fails utf8?.
def latin1_to_utf8 # ISO-8859-1 to UTF-8
  padded = self + "\x20"
  converted = Iconv.iconv("UTF-8//IGNORE", "ISO-8859-1", padded).first[0..-2]
  return nil unless converted.utf8?
  converted
end
|
545
|
+
|
546
|
+
# Converts the string from CP1252 (WINDOWS-1252) to UTF-8 via iconv.
#
# A single space is appended before the conversion and removed afterwards.
#
# Returns the converted String, or nil when the result fails utf8?.
def cp1252_to_utf8 # CP1252 (WINDOWS-1252) to UTF-8
  padded = self + "\x20"
  converted = Iconv.iconv("UTF-8//IGNORE", "CP1252", padded).first[0..-2]
  return nil unless converted.utf8?
  converted
end
|
550
|
+
|
551
|
+
# cf. Paul Battley, http://www.ruby-forum.com/topic/70357
|
552
|
+
# Converts the string from UTF-16LE to UTF-8 via iconv.
#
# Only an even number of leading bytes is converted (a dangling odd byte is
# cut off). A two-byte NUL is appended before the conversion; afterwards the
# last character and one remaining trailing NUL, if present, are removed.
#
# Returns the converted String, or nil when the result fails utf8?.
def utf16le_to_utf8
  even_length = length / 2 * 2
  padded = self[0, even_length] + "\000\000"
  converted = Iconv.iconv('UTF-8//IGNORE', 'UTF-16LE', padded).first[0..-2]
  converted.sub!(/\x00\z/, '') # no-op when there is no trailing NUL
  return nil unless converted.utf8?
  converted
end
|
557
|
+
|
558
|
+
# Converts the string from UTF-8 to UTF-16LE via iconv (//IGNORE).
#
# Returns the converted String, or nil when the receiver fails utf8?.
def utf8_to_utf16le
  return nil unless utf8?
  Iconv.iconv('UTF-16LE//IGNORE', 'UTF-8', self).first
end
|
562
|
+
|
563
|
+
# Renders every character as "U+" followed by its code point in upper-case
# hexadecimal, zero-padded to at least four digits ("a" becomes "U+0061").
#
# Returns the concatenated String, or nil when the receiver fails utf8?.
def utf8_to_unicode
  return nil unless utf8?
  scan(/./mu).inject("") do |out, char|
    code_point = char.unpack("U*").first
    out << "U+" << format("%04X", code_point)
  end
end
|
569
|
+
|
570
|
+
# Replaces every "U+XXXX" escape (four or five hex digits, or "10" plus four
# hex digits) with the character it names; all other characters are copied
# through, so mixed input such as "U+00bfHabla espaU+00f1ol?" works.
#
# Returns the receiver itself when it is empty or whitespace only, the
# decoded String otherwise, or nil when the decoded text fails utf8?.
def unicode_to_utf8
  return self if self =~ /\A[[:space:]]*\z/m

  token_pattern = /(U\+(?:[[:digit:][:xdigit:]]{4,5}|10[[:digit:][:xdigit:]]{4})|.)/mu
  decoded = scan(token_pattern).inject("") do |out, groups|
    token = groups.first
    # A token is either a whole escape or exactly one ordinary character.
    out << (token[0, 2] == "U+" ? [token[2..-1].hex].pack("U*") : token)
  end

  decoded.utf8? ? decoded : nil
end
|
585
|
+
|
586
|
+
|
587
|
+
# dec, hex, oct conversions (experimental!)
|
588
|
+
|
589
|
+
# Encodes the string as decimal code points separated by NUL bytes.
# A NUL character in the input is written as the marker "aaa".
#
# Returns the encoded String (no trailing separator), or nil when the
# receiver fails utf8?.
def utf8_to_dec
  return nil unless utf8?
  encoded = scan(/./mu).inject("") do |out, char|
    token = char == "\x00" ? "aaa" : char.unpack("U*").first.to_s
    out << token << "\x00"
  end
  encoded[0..-2] # drop the separator written after the last token
end
|
601
|
+
|
602
|
+
# Decodes a string of NUL-separated decimal code points (the format written
# by utf8_to_dec) back into text. The marker "aaa" stands for a NUL
# character.
#
# Returns the receiver itself when it is empty, the decoded String otherwise,
# or nil when the input is not a well-formed token list.
#
# Validation is token based: every token must be "aaa" or all digits. This
# accepts everything the encoder can emit - including a single token and
# input that starts with "aaa", both of which used to be rejected - and
# turns down malformed tokens such as "9a" that used to decode to garbage.
# Trailing separators are tolerated.
def dec_to_utf8 # \x00 is encoded as "aaa"
  return self if empty?
  return nil unless self =~ /\A(?:aaa|[[:digit:]]+)(?:\x00(?:aaa|[[:digit:]]+))*\x00*\z/

  decoded = ""
  split(/\x00/).each do |token|
    decoded << (token == "aaa" ? "\x00" : [token.to_i].pack("U*"))
  end
  decoded
end
|
615
|
+
|
616
|
+
|
617
|
+
# Encodes the string byte-wise in decimal. The bytes of one character are
# separated by a single NUL, characters by a double NUL. A NUL character in
# the input is written as the marker "aaa".
#
# Returns the encoded String (no trailing separators), or nil when the
# receiver fails utf8?.
def utf8_to_dec_2
  return nil unless utf8?
  nul = "\x00"
  encoded = ""
  scan(/./mu).each do |char|
    if char == nul
      encoded << "aaa\x00\x00"
    else
      char.each_byte { |byte| encoded << byte.to_s << nul }
      encoded << nul # second NUL closes the character
    end
  end
  encoded[0..-3]
end
|
633
|
+
|
634
|
+
# Decodes the byte-wise decimal format written by utf8_to_dec_2: bytes of one
# character are separated by a single NUL, characters by a double NUL, and
# the marker "aaa" stands for a NUL character.
#
# Returns the receiver itself when it is empty, the decoded String otherwise,
# or nil when the input is not well formed. Raises RangeError (from
# Integer#chr) when a byte value exceeds 255.
#
# Validation is structural, so everything the encoder can emit is accepted -
# including a lone character and input starting with "aaa", both of which
# used to be rejected. Trailing separators are tolerated. On Rubies with
# string encodings the collected bytes are tagged as UTF-8.
def dec_to_utf8_2 # \x00 is encoded as "aaa"
  return self if empty?

  char_token = /(?:aaa|[[:digit:]]+(?:\x00[[:digit:]]+)*)/
  return nil unless self =~ /\A#{char_token}(?:\x00\x00#{char_token})*\x00*\z/

  bytes = String.new
  split(/\x00\x00/).each do |token|
    if token == "aaa"
      bytes << "\x00"
    else
      token.split(/\x00/).each { |byte| bytes << byte.to_i.chr }
    end
  end
  bytes.force_encoding('UTF-8') if bytes.respond_to?(:force_encoding)
  bytes
end
|
649
|
+
|
650
|
+
|
651
|
+
# Encodes the string byte-wise in upper-case hexadecimal. The bytes of one
# character are separated by a single NUL, characters by a double NUL. A NUL
# character in the input is written as the marker "aaa".
#
# Returns the encoded String (no trailing separators), or nil when the
# receiver fails utf8?.
def utf8_to_hex
  return nil unless utf8?
  nul = "\x00"
  encoded = ""
  scan(/./mu).each do |char|
    if char == nul
      encoded << "aaa\x00\x00"
    else
      char.each_byte { |byte| encoded << ("%X" % byte) << nul }
      encoded << nul # second NUL closes the character
    end
  end
  encoded[0..-3]
end
|
667
|
+
|
668
|
+
# Decodes the byte-wise hexadecimal format written by utf8_to_hex: bytes of
# one character are separated by a single NUL, characters by a double NUL,
# and the marker "aaa" stands for a NUL character.
#
# Returns the receiver itself when it is empty, the decoded String otherwise,
# or nil when the input is not well formed. Raises RangeError (from
# Integer#chr) when a byte value exceeds 255.
#
# Validation is structural, so everything the encoder can emit is accepted -
# including a lone character and input starting with "aaa", both of which
# used to be rejected. Trailing separators are tolerated. On Rubies with
# string encodings the collected bytes are tagged as UTF-8.
def hex_to_utf8 # \x00 is encoded as "aaa"
  return self if empty?

  char_token = /(?:aaa|[[:xdigit:]]+(?:\x00[[:xdigit:]]+)*)/
  return nil unless self =~ /\A#{char_token}(?:\x00\x00#{char_token})*\x00*\z/

  bytes = String.new
  split(/\x00\x00/).each do |token|
    # A character token that is exactly "aaa" is the NUL marker, not hex.
    if token == "aaa"
      bytes << "\x00"
    else
      token.split(/\x00/).each { |byte| bytes << byte.hex.chr }
    end
  end
  bytes.force_encoding('UTF-8') if bytes.respond_to?(:force_encoding)
  bytes
end
|
683
|
+
|
684
|
+
|
685
|
+
# Encodes the string byte-wise in octal. The bytes of one character are
# separated by a single NUL, characters by a double NUL. A NUL character in
# the input is written as the marker "aaa".
#
# Returns the encoded String (no trailing separators), or nil when the
# receiver fails utf8?.
def utf8_to_oct
  return nil unless utf8?
  nul = "\x00"
  encoded = ""
  scan(/./mu).each do |char|
    if char == nul
      encoded << "aaa\x00\x00"
    else
      char.each_byte { |byte| encoded << ("%o" % byte) << nul }
      encoded << nul # second NUL closes the character
    end
  end
  encoded[0..-3]
end
|
701
|
+
|
702
|
+
# Decodes the byte-wise octal format written by utf8_to_oct: bytes of one
# character are separated by a single NUL, characters by a double NUL, and
# the marker "aaa" stands for a NUL character.
#
# Returns the receiver itself when it is empty, the decoded String otherwise,
# or nil when the input is not well formed. Raises RangeError (from
# Integer#chr) when a byte value exceeds 255.
#
# Validation is structural and only admits the octal digits 0-7, so
# everything the encoder can emit is accepted - including a lone character
# and input starting with "aaa", both of which used to be rejected - while
# tokens containing 8 or 9 no longer decode to garbage. Trailing separators
# are tolerated. On Rubies with string encodings the collected bytes are
# tagged as UTF-8.
def oct_to_utf8 # \x00 is encoded as "aaa"
  return self if empty?

  char_token = /(?:aaa|[0-7]+(?:\x00[0-7]+)*)/
  return nil unless self =~ /\A#{char_token}(?:\x00\x00#{char_token})*\x00*\z/

  bytes = String.new
  split(/\x00\x00/).each do |token|
    if token == "aaa"
      bytes << "\x00"
    else
      token.split(/\x00/).each { |byte| bytes << byte.oct.chr }
    end
  end
  bytes.force_encoding('UTF-8') if bytes.respond_to?(:force_encoding)
  bytes
end
|
717
|
+
|
718
|
+
# cf. http://node-0.mneisen.org/2007/03/13/email-subjects-in-utf-8-mit-ruby-kodieren/
|
719
|
+
# Wraps the string as a Base64 "encoded-word" for use in an e-mail subject:
# "=?utf-8?b?<base64>?=", with the line breaks pack("m") inserts removed.
#
# Returns the encoded String, or nil when the receiver fails utf8?.
def email_subject_utf8
  return nil unless utf8?
  base64 = [self].pack("m").delete("\n")
  "=?utf-8?b?#{base64}?="
end
|
723
|
+
|
724
|
+
end
|
725
|
+
|
726
|
+
|
metadata
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: Arabic-Prawn
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: 0.0.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Dynamix Solutions
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-02-28 00:00:00 +02:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: Allows printing arabic to PDFs generated by prawn
|
22
|
+
email: ahmed.nasser@dynamix-systems.com
|
23
|
+
executables: []
|
24
|
+
|
25
|
+
extensions: []
|
26
|
+
|
27
|
+
extra_rdoc_files:
|
28
|
+
- README
|
29
|
+
- LICENSE
|
30
|
+
files:
|
31
|
+
- LICENSE
|
32
|
+
- README
|
33
|
+
- Rakefile
|
34
|
+
- lib/arabic-prawn.rb
|
35
|
+
- lib/string_utf_support.rb
|
36
|
+
has_rdoc: true
|
37
|
+
homepage:
|
38
|
+
licenses: []
|
39
|
+
|
40
|
+
post_install_message:
|
41
|
+
rdoc_options: []
|
42
|
+
|
43
|
+
require_paths:
|
44
|
+
- lib
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
segments:
|
50
|
+
- 0
|
51
|
+
version: "0"
|
52
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
segments:
|
57
|
+
- 0
|
58
|
+
version: "0"
|
59
|
+
requirements: []
|
60
|
+
|
61
|
+
rubyforge_project:
|
62
|
+
rubygems_version: 1.3.6
|
63
|
+
signing_key:
|
64
|
+
specification_version: 3
|
65
|
+
summary: Allows printing arabic to PDFs generated by prawn
|
66
|
+
test_files: []
|
67
|
+
|