Arabic-Prawn 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -0
- data/README +3 -0
- data/Rakefile +50 -0
- data/lib/arabic-prawn.rb +527 -0
- data/lib/string_utf_support.rb +726 -0
- metadata +67 -0
data/LICENSE
ADDED
data/README
ADDED
data/Rakefile
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
#
|
|
2
|
+
# To change this template, choose Tools | Templates
|
|
3
|
+
# and open the template in the editor.
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
require 'rubygems'
|
|
7
|
+
require 'rake'
|
|
8
|
+
require 'rake/clean'
|
|
9
|
+
require 'rake/gempackagetask'
|
|
10
|
+
require 'rake/rdoctask'
|
|
11
|
+
require 'rake/testtask'
|
|
12
|
+
require 'spec/rake/spectask'
|
|
13
|
+
|
|
14
|
+
# Gem specification for Arabic-Prawn: package metadata plus the list of
# files that are shipped inside the built gem.
spec = Gem::Specification.new do |s|
  s.name = 'Arabic-Prawn'
  s.version = '0.0.1'
  # has_rdoc= is deprecated and may be missing in newer RubyGems releases;
  # only set it where the setter still exists.
  s.has_rdoc = true if s.respond_to?(:has_rdoc=)
  s.extra_rdoc_files = ['README', 'LICENSE']
  # Replaces the generator placeholder ('Your summary here') that was being
  # published as the gem's summary and description.
  s.summary = 'Arabic glyph shaping and right-to-left word ordering helpers for Prawn PDF text'
  s.description = s.summary
  s.author = 'Dynamix Solutions'
  s.email = 'ahmed.nasser@dynamix-systems.com'
  # s.executables = ['your_executable_here']
  s.files = %w(LICENSE README Rakefile) + Dir.glob("{bin,lib,spec}/**/*")
  s.require_path = "lib"
  s.bindir = "bin"
end
|
|
28
|
+
|
|
29
|
+
# Packaging task: builds the gem described by +spec+ and additionally
# requests tar and zip archives.
Rake::GemPackageTask.new(spec) do |package|
  package.gem_spec = spec
  package.need_tar = true
  package.need_zip = true
end

# Documentation task: RDoc over README, LICENSE and everything under lib/.
Rake::RDocTask.new do |doc|
  doc.rdoc_files.add(['README', 'LICENSE', 'lib/**/*.rb'])
  doc.main = "README"              # page to start on
  doc.title = "Arabic-Prawn Docs"
  doc.rdoc_dir = 'doc/rdoc'        # rdoc output folder
  doc.options << '--line-numbers'
end

# Test::Unit style tests live under test/.
Rake::TestTask.new do |tests|
  tests.test_files = FileList['test/**/*.rb']
end

# RSpec examples live under spec/.
Spec::Rake::SpecTask.new do |specs|
  specs.spec_files = FileList['spec/**/*.rb']
end
|
data/lib/arabic-prawn.rb
ADDED
|
@@ -0,0 +1,527 @@
|
|
|
1
|
+
require 'string_utf_support'
|
|
2
|
+
|
|
3
|
+
# Identifiers for the four positional forms a letter can be rendered in.
# The values are used as keys of ArabicCharacterInfo#format_encodings.
class CharacterFormat
  Isolated, Initial, Medial, Final = 1, 2, 3, 4
end
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Describes one letter: its common (logical) encoding, the encoding of each
# positional presentation form, and the is_connected flag that
# String#determine_format consults for the preceding letter.
class ArabicCharacterInfo

  # Lazily built lookup table, shared by all callers (see get_arabic_characters_map).
  @@arabic_characters_map = nil
  attr_accessor :common_encoding , :format_encodings, :is_connected

  # All code point arguments are "U+xxxx" strings; each one is converted with
  # String#unicode_to_utf8 before being stored.
  #
  # common       - code point of the letter as it appears in the input text
  # isolated     - presentation form used for CharacterFormat::Isolated
  # final        - presentation form used for CharacterFormat::Final
  # initial      - presentation form used for CharacterFormat::Initial
  # medial       - presentation form used for CharacterFormat::Medial
  # is_connected - stored as-is in the is_connected attribute
  def initialize(common, isolated, final, initial, medial, is_connected)
    @common_encoding = common.unicode_to_utf8
    @format_encodings = {
      CharacterFormat::Isolated => isolated.unicode_to_utf8,
      CharacterFormat::Initial  => initial.unicode_to_utf8,
      CharacterFormat::Medial   => medial.unicode_to_utf8,
      CharacterFormat::Final    => final.unicode_to_utf8
    }
    @is_connected = is_connected
  end

  # Returns a Hash mapping each letter's common encoding to its
  # ArabicCharacterInfo. The table is built on the first call and the same
  # Hash object is returned on every later call.
  def self.get_arabic_characters_map
    return @@arabic_characters_map unless @@arabic_characters_map.nil?

    # Columns: common, isolated, final, initial, medial, is_connected
    letter_forms = [
      ["U+0627", "U+fe8d", "U+fe8e", "U+fe8d", "U+fe8e", false], # Alef
      ["U+0628", "U+fe8f", "U+fe90", "U+fe91", "U+fe92", true],  # Beh
      ["U+062a", "U+fe95", "U+fe96", "U+fe97", "U+fe98", true],  # Teh
      ["U+062b", "U+fe99", "U+fe9a", "U+fe9b", "U+fe9c", true],  # Theh
      ["U+062c", "U+fe9d", "U+fe9e", "U+fe9f", "U+fea0", true],  # Jeem
      ["U+062d", "U+fea1", "U+fea2", "U+fea3", "U+fea4", true],  # 7ah
      ["U+062e", "U+fea5", "U+fea6", "U+fea7", "U+fea8", true],  # 7'ah
      ["U+062f", "U+fea9", "U+feaa", "U+fea9", "U+feaa", false], # Dal
      ["U+0630", "U+feab", "U+feac", "U+feab", "U+feac", false], # Thal
      ["U+0631", "U+fead", "U+feae", "U+fead", "U+feae", false], # Rah
      ["U+0632", "U+feaf", "U+feb0", "U+feaf", "U+feb0", false], # Zein
      ["U+0633", "U+feb1", "U+feb2", "U+feb3", "U+feb4", true],  # Seen
      ["U+0634", "U+feb5", "U+feb6", "U+feb7", "U+feb8", true],  # Sheen
      ["U+0635", "U+feb9", "U+feba", "U+febb", "U+febc", true],  # Sad
      ["U+0636", "U+febd", "U+febe", "U+febf", "U+fec0", true],  # Dad
      ["U+0637", "U+fec1", "U+fec2", "U+fec3", "U+fec4", true],  # Tah
      ["U+0638", "U+fec5", "U+fec6", "U+fec7", "U+fec8", true],  # Thah
      ["U+0639", "U+fec9", "U+feca", "U+fecb", "U+fecc", true],  # 3ein
      ["U+063a", "U+fecd", "U+fece", "U+fecf", "U+fed0", true],  # 3'ein
      ["U+0641", "U+fed1", "U+fed2", "U+fed3", "U+fed4", true],  # Feh
      ["U+0642", "U+fed5", "U+fed6", "U+fed7", "U+fed8", true],  # Qaf
      ["U+0643", "U+fed9", "U+feda", "U+fedb", "U+fedc", true],  # Kaf
      ["U+0644", "U+fedd", "U+fede", "U+fedf", "U+fee0", true],  # Lam
      ["U+0645", "U+fee1", "U+fee2", "U+fee3", "U+fee4", true],  # Meem
      ["U+0646", "U+fee5", "U+fee6", "U+fee7", "U+fee8", true],  # Noon
      ["U+0647", "U+fee9", "U+feea", "U+feeb", "U+feec", true],  # Heh
      ["U+0648", "U+feed", "U+feee", "U+feed", "U+feee", false], # Waw
      ["U+064a", "U+fef1", "U+fef2", "U+fef3", "U+fef4", true],  # Yeh
      ["U+0621", "U+fe80", "U+fe80", "U+fe80", "U+fe80", false], # Hamza
      ["U+0622", "U+fe81", "U+fe82", "U+fe81", "U+fe82", false], # Alef Madda
      ["U+0623", "U+fe83", "U+fe84", "U+fe83", "U+fe84", false], # Alef Hamza Above
      ["U+0624", "U+fe85", "U+fe86", "U+fe85", "U+fe86", false], # Waw Hamza
      ["U+0625", "U+fe87", "U+fe88", "U+fe87", "U+fe88", false], # Alef Hamza Below
      ["U+0626", "U+fe89", "U+fe8a", "U+fe8b", "U+fe8c", true],  # Yeh Hamza
      ["U+0629", "U+fe93", "U+fe94", "U+fe93", "U+fe94", false], # Teh Marbuta
      ["U+0640", "U+0640", "U+0640", "U+0640", "U+0640", true],  # Tatweel
      ["U+0649", "U+feef", "U+fef0", "U+feef", "U+fef0", false]  # Alef Layyena
    ]

    map = Hash.new
    letter_forms.each do |common, isolated, final, initial, medial, connected|
      info = ArabicCharacterInfo.new(common, isolated, final, initial, medial, connected)
      map[info.common_encoding] = info
    end

    # Cache only after the whole table has been built successfully.
    @@arabic_characters_map = map
  end

end
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
# Arabic shaping helpers added to String. They rely on
# ArabicCharacterInfo.get_arabic_characters_map for the letter table and on
# String#each_utf8_char / String#reverse_utf8! for character handling.
class String

  # Chooses the positional form for a letter from its neighbours.
  #
  # before_c - the character preceding the letter ('' when there is none)
  # after_c  - the character following the letter ('' when there is none)
  #
  # The letter joins its predecessor only when the predecessor is in the
  # letter table and has is_connected set; it joins its successor whenever
  # the successor is in the letter table.
  # Returns one of the CharacterFormat constants.
  def determine_format(before_c, after_c)
    charmap = ArabicCharacterInfo.get_arabic_characters_map

    joins_previous = charmap.key?(before_c) && charmap[before_c].is_connected
    joins_next = charmap.key?(after_c)

    if joins_previous
      joins_next ? CharacterFormat::Medial : CharacterFormat::Final
    else
      joins_next ? CharacterFormat::Initial : CharacterFormat::Isolated
    end
  end

  # Returns the encoding of character +c+ in the given +format+, or +c+
  # itself when it is not in the letter table.
  def get_letter_in_format(format, c)
    character = ArabicCharacterInfo.get_arabic_characters_map[c]
    return c if character.nil?
    character.format_encodings[format]
  end

  # Shapes a single word: every character is replaced by the form matching
  # its neighbours. When at least one character was replaced, the word is
  # treated as Arabic and returned with its characters reversed; otherwise
  # an unchanged copy is returned.
  def fix_word
    letters = []
    each_utf8_char { |c| letters << c }

    is_arabic = false
    connected_arabic = ""

    letters.each_with_index do |letter, position|
      before = position == 0 ? '' : letters[position - 1]
      after = letters[position + 1] || ''

      fixed_character = get_letter_in_format(determine_format(before, after), letter)
      is_arabic = true if fixed_character != letter
      connected_arabic << fixed_character
    end

    is_arabic ? connected_arabic.reverse_utf8! : connected_arabic
  end

  # Shapes every space-separated word and reverses the word order (the text
  # is assumed to be right-to-left). Consecutive words that fix_word leaves
  # unchanged are kept together in their original left-to-right order.
  #
  # Words are joined with exactly one space. Earlier versions also emitted a
  # trailing space, a leading space for left-to-right runs and a doubled
  # space between a run and the neighbouring word.
  def fix_arabic_glyphs
    segments = []   # output order; each new segment goes to the front
    ltr_run = []    # pending words that were not changed by fix_word

    split(" ").each do |word|
      fixed_word = word.fix_word
      if fixed_word == word
        # a non-arabic word (ltr): buffer it to see if more ltr words follow
        ltr_run << fixed_word
      else
        unless ltr_run.empty?
          segments.unshift(ltr_run.join(" "))
          ltr_run = []
        end
        segments.unshift(fixed_word)
      end
    end

    segments.unshift(ltr_run.join(" ")) unless ltr_run.empty?
    segments.join(" ")
  end
end
|
|
@@ -0,0 +1,726 @@
|
|
|
1
|
+
class String
|
|
2
|
+
|
|
3
|
+
require 'iconv'
|
|
4
|
+
require 'open-uri' # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
|
|
5
|
+
|
|
6
|
+
# taken from: http://www.w3.org/International/questions/qa-forms-utf-8
UTF8REGEX = /\A(?: # ?: non-capturing group (grouping with no back references)
[\x09\x0A\x0D\x20-\x7E] # ASCII
| [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
)*\z/mnx


# create UTF-8 character arrays (as class instance variables)
#
# mapping tables: - http://www.unicode.org/Public/UCA/latest/allkeys.txt
#                 - http://unicode.org/Public/UNIDATA/UnicodeData.txt
#                 - http://unicode.org/Public/UNIDATA/CaseFolding.txt
#                 - http://www.decodeunicode.org
#                 - ftp://ftp.mars.org/pub/ruby/Unicode.tar.bz2
#                 - http://camomile.sourceforge.net
#                 - Character Palette (Mac OS X)

# Converts a "U+XXXX" label into its UTF-8 string, or nil when the encoded
# bytes do not match UTF8REGEX. UTF8REGEX is a byte-level (/n) pattern, so on
# Rubies with string encodings the check runs against a binary-tagged copy.
utf8_from_code_point = lambda do |label|
  utf8 = [label[2..-1].hex].pack("U*")
  raw = utf8.respond_to?(:force_encoding) ? utf8.dup.force_encoding("BINARY") : utf8
  raw =~ UTF8REGEX ? utf8 : nil
end

# test data
@small_letters_utf8 = ["U+00F1", "U+00F4", "U+00E6", "U+00F8", "U+00E0", "U+00E1", "U+00E2", "U+00E4", "U+00E5", "U+00E7", "U+00E8", "U+00E9", "U+00EA", "U+00EB", "U+0153"].map(&utf8_from_code_point)

@capital_letters_utf8 = ["U+00D1", "U+00D4", "U+00C6", "U+00D8", "U+00C0", "U+00C1", "U+00C2", "U+00C4", "U+00C5", "U+00C7", "U+00C8", "U+00C9", "U+00CA", "U+00CB", "U+0152"].map(&utf8_from_code_point)

@other_letters_utf8 = ["U+03A3", "U+0639", "U+0041", "U+F8D0", "U+F8FF", "U+4E2D", "U+F4EE", "U+00FE", "U+10FFFF", "U+00A9", "U+20AC", "U+221E", "U+20AC", "U+FEFF", "U+FFFD", "U+00FF", "U+00FE", "U+FFFE", "U+FEFF"].map(&utf8_from_code_point)

# A nil entry means a code point failed validation above.
# (compact.size replaces Array#nitems, which only exists on Ruby 1.8.)
if @small_letters_utf8.size != @small_letters_utf8.compact.size then raise "Invalid UTF-8 char in @small_letters_utf8!" end
if @capital_letters_utf8.size != @capital_letters_utf8.compact.size then raise "Invalid UTF-8 char in @capital_letters_utf8!" end
if @other_letters_utf8.size != @other_letters_utf8.compact.size then raise "Invalid UTF-8 char in @other_letters_utf8!" end


@unicode_array = []
#open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f| f.each(nil) { |line| line.scan(/^[^;]+/) { |u| @unicode_array << u } } end
#open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f|
#  f.each do |line| line =~ /LATIN|GREEK|CYRILLIC/ ? ( line.scan(/^[^;]+/) { |u| @unicode_array << u } ) : next end
#end

#@letters_utf8 = @unicode_array.map { |x| u = [x.hex].pack("U*"); u =~ UTF8REGEX ? u : nil }.compact # code points from UnicodeData.txt
@letters_utf8 = @small_letters_utf8 + @capital_letters_utf8 + @other_letters_utf8 # test data only

# Hash[*array_with_keys.zip(array_with_values).flatten]
# The small and capital arrays are paired by position to build the case tables.
@downcase_table_utf8 = Hash[*@capital_letters_utf8.zip(@small_letters_utf8).flatten]
@upcase_table_utf8 = Hash[*@small_letters_utf8.zip(@capital_letters_utf8).flatten]
@letters_utf8_hash = Hash[*@letters_utf8.zip([]).flatten] #=> ... "\341\272\242"=>nil ...
|
|
57
|
+
|
|
58
|
+
# Read/write accessors on the singleton for the instance variables holding
# the letter arrays and case-mapping tables.
class << self
  attr_accessor :small_letters_utf8, :capital_letters_utf8, :other_letters_utf8,
                :letters_utf8, :letters_utf8_hash, :unicode_array,
                :downcase_table_utf8, :upcase_table_utf8
end
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# Yields every character of the string (newlines included) to the block,
# one at a time, matching with the UTF-8 aware pattern /./mu.
def each_utf8_char
  scan(/./mu) do |utf8_char|
    yield utf8_char
  end
end
|
|
73
|
+
|
|
74
|
+
# Yields every character together with its zero-based character index.
def each_utf8_char_with_index
  position = 0
  scan(/./mu) do |utf8_char|
    position += 1
    yield(utf8_char, position - 1)
  end
end
|
|
78
|
+
|
|
79
|
+
# Number of characters (not bytes) in the string. Characters are counted in
# a scan block rather than by building an array of matches first.
def length_utf8
  total = 0
  scan(/./mu) do
    total = total.succ
  end
  total
end
alias :size_utf8 :length_utf8
|
|
86
|
+
|
|
87
|
+
# Returns a new string with the characters (not bytes) in reverse order.
def reverse_utf8
  characters = split(//mu)
  characters.reverse.join
end
|
|
90
|
+
|
|
91
|
+
# Reverses the characters (not bytes) of the string in place and returns
# the receiver, mirroring String#reverse!.
#
# The previous implementation only reversed a temporary array and returned
# a new string, leaving the receiver untouched despite the bang name.
def reverse_utf8!
  replace(split(//mu).reverse.join)
end
|
|
94
|
+
|
|
95
|
+
# Returns a copy with the case of every character swapped. The class-level
# downcase table is consulted first, then the upcase table; characters found
# in neither fall back to String#swapcase.
def swapcase_utf8
  gsub(/./mu) do |char|
    lowered = String.downcase_table_utf8[char]
    next lowered unless lowered.nil?

    raised = String.upcase_table_utf8[char]
    next raised unless raised.nil?

    char.swapcase
  end
end
|
|
103
|
+
|
|
104
|
+
# In-place variant of swapcase_utf8. Returns whatever gsub! returns: the
# receiver, or nil when no substitution was performed.
def swapcase_utf8!
  gsub!(/./mu) do |char|
    lowered = String.downcase_table_utf8[char]
    next lowered unless lowered.nil?

    raised = String.upcase_table_utf8[char]
    next raised unless raised.nil?

    char.swapcase
  end
end
|
|
111
|
+
|
|
112
|
+
# Returns a lowercased copy. Characters present in the class-level downcase
# table are mapped through it; all others use String#downcase.
def downcase_utf8
  gsub(/./mu) do |char|
    mapped = String.downcase_table_utf8[char]
    if mapped.nil?
      char.downcase
    else
      mapped
    end
  end
end
|
|
118
|
+
|
|
119
|
+
# In-place variant of downcase_utf8. Returns the result of gsub! (the
# receiver, or nil when nothing was substituted).
def downcase_utf8!
  gsub!(/./mu) do |char|
    mapped = String.downcase_table_utf8[char]
    if mapped.nil?
      char.downcase
    else
      mapped
    end
  end
end
|
|
125
|
+
|
|
126
|
+
# Returns an uppercased copy. Characters present in the class-level upcase
# table are mapped through it; all others use String#upcase.
def upcase_utf8
  gsub(/./mu) do |char|
    mapped = String.upcase_table_utf8[char]
    if mapped.nil?
      char.upcase
    else
      mapped
    end
  end
end
|
|
132
|
+
|
|
133
|
+
# In-place variant of upcase_utf8. Returns the result of gsub! (the
# receiver, or nil when nothing was substituted).
def upcase_utf8!
  gsub!(/./mu) do |char|
    mapped = String.upcase_table_utf8[char]
    if mapped.nil?
      char.upcase
    else
      mapped
    end
  end
end
|
|
139
|
+
|
|
140
|
+
# Counts the characters of the string that belong to the set +c+. The
# argument is placed verbatim inside a regex character class, so ranges such
# as "a-c" are honoured. Returns nil when +c+ is empty.
def count_utf8(c)
  if c.empty?
    nil
  else
    char_class = %r{[#{c}]}mu
    scan(char_class).size
  end
end
|
|
145
|
+
|
|
146
|
+
# Returns a copy with every character in the set +c+ removed. The argument
# is placed verbatim inside a regex character class. An empty +c+ returns
# the receiver itself.
def delete_utf8(c)
  if c.empty?
    self
  else
    gsub(%r{[#{c}]}mu, '')
  end
end
|
|
151
|
+
|
|
152
|
+
# In-place variant of delete_utf8. Returns the receiver when +c+ is empty,
# otherwise the result of gsub! (nil when nothing was removed).
def delete_utf8!(c)
  if c.empty?
    self
  else
    gsub!(%r{[#{c}]}mu, '')
  end
end
|
|
157
|
+
|
|
158
|
+
# Returns the first character of the string, or nil when it is empty.
def first_utf8
  slice(/\A./mu)
end
|
|
161
|
+
|
|
162
|
+
# Returns the last character of the string, or nil when it is empty.
def last_utf8
  slice(/.\z/mu)
end
|
|
165
|
+
|
|
166
|
+
# Returns a copy in which every space-separated word has its first
# character upcased and the remaining characters downcased, using
# upcase_utf8 / downcase_utf8 so table-mapped letters are handled too.
#
# A string consisting only of whitespace is returned as-is (same object).
# Trailing spaces are dropped because split discards trailing empty fields.
#
# The previous implementation threw away the result of gsub and appended
# the untouched word, so the method returned its input unchanged.
def capitalize_utf8
  return self if self =~ /\A[[:space:]]*\z/m

  words = split(/\x20/).map do |word|
    position = 0
    word.gsub(/./mu) do |char|
      position += 1
      position == 1 ? char.upcase_utf8 : char.downcase_utf8
    end
  end

  words.join(' ')
end
|
|
184
|
+
|
|
185
|
+
# In-place variant of capitalize_utf8: every space-separated word gets its
# first character upcased and the rest downcased (via upcase_utf8 /
# downcase_utf8), and the receiver is replaced with the result.
# Returns the receiver.
#
# The previous implementation modified only the temporary words produced by
# split and returned a new string, so the receiver was never changed.
def capitalize_utf8!
  return self if self =~ /\A[[:space:]]*\z/m

  words = split(/\x20/).map do |word|
    position = 0
    word.gsub(/./mu) do |char|
      position += 1
      position == 1 ? char.upcase_utf8 : char.downcase_utf8
    end
  end

  replace(words.join(' '))
end
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
# Returns the character (not byte) index of the first occurrence of +s+ in
# the string, or nil when there is no match.
#
# s - a Regexp, or a String that is matched literally.
#
# Returns nil when the receiver is empty, when +s+ is neither a Regexp nor a
# String, or when +s+ (or the Regexp source) is empty.
#
# Fixes over the previous version:
# * a match at index 0 used to be reported as nil, because an empty
#   pre-match string was used as the "no match" marker;
# * a String argument was interpolated as a regex pattern, so "." matched
#   any character and "+" raised RegexpError.
def index_utf8(s)

  return nil unless !self.empty? && (s.class == Regexp || s.class == String)

  if s.class == Regexp
    return nil if s.source.empty?
    opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
    opts = opts + "u" if opts.count('u') == 0
    # NOTE(review): eval rebuilds the caller's pattern with the "u" option
    # added; only pass Regexp objects that come from trusted code.
    r = eval("%r{#{s.source}}" + opts)
  else
    return nil if s.empty?
    r = %r{#{Regexp.escape(s)}}mu
  end

  # The index is the number of characters in front of the match.
  found = r.match(self)
  found.nil? ? nil : found.pre_match.length_utf8

end
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
# Returns the character (not byte) index of the last match of +s+ found by
# a left-to-right, non-overlapping scan, or nil when there is no match.
#
# s - a Regexp, or a String that is matched literally.
#
# Returns nil when the receiver is empty, when +s+ is neither a Regexp nor a
# String, or when +s+ (or the Regexp source) is empty.
#
# Fixes over the previous version:
# * a single match at index 0 used to be reported as nil, because an empty
#   pre-match string was used as the "no match" marker;
# * a String argument was interpolated as a regex pattern instead of being
#   matched literally.
def rindex_utf8(s)

  return nil unless !self.empty? && (s.class == Regexp || s.class == String)

  if s.class == Regexp
    return nil if s.source.empty?
    opts = s.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
    opts = opts + "u" if opts.count('u') == 0
    # NOTE(review): eval rebuilds the caller's pattern with the "u" option
    # added; only pass Regexp objects that come from trusted code.
    r = eval("%r{#{s.source}}" + opts)
  else
    return nil if s.empty?
    r = %r{#{Regexp.escape(s)}}mu
  end

  # Remember the text in front of each match; the last one wins.
  prefix = nil
  scan(r) { prefix = $` }
  prefix.nil? ? nil : prefix.length_utf8

end
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
# note that the i option does not work in special cases with back references
|
|
289
|
+
# example: "äÄ".slice_utf8(/(.).*?\1/i) returns nil whereas "aA".slice(/(.).*?\1/i) returns "aA"
|
|
290
|
+
# Like String#slice with a Regexp, but the pattern is first rebuilt with the
# "u" option added (its existing option letters are kept) so it matches
# whole UTF-8 characters. Returns the matched text or nil.
def slice_utf8(regex)
  flags = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
  flags = flags + "u" if flags.count('u') == 0
  utf8_regex = eval("%r{#{regex.source}}" + flags)
  slice(utf8_regex)
end
|
|
298
|
+
|
|
299
|
+
# Like String#slice! with a Regexp, but the pattern is first rebuilt with
# the "u" option added (its existing option letters are kept). Removes the
# matched text from the receiver and returns it, or nil without a match.
def slice_utf8!(regex)
  flags = regex.inspect.gsub(/\A(.).*\1([eimnosux]*)\z/mu, '\2')
  flags = flags + "u" if flags.count('u') == 0
  utf8_regex = eval("%r{#{regex.source}}" + flags)
  slice!(utf8_regex)
end
|
|
307
|
+
|
|
308
|
+
# Returns up to +l+ characters starting at character index +p+.
#
# p - start index; a negative value counts from the end of the string, and
#     one that reaches past the start is clamped to 0.
# l - number of characters wanted.
#
# Returns nil when +l+ exceeds the string length or +p+ lies beyond the last
# character; otherwise the extracted string (empty when +l+ <= 0).
# Raises ArgumentError when either argument is not an Integer.
#
# The type check uses Integer because the Fixnum constant is no longer
# defined on current Ruby versions, which made every call fail there.
def cut_utf8(p,l) # (index) position, length
  unless p.is_a?(Integer) && l.is_a?(Integer)
    raise(ArgumentError, "Error: argument is not Fixnum", caller)
  end
  s = self.length_utf8
  if p < 0 then p = p.abs > s ? 0 : s - p.abs end # or: ... p.abs > s ? (return nil) : ...
  return nil if l > s or p > (s - 1)
  ret = ""
  count = 0
  each_utf8_char_with_index do |c,i|
    break if count >= l
    next if i < p
    count += 1
    ret << c
  end
  ret
end
|
|
322
|
+
|
|
323
|
+
# True when the string begins with +s+, compared character-wise via
# cut_utf8. Returns nil when either the receiver or +s+ is empty.
def starts_with_utf8?(s)
  if self.empty? or s.empty?
    return nil
  end
  prefix = cut_utf8(0, s.size_utf8)
  prefix == s
end
|
|
327
|
+
|
|
328
|
+
# True when the string ends with +s+, compared character-wise via cut_utf8.
# Returns nil when either the receiver or +s+ is empty.
def ends_with_utf8?(s)
  if self.empty? or s.empty?
    return nil
  end
  wanted = s.size_utf8
  suffix = cut_utf8(-(wanted), wanted)
  suffix == s
end
|
|
332
|
+
|
|
333
|
+
# Returns a new string with +s+ inserted before character index +i+.
#
# i - character index; a negative value counts from the end, and one that
#     reaches past the start is clamped to 0. An index at or beyond the end
#     pads the text with spaces up to that index before inserting.
# s - the string to insert.
#
# Returns the receiver itself when +s+ is empty, and +s+ when the receiver
# is empty.
#
# The receiver is no longer modified: the previous version appended the
# padding with <<, which changed the original string (and failed on frozen
# strings) even though this is the non-bang method.
def insert_utf8(i,s) # insert_utf8(index, string)
  return self if s.empty?
  l = self.length_utf8
  return s if l == 0
  if i < 0 then i = i.abs > l ? 0 : l - i.abs end # or: ... i.abs > l ? (return nil) : ...
  # pad with spaces when the index lies at or beyond the end
  spaces = i > (l - 1) ? " " * (i - (l - 1)) : ""
  str = self + spaces
  s1 = str.cut_utf8(0, i)
  s2 = str.cut_utf8(i, l - s1.length_utf8)
  s1 + s + s2
end
|
|
346
|
+
|
|
347
|
+
# Splits the receiver on +regex+, with the pattern forced into UTF-8 mode
# (the equivalent of the /u flag).
#
# regex - a Regexp; its ignore-case, extended and multiline flags are kept.
#
# Returns the Array produced by String#split.
#
# The pattern is rebuilt with Regexp.new instead of eval'ing "%r{...}":
# the eval raised SyntaxError for sources with unbalanced braces (e.g. "}")
# and evaluated any "#{...}" sequence found in the pattern source.
def split_utf8(regex)
  flags = regex.options & (Regexp::IGNORECASE | Regexp::EXTENDED | Regexp::MULTILINE)
  utf8_regex =
    if defined?(Regexp::FIXEDENCODING)
      # Encoding-aware Rubies: tag the source as UTF-8 and pin the encoding.
      Regexp.new(regex.source.dup.force_encoding('UTF-8'), flags | Regexp::FIXEDENCODING)
    else
      # Ruby 1.8: the third argument selects the UTF-8 kcode.
      Regexp.new(regex.source, flags, 'u')
    end
  split(utf8_regex)
end
|
|
355
|
+
|
|
356
|
+
# Scans the receiver for +regex+, with the pattern forced into UTF-8 mode
# (the equivalent of the /u flag).
#
# regex - a Regexp; its ignore-case, extended and multiline flags are kept.
#
# With a block, each match is yielded (capture groups are splatted into
# separate block arguments) and the result of String#scan is returned.
# Without a block, returns the Array of matches from String#scan.
#
# The pattern is rebuilt with Regexp.new instead of eval'ing "%r{...}":
# the eval raised SyntaxError for sources with unbalanced braces (e.g. "}")
# and evaluated any "#{...}" sequence found in the pattern source.
def scan_utf8(regex)
  flags = regex.options & (Regexp::IGNORECASE | Regexp::EXTENDED | Regexp::MULTILINE)
  utf8_regex =
    if defined?(Regexp::FIXEDENCODING)
      # Encoding-aware Rubies: tag the source as UTF-8 and pin the encoding.
      Regexp.new(regex.source.dup.force_encoding('UTF-8'), flags | Regexp::FIXEDENCODING)
    else
      # Ruby 1.8: the third argument selects the UTF-8 kcode.
      Regexp.new(regex.source, flags, 'u')
    end

  if block_given?
    scan(utf8_regex) { |a, *m| yield(a, *m) }
  else
    scan(utf8_regex)
  end
end
|
|
364
|
+
|
|
365
|
+
# Returns the UTF-8 characters of the receiver selected by the Range +r+.
#
# The bounds are read from the range's textual form: leading and trailing
# signed integers, and the number of dots (2 = inclusive end, 3 = exclusive
# end). Negative bounds count from the end of the string.
#
# Returns nil when +r+ is not a Range, when a bound's magnitude exceeds the
# character count, or when the start lies after the end; otherwise a String.
def range_utf8(r)
  return nil if r.class != Range
  #raise(ArgumentError, "No Range object given!", caller) if r.class != Range

  first = r.to_s[/^[\+\-]?\d+/].to_i
  last = r.to_s[/[\+\-]?\d+$/].to_i
  dots = r.to_s[/\.+/].size

  total = self.length_utf8

  return nil if last.abs > total || first.abs > total || dots < 2 || dots > 3

  # translate negative bounds into offsets from the start
  first = total - first.abs if first < 0
  last = total - last.abs if last < 0

  return nil if first > last

  inclusive = (dots == 2)
  picked = ""

  each_utf8_char_with_index do |char, index|
    break if index > last
    next if index < first
    next if !inclusive && index >= last
    picked << char
  end

  picked
end
|
|
399
|
+
|
|
400
|
+
# Matches the receiver against UTF8REGEX.
#
# Returns the match position (truthy, including 0) or nil when the pattern
# does not match.
def utf8?
  UTF8REGEX =~ self
end
|
|
403
|
+
|
|
404
|
+
# Returns a new string made of only those characters of the receiver that
# individually match UTF8REGEX.
def clean_utf8
  scan(/./um).inject("") do |kept, char|
    kept << char if char =~ UTF8REGEX
    kept
  end
end
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
# Guesses whether the resource named by the receiver is UTF-8 encoded
# (experimental, so use at your own risk!).
#
# The receiver is treated as a URL when it matches /^http:\/\// and as a
# local file path otherwise.
#
# * URL on Darwin: probes the site with curl, looks for a charset/encoding
#   declaration in the response, and otherwise downloads the page to
#   ~/Desktop/html and validates its whole content with utf8?.
# * URL elsewhere: opens the URL with Kernel#open, trusts a UTF-8 charset if
#   one is reported, and otherwise validates line by line with utf8?.
#   NOTE(review): presumably relies on open-uri being loaded elsewhere -
#   confirm.
# * Local path: on Darwin asks file(1) first; then scans the file for a
#   charset/encoding declaration and validates line by line with utf8?.
#
# Returns true or false.
# Raises RuntimeError when the site cannot be reached or the download file
# cannot be created (Darwin URL branch).
def utf8_encoded_file? # check (or rather guess) if (HTML) file encoding is UTF-8 (experimental, so use at your own risk!)
  # Loaded lazily: only this method shells out. Every value interpolated into
  # a command line is escaped so that URLs or paths containing shell
  # metacharacters (&, ;, spaces, quotes, ...) cannot alter the command.
  require 'shellwords'

  file = self

  if file =~ /^http:\/\//

    url = file

    if RUBY_PLATFORM =~ /darwin/i # Mac OS X 10.4.10

      seconds = 30
      quoted_url = Shellwords.escape(url)

      # check if web site is reachable
      # on Windows try to use curb, http://curb.rubyforge.org (sudo gem install curb)
      var = %x{ /usr/bin/curl -I -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{quoted_url}; /bin/echo -n $? }.to_i

      raise "Failed to create connection to web site: #{url} -- curl error code: #{var} -- " unless var == 0

      # first charset/encoding declaration found in the response body, if any
      str = %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{quoted_url} | \
        /usr/bin/grep -Eo -m 1 \"(charset|encoding)=[\\"']?[^\\"'>]+\" | /usr/bin/grep -Eo \"[^=\\"'>]+$\" }
      return true if str =~ /utf-?8/i
      return false unless str.empty? # some other encoding was declared

      # no declaration found: download the HTML file and inspect its content
      downloaded_file = File.expand_path("~/Desktop/html")
      quoted_download = Shellwords.escape(downloaded_file)
      %x{ /usr/bin/touch #{quoted_download} 2>/dev/null }
      raise "No valid HTML download file (path) specified!" unless File.file?(downloaded_file)
      %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} -o #{quoted_download} #{quoted_url} }

      # Read the entire file into a string and validate it. File.read closes
      # the handle (the previous File.open(...).read left it open). The old
      # code returned from inside its whole-file block on both outcomes, so
      # the second, line-by-line download pass after it was unreachable and
      # has been dropped, together with the debug `p` output.
      contents = File.read(downloaded_file)
      return contents.utf8? ? true : false

    else

      # check each line of the HTML file
      # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
      count_lines = 0
      count_utf8 = 0
      open(url) do |f|
        cs = f.charset
        return true if cs =~ /utf-?8/i
        f.each_line do |line|
          count_lines += 1
          count_utf8 += 1 if line.utf8?
          break unless count_lines == count_utf8 # stop at the first invalid line
        end
      end
      return count_lines == count_utf8

    end

  else

    return false unless File.file?(file)

    if RUBY_PLATFORM =~ /darwin/i
      str = %x{ /usr/bin/file -ik #{Shellwords.escape(file)} }
      return true if str =~ /utf-?8/i
    end

    # check each line of the file
    count_lines = 0
    count_utf8 = 0
    File.foreach(file) do |line|
      return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i
      count_lines += 1
      count_utf8 += 1 if line.utf8?
      break if count_lines != count_utf8 # stop at the first invalid line
    end

    return count_lines == count_utf8

  end
end
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
# cf. Paul Battley, http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
|
|
529
|
+
# Passes the receiver through a UTF-8 -> UTF-8//IGNORE Iconv conversion and
# returns the converted text.
#
# A single space is appended before conversion and removed again afterwards.
def validate_utf8
  padded = self + ' '
  converted = Iconv.iconv('UTF-8//IGNORE', 'UTF-8', padded).first
  converted[0..-2]
end
|
|
532
|
+
|
|
533
|
+
# cf. Paul Battley, http://www.ruby-forum.com/topic/70357
|
|
534
|
+
# Converts the receiver to US-ASCII with Iconv (//IGNORE//TRANSLIT) and then
# strips punctuation found inside words.
#
# Returns nil when the receiver does not pass utf8?.
def asciify_utf8
  return nil unless self.utf8?

  padded = self + ' '
  ascii = Iconv.iconv('US-ASCII//IGNORE//TRANSLIT', 'UTF-8', padded).first[0..-2]
  # delete all punctuation characters inside words except "-" in words such as up-to-date
  ascii.gsub(/(?!-.*)\b[[:punct:]]+\b/, '')
end
|
|
540
|
+
|
|
541
|
+
# Converts the receiver from ISO-8859-1 to UTF-8 with Iconv.
#
# Returns the converted String, or nil when the result does not pass utf8?.
def latin1_to_utf8 # ISO-8859-1 to UTF-8
  padded = self + "\x20"
  converted = Iconv.iconv("UTF-8//IGNORE", "ISO-8859-1", padded).first
  ret = converted[0..-2] # drop the padding space again
  return nil unless ret.utf8?
  ret
end
|
|
545
|
+
|
|
546
|
+
# Converts the receiver from CP1252 (WINDOWS-1252) to UTF-8 with Iconv.
#
# Returns the converted String, or nil when the result does not pass utf8?.
def cp1252_to_utf8 # CP1252 (WINDOWS-1252) to UTF-8
  padded = self + "\x20"
  converted = Iconv.iconv("UTF-8//IGNORE", "CP1252", padded).first
  ret = converted[0..-2] # drop the padding space again
  return nil unless ret.utf8?
  ret
end
|
|
550
|
+
|
|
551
|
+
# cf. Paul Battley, http://www.ruby-forum.com/topic/70357
|
|
552
|
+
# Converts the receiver from UTF-16LE to UTF-8 with Iconv.
#
# The input is cut to an even length and two zero bytes are appended before
# conversion; the final element of the converted text is dropped afterwards,
# along with one further trailing \x00 if present.
#
# Returns the converted String, or nil when the result does not pass utf8?.
def utf16le_to_utf8
  even_length = self.length / 2 * 2
  padded = self[0, even_length] + "\000\000"
  ret = Iconv.iconv('UTF-8//IGNORE', 'UTF-16LE', padded).first[0..-2]
  ret.sub!(/\x00\z/, '') # no-op when there is no trailing \x00
  return nil unless ret.utf8?
  ret
end
|
|
557
|
+
|
|
558
|
+
# Converts the receiver from UTF-8 to UTF-16LE with Iconv (//IGNORE).
#
# Returns nil when the receiver does not pass utf8?.
def utf8_to_utf16le
  if self.utf8?
    Iconv.iconv('UTF-16LE//IGNORE', 'UTF-8', self).first
  else
    nil
  end
end
|
|
562
|
+
|
|
563
|
+
# Renders every character of the receiver as "U+" followed by its code point
# in upper-case hex, zero-padded to at least four digits.
#
# Returns nil when the receiver does not pass utf8?.
def utf8_to_unicode
  return nil unless self.utf8?

  scan(/./mu).inject("") do |out, char|
    code_point = char.unpack("U*").first
    out << "U+" << sprintf("%04X", code_point)
  end
end
|
|
569
|
+
|
|
570
|
+
# Replaces "U+XXXX" code point notations in the receiver with the characters
# they denote; all other characters are copied through unchanged.
#
# Returns the receiver itself when it is empty or whitespace only, the
# decoded String otherwise, or nil when the result does not pass utf8?.
def unicode_to_utf8
  return self if self =~ /\A[[:space:]]*\z/m

  decoded = ""
  #scan(/U\+([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})/) { |u| str << [u.first.hex].pack("U*") }
  #scan(/U\+([[:digit:][:xdigit:]]{4,5}|10[[:digit:][:xdigit:]]{4})/) { |u| str << [u.first.hex].pack("U*") }
  # for mixed strings such as "U+00bfHabla espaU+00f1ol?"
  scan(/(U\+(?:[[:digit:][:xdigit:]]{4,5}|10[[:digit:][:xdigit:]]{4})|.)/mu) do |groups|
    token = groups.first # the single capture group: a U+ notation or one character
    if token =~ /^U\+/
      decoded << [token[2..-1].hex].pack("U*")
    else
      decoded << token
    end
  end

  return nil unless decoded.utf8?
  decoded
end
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
# dec, hex, oct conversions (experimental!)
|
|
588
|
+
|
|
589
|
+
# Encodes the receiver as the decimal code points of its characters,
# separated by \x00. A \x00 character itself is written as "aaa".
#
# Returns nil when the receiver does not pass utf8?.
def utf8_to_dec
  return nil unless self.utf8?

  encoded = ""
  scan(/./mu) do |char|
    # encode \x00 as "aaa", everything else as its decimal code point
    token = char =~ /^\x00$/ ? "aaa" : char.unpack("U*").first.to_s
    encoded << token << "\x00"
  end
  encoded[0..-2] # drop the trailing separator
end
|
|
601
|
+
|
|
602
|
+
# Decodes a string of \x00-separated decimal code points back into
# characters. The token "aaa" stands for a \x00 character.
#
# Returns the receiver itself when it is empty, nil when it contains
# characters other than digits, "a" and \x00 or does not begin with a
# complete token, and the decoded String otherwise.
def dec_to_utf8 # \x00 is encoded as "aaa"
  return self if self.empty?
  return nil unless self =~ /\A[a[:digit:]\x00]+\z/
  # The first token may be a number or the "aaa" marker and may also be the
  # only token. The previous check demanded digits followed by \x00, so
  # one-character encodings such as "65" and encodings starting with "aaa"
  # were rejected with nil.
  return nil unless self =~ /\A(?:[[:digit:]]+|aaa)(?:\x00|\z)/

  str = ""
  split(/\x00/).each do |c|
    str << (c.eql?("aaa") ? "\x00" : [c.to_i].pack("U*"))
  end
  str
end
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
# Encodes the receiver byte-wise in decimal: each byte of a character is
# followed by \x00 and each character by one more \x00. A \x00 character
# itself is written as "aaa".
#
# Returns nil when the receiver does not pass utf8?.
def utf8_to_dec_2
  return nil unless self.utf8?

  separator = "\x00"
  encoded = ""
  scan(/./mu) do |char|
    if char =~ /^\x00$/
      encoded << "aaa\x00\x00" # encode \x00 as "aaa"
    else
      char.each_byte { |byte| encoded << byte.to_s << separator } # convert to decimal
      encoded << separator
    end
  end
  encoded[0..-3] # drop the two trailing separators
end
|
|
633
|
+
|
|
634
|
+
def dec_to_utf8_2 # \x00 is encoded as "aaa"
|
|
635
|
+
return self if self.empty?
|
|
636
|
+
return nil unless self =~ /\A[[:digit:]]+\x00/ && self =~ /[[:digit:]]+\x00\x00/ && self =~ /\A[a[:digit:]\x00]+\z/
|
|
637
|
+
str = ""
|
|
638
|
+
split(/\x00\x00/).each do |c|
|
|
639
|
+
if c =~ /\x00/
|
|
640
|
+
c.split(/\x00/).each { |x| str << x.to_i.chr }
|
|
641
|
+
elsif c.eql?("aaa")
|
|
642
|
+
str << "\x00"
|
|
643
|
+
else
|
|
644
|
+
str << c.to_i.chr
|
|
645
|
+
end
|
|
646
|
+
end
|
|
647
|
+
str
|
|
648
|
+
end
|
|
649
|
+
|
|
650
|
+
|
|
651
|
+
# Encodes the receiver byte-wise in upper-case hexadecimal: each byte of a
# character is followed by \x00 and each character by one more \x00. A \x00
# character itself is written as "aaa".
#
# Returns nil when the receiver does not pass utf8?.
def utf8_to_hex
  return nil unless self.utf8?

  separator = "\x00"
  encoded = ""
  scan(/./mu) do |char|
    if char =~ /^\x00$/
      encoded << "aaa\x00\x00" # encode \x00 as "aaa"
    else
      char.each_byte { |byte| encoded << sprintf("%X", byte) << separator } # convert to hexadecimal
      encoded << separator
    end
  end
  encoded[0..-3] # drop the two trailing separators
end
|
|
667
|
+
|
|
668
|
+
def hex_to_utf8 # \x00 is encoded as "aaa"
|
|
669
|
+
return self if self.empty?
|
|
670
|
+
return nil unless self =~ /\A[[:xdigit:]]+\x00/ && self =~ /[[:xdigit:]]+\x00\x00/ && self =~ /\A[a[:xdigit:]\x00]+\z/
|
|
671
|
+
str = ""
|
|
672
|
+
split(/\x00\x00/).each do |c|
|
|
673
|
+
if c =~ /\x00/
|
|
674
|
+
c.split(/\x00/).each { |x| str << x.hex.chr }
|
|
675
|
+
elsif c.eql?("aaa")
|
|
676
|
+
str << "\x00"
|
|
677
|
+
else
|
|
678
|
+
str << c.hex.chr
|
|
679
|
+
end
|
|
680
|
+
end
|
|
681
|
+
str
|
|
682
|
+
end
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
# Encodes the receiver byte-wise in octal: each byte of a character is
# followed by \x00 and each character by one more \x00. A \x00 character
# itself is written as "aaa".
#
# Returns nil when the receiver does not pass utf8?.
def utf8_to_oct
  return nil unless self.utf8?

  separator = "\x00"
  encoded = ""
  scan(/./mu) do |char|
    if char =~ /^\x00$/
      encoded << "aaa\x00\x00" # encode \x00 as "aaa"
    else
      char.each_byte { |byte| encoded << sprintf("%o", byte) << separator } # convert to octal
      encoded << separator
    end
  end
  encoded[0..-3] # drop the two trailing separators
end
|
|
701
|
+
|
|
702
|
+
def oct_to_utf8 # \x00 is encoded as "aaa"
|
|
703
|
+
return self if self.empty?
|
|
704
|
+
return nil unless self =~ /\A[[:digit:]]+\x00/ && self =~ /[[:digit:]]+\x00\x00/ && self =~ /\A[a[:digit:]\x00]+\z/
|
|
705
|
+
str = ""
|
|
706
|
+
split(/\x00\x00/).each do |c|
|
|
707
|
+
if c =~ /\x00/
|
|
708
|
+
c.split(/\x00/).each { |x| str << x.oct.chr }
|
|
709
|
+
elsif c.eql?("aaa")
|
|
710
|
+
str << "\x00"
|
|
711
|
+
else
|
|
712
|
+
str << c.oct.chr
|
|
713
|
+
end
|
|
714
|
+
end
|
|
715
|
+
str
|
|
716
|
+
end
|
|
717
|
+
|
|
718
|
+
# cf. http://node-0.mneisen.org/2007/03/13/email-subjects-in-utf-8-mit-ruby-kodieren/
|
|
719
|
+
# Wraps the receiver as a base64 "=?utf-8?b?...?=" encoded word for use in
# an e-mail subject.
#
# Returns nil when the receiver does not pass utf8?.
def email_subject_utf8
  return nil unless self.utf8?

  base64 = [self].pack("m").delete("\n") # pack("m") inserts line breaks
  "=?utf-8?b?#{base64}?="
end
|
|
723
|
+
|
|
724
|
+
end
|
|
725
|
+
|
|
726
|
+
|
metadata
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: Arabic-Prawn
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
prerelease: false
|
|
5
|
+
segments:
|
|
6
|
+
- 0
|
|
7
|
+
- 0
|
|
8
|
+
- 1
|
|
9
|
+
version: 0.0.1
|
|
10
|
+
platform: ruby
|
|
11
|
+
authors:
|
|
12
|
+
- Dynamix Solutions
|
|
13
|
+
autorequire:
|
|
14
|
+
bindir: bin
|
|
15
|
+
cert_chain: []
|
|
16
|
+
|
|
17
|
+
date: 2010-02-28 00:00:00 +02:00
|
|
18
|
+
default_executable:
|
|
19
|
+
dependencies: []
|
|
20
|
+
|
|
21
|
+
description: Allows printing arabic to PDFs generated by prawn
|
|
22
|
+
email: ahmed.nasser@dynamix-systems.com
|
|
23
|
+
executables: []
|
|
24
|
+
|
|
25
|
+
extensions: []
|
|
26
|
+
|
|
27
|
+
extra_rdoc_files:
|
|
28
|
+
- README
|
|
29
|
+
- LICENSE
|
|
30
|
+
files:
|
|
31
|
+
- LICENSE
|
|
32
|
+
- README
|
|
33
|
+
- Rakefile
|
|
34
|
+
- lib/arabic-prawn.rb
|
|
35
|
+
- lib/string_utf_support.rb
|
|
36
|
+
has_rdoc: true
|
|
37
|
+
homepage:
|
|
38
|
+
licenses: []
|
|
39
|
+
|
|
40
|
+
post_install_message:
|
|
41
|
+
rdoc_options: []
|
|
42
|
+
|
|
43
|
+
require_paths:
|
|
44
|
+
- lib
|
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
46
|
+
requirements:
|
|
47
|
+
- - ">="
|
|
48
|
+
- !ruby/object:Gem::Version
|
|
49
|
+
segments:
|
|
50
|
+
- 0
|
|
51
|
+
version: "0"
|
|
52
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
53
|
+
requirements:
|
|
54
|
+
- - ">="
|
|
55
|
+
- !ruby/object:Gem::Version
|
|
56
|
+
segments:
|
|
57
|
+
- 0
|
|
58
|
+
version: "0"
|
|
59
|
+
requirements: []
|
|
60
|
+
|
|
61
|
+
rubyforge_project:
|
|
62
|
+
rubygems_version: 1.3.6
|
|
63
|
+
signing_key:
|
|
64
|
+
specification_version: 3
|
|
65
|
+
summary: Allows printing arabic to PDFs generated by prawn
|
|
66
|
+
test_files: []
|
|
67
|
+
|