prawn-arabic 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +10 -0
- data/README.md +14 -0
- data/Rakefile +30 -0
- data/lib/prawn-arabic.rb +527 -0
- data/lib/string_utf_support.rb +726 -0
- metadata +50 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1b1e46c30adc95d058af51d71a6c140146023122
|
4
|
+
data.tar.gz: 204379e44e33c33f7aa1f242a635a10b57f4d0b4
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ef85cd7f4e4faf73246c81b724808b3e03036b3b9b7ccf4ce126052c32adc678c28b668b429687786f1407da22311e68f845f47afcfe1a6ec08eb3769d98a7a9
|
7
|
+
data.tar.gz: bd3607dfcd57a3520ee6eedcf777d7059a93d64be0877165ee7407891825f20a3a6ab040a89af5c7d2462eff37d49ba6e5a898201d1deed4a912816731f44200
|
data/LICENSE
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
== Arabic-Prawn
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
Copyright (c) 2017 Alex Lapchenko
|
5
|
+
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
7
|
+
|
8
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
9
|
+
|
10
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Arabic-Prawn
|
2
|
+
|
3
|
+
Arabic language string helpers for Prawn. [Original gem on rubygems](https://rubygems.org/gems/Arabic-Prawn/versions/0.0.1)
|
4
|
+
The source code was copied to GitHub in order to provide open-source support and development for this gem.
|
5
|
+
|
6
|
+
## What is this
|
7
|
+
This gem patches the `String` class and provides a few helpers for Arabic language support.
|
8
|
+
- `#determine_format(before_c, after_c)`
|
9
|
+
- `#fix_arabic_glyphs`
|
10
|
+
- `#fix_word`
|
11
|
+
- `#get_letter_in_format(format, c)`
|
12
|
+
|
13
|
+
## Thanks
|
14
|
+
- Creator [#44017, Dynamix Solutions](https://rubygems.org/profiles/44017)
|
data/Rakefile
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
$LOAD_PATH.unshift File.expand_path("../lib", __FILE__)
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'rake'
|
5
|
+
require 'rake/clean'
|
6
|
+
require 'rubygems/package_task'
|
7
|
+
require 'rake/testtask'
|
8
|
+
require 'rdoc/task'
|
9
|
+
require 'rspec/core/rake_task'
|
10
|
+
|
11
|
+
# Build the gem package from its gemspec.
# `sh` (instead of `system`) makes the task fail when `gem build` fails.
# NOTE(review): the gemspec file name is kept from the original task; confirm
# it matches the gemspec actually present in the repository.
task :build do
  sh "gem build arabic-prawn.gemspec"
end

# Generate RDoc documentation into doc/rdoc.
RDoc::Task.new do |rdoc|
  # The package ships README.md (not README), so point RDoc at the real file.
  files = ['README.md', 'LICENSE', 'lib/**/*.rb']
  rdoc.rdoc_files.add(files)
  rdoc.main = "README.md" # page to start on
  rdoc.title = "Arabic-Prawn Docs"
  rdoc.rdoc_dir = 'doc/rdoc' # rdoc output folder
  rdoc.options << '--line-numbers'
end

# Run every Ruby file under test/ as a test.
Rake::TestTask.new do |t|
  t.test_files = FileList['test/**/*.rb']
end

# Run every Ruby file under spec/ with RSpec.
# RSpec::Core::RakeTask selects files through `pattern`; the RSpec 1
# `spec_files=` writer does not exist on it.
RSpec::Core::RakeTask.new do |t|
  t.pattern = 'spec/**/*.rb'
end
|
data/lib/prawn-arabic.rb
ADDED
@@ -0,0 +1,527 @@
|
|
1
|
+
require 'string_utf_support'
|
2
|
+
|
3
|
+
# Numeric identifiers for the four contextual shapes of an Arabic letter.
# They are used as keys of ArabicCharacterInfo#format_encodings.
class CharacterFormat
  Isolated, Initial, Medial, Final = 1, 2, 3, 4
end
|
9
|
+
|
10
|
+
|
11
|
+
# Describes one Arabic letter: its logical code point, the presentation-form
# code point for each contextual shape, and whether it connects to the letter
# that follows it.
class ArabicCharacterInfo

  # One row per letter, in the argument order of #initialize:
  #   common, isolated, final, initial, medial, is_connected
  # Letters that do not connect to the following letter reuse their
  # isolated/final glyphs for the initial/medial slots.
  ARABIC_LETTER_FORMS = [
    ["U+0627", "U+fe8d", "U+fe8e", "U+fe8d", "U+fe8e", false], # Alef
    ["U+0628", "U+fe8f", "U+fe90", "U+fe91", "U+fe92", true],  # Beh
    ["U+062a", "U+fe95", "U+fe96", "U+fe97", "U+fe98", true],  # Teh
    ["U+062b", "U+fe99", "U+fe9a", "U+fe9b", "U+fe9c", true],  # Theh
    ["U+062c", "U+fe9d", "U+fe9e", "U+fe9f", "U+fea0", true],  # Jeem
    ["U+062d", "U+fea1", "U+fea2", "U+fea3", "U+fea4", true],  # Hah ("7ah")
    ["U+062e", "U+fea5", "U+fea6", "U+fea7", "U+fea8", true],  # Khah ("7'ah")
    ["U+062f", "U+fea9", "U+feaa", "U+fea9", "U+feaa", false], # Dal
    ["U+0630", "U+feab", "U+feac", "U+feab", "U+feac", false], # Thal
    ["U+0631", "U+fead", "U+feae", "U+fead", "U+feae", false], # Rah
    ["U+0632", "U+feaf", "U+feb0", "U+feaf", "U+feb0", false], # Zein
    ["U+0633", "U+feb1", "U+feb2", "U+feb3", "U+feb4", true],  # Seen
    ["U+0634", "U+feb5", "U+feb6", "U+feb7", "U+feb8", true],  # Sheen
    ["U+0635", "U+feb9", "U+feba", "U+febb", "U+febc", true],  # Sad
    ["U+0636", "U+febd", "U+febe", "U+febf", "U+fec0", true],  # Dad
    ["U+0637", "U+fec1", "U+fec2", "U+fec3", "U+fec4", true],  # Tah
    ["U+0638", "U+fec5", "U+fec6", "U+fec7", "U+fec8", true],  # Thah
    ["U+0639", "U+fec9", "U+feca", "U+fecb", "U+fecc", true],  # Ain ("3ein")
    ["U+063a", "U+fecd", "U+fece", "U+fecf", "U+fed0", true],  # Ghain ("3'ein")
    ["U+0641", "U+fed1", "U+fed2", "U+fed3", "U+fed4", true],  # Feh
    ["U+0642", "U+fed5", "U+fed6", "U+fed7", "U+fed8", true],  # Qaf
    ["U+0643", "U+fed9", "U+feda", "U+fedb", "U+fedc", true],  # Kaf
    ["U+0644", "U+fedd", "U+fede", "U+fedf", "U+fee0", true],  # Lam
    ["U+0645", "U+fee1", "U+fee2", "U+fee3", "U+fee4", true],  # Meem
    ["U+0646", "U+fee5", "U+fee6", "U+fee7", "U+fee8", true],  # Noon
    ["U+0647", "U+fee9", "U+feea", "U+feeb", "U+feec", true],  # Heh
    ["U+0648", "U+feed", "U+feee", "U+feed", "U+feee", false], # Waw
    ["U+064a", "U+fef1", "U+fef2", "U+fef3", "U+fef4", true],  # Yeh
    ["U+0621", "U+fe80", "U+fe80", "U+fe80", "U+fe80", false], # Hamza
    ["U+0622", "U+fe81", "U+fe82", "U+fe81", "U+fe82", false], # Alef Madda
    ["U+0623", "U+fe83", "U+fe84", "U+fe83", "U+fe84", false], # Alef Hamza Above
    ["U+0624", "U+fe85", "U+fe86", "U+fe85", "U+fe86", false], # Waw Hamza
    ["U+0625", "U+fe87", "U+fe88", "U+fe87", "U+fe88", false], # Alef Hamza Below
    ["U+0626", "U+fe89", "U+fe8a", "U+fe8b", "U+fe8c", true],  # Yeh Hamza
    ["U+0629", "U+fe93", "U+fe94", "U+fe93", "U+fe94", false], # Teh Marbuta
    ["U+0640", "U+0640", "U+0640", "U+0640", "U+0640", true],  # Tatweel
    ["U+0649", "U+feef", "U+fef0", "U+feef", "U+fef0", false]  # Alef Layyena
  ].freeze

  attr_accessor :common_encoding, :format_encodings, :is_connected

  # All code points are given as "U+xxxx" strings and converted with
  # String#unicode_to_utf8 (provided by string_utf_support).
  # Note the parameter order: final comes before initial and medial.
  def initialize(common, isolated, final, initial, medial, is_connected)
    @common_encoding = common.unicode_to_utf8
    @format_encodings = {
      CharacterFormat::Isolated => isolated.unicode_to_utf8,
      CharacterFormat::Initial  => initial.unicode_to_utf8,
      CharacterFormat::Medial   => medial.unicode_to_utf8,
      CharacterFormat::Final    => final.unicode_to_utf8
    }
    @is_connected = is_connected
  end

  # Returns the Hash mapping each letter's common encoding to its
  # ArabicCharacterInfo. The map is built on first use and the same Hash
  # object is returned on every later call.
  def ArabicCharacterInfo.get_arabic_characters_map
    @arabic_characters_map ||= ARABIC_LETTER_FORMS.each_with_object({}) do |row, map|
      info = new(*row)
      map[info.common_encoding] = info
    end
  end

end
|
417
|
+
|
418
|
+
|
419
|
+
|
420
|
+
class String

  # Decides which contextual shape a letter takes, given the character
  # before it and the character after it ('' when there is none).
  # Returns one of the CharacterFormat constants.
  def determine_format(before_c, after_c)
    charmap = ArabicCharacterInfo.get_arabic_characters_map
    before = charmap[before_c]
    after = charmap[after_c]

    # The previous character joins to this one only if it is a known letter
    # that connects forward.
    joined_to_previous = !before.nil? && before.is_connected

    # The next character joins to this one only if it can attach to a
    # preceding letter. A letter that neither connects forward nor has a
    # final glyph distinct from its isolated glyph (Hamza) is non-joining,
    # so the letter before it must end the connected run.
    joined_to_next = false
    unless after.nil?
      forms = after.format_encodings
      joined_to_next = after.is_connected ||
                       forms[CharacterFormat::Final] != forms[CharacterFormat::Isolated]
    end

    if joined_to_next
      joined_to_previous ? CharacterFormat::Medial : CharacterFormat::Initial
    else
      joined_to_previous ? CharacterFormat::Final : CharacterFormat::Isolated
    end
  end

  # Returns the glyph for character +c+ in the given format, or +c+ itself
  # when +c+ is not in the Arabic character map.
  def get_letter_in_format(format, c)
    character = ArabicCharacterInfo.get_arabic_characters_map[c]
    return c if character.nil?
    character.format_encodings[format]
  end

  # Replaces every mapped letter of this word with its contextual glyph.
  # If at least one character changed, the result is returned with its
  # characters reversed; otherwise the characters are returned in their
  # original order.
  def fix_word
    letters = []
    each_utf8_char { |c| letters << c }

    changed = false
    shaped = letters.each_with_index.map do |letter, i|
      before = i.zero? ? '' : letters[i - 1]
      after = letters[i + 1] || ''
      glyph = get_letter_in_format(determine_format(before, after), letter)
      changed = true if glyph != letter
      glyph
    end.join

    changed ? shaped.reverse_utf8 : shaped
  end

  # Shapes a whitespace-separated text. Words are emitted in reverse order
  # (the default direction is assumed to be rtl); consecutive words that
  # fix_word leaves unchanged are kept together in their original order.
  # Segments are joined by single spaces, with no leading or trailing space.
  def fix_arabic_glyphs
    segments = [] # output order, left to right
    ltr_run = []  # consecutive unchanged (ltr) words seen so far

    split(" ").each do |word|
      fixed_word = word.fix_word
      if fixed_word == word
        # a non-arabic word (ltr): buffer it to see if more ltr words follow
        ltr_run << fixed_word
      else
        unless ltr_run.empty?
          segments.unshift(ltr_run.join(" "))
          ltr_run = []
        end
        segments.unshift(fixed_word)
      end
    end

    segments.unshift(ltr_run.join(" ")) unless ltr_run.empty?
    segments.join(" ")
  end
end
|
@@ -0,0 +1,726 @@
|
|
1
|
+
class String
|
2
|
+
|
3
|
+
require 'iconv'
|
4
|
+
require 'open-uri' # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
|
5
|
+
|
6
|
+
# taken from: http://www.w3.org/International/questions/qa-forms-utf-8
|
7
|
+
# Byte-level pattern for well-formed UTF-8 text.
# taken from: http://www.w3.org/International/questions/qa-forms-utf-8
# It is an ASCII-8BIT (/n) regexp, so it must be matched against a binary
# view of the string, not against a UTF-8 tagged string.
UTF8REGEX = /\A(?: # ?: non-capturing group (grouping with no back references)
[\x09\x0A\x0D\x20-\x7E] # ASCII
| [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
)*\z/mnx


# create UTF-8 character arrays (as class instance variables)
#
# mapping tables: - http://www.unicode.org/Public/UCA/latest/allkeys.txt
# - http://unicode.org/Public/UNIDATA/UnicodeData.txt
# - http://unicode.org/Public/UNIDATA/CaseFolding.txt
# - http://www.decodeunicode.org
# - ftp://ftp.mars.org/pub/ruby/Unicode.tar.bz2
# - http://camomile.sourceforge.net
# - Character Palette (Mac OS X)

# Converts a "U+xxxx" code point into its UTF-8 character, or nil when the
# encoded bytes do not match UTF8REGEX. The match runs on a binary copy
# because UTF8REGEX is an ASCII-8BIT pattern.
code_point_to_utf8 = lambda do |code_point|
  char = [code_point[2..-1].hex].pack("U*")
  char.dup.force_encoding(Encoding::BINARY) =~ UTF8REGEX ? char : nil
end

# test data
@small_letters_utf8 = ["U+00F1", "U+00F4", "U+00E6", "U+00F8", "U+00E0", "U+00E1", "U+00E2", "U+00E4", "U+00E5", "U+00E7", "U+00E8", "U+00E9", "U+00EA", "U+00EB", "U+0153"].map(&code_point_to_utf8)

@capital_letters_utf8 = ["U+00D1", "U+00D4", "U+00C6", "U+00D8", "U+00C0", "U+00C1", "U+00C2", "U+00C4", "U+00C5", "U+00C7", "U+00C8", "U+00C9", "U+00CA", "U+00CB", "U+0152"].map(&code_point_to_utf8)

@other_letters_utf8 = ["U+03A3", "U+0639", "U+0041", "U+F8D0", "U+F8FF", "U+4E2D", "U+F4EE", "U+00FE", "U+10FFFF", "U+00A9", "U+20AC", "U+221E", "U+20AC", "U+FEFF", "U+FFFD", "U+00FF", "U+00FE", "U+FFFE", "U+FEFF"].map(&code_point_to_utf8)

# A nil entry means a code point failed validation above.
# (Array#nitems no longer exists, so look for nil entries directly.)
raise "Invalid UTF-8 char in @small_letters_utf8!" if @small_letters_utf8.include?(nil)
raise "Invalid UTF-8 char in @capital_letters_utf8!" if @capital_letters_utf8.include?(nil)
raise "Invalid UTF-8 char in @other_letters_utf8!" if @other_letters_utf8.include?(nil)


@unicode_array = []
#open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f| f.each(nil) { |line| line.scan(/^[^;]+/) { |u| @unicode_array << u } } end
#open('http://unicode.org/Public/UNIDATA/UnicodeData.txt') do |f|
# f.each do |line| line =~ /LATIN|GREEK|CYRILLIC/ ? ( line.scan(/^[^;]+/) { |u| @unicode_array << u } ) : next end
#end

#@letters_utf8 = @unicode_array.map { |x| u = [x.hex].pack("U*"); u =~ UTF8REGEX ? u : nil }.compact # code points from UnicodeData.txt
@letters_utf8 = @small_letters_utf8 + @capital_letters_utf8 + @other_letters_utf8 # test data only

# Hash[*array_with_keys.zip(array_with_values).flatten]
@downcase_table_utf8 = Hash[*@capital_letters_utf8.zip(@small_letters_utf8).flatten] # capital => small
@upcase_table_utf8 = Hash[*@small_letters_utf8.zip(@capital_letters_utf8).flatten]   # small => capital
@letters_utf8_hash = Hash[*@letters_utf8.zip([]).flatten] #=> ... "\341\272\242"=>nil ...

# Expose the tables above as accessors on the enclosing object.
class << self
  attr_accessor :small_letters_utf8
  attr_accessor :capital_letters_utf8
  attr_accessor :other_letters_utf8
  attr_accessor :letters_utf8
  attr_accessor :letters_utf8_hash
  attr_accessor :unicode_array
  attr_accessor :downcase_table_utf8
  attr_accessor :upcase_table_utf8
end
|
68
|
+
|
69
|
+
|
70
|
+
# Yields each character (not byte) of the string to the given block.
def each_utf8_char
  scan(/./mu) do |char|
    yield char
  end
end
|
73
|
+
|
74
|
+
# Yields each character of the string together with its zero-based
# character position.
def each_utf8_char_with_index
  position = 0
  scan(/./mu) do |char|
    yield(char, position)
    position += 1
  end
end
|
78
|
+
|
79
|
+
# Returns the number of characters (not bytes) in the string.
def length_utf8
  scan(/./mu).size
end
|
85
|
+
alias :size_utf8 :length_utf8
|
86
|
+
|
87
|
+
# Returns a new string with the characters (not bytes) in reverse order.
def reverse_utf8
  chars = split(//mu)
  chars.reverse!
  chars.join
end
|
90
|
+
|
91
|
+
# Reverses the characters (not bytes) of the string in place and returns
# the receiver, like String#reverse!.
def reverse_utf8!
  replace(split(//mu).reverse.join)
end
|
94
|
+
|
95
|
+
# Returns a copy with the case of every character swapped. Characters found
# in String.downcase_table_utf8 or String.upcase_table_utf8 (checked in that
# order) use the table entry; all others fall back to String#swapcase.
def swapcase_utf8
  gsub(/./mu) do |char|
    lowered = String.downcase_table_utf8[char]
    next lowered unless lowered.nil?

    raised = String.upcase_table_utf8[char]
    raised.nil? ? char.swapcase : raised
  end
end
|
103
|
+
|
104
|
+
# In-place variant of swapcase_utf8. Returns what gsub! returns: the
# receiver, or nil when no substitution was performed.
def swapcase_utf8!
  gsub!(/./mu) do |char|
    lowered = String.downcase_table_utf8[char]
    next lowered unless lowered.nil?

    raised = String.upcase_table_utf8[char]
    raised.nil? ? char.swapcase : raised
  end
end
|
111
|
+
|
112
|
+
# Returns a lower-cased copy. Characters listed in
# String.downcase_table_utf8 use the table entry; all others fall back to
# String#downcase.
def downcase_utf8
  gsub(/./mu) do |char|
    mapped = String.downcase_table_utf8[char]
    if mapped.nil?
      char.downcase
    else
      mapped
    end
  end
end
|
118
|
+
|
119
|
+
# In-place variant of downcase_utf8. Returns what gsub! returns: the
# receiver, or nil when no substitution was performed.
def downcase_utf8!
  gsub!(/./mu) do |char|
    mapped = String.downcase_table_utf8[char]
    if mapped.nil?
      char.downcase
    else
      mapped
    end
  end
end
|
125
|
+
|
126
|
+
# Returns an upper-cased copy. Characters listed in String.upcase_table_utf8
# use the table entry; all others fall back to String#upcase.
def upcase_utf8
  gsub(/./mu) do |char|
    mapped = String.upcase_table_utf8[char]
    if mapped.nil?
      char.upcase
    else
      mapped
    end
  end
end
|
132
|
+
|
133
|
+
# In-place variant of upcase_utf8. Returns what gsub! returns: the receiver,
# or nil when no substitution was performed.
def upcase_utf8!
  gsub!(/./mu) do |char|
    mapped = String.upcase_table_utf8[char]
    if mapped.nil?
      char.upcase
    else
      mapped
    end
  end
end
|
139
|
+
|
140
|
+
# Counts the characters of the string that belong to the set +c+.
# +c+ is interpolated verbatim into a regexp character class, so ranges such
# as "a-z" work and characters special inside [...] must be escaped by the
# caller. Returns nil when +c+ is empty.
def count_utf8(c)
  if c.empty?
    nil
  else
    char_class = /[#{c}]/mu
    scan(char_class).size
  end
end
|
145
|
+
|
146
|
+
# Returns a copy with every character of the set +c+ removed.
# +c+ is interpolated verbatim into a regexp character class. When +c+ is
# empty the receiver itself is returned.
def delete_utf8(c)
  if c.empty?
    self
  else
    char_class = /[#{c}]/mu
    gsub(char_class, '')
  end
end
|
151
|
+
|
152
|
+
# In-place variant of delete_utf8. Returns the receiver when +c+ is empty,
# otherwise what gsub! returns (the receiver, or nil when nothing matched).
def delete_utf8!(c)
  if c.empty?
    self
  else
    char_class = /[#{c}]/mu
    gsub!(char_class, '')
  end
end
|
157
|
+
|
158
|
+
# Returns the first character of the string, or nil when it is empty.
def first_utf8
  slice(/\A./mu)
end
|
161
|
+
|
162
|
+
# Returns the last character of the string, or nil when it is empty.
def last_utf8
  slice(/.\z/mu)
end
|
165
|
+
|
166
|
+
# Returns a copy in which every space-separated word has its first character
# upper-cased and the remaining characters lower-cased. Case mappings come
# from String.upcase_table_utf8 / String.downcase_table_utf8, falling back
# to String#upcase / String#downcase. A string made only of whitespace is
# returned unchanged (the receiver itself).
#
# The converted word is now actually used; previously the result of the
# per-word substitution was thrown away and the input came back as-is.
def capitalize_utf8
  return self if self =~ /\A[[:space:]]*\z/m

  words = split(/\x20/).map do |word|
    first = true
    word.gsub(/./mu) do |char|
      if first
        first = false
        String.upcase_table_utf8[char] || char.upcase
      else
        String.downcase_table_utf8[char] || char.downcase
      end
    end
  end
  # split drops trailing empty fields, so trailing spaces are not preserved.
  words.join(' ')
end
|
184
|
+
|
185
|
+
# In-place variant of capitalize_utf8: every space-separated word gets its
# first character upper-cased and the rest lower-cased, using
# String.upcase_table_utf8 / String.downcase_table_utf8 with a fallback to
# String#upcase / String#downcase. Always returns the receiver.
#
# The receiver is now really modified; previously only temporary copies of
# the words were changed and a new string was returned.
def capitalize_utf8!
  return self if self =~ /\A[[:space:]]*\z/m

  words = split(/\x20/).map do |word|
    first = true
    word.gsub(/./mu) do |char|
      if first
        first = false
        String.upcase_table_utf8[char] || char.upcase
      else
        String.downcase_table_utf8[char] || char.downcase
      end
    end
  end
  # split drops trailing empty fields, so trailing spaces are not preserved.
  replace(words.join(' '))
end
|
203
|
+
|
204
|
+
|
205
|
+
# Returns the character index of the first occurrence of +s+, or nil.
#
# +s+ may be a Regexp or a String. A String is matched literally. nil is
# returned when the receiver is empty, when +s+ is empty (or a Regexp with
# an empty source), when +s+ is neither a String nor a Regexp, or when
# nothing matches.
#
# Differences from the previous implementation:
# * a match at position 0 now returns 0 instead of nil;
# * the regexp is rebuilt with Regexp.new instead of eval'ing its source;
# * String arguments are escaped instead of being interpreted as a pattern.
# The commented-out non-regex alternative was dropped.
def index_utf8(s)
  return nil if empty?

  if s.is_a?(Regexp)
    return nil if s.source.empty?
    # Keep only i/x/m; encoding flags are dropped so the pattern follows the
    # encoding of the string being searched.
    flags = s.options & (Regexp::IGNORECASE | Regexp::EXTENDED | Regexp::MULTILINE)
    pattern = Regexp.new(s.source, flags)
  elsif s.is_a?(String)
    return nil if s.empty?
    pattern = Regexp.new(Regexp.escape(s))
  else
    return nil
  end

  match = pattern.match(self)
  match ? match.begin(0) : nil # MatchData#begin is a character offset
end
|
258
|
+
|
259
|
+
|
260
|
+
# Returns the character index at which the last non-overlapping match of
# +s+ starts (matches are found left to right, as String#scan does), or nil.
#
# +s+ may be a Regexp or a String. A String is matched literally. nil is
# returned when the receiver is empty, when +s+ is empty (or a Regexp with
# an empty source), when +s+ is neither a String nor a Regexp, or when
# nothing matches.
#
# Differences from the previous implementation:
# * a last match at position 0 now returns 0 instead of nil;
# * the regexp is rebuilt with Regexp.new instead of eval'ing its source;
# * String arguments are escaped instead of being interpreted as a pattern.
def rindex_utf8(s)
  return nil if empty?

  if s.is_a?(Regexp)
    return nil if s.source.empty?
    # Keep only i/x/m; encoding flags are dropped so the pattern follows the
    # encoding of the string being searched.
    flags = s.options & (Regexp::IGNORECASE | Regexp::EXTENDED | Regexp::MULTILINE)
    pattern = Regexp.new(s.source, flags)
  elsif s.is_a?(String)
    return nil if s.empty?
    pattern = Regexp.new(Regexp.escape(s))
  else
    return nil
  end

  last_start = nil
  scan(pattern) { last_start = Regexp.last_match.begin(0) }
  last_start
end
|
286
|
+
|
287
|
+
|
288
|
+
# note that the i option does not work in special cases with back references
|
289
|
+
# example: "äÄ".slice_utf8(/(.).*?\1/i) returns nil whereas "aA".slice(/(.).*?\1/i) returns "aA"
|
290
|
+
# Returns the part of the string matched by +regex+ (a Regexp), or nil.
# The pattern is rebuilt from its source with only the i/x/m options, so it
# follows the encoding of the receiver. Regexp.new is used instead of
# eval'ing the source, which executed "#{...}" sequences found in the
# pattern and failed on an unbalanced "}".
def slice_utf8(regex)
  flags = regex.options & (Regexp::IGNORECASE | Regexp::EXTENDED | Regexp::MULTILINE)
  slice(Regexp.new(regex.source, flags))
end
|
298
|
+
|
299
|
+
# Removes the part of the string matched by +regex+ (a Regexp) from the
# receiver and returns it, or returns nil when nothing matches.
# The pattern is rebuilt from its source with only the i/x/m options, so it
# follows the encoding of the receiver. Regexp.new is used instead of
# eval'ing the source, which executed "#{...}" sequences found in the
# pattern and failed on an unbalanced "}".
def slice_utf8!(regex)
  flags = regex.options & (Regexp::IGNORECASE | Regexp::EXTENDED | Regexp::MULTILINE)
  slice!(Regexp.new(regex.source, flags))
end
|
307
|
+
|
308
|
+
# Returns +l+ characters starting at character position +p+.
#
# p - Integer start index; a negative value counts from the end, and a
#     negative value whose magnitude exceeds the length starts at 0.
# l - Integer number of characters; zero or a negative value yields "".
#
# Returns nil when +l+ is greater than the string length or +p+ lies past
# the last character (which includes every call on an empty string).
# Raises ArgumentError when either argument is not an Integer. The check
# uses Integer because the Fixnum constant no longer exists in current Ruby.
def cut_utf8(p,l) # (index) position, length
  unless p.is_a?(Integer) && l.is_a?(Integer)
    raise(ArgumentError, "Error: argument is not Fixnum", caller)
  end

  chars = scan(/./mu)
  s = chars.size
  if p < 0 then p = p.abs > s ? 0 : s - p.abs end # or: ... p.abs > s ? (return nil) : ...
  return nil if l > s || p > (s - 1)
  return "" if l <= 0

  chars[p, l].join
end
|
322
|
+
|
323
|
+
# Tells whether the string begins with +s+, compared character-wise via
# cut_utf8. Returns nil when the receiver or +s+ is empty.
def starts_with_utf8?(s)
  return nil if empty? || s.empty?

  prefix = cut_utf8(0, s.size_utf8)
  prefix == s
end
|
327
|
+
|
328
|
+
# Tells whether the string ends with +s+, compared character-wise via
# cut_utf8. Returns nil when the receiver or +s+ is empty.
def ends_with_utf8?(s)
  return nil if empty? || s.empty?

  wanted = s.size_utf8
  suffix = cut_utf8(-wanted, wanted)
  suffix == s
end
|
332
|
+
|
333
|
+
# Returns a new string with +s+ inserted before character position +i+.
#
# i - Integer index; a negative value counts from the end, and a negative
#     value whose magnitude exceeds the length inserts at 0. An index past
#     the end pads the text with spaces up to that index.
# s - String to insert.
#
# Returns the receiver itself when +s+ is empty, and +s+ itself when the
# receiver is empty. The receiver is never modified; the previous version
# appended the padding spaces to the receiver.
def insert_utf8(i,s) # insert_utf8(index, string)
  return self if s.empty?

  chars = scan(/./mu)
  l = chars.size
  return s if l == 0

  if i < 0 then i = i.abs > l ? 0 : l - i.abs end # or: ... i.abs > l ? (return nil) : ...

  head = chars[0, i].join
  head << " " * (i - l) if i > l # pad when inserting past the end
  tail = i < l ? chars[i..-1].join : ""
  head << s << tail
end
|
346
|
+
|
347
|
+
# Splits the receiver with a UTF-8 version of +regex+.
#
# The given Regexp is rebuilt from its source and options with its encoding
# fixed to UTF-8 (the equivalent of adding the /u flag) and handed to
# String#split.
#
# The regex is rebuilt with Regexp.new instead of eval-ing a generated
# "%r{...}u" literal: the eval form raises SyntaxError for a source holding an
# unbalanced "}" and executes any "#{...}" sequence found in the source.
#
# regex - a Regexp.
#
# Returns the Array produced by String#split.
def split_utf8(regex)
  # Keep the caller's i/m/x options, drop a possible /n flag and force UTF-8.
  flags = (regex.options & ~Regexp::NOENCODING) | Regexp::FIXEDENCODING
  split(Regexp.new(regex.source.encode(Encoding::UTF_8), flags))
end
|
355
|
+
|
356
|
+
# Scans the receiver with a UTF-8 version of +regex+.
#
# The given Regexp is rebuilt from its source and options with its encoding
# fixed to UTF-8 (the equivalent of adding the /u flag) and handed to
# String#scan.
#
# The regex is rebuilt with Regexp.new instead of eval-ing a generated
# "%r{...}u" literal: the eval form raises SyntaxError for a source holding an
# unbalanced "}" and executes any "#{...}" sequence found in the source.
#
# regex - a Regexp.
#
# With a block, each match is yielded (capture groups are passed as separate
# block arguments) and the result of String#scan with a block is returned.
# Without a block, returns the Array of matches from String#scan.
def scan_utf8(regex)
  # Keep the caller's i/m/x options, drop a possible /n flag and force UTF-8.
  flags = (regex.options & ~Regexp::NOENCODING) | Regexp::FIXEDENCODING
  utf8_regex = Regexp.new(regex.source.encode(Encoding::UTF_8), flags)
  return scan(utf8_regex) unless block_given?

  scan(utf8_regex) { |a, *m| yield(a, *m) }
end
|
364
|
+
|
365
|
+
# Returns the characters of the receiver selected by the Integer range +r+.
#
# r - a Range of character indexes. Negative endpoints count from the end of
#     the string. Both ".." (end included) and "..." (end excluded) work.
#
# The endpoints are read with Range#begin / Range#end / Range#exclude_end?
# rather than parsed out of Range#to_s with regular expressions; the string
# parsing silently turned non-integer endpoints (for example 'a'..'c') into
# index 0. A missing begin is treated as 0.
#
# Character counting and iteration are delegated to length_utf8 and
# each_utf8_char_with_index, which are defined elsewhere in this file.
#
# Returns nil when +r+ is not a Range, when an endpoint is not an Integer, when
# an endpoint's magnitude exceeds the character count, or when the normalized
# start lies after the normalized end; otherwise a new String.
def range_utf8(r)
  return nil if r.class != Range
  #raise(ArgumentError, "No Range object given!", caller) if r.class != Range

  a = r.begin.nil? ? 0 : r.begin
  b = r.end
  return nil unless a.is_a?(Integer) && b.is_a?(Integer)

  l = self.length_utf8
  return nil if b.abs > l || a.abs > l

  a = l - a.abs if a < 0
  b = l - b.abs if b < 0
  return nil if a > b

  # Index of the last character to copy.
  last = r.exclude_end? ? b - 1 : b

  str = ""
  each_utf8_char_with_index do |c, i|
    break if i > last

    str << c if i >= a
  end
  str
end
|
399
|
+
|
400
|
+
# Matches the receiver against UTF8REGEX, a constant defined elsewhere in this
# file (presumably a pattern describing well-formed UTF-8 -- confirm there).
#
# Returns the result of the =~ match: the match position, or nil if the
# pattern does not match.
def utf8?
  UTF8REGEX =~ self
end
|
403
|
+
|
404
|
+
# Builds a copy of the receiver that keeps only the characters matching
# UTF8REGEX (a constant defined elsewhere in this file).
#
# Returns a new String; the receiver is not modified.
def clean_utf8
  kept = ""
  self.scan(/./um) do |char|
    next unless char =~ UTF8REGEX

    kept << char
  end
  kept
end
|
409
|
+
|
410
|
+
|
411
|
+
# Checks (or rather guesses) whether the file or web page named by this string
# is UTF-8 encoded. Experimental, so use at your own risk!
#
# The receiver is treated as a URL when it starts with "http://" and as a local
# file path otherwise.
#
# Local path: false unless it is a regular file. On Mac OS X (RUBY_PLATFORM
# matches /darwin/) `file -ik` is asked first. After that a charset=/encoding=
# utf-8 declaration on any line wins; otherwise every line must pass utf8?.
#
# URL on other platforms: the page is read through open-uri. A UTF-8 charset
# reported for the response wins; otherwise every line must pass utf8?.
#
# URL on Mac OS X:
#   1. `curl -I` checks that the site is reachable.
#   2. The first charset=/encoding= declaration found in the page decides the
#      result when there is one.
#   3. Otherwise the page is downloaded to ~/Desktop/html and its whole content
#      is checked with utf8?.
#   4. Only when that download is empty, the page is fetched again and checked
#      line by line.
#
# Every value interpolated into a shell command is escaped with Shellwords, so
# a URL or path containing spaces, "&", ";" and the like cannot change the
# command that is run.
#
# Returns true or false.
# Raises RuntimeError when the web site cannot be reached with curl or the
# download file cannot be created.
def utf8_encoded_file? # check (or rather guess) if (HTML) file encoding is UTF-8 (experimental, so use at your own risk!)
  # Required here because the file-level requires are outside this method.
  require 'shellwords'

  file = self

  unless file =~ /^http:\/\//
    return false unless File.file?(file)

    if RUBY_PLATFORM =~ /darwin/i
      str = %x{ /usr/bin/file -ik #{Shellwords.escape(file)} }
      return true if str =~ /utf-?8/i
    end

    # check each line of the file; the first line that fails utf8? settles it
    File.foreach(file) do |line|
      return true if line =~ /(charset|encoding) *= *["']? *utf-?8/i
      return false unless line.utf8?
    end
    return true
  end

  url = file

  unless RUBY_PLATFORM =~ /darwin/i
    # check each line of the HTML file
    # cf. http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/index.html
    require 'open-uri'
    # Kernel#open stopped opening URLs in Ruby 3.0; URI.open is used when it is public.
    opener = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
    opener.call(url) do |f|
      return true if f.charset =~ /utf-?8/i

      f.each_line { |line| return false unless line.utf8? }
    end
    return true
  end

  # Mac OS X 10.4.10
  seconds = 30
  safe_url = Shellwords.escape(url)

  # check if web site is reachable
  # on Windows try to use curb, http://curb.rubyforge.org (sudo gem install curb)
  var = %x{ /usr/bin/curl -I -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{safe_url}; /bin/echo -n $? }.to_i

  raise "Failed to create connection to web site: #{url} -- curl error code: #{var} -- " unless var == 0

  # first charset/encoding declaration found in the page, if any
  str = %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{safe_url} | \
    /usr/bin/grep -Eo -m 1 \"(charset|encoding)=[\\"']?[^\\"'>]+\" | /usr/bin/grep -Eo \"[^=\\"'>]+$\" }
  return true if str =~ /utf-?8/i
  return false unless str.empty?

  # no declaration found: download the HTML file and inspect its content
  downloaded_file = File.expand_path("~/Desktop/html")
  safe_download = Shellwords.escape(downloaded_file)
  %x{ /usr/bin/touch #{safe_download} 2>/dev/null }
  raise "No valid HTML download file (path) specified!" unless File.file?(downloaded_file)
  %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} -o #{safe_download} #{safe_url} }

  # File.read closes the file again; the entire content is checked at once
  content = File.read(downloaded_file)
  return(content.utf8? ? true : false) unless content.empty?

  # in-memory fallback for an empty download: check each line of a fresh fetch
  page = %x{ /usr/bin/curl -L --fail --silent --connect-timeout #{seconds} --max-time #{seconds+10} #{safe_url} }
  page.each_line { |line| return false unless line.utf8? }
  true
end
|
526
|
+
|
527
|
+
|
528
|
+
# cf. Paul Battley, http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
|
529
|
+
# Returns a copy of the receiver, read as UTF-8, with every invalid byte
# sequence removed.
#
# String#scrub does what Iconv's 'UTF-8//IGNORE' conversion was used for here;
# Iconv has not been part of the standard library since Ruby 2.0.
#
# The receiver is not modified.
def validate_utf8
  dup.force_encoding(Encoding::UTF_8).scrub('')
end
|
532
|
+
|
533
|
+
# cf. Paul Battley, http://www.ruby-forum.com/topic/70357
|
534
|
+
# Converts the receiver to US-ASCII with Iconv, using the //IGNORE and
# //TRANSLIT conversion options, then removes runs of punctuation that sit
# between word boundaries (see the pattern below).
#
# NOTE(review): Iconv is not defined in this method; it has to be loaded
# elsewhere -- confirm.
#
# Returns nil unless the receiver passes utf8?, otherwise a new String.
def asciify_utf8
  return nil unless self.utf8?

  #Iconv.iconv('US-ASCII//IGNORE//TRANSLIT', 'UTF-8', (self + ' ') ).first[0..-2]
  # A space is appended before the conversion and cut off again afterwards.
  padded = self + ' '
  ascii = Iconv.iconv('US-ASCII//IGNORE//TRANSLIT', 'UTF-8', padded).first
  # delete all punctuation characters inside words except "-" in words such as up-to-date
  ascii[0..-2].gsub(/(?!-.*)\b[[:punct:]]+\b/, '')
end
|
540
|
+
|
541
|
+
# Converts the receiver's bytes from ISO-8859-1 to UTF-8.
#
# String#encode replaces the Iconv call, because Iconv has not been part of the
# standard library since Ruby 2.0. Bytes that cannot be converted are dropped,
# as they were with Iconv's //IGNORE option.
#
# Returns the converted String, or nil when the result does not pass utf8?.
def latin1_to_utf8 # ISO-8859-1 to UTF-8
  ret = dup.force_encoding(Encoding::ISO_8859_1)
           .encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
  ret.utf8? ? ret : nil
end
|
545
|
+
|
546
|
+
# Converts the receiver's bytes from CP1252 (WINDOWS-1252) to UTF-8.
#
# String#encode replaces the Iconv call, because Iconv has not been part of the
# standard library since Ruby 2.0. Bytes that cannot be converted are dropped,
# as they were with Iconv's //IGNORE option.
#
# Returns the converted String, or nil when the result does not pass utf8?.
def cp1252_to_utf8 # CP1252 (WINDOWS-1252) to UTF-8
  ret = dup.force_encoding(Encoding::Windows_1252)
           .encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
  ret.utf8? ? ret : nil
end
|
550
|
+
|
551
|
+
# cf. Paul Battley, http://www.ruby-forum.com/topic/70357
|
552
|
+
# Converts the receiver's bytes from UTF-16LE to UTF-8.
#
# Only an even number of bytes is converted; a dangling last byte is ignored.
# The even prefix is measured with bytesize/byteslice, because String#length
# counts characters rather than bytes since Ruby 1.9. One trailing NUL
# character, if present after conversion, is removed.
#
# String#encode replaces the Iconv call, because Iconv has not been part of the
# standard library since Ruby 2.0. Code units that cannot be converted are
# dropped, as they were with Iconv's //IGNORE option.
#
# Returns the converted String, or nil when the result does not pass utf8?.
def utf16le_to_utf8
  even_bytes = bytesize / 2 * 2
  ret = byteslice(0, even_bytes).force_encoding(Encoding::UTF_16LE)
                                .encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
  ret = ret.sub(/\x00\z/, '')
  ret.utf8? ? ret : nil
end
|
557
|
+
|
558
|
+
# Converts the receiver from UTF-8 to UTF-16LE.
#
# String#encode replaces the Iconv call, because Iconv has not been part of the
# standard library since Ruby 2.0. Sequences that cannot be converted are
# dropped, as they were with Iconv's //IGNORE option.
#
# Returns nil unless the receiver passes utf8?, otherwise a new String in
# UTF-16LE.
def utf8_to_utf16le
  return nil unless self.utf8?

  dup.force_encoding(Encoding::UTF_8)
     .encode(Encoding::UTF_16LE, invalid: :replace, undef: :replace, replace: '')
end
|
562
|
+
|
563
|
+
# Writes every character of the receiver as "U+" followed by its code point in
# upper-case hexadecimal, zero-padded to at least four digits.
#
# Returns nil unless the receiver passes utf8?, otherwise a new String.
def utf8_to_unicode
  return nil unless self.utf8?

  notation = ""
  scan(/./mu) do |char|
    code_point = char.unpack("U*").first
    notation << "U+" << format("%04X", code_point)
  end
  notation
end
|
569
|
+
|
570
|
+
# Replaces every "U+XXXX" token in the receiver with the character for that
# hexadecimal code point. All other characters are copied unchanged, so mixed
# strings such as "U+00bfHabla espaU+00f1ol?" work.
#
# A token is "U+" followed by six hex digits starting with "10", or by four to
# five hex digits. The six-digit form is tried first: with the shorter form in
# front it could never match, and "U+10FFFF" was read as U+10FFF followed by a
# literal "F".
#
# Returns the receiver itself when it is empty or whitespace only; otherwise
# the decoded String, or nil when the result does not pass utf8?.
def unicode_to_utf8
  return self if self =~ /\A[[:space:]]*\z/m

  str = ""
  scan(/(U\+(?:10[[:digit:][:xdigit:]]{4}|[[:digit:][:xdigit:]]{4,5})|.)/mu) do |groups|
    token = groups.first
    if token.start_with?("U+")
      # drop the "U+" prefix and turn the hex digits into a character
      str << [token[2..-1].hex].pack("U*")
    else
      str << token
    end
  end
  str.utf8? ? str : nil
end
|
585
|
+
|
586
|
+
|
587
|
+
# dec, hex, oct conversions (experimental!)
|
588
|
+
|
589
|
+
# Encodes the receiver as the decimal code points of its characters, separated
# by "\x00". A NUL character in the input is written as "aaa".
#
# Returns nil unless the receiver passes utf8?, otherwise a new String without
# a trailing separator.
def utf8_to_dec
  return nil unless self.utf8?

  encoded = ""
  scan(/./mu) do |char|
    # encode \x00 as "aaa", everything else as its decimal code point
    token = char == "\x00" ? "aaa" : char.unpack("U*").first.to_s
    encoded << token << "\x00"
  end
  encoded[0..-2]
end
|
601
|
+
|
602
|
+
# Decodes a string of decimal code points separated by "\x00" into UTF-8 text.
# The token "aaa" stands for a NUL character.
#
# Input is accepted when it consists only of digits, "a" and "\x00", and its
# first token is a number or "aaa". The first token no longer needs a "\x00"
# after it, so a single-token encoding ("65") and an encoding that starts with
# "aaa" are decoded instead of being rejected with nil.
#
# Returns the receiver itself when it is empty, nil when the input is not
# accepted, otherwise a new UTF-8 String.
def dec_to_utf8 # \x00 is encoded as "aaa"
  return self if self.empty?
  return nil unless self =~ /\A[a[:digit:]\x00]+\z/ && self =~ /\A(?:aaa|[[:digit:]]+)(?:\x00|\z)/

  code_points = split(/\x00/).map { |token| token.eql?("aaa") ? 0 : token.to_i }
  code_points.pack("U*")
end
|
615
|
+
|
616
|
+
|
617
|
+
# Encodes the receiver byte by byte in decimal: the bytes of one character are
# separated by "\x00" and characters are separated by "\x00\x00". A NUL
# character in the input is written as "aaa".
#
# Returns nil unless the receiver passes utf8?, otherwise a new String without
# a trailing separator.
def utf8_to_dec_2
  return nil unless self.utf8?

  separator = "\x00"
  encoded = ""
  scan(/./mu) do |char|
    if char == separator
      encoded << "aaa\x00\x00" # encode \x00 as "aaa"
    else
      char.each_byte { |byte| encoded << byte.to_s << separator } # convert to decimal
      encoded << separator
    end
  end
  encoded[0..-3]
end
|
633
|
+
|
634
|
+
# Decodes a string of decimal byte values into UTF-8 text. The bytes of one
# character are separated by "\x00", characters by "\x00\x00", and the token
# "aaa" stands for a NUL character.
#
# Input is accepted when it consists only of digits, "a" and "\x00", and its
# first token is a number or "aaa". A "\x00\x00" separator is no longer
# demanded, so the encoding of a single character ("65", "195\x00169") and an
# encoding that starts with "aaa" are decoded instead of being rejected.
#
# The bytes are collected in a binary buffer and tagged as UTF-8 at the end.
# Integer#chr yields a binary string for values above 127, which left the
# result tagged as ASCII-8BIT.
#
# Returns the receiver itself when it is empty, nil when the input is not
# accepted, otherwise a new String tagged as UTF-8.
# Raises RangeError (from Integer#chr) for a value above 255.
def dec_to_utf8_2 # \x00 is encoded as "aaa"
  return self if self.empty?
  return nil unless self =~ /\A[a[:digit:]\x00]+\z/ && self =~ /\A(?:aaa|[[:digit:]]+)(?:\x00|\z)/

  str = String.new
  split(/\x00\x00/).each do |c|
    if c =~ /\x00/
      c.split(/\x00/).each { |x| str << x.to_i.chr }
    elsif c.eql?("aaa")
      str << "\x00"
    else
      str << c.to_i.chr
    end
  end
  str.force_encoding(Encoding::UTF_8)
end
|
649
|
+
|
650
|
+
|
651
|
+
# Encodes the receiver byte by byte in upper-case hexadecimal: the bytes of one
# character are separated by "\x00" and characters are separated by
# "\x00\x00". A NUL character in the input is written as "aaa".
#
# Returns nil unless the receiver passes utf8?, otherwise a new String without
# a trailing separator.
def utf8_to_hex
  return nil unless self.utf8?

  separator = "\x00"
  encoded = ""
  scan(/./mu) do |char|
    if char == separator
      encoded << "aaa\x00\x00" # encode \x00 as "aaa"
    else
      char.each_byte { |byte| encoded << byte.to_s(16).upcase << separator } # convert to hexadecimal
      encoded << separator
    end
  end
  encoded[0..-3]
end
|
667
|
+
|
668
|
+
# Decodes a string of hexadecimal byte values into UTF-8 text. The bytes of one
# character are separated by "\x00", characters by "\x00\x00", and the token
# "aaa" stands for a NUL character.
#
# Input is accepted when it consists only of hex digits and "\x00" and starts
# with a hex token. A "\x00\x00" separator is no longer demanded, so the
# encoding of a single character ("41", "C3\x00A9") is decoded instead of
# being rejected.
#
# The bytes are collected in a binary buffer and tagged as UTF-8 at the end.
# Integer#chr yields a binary string for values above 127, which left the
# result tagged as ASCII-8BIT.
#
# Returns the receiver itself when it is empty, nil when the input is not
# accepted, otherwise a new String tagged as UTF-8.
# Raises RangeError (from Integer#chr) for a value above 255.
def hex_to_utf8 # \x00 is encoded as "aaa"
  return self if self.empty?
  return nil unless self =~ /\A[a[:xdigit:]\x00]+\z/ && self =~ /\A[[:xdigit:]]+(?:\x00|\z)/

  str = String.new
  split(/\x00\x00/).each do |c|
    if c =~ /\x00/
      c.split(/\x00/).each { |x| str << x.hex.chr }
    elsif c.eql?("aaa")
      str << "\x00"
    else
      str << c.hex.chr
    end
  end
  str.force_encoding(Encoding::UTF_8)
end
|
683
|
+
|
684
|
+
|
685
|
+
# Encodes the receiver byte by byte in octal: the bytes of one character are
# separated by "\x00" and characters are separated by "\x00\x00". A NUL
# character in the input is written as "aaa".
#
# Returns nil unless the receiver passes utf8?, otherwise a new String without
# a trailing separator.
def utf8_to_oct
  return nil unless self.utf8?

  separator = "\x00"
  encoded = ""
  scan(/./mu) do |char|
    if char == separator
      encoded << "aaa\x00\x00" # encode \x00 as "aaa"
    else
      char.each_byte { |byte| encoded << byte.to_s(8) << separator } # convert to octal
      encoded << separator
    end
  end
  encoded[0..-3]
end
|
701
|
+
|
702
|
+
# Decodes a string of octal byte values into UTF-8 text. The bytes of one
# character are separated by "\x00", characters by "\x00\x00", and the token
# "aaa" stands for a NUL character.
#
# Input is accepted when it consists only of digits, "a" and "\x00", and its
# first token is a number or "aaa". A "\x00\x00" separator is no longer
# demanded, so the encoding of a single character ("101", "303\x00251") and an
# encoding that starts with "aaa" are decoded instead of being rejected.
#
# The bytes are collected in a binary buffer and tagged as UTF-8 at the end.
# Integer#chr yields a binary string for values above 127, which left the
# result tagged as ASCII-8BIT.
#
# Returns the receiver itself when it is empty, nil when the input is not
# accepted, otherwise a new String tagged as UTF-8.
# Raises RangeError (from Integer#chr) for a value above 255.
def oct_to_utf8 # \x00 is encoded as "aaa"
  return self if self.empty?
  return nil unless self =~ /\A[a[:digit:]\x00]+\z/ && self =~ /\A(?:aaa|[[:digit:]]+)(?:\x00|\z)/

  str = String.new
  split(/\x00\x00/).each do |c|
    if c =~ /\x00/
      c.split(/\x00/).each { |x| str << x.oct.chr }
    elsif c.eql?("aaa")
      str << "\x00"
    else
      str << c.oct.chr
    end
  end
  str.force_encoding(Encoding::UTF_8)
end
|
717
|
+
|
718
|
+
# cf. http://node-0.mneisen.org/2007/03/13/email-subjects-in-utf-8-mit-ruby-kodieren/
|
719
|
+
# Wraps the receiver as an encoded e-mail subject word: the text is Base64
# encoded (pack("m") with the line breaks removed) and placed between
# "=?utf-8?b?" and "?=".
#
# Returns nil unless the receiver passes utf8?, otherwise a new String.
def email_subject_utf8
  return nil unless self.utf8?

  base64 = [self].pack("m").delete("\n")
  "=?utf-8?b?#{base64}?="
end
|
723
|
+
|
724
|
+
end
|
725
|
+
|
726
|
+
|
metadata
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: prawn-arabic
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Dynamix Solutions
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-04-19 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Gem which improves the workflow with Arabic text
|
14
|
+
email: ahmed.nasser@dynamix-systems.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files:
|
18
|
+
- README.md
|
19
|
+
- LICENSE
|
20
|
+
files:
|
21
|
+
- LICENSE
|
22
|
+
- README.md
|
23
|
+
- Rakefile
|
24
|
+
- lib/prawn-arabic.rb
|
25
|
+
- lib/string_utf_support.rb
|
26
|
+
homepage: https://github.com/ozeron/arabic-prawn
|
27
|
+
licenses:
|
28
|
+
- MIT
|
29
|
+
metadata: {}
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options: []
|
32
|
+
require_paths:
|
33
|
+
- lib
|
34
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
requirements: []
|
45
|
+
rubyforge_project:
|
46
|
+
rubygems_version: 2.5.1
|
47
|
+
signing_key:
|
48
|
+
specification_version: 4
|
49
|
+
summary: Arabic language string helpers
|
50
|
+
test_files: []
|