bidi 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +19 -0
- data/README.md +57 -0
- data/lib/README +36 -0
- data/lib/bidi.rb +2 -0
- data/lib/bidi/bidi.rb +771 -0
- data/lib/bidi/datformirror.rb +34 -0
- data/lib/bidi/indexfile.rb +31 -0
- data/lib/bidi/weakhashmap.rb +53 -0
- data/lib/data/BidiMirroring.dat +0 -0
- data/lib/data/BidiMirroring.txt +611 -0
- data/lib/data/UnicodeData.idx +0 -0
- data/lib/data/UnicodeData.txt +24428 -0
- metadata +55 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1e84e59022f13c923a1a5a9d89f3ccda0eb8fd00
|
4
|
+
data.tar.gz: 12fb80c3da23660d578c4748e8677e8eca1c7221
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3e52bd9c42d30666cda2214177025d8a4bfe2de716d4c61d4dbd63672fb46ecf87fdcaa8096d2c03fdc555b16087adc876d23630cb7a1b0e79595887625136b2
|
7
|
+
data.tar.gz: 50cf772702417cdb69e1e2dcb819f1c80b5b86d4ef43b93f16b0b0ebfc232ce2b1f05af4d6ed4c9038d5044ecffe922841df4a2e91c65b1964de0f1caf5b1da0
|
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2014 Amit Yaron <amit@phpandmore.net>
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
# Ruby BiDi
|
2
|
+
|
3
|
+
A Ruby gem that helps you work with bidirectional (left-to-right and right-to-left) text.
|
4
|
+
|
5
|
+
### Install
|
6
|
+
|
7
|
+
```shell
|
8
|
+
gem install bidi
|
9
|
+
```
|
10
|
+
|
11
|
+
### Use
|
12
|
+
|
13
|
+
Require the `bidi` module and use `to_visual`:
|
14
|
+
|
15
|
+
```ruby
|
16
|
+
require "bidi"
|
17
|
+
|
18
|
+
bidi = Bidi.new
|
19
|
+
bidi_string = bidi.to_visual "משפט עם עברית ו-English. מספרים: 12345 (וגם כל מיני סימני פיסוק) וגם סימן קריאה!"
|
20
|
+
```
|
21
|
+
|
22
|
+
When rendering right-to-left text, some writers require reversing the string before passing it to them. [Prawn](https://github.com/prawnpdf/prawn) is one such example. The `render_visual` function does this for you:
|
23
|
+
|
24
|
+
```ruby
|
25
|
+
require "prawn"
|
26
|
+
require "bidi"
|
27
|
+
|
28
|
+
Prawn::Document.generate("hello.pdf") do
|
29
|
+
self.text_direction = :rtl
|
30
|
+
|
31
|
+
bidi = Bidi.new
|
32
|
+
text bidi.render_visual "משפט עם עברית ו-English. מספרים: 12345 (וגם כל מיני סימני פיסוק) וגם סימן קריאה!"
|
33
|
+
end
|
34
|
+
|
35
|
+
```
|
36
|
+
|
37
|
+
### License
|
38
|
+
|
39
|
+
Copyright (c) 2014 Amit Yaron <<amit@phpandmore.net>>
|
40
|
+
|
41
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
42
|
+
of this software and associated documentation files (the "Software"), to deal
|
43
|
+
in the Software without restriction, including without limitation the rights
|
44
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
45
|
+
copies of the Software, and to permit persons to whom the Software is
|
46
|
+
furnished to do so, subject to the following conditions:
|
47
|
+
|
48
|
+
The above copyright notice and this permission notice shall be included in
|
49
|
+
all copies or substantial portions of the Software.
|
50
|
+
|
51
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
52
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
53
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
54
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
55
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
56
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
57
|
+
THE SOFTWARE.
|
data/lib/README
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
Bidirectional Text
|
2
|
+
==================
|
3
|
+
This package contains a function that converts your input logical UTF-8 string into a visual string according to the Bidi algorithm
|
4
|
+
found in http://www.unicode.org/reports/tr9/
|
5
|
+
|
6
|
+
Requirements:
|
7
|
+
* Ruby 1.9
|
8
|
+
* The Ruby library 'weakref'
|
9
|
+
|
10
|
+
The conversion function is found in "bidi.rb"
|
11
|
+
|
12
|
+
To use the conversion function:
|
13
|
+
1. Define an object of class 'Bidi'. We'll call this object bidi.
|
14
|
+
2. Call 'bidi.to_visual <your string>, <default paragraph direction>'
|
15
|
+
The values for default paragraph direction:
|
16
|
+
* 'R' or 'RTL' - Right to Left text.
|
17
|
+
* 'L' or 'LTR' - Left to right text.
|
18
|
+
* other values or omitted - the default for each paragraph.
|
19
|
+
|
20
|
+
Constants:
|
21
|
+
* Bidi.RLE - Right to left embedding.
|
22
|
+
* Bidi.LRE - Left to right embedding.
|
23
|
+
* Bidi.RLO - Right to left override.
|
24
|
+
* Bidi.LRO - Left to right override.
|
25
|
+
* Bidi.PDF - Pop Directional Formatting.
|
26
|
+
* Bidi.RLM - Right to left mark.
|
27
|
+
* Bidi.LRM - Left to right mark.
|
28
|
+
|
29
|
+
To run a script that calls 'bidi.to_visual', type
|
30
|
+
ruby -Ku <script name.rb>
|
31
|
+
|
32
|
+
'K' stands for Kanji, the characters commonly used in Japan and in China. This option causes Ruby to interpret the extended character set as UTF-8, and prevents the embarrassing error message 'invalid multibyte char (US-ASCII)'.
|
33
|
+
|
34
|
+
|
35
|
+
"bidi.rb" also contains a method named "to_utf8_char", which extends the Integer class. You can use it to define additional UTF-8 characters.
|
36
|
+
|
data/lib/bidi.rb
ADDED
data/lib/bidi/bidi.rb
ADDED
@@ -0,0 +1,771 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require 'bidi/weakhashmap'
|
4
|
+
|
5
|
+
# Extends Integer with a code point -> UTF-8 conversion used by Bidi.
class Integer
  # Encodes this integer, taken as a Unicode code point, as UTF-8.
  #
  # Returns a one-character String tagged with the UTF-8 encoding.
  # Raises RangeError when the value lies outside 0..0x10FFFF.
  def to_utf8_char
    # The class and the message must be separated by a comma: the previous
    # `raise RangeError "..."` tried to call a method named RangeError and
    # therefore ended in NoMethodError instead of RangeError.
    if self < 0 || self > 0x10ffff
      raise RangeError, "Value #{self} is out of range for UTF8 Char"
    end

    # The 'U' pack directive performs the UTF-8 encoding.
    [self].pack('U')
  end
end
|
38
|
+
|
39
|
+
$weakHashMap = WeakHashMap.new
|
40
|
+
$mirrorMap = WeakHashMap.new
|
41
|
+
|
42
|
+
# Raised by Bidi#to_visual when the input bytes cannot be decoded as UTF-8.
#
# Derives from StandardError so that a plain `rescue` catches it; code that
# rescues StringError or Exception explicitly keeps working.
class StringError < StandardError
  # byte        - text listing the offending byte value(s).
  # afterString - the part of the input that was decoded before the failure.
  def initialize byte, afterString
    @byte = byte
    @afterString = afterString
    super()
  end

  # Human-readable description of the failure. The instance variables are
  # used here; the bare names `byte` / `afterString` are not defined in this
  # scope and made every call fail with NameError.
  def message
    "Unexpected byte(s): #{@byte} after '#{@afterString}'"
  end
end
|
51
|
+
|
52
|
+
class Bidi
|
53
|
+
# constants
|
54
|
+
# Each accessor below returns a newly built UTF-8 string holding a single
# directional formatting character.

# U+202B, right-to-left embedding.
def self.RLE
  [0x202b].pack('U')
end

# U+202A, left-to-right embedding.
def self.LRE
  [0x202a].pack('U')
end

# U+202E, right-to-left override.
def self.RLO
  [0x202e].pack('U')
end

# U+202D, left-to-right override.
def self.LRO
  [0x202d].pack('U')
end

# U+200E, left-to-right mark.
def self.LRM
  [0x200e].pack('U')
end

# U+200F, right-to-left mark.
def self.RLM
  [0x200f].pack('U')
end

# U+202C, pop directional formatting.
def self.PDF
  [0x202c].pack('U')
end
|
81
|
+
|
82
|
+
# Hash describing one paragraph: its 'level' and its 'characters' list.
class ParagraphType < Hash
  # default_direction - 'R'/'RTL' gives level 1, 'L'/'LTR' gives level 0
  #                     (case-insensitive); anything else, including nil,
  #                     gives -1.
  def initialize(default_direction = nil)
    requested = default_direction && default_direction.upcase
    self['level'] =
      if %w[R RTL].include?(requested)
        1
      elsif %w[L LTR].include?(requested)
        0
      else
        -1
      end
    self['characters'] = []
  end
end
|
96
|
+
|
97
|
+
# Hash describing one character: 'value', 'bidiType' and 'mirroredInd'.
class UtfChar < Hash
  # Stores the three arguments under the keys 'value', 'bidiType' and
  # 'mirroredInd'.
  def initialize(value, bidiType, mirroredInd)
    store('value', value)
    store('bidiType', bidiType)
    store('mirroredInd', mirroredInd)
  end

  # True when the current bidi type is one of B, S, WS or ON.
  def is_neutral
    %w[B S WS ON].include?(self['bidiType'])
  end
end
|
109
|
+
|
110
|
+
CHAR_START=1
|
111
|
+
CHAR_END=2
|
112
|
+
CHAR_BEFORE_LAST=3
|
113
|
+
CHAR_SECOND_OF_FOUR=4
|
114
|
+
|
115
|
+
# Builds the finalizer used by #initialize. It is created in a class method
# so the proc does not capture the instance as `self`; a finalizer that
# references the object it is attached to can never run on garbage
# collection.
def self.file_closer(files)
  proc { files.each { |file| file.close unless file.closed? } }
end

# Opens the index, data and mirroring files and records their record counts.
#
# The data directory is taken from the loaded "bidi" gem specification when
# there is one. Otherwise it falls back to ../data relative to this file
# (assumes this file sits in lib/bidi/ next to lib/data/ -- TODO confirm for
# non-gem layouts).
def initialize
  @@idx_record_len=7
  spec = Gem.loaded_specs["bidi"]
  base = if spec
    File.join(spec.full_gem_path, "lib", "data")
  else
    File.expand_path("../data", File.dirname(__FILE__))
  end
  @dataPath = base + "/"
  # The two lookup tables hold fixed-width binary records, so they are opened
  # in binary mode; the data file stays in text mode as before.
  @idxFile = File.open(@dataPath + "UnicodeData.idx", "rb")
  @dataFile = File.open(@dataPath + "UnicodeData.txt", "r")
  @mirrorFile = File.open(@dataPath + "BidiMirroring.dat", "rb")
  ObjectSpace.define_finalizer(self, self.class.file_closer([@idxFile, @dataFile, @mirrorFile]))
  @num_of_indexes = @idxFile.stat.size / @@idx_record_len
  @mirror_record_len=6
  @num_of_mirror_chars=@mirrorFile.stat.size / @mirror_record_len
end # initialize
|
126
|
+
|
127
|
+
# Looks up the data record for a code point.
#
# key - Integer code point.
#
# Returns the record split on ';' (empty trailing fields kept), or nil when
# the index has no entry for the key. Results are cached in $weakHashMap.
#
# Each index record is @@idx_record_len bytes: a 3-byte big-endian code point
# followed by the big-endian byte offset of its line in the data file.
def retrieve_rec key
  cached = $weakHashMap[key]
  return cached if cached

  low = 0
  # Index of the last record. Starting one past it made lookups of keys
  # above the largest entry read at end of file and fail on nil.
  high = @num_of_indexes - 1
  while low <= high
    mid = (low + high) / 2
    @idxFile.pos = mid * @@idx_record_len
    entry = @idxFile.read(@@idx_record_len)
    # A short or missing read means the index is truncated: report "not found".
    break unless entry && entry.bytesize == @@idx_record_len

    bytes = entry.bytes.to_a
    code_point = bytes[0, 3].inject(0) { |acc, b| (acc << 8) | b }
    if code_point == key
      offset = bytes[3..-1].inject(0) { |acc, b| (acc << 8) | b }
      @dataFile.pos = offset
      fields = @dataFile.readline.split(';', -1)
      $weakHashMap[key] = fields
      return fields
    end

    if key < code_point
      high = mid - 1
    else
      low = mid + 1
    end
  end
  nil
end
|
165
|
+
|
166
|
+
# Splits a paragraph into runs of characters that share one level and
# stores them in par['runs'].
#
# Each run is a Hash with 'sor' (first index), 'eor' (index one past the
# last character), and 'sorType' / 'eorType' ('L' or 'R'). Characters with
# no 'level' are skipped when looking for a level change.
def split_into_runs(par)
  chars = par['characters']
  runs = par['runs'] = []
  direction = lambda { |level| level.odd? ? 'R' : 'L' }
  run_start = 0
  run_level = par['level']

  chars.each_with_index do |char, index|
    level = char['level']
    next if !level || level == run_level

    runs.push({
      'sor' => run_start,
      'sorType' => direction.call(chars[run_start]['level']),
      'eor' => index,
      'eorType' => direction.call(level)
    })
    run_start = index
    run_level = level
  end

  # The last run ends at the end of the paragraph; its end type comes from
  # the paragraph level.
  runs.push({
    'sor' => run_start,
    'sorType' => direction.call(chars[run_start]['level']),
    'eor' => chars.length,
    'eorType' => direction.call(par['level'])
  })
end
|
195
|
+
|
196
|
+
# Gives every nonspacing mark (NSM) in the run the type of the most recent
# L, R or AL character, starting from the run's 'sorType', and tags it with
# 'origType' = 'NSM'. European numbers (EN) seen while that most recent
# type is AL are turned into AN.
def resolve_nsm(par, run)
  chars = par['characters']
  last_strong = run['sorType']
  run['sor'].upto(run['eor'] - 1) do |pos|
    char = chars[pos]
    type = char['bidiType']
    if type == 'NSM'
      char['bidiType'] = last_strong
      char['origType'] = 'NSM'
    elsif %w[L R AL].include?(type)
      last_strong = type
    elsif type == 'EN' && last_strong == 'AL'
      char['bidiType'] = 'AN'
    end
  end
end
|
215
|
+
|
216
|
+
# Rewrites the bidi type AL as R for every character of the run.
def change_AL_to_R(par, run)
  chars = par['characters']
  run['sor'].upto(run['eor'] - 1) do |pos|
    char = chars[pos]
    next unless char['bidiType'] == 'AL'
    char['bidiType'] = 'R'
  end
end
|
225
|
+
|
226
|
+
# Resolves separators inside one run:
# * an ES or CS standing between two ENs becomes EN;
# * a CS standing between two ANs becomes AN;
# * every other ES or CS becomes ON.
# Neighbours outside the run are not consulted.
def handle_cs_and_es(par, run)
  chars = par['characters']
  first = run['sor']
  last = run['eor'] - 1
  first.upto(last) do |pos|
    kind = chars[pos]['bidiType']
    next unless kind == 'ES' || kind == 'CS'

    left = pos > first ? chars[pos - 1]['bidiType'] : nil
    right = pos < last ? chars[pos + 1]['bidiType'] : nil
    chars[pos]['bidiType'] =
      if left == 'EN' && right == 'EN'
        'EN'
      elsif kind == 'CS' && left == 'AN' && right == 'AN'
        'AN'
      else
        'ON'
      end
  end
end
|
257
|
+
|
258
|
+
# Resolves sequences of European numbers (EN) and European terminators (ET)
# inside one run. A maximal stretch of consecutive EN/ET characters becomes
# all EN when it contains at least one EN, and all ON otherwise.
#
# A stretch that reaches the end of the run is resolved too. The earlier
# state machine only resolved a stretch when a different type followed it,
# so a trailing stretch (for example the "%" of "100%" at the end of a run)
# was left as ET.
def handle_en_et_sequences par, run
  chars = par['characters']
  first = run['sor']
  last = run['eor'] - 1
  seq_start = nil      # index where the open stretch began, nil when none
  has_number = false   # whether the open stretch contains an EN

  finish = lambda do |seq_end|
    resolved = has_number ? 'EN' : 'ON'
    seq_start.upto(seq_end) { |i| chars[i]['bidiType'] = resolved }
    seq_start = nil
    has_number = false
  end

  first.upto(last) do |ind|
    type = chars[ind]['bidiType']
    if type == 'EN' || type == 'ET'
      seq_start ||= ind
      has_number ||= (type == 'EN')
    elsif seq_start
      finish.call(ind - 1)
    end
  end
  finish.call(last) if seq_start
  nil
end
|
307
|
+
|
308
|
+
# Resolves neutral characters (see UtfChar#is_neutral) inside one run.
#
# A sequence of neutrals takes the direction of the text around it when both
# sides agree, and the direction of its own level (odd 'R', even 'L')
# otherwise. R, AN and EN count as 'R'; L counts as 'L'. The run's 'sorType'
# stands in for the text before the run and 'eorType' for the text after it.
# Characters that are neither neutral nor L/R/AN/EN do not end an open
# sequence and are left unchanged.
#
# Corrections over the earlier implementation:
# * `ind=eor_m1` was an assignment, hence always true, so any character
#   after the first neutral closed the sequence; sequences of two or more
#   neutrals were only partly resolved;
# * a sequence reaching the end of the run was never resolved;
# * the leading direction was not updated after a sequence was closed.
def resolve_neutral_types par, run
  chars = par['characters']
  first = run['sor']
  last = run['eor'] - 1
  before = run['sorType']   # direction preceding the open sequence
  seq_start = nil
  seq_end = nil

  settle = lambda do |after|
    agreed = before == after ? after : nil
    seq_start.upto(seq_end) do |i|
      char = chars[i]
      next unless char.is_neutral
      char['bidiType'] = agreed || (char['level'].odd? ? 'R' : 'L')
    end
    seq_start = seq_end = nil
  end

  first.upto(last) do |ind|
    char = chars[ind]
    strong =
      case char['bidiType']
      when 'R', 'AN', 'EN' then 'R'
      when 'L' then 'L'
      end

    if char.is_neutral
      seq_start ||= ind
      seq_end = ind
    elsif seq_start
      if strong
        settle.call(strong)
        before = strong
      end
    else
      # May become nil for a type that carries no direction, as before.
      before = strong
    end
  end
  settle.call(run['eorType']) if seq_start
  nil
end
|
359
|
+
|
360
|
+
# Raises each character's level according to its bidi type:
# * L  on an odd level  -> +1
# * R  on an even level -> +1
# * AN / EN             -> +1 on an odd level, +2 on an even level
# Line feed (0x0A) and carriage return (0x0D) always end up on level 0.
def resolve_implicit_levels(par)
  par['characters'].each do |char|
    base = char['level']
    bump =
      case char['bidiType']
      when 'L' then base.odd? ? 1 : 0
      when 'R' then base.even? ? 1 : 0
      when 'AN', 'EN' then base.odd? ? 1 : 2
      else 0
      end
    char['level'] = base + bump unless bump.zero?
    char['level'] = 0 if [0x0A, 0x0D].include?(char['value'])
  end
end
|
377
|
+
|
378
|
+
# Puts paragraph separators (B) and segment separators (S) back on the
# paragraph level, together with any whitespace (WS) directly in front of
# them or at the very end of the paragraph.
def reset_separator_levels(par)
  base_level = par['level']
  chars = par['characters']
  trailing = true   # true while only WS has been seen since the last B/S (or the end)
  (chars.length - 1).downto(0) do |pos|
    char = chars[pos]
    case char['bidiType']
    when 'B', 'S'
      trailing = true
      char['level'] = base_level
    when 'WS'
      char['level'] = base_level if trailing
    else
      trailing = false
    end
  end
end
|
396
|
+
|
397
|
+
|
398
|
+
# Resolves the bidi types of every run of the paragraph, then derives the
# final levels.
#
# The per-run steps run once for each entry of par['runs']. Dropping the run
# list, resolving implicit levels and resetting separator levels all act on
# the whole paragraph, so they run once after the loop. They used to sit
# inside the loop, which re-applied the level adjustments once per run and
# changed levels before later runs had their types resolved.
def resolve_weak_types par
  par['runs'].each do |run|
    resolve_nsm par, run
    change_AL_to_R par, run
    handle_cs_and_es par, run
    handle_en_et_sequences par, run
    resolve_neutral_types par, run
  end
  par.delete 'runs'
  resolve_implicit_levels par
  reset_separator_levels par
end
|
411
|
+
|
412
|
+
#
# Reverses the characters written right-to-left (odd levels).
#
# For every level present, the first and last index carrying it are
# recorded. Levels are then visited from the highest down to the lowest odd
# one. Each level's span is widened to cover the span of the level visited
# just before it, and rearrange_level is called for the lowest odd level and
# for every level whose parity differs from the next lower level present.
# Nothing happens when the paragraph has no odd level.
#
def reverse_rtl_chars(par)
  spans = {}
  par['characters'].each_with_index do |char, pos|
    span = (spans[char['level']] ||= { 'start' => pos })
    span['end'] = pos
  end

  lowest_odd = spans.keys.select { |level| level.odd? }.min
  return unless lowest_odd

  current = spans.keys.max
  loop do
    span = spans[current]
    if current == lowest_odd
      rearrange_level par, current, span
      break
    end

    # Next lower level that occurs in the paragraph.
    below = spans.keys.select { |level| level < current }.max
    lower = spans[below]
    lower['start'] = span['start'] if span['start'] < lower['start']
    lower['end'] = span['end'] if span['end'] > lower['end']

    rearrange_level par, current, span if below.odd? != current.odd?
    current = below
  end
end
|
458
|
+
|
459
|
+
|
460
|
+
# Processes one paragraph in place: applies explicit embeddings and
# overrides, removes the formatting characters, then resolves types and
# levels and reorders the right-to-left parts.
#
# par - paragraph Hash with 'level' (-1 means "not determined", treated as
#       0) and 'characters'.
#
# Changes over the earlier implementation:
# * a PDF with nothing left to pop is ignored; it used to pop an empty stack
#   and fail on nil;
# * a paragraph that is empty once the formatting characters are removed is
#   left as is; later steps index into the character list and would fail;
# * the four embedding/override branches are merged, with the same rules.
def handle_paragraph par
  par['level']=0 if par['level']==-1
  embedding_level = par['level']
  override_status = nil
  level_stack = []
  invalid_level_changes = 0

  par['characters'].each do |char|
    bidi_type = char['bidiType']
    case bidi_type
    when 'RLE', 'LRE', 'RLO', 'LRO'
      rtl = bidi_type.start_with?('R')
      # Next higher level of the wanted parity: odd for RLE/RLO, even for
      # LRE/LRO.
      step = (embedding_level.odd? == rtl) ? 2 : 1
      candidate = embedding_level + step
      if candidate <= 61
        level_stack.push({ 'level' => embedding_level, 'override_status' => override_status })
        embedding_level = candidate
        override_status = bidi_type.end_with?('O') ? (rtl ? 'R' : 'L') : nil
      else
        invalid_level_changes += 1
      end
    when 'PDF'
      if invalid_level_changes > 0
        invalid_level_changes -= 1
      elsif !level_stack.empty?
        saved = level_stack.pop
        embedding_level = saved['level']
        override_status = saved['override_status']
      end
    else
      if bidi_type != 'BN'
        char['level'] = embedding_level
        char['bidiType'] = override_status if override_status
      end
    end
  end

  par['characters'].delete_if do |char|
    %w[RLE LRE RLO LRO PDF BN].include?(char['bidiType'])
  end
  return if par['characters'].empty?

  split_into_runs par
  resolve_weak_types par
  reverse_rtl_chars par
end # function
|
551
|
+
|
552
|
+
|
553
|
+
# Splits @valueArray (a list of code points) into paragraphs.
#
# Line feeds (0x0A) and carriage returns (0x0D) are appended to the current
# paragraph with a nil bidi type; the first other character after them opens
# a new paragraph. While a paragraph's level is still -1, the first character
# of type R or AL sets it to 1 and the first of type L sets it to 0.
#
# Returns an Array of ParagraphType, empty when @valueArray is empty.
def to_paragraphs(default_direction = nil)
  paragraphs = []
  current = nil
  line_ended = false   # the current paragraph has already received a line break

  @valueArray.each do |code_point|
    if current.nil?
      current = ParagraphType.new(default_direction)
      paragraphs.push current
    end

    if [0x0A, 0x0D].include?(code_point)
      current['characters'].push UtfChar.new(code_point, nil, 'N')
      line_ended = true
      next
    end

    if line_ended
      line_ended = false
      current = ParagraphType.new(default_direction)
      paragraphs.push current
    end

    record = retrieve_rec(code_point)
    bidi_class = record ? record[4] : nil
    mirrored = record ? record[9] : nil
    current['characters'].push UtfChar.new(code_point, bidi_class, mirrored)

    next unless current['level'] == -1
    case bidi_class
    when 'R', 'AL' then current['level'] = 1
    when 'L' then current['level'] = 0
    end
  end

  paragraphs
end
|
591
|
+
|
592
|
+
# Binary-searches the mirroring table for a code point.
#
# key - Integer code point.
#
# Returns the mirrored code point, or key itself when the table has no entry
# for it. Hits are cached in $mirrorMap as a one-element Array.
#
# Each record is @mirror_record_len bytes: two 3-byte big-endian integers,
# the code point followed by its mirror image.
def search_mirrored_value key
  low = 0
  # Index of the last record. Starting one past it made lookups of keys
  # above the largest entry read at end of file and fail on nil.
  high = @num_of_mirror_chars - 1
  while low <= high
    mid = (low + high) / 2
    @mirrorFile.pos = mid * @mirror_record_len
    record = @mirrorFile.read(@mirror_record_len)
    # A short or missing read means the table is truncated: treat as a miss.
    break unless record && record.bytesize == @mirror_record_len

    code_point, mirrored = record.bytes.each_slice(3).map do |triple|
      triple.inject(0) { |acc, b| (acc << 8) | b }
    end
    if code_point == key
      $mirrorMap[key] = [mirrored]
      return mirrored
    end

    if key < code_point
      high = mid - 1
    else
      low = mid + 1
    end
  end
  key
end
|
623
|
+
|
624
|
+
# Returns the mirrored code point for a character Hash, using the value
# cached in $mirrorMap when there is one and search_mirrored_value otherwise.
def get_mirrored_value(char)
  code_point = char['value']
  cached = $mirrorMap[code_point]
  cached ? cached[0] : search_mirrored_value(code_point)
end
|
630
|
+
|
631
|
+
#
# to_visual - converts a logical UTF-8 string to a visual one.
#
# i_string          - the input string.
# default_direction - each paragraph's default direction:
#                       'R', 'RTL' - right to left text.
#                       'L', 'LTR' - left to right text.
#                       not set or any other value - default behaviour.
#
# Returns the reordered String.
# Raises StringError when the input holds an invalid lead byte or a lead
# byte that is not followed by the continuation bytes it announces. The
# raise statements used to reference the misspelled local `handledstring`,
# so malformed input produced NameError instead.
#
def to_visual i_string, default_direction=nil
  @valueArray = decode_code_points(i_string)

  # First step - split the text into paragraphs
  paragraphs = to_paragraphs default_direction
  paragraphs.each { |par| handle_paragraph par }

  # Now, make a string
  ret_value = String.new('')
  paragraphs.each do |par|
    nsm_stack = []
    par['characters'].each do |char|
      if char['mirroredInd'] == 'Y' && char['level'].odd?
        char['value'] = get_mirrored_value(char)
      end

      # Marks that resolved to 'R' are held back and emitted, last one
      # first, right after the next other character (or at paragraph end).
      if char['origType'] == 'NSM' && char['bidiType'] == 'R'
        nsm_stack.push char['value']
        next
      end

      glyph = char['value'].to_utf8_char
      ret_value << glyph if char['bidiType'] == 'R'
      ret_value << nsm_stack.pop.to_utf8_char until nsm_stack.empty?
      ret_value << glyph unless char['bidiType'] == 'R'
    end
    ret_value << nsm_stack.pop.to_utf8_char until nsm_stack.empty?
  end

  ret_value
end

# Decodes the bytes of i_string into an Array of code points.
#
# Raises StringError with the decimal values of the offending bytes and the
# part of the input decoded so far. An incomplete sequence at the very end
# of the input is dropped without an error, as before.
def decode_code_points i_string
  values = []
  pending = 0       # continuation bytes still expected
  code_point = 0
  sequence = []     # bytes of the character currently being decoded
  decoded_bytes = 0 # number of input bytes fully decoded so far

  i_string.each_byte do |byte|
    sequence.push byte
    if pending.zero?
      if byte & 0x80 == 0       # regular ASCII
        code_point = byte
      elsif byte & 0xE0 == 0xC0 # 110xxxxx - two bytes
        code_point = byte & 0x1F
        pending = 1
      elsif byte & 0xF0 == 0xE0 # 1110xxxx - three bytes
        code_point = byte & 0x0F
        pending = 2
      elsif byte & 0xF8 == 0xF0 # 11110xxx - four bytes
        code_point = byte & 0x07
        pending = 3
      else
        raise StringError.new(sequence.join(', '), i_string.byteslice(0, decoded_bytes))
      end
    else
      if byte & 0xC0 != 0x80    # continuation bytes begin with 10b
        raise StringError.new(sequence.join(', '), i_string.byteslice(0, decoded_bytes))
      end
      code_point = (code_point << 6) | (byte & 0x3F)
      pending -= 1
    end

    next unless pending.zero?
    values.push code_point
    decoded_bytes += sequence.length
    sequence = []
  end

  values
end
private :decode_code_points
|
733
|
+
|
734
|
+
# Convenience wrapper: converts the text with to_visual and returns the
# result reversed, for renderers that expect the string back to front.
def render_visual(i_string, default_direction = nil)
  visual = to_visual(i_string, default_direction)
  visual.reverse!
end
|
738
|
+
|
739
|
+
# Reverses, in place, every maximal stretch of characters whose level is at
# least lvl, looking only at indexes hsh_cur['start'] .. hsh_cur['end'].
def rearrange_level(par, lvl, hsh_cur)
  first = hsh_cur['start']
  stop = hsh_cur['end'] + 1   # one past the span, so a stretch reaching its end is closed
  stretch_start = nil
  first.upto(stop) do |pos|
    chars = par['characters']
    char = chars[pos]
    in_stretch = char && char['level'] >= lvl
    stretch_start = pos if in_stretch && stretch_start.nil?
    next if stretch_start.nil?
    next unless pos == stop || char['level'] < lvl

    chars[stretch_start...pos] = chars[stretch_start...pos].reverse
    stretch_start = nil
  end
end
|
770
|
+
end
|
771
|
+
|