character-encodings 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +26 -0
- data/Rakefile +157 -0
- data/ext/encoding/character/unicode/codepoint.c +48 -0
- data/ext/encoding/character/utf-8/break.c +38 -0
- data/ext/encoding/character/utf-8/data/break.h +22931 -0
- data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
- data/ext/encoding/character/utf-8/data/compose.h +1607 -0
- data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
- data/ext/encoding/character/utf-8/decompose.c +476 -0
- data/ext/encoding/character/utf-8/depend +64 -0
- data/ext/encoding/character/utf-8/extconf.rb +47 -0
- data/ext/encoding/character/utf-8/private.h +68 -0
- data/ext/encoding/character/utf-8/properties.c +1061 -0
- data/ext/encoding/character/utf-8/rb_includes.h +18 -0
- data/ext/encoding/character/utf-8/rb_methods.h +49 -0
- data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
- data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
- data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
- data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
- data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
- data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
- data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
- data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
- data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
- data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
- data/ext/encoding/character/utf-8/unicode.c +319 -0
- data/ext/encoding/character/utf-8/unicode.h +208 -0
- data/ext/encoding/character/utf-8/utf.c +1332 -0
- data/lib/encoding/character/utf-8.rb +201 -0
- data/specifications/aref.rb +45 -0
- data/specifications/count.rb +29 -0
- data/specifications/delete.rb +25 -0
- data/specifications/each_char.rb +28 -0
- data/specifications/index.rb +35 -0
- data/specifications/insert.rb +67 -0
- data/specifications/length.rb +45 -0
- data/specifications/rindex.rb +52 -0
- data/specifications/squeeze.rb +25 -0
- data/specifications/to_i.rb +54 -0
- data/specifications/tr.rb +39 -0
- data/tests/foldcase.rb +28 -0
- data/tests/normalize.rb +101 -0
- data/tests/unicodedatatestbase.rb +45 -0
- metadata +112 -0
@@ -0,0 +1,1065 @@
|
|
1
|
+
#! /usr/bin/ruby -w
|
2
|
+
=begin
|
3
|
+
:contents: Generate Unicode table headers.
|
4
|
+
:arch-tag: 98c7456d-c7d9-4b40-9971-409428593ad5
|
5
|
+
|
6
|
+
Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
|
7
|
+
|
8
|
+
This program is free software; you can redistribute it and/or modify
|
9
|
+
it under the terms of the GNU General Public License as published by
|
10
|
+
the Free Software Foundation; either version 2 of the License, or
|
11
|
+
(at your option) any later version.
|
12
|
+
|
13
|
+
This program is distributed in the hope that it will be useful,
|
14
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
+
GNU General Public License for more details.
|
17
|
+
|
18
|
+
You should have received a copy of the GNU General Public License
|
19
|
+
along with this program; if not, write to the Free Software
|
20
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
21
|
+
=end
|
22
|
+
|
23
|
+
|
24
|
+
|
25
|
+
def error(fmt, *args)
|
26
|
+
$stderr.printf("%s: %s\n", File.basename($0), sprintf(fmt, *args))
|
27
|
+
exit(1)
|
28
|
+
end
|
29
|
+
|
30
|
+
class File
|
31
|
+
def self.process(path)
|
32
|
+
begin
|
33
|
+
File.open(path) do |file|
|
34
|
+
file.each_line do |line|
|
35
|
+
next if line =~ /^(#|\s*$)/
|
36
|
+
yield line
|
37
|
+
end
|
38
|
+
end
|
39
|
+
rescue IOError => e
|
40
|
+
error("I/O error while processing input:\n" +
|
41
|
+
" file: %s\n" +
|
42
|
+
" error: %s\n", path, e.message)
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
class String
|
48
|
+
def escape
|
49
|
+
self.unpack('H*')[0].gsub(/(.{2})/, '\\x\1')
|
50
|
+
end
|
51
|
+
|
52
|
+
def width
|
53
|
+
self.gsub(/\t/, ' ' * 8).length
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
class Array
|
58
|
+
def verify_size(wanted, path, index)
|
59
|
+
if !(wanted === self.size)
|
60
|
+
error("entry doesn't contain the required %s fields:\n" +
|
61
|
+
" file: %s\n" +
|
62
|
+
" entry: %s\n" +
|
63
|
+
" field count: %d\n",
|
64
|
+
wanted.to_s,
|
65
|
+
path,
|
66
|
+
(self.size > index) ? self[index] : 'N/A',
|
67
|
+
self.size)
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
def verify_field(index, code, path, raw_code, type, ccase)
|
72
|
+
if self[index].to_i(16) != code
|
73
|
+
error("entry has type %s but UCD_%s(%s) != %s:\n" +
|
74
|
+
" file: %s\n" +
|
75
|
+
" entry: %s\n",
|
76
|
+
type, ccase, raw_code, raw_code, path, raw_code)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
class Hash
|
82
|
+
def enumerate_ordered
|
83
|
+
n = 0
|
84
|
+
self.keys.sort.each do |code|
|
85
|
+
if self[code] == 1
|
86
|
+
self.delete(code)
|
87
|
+
next
|
88
|
+
end
|
89
|
+
self[code] = n
|
90
|
+
n += 1
|
91
|
+
end
|
92
|
+
n
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
# XXX: this is too memory consuming to keep like this. We need to split it up
|
97
|
+
# like the perl script does in hashes and arrays. Argh!
|
98
|
+
class UnicodeCodepoint
|
99
|
+
def initialize(code)
|
100
|
+
@code = code
|
101
|
+
@type = @value = @lower = @upper = @cclass = @compat = nil
|
102
|
+
@compositions = @decompositions = @break_props = nil
|
103
|
+
end
|
104
|
+
|
105
|
+
attr_accessor :code
|
106
|
+
attr_accessor :type, :value, :lower, :upper, :cclass, :compat
|
107
|
+
attr_accessor :compositions, :decompositions, :break_props
|
108
|
+
end
|
109
|
+
|
110
|
+
# XXX: cleanup
|
111
|
+
class CollectedData
|
112
|
+
def initialize(dir = '.', indent = "\t")
|
113
|
+
@dir = dir
|
114
|
+
@indent = indent
|
115
|
+
@cps = []
|
116
|
+
|
117
|
+
@excludes = nil
|
118
|
+
|
119
|
+
@pages_before_e0000 = 0
|
120
|
+
@last = 0x10ffff
|
121
|
+
|
122
|
+
@type = []
|
123
|
+
@value = []
|
124
|
+
@title_to_lower = {}
|
125
|
+
@title_to_upper = {}
|
126
|
+
@cclass = []
|
127
|
+
@decompose_compat = []
|
128
|
+
@compositions = {}
|
129
|
+
@decompositions = []
|
130
|
+
|
131
|
+
@break_props = []
|
132
|
+
|
133
|
+
@special_case_offsets = []
|
134
|
+
@special_cases = []
|
135
|
+
|
136
|
+
@casefold = []
|
137
|
+
@casefold_longest = -1
|
138
|
+
|
139
|
+
@bidimirror = []
|
140
|
+
end
|
141
|
+
|
142
|
+
attr :dir
|
143
|
+
attr :indent
|
144
|
+
attr :cps, true
|
145
|
+
attr :excludes, true
|
146
|
+
attr :pages_before_e0000, true
|
147
|
+
attr :last
|
148
|
+
attr_accessor :type, :value, :title_to_lower, :title_to_upper, :cclass,
|
149
|
+
:decompose_compat, :compositions, :decompositions
|
150
|
+
attr :break_props, true
|
151
|
+
attr :special_case_offsets, true
|
152
|
+
attr :special_cases, true
|
153
|
+
attr :casefold, true
|
154
|
+
attr :casefold_longest, true
|
155
|
+
attr :bidimirror, true
|
156
|
+
end
|
157
|
+
|
158
|
+
class CompositionExclusions
|
159
|
+
def process(data)
|
160
|
+
data.excludes = Hash.new
|
161
|
+
File.process(File.join(data.dir, 'CompositionExclusions.txt')) do |line|
|
162
|
+
data.excludes[line.chomp.sub(/^\s*(.*?)\s*(#.*)?$/,'\1').to_i(16)] = true
|
163
|
+
end
|
164
|
+
end
|
165
|
+
end
|
166
|
+
|
167
|
+
class UnicodeData
|
168
|
+
CODE, NAME, CATEGORY, COMBINING_CLASSES, BIDI_CATEGORY,
|
169
|
+
DECOMPOSITION, DECIMAL_VALUE, DIGIT_VALUE, NUMERIC_VALUE, MIRRORED,
|
170
|
+
OLD_NAME, COMMENT, UPPER, LOWER, TITLE = (0..14).to_a
|
171
|
+
|
172
|
+
def process(data)
|
173
|
+
prev_code = -1
|
174
|
+
path = File.join(data.dir, 'UnicodeData.txt')
|
175
|
+
File.process(path) do |line|
|
176
|
+
fields = line.chomp.split(/;/, -1)
|
177
|
+
fields.verify_size(15, path, CODE)
|
178
|
+
code = fields[CODE].to_i(16)
|
179
|
+
|
180
|
+
if code >= 0xe0000 and prev_code < 0xe0000
|
181
|
+
data.pages_before_e0000 = (prev_code >> 8) + 1
|
182
|
+
end
|
183
|
+
|
184
|
+
if code > prev_code + 1
|
185
|
+
process_gap(data,
|
186
|
+
prev_code + 1,
|
187
|
+
code - 1,
|
188
|
+
fields[NAME] =~ /Last>$/ ? fields : new_gap_fields)
|
189
|
+
end
|
190
|
+
process_one(data, code, fields)
|
191
|
+
prev_code = code
|
192
|
+
end
|
193
|
+
process_gap(data, prev_code + 1, 0x10ffff, new_gap_fields)
|
194
|
+
end
|
195
|
+
|
196
|
+
private
|
197
|
+
|
198
|
+
def new_gap_fields
|
199
|
+
['', '', 'Cn', '0', '', '', '', '', '', '', '', '', '', '', '']
|
200
|
+
end
|
201
|
+
|
202
|
+
def process_gap(data, low, hi, fields)
|
203
|
+
low.upto(hi) do |i|
|
204
|
+
fields[CODE] = sprintf('%04x', i)
|
205
|
+
process_one(data, i, fields)
|
206
|
+
end
|
207
|
+
end
|
208
|
+
|
209
|
+
def process_one(data, code, fields)
|
210
|
+
# puts(code.to_s)
|
211
|
+
# data.cps[code] ||= UnicodeCodepoint.new(code)
|
212
|
+
data.type[code] = fields[CATEGORY]
|
213
|
+
|
214
|
+
# TODO: Why not process things like 'Nl'?
|
215
|
+
case data.type[code]
|
216
|
+
when 'Nd'
|
217
|
+
data.value[code] = fields[DECIMAL_VALUE].to_i
|
218
|
+
when 'Ll'
|
219
|
+
data.value[code] = fields[UPPER].to_i(16)
|
220
|
+
when 'Lu'
|
221
|
+
data.value[code] = fields[LOWER].to_i(16)
|
222
|
+
when 'Lt'
|
223
|
+
data.title_to_lower[code] = fields[LOWER].to_i(16)
|
224
|
+
data.title_to_upper[code] = fields[UPPER].to_i(16)
|
225
|
+
end
|
226
|
+
|
227
|
+
data.cclass[code] = fields[COMBINING_CLASSES]
|
228
|
+
|
229
|
+
unless fields[DECOMPOSITION] == ''
|
230
|
+
if fields[DECOMPOSITION] =~ /^\<.*\>\s*(.*)/
|
231
|
+
data.decompose_compat[code] = true
|
232
|
+
fields[DECOMPOSITION] = $1
|
233
|
+
else
|
234
|
+
data.decompose_compat[code] = false
|
235
|
+
unless data.excludes.include?(code)
|
236
|
+
data.compositions[code] = fields[DECOMPOSITION]
|
237
|
+
end
|
238
|
+
end
|
239
|
+
data.decompositions[code] = fields[DECOMPOSITION]
|
240
|
+
end
|
241
|
+
end
|
242
|
+
end
|
243
|
+
|
244
|
+
class LineBreak
|
245
|
+
BREAK_CODE, BREAK_PROPERTY = (0..1).to_a
|
246
|
+
|
247
|
+
def process(data)
|
248
|
+
prev_code = -1
|
249
|
+
path = File.join(data.dir, 'LineBreak.txt')
|
250
|
+
File.process(path) do |line|
|
251
|
+
fields = line.chomp.sub(/\s*#.*/, '').split(/;/, -1)
|
252
|
+
fields.verify_size(2, path, BREAK_CODE)
|
253
|
+
|
254
|
+
if fields[BREAK_CODE] =~ /([0-9A-F]{4,6})\.\.([0-9A-F]{4,6})/
|
255
|
+
start_code, end_code = $1.to_i(16), $2.to_i(16)
|
256
|
+
else
|
257
|
+
start_code = end_code = fields[BREAK_CODE].to_i(16)
|
258
|
+
end
|
259
|
+
|
260
|
+
if start_code > prev_code + 1
|
261
|
+
process_gap(data, prev_code + 1, start_code - 1)
|
262
|
+
end
|
263
|
+
|
264
|
+
start_code.upto(end_code) do |i|
|
265
|
+
data.break_props[i] = fields[BREAK_PROPERTY]
|
266
|
+
end
|
267
|
+
|
268
|
+
prev_code = end_code
|
269
|
+
end
|
270
|
+
|
271
|
+
process_gap(data, prev_code + 1, 0x10ffff)
|
272
|
+
end
|
273
|
+
|
274
|
+
private
|
275
|
+
|
276
|
+
def process_gap(data, low, hi)
|
277
|
+
low.upto(hi) do |i|
|
278
|
+
data.break_props[i] = (data.type[i] == 'Cn') ? 'XX' : 'AL'
|
279
|
+
end
|
280
|
+
end
|
281
|
+
end
|
282
|
+
|
283
|
+
class SpecialCasing
|
284
|
+
CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = (0..4).to_a
|
285
|
+
|
286
|
+
def initialize
|
287
|
+
@offset = 0
|
288
|
+
end
|
289
|
+
|
290
|
+
def process(data)
|
291
|
+
path = File.join(data.dir, 'SpecialCasing.txt')
|
292
|
+
File.process(path) do |line|
|
293
|
+
fields = line.chomp.sub(/\s*#.*/, '').split(/\s*;\s*/, -1)
|
294
|
+
fields.verify_size((5..6), path, CASE_CODE)
|
295
|
+
raw_code, code = fields[CASE_CODE], fields[CASE_CODE].to_i(16)
|
296
|
+
unless data.type[code].nil?
|
297
|
+
# We ignore conditional special cases
|
298
|
+
next if fields.size == 6
|
299
|
+
|
300
|
+
case data.type[code]
|
301
|
+
when 'Lu'
|
302
|
+
fields.verify_field(CASE_UPPER, code, path, raw_code, 'Lu', 'Upper')
|
303
|
+
add_special_case(data, code, data.value[code],
|
304
|
+
fields[CASE_LOWER], fields[CASE_TITLE])
|
305
|
+
when 'Lt'
|
306
|
+
fields.verify_field(CASE_TITLE, code, path, raw_code, 'Lt', 'Title')
|
307
|
+
add_special_case(data, code, nil,
|
308
|
+
fields[CASE_LOWER], fields[CASE_UPPER])
|
309
|
+
when 'Ll'
|
310
|
+
fields.verify_field(CASE_LOWER, code, path, raw_code, 'Ll', 'Lower')
|
311
|
+
add_special_case(data, code, data.value[code],
|
312
|
+
fields[CASE_UPPER], fields[CASE_TITLE])
|
313
|
+
else
|
314
|
+
error("special case for non-alphabetic code point:\n" +
|
315
|
+
" file: %s\n" +
|
316
|
+
" type: %s\n" +
|
317
|
+
" code point/entry: %s\n",
|
318
|
+
path, data.type[code], raw_code)
|
319
|
+
end
|
320
|
+
else
|
321
|
+
error("special case for code point which doesn't have a type:\n" +
|
322
|
+
" file: %s\n" +
|
323
|
+
" code point/entry: %d\n",
|
324
|
+
path, code)
|
325
|
+
end
|
326
|
+
end
|
327
|
+
end
|
328
|
+
|
329
|
+
private
|
330
|
+
|
331
|
+
def add_special_case(data, code, single, field1, field2)
|
332
|
+
values = [
|
333
|
+
single.nil? ? nil : [single],
|
334
|
+
field1.split(/\s+/).map{ |s| s.to_i(16) },
|
335
|
+
[0],
|
336
|
+
field2.split(/\s+/).map{ |s| s.to_i(16) },
|
337
|
+
]
|
338
|
+
result = ''
|
339
|
+
values.each{ |value| result += value.pack('U*') unless value.nil? }
|
340
|
+
|
341
|
+
data.special_case_offsets.push(@offset)
|
342
|
+
data.value[code] = 0x1000000 + @offset
|
343
|
+
data.special_cases.push(result.escape)
|
344
|
+
@offset += 1 + result.length
|
345
|
+
end
|
346
|
+
end
|
347
|
+
|
348
|
+
class CaseFolding
|
349
|
+
FOLDING_CODE, FOLDING_STATUS, FOLDING_MAPPING = (0..2).to_a
|
350
|
+
|
351
|
+
def process(data)
|
352
|
+
path = File.join(data.dir, 'CaseFolding.txt')
|
353
|
+
File.process(path) do |line|
|
354
|
+
fields = line.chomp.sub(/\s*#.*/, '').split(/\s*;\s*/, -1)
|
355
|
+
fields.verify_size(4, path, FOLDING_CODE)
|
356
|
+
|
357
|
+
# skip Simple and Turkic rules
|
358
|
+
next if fields[FOLDING_STATUS] =~ /^[ST]$/
|
359
|
+
|
360
|
+
raw_code, code = fields[FOLDING_CODE], fields[FOLDING_CODE].to_i(16)
|
361
|
+
values = fields[FOLDING_MAPPING].split(/\s+/).map{ |s| s.to_i(16) }
|
362
|
+
if values.size == 1 &&
|
363
|
+
!(!data.value[code].nil? && data.value[code] >= 0x1000000) &&
|
364
|
+
!data.type[code].nil?
|
365
|
+
case data.type[code]
|
366
|
+
when 'Ll'
|
367
|
+
lower = code
|
368
|
+
when 'Lt'
|
369
|
+
lower = data.title_to_lower[code]
|
370
|
+
when 'Lu'
|
371
|
+
lower = data.value[code]
|
372
|
+
else
|
373
|
+
lower = code
|
374
|
+
end
|
375
|
+
next if lower == values[0]
|
376
|
+
end
|
377
|
+
|
378
|
+
string = values.pack('U*')
|
379
|
+
if string.length + 1 > data.casefold_longest
|
380
|
+
data.casefold_longest = string.length + 1
|
381
|
+
end
|
382
|
+
data.casefold.push([code, string.escape])
|
383
|
+
end
|
384
|
+
end
|
385
|
+
end
|
386
|
+
|
387
|
+
class BidiMirroring
|
388
|
+
def process(data)
|
389
|
+
path = File.join(data.dir, 'BidiMirroring.txt')
|
390
|
+
File.process(path) do |line|
|
391
|
+
fields = line.chomp.sub(/\s*#.*/, '').split(/\s*;\s*/, -1)
|
392
|
+
fields.verify_size(2, path, 0)
|
393
|
+
data.bidimirror.push([fields[0].to_i(16), fields[1].to_i(16)])
|
394
|
+
end
|
395
|
+
end
|
396
|
+
end
|
397
|
+
|
398
|
+
class Printer
|
399
|
+
def initialize
|
400
|
+
@index = 0
|
401
|
+
end
|
402
|
+
|
403
|
+
def process(data)
|
404
|
+
@last_char_part1_i = data.pages_before_e0000 * 256 - 1
|
405
|
+
@last_char_part1_x = sprintf('0x%04x', @last_char_part1_i)
|
406
|
+
@last_char_part1_X = sprintf('%04X', @last_char_part1_i)
|
407
|
+
print_tables(data)
|
408
|
+
print_decomp(data)
|
409
|
+
print_composition_table(data)
|
410
|
+
print_line_break(data)
|
411
|
+
end
|
412
|
+
|
413
|
+
private
|
414
|
+
|
415
|
+
# Map general category code onto symbolic name.
|
416
|
+
Mappings = {
|
417
|
+
# Normative.
|
418
|
+
'Lu' => 'UNICODE_UPPERCASE_LETTER',
|
419
|
+
'Ll' => 'UNICODE_LOWERCASE_LETTER',
|
420
|
+
'Lt' => 'UNICODE_TITLECASE_LETTER',
|
421
|
+
'Mn' => 'UNICODE_NON_SPACING_MARK',
|
422
|
+
'Mc' => 'UNICODE_COMBINING_MARK',
|
423
|
+
'Me' => 'UNICODE_ENCLOSING_MARK',
|
424
|
+
'Nd' => 'UNICODE_DECIMAL_NUMBER',
|
425
|
+
'Nl' => 'UNICODE_LETTER_NUMBER',
|
426
|
+
'No' => 'UNICODE_OTHER_NUMBER',
|
427
|
+
'Zs' => 'UNICODE_SPACE_SEPARATOR',
|
428
|
+
'Zl' => 'UNICODE_LINE_SEPARATOR',
|
429
|
+
'Zp' => 'UNICODE_PARAGRAPH_SEPARATOR',
|
430
|
+
'Cc' => 'UNICODE_CONTROL',
|
431
|
+
'Cf' => 'UNICODE_FORMAT',
|
432
|
+
'Cs' => 'UNICODE_SURROGATE',
|
433
|
+
'Co' => 'UNICODE_PRIVATE_USE',
|
434
|
+
'Cn' => 'UNICODE_UNASSIGNED',
|
435
|
+
|
436
|
+
# Informative.
|
437
|
+
'Lm' => 'UNICODE_MODIFIER_LETTER',
|
438
|
+
'Lo' => 'UNICODE_OTHER_LETTER',
|
439
|
+
'Pc' => 'UNICODE_CONNECT_PUNCTUATION',
|
440
|
+
'Pd' => 'UNICODE_DASH_PUNCTUATION',
|
441
|
+
'Ps' => 'UNICODE_OPEN_PUNCTUATION',
|
442
|
+
'Pe' => 'UNICODE_CLOSE_PUNCTUATION',
|
443
|
+
'Pi' => 'UNICODE_INITIAL_PUNCTUATION',
|
444
|
+
'Pf' => 'UNICODE_FINAL_PUNCTUATION',
|
445
|
+
'Po' => 'UNICODE_OTHER_PUNCTUATION',
|
446
|
+
'Sm' => 'UNICODE_MATH_SYMBOL',
|
447
|
+
'Sc' => 'UNICODE_CURRENCY_SYMBOL',
|
448
|
+
'Sk' => 'UNICODE_MODIFIER_SYMBOL',
|
449
|
+
'So' => 'UNICODE_OTHER_SYMBOL'
|
450
|
+
}
|
451
|
+
|
452
|
+
BreakMappings = {
|
453
|
+
'BK' => 'UNICODE_BREAK_MANDATORY',
|
454
|
+
'CR' => 'UNICODE_BREAK_CARRIAGE_RETURN',
|
455
|
+
'LF' => 'UNICODE_BREAK_LINE_FEED',
|
456
|
+
'CM' => 'UNICODE_BREAK_COMBINING_MARK',
|
457
|
+
'SG' => 'UNICODE_BREAK_SURROGATE',
|
458
|
+
'ZW' => 'UNICODE_BREAK_ZERO_WIDTH_SPACE',
|
459
|
+
'IN' => 'UNICODE_BREAK_INSEPARABLE',
|
460
|
+
'GL' => 'UNICODE_BREAK_NON_BREAKING_GLUE',
|
461
|
+
'CB' => 'UNICODE_BREAK_CONTINGENT',
|
462
|
+
'SP' => 'UNICODE_BREAK_SPACE',
|
463
|
+
'BA' => 'UNICODE_BREAK_AFTER',
|
464
|
+
'BB' => 'UNICODE_BREAK_BEFORE',
|
465
|
+
'B2' => 'UNICODE_BREAK_BEFORE_AND_AFTER',
|
466
|
+
'HY' => 'UNICODE_BREAK_HYPHEN',
|
467
|
+
'NS' => 'UNICODE_BREAK_NON_STARTER',
|
468
|
+
'OP' => 'UNICODE_BREAK_OPEN_PUNCTUATION',
|
469
|
+
'CL' => 'UNICODE_BREAK_CLOSE_PUNCTUATION',
|
470
|
+
'QU' => 'UNICODE_BREAK_QUOTATION',
|
471
|
+
'EX' => 'UNICODE_BREAK_EXCLAMATION',
|
472
|
+
'ID' => 'UNICODE_BREAK_IDEOGRAPHIC',
|
473
|
+
'NU' => 'UNICODE_BREAK_NUMERIC',
|
474
|
+
'IS' => 'UNICODE_BREAK_INFIX_SEPARATOR',
|
475
|
+
'SY' => 'UNICODE_BREAK_SYMBOL',
|
476
|
+
'AL' => 'UNICODE_BREAK_ALPHABETIC',
|
477
|
+
'PR' => 'UNICODE_BREAK_PREFIX',
|
478
|
+
'PO' => 'UNICODE_BREAK_POSTFIX',
|
479
|
+
'SA' => 'UNICODE_BREAK_COMPLEX_CONTEXT',
|
480
|
+
'AI' => 'UNICODE_BREAK_AMBIGUOUS',
|
481
|
+
'NL' => 'UNICODE_BREAK_NEXT_LINE',
|
482
|
+
'WJ' => 'UNICODE_BREAK_WORD_JOINER',
|
483
|
+
'XX' => 'UNICODE_BREAK_UNKNOWN',
|
484
|
+
'JL' => 'UNICODE_BREAK_HANGUL_L_JAMO',
|
485
|
+
'JV' => "UNICODE_BREAK_HANGUL_V_JAMO",
|
486
|
+
'JT' => "UNICODE_BREAK_HANGUL_T_JAMO",
|
487
|
+
'H2' => "UNICODE_BREAK_HANGUL_LV_SYLLABLE",
|
488
|
+
'H3' => "UNICODE_BREAK_HANGUL_LVT_SYLLABLE"
|
489
|
+
};
|
490
|
+
|
491
|
+
NOT_PRESENT_OFFSET = 65535
|
492
|
+
|
493
|
+
def print_table(data, low, mid, hi, size, header, part1_h, part2_h, &f)
|
494
|
+
@index = 0
|
495
|
+
rows = []
|
496
|
+
print(header)
|
497
|
+
low.step(hi, 256) do |i|
|
498
|
+
rows[i / 256] = print_row(data, i, size){ |i| f.call(i) }
|
499
|
+
end
|
500
|
+
print("\n};\n")
|
501
|
+
print(part1_h)
|
502
|
+
low.step(mid, 256) do |i|
|
503
|
+
printf("%s%s,\n", data.indent, rows[i / 256])
|
504
|
+
end
|
505
|
+
print("};\n")
|
506
|
+
if mid != hi
|
507
|
+
print(part2_h)
|
508
|
+
0xe0000.step(hi, 256) do |i|
|
509
|
+
printf("%s%s,\n", data.indent, rows[i / 256])
|
510
|
+
end
|
511
|
+
print("};\n")
|
512
|
+
end
|
513
|
+
end
|
514
|
+
|
515
|
+
def print_tables(data, outfile = 'character-tables.h')
|
516
|
+
row = []
|
517
|
+
saved_stdout = $stdout
|
518
|
+
File.open(outfile, 'w') do |file|
|
519
|
+
header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
|
520
|
+
$stdout = file
|
521
|
+
print <<EOF
|
522
|
+
/* Automatically generated file */
|
523
|
+
|
524
|
+
#ifndef #{header_h}
|
525
|
+
#define #{header_h}
|
526
|
+
|
527
|
+
#define UNICODE_DATA_VERSION "#{UnicodeVersion}"
|
528
|
+
|
529
|
+
#define UNICODE_LAST_CHAR #{sprintf('0x%04x', data.last)}
|
530
|
+
|
531
|
+
#define UNICODE_MAX_TABLE_INDEX 10000
|
532
|
+
|
533
|
+
#define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}
|
534
|
+
|
535
|
+
#define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}
|
536
|
+
|
537
|
+
#define UNICODE_SPECIAL_CASE_TABLE_START 0x1000000
|
538
|
+
EOF
|
539
|
+
print_table(data, 0, @last_char_part1_i, data.last, 1,
|
540
|
+
<<EOH, <<EOH1, <<EOH2){ |i| Mappings[data.type[i]] }
|
541
|
+
|
542
|
+
|
543
|
+
static const char type_data[][256] = {
|
544
|
+
EOH
|
545
|
+
|
546
|
+
|
547
|
+
/* U+0000 through U+#{@last_char_part1_X} */
|
548
|
+
static const int16_t type_table_part1[#{data.pages_before_e0000}] = {
|
549
|
+
EOH1
|
550
|
+
|
551
|
+
|
552
|
+
/* U+E0000 through U+#{sprintf('%04X', data.last)} */
|
553
|
+
static const int16_t type_table_part2[768] = {
|
554
|
+
EOH2
|
555
|
+
|
556
|
+
print_table(data, 0, @last_char_part1_i, data.last, 4,
|
557
|
+
<<EOH, <<EOH1, <<EOH2) { |i| data.value[i].nil? ? '0x0000' : sprintf('0x%04x', data.value[i]) }
|
558
|
+
|
559
|
+
|
560
|
+
static const unichar attr_data[][256] = {
|
561
|
+
EOH
|
562
|
+
|
563
|
+
|
564
|
+
/* U+0000 through U+#{@last_char_part1_X} */
|
565
|
+
static const int16_t attr_table_part1[#{data.pages_before_e0000}] = {
|
566
|
+
EOH1
|
567
|
+
|
568
|
+
|
569
|
+
/* U+E0000 through U+#{sprintf('%04X', data.last)} */
|
570
|
+
static const int16_t attr_table_part2[768] = {
|
571
|
+
EOH2
|
572
|
+
|
573
|
+
print <<EOF
|
574
|
+
|
575
|
+
|
576
|
+
static const unichar title_table[][3] = {
|
577
|
+
EOF
|
578
|
+
data.title_to_lower.keys.sort.each do |code|
|
579
|
+
printf("%s{ 0x%04x, 0x%04x, 0x%04x },\n", data.indent,
|
580
|
+
code, data.title_to_upper[code], data.title_to_lower[code])
|
581
|
+
end
|
582
|
+
print("};\n")
|
583
|
+
|
584
|
+
print_special_case_table(data)
|
585
|
+
print_case_fold_table(data)
|
586
|
+
|
587
|
+
print <<EOF
|
588
|
+
static const struct {
|
589
|
+
#{data.indent}unichar ch;
|
590
|
+
#{data.indent}unichar mirrored_ch;
|
591
|
+
} bidi_mirroring_table[] = {
|
592
|
+
EOF
|
593
|
+
data.bidimirror.each do |item|
|
594
|
+
printf("%s{ 0x%04x, 0x%04x },\n", data.indent, item[0], item[1])
|
595
|
+
end
|
596
|
+
print <<EOF
|
597
|
+
};
|
598
|
+
|
599
|
+
#endif /* #{header_h} */
|
600
|
+
EOF
|
601
|
+
end
|
602
|
+
$stdout = saved_stdout
|
603
|
+
end
|
604
|
+
|
605
|
+
def print_row(data, start, type_size)
|
606
|
+
flag = true
|
607
|
+
values = []
|
608
|
+
0.upto(255) do |i|
|
609
|
+
values[i] = yield(start + i)
|
610
|
+
flag = false if values[i] != values[0]
|
611
|
+
end
|
612
|
+
return values[0] + " + UNICODE_MAX_TABLE_INDEX" if flag
|
613
|
+
|
614
|
+
puts(',') if @index != 0
|
615
|
+
printf("%s{ /* page %d, index %d */\n%s",
|
616
|
+
data.indent, start / 256, @index, data.indent * 2)
|
617
|
+
column = data.indent.width * 2
|
618
|
+
start.upto(start + 255) do |i|
|
619
|
+
text = values[i - start]
|
620
|
+
if text.length + column + 2 > 79
|
621
|
+
printf("\n%s", data.indent * 2)
|
622
|
+
column = data.indent.width * 2
|
623
|
+
end
|
624
|
+
|
625
|
+
printf("%s, ", text)
|
626
|
+
column += text.width + 2
|
627
|
+
end
|
628
|
+
|
629
|
+
print("\n#{data.indent}}")
|
630
|
+
@index += 1
|
631
|
+
return sprintf("%d /* page %d */", @index - 1, start / 256);
|
632
|
+
end
|
633
|
+
|
634
|
+
def print_special_case_table(data)
|
635
|
+
print <<EOF
|
636
|
+
|
637
|
+
|
638
|
+
/*
|
639
|
+
* Table of special cases for case conversion; each record contains
|
640
|
+
* First, the best single character mapping to lowercase if Lu,
|
641
|
+
* and to uppercase if Ll, followed by the output mapping for the two cases
|
642
|
+
* other than the case of the codepoint, in the order Ll, Lu, Lt, encoded in
|
643
|
+
* UTF-8, separated and terminated by a NUL character.
|
644
|
+
*/
|
645
|
+
static const char special_case_table[] = {
|
646
|
+
EOF
|
647
|
+
data.special_cases.each_with_index do |sc, i|
|
648
|
+
printf(%Q< "%s\\0" /* offset %d */\n>, sc, data.special_case_offsets[i])
|
649
|
+
end
|
650
|
+
print <<EOF
|
651
|
+
};
|
652
|
+
|
653
|
+
EOF
|
654
|
+
end
|
655
|
+
|
656
|
+
def print_case_fold_table(data)
|
657
|
+
print <<EOF
|
658
|
+
|
659
|
+
/*
|
660
|
+
* Table of casefolding cases that can't be derived by lowercasing.
|
661
|
+
*/
|
662
|
+
static const struct {
|
663
|
+
#{data.indent}uint16_t ch;
|
664
|
+
#{data.indent}char data[#{data.casefold_longest}];
|
665
|
+
} casefold_table[] = {
|
666
|
+
EOF
|
667
|
+
data.casefold.sort_by{ |a| a[0] }.each do |cf|
|
668
|
+
if cf[0] > 0xffff
|
669
|
+
error('casefold_table.ch field too short.' +
|
670
|
+
' Upgrade to unichar to fit values beyond 0xffff.')
|
671
|
+
end
|
672
|
+
printf(%Q<%s{ 0x%04x, "%s" },\n>, data.indent, cf[0], cf[1])
|
673
|
+
end
|
674
|
+
print <<EOF
|
675
|
+
};
|
676
|
+
EOF
|
677
|
+
end
|
678
|
+
|
679
|
+
def print_decomp(data, outfile = 'decompose.h')
|
680
|
+
row = []
|
681
|
+
saved_stdout = $stdout
|
682
|
+
File.open(outfile, 'w') do |file|
|
683
|
+
header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
|
684
|
+
$stdout = file
|
685
|
+
print <<EOF
|
686
|
+
/* Automatically generated file */
|
687
|
+
|
688
|
+
#ifndef #{header_h}
|
689
|
+
#define #{header_h}
|
690
|
+
|
691
|
+
|
692
|
+
#define UNICODE_LAST_CHAR #{sprintf('0x%04x', data.last)}
|
693
|
+
|
694
|
+
#define UNICODE_MAX_TABLE_INDEX (0x110000 / 256)
|
695
|
+
|
696
|
+
#define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}
|
697
|
+
#define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}
|
698
|
+
|
699
|
+
#define UNICODE_NOT_PRESENT_OFFSET #{NOT_PRESENT_OFFSET}
|
700
|
+
EOF
|
701
|
+
print_table(data, 0, @last_char_part1_i, data.last, 1,
|
702
|
+
<<EOH, <<EOH1, <<EOH2){ |i| data.cclass[i] }
|
703
|
+
|
704
|
+
|
705
|
+
static const uint8_t cclass_data[][256] = {
|
706
|
+
EOH
|
707
|
+
|
708
|
+
|
709
|
+
static const int16_t combining_class_table_part1[#{data.pages_before_e0000}] = {
|
710
|
+
EOH1
|
711
|
+
|
712
|
+
|
713
|
+
static const int16_t combining_class_table_part2[768] = {
|
714
|
+
EOH2
|
715
|
+
|
716
|
+
print <<EOL
|
717
|
+
|
718
|
+
|
719
|
+
static const struct {
|
720
|
+
#{data.indent}unichar ch;
|
721
|
+
#{data.indent}uint16_t canon_offset;
|
722
|
+
#{data.indent}uint16_t compat_offset;
|
723
|
+
} decomp_table[] = {
|
724
|
+
EOL
|
725
|
+
decomp_offsets = {}
|
726
|
+
decomp_string = ''
|
727
|
+
@decomp_string_offset = 0
|
728
|
+
0.upto(data.last) do |i|
|
729
|
+
unless data.decompositions[i].nil?
|
730
|
+
canon_decomp = data.decompose_compat[i] ?
|
731
|
+
nil : make_decomp(data, i, false)
|
732
|
+
compat_decomp = make_decomp(data, i, true)
|
733
|
+
if not canon_decomp.nil? and compat_decomp == canon_decomp
|
734
|
+
compat_decomp = nil
|
735
|
+
end
|
736
|
+
canon_offset = handle_decomp(canon_decomp, decomp_offsets,
|
737
|
+
decomp_string)
|
738
|
+
compat_offset = handle_decomp(compat_decomp, decomp_offsets,
|
739
|
+
decomp_string)
|
740
|
+
|
741
|
+
if @decomp_string_offset > NOT_PRESENT_OFFSET
|
742
|
+
error('decomposition string offset beyond not-present-offset,' +
|
743
|
+
" upgrade value:\n" +
|
744
|
+
" offset: %d\n" +
|
745
|
+
" max: %d\n",
|
746
|
+
@decomp_string_offset, NOT_PRESENT_OFFSET)
|
747
|
+
end
|
748
|
+
printf("%s{ 0x%04x, %s, %s },\n",
|
749
|
+
data.indent, i, canon_offset, compat_offset)
|
750
|
+
end
|
751
|
+
end
|
752
|
+
print("\n};")
|
753
|
+
|
754
|
+
print <<EOL
|
755
|
+
|
756
|
+
static const char decomp_expansion_string[] = #{decomp_string};
|
757
|
+
|
758
|
+
|
759
|
+
#endif /* #{header_h} */
|
760
|
+
EOL
|
761
|
+
end
|
762
|
+
$stdout = saved_stdout
|
763
|
+
end
|
764
|
+
|
765
|
+
def expand_decomp(data, code, compat)
|
766
|
+
ary = []
|
767
|
+
data.decompositions[code].split(/ /).each do |item|
|
768
|
+
pos = item.to_i(16)
|
769
|
+
if not data.decompositions[pos].nil? and
|
770
|
+
(compat or not data.decompose_compat[pos])
|
771
|
+
ary.concat(expand_decomp(data, pos, compat))
|
772
|
+
else
|
773
|
+
ary.push(pos)
|
774
|
+
end
|
775
|
+
end
|
776
|
+
ary
|
777
|
+
end
|
778
|
+
|
779
|
+
def make_decomp(data, code, compat)
|
780
|
+
str = ''
|
781
|
+
expand_decomp(data, code, compat).each do |item|
|
782
|
+
str += item.is_a?(Array) ? item.flatten.pack('U') : [item].pack('U')
|
783
|
+
end
|
784
|
+
str
|
785
|
+
end
|
786
|
+
|
787
|
+
def handle_decomp(decomp, decomp_offsets,
|
788
|
+
decomp_string)
|
789
|
+
offset = 'UNICODE_NOT_PRESENT_OFFSET'
|
790
|
+
unless decomp.nil?
|
791
|
+
if decomp_offsets.member?(decomp)
|
792
|
+
offset = decomp_offsets[decomp]
|
793
|
+
else
|
794
|
+
offset = @decomp_string_offset
|
795
|
+
decomp_offsets[decomp] = offset
|
796
|
+
decomp_string << ("\n \"" + decomp.escape +
|
797
|
+
"\\0\" /* offset #{offset} */")
|
798
|
+
@decomp_string_offset += decomp.length + 1
|
799
|
+
end
|
800
|
+
end
|
801
|
+
offset
|
802
|
+
end
|
803
|
+
|
804
|
+
def print_composition_table(data, outfile = 'compose.h')
|
805
|
+
first = Hash.new(0)
|
806
|
+
second = Hash.new(0)
|
807
|
+
|
808
|
+
data.compositions.each do |code, value|
|
809
|
+
values = value.split(/\s+/).map{ |s| s.to_i(16) }
|
810
|
+
|
811
|
+
# skip non-starters and single-character decompositions
|
812
|
+
if data.cclass[values[0]] != '0' or values.size == 1
|
813
|
+
data.compositions.delete(code)
|
814
|
+
next
|
815
|
+
end
|
816
|
+
|
817
|
+
if values.size != 2
|
818
|
+
error("decomposition of entry contains more than two elements:\n" +
|
819
|
+
" entry: %d\n" +
|
820
|
+
" elements: %d\n",
|
821
|
+
code, values.size)
|
822
|
+
end
|
823
|
+
|
824
|
+
first[values[0]] += 1
|
825
|
+
end
|
826
|
+
|
827
|
+
n_first = first.enumerate_ordered
|
828
|
+
|
829
|
+
data.compositions.each do |code, value|
|
830
|
+
values = value.split(/\s+/).map{ |s| s.to_i(16) }
|
831
|
+
|
832
|
+
second[values[1]] += 1 if first.member?(values[0])
|
833
|
+
end
|
834
|
+
|
835
|
+
n_second = second.enumerate_ordered
|
836
|
+
|
837
|
+
first_singletons = []
|
838
|
+
second_singletons = []
|
839
|
+
reverse = {}
|
840
|
+
data.compositions.each do |code, value|
|
841
|
+
values = value.split(/\s+/).map{ |s| s.to_i(16) }
|
842
|
+
|
843
|
+
if first.member?(values[0]) and second.member?(values[1])
|
844
|
+
reverse["#{first[values[0]]}|#{second[values[1]]}"] = code
|
845
|
+
elsif not first.member?(values[0])
|
846
|
+
first_singletons.push([values[0], values[1], code])
|
847
|
+
else
|
848
|
+
second_singletons.push([values[1], values[0], code])
|
849
|
+
end
|
850
|
+
end
|
851
|
+
|
852
|
+
first_singletons = first_singletons.sort_by{ |a| a[0] }
|
853
|
+
second_singletons = second_singletons.sort_by{ |a| a[0] }
|
854
|
+
|
855
|
+
row = []
|
856
|
+
saved_stdout = $stdout
|
857
|
+
File.open(outfile, 'w') do |file|
|
858
|
+
header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
|
859
|
+
$stdout = file
|
860
|
+
values = {}
|
861
|
+
total = first_start = 1
|
862
|
+
last = 0
|
863
|
+
|
864
|
+
first.each do |code, value|
|
865
|
+
values[code] = value + total
|
866
|
+
last = code if code > last
|
867
|
+
end
|
868
|
+
total += n_first
|
869
|
+
|
870
|
+
first_single_start = total
|
871
|
+
first_singletons.each_with_index do |item, i|
|
872
|
+
code = item[0]
|
873
|
+
values[code] = i + total
|
874
|
+
last = code if code > last
|
875
|
+
end
|
876
|
+
total += first_singletons.size
|
877
|
+
|
878
|
+
second_start = total
|
879
|
+
second.each do |code, value|
|
880
|
+
values[code] = value + total
|
881
|
+
last = code if code > last
|
882
|
+
end
|
883
|
+
total += n_second
|
884
|
+
|
885
|
+
second_single_start = total
|
886
|
+
second_singletons.each_with_index do |item, i|
|
887
|
+
code = item[0]
|
888
|
+
values[code] = i + total
|
889
|
+
last = code if code > last
|
890
|
+
end
|
891
|
+
|
892
|
+
print <<EOL
|
893
|
+
/* Automatically generated file */
|
894
|
+
|
895
|
+
#ifndef #{header_h}
|
896
|
+
#define #{header_h}
|
897
|
+
|
898
|
+
|
899
|
+
#define COMPOSE_FIRST_START #{first_start}
|
900
|
+
#define COMPOSE_FIRST_SINGLE_START #{first_single_start}
|
901
|
+
#define COMPOSE_SECOND_START #{second_start}
|
902
|
+
#define COMPOSE_SECOND_SINGLE_START #{second_single_start}
|
903
|
+
#define COMPOSE_TABLE_LAST #{last / 256}
|
904
|
+
EOL
|
905
|
+
|
906
|
+
print_table(data, 0, last, last, 2,
|
907
|
+
<<EOH, <<EOH1, nil){ |i| values.member?(i) ? values[i].to_s : '0' }
|
908
|
+
|
909
|
+
|
910
|
+
static const uint16_t compose_data[][256] = {
|
911
|
+
EOH
|
912
|
+
|
913
|
+
|
914
|
+
static const int16_t compose_table[COMPOSE_TABLE_LAST + 1] = {
|
915
|
+
EOH1
|
916
|
+
|
917
|
+
print <<EOL
|
918
|
+
|
919
|
+
|
920
|
+
static const uint16_t compose_first_single[][2] = {
|
921
|
+
EOL
|
922
|
+
first_singletons.each_with_index do |item, i|
|
923
|
+
if item[1] > 0xffff or item[2] > 0xffff
|
924
|
+
error("compose_first_single table field too short." +
|
925
|
+
" Upgrade to unichar to fit values beyond 0xffff.")
|
926
|
+
end
|
927
|
+
printf("%s{ %#06x, %#06x },\n", data.indent, item[1], item[2])
|
928
|
+
end
|
929
|
+
print("};\n")
|
930
|
+
|
931
|
+
print <<EOL
|
932
|
+
|
933
|
+
|
934
|
+
static const uint16_t compose_second_single[][2] = {
|
935
|
+
EOL
|
936
|
+
second_singletons.each_with_index do |item, i|
|
937
|
+
if item[1] > 0xffff or item[2] > 0xffff
|
938
|
+
error("compose_second_single table field too short." +
|
939
|
+
" Upgrade to unichar to fit values beyond 0xffff.")
|
940
|
+
end
|
941
|
+
printf("%s{ %#06x, %#06x },\n", data.indent, item[1], item[2])
|
942
|
+
end
|
943
|
+
print("};\n")
|
944
|
+
|
945
|
+
print <<EOL
|
946
|
+
|
947
|
+
|
948
|
+
static const uint16_t compose_array[#{n_first}][#{n_second}] = {
|
949
|
+
EOL
|
950
|
+
0.upto(n_first - 1) do |i|
|
951
|
+
printf("%s{\n%s", data.indent, data.indent * 2)
|
952
|
+
column = data.indent.width * 2
|
953
|
+
0.upto(n_second - 1) do |j|
|
954
|
+
if column + 8 > 79
|
955
|
+
printf("\n%s", data.indent * 2)
|
956
|
+
column = data.indent.width * 2
|
957
|
+
end
|
958
|
+
if reverse.member?("#{i}|#{j}")
|
959
|
+
if reverse["#{i}|#{j}"] > 0xffff
|
960
|
+
error("compose_array table field too short." +
|
961
|
+
" Upgrade to unichar to fit values beyond 0xffff.")
|
962
|
+
end
|
963
|
+
printf("0x%04x, ", reverse["#{i}|#{j}"])
|
964
|
+
else
|
965
|
+
print(" 0, ")
|
966
|
+
end
|
967
|
+
column += 8
|
968
|
+
end
|
969
|
+
printf("\n%s},\n", data.indent)
|
970
|
+
end
|
971
|
+
print("};\n")
|
972
|
+
|
973
|
+
print <<EOL
|
974
|
+
|
975
|
+
|
976
|
+
#endif /* #{header_h} */
|
977
|
+
EOL
|
978
|
+
end
|
979
|
+
$stdout = saved_stdout
|
980
|
+
end
|
981
|
+
|
982
|
+
# Emit the generated C header (default 'break.h') containing the Unicode
# line-breaking property tables.
#
# data    - collected Unicode data; this method reads data.last,
#           data.break_props and data.pages_before_e0000.
# outfile - path of the header file to write.
#
# Writes the header guard and UNICODE_* constants, then delegates the three
# break_property_* C tables to print_table, temporarily redirecting $stdout
# into the output file.
#
# Fixes over the previous version: the unused local `row` is removed, and
# $stdout is restored in an ensure block so an exception raised while
# generating tables no longer leaves $stdout pointing at a closed file.
def print_line_break(data, outfile = 'break.h')
  saved_stdout = $stdout
  begin
    File.open(outfile, 'w') do |file|
      # Header-guard macro derived from the file name, e.g. BREAK_H.
      header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
      $stdout = file
      print <<EOF
/* Automatically generated file */

#ifndef #{header_h}
#define #{header_h}

#define UNICODE_DATA_VERSION "#{UnicodeVersion}"

#define UNICODE_LAST_CHAR #{sprintf('0x%04x', data.last)}

#define UNICODE_MAX_TABLE_INDEX 10000

/*
 * The last code point that should be looked up in break_property_table_part1.
 */
#define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}

/*
 * The first code point that should be looked up in break_property_table_part2.
 */
#define UNICODE_FIRST_CHAR_PART2 0xe0000
EOF
      print_table(data, 0, @last_char_part1_i, data.last, 1,
                  <<EOH, <<EOH1, <<EOH2){ |i| BreakMappings[data.break_props[i]] }


static const int8_t break_property_data[][256] = {
EOH


/* U+0000 through U+#{@last_char_part1_X} */
static const int16_t break_property_table_part1[#{data.pages_before_e0000}] = {
EOH1


/* U+E0000 through U+#{sprintf('%04X', data.last)} */
static const int16_t break_property_table_part2[768] = {
EOH2

      print <<EOF


#endif /* #{header_h} */
EOF
    end
  ensure
    # Always undo the redirection, even when table generation raises,
    # so subsequent output does not go to the (now closed) file.
    $stdout = saved_stdout
  end
end
|
1035
|
+
end
|
1036
|
+
|
1037
|
+
# Unicode version string taken from the first command-line argument; it is
# embedded verbatim in the generated headers as UNICODE_DATA_VERSION.
UnicodeVersion = ARGV[0]
|
1038
|
+
|
1039
|
+
# Drives the whole generation run: verifies that every required Unicode
# data file is present, collects their contents, then feeds the collected
# data through each processing stage in order.
class Runner
  # Entry point. ARGV[1] names the directory containing the Unicode
  # data files; each stage class is instantiated and handed the data.
  def main
    directory = ARGV[1]
    check_for_data_files(directory)
    collected = CollectedData.new(directory, "\t")
    stages = [CompositionExclusions, UnicodeData, LineBreak,
              SpecialCasing, CaseFolding, BidiMirroring, Printer]
    stages.each { |stage| stage.new.process(collected) }
  end

private
  # Abort via error() unless every required Unicode data file under +dir+
  # is readable.
  def check_for_data_files(dir)
    required = %w[UnicodeData.txt LineBreak.txt SpecialCasing.txt
                  CaseFolding.txt CompositionExclusions.txt BidiMirroring.txt]
    required.each do |name|
      full_path = File.join(dir, name)
      error('missing required file: %s', full_path) unless File.readable?(full_path)
    end
  end
end
|
1060
|
+
|
1061
|
+
# Script entry point: run the generator.
Runner.new.main
|
1062
|
+
|
1063
|
+
|
1064
|
+
|
1065
|
+
# vim: set sts=2 sw=2:
|