u 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/README +38 -0
  2. data/Rakefile +64 -0
  3. data/ext/encoding/character/utf-8/break.c +25 -0
  4. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  5. data/ext/encoding/character/utf-8/data/character-tables.h +14358 -0
  6. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  7. data/ext/encoding/character/utf-8/data/decompose.h +10926 -0
  8. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1070 -0
  9. data/ext/encoding/character/utf-8/decompose.c +444 -0
  10. data/ext/encoding/character/utf-8/depend +65 -0
  11. data/ext/encoding/character/utf-8/extconf.rb +67 -0
  12. data/ext/encoding/character/utf-8/private.c +62 -0
  13. data/ext/encoding/character/utf-8/private.h +51 -0
  14. data/ext/encoding/character/utf-8/properties.c +1056 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +19 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_private.h +52 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  19. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  20. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  22. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  23. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  24. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  25. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  26. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  27. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  28. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  29. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  30. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  31. data/ext/encoding/character/utf-8/rb_utf_insert.c +48 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +332 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  35. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  36. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  37. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  38. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  39. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  40. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  41. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  43. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  44. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  45. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  46. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  47. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  48. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  49. data/ext/encoding/character/utf-8/tables.h +38 -0
  50. data/ext/encoding/character/utf-8/unicode.c +319 -0
  51. data/ext/encoding/character/utf-8/unicode.h +216 -0
  52. data/ext/encoding/character/utf-8/utf.c +1334 -0
  53. data/lib/encoding/character/utf-8.rb +201 -0
  54. data/lib/u.rb +16 -0
  55. data/lib/u/string.rb +185 -0
  56. data/lib/u/version.rb +5 -0
  57. data/test/unit/u.rb +5 -0
  58. data/test/unit/u/string.rb +91 -0
  59. metadata +174 -0
@@ -0,0 +1,1070 @@
1
+ #! /usr/bin/ruby -w
2
+ =begin
3
+ :contents: Generate Unicode table headers.
4
+ :arch-tag: 98c7456d-c7d9-4b40-9971-409428593ad5
5
+
6
+ Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
7
+
8
+ This program is free software; you can redistribute it and/or modify
9
+ it under the terms of the GNU General Public License as published by
10
+ the Free Software Foundation; either version 2 of the License, or
11
+ (at your option) any later version.
12
+
13
+ This program is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ GNU General Public License for more details.
17
+
18
+ You should have received a copy of the GNU General Public License
19
+ along with this program; if not, write to the Free Software
20
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
+ =end
22
+
23
+
24
+
25
# Print a printf-style diagnostic, prefixed with the program's basename,
# on standard error and terminate the process with a failure status.
def error(fmt, *args)
  message = sprintf(fmt, *args)
  $stderr.printf("%s: %s\n", File.basename($0), message)
  exit(1)
end
29
+
30
class File
  # Yield each meaningful line of the file at +path+ to the given block,
  # skipping comment lines (beginning with '#') and blank lines.
  # An IOError while reading is reported as a fatal error.
  def self.process(path)
    File.open(path) do |io|
      io.each_line do |line|
        yield line unless line =~ /^(#|\s*$)/
      end
    end
  rescue IOError => e
    error("I/O error while processing input:\n" +
          " file: %s\n" +
          " error: %s\n", path, e.message)
  end
end
46
+
47
class String
  # Render every byte of the string as a two-digit hexadecimal escape
  # suitable for a C string literal, e.g. "AB" => "\x41\x42".
  def escape
    hex = unpack('H*').first
    hex.gsub(/../) { |pair| '\x' + pair }
  end

  # Display width of the string, with each tab counted as eight columns.
  def width
    expanded = gsub("\t", ' ' * 8)
    expanded.length
  end
end
56
+
57
class Array
  # Abort with a diagnostic unless the receiver's size matches +wanted+.
  # The comparison uses ===, so +wanted+ may be an exact Integer (e.g. 15)
  # or a Range such as (5..6). +path+ and +index+ are used only to build
  # the error report; self[index] is shown as the offending entry.
  def verify_size(wanted, path, index)
    if !(wanted === self.size)
      error("entry doesn't contain the required %s fields:\n" +
            " file: %s\n" +
            " entry: %s\n" +
            " field count: %d\n",
            wanted.to_s,
            path,
            (self.size > index) ? self[index] : 'N/A',
            self.size)
    end
  end

  # Abort unless the field at +index+ (a hexadecimal string) decodes to
  # +code+. Used to cross-check SpecialCasing.txt entries against the
  # category recorded from UnicodeData.txt.
  # NOTE(review): the fourth format argument repeats raw_code where the
  # message shape suggests self[index] was intended — confirm upstream.
  def verify_field(index, code, path, raw_code, type, ccase)
    if self[index].to_i(16) != code
      error("entry has type %s but UCD_%s(%s) != %s:\n" +
            " file: %s\n" +
            " entry: %s\n",
            type, ccase, raw_code, raw_code, path, raw_code)
    end
  end
end
80
+
81
class Hash
  # Renumber the hash in place: walking the keys in sorted order, entries
  # whose value is exactly 1 are removed, and each remaining entry's value
  # is replaced by a sequential index starting at 0. Returns the number of
  # entries kept (i.e. the next free index).
  def enumerate_ordered
    count = 0
    keys.sort.each do |key|
      if self[key] == 1
        delete(key)
      else
        self[key] = count
        count += 1
      end
    end
    count
  end
end
95
+
96
# XXX: this is too memory consuming to keep like this. We need to split it up
# like the perl script does in hashes and arrays. Argh!
#
# Record grouping every per-code-point property this generator collects.
# All properties start out nil and are filled in by the processors.
class UnicodeCodepoint
  attr_accessor :code
  attr_accessor :type, :value, :lower, :upper, :cclass, :compat
  attr_accessor :compositions, :decompositions, :break_props

  def initialize(code)
    @code = code
    @type = @value = @lower = @upper = @cclass = @compat = nil
    @compositions = @decompositions = @break_props = nil
  end
end
109
+
110
# XXX: cleanup
#
# Aggregates everything the input-file processors collect (types, case
# values, combining classes, compositions, line-break properties, special
# cases, case folds, bidi mirrors) before Printer emits the C headers.
class CollectedData
  # dir:    directory holding the UCD data files.
  # indent: indentation unit used when emitting C source.
  def initialize(dir = '.', indent = "\t")
    @dir = dir
    @indent = indent
    @cps = []

    @excludes = nil

    @pages_before_e0000 = 0
    @last = 0x10ffff

    @type = []
    @value = []
    @title_to_lower = {}
    @title_to_upper = {}
    @cclass = []
    @decompose_compat = []
    @compositions = {}
    @decompositions = []

    @break_props = []

    @special_case_offsets = []
    @special_cases = []

    @casefold = []
    @casefold_longest = -1

    @bidimirror = []
  end

  # Read-only configuration; never reassigned after construction.
  attr_reader :dir, :indent, :last

  # attr_accessor replaces the long-deprecated `attr :name, true` writer
  # form, which emits warnings under `ruby -w` on modern interpreters
  # (this script runs with -w per its shebang). Interface is unchanged:
  # the same reader and writer methods exist as before.
  attr_accessor :cps, :excludes, :pages_before_e0000
  attr_accessor :type, :value, :title_to_lower, :title_to_upper, :cclass,
                :decompose_compat, :compositions, :decompositions
  attr_accessor :break_props, :special_case_offsets, :special_cases
  attr_accessor :casefold, :casefold_longest, :bidimirror
end
157
+
158
# Reads CompositionExclusions.txt and records the excluded code points in
# data.excludes as a code-point => true lookup table.
class CompositionExclusions
  def process(data)
    data.excludes = {}
    path = File.join(data.dir, 'CompositionExclusions.txt')
    File.process(path) do |line|
      # Strip surrounding whitespace and any trailing comment, leaving
      # just the hexadecimal code point.
      stripped = line.chomp.sub(/^\s*(.*?)\s*(#.*)?$/, '\1')
      data.excludes[stripped.to_i(16)] = true
    end
  end
end
166
+
167
# Parses UnicodeData.txt, recording for every code point its general
# category, numeric/case value, combining class and (de)composition data,
# including the gaps that the file leaves implicit (unassigned ranges and
# <..., First>/<..., Last> range pairs).
class UnicodeData
  # Field indices of the 15 semicolon-separated UnicodeData.txt columns.
  CODE, NAME, CATEGORY, COMBINING_CLASSES, BIDI_CATEGORY,
  DECOMPOSITION, DECIMAL_VALUE, DIGIT_VALUE, NUMERIC_VALUE, MIRRORED,
  OLD_NAME, COMMENT, UPPER, LOWER, TITLE = (0..14).to_a

  def process(data)
    prev_code = -1
    path = File.join(data.dir, 'UnicodeData.txt')
    File.process(path) do |line|
      fields = line.chomp.split(/;/, -1)
      fields.verify_size(15, path, CODE)
      code = fields[CODE].to_i(16)

      # Remember how many 256-code-point pages exist below U+E0000; the
      # emitted tables are split into part1 (below) and part2 (above).
      if code >= 0xe0000 and prev_code < 0xe0000
        data.pages_before_e0000 = (prev_code >> 8) + 1
      end

      # Fill any gap before this entry. If this entry closes a
      # <..., Last> range, the gap inherits this entry's fields;
      # otherwise the gap is unassigned ('Cn').
      if code > prev_code + 1
        process_gap(data,
                    prev_code + 1,
                    code - 1,
                    fields[NAME] =~ /Last>$/ ? fields : new_gap_fields)
      end
      process_one(data, code, fields)
      prev_code = code
    end
    # Everything after the final entry up to U+10FFFF is unassigned.
    process_gap(data, prev_code + 1, 0x10ffff, new_gap_fields)
  end

  private

  # Field template for unassigned code points: category 'Cn',
  # combining class '0', everything else empty.
  def new_gap_fields
    ['', '', 'Cn', '0', '', '', '', '', '', '', '', '', '', '', '']
  end

  # Apply +fields+ to every code point in [low, hi], patching the CODE
  # field so process_one sees a consistent record.
  def process_gap(data, low, hi, fields)
    low.upto(hi) do |i|
      fields[CODE] = sprintf('%04x', i)
      process_one(data, i, fields)
    end
  end

  # Record one code point's properties into +data+.
  def process_one(data, code, fields)
    # puts(code.to_s)
    # data.cps[code] ||= UnicodeCodepoint.new(code)
    data.type[code] = fields[CATEGORY]

    # The "value" slot is overloaded by category: digit value for Nd,
    # the uppercase mapping for Ll, the lowercase mapping for Lu;
    # titlecase letters keep both mappings in separate hashes.
    # TODO: Why not process things like 'Nl'?
    case data.type[code]
    when 'Nd'
      data.value[code] = fields[DECIMAL_VALUE].to_i
    when 'Ll'
      data.value[code] = fields[UPPER].to_i(16)
    when 'Lu'
      data.value[code] = fields[LOWER].to_i(16)
    when 'Lt'
      data.title_to_lower[code] = fields[LOWER].to_i(16)
      data.title_to_upper[code] = fields[UPPER].to_i(16)
    end

    data.cclass[code] = fields[COMBINING_CLASSES]

    unless fields[DECOMPOSITION] == ''
      # A leading <tag> marks a compatibility decomposition; strip the
      # tag. Only canonical, non-excluded decompositions take part in
      # composition.
      if fields[DECOMPOSITION] =~ /^\<.*\>\s*(.*)/
        data.decompose_compat[code] = true
        fields[DECOMPOSITION] = $1
      else
        data.decompose_compat[code] = false
        unless data.excludes.include?(code)
          data.compositions[code] = fields[DECOMPOSITION]
        end
      end
      data.decompositions[code] = fields[DECOMPOSITION]
    end
  end
end
243
+
244
# Parses LineBreak.txt into data.break_props, one property code per code
# point. Code points missing from the file default to 'XX' (unknown) when
# unassigned and 'AL' (alphabetic) otherwise.
class LineBreak
  BREAK_CODE, BREAK_PROPERTY = (0..1).to_a

  def process(data)
    prev_code = -1
    path = File.join(data.dir, 'LineBreak.txt')
    File.process(path) do |line|
      fields = line.chomp.sub(/\s*#.*/, '').split(/;/, -1)
      fields.verify_size(2, path, BREAK_CODE)

      # An entry is either a single code point or a 'XXXX..YYYY' range.
      spec = fields[BREAK_CODE]
      if spec =~ /([0-9A-F]{4,6})\.\.([0-9A-F]{4,6})/
        first, last = $1.to_i(16), $2.to_i(16)
      else
        first = last = spec.to_i(16)
      end

      process_gap(data, prev_code + 1, first - 1) if first > prev_code + 1

      (first..last).each do |cp|
        data.break_props[cp] = fields[BREAK_PROPERTY]
      end

      prev_code = last
    end

    process_gap(data, prev_code + 1, 0x10ffff)
  end

  private

  # Default properties for code points the file does not mention.
  def process_gap(data, low, hi)
    (low..hi).each do |cp|
      data.break_props[cp] = (data.type[cp] == 'Cn') ? 'XX' : 'AL'
    end
  end
end
282
+
283
# Parses SpecialCasing.txt: multi-character case mappings that cannot be
# expressed as a single code point. Each accepted entry is packed into a
# NUL-separated UTF-8 record and its offset (biased by 0x1000000) is
# stored in data.value so lookups can tell special cases from plain ones.
class SpecialCasing
  # Field indices of the SpecialCasing.txt columns; a 6th field, if
  # present, is a condition list.
  CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = (0..4).to_a

  def initialize
    # Running byte offset into the emitted special_case_table[].
    @offset = 0
  end

  def process(data)
    path = File.join(data.dir, 'SpecialCasing.txt')
    File.process(path) do |line|
      fields = line.chomp.sub(/\s*#.*/, '').split(/\s*;\s*/, -1)
      fields.verify_size((5..6), path, CASE_CODE)
      raw_code, code = fields[CASE_CODE], fields[CASE_CODE].to_i(16)
      unless data.type[code].nil?
        # We ignore conditional special cases
        next if fields.size == 6

        # For each letter category, sanity-check the field that should
        # equal the code point itself, then store the two mappings for
        # the cases other than the code point's own.
        case data.type[code]
        when 'Lu'
          fields.verify_field(CASE_UPPER, code, path, raw_code, 'Lu', 'Upper')
          add_special_case(data, code, data.value[code],
                           fields[CASE_LOWER], fields[CASE_TITLE])
        when 'Lt'
          fields.verify_field(CASE_TITLE, code, path, raw_code, 'Lt', 'Title')
          add_special_case(data, code, nil,
                           fields[CASE_LOWER], fields[CASE_UPPER])
        when 'Ll'
          fields.verify_field(CASE_LOWER, code, path, raw_code, 'Ll', 'Lower')
          add_special_case(data, code, data.value[code],
                           fields[CASE_UPPER], fields[CASE_TITLE])
        else
          error("special case for non-alphabetic code point:\n" +
                " file: %s\n" +
                " type: %s\n" +
                " code point/entry: %s\n",
                path, data.type[code], raw_code)
        end
      else
        error("special case for code point which doesn't have a type:\n" +
              " file: %s\n" +
              " code point/entry: %d\n",
              path, code)
      end
    end
  end

  private

  # Pack one special-case record: the best single-character mapping (if
  # any), then the two multi-character mappings, all UTF-8 encoded and
  # separated by a NUL. The record's offset replaces data.value[code],
  # biased by 0x1000000 (UNICODE_SPECIAL_CASE_TABLE_START).
  def add_special_case(data, code, single, field1, field2)
    values = [
      single.nil? ? nil : [single],
      field1.split(/\s+/).map{ |s| s.to_i(16) },
      [0],
      field2.split(/\s+/).map{ |s| s.to_i(16) },
    ]
    result = ''
    values.each{ |value| result += value.pack('U*') unless value.nil? }

    data.special_case_offsets.push(@offset)
    data.value[code] = 0x1000000 + @offset
    data.special_cases.push(result.escape)
    # +1 for the NUL terminator the emitted C table appends per record.
    @offset += 1 + result.length
  end
end
347
+
348
# Parses CaseFolding.txt, keeping only the folds that cannot be derived
# by simply lowercasing the code point; those go into data.casefold as
# [code, escaped-UTF-8] pairs for the emitted casefold_table[].
class CaseFolding
  # Field indices of the CaseFolding.txt columns.
  FOLDING_CODE, FOLDING_STATUS, FOLDING_MAPPING = (0..2).to_a

  def process(data)
    path = File.join(data.dir, 'CaseFolding.txt')
    File.process(path) do |line|
      fields = line.chomp.sub(/\s*#.*/, '').split(/\s*;\s*/, -1)
      fields.verify_size(4, path, FOLDING_CODE)

      # skip Simple and Turkic rules
      next if fields[FOLDING_STATUS] =~ /^[ST]$/

      raw_code, code = fields[FOLDING_CODE], fields[FOLDING_CODE].to_i(16)
      values = fields[FOLDING_MAPPING].split(/\s+/).map{ |s| s.to_i(16) }
      # A single-character fold can be skipped when it equals the
      # code point's lowercase mapping — unless the code point has a
      # special-case value (>= 0x1000000), which must not be compared.
      if values.size == 1 &&
         !(!data.value[code].nil? && data.value[code] >= 0x1000000) &&
         !data.type[code].nil?
        case data.type[code]
        when 'Ll'
          lower = code
        when 'Lt'
          lower = data.title_to_lower[code]
        when 'Lu'
          lower = data.value[code]
        else
          lower = code
        end
        next if lower == values[0]
      end

      # Track the longest fold (plus NUL) to size the C char array.
      string = values.pack('U*')
      if string.length + 1 > data.casefold_longest
        data.casefold_longest = string.length + 1
      end
      data.casefold.push([code, string.escape])
    end
  end
end
386
+
387
# Parses BidiMirroring.txt into data.bidimirror as [char, mirrored-char]
# code point pairs for the emitted bidi_mirroring_table[].
class BidiMirroring
  def process(data)
    path = File.join(data.dir, 'BidiMirroring.txt')
    File.process(path) do |line|
      fields = line.chomp.sub(/\s*#.*/, '').split(/\s*;\s*/, -1)
      fields.verify_size(2, path, 0)
      pair = fields.first(2).map { |field| field.to_i(16) }
      data.bidimirror.push(pair)
    end
  end
end
397
+
398
# Emits the generated C headers (character-tables.h, decompose.h,
# compose.h, break.h) from a fully-populated CollectedData. Each
# print_* method redirects $stdout into its output file while it runs.
# NOTE(review): this source was recovered from a whitespace-mangled
# listing; whitespace inside heredoc/string literals is a best-effort
# reconstruction — confirm emitted headers against a known-good build.
class Printer
  def initialize
    # Running page index shared by print_table/print_row.
    @index = 0
  end

  # Compute the part1/part2 split constants, then write all four headers.
  def process(data)
    @last_char_part1_i = data.pages_before_e0000 * 256 - 1
    @last_char_part1_x = sprintf('0x%04x', @last_char_part1_i)
    @last_char_part1_X = sprintf('%04X', @last_char_part1_i)
    print_tables(data)
    print_decomp(data)
    print_composition_table(data)
    print_line_break(data)
  end

  private

  # Map general category code onto symbolic name.
  Mappings = {
    # Normative.
    'Lu' => 'UNICODE_UPPERCASE_LETTER',
    'Ll' => 'UNICODE_LOWERCASE_LETTER',
    'Lt' => 'UNICODE_TITLECASE_LETTER',
    'Mn' => 'UNICODE_NON_SPACING_MARK',
    'Mc' => 'UNICODE_COMBINING_MARK',
    'Me' => 'UNICODE_ENCLOSING_MARK',
    'Nd' => 'UNICODE_DECIMAL_NUMBER',
    'Nl' => 'UNICODE_LETTER_NUMBER',
    'No' => 'UNICODE_OTHER_NUMBER',
    'Zs' => 'UNICODE_SPACE_SEPARATOR',
    'Zl' => 'UNICODE_LINE_SEPARATOR',
    'Zp' => 'UNICODE_PARAGRAPH_SEPARATOR',
    'Cc' => 'UNICODE_CONTROL',
    'Cf' => 'UNICODE_FORMAT',
    'Cs' => 'UNICODE_SURROGATE',
    'Co' => 'UNICODE_PRIVATE_USE',
    'Cn' => 'UNICODE_UNASSIGNED',

    # Informative.
    'Lm' => 'UNICODE_MODIFIER_LETTER',
    'Lo' => 'UNICODE_OTHER_LETTER',
    'Pc' => 'UNICODE_CONNECT_PUNCTUATION',
    'Pd' => 'UNICODE_DASH_PUNCTUATION',
    'Ps' => 'UNICODE_OPEN_PUNCTUATION',
    'Pe' => 'UNICODE_CLOSE_PUNCTUATION',
    'Pi' => 'UNICODE_INITIAL_PUNCTUATION',
    'Pf' => 'UNICODE_FINAL_PUNCTUATION',
    'Po' => 'UNICODE_OTHER_PUNCTUATION',
    'Sm' => 'UNICODE_MATH_SYMBOL',
    'Sc' => 'UNICODE_CURRENCY_SYMBOL',
    'Sk' => 'UNICODE_MODIFIER_SYMBOL',
    'So' => 'UNICODE_OTHER_SYMBOL'
  }

  # Map LineBreak.txt two-letter classes onto symbolic names.
  BreakMappings = {
    'BK' => 'UNICODE_BREAK_MANDATORY',
    'CR' => 'UNICODE_BREAK_CARRIAGE_RETURN',
    'LF' => 'UNICODE_BREAK_LINE_FEED',
    'CM' => 'UNICODE_BREAK_COMBINING_MARK',
    'SG' => 'UNICODE_BREAK_SURROGATE',
    'ZW' => 'UNICODE_BREAK_ZERO_WIDTH_SPACE',
    'IN' => 'UNICODE_BREAK_INSEPARABLE',
    'GL' => 'UNICODE_BREAK_NON_BREAKING_GLUE',
    'CB' => 'UNICODE_BREAK_CONTINGENT',
    'SP' => 'UNICODE_BREAK_SPACE',
    'BA' => 'UNICODE_BREAK_AFTER',
    'BB' => 'UNICODE_BREAK_BEFORE',
    'B2' => 'UNICODE_BREAK_BEFORE_AND_AFTER',
    'HY' => 'UNICODE_BREAK_HYPHEN',
    'NS' => 'UNICODE_BREAK_NON_STARTER',
    'OP' => 'UNICODE_BREAK_OPEN_PUNCTUATION',
    'CL' => 'UNICODE_BREAK_CLOSE_PUNCTUATION',
    'QU' => 'UNICODE_BREAK_QUOTATION',
    'EX' => 'UNICODE_BREAK_EXCLAMATION',
    'ID' => 'UNICODE_BREAK_IDEOGRAPHIC',
    'NU' => 'UNICODE_BREAK_NUMERIC',
    'IS' => 'UNICODE_BREAK_INFIX_SEPARATOR',
    'SY' => 'UNICODE_BREAK_SYMBOL',
    'AL' => 'UNICODE_BREAK_ALPHABETIC',
    'PR' => 'UNICODE_BREAK_PREFIX',
    'PO' => 'UNICODE_BREAK_POSTFIX',
    'SA' => 'UNICODE_BREAK_COMPLEX_CONTEXT',
    'AI' => 'UNICODE_BREAK_AMBIGUOUS',
    'NL' => 'UNICODE_BREAK_NEXT_LINE',
    'WJ' => 'UNICODE_BREAK_WORD_JOINER',
    'XX' => 'UNICODE_BREAK_UNKNOWN',
    'JL' => 'UNICODE_BREAK_HANGUL_L_JAMO',
    'JV' => "UNICODE_BREAK_HANGUL_V_JAMO",
    'JT' => "UNICODE_BREAK_HANGUL_T_JAMO",
    'H2' => "UNICODE_BREAK_HANGUL_LV_SYLLABLE",
    'H3' => "UNICODE_BREAK_HANGUL_LVT_SYLLABLE"
  };

  # Sentinel offset meaning "no decomposition present".
  NOT_PRESENT_OFFSET = 65535

  # Emit one two-level lookup table: first the per-page data rows
  # (header), then the part1 page index (part1_h) covering [low, mid],
  # then — if mid != hi — the part2 page index (part2_h) covering
  # [0xe0000, hi]. The block +f+ supplies the cell text per code point.
  def print_table(data, low, mid, hi, size, header, part1_h, part2_h, &f)
    @index = 0
    rows = []
    print(header)
    low.step(hi, 256) do |i|
      rows[i / 256] = print_row(data, i, size){ |i| f.call(i) }
    end
    print("\n};\n")
    print(part1_h)
    low.step(mid, 256) do |i|
      printf("%s%s,\n", data.indent, rows[i / 256])
    end
    print("};\n")
    if mid != hi
      print(part2_h)
      0xe0000.step(hi, 256) do |i|
        printf("%s%s,\n", data.indent, rows[i / 256])
      end
      print("};\n")
    end
  end

  # Write character-tables.h: type table, attribute (case/digit value)
  # table, titlecase table, special-case table, casefold table and the
  # bidi mirroring table.
  def print_tables(data, outfile = 'character-tables.h')
    row = []
    saved_stdout = $stdout
    File.open(outfile, 'w') do |file|
      header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
      $stdout = file
      print <<EOF
/* Automatically generated file */

#ifndef #{header_h}
#define #{header_h}

#define UNICODE_DATA_VERSION "#{UnicodeVersion}"

#define UNICODE_LAST_CHAR #{sprintf('0x%04x', data.last)}

#define UNICODE_MAX_TABLE_INDEX 10000

#define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}

#define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}

#define UNICODE_FIRST_CHAR_PART2 0xe0000

#define UNICODE_SPECIAL_CASE_TABLE_START 0x1000000
EOF
      print_table(data, 0, @last_char_part1_i, data.last, 1,
                  <<EOH, <<EOH1, <<EOH2){ |i| Mappings[data.type[i]] }


static const char type_data[][256] = {
EOH


/* U+0000 through U+#{@last_char_part1_X} */
static const int16_t type_table_part1[#{data.pages_before_e0000}] = {
EOH1


/* U+E0000 through U+#{sprintf('%04X', data.last)} */
static const int16_t type_table_part2[768] = {
EOH2

      print_table(data, 0, @last_char_part1_i, data.last, 4,
                  <<EOH, <<EOH1, <<EOH2) { |i| data.value[i].nil? ? '0x0000' : sprintf('0x%04x', data.value[i]) }


static const unichar attr_data[][256] = {
EOH


/* U+0000 through U+#{@last_char_part1_X} */
static const int16_t attr_table_part1[#{data.pages_before_e0000}] = {
EOH1


/* U+E0000 through U+#{sprintf('%04X', data.last)} */
static const int16_t attr_table_part2[768] = {
EOH2

      print <<EOF


static const unichar title_table[][3] = {
EOF
      data.title_to_lower.keys.sort.each do |code|
        printf("%s{ 0x%04x, 0x%04x, 0x%04x },\n", data.indent,
               code, data.title_to_upper[code], data.title_to_lower[code])
      end
      print("};\n")

      print_special_case_table(data)
      print_case_fold_table(data)

      print <<EOF
static const struct {
#{data.indent}unichar ch;
#{data.indent}unichar mirrored_ch;
} bidi_mirroring_table[] = {
EOF
      data.bidimirror.each do |item|
        printf("%s{ 0x%04x, 0x%04x },\n", data.indent, item[0], item[1])
      end
      print <<EOF
};

#endif /* #{header_h} */
EOF
    end
    $stdout = saved_stdout
  end

  # Emit one 256-entry page. If every cell is identical the page is
  # collapsed to "value + UNICODE_MAX_TABLE_INDEX" instead of a row;
  # otherwise the row body is printed and its index returned for the
  # page index tables.
  def print_row(data, start, type_size)
    flag = true
    values = []
    0.upto(255) do |i|
      values[i] = yield(start + i)
      flag = false if values[i] != values[0]
    end
    return values[0] + " + UNICODE_MAX_TABLE_INDEX" if flag

    puts(',') if @index != 0
    printf("%s{ /* page %d, index %d */\n%s",
           data.indent, start / 256, @index, data.indent * 2)
    # Wrap cells at column 79.
    column = data.indent.width * 2
    start.upto(start + 255) do |i|
      text = values[i - start]
      if text.length + column + 2 > 79
        printf("\n%s", data.indent * 2)
        column = data.indent.width * 2
      end

      printf("%s, ", text)
      column += text.width + 2
    end

    print("\n#{data.indent}}")
    @index += 1
    return sprintf("%d /* page %d */", @index - 1, start / 256);
  end

  # Emit special_case_table[]: the packed NUL-separated records built by
  # SpecialCasing#add_special_case.
  def print_special_case_table(data)
    print <<EOF


/*
 * Table of special cases for case conversion; each record contains
 * First, the best single character mapping to lowercase if Lu,
 * and to uppercase if Ll, followed by the output mapping for the two cases
 * other than the case of the codepoint, in the order Ll, Lu, Lt, encoded in
 * UTF-8, separated and terminated by a NUL character.
 */
static const char special_case_table[] = {
EOF
    data.special_cases.each_with_index do |sc, i|
      printf(%Q< "%s\\0" /* offset %d */\n>, sc, data.special_case_offsets[i])
    end
    print <<EOF
};

EOF
  end

  # Emit casefold_table[]: folds that can't be derived by lowercasing.
  def print_case_fold_table(data)
    print <<EOF

/*
 * Table of casefolding cases that can't be derived by lowercasing.
 */
static const struct {
#{data.indent}uint16_t ch;
#{data.indent}char data[#{data.casefold_longest}];
} casefold_table[] = {
EOF
    data.casefold.sort_by{ |a| a[0] }.each do |cf|
      if cf[0] > 0xffff
        error('casefold_table.ch field too short.' +
              ' Upgrade to unichar to fit values beyond 0xffff.')
      end
      printf(%Q<%s{ 0x%04x, "%s" },\n>, data.indent, cf[0], cf[1])
    end
    print <<EOF
};
EOF
  end

  # Write decompose.h: combining-class tables, decomp_table[] (offsets
  # into a shared expansion string) and the expansion string itself.
  def print_decomp(data, outfile = 'decompose.h')
    row = []
    saved_stdout = $stdout
    File.open(outfile, 'w') do |file|
      header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
      $stdout = file
      print <<EOF
/* Automatically generated file */

#ifndef #{header_h}
#define #{header_h}


#define UNICODE_LAST_CHAR #{sprintf('0x%04x', data.last)}

#define UNICODE_MAX_TABLE_INDEX (0x110000 / 256)

#define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}

#define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}

#define UNICODE_FIRST_CHAR_PART2 0xe0000

#define UNICODE_NOT_PRESENT_OFFSET #{NOT_PRESENT_OFFSET}
EOF
      print_table(data, 0, @last_char_part1_i, data.last, 1,
                  <<EOH, <<EOH1, <<EOH2){ |i| data.cclass[i] }


static const uint8_t cclass_data[][256] = {
EOH


static const int16_t combining_class_table_part1[#{data.pages_before_e0000}] = {
EOH1


static const int16_t combining_class_table_part2[768] = {
EOH2

      print <<EOL


static const struct {
#{data.indent}unichar ch;
#{data.indent}uint16_t canon_offset;
#{data.indent}uint16_t compat_offset;
} decomp_table[] = {
EOL
      decomp_offsets = {}
      decomp_string = ''
      @decomp_string_offset = 0
      0.upto(data.last) do |i|
        unless data.decompositions[i].nil?
          # Canonical decomposition only for non-compat entries; drop
          # the compat variant when it adds nothing over canonical.
          canon_decomp = data.decompose_compat[i] ?
            nil : make_decomp(data, i, false)
          compat_decomp = make_decomp(data, i, true)
          if not canon_decomp.nil? and compat_decomp == canon_decomp
            compat_decomp = nil
          end
          canon_offset = handle_decomp(canon_decomp, decomp_offsets,
                                       decomp_string)
          compat_offset = handle_decomp(compat_decomp, decomp_offsets,
                                        decomp_string)

          if @decomp_string_offset > NOT_PRESENT_OFFSET
            error('decomposition string offset beyond not-present-offset,' +
                  " upgrade value:\n" +
                  " offset: %d\n" +
                  " max: %d\n",
                  @decomp_string_offset, NOT_PRESENT_OFFSET)
          end
          printf("%s{ 0x%04x, %s, %s },\n",
                 data.indent, i, canon_offset, compat_offset)
        end
      end
      print("\n};")

      print <<EOL

static const char decomp_expansion_string[] = #{decomp_string};


#endif /* #{header_h} */
EOL
    end
    $stdout = saved_stdout
  end

  # Recursively expand a decomposition to its fully-decomposed code
  # point sequence; compat expansion also follows compat mappings.
  def expand_decomp(data, code, compat)
    ary = []
    data.decompositions[code].split(/ /).each do |item|
      pos = item.to_i(16)
      if not data.decompositions[pos].nil? and
         (compat or not data.decompose_compat[pos])
        ary.concat(expand_decomp(data, pos, compat))
      else
        ary.push(pos)
      end
    end
    ary
  end

  # UTF-8 encode the fully-expanded decomposition of +code+.
  def make_decomp(data, code, compat)
    str = ''
    expand_decomp(data, code, compat).each do |item|
      str += item.is_a?(Array) ? item.flatten.pack('U') : [item].pack('U')
    end
    str
  end

  # Intern +decomp+ in the shared expansion string, returning its byte
  # offset (or the UNICODE_NOT_PRESENT_OFFSET sentinel for nil).
  def handle_decomp(decomp, decomp_offsets,
                    decomp_string)
    offset = 'UNICODE_NOT_PRESENT_OFFSET'
    unless decomp.nil?
      if decomp_offsets.member?(decomp)
        offset = decomp_offsets[decomp]
      else
        offset = @decomp_string_offset
        decomp_offsets[decomp] = offset
        decomp_string << ("\n \"" + decomp.escape +
                          "\\0\" /* offset #{offset} */")
        @decomp_string_offset += decomp.length + 1
      end
    end
    offset
  end

  # Write compose.h: classify composition pairs into the dense
  # first x second matrix plus the two singleton tables, then emit the
  # lookup tables and the matrix itself.
  def print_composition_table(data, outfile = 'compose.h')
    first = Hash.new(0)
    second = Hash.new(0)

    data.compositions.each do |code, value|
      values = value.split(/\s+/).map{ |s| s.to_i(16) }

      # skip non-starters and single-character decompositions
      if data.cclass[values[0]] != '0' or values.size == 1
        data.compositions.delete(code)
        next
      end

      if values.size != 2
        error("decomposition of entry contains more than two elements:\n" +
              " entry: %d\n" +
              " elements: %d\n",
              code, values.size)
      end

      first[values[0]] += 1
    end

    # enumerate_ordered drops entries seen only once (singletons) and
    # renumbers the rest as matrix row/column indices.
    n_first = first.enumerate_ordered

    data.compositions.each do |code, value|
      values = value.split(/\s+/).map{ |s| s.to_i(16) }

      second[values[1]] += 1 if first.member?(values[0])
    end

    n_second = second.enumerate_ordered

    first_singletons = []
    second_singletons = []
    reverse = {}
    data.compositions.each do |code, value|
      values = value.split(/\s+/).map{ |s| s.to_i(16) }

      if first.member?(values[0]) and second.member?(values[1])
        reverse["#{first[values[0]]}|#{second[values[1]]}"] = code
      elsif not first.member?(values[0])
        first_singletons.push([values[0], values[1], code])
      else
        second_singletons.push([values[1], values[0], code])
      end
    end

    first_singletons = first_singletons.sort_by{ |a| a[0] }
    second_singletons = second_singletons.sort_by{ |a| a[0] }

    row = []
    saved_stdout = $stdout
    File.open(outfile, 'w') do |file|
      header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
      $stdout = file
      # Assign each participating code point an index into one of the
      # four regions: first, first-singleton, second, second-singleton.
      values = {}
      total = first_start = 1
      last = 0

      first.each do |code, value|
        values[code] = value + total
        last = code if code > last
      end
      total += n_first

      first_single_start = total
      first_singletons.each_with_index do |item, i|
        code = item[0]
        values[code] = i + total
        last = code if code > last
      end
      total += first_singletons.size

      second_start = total
      second.each do |code, value|
        values[code] = value + total
        last = code if code > last
      end
      total += n_second

      second_single_start = total
      second_singletons.each_with_index do |item, i|
        code = item[0]
        values[code] = i + total
        last = code if code > last
      end

      print <<EOL
/* Automatically generated file */

#ifndef #{header_h}
#define #{header_h}


#define COMPOSE_FIRST_START #{first_start}
#define COMPOSE_FIRST_SINGLE_START #{first_single_start}
#define COMPOSE_SECOND_START #{second_start}
#define COMPOSE_SECOND_SINGLE_START #{second_single_start}
#define COMPOSE_TABLE_LAST #{last / 256}
EOL

      print_table(data, 0, last, last, 2,
                  <<EOH, <<EOH1, nil){ |i| values.member?(i) ? values[i].to_s : '0' }


static const uint16_t compose_data[][256] = {
EOH


static const int16_t compose_table[COMPOSE_TABLE_LAST + 1] = {
EOH1

      print <<EOL


static const uint16_t compose_first_single[][2] = {
EOL
      first_singletons.each_with_index do |item, i|
        if item[1] > 0xffff or item[2] > 0xffff
          error("compose_first_single table field too short." +
                " Upgrade to unichar to fit values beyond 0xffff.")
        end
        printf("%s{ %#06x, %#06x },\n", data.indent, item[1], item[2])
      end
      print("};\n")

      print <<EOL


static const uint16_t compose_second_single[][2] = {
EOL
      second_singletons.each_with_index do |item, i|
        if item[1] > 0xffff or item[2] > 0xffff
          error("compose_second_single table field too short." +
                " Upgrade to unichar to fit values beyond 0xffff.")
        end
        printf("%s{ %#06x, %#06x },\n", data.indent, item[1], item[2])
      end
      print("};\n")

      print <<EOL


static const uint16_t compose_array[#{n_first}][#{n_second}] = {
EOL
      0.upto(n_first - 1) do |i|
        printf("%s{\n%s", data.indent, data.indent * 2)
        column = data.indent.width * 2
        0.upto(n_second - 1) do |j|
          if column + 8 > 79
            printf("\n%s", data.indent * 2)
            column = data.indent.width * 2
          end
          if reverse.member?("#{i}|#{j}")
            if reverse["#{i}|#{j}"] > 0xffff
              error("compose_array table field too short." +
                    " Upgrade to unichar to fit values beyond 0xffff.")
            end
            printf("0x%04x, ", reverse["#{i}|#{j}"])
          else
            print(" 0, ")
          end
          column += 8
        end
        printf("\n%s},\n", data.indent)
      end
      print("};\n")

      print <<EOL


#endif /* #{header_h} */
EOL
    end
    $stdout = saved_stdout
  end

  # Write break.h: the two-level line-break property tables.
  def print_line_break(data, outfile = 'break.h')
    row = []
    saved_stdout = $stdout
    File.open(outfile, 'w') do |file|
      header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
      $stdout = file
      print <<EOF
/* Automatically generated file */

#ifndef #{header_h}
#define #{header_h}

#define UNICODE_DATA_VERSION "#{UnicodeVersion}"

#define UNICODE_LAST_CHAR #{sprintf('0x%04x', data.last)}

#define UNICODE_MAX_TABLE_INDEX 10000

/*
 * The last code point that should be looked up in break_property_table_part1.
 */
#define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}

/*
 * The first code point that should be looked up in break_property_table_part2.
 */
#define UNICODE_FIRST_CHAR_PART2 0xe0000
EOF
      print_table(data, 0, @last_char_part1_i, data.last, 1,
                  <<EOH, <<EOH1, <<EOH2){ |i| BreakMappings[data.break_props[i]] }


static const int8_t break_property_data[][256] = {
EOH


/* U+0000 through U+#{@last_char_part1_X} */
static const int16_t break_property_table_part1[#{data.pages_before_e0000}] = {
EOH1


/* U+E0000 through U+#{sprintf('%04X', data.last)} */
static const int16_t break_property_table_part2[768] = {
EOH2

      print <<EOF


#endif /* #{header_h} */
EOF
    end
    $stdout = saved_stdout
  end
end
1041
+
1042
+ UnicodeVersion = ARGV[0]
1043
+
1044
# Drives the generation: verifies that all required UCD input files are
# present in the directory given as the second command-line argument,
# then runs every processor in order over one shared CollectedData.
class Runner
  def main
    dir = ARGV[1]
    check_for_data_files(dir)
    data = CollectedData.new(dir, "\t")
    processors = [CompositionExclusions, UnicodeData, LineBreak,
                  SpecialCasing, CaseFolding, BidiMirroring, Printer]
    processors.each { |klass| klass.new.process(data) }
  end

  private

  # Abort early with a clear message if any input file is unreadable.
  def check_for_data_files(dir)
    %w[UnicodeData.txt LineBreak.txt SpecialCasing.txt CaseFolding.txt
       CompositionExclusions.txt BidiMirroring.txt].each do |name|
      path = File.join(dir, name)
      error('missing required file: %s', path) unless File.readable?(path)
    end
  end
end
1065
+
1066
# Entry point: run the whole generation pipeline.
Runner.new.main

# vim: set sts=2 sw=2: