character-encodings 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. data/README +26 -0
  2. data/Rakefile +157 -0
  3. data/ext/encoding/character/unicode/codepoint.c +48 -0
  4. data/ext/encoding/character/utf-8/break.c +38 -0
  5. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  6. data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
  7. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  8. data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
  9. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
  10. data/ext/encoding/character/utf-8/decompose.c +476 -0
  11. data/ext/encoding/character/utf-8/depend +64 -0
  12. data/ext/encoding/character/utf-8/extconf.rb +47 -0
  13. data/ext/encoding/character/utf-8/private.h +68 -0
  14. data/ext/encoding/character/utf-8/properties.c +1061 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +18 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  19. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  20. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  22. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  23. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  24. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  25. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  26. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  27. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  28. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  29. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  30. data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
  31. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  35. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  36. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  37. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  38. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  39. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  40. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  41. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  43. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  44. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  45. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  46. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  47. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  48. data/ext/encoding/character/utf-8/unicode.c +319 -0
  49. data/ext/encoding/character/utf-8/unicode.h +208 -0
  50. data/ext/encoding/character/utf-8/utf.c +1332 -0
  51. data/lib/encoding/character/utf-8.rb +201 -0
  52. data/specifications/aref.rb +45 -0
  53. data/specifications/count.rb +29 -0
  54. data/specifications/delete.rb +25 -0
  55. data/specifications/each_char.rb +28 -0
  56. data/specifications/index.rb +35 -0
  57. data/specifications/insert.rb +67 -0
  58. data/specifications/length.rb +45 -0
  59. data/specifications/rindex.rb +52 -0
  60. data/specifications/squeeze.rb +25 -0
  61. data/specifications/to_i.rb +54 -0
  62. data/specifications/tr.rb +39 -0
  63. data/tests/foldcase.rb +28 -0
  64. data/tests/normalize.rb +101 -0
  65. data/tests/unicodedatatestbase.rb +45 -0
  66. metadata +112 -0
@@ -0,0 +1,1065 @@
1
+ #! /usr/bin/ruby -w
2
+ =begin
3
+ :contents: Generate Unicode table headers.
4
+ :arch-tag: 98c7456d-c7d9-4b40-9971-409428593ad5
5
+
6
+ Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
7
+
8
+ This program is free software; you can redistribute it and/or modify
9
+ it under the terms of the GNU General Public License as published by
10
+ the Free Software Foundation; either version 2 of the License, or
11
+ (at your option) any later version.
12
+
13
+ This program is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ GNU General Public License for more details.
17
+
18
+ You should have received a copy of the GNU General Public License
19
+ along with this program; if not, write to the Free Software
20
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
+ =end
22
+
23
+
24
+
25
# Report a fatal error and terminate.
#
# Formats +fmt+ with +args+ (printf-style), prefixes the message with the
# program's basename on standard error, and exits with status 1.
def error(fmt, *args)
  message = sprintf(fmt, *args)
  $stderr.printf("%s: %s\n", File.basename($0), message)
  exit(1)
end
29
+
30
class File
  # Yield each meaningful line of the file at +path+, skipping comment
  # lines (starting with '#') and blank lines.  Aborts via error() on
  # IOError.
  def self.process(path)
    File.open(path) do |file|
      file.each_line do |line|
        yield line unless line =~ /^(#|\s*$)/
      end
    end
  rescue IOError => e
    error("I/O error while processing input:\n" +
          " file: %s\n" +
          " error: %s\n", path, e.message)
  end
end
46
+
47
class String
  # Return a C-style hex escape of every byte: "A" becomes "\x41"
  # (literal backslash, x, and two hex digits per byte).
  def escape
    unpack('H*').first.gsub(/(.{2})/, '\\x\1')
  end

  # Display width of the string, counting each tab as eight spaces.
  def width
    gsub(/\t/, ' ' * 8).length
  end
end
56
+
57
class Array
  # Abort via error() unless this record has the field count given by
  # +wanted+ (an Integer or Range, matched with ===).  +path+ and +index+
  # are used only for the diagnostic.
  def verify_size(wanted, path, index)
    return if wanted === size
    error("entry doesn't contain the required %s fields:\n" +
          " file: %s\n" +
          " entry: %s\n" +
          " field count: %d\n",
          wanted.to_s,
          path,
          size > index ? self[index] : 'N/A',
          size)
  end

  # Abort via error() unless the hex field at +index+ decodes to +code+.
  # +type+ and +ccase+ name the Unicode category and case being checked,
  # for the diagnostic only.
  def verify_field(index, code, path, raw_code, type, ccase)
    return if self[index].to_i(16) == code
    error("entry has type %s but UCD_%s(%s) != %s:\n" +
          " file: %s\n" +
          " entry: %s\n",
          type, ccase, raw_code, raw_code, path, raw_code)
  end
end
80
+
81
class Hash
  # Replace each value with a sequential index, walking keys in sorted
  # order; entries whose value is exactly 1 are removed instead.  Returns
  # the number of entries kept (i.e. the next free index).
  def enumerate_ordered
    count = 0
    keys.sort.each do |key|
      if self[key] == 1
        delete(key)
      else
        self[key] = count
        count += 1
      end
    end
    count
  end
end
95
+
96
+ # XXX: this is too memory consuming to keep like this. We need to split it up
97
+ # like the perl script does in hashes and arrays. Argh!
98
# Plain record describing one Unicode code point; all properties start
# out nil and are filled in by the various processing stages.
class UnicodeCodepoint
  attr_accessor :code
  attr_accessor :type, :value, :lower, :upper, :cclass, :compat
  attr_accessor :compositions, :decompositions, :break_props

  def initialize(code)
    @code = code
    @type = @value = @lower = @upper = @cclass = @compat = nil
    @compositions = @decompositions = @break_props = nil
  end
end
109
+
110
+ # XXX: cleanup
111
# Shared state passed through every processing stage: everything harvested
# from the Unicode data files, plus formatting settings for the generated
# headers.
#
# Fix: the original used the long-deprecated boolean form of Module#attr
# (`attr :name, true`) to declare writers; replaced with the standard
# attr_reader / attr_accessor, which declare exactly the same methods.
class CollectedData
  # Read-only configuration.
  attr_reader :dir      # directory containing the Unicode data files
  attr_reader :indent   # indentation string used in generated headers
  attr_reader :last     # highest Unicode code point (0x10ffff)

  # Read/write collected tables (writers were `attr :name, true`).
  attr_accessor :cps, :excludes, :pages_before_e0000
  attr_accessor :type, :value, :title_to_lower, :title_to_upper, :cclass,
                :decompose_compat, :compositions, :decompositions
  attr_accessor :break_props
  attr_accessor :special_case_offsets, :special_cases
  attr_accessor :casefold, :casefold_longest
  attr_accessor :bidimirror

  def initialize(dir = '.', indent = "\t")
    @dir = dir
    @indent = indent
    @cps = []

    @excludes = nil

    # Number of 256-entry pages before the U+E0000 split; set while
    # reading UnicodeData.txt.
    @pages_before_e0000 = 0
    @last = 0x10ffff

    @type = []
    @value = []
    @title_to_lower = {}
    @title_to_upper = {}
    @cclass = []
    @decompose_compat = []
    @compositions = {}
    @decompositions = []

    @break_props = []

    @special_case_offsets = []
    @special_cases = []

    @casefold = []
    @casefold_longest = -1   # -1 means "no folding recorded yet"

    @bidimirror = []
  end
end
157
+
158
# Stage: read CompositionExclusions.txt and record the excluded code
# points in data.excludes (a Hash keyed by code point).
class CompositionExclusions
  def process(data)
    data.excludes = Hash.new
    path = File.join(data.dir, 'CompositionExclusions.txt')
    File.process(path) do |line|
      code = line.chomp.sub(/^\s*(.*?)\s*(#.*)?$/, '\1').to_i(16)
      data.excludes[code] = true
    end
  end
end
166
+
167
# Stage: read UnicodeData.txt and fill in the per-code-point category,
# case-mapping value, combining class and (de)composition tables.
class UnicodeData
  # Field indices of the semicolon-separated UnicodeData.txt records.
  CODE, NAME, CATEGORY, COMBINING_CLASSES, BIDI_CATEGORY,
  DECOMPOSITION, DECIMAL_VALUE, DIGIT_VALUE, NUMERIC_VALUE, MIRRORED,
  OLD_NAME, COMMENT, UPPER, LOWER, TITLE = (0..14).to_a

  # Walk the file in code-point order, filling gaps between listed code
  # points (and the tail up to U+10FFFF) as unassigned.
  def process(data)
    prev_code = -1
    path = File.join(data.dir, 'UnicodeData.txt')
    File.process(path) do |line|
      fields = line.chomp.split(/;/, -1)
      fields.verify_size(15, path, CODE)
      code = fields[CODE].to_i(16)

      # Remember how many 256-entry pages precede U+E0000; the generated
      # tables are split into a part1/part2 around that plane boundary.
      if code >= 0xe0000 and prev_code < 0xe0000
        data.pages_before_e0000 = (prev_code >> 8) + 1
      end

      # A "<..., Last>" record closes a First/Last range: reuse its fields
      # for the whole gap.  Otherwise the gap is genuinely unassigned.
      if code > prev_code + 1
        process_gap(data,
                    prev_code + 1,
                    code - 1,
                    fields[NAME] =~ /Last>$/ ? fields : new_gap_fields)
      end
      process_one(data, code, fields)
      prev_code = code
    end
    # Everything after the final record is unassigned.
    process_gap(data, prev_code + 1, 0x10ffff, new_gap_fields)
  end

  private

  # Template record for unassigned ('Cn') code points.
  def new_gap_fields
    ['', '', 'Cn', '0', '', '', '', '', '', '', '', '', '', '', '']
  end

  # Apply +fields+ to every code point in [low, hi], patching in the
  # correct CODE field for each.
  def process_gap(data, low, hi, fields)
    low.upto(hi) do |i|
      fields[CODE] = sprintf('%04x', i)
      process_one(data, i, fields)
    end
  end

  # Record category, case value, combining class and decomposition for a
  # single code point.
  def process_one(data, code, fields)
    # puts(code.to_s)
    # data.cps[code] ||= UnicodeCodepoint.new(code)
    data.type[code] = fields[CATEGORY]

    # TODO: Why not process things like 'Nl'?
    # Note: data.value holds the *opposite*-case counterpart for letters
    # (uppercase mapping for Ll, lowercase mapping for Lu) and the numeric
    # value for decimal digits.
    case data.type[code]
    when 'Nd'
      data.value[code] = fields[DECIMAL_VALUE].to_i
    when 'Ll'
      data.value[code] = fields[UPPER].to_i(16)
    when 'Lu'
      data.value[code] = fields[LOWER].to_i(16)
    when 'Lt'
      data.title_to_lower[code] = fields[LOWER].to_i(16)
      data.title_to_upper[code] = fields[UPPER].to_i(16)
    end

    data.cclass[code] = fields[COMBINING_CLASSES]

    unless fields[DECOMPOSITION] == ''
      # A leading "<tag>" marks a compatibility (not canonical)
      # decomposition; strip the tag before storing.
      if fields[DECOMPOSITION] =~ /^\<.*\>\s*(.*)/
        data.decompose_compat[code] = true
        fields[DECOMPOSITION] = $1
      else
        data.decompose_compat[code] = false
        # Canonical decompositions also feed the composition table,
        # unless listed in CompositionExclusions.txt.
        unless data.excludes.include?(code)
          data.compositions[code] = fields[DECOMPOSITION]
        end
      end
      data.decompositions[code] = fields[DECOMPOSITION]
    end
  end
end
243
+
244
# Stage: read LineBreak.txt and record the line-break property of every
# code point up to U+10FFFF; unlisted ranges get a default property.
class LineBreak
  # Field indices of the LineBreak.txt records.
  BREAK_CODE, BREAK_PROPERTY = (0..1).to_a

  def process(data)
    prev_code = -1
    path = File.join(data.dir, 'LineBreak.txt')
    File.process(path) do |line|
      fields = line.chomp.sub(/\s*#.*/, '').split(/;/, -1)
      fields.verify_size(2, path, BREAK_CODE)

      # Records are either a single code point or a "XXXX..YYYY" range.
      if fields[BREAK_CODE] =~ /([0-9A-F]{4,6})\.\.([0-9A-F]{4,6})/
        start_code = $1.to_i(16)
        end_code = $2.to_i(16)
      else
        start_code = end_code = fields[BREAK_CODE].to_i(16)
      end

      process_gap(data, prev_code + 1, start_code - 1) if start_code > prev_code + 1

      (start_code..end_code).each do |cp|
        data.break_props[cp] = fields[BREAK_PROPERTY]
      end

      prev_code = end_code
    end

    process_gap(data, prev_code + 1, 0x10ffff)
  end

  private

  # Unlisted code points default to 'XX' (unknown) when unassigned and
  # 'AL' (alphabetic) otherwise, per the UAX #14 convention.
  def process_gap(data, low, hi)
    (low..hi).each do |cp|
      data.break_props[cp] = data.type[cp] == 'Cn' ? 'XX' : 'AL'
    end
  end
end
282
+
283
# Stage: read SpecialCasing.txt and collect the multi-character case
# mappings that cannot be expressed as single code-point substitutions.
class SpecialCasing
  # Field indices of the SpecialCasing.txt records.
  CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = (0..4).to_a

  def initialize
    # Running byte offset into the special-case string table being built.
    @offset = 0
  end

  # For each unconditional special case, verify the record is consistent
  # with UnicodeData.txt, then store the mappings via add_special_case.
  def process(data)
    path = File.join(data.dir, 'SpecialCasing.txt')
    File.process(path) do |line|
      fields = line.chomp.sub(/\s*#.*/, '').split(/\s*;\s*/, -1)
      fields.verify_size((5..6), path, CASE_CODE)
      raw_code, code = fields[CASE_CODE], fields[CASE_CODE].to_i(16)
      unless data.type[code].nil?
        # We ignore conditional special cases
        # (a 6th field is the condition list).
        next if fields.size == 6

        # For each letter category, check that the record's same-case
        # field round-trips to the code point, then store the two
        # opposite-case mappings (order documented in add_special_case).
        case data.type[code]
        when 'Lu'
          fields.verify_field(CASE_UPPER, code, path, raw_code, 'Lu', 'Upper')
          add_special_case(data, code, data.value[code],
                           fields[CASE_LOWER], fields[CASE_TITLE])
        when 'Lt'
          fields.verify_field(CASE_TITLE, code, path, raw_code, 'Lt', 'Title')
          add_special_case(data, code, nil,
                           fields[CASE_LOWER], fields[CASE_UPPER])
        when 'Ll'
          fields.verify_field(CASE_LOWER, code, path, raw_code, 'Ll', 'Lower')
          add_special_case(data, code, data.value[code],
                           fields[CASE_UPPER], fields[CASE_TITLE])
        else
          error("special case for non-alphabetic code point:\n" +
                " file: %s\n" +
                " type: %s\n" +
                " code point/entry: %s\n",
                path, data.type[code], raw_code)
        end
      else
        error("special case for code point which doesn't have a type:\n" +
              " file: %s\n" +
              " code point/entry: %d\n",
              path, code)
      end
    end
  end

  private

  # Append one record to the special-case table: the optional best
  # single-character mapping, then the two multi-character mappings
  # separated by a NUL, all UTF-8 encoded and escaped.  data.value[code]
  # is replaced by 0x1000000 + offset, flagging it as a table reference.
  def add_special_case(data, code, single, field1, field2)
    values = [
      single.nil? ? nil : [single],
      field1.split(/\s+/).map{ |s| s.to_i(16) },
      [0],
      field2.split(/\s+/).map{ |s| s.to_i(16) },
    ]
    result = ''
    values.each{ |value| result += value.pack('U*') unless value.nil? }

    data.special_case_offsets.push(@offset)
    data.value[code] = 0x1000000 + @offset
    data.special_cases.push(result.escape)
    # +1 accounts for the NUL terminator emitted after each record.
    @offset += 1 + result.length
  end
end
347
+
348
# Stage: read CaseFolding.txt, keeping only the foldings that cannot be
# derived by simple lowercasing (those are handled by the main tables).
class CaseFolding
  # Field indices of the CaseFolding.txt records.
  FOLDING_CODE, FOLDING_STATUS, FOLDING_MAPPING = (0..2).to_a

  def process(data)
    path = File.join(data.dir, 'CaseFolding.txt')
    File.process(path) do |line|
      fields = line.chomp.sub(/\s*#.*/, '').split(/\s*;\s*/, -1)
      fields.verify_size(4, path, FOLDING_CODE)

      # skip Simple and Turkic rules
      next if fields[FOLDING_STATUS] =~ /^[ST]$/

      raw_code, code = fields[FOLDING_CODE], fields[FOLDING_CODE].to_i(16)
      values = fields[FOLDING_MAPPING].split(/\s+/).map{ |s| s.to_i(16) }
      # A single-code-point folding may be redundant with the ordinary
      # lowercase mapping — but only if the code point doesn't use the
      # special-case table (values >= 0x1000000) and has a known type.
      if values.size == 1 &&
        !(!data.value[code].nil? && data.value[code] >= 0x1000000) &&
        !data.type[code].nil?
        case data.type[code]
        when 'Ll'
          lower = code
        when 'Lt'
          lower = data.title_to_lower[code]
        when 'Lu'
          # For Lu, data.value holds the lowercase counterpart.
          lower = data.value[code]
        else
          lower = code
        end
        # Simple lowercasing already produces this folding; skip it.
        next if lower == values[0]
      end

      string = values.pack('U*')
      # Track the longest folded string (+1 for the NUL) so the C struct
      # field can be sized to fit.
      if string.length + 1 > data.casefold_longest
        data.casefold_longest = string.length + 1
      end
      data.casefold.push([code, string.escape])
    end
  end
end
386
+
387
# Stage: read BidiMirroring.txt and collect [code point, mirrored code
# point] pairs into data.bidimirror.
class BidiMirroring
  def process(data)
    path = File.join(data.dir, 'BidiMirroring.txt')
    File.process(path) do |line|
      fields = line.chomp.sub(/\s*#.*/, '').split(/\s*;\s*/, -1)
      fields.verify_size(2, path, 0)
      pair = [fields[0].to_i(16), fields[1].to_i(16)]
      data.bidimirror.push(pair)
    end
  end
end
397
+
398
# Final stage: emit the generated C headers (character-tables.h,
# decompose.h, compose.h, break.h) from the collected data.
#
# NOTE(review): the scrape this was recovered from collapsed leading
# whitespace, so the exact spacing inside heredoc bodies and a few format
# strings is reconstructed — verify the generated headers against a known
# good copy.
class Printer
  def initialize
    # Index of the next non-uniform 256-entry page emitted by print_row.
    @index = 0
  end

  # Entry point: precompute the part1/part2 split constants, then write
  # all four headers.
  def process(data)
    @last_char_part1_i = data.pages_before_e0000 * 256 - 1
    @last_char_part1_x = sprintf('0x%04x', @last_char_part1_i)
    @last_char_part1_X = sprintf('%04X', @last_char_part1_i)
    print_tables(data)
    print_decomp(data)
    print_composition_table(data)
    print_line_break(data)
  end

  private

  # Map general category code onto symbolic name.
  Mappings = {
    # Normative.
    'Lu' => 'UNICODE_UPPERCASE_LETTER',
    'Ll' => 'UNICODE_LOWERCASE_LETTER',
    'Lt' => 'UNICODE_TITLECASE_LETTER',
    'Mn' => 'UNICODE_NON_SPACING_MARK',
    'Mc' => 'UNICODE_COMBINING_MARK',
    'Me' => 'UNICODE_ENCLOSING_MARK',
    'Nd' => 'UNICODE_DECIMAL_NUMBER',
    'Nl' => 'UNICODE_LETTER_NUMBER',
    'No' => 'UNICODE_OTHER_NUMBER',
    'Zs' => 'UNICODE_SPACE_SEPARATOR',
    'Zl' => 'UNICODE_LINE_SEPARATOR',
    'Zp' => 'UNICODE_PARAGRAPH_SEPARATOR',
    'Cc' => 'UNICODE_CONTROL',
    'Cf' => 'UNICODE_FORMAT',
    'Cs' => 'UNICODE_SURROGATE',
    'Co' => 'UNICODE_PRIVATE_USE',
    'Cn' => 'UNICODE_UNASSIGNED',

    # Informative.
    'Lm' => 'UNICODE_MODIFIER_LETTER',
    'Lo' => 'UNICODE_OTHER_LETTER',
    'Pc' => 'UNICODE_CONNECT_PUNCTUATION',
    'Pd' => 'UNICODE_DASH_PUNCTUATION',
    'Ps' => 'UNICODE_OPEN_PUNCTUATION',
    'Pe' => 'UNICODE_CLOSE_PUNCTUATION',
    'Pi' => 'UNICODE_INITIAL_PUNCTUATION',
    'Pf' => 'UNICODE_FINAL_PUNCTUATION',
    'Po' => 'UNICODE_OTHER_PUNCTUATION',
    'Sm' => 'UNICODE_MATH_SYMBOL',
    'Sc' => 'UNICODE_CURRENCY_SYMBOL',
    'Sk' => 'UNICODE_MODIFIER_SYMBOL',
    'So' => 'UNICODE_OTHER_SYMBOL'
  }

  # Map UAX #14 line-break class onto symbolic name.
  BreakMappings = {
    'BK' => 'UNICODE_BREAK_MANDATORY',
    'CR' => 'UNICODE_BREAK_CARRIAGE_RETURN',
    'LF' => 'UNICODE_BREAK_LINE_FEED',
    'CM' => 'UNICODE_BREAK_COMBINING_MARK',
    'SG' => 'UNICODE_BREAK_SURROGATE',
    'ZW' => 'UNICODE_BREAK_ZERO_WIDTH_SPACE',
    'IN' => 'UNICODE_BREAK_INSEPARABLE',
    'GL' => 'UNICODE_BREAK_NON_BREAKING_GLUE',
    'CB' => 'UNICODE_BREAK_CONTINGENT',
    'SP' => 'UNICODE_BREAK_SPACE',
    'BA' => 'UNICODE_BREAK_AFTER',
    'BB' => 'UNICODE_BREAK_BEFORE',
    'B2' => 'UNICODE_BREAK_BEFORE_AND_AFTER',
    'HY' => 'UNICODE_BREAK_HYPHEN',
    'NS' => 'UNICODE_BREAK_NON_STARTER',
    'OP' => 'UNICODE_BREAK_OPEN_PUNCTUATION',
    'CL' => 'UNICODE_BREAK_CLOSE_PUNCTUATION',
    'QU' => 'UNICODE_BREAK_QUOTATION',
    'EX' => 'UNICODE_BREAK_EXCLAMATION',
    'ID' => 'UNICODE_BREAK_IDEOGRAPHIC',
    'NU' => 'UNICODE_BREAK_NUMERIC',
    'IS' => 'UNICODE_BREAK_INFIX_SEPARATOR',
    'SY' => 'UNICODE_BREAK_SYMBOL',
    'AL' => 'UNICODE_BREAK_ALPHABETIC',
    'PR' => 'UNICODE_BREAK_PREFIX',
    'PO' => 'UNICODE_BREAK_POSTFIX',
    'SA' => 'UNICODE_BREAK_COMPLEX_CONTEXT',
    'AI' => 'UNICODE_BREAK_AMBIGUOUS',
    'NL' => 'UNICODE_BREAK_NEXT_LINE',
    'WJ' => 'UNICODE_BREAK_WORD_JOINER',
    'XX' => 'UNICODE_BREAK_UNKNOWN',
    'JL' => 'UNICODE_BREAK_HANGUL_L_JAMO',
    'JV' => "UNICODE_BREAK_HANGUL_V_JAMO",
    'JT' => "UNICODE_BREAK_HANGUL_T_JAMO",
    'H2' => "UNICODE_BREAK_HANGUL_LV_SYLLABLE",
    'H3' => "UNICODE_BREAK_HANGUL_LVT_SYLLABLE"
  };

  # Sentinel offset meaning "no decomposition present".
  NOT_PRESENT_OFFSET = 65535

  # Emit a two-level lookup table: 256-entry data pages (via print_row,
  # which collapses uniform pages) followed by part1/part2 page-index
  # tables split at the U+E0000 plane boundary.  The block +f+ yields the
  # table value for a code point.
  def print_table(data, low, mid, hi, size, header, part1_h, part2_h, &f)
    @index = 0
    rows = []
    print(header)
    low.step(hi, 256) do |i|
      rows[i / 256] = print_row(data, i, size){ |i| f.call(i) }
    end
    print("\n};\n")
    print(part1_h)
    low.step(mid, 256) do |i|
      printf("%s%s,\n", data.indent, rows[i / 256])
    end
    print("};\n")
    if mid != hi
      print(part2_h)
      0xe0000.step(hi, 256) do |i|
        printf("%s%s,\n", data.indent, rows[i / 256])
      end
      print("};\n")
    end
  end

  # Write character-tables.h: category table, case-value table, titlecase
  # triples, special-case strings, casefold table and bidi mirroring
  # pairs.  Redirects $stdout into the output file for the duration.
  def print_tables(data, outfile = 'character-tables.h')
    row = []
    saved_stdout = $stdout
    File.open(outfile, 'w') do |file|
      header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
      $stdout = file
      print <<EOF
/* Automatically generated file */

#ifndef #{header_h}
#define #{header_h}

#define UNICODE_DATA_VERSION "#{UnicodeVersion}"

#define UNICODE_LAST_CHAR #{sprintf('0x%04x', data.last)}

#define UNICODE_MAX_TABLE_INDEX 10000

#define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}

#define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}

#define UNICODE_SPECIAL_CASE_TABLE_START 0x1000000
EOF
      print_table(data, 0, @last_char_part1_i, data.last, 1,
                  <<EOH, <<EOH1, <<EOH2){ |i| Mappings[data.type[i]] }


static const char type_data[][256] = {
EOH


/* U+0000 through U+#{@last_char_part1_X} */
static const int16_t type_table_part1[#{data.pages_before_e0000}] = {
EOH1


/* U+E0000 through U+#{sprintf('%04X', data.last)} */
static const int16_t type_table_part2[768] = {
EOH2

      print_table(data, 0, @last_char_part1_i, data.last, 4,
                  <<EOH, <<EOH1, <<EOH2) { |i| data.value[i].nil? ? '0x0000' : sprintf('0x%04x', data.value[i]) }


static const unichar attr_data[][256] = {
EOH


/* U+0000 through U+#{@last_char_part1_X} */
static const int16_t attr_table_part1[#{data.pages_before_e0000}] = {
EOH1


/* U+E0000 through U+#{sprintf('%04X', data.last)} */
static const int16_t attr_table_part2[768] = {
EOH2

      print <<EOF


static const unichar title_table[][3] = {
EOF
      data.title_to_lower.keys.sort.each do |code|
        printf("%s{ 0x%04x, 0x%04x, 0x%04x },\n", data.indent,
               code, data.title_to_upper[code], data.title_to_lower[code])
      end
      print("};\n")

      print_special_case_table(data)
      print_case_fold_table(data)

      print <<EOF
static const struct {
#{data.indent}unichar ch;
#{data.indent}unichar mirrored_ch;
} bidi_mirroring_table[] = {
EOF
      data.bidimirror.each do |item|
        printf("%s{ 0x%04x, 0x%04x },\n", data.indent, item[0], item[1])
      end
      print <<EOF
};

#endif /* #{header_h} */
EOF
    end
    $stdout = saved_stdout
  end

  # Emit one 256-entry page.  If all values are equal the page is
  # collapsed to "<value> + UNICODE_MAX_TABLE_INDEX"; otherwise the page
  # body is printed and its sequential index returned for the page table.
  def print_row(data, start, type_size)
    flag = true
    values = []
    0.upto(255) do |i|
      values[i] = yield(start + i)
      flag = false if values[i] != values[0]
    end
    return values[0] + " + UNICODE_MAX_TABLE_INDEX" if flag

    puts(',') if @index != 0
    printf("%s{ /* page %d, index %d */\n%s",
           data.indent, start / 256, @index, data.indent * 2)
    column = data.indent.width * 2
    start.upto(start + 255) do |i|
      text = values[i - start]
      # Wrap lines at 79 columns (tabs counted via String#width).
      if text.length + column + 2 > 79
        printf("\n%s", data.indent * 2)
        column = data.indent.width * 2
      end

      printf("%s, ", text)
      column += text.width + 2
    end

    print("\n#{data.indent}}")
    @index += 1
    return sprintf("%d /* page %d */", @index - 1, start / 256);
  end

  # Emit the NUL-separated special-case string table collected by
  # SpecialCasing.
  def print_special_case_table(data)
    print <<EOF


/*
 * Table of special cases for case conversion; each record contains
 * First, the best single character mapping to lowercase if Lu,
 * and to uppercase if Ll, followed by the output mapping for the two cases
 * other than the case of the codepoint, in the order Ll, Lu, Lt, encoded in
 * UTF-8, separated and terminated by a NUL character.
 */
static const char special_case_table[] = {
EOF
      data.special_cases.each_with_index do |sc, i|
        printf(%Q< "%s\\0" /* offset %d */\n>, sc, data.special_case_offsets[i])
      end
    print <<EOF
};

EOF
  end

  # Emit the casefold table collected by CaseFolding, sorted by code
  # point; aborts if any code point no longer fits in uint16_t.
  def print_case_fold_table(data)
    print <<EOF

/*
 * Table of casefolding cases that can't be derived by lowercasing.
 */
static const struct {
#{data.indent}uint16_t ch;
#{data.indent}char data[#{data.casefold_longest}];
} casefold_table[] = {
EOF
    data.casefold.sort_by{ |a| a[0] }.each do |cf|
      if cf[0] > 0xffff
        error('casefold_table.ch field too short.' +
              ' Upgrade to unichar to fit values beyond 0xffff.')
      end
      printf(%Q<%s{ 0x%04x, "%s" },\n>, data.indent, cf[0], cf[1])
    end
    print <<EOF
};
EOF
  end

  # Write decompose.h: combining-class table, decomposition index table
  # and the shared expansion string holding all decomposition targets.
  def print_decomp(data, outfile = 'decompose.h')
    row = []
    saved_stdout = $stdout
    File.open(outfile, 'w') do |file|
      header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
      $stdout = file
      print <<EOF
/* Automatically generated file */

#ifndef #{header_h}
#define #{header_h}


#define UNICODE_LAST_CHAR #{sprintf('0x%04x', data.last)}

#define UNICODE_MAX_TABLE_INDEX (0x110000 / 256)

#define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}
#define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}

#define UNICODE_NOT_PRESENT_OFFSET #{NOT_PRESENT_OFFSET}
EOF
      print_table(data, 0, @last_char_part1_i, data.last, 1,
                  <<EOH, <<EOH1, <<EOH2){ |i| data.cclass[i] }


static const uint8_t cclass_data[][256] = {
EOH


static const int16_t combining_class_table_part1[#{data.pages_before_e0000}] = {
EOH1


static const int16_t combining_class_table_part2[768] = {
EOH2

      print <<EOL


static const struct {
#{data.indent}unichar ch;
#{data.indent}uint16_t canon_offset;
#{data.indent}uint16_t compat_offset;
} decomp_table[] = {
EOL
      decomp_offsets = {}
      decomp_string = ''
      @decomp_string_offset = 0
      0.upto(data.last) do |i|
        unless data.decompositions[i].nil?
          # Canonical decomposition only exists when the code point is not
          # compatibility-only; a compat decomposition identical to the
          # canonical one is dropped to share table entries.
          canon_decomp = data.decompose_compat[i] ?
            nil : make_decomp(data, i, false)
          compat_decomp = make_decomp(data, i, true)
          if not canon_decomp.nil? and compat_decomp == canon_decomp
            compat_decomp = nil
          end
          canon_offset = handle_decomp(canon_decomp, decomp_offsets,
                                       decomp_string)
          compat_offset = handle_decomp(compat_decomp, decomp_offsets,
                                        decomp_string)

          # Offsets are stored in uint16_t; the sentinel is the max value.
          if @decomp_string_offset > NOT_PRESENT_OFFSET
            error('decomposition string offset beyond not-present-offset,' +
                  " upgrade value:\n" +
                  " offset: %d\n" +
                  " max: %d\n",
                  @decomp_string_offset, NOT_PRESENT_OFFSET)
          end
          printf("%s{ 0x%04x, %s, %s },\n",
                 data.indent, i, canon_offset, compat_offset)
        end
      end
      print("\n};")

      print <<EOL

static const char decomp_expansion_string[] = #{decomp_string};


#endif /* #{header_h} */
EOL
    end
    $stdout = saved_stdout
  end

  # Recursively expand the decomposition of +code+ to a flat list of code
  # points, following canonical-only links unless +compat+ is true.
  def expand_decomp(data, code, compat)
    ary = []
    data.decompositions[code].split(/ /).each do |item|
      pos = item.to_i(16)
      if not data.decompositions[pos].nil? and
        (compat or not data.decompose_compat[pos])
        ary.concat(expand_decomp(data, pos, compat))
      else
        ary.push(pos)
      end
    end
    ary
  end

  # UTF-8 encode the fully expanded decomposition of +code+.
  def make_decomp(data, code, compat)
    str = ''
    expand_decomp(data, code, compat).each do |item|
      str += item.is_a?(Array) ? item.flatten.pack('U') : [item].pack('U')
    end
    str
  end

  # Intern +decomp+ in the shared expansion string, returning its byte
  # offset (or the C sentinel macro name when decomp is nil).
  def handle_decomp(decomp, decomp_offsets,
                    decomp_string)
    offset = 'UNICODE_NOT_PRESENT_OFFSET'
    unless decomp.nil?
      if decomp_offsets.member?(decomp)
        offset = decomp_offsets[decomp]
      else
        offset = @decomp_string_offset
        decomp_offsets[decomp] = offset
        decomp_string << ("\n \"" + decomp.escape +
                          "\\0\" /* offset #{offset} */")
        @decomp_string_offset += decomp.length + 1
      end
    end
    offset
  end

  # Write compose.h: enumerate composition starters/finishers, split out
  # singleton pairs, and emit the two-dimensional composition array.
  def print_composition_table(data, outfile = 'compose.h')
    first = Hash.new(0)
    second = Hash.new(0)

    # Count how often each code point appears as the first element of a
    # two-element canonical decomposition.
    data.compositions.each do |code, value|
      values = value.split(/\s+/).map{ |s| s.to_i(16) }

      # skip non-starters and single-character decompositions
      if data.cclass[values[0]] != '0' or values.size == 1
        data.compositions.delete(code)
        next
      end

      if values.size != 2
        error("decomposition of entry contains more than two elements:\n" +
              " entry: %d\n" +
              " elements: %d\n",
              code, values.size)
      end

      first[values[0]] += 1
    end

    # enumerate_ordered drops entries used only once (singletons) and
    # renumbers the rest 0..n-1.
    n_first = first.enumerate_ordered

    data.compositions.each do |code, value|
      values = value.split(/\s+/).map{ |s| s.to_i(16) }

      second[values[1]] += 1 if first.member?(values[0])
    end

    n_second = second.enumerate_ordered

    # Pairs where either half is a singleton get dedicated tables;
    # everything else goes into the reverse (first x second) matrix.
    first_singletons = []
    second_singletons = []
    reverse = {}
    data.compositions.each do |code, value|
      values = value.split(/\s+/).map{ |s| s.to_i(16) }

      if first.member?(values[0]) and second.member?(values[1])
        reverse["#{first[values[0]]}|#{second[values[1]]}"] = code
      elsif not first.member?(values[0])
        first_singletons.push([values[0], values[1], code])
      else
        second_singletons.push([values[1], values[0], code])
      end
    end

    first_singletons = first_singletons.sort_by{ |a| a[0] }
    second_singletons = second_singletons.sort_by{ |a| a[0] }

    row = []
    saved_stdout = $stdout
    File.open(outfile, 'w') do |file|
      header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
      $stdout = file
      # Assign each participating code point its slot in the combined
      # index space: firsts, first-singletons, seconds, second-singletons.
      values = {}
      total = first_start = 1
      last = 0

      first.each do |code, value|
        values[code] = value + total
        last = code if code > last
      end
      total += n_first

      first_single_start = total
      first_singletons.each_with_index do |item, i|
        code = item[0]
        values[code] = i + total
        last = code if code > last
      end
      total += first_singletons.size

      second_start = total
      second.each do |code, value|
        values[code] = value + total
        last = code if code > last
      end
      total += n_second

      second_single_start = total
      second_singletons.each_with_index do |item, i|
        code = item[0]
        values[code] = i + total
        last = code if code > last
      end

      print <<EOL
/* Automatically generated file */

#ifndef #{header_h}
#define #{header_h}


#define COMPOSE_FIRST_START #{first_start}
#define COMPOSE_FIRST_SINGLE_START #{first_single_start}
#define COMPOSE_SECOND_START #{second_start}
#define COMPOSE_SECOND_SINGLE_START #{second_single_start}
#define COMPOSE_TABLE_LAST #{last / 256}
EOL

      print_table(data, 0, last, last, 2,
                  <<EOH, <<EOH1, nil){ |i| values.member?(i) ? values[i].to_s : '0' }


static const uint16_t compose_data[][256] = {
EOH


static const int16_t compose_table[COMPOSE_TABLE_LAST + 1] = {
EOH1

      print <<EOL


static const uint16_t compose_first_single[][2] = {
EOL
      first_singletons.each_with_index do |item, i|
        if item[1] > 0xffff or item[2] > 0xffff
          error("compose_first_single table field too short." +
                " Upgrade to unichar to fit values beyond 0xffff.")
        end
        printf("%s{ %#06x, %#06x },\n", data.indent, item[1], item[2])
      end
      print("};\n")

      print <<EOL


static const uint16_t compose_second_single[][2] = {
EOL
      second_singletons.each_with_index do |item, i|
        if item[1] > 0xffff or item[2] > 0xffff
          error("compose_second_single table field too short." +
                " Upgrade to unichar to fit values beyond 0xffff.")
        end
        printf("%s{ %#06x, %#06x },\n", data.indent, item[1], item[2])
      end
      print("};\n")

      print <<EOL


static const uint16_t compose_array[#{n_first}][#{n_second}] = {
EOL
      0.upto(n_first - 1) do |i|
        printf("%s{\n%s", data.indent, data.indent * 2)
        column = data.indent.width * 2
        0.upto(n_second - 1) do |j|
          if column + 8 > 79
            printf("\n%s", data.indent * 2)
            column = data.indent.width * 2
          end
          if reverse.member?("#{i}|#{j}")
            if reverse["#{i}|#{j}"] > 0xffff
              error("compose_array table field too short." +
                    " Upgrade to unichar to fit values beyond 0xffff.")
            end
            printf("0x%04x, ", reverse["#{i}|#{j}"])
          else
            print(" 0, ")
          end
          column += 8
        end
        printf("\n%s},\n", data.indent)
      end
      print("};\n")

      print <<EOL


#endif /* #{header_h} */
EOL
    end
    $stdout = saved_stdout
  end

  # Write break.h: the line-break property lookup tables.
  def print_line_break(data, outfile = 'break.h')
    row = []
    saved_stdout = $stdout
    File.open(outfile, 'w') do |file|
      header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
      $stdout = file
      print <<EOF
/* Automatically generated file */

#ifndef #{header_h}
#define #{header_h}

#define UNICODE_DATA_VERSION "#{UnicodeVersion}"

#define UNICODE_LAST_CHAR #{sprintf('0x%04x', data.last)}

#define UNICODE_MAX_TABLE_INDEX 10000

/*
 * The last code point that should be looked up in break_property_table_part1.
 */
#define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}

/*
 * The first code point that should be looked up in break_property_table_part2.
 */
#define UNICODE_FIRST_CHAR_PART2 0xe0000
EOF
      print_table(data, 0, @last_char_part1_i, data.last, 1,
                  <<EOH, <<EOH1, <<EOH2){ |i| BreakMappings[data.break_props[i]] }


static const int8_t break_property_data[][256] = {
EOH


/* U+0000 through U+#{@last_char_part1_X} */
static const int16_t break_property_table_part1[#{data.pages_before_e0000}] = {
EOH1


/* U+E0000 through U+#{sprintf('%04X', data.last)} */
static const int16_t break_property_table_part2[768] = {
EOH2

      print <<EOF


#endif /* #{header_h} */
EOF
    end
    $stdout = saved_stdout
  end
end
1036
+
1037
+ UnicodeVersion = ARGV[0]
1038
+
1039
# Drives the whole generation: verifies the input files exist, then runs
# every processing stage in order over one shared CollectedData instance.
# Usage: generate-unicode-data.rb UNICODE-VERSION DATA-DIRECTORY
class Runner
  def main
    check_for_data_files(ARGV[1])
    data = CollectedData.new(ARGV[1], "\t")
    stages = [CompositionExclusions, UnicodeData, LineBreak,
              SpecialCasing, CaseFolding, BidiMirroring, Printer]
    stages.each { |stage| stage.new.process(data) }
  end

  private

  # Abort via error() unless every required Unicode data file is readable
  # inside +dir+.
  def check_for_data_files(dir)
    %w[UnicodeData.txt LineBreak.txt SpecialCasing.txt CaseFolding.txt
       CompositionExclusions.txt BidiMirroring.txt].each do |file|
      path = File.join(dir, file)
      error('missing required file: %s', path) unless File.readable?(path)
    end
  end
end
1060
+
1061
+ Runner.new.main
1062
+
1063
+
1064
+
1065
+ # vim: set sts=2 sw=2: