character-encodings 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. data/README +26 -0
  2. data/Rakefile +157 -0
  3. data/ext/encoding/character/unicode/codepoint.c +48 -0
  4. data/ext/encoding/character/utf-8/break.c +38 -0
  5. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  6. data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
  7. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  8. data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
  9. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
  10. data/ext/encoding/character/utf-8/decompose.c +476 -0
  11. data/ext/encoding/character/utf-8/depend +64 -0
  12. data/ext/encoding/character/utf-8/extconf.rb +47 -0
  13. data/ext/encoding/character/utf-8/private.h +68 -0
  14. data/ext/encoding/character/utf-8/properties.c +1061 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +18 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  19. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  20. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  22. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  23. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  24. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  25. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  26. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  27. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  28. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  29. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  30. data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
  31. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  35. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  36. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  37. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  38. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  39. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  40. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  41. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  43. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  44. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  45. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  46. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  47. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  48. data/ext/encoding/character/utf-8/unicode.c +319 -0
  49. data/ext/encoding/character/utf-8/unicode.h +208 -0
  50. data/ext/encoding/character/utf-8/utf.c +1332 -0
  51. data/lib/encoding/character/utf-8.rb +201 -0
  52. data/specifications/aref.rb +45 -0
  53. data/specifications/count.rb +29 -0
  54. data/specifications/delete.rb +25 -0
  55. data/specifications/each_char.rb +28 -0
  56. data/specifications/index.rb +35 -0
  57. data/specifications/insert.rb +67 -0
  58. data/specifications/length.rb +45 -0
  59. data/specifications/rindex.rb +52 -0
  60. data/specifications/squeeze.rb +25 -0
  61. data/specifications/to_i.rb +54 -0
  62. data/specifications/tr.rb +39 -0
  63. data/tests/foldcase.rb +28 -0
  64. data/tests/normalize.rb +101 -0
  65. data/tests/unicodedatatestbase.rb +45 -0
  66. metadata +112 -0
@@ -0,0 +1,1065 @@
1
+ #! /usr/bin/ruby -w
2
+ =begin
3
+ :contents: Generate Unicode table headers.
4
+ :arch-tag: 98c7456d-c7d9-4b40-9971-409428593ad5
5
+
6
+ Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
7
+
8
+ This program is free software; you can redistribute it and/or modify
9
+ it under the terms of the GNU General Public License as published by
10
+ the Free Software Foundation; either version 2 of the License, or
11
+ (at your option) any later version.
12
+
13
+ This program is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ GNU General Public License for more details.
17
+
18
+ You should have received a copy of the GNU General Public License
19
+ along with this program; if not, write to the Free Software
20
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
+ =end
22
+
23
+
24
+
25
# Print a printf-style formatted message, prefixed with this program's
# basename, to standard error and terminate the process with exit status 1.
# Used by every parser below for fatal input problems.
def error(fmt, *args)
  $stderr.printf("%s: %s\n", File.basename($0), sprintf(fmt, *args))
  exit(1)
end
29
+
30
# Reopen File to add the line-oriented reader shared by all of the Unicode
# data-file parsers below.
class File
  # Open +path+ and yield each line that is neither a comment (starting
  # with '#') nor blank/whitespace-only.  Aborts via error() if the file
  # cannot be processed.
  #
  # Fix: the original rescued only IOError, but failed file operations
  # (missing file, permission denied, ...) raise Errno::* exceptions, which
  # are SystemCallError subclasses and are NOT IOError -- they escaped the
  # handler.  Rescue both so all I/O failures produce the intended message.
  def self.process(path)
    begin
      File.open(path) do |file|
        file.each_line do |line|
          # Skip comments and empty lines.
          next if line =~ /^(#|\s*$)/
          yield line
        end
      end
    rescue IOError, SystemCallError => e
      error("I/O error while processing input:\n" +
            " file: %s\n" +
            " error: %s\n", path, e.message)
    end
  end
end
46
+
47
# Small String additions used when emitting C source text.
class String
  # Render the receiver as a C-style run of \xHH byte escapes,
  # e.g. "AB" => "\x41\x42".
  def escape
    hex = unpack('H*').first
    hex.gsub(/../) { |pair| "\\x#{pair}" }
  end

  # Visual width of the string with each tab expanded to eight columns;
  # used for wrapping generated table rows at a fixed column.
  def width
    gsub("\t", ' ' * 8).length
  end
end
56
+
57
# Validation helpers for the field arrays obtained by splitting one line of
# a Unicode data file.  Both abort through error() when a check fails.
class Array
  # Abort unless the field count satisfies +wanted+ (an Integer or a Range,
  # matched with ===).  +index+ picks a representative field for the report.
  def verify_size(wanted, path, index)
    return if wanted === size
    sample = size > index ? self[index] : 'N/A'
    error("entry doesn't contain the required %s fields:\n" +
          " file: %s\n" +
          " entry: %s\n" +
          " field count: %d\n",
          wanted.to_s, path, sample, size)
  end

  # Abort unless the hexadecimal field at +index+ decodes to +code+.
  def verify_field(index, code, path, raw_code, type, ccase)
    return if self[index].to_i(16) == code
    error("entry has type %s but UCD_%s(%s) != %s:\n" +
          " file: %s\n" +
          " entry: %s\n",
          type, ccase, raw_code, raw_code, path, raw_code)
  end
end
80
+
81
# Helper for building the composition tables: collapse a frequency hash
# into a dense enumeration.
class Hash
  # Walk the keys in ascending order; drop every key whose value is exactly
  # 1 and renumber the remaining keys 0, 1, 2, ... in key order.  Returns
  # the number of keys kept.
  def enumerate_ordered
    count = 0
    keys.sort.each do |key|
      if self[key] == 1
        delete(key)
      else
        self[key] = count
        count += 1
      end
    end
    count
  end
end
95
+
96
# XXX: this is too memory consuming to keep like this. We need to split it up
# like the perl script does in hashes and arrays. Argh!
#
# Plain record holding everything collected about one code point.
class UnicodeCodepoint
  attr_accessor :code
  attr_accessor :type, :value, :lower, :upper, :cclass, :compat
  attr_accessor :compositions, :decompositions, :break_props

  # Every attribute other than the code point number starts out unknown.
  def initialize(code)
    @code = code
    @type = @value = @lower = @upper = @cclass = @compat = nil
    @compositions = @decompositions = @break_props = nil
  end
end
109
+
110
# XXX: cleanup
#
# Mutable bag of all tables gathered from the Unicode data files; each
# parser class fills in its part and Printer consumes the whole thing.
class CollectedData
  def initialize(dir = '.', indent = "\t")
    @dir = dir                    # directory holding the *.txt data files
    @indent = indent              # indentation unit used in generated C
    @cps = []

    @excludes = nil               # composition-exclusion code points

    @pages_before_e0000 = 0       # 256-entry pages below the U+E0000 split
    @last = 0x10ffff              # highest Unicode code point

    @type = []                    # general category per code point
    @value = []                   # case/digit value per code point
    @title_to_lower = {}
    @title_to_upper = {}
    @cclass = []                  # canonical combining class (as string)
    @decompose_compat = []
    @compositions = {}
    @decompositions = []

    @break_props = []             # line-break property per code point

    @special_case_offsets = []
    @special_cases = []

    @casefold = []
    @casefold_longest = -1

    @bidimirror = []
  end

  # Fix: `attr :name, true` (the obsolete writer-flag form, deprecated in
  # modern Ruby) replaced with the equivalent attr_reader/attr_accessor
  # declarations; the accessor interface is unchanged.
  attr_reader :dir
  attr_reader :indent
  attr_accessor :cps
  attr_accessor :excludes
  attr_accessor :pages_before_e0000
  attr_reader :last
  attr_accessor :type, :value, :title_to_lower, :title_to_upper, :cclass,
                :decompose_compat, :compositions, :decompositions
  attr_accessor :break_props
  attr_accessor :special_case_offsets
  attr_accessor :special_cases
  attr_accessor :casefold
  attr_accessor :casefold_longest
  attr_accessor :bidimirror
end
157
+
158
# Parses CompositionExclusions.txt: code points that must never be
# re-composed during normalization.
class CompositionExclusions
  # Fill data.excludes with { code point => true } for every entry in
  # CompositionExclusions.txt under data.dir.
  def process(data)
    data.excludes = Hash.new
    File.process(File.join(data.dir, 'CompositionExclusions.txt')) do |line|
      # Strip surrounding whitespace and any trailing comment, then parse
      # the remaining hexadecimal code point.
      data.excludes[line.chomp.sub(/^\s*(.*?)\s*(#.*)?$/,'\1').to_i(16)] = true
    end
  end
end
166
+
167
# Parses UnicodeData.txt, the master per-code-point table: general
# category, case mappings, combining class and (de)composition data.
class UnicodeData
  # Field indices of one semicolon-separated UnicodeData.txt record.
  CODE, NAME, CATEGORY, COMBINING_CLASSES, BIDI_CATEGORY,
  DECOMPOSITION, DECIMAL_VALUE, DIGIT_VALUE, NUMERIC_VALUE, MIRRORED,
  OLD_NAME, COMMENT, UPPER, LOWER, TITLE = (0..14).to_a

  def process(data)
    prev_code = -1
    path = File.join(data.dir, 'UnicodeData.txt')
    File.process(path) do |line|
      fields = line.chomp.split(/;/, -1)
      fields.verify_size(15, path, CODE)
      code = fields[CODE].to_i(16)

      # Record how many 256-entry pages precede the U+E0000 table split.
      if code >= 0xe0000 and prev_code < 0xe0000
        data.pages_before_e0000 = (prev_code >> 8) + 1
      end

      # Unlisted code points form a gap.  A "..., Last>" entry closes a
      # range and reuses its own fields; otherwise the gap is unassigned.
      if code > prev_code + 1
        process_gap(data,
                    prev_code + 1,
                    code - 1,
                    fields[NAME] =~ /Last>$/ ? fields : new_gap_fields)
      end
      process_one(data, code, fields)
      prev_code = code
    end
    # Everything after the final entry is unassigned.
    process_gap(data, prev_code + 1, 0x10ffff, new_gap_fields)
  end

  private

  # Template record for unassigned (category Cn) code points.
  def new_gap_fields
    ['', '', 'Cn', '0', '', '', '', '', '', '', '', '', '', '', '']
  end

  # Apply +fields+ to every code point in low..hi.
  def process_gap(data, low, hi, fields)
    low.upto(hi) do |i|
      fields[CODE] = sprintf('%04x', i)
      process_one(data, i, fields)
    end
  end

  # Record category, case/digit value, combining class and decomposition
  # for a single code point.
  def process_one(data, code, fields)
    # puts(code.to_s)
    # data.cps[code] ||= UnicodeCodepoint.new(code)
    data.type[code] = fields[CATEGORY]

    # TODO: Why not process things like 'Nl'?
    case data.type[code]
    when 'Nd'
      data.value[code] = fields[DECIMAL_VALUE].to_i
    when 'Ll'
      # For a lowercase letter the stored value is its uppercase mapping.
      data.value[code] = fields[UPPER].to_i(16)
    when 'Lu'
      # For an uppercase letter the stored value is its lowercase mapping.
      data.value[code] = fields[LOWER].to_i(16)
    when 'Lt'
      # Titlecase letters keep both mappings in dedicated hashes.
      data.title_to_lower[code] = fields[LOWER].to_i(16)
      data.title_to_upper[code] = fields[UPPER].to_i(16)
    end

    data.cclass[code] = fields[COMBINING_CLASSES]

    unless fields[DECOMPOSITION] == ''
      # A leading <tag> marks a compatibility (not canonical) decomposition.
      if fields[DECOMPOSITION] =~ /^\<.*\>\s*(.*)/
        data.decompose_compat[code] = true
        fields[DECOMPOSITION] = $1
      else
        data.decompose_compat[code] = false
        # Canonical decompositions also feed the composition table, unless
        # the code point is in the exclusion list.
        unless data.excludes.include?(code)
          data.compositions[code] = fields[DECOMPOSITION]
        end
      end
      data.decompositions[code] = fields[DECOMPOSITION]
    end
  end
end
243
+
244
# Parses LineBreak.txt: the line-breaking property of every code point,
# listed either singly or as FIRST..LAST ranges.
class LineBreak
  BREAK_CODE, BREAK_PROPERTY = (0..1).to_a

  def process(data)
    prev_code = -1
    path = File.join(data.dir, 'LineBreak.txt')
    File.process(path) do |line|
      fields = line.chomp.sub(/\s*#.*/, '').split(/;/, -1)
      fields.verify_size(2, path, BREAK_CODE)

      # An entry is either a XXXX..YYYY range or a single code point.
      if fields[BREAK_CODE] =~ /([0-9A-F]{4,6})\.\.([0-9A-F]{4,6})/
        start_code, end_code = $1.to_i(16), $2.to_i(16)
      else
        start_code = end_code = fields[BREAK_CODE].to_i(16)
      end

      # Fill the gap of unlisted code points before this entry.
      if start_code > prev_code + 1
        process_gap(data, prev_code + 1, start_code - 1)
      end

      start_code.upto(end_code) do |i|
        data.break_props[i] = fields[BREAK_PROPERTY]
      end

      prev_code = end_code
    end

    process_gap(data, prev_code + 1, 0x10ffff)
  end

  private

  # Unlisted code points default to XX (unknown) when unassigned (Cn) and
  # AL (alphabetic) otherwise.
  def process_gap(data, low, hi)
    low.upto(hi) do |i|
      data.break_props[i] = (data.type[i] == 'Cn') ? 'XX' : 'AL'
    end
  end
end
282
+
283
# Parses SpecialCasing.txt: case mappings that expand to more than one
# character, packed into a single NUL-separated string table.
class SpecialCasing
  CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = (0..4).to_a

  def initialize
    # Running byte offset into the packed special_case_table string.
    @offset = 0
  end

  def process(data)
    path = File.join(data.dir, 'SpecialCasing.txt')
    File.process(path) do |line|
      fields = line.chomp.sub(/\s*#.*/, '').split(/\s*;\s*/, -1)
      fields.verify_size((5..6), path, CASE_CODE)
      raw_code, code = fields[CASE_CODE], fields[CASE_CODE].to_i(16)
      unless data.type[code].nil?
        # We ignore conditional special cases
        next if fields.size == 6

        # Sanity-check that the mapping for the code point's own case is
        # the identity, then store the two other-case mappings.
        case data.type[code]
        when 'Lu'
          fields.verify_field(CASE_UPPER, code, path, raw_code, 'Lu', 'Upper')
          add_special_case(data, code, data.value[code],
                           fields[CASE_LOWER], fields[CASE_TITLE])
        when 'Lt'
          fields.verify_field(CASE_TITLE, code, path, raw_code, 'Lt', 'Title')
          # Titlecase letters store no single-character "best" mapping.
          add_special_case(data, code, nil,
                           fields[CASE_LOWER], fields[CASE_UPPER])
        when 'Ll'
          fields.verify_field(CASE_LOWER, code, path, raw_code, 'Ll', 'Lower')
          add_special_case(data, code, data.value[code],
                           fields[CASE_UPPER], fields[CASE_TITLE])
        else
          error("special case for non-alphabetic code point:\n" +
                " file: %s\n" +
                " type: %s\n" +
                " code point/entry: %s\n",
                path, data.type[code], raw_code)
        end
      else
        error("special case for code point which doesn't have a type:\n" +
              " file: %s\n" +
              " code point/entry: %d\n",
              path, code)
      end
    end
  end

  private

  # Append one record -- the single-character mapping (if any), then the
  # two multi-character mappings separated by a NUL -- to the packed table,
  # and redirect data.value[code] at it via the 0x1000000 + offset scheme.
  def add_special_case(data, code, single, field1, field2)
    values = [
      single.nil? ? nil : [single],
      field1.split(/\s+/).map{ |s| s.to_i(16) },
      [0],
      field2.split(/\s+/).map{ |s| s.to_i(16) },
    ]
    result = ''
    values.each{ |value| result += value.pack('U*') unless value.nil? }

    data.special_case_offsets.push(@offset)
    # Values >= 0x1000000 mark an index into special_case_table.
    data.value[code] = 0x1000000 + @offset
    data.special_cases.push(result.escape)
    # +1 accounts for the terminating NUL emitted by the printer.
    @offset += 1 + result.length
  end
end
347
+
348
# Parses CaseFolding.txt, keeping only those foldings that cannot be
# derived by simply lowercasing the code point.
class CaseFolding
  FOLDING_CODE, FOLDING_STATUS, FOLDING_MAPPING = (0..2).to_a

  def process(data)
    path = File.join(data.dir, 'CaseFolding.txt')
    File.process(path) do |line|
      fields = line.chomp.sub(/\s*#.*/, '').split(/\s*;\s*/, -1)
      fields.verify_size(4, path, FOLDING_CODE)

      # skip Simple and Turkic rules
      next if fields[FOLDING_STATUS] =~ /^[ST]$/

      raw_code, code = fields[FOLDING_CODE], fields[FOLDING_CODE].to_i(16)
      values = fields[FOLDING_MAPPING].split(/\s+/).map{ |s| s.to_i(16) }
      # A single-character folding equal to the plain lowercase mapping
      # carries no information, so drop it.  Code points whose value is a
      # special-case table offset (>= 0x1000000) are never dropped.
      if values.size == 1 &&
         !(!data.value[code].nil? && data.value[code] >= 0x1000000) &&
         !data.type[code].nil?
        case data.type[code]
        when 'Ll'
          lower = code
        when 'Lt'
          lower = data.title_to_lower[code]
        when 'Lu'
          # For Lu code points, data.value holds the lowercase mapping.
          lower = data.value[code]
        else
          lower = code
        end
        next if lower == values[0]
      end

      string = values.pack('U*')
      # Track the longest folded string (+1 for the trailing NUL) so the
      # C table's char array can be sized to fit.
      if string.length + 1 > data.casefold_longest
        data.casefold_longest = string.length + 1
      end
      data.casefold.push([code, string.escape])
    end
  end
end
386
+
387
# Parses BidiMirroring.txt: pairs of code points that mirror one another in
# bidirectional text.
class BidiMirroring
  # Append each [code point, mirrored code point] pair to data.bidimirror.
  def process(data)
    path = File.join(data.dir, 'BidiMirroring.txt')
    File.process(path) do |line|
      entry = line.chomp.sub(/\s*#.*/, '').split(/\s*;\s*/, -1)
      entry.verify_size(2, path, 0)
      pair = entry.first(2).map { |field| field.to_i(16) }
      data.bidimirror.push(pair)
    end
  end
end
397
+
398
# Emits the four generated C headers (character-tables.h, decompose.h,
# compose.h, break.h) from a fully populated CollectedData.  The heredoc
# bodies below ARE the output format; their exact text and spacing matter.
class Printer
  def initialize
    # Running index of non-uniform 256-entry pages within the current table.
    @index = 0
  end

  # Compute the part1/part2 split constants, then write all four headers.
  def process(data)
    @last_char_part1_i = data.pages_before_e0000 * 256 - 1
    @last_char_part1_x = sprintf('0x%04x', @last_char_part1_i)
    @last_char_part1_X = sprintf('%04X', @last_char_part1_i)
    print_tables(data)
    print_decomp(data)
    print_composition_table(data)
    print_line_break(data)
  end

  private

  # Map general category code onto symbolic name.
  Mappings = {
    # Normative.
    'Lu' => 'UNICODE_UPPERCASE_LETTER',
    'Ll' => 'UNICODE_LOWERCASE_LETTER',
    'Lt' => 'UNICODE_TITLECASE_LETTER',
    'Mn' => 'UNICODE_NON_SPACING_MARK',
    'Mc' => 'UNICODE_COMBINING_MARK',
    'Me' => 'UNICODE_ENCLOSING_MARK',
    'Nd' => 'UNICODE_DECIMAL_NUMBER',
    'Nl' => 'UNICODE_LETTER_NUMBER',
    'No' => 'UNICODE_OTHER_NUMBER',
    'Zs' => 'UNICODE_SPACE_SEPARATOR',
    'Zl' => 'UNICODE_LINE_SEPARATOR',
    'Zp' => 'UNICODE_PARAGRAPH_SEPARATOR',
    'Cc' => 'UNICODE_CONTROL',
    'Cf' => 'UNICODE_FORMAT',
    'Cs' => 'UNICODE_SURROGATE',
    'Co' => 'UNICODE_PRIVATE_USE',
    'Cn' => 'UNICODE_UNASSIGNED',

    # Informative.
    'Lm' => 'UNICODE_MODIFIER_LETTER',
    'Lo' => 'UNICODE_OTHER_LETTER',
    'Pc' => 'UNICODE_CONNECT_PUNCTUATION',
    'Pd' => 'UNICODE_DASH_PUNCTUATION',
    'Ps' => 'UNICODE_OPEN_PUNCTUATION',
    'Pe' => 'UNICODE_CLOSE_PUNCTUATION',
    'Pi' => 'UNICODE_INITIAL_PUNCTUATION',
    'Pf' => 'UNICODE_FINAL_PUNCTUATION',
    'Po' => 'UNICODE_OTHER_PUNCTUATION',
    'Sm' => 'UNICODE_MATH_SYMBOL',
    'Sc' => 'UNICODE_CURRENCY_SYMBOL',
    'Sk' => 'UNICODE_MODIFIER_SYMBOL',
    'So' => 'UNICODE_OTHER_SYMBOL'
  }

  # Map line-break property code onto symbolic name.
  BreakMappings = {
    'BK' => 'UNICODE_BREAK_MANDATORY',
    'CR' => 'UNICODE_BREAK_CARRIAGE_RETURN',
    'LF' => 'UNICODE_BREAK_LINE_FEED',
    'CM' => 'UNICODE_BREAK_COMBINING_MARK',
    'SG' => 'UNICODE_BREAK_SURROGATE',
    'ZW' => 'UNICODE_BREAK_ZERO_WIDTH_SPACE',
    'IN' => 'UNICODE_BREAK_INSEPARABLE',
    'GL' => 'UNICODE_BREAK_NON_BREAKING_GLUE',
    'CB' => 'UNICODE_BREAK_CONTINGENT',
    'SP' => 'UNICODE_BREAK_SPACE',
    'BA' => 'UNICODE_BREAK_AFTER',
    'BB' => 'UNICODE_BREAK_BEFORE',
    'B2' => 'UNICODE_BREAK_BEFORE_AND_AFTER',
    'HY' => 'UNICODE_BREAK_HYPHEN',
    'NS' => 'UNICODE_BREAK_NON_STARTER',
    'OP' => 'UNICODE_BREAK_OPEN_PUNCTUATION',
    'CL' => 'UNICODE_BREAK_CLOSE_PUNCTUATION',
    'QU' => 'UNICODE_BREAK_QUOTATION',
    'EX' => 'UNICODE_BREAK_EXCLAMATION',
    'ID' => 'UNICODE_BREAK_IDEOGRAPHIC',
    'NU' => 'UNICODE_BREAK_NUMERIC',
    'IS' => 'UNICODE_BREAK_INFIX_SEPARATOR',
    'SY' => 'UNICODE_BREAK_SYMBOL',
    'AL' => 'UNICODE_BREAK_ALPHABETIC',
    'PR' => 'UNICODE_BREAK_PREFIX',
    'PO' => 'UNICODE_BREAK_POSTFIX',
    'SA' => 'UNICODE_BREAK_COMPLEX_CONTEXT',
    'AI' => 'UNICODE_BREAK_AMBIGUOUS',
    'NL' => 'UNICODE_BREAK_NEXT_LINE',
    'WJ' => 'UNICODE_BREAK_WORD_JOINER',
    'XX' => 'UNICODE_BREAK_UNKNOWN',
    'JL' => 'UNICODE_BREAK_HANGUL_L_JAMO',
    'JV' => "UNICODE_BREAK_HANGUL_V_JAMO",
    'JT' => "UNICODE_BREAK_HANGUL_T_JAMO",
    'H2' => "UNICODE_BREAK_HANGUL_LV_SYLLABLE",
    'H3' => "UNICODE_BREAK_HANGUL_LVT_SYLLABLE"
  };

  # Sentinel offset meaning "no decomposition stored".
  NOT_PRESENT_OFFSET = 65535

  # Emit a two-level paged lookup table: the per-page data array (header),
  # then the part1 page index, then (when mid != hi) the part2 page index
  # covering U+E0000 and up.  The block +f+ yields the cell value for a
  # code point; uniform pages collapse into a single inline value.
  def print_table(data, low, mid, hi, size, header, part1_h, part2_h, &f)
    @index = 0
    rows = []
    print(header)
    low.step(hi, 256) do |i|
      rows[i / 256] = print_row(data, i, size){ |i| f.call(i) }
    end
    print("\n};\n")
    print(part1_h)
    low.step(mid, 256) do |i|
      printf("%s%s,\n", data.indent, rows[i / 256])
    end
    print("};\n")
    if mid != hi
      print(part2_h)
      0xe0000.step(hi, 256) do |i|
        printf("%s%s,\n", data.indent, rows[i / 256])
      end
      print("};\n")
    end
  end

  # Write character-tables.h: general-category table, case/digit attribute
  # table, titlecase table, special-case and casefold tables, bidi mirror
  # table.  Redirects $stdout into the output file for the duration.
  def print_tables(data, outfile = 'character-tables.h')
    row = []
    saved_stdout = $stdout
    File.open(outfile, 'w') do |file|
      header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
      $stdout = file
      print <<EOF
/* Automatically generated file */

#ifndef #{header_h}
#define #{header_h}

#define UNICODE_DATA_VERSION "#{UnicodeVersion}"

#define UNICODE_LAST_CHAR #{sprintf('0x%04x', data.last)}

#define UNICODE_MAX_TABLE_INDEX 10000

#define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}

#define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}

#define UNICODE_SPECIAL_CASE_TABLE_START 0x1000000
EOF
      print_table(data, 0, @last_char_part1_i, data.last, 1,
                  <<EOH, <<EOH1, <<EOH2){ |i| Mappings[data.type[i]] }


static const char type_data[][256] = {
EOH


/* U+0000 through U+#{@last_char_part1_X} */
static const int16_t type_table_part1[#{data.pages_before_e0000}] = {
EOH1


/* U+E0000 through U+#{sprintf('%04X', data.last)} */
static const int16_t type_table_part2[768] = {
EOH2

      print_table(data, 0, @last_char_part1_i, data.last, 4,
                  <<EOH, <<EOH1, <<EOH2) { |i| data.value[i].nil? ? '0x0000' : sprintf('0x%04x', data.value[i]) }


static const unichar attr_data[][256] = {
EOH


/* U+0000 through U+#{@last_char_part1_X} */
static const int16_t attr_table_part1[#{data.pages_before_e0000}] = {
EOH1


/* U+E0000 through U+#{sprintf('%04X', data.last)} */
static const int16_t attr_table_part2[768] = {
EOH2

      print <<EOF


static const unichar title_table[][3] = {
EOF
      data.title_to_lower.keys.sort.each do |code|
        printf("%s{ 0x%04x, 0x%04x, 0x%04x },\n", data.indent,
               code, data.title_to_upper[code], data.title_to_lower[code])
      end
      print("};\n")

      print_special_case_table(data)
      print_case_fold_table(data)

      print <<EOF
static const struct {
#{data.indent}unichar ch;
#{data.indent}unichar mirrored_ch;
} bidi_mirroring_table[] = {
EOF
      data.bidimirror.each do |item|
        printf("%s{ 0x%04x, 0x%04x },\n", data.indent, item[0], item[1])
      end
      print <<EOF
};

#endif /* #{header_h} */
EOF
    end
    $stdout = saved_stdout
  end

  # Emit one 256-entry page of a table.  If every cell is identical the
  # page is not emitted at all; the shared value + UNICODE_MAX_TABLE_INDEX
  # is returned for inlining in the page index instead.  Otherwise prints
  # the page (wrapped at column 79) and returns its running index.
  def print_row(data, start, type_size)
    flag = true
    values = []
    0.upto(255) do |i|
      values[i] = yield(start + i)
      flag = false if values[i] != values[0]
    end
    return values[0] + " + UNICODE_MAX_TABLE_INDEX" if flag

    puts(',') if @index != 0
    printf("%s{ /* page %d, index %d */\n%s",
           data.indent, start / 256, @index, data.indent * 2)
    column = data.indent.width * 2
    start.upto(start + 255) do |i|
      text = values[i - start]
      if text.length + column + 2 > 79
        printf("\n%s", data.indent * 2)
        column = data.indent.width * 2
      end

      printf("%s, ", text)
      column += text.width + 2
    end

    print("\n#{data.indent}}")
    @index += 1
    return sprintf("%d /* page %d */", @index - 1, start / 256);
  end

  # Emit the packed special-case string table collected by SpecialCasing.
  def print_special_case_table(data)
    print <<EOF


/*
 * Table of special cases for case conversion; each record contains
 * First, the best single character mapping to lowercase if Lu,
 * and to uppercase if Ll, followed by the output mapping for the two cases
 * other than the case of the codepoint, in the order Ll, Lu, Lt, encoded in
 * UTF-8, separated and terminated by a NUL character.
 */
static const char special_case_table[] = {
EOF
    data.special_cases.each_with_index do |sc, i|
      printf(%Q< "%s\\0" /* offset %d */\n>, sc, data.special_case_offsets[i])
    end
    print <<EOF
};

EOF
  end

  # Emit the casefold table collected by CaseFolding, sorted by code point.
  def print_case_fold_table(data)
    print <<EOF

/*
 * Table of casefolding cases that can't be derived by lowercasing.
 */
static const struct {
#{data.indent}uint16_t ch;
#{data.indent}char data[#{data.casefold_longest}];
} casefold_table[] = {
EOF
    data.casefold.sort_by{ |a| a[0] }.each do |cf|
      # The ch field is uint16_t; fail loudly rather than truncate.
      if cf[0] > 0xffff
        error('casefold_table.ch field too short.' +
              ' Upgrade to unichar to fit values beyond 0xffff.')
      end
      printf(%Q<%s{ 0x%04x, "%s" },\n>, data.indent, cf[0], cf[1])
    end
    print <<EOF
};
EOF
  end

  # Write decompose.h: combining-class table, decomposition offset table
  # and the packed decomposition expansion string.
  def print_decomp(data, outfile = 'decompose.h')
    row = []
    saved_stdout = $stdout
    File.open(outfile, 'w') do |file|
      header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
      $stdout = file
      print <<EOF
/* Automatically generated file */

#ifndef #{header_h}
#define #{header_h}


#define UNICODE_LAST_CHAR #{sprintf('0x%04x', data.last)}

#define UNICODE_MAX_TABLE_INDEX (0x110000 / 256)

#define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}
#define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}

#define UNICODE_NOT_PRESENT_OFFSET #{NOT_PRESENT_OFFSET}
EOF
      print_table(data, 0, @last_char_part1_i, data.last, 1,
                  <<EOH, <<EOH1, <<EOH2){ |i| data.cclass[i] }


static const uint8_t cclass_data[][256] = {
EOH


static const int16_t combining_class_table_part1[#{data.pages_before_e0000}] = {
EOH1


static const int16_t combining_class_table_part2[768] = {
EOH2

      print <<EOL


static const struct {
#{data.indent}unichar ch;
#{data.indent}uint16_t canon_offset;
#{data.indent}uint16_t compat_offset;
} decomp_table[] = {
EOL
      decomp_offsets = {}
      decomp_string = ''
      @decomp_string_offset = 0
      0.upto(data.last) do |i|
        unless data.decompositions[i].nil?
          # Compatibility-only code points have no canonical decomposition;
          # identical compat/canon expansions are stored only once.
          canon_decomp = data.decompose_compat[i] ?
            nil : make_decomp(data, i, false)
          compat_decomp = make_decomp(data, i, true)
          if not canon_decomp.nil? and compat_decomp == canon_decomp
            compat_decomp = nil
          end
          canon_offset = handle_decomp(canon_decomp, decomp_offsets,
                                       decomp_string)
          compat_offset = handle_decomp(compat_decomp, decomp_offsets,
                                        decomp_string)

          # Offsets are uint16_t; the sentinel is the maximum value.
          if @decomp_string_offset > NOT_PRESENT_OFFSET
            error('decomposition string offset beyond not-present-offset,' +
                  " upgrade value:\n" +
                  " offset: %d\n" +
                  " max: %d\n",
                  @decomp_string_offset, NOT_PRESENT_OFFSET)
          end
          printf("%s{ 0x%04x, %s, %s },\n",
                 data.indent, i, canon_offset, compat_offset)
        end
      end
      print("\n};")

      print <<EOL

static const char decomp_expansion_string[] = #{decomp_string};


#endif /* #{header_h} */
EOL
    end
    $stdout = saved_stdout
  end

  # Recursively expand the decomposition of +code+ into a flat array of
  # code points; compatibility decompositions are followed only when
  # +compat+ is true.
  def expand_decomp(data, code, compat)
    ary = []
    data.decompositions[code].split(/ /).each do |item|
      pos = item.to_i(16)
      if not data.decompositions[pos].nil? and
         (compat or not data.decompose_compat[pos])
        ary.concat(expand_decomp(data, pos, compat))
      else
        ary.push(pos)
      end
    end
    ary
  end

  # Fully expanded decomposition of +code+ as a UTF-8 string.
  def make_decomp(data, code, compat)
    str = ''
    expand_decomp(data, code, compat).each do |item|
      str += item.is_a?(Array) ? item.flatten.pack('U') : [item].pack('U')
    end
    str
  end

  # Intern +decomp+ in the packed expansion string, reusing an existing
  # offset when the same expansion was seen before.  Returns the numeric
  # offset, or the UNICODE_NOT_PRESENT_OFFSET macro name for nil.
  def handle_decomp(decomp, decomp_offsets,
                    decomp_string)
    offset = 'UNICODE_NOT_PRESENT_OFFSET'
    unless decomp.nil?
      if decomp_offsets.member?(decomp)
        offset = decomp_offsets[decomp]
      else
        offset = @decomp_string_offset
        decomp_offsets[decomp] = offset
        decomp_string << ("\n \"" + decomp.escape +
                          "\\0\" /* offset #{offset} */")
        @decomp_string_offset += decomp.length + 1
      end
    end
    offset
  end

  # Write compose.h: split composition pairs into frequent first/second
  # characters (indexed through compose_array) and singleton pairs, then
  # emit the lookup tables.
  def print_composition_table(data, outfile = 'compose.h')
    first = Hash.new(0)
    second = Hash.new(0)

    data.compositions.each do |code, value|
      values = value.split(/\s+/).map{ |s| s.to_i(16) }

      # skip non-starters and single-character decompositions
      if data.cclass[values[0]] != '0' or values.size == 1
        data.compositions.delete(code)
        next
      end

      if values.size != 2
        error("decomposition of entry contains more than two elements:\n" +
              " entry: %d\n" +
              " elements: %d\n",
              code, values.size)
      end

      first[values[0]] += 1
    end

    # Renumber first characters that occur more than once.
    n_first = first.enumerate_ordered

    data.compositions.each do |code, value|
      values = value.split(/\s+/).map{ |s| s.to_i(16) }

      second[values[1]] += 1 if first.member?(values[0])
    end

    n_second = second.enumerate_ordered

    # Pairs where either character appears only once are stored in the
    # singleton tables instead of the full matrix.
    first_singletons = []
    second_singletons = []
    reverse = {}
    data.compositions.each do |code, value|
      values = value.split(/\s+/).map{ |s| s.to_i(16) }

      if first.member?(values[0]) and second.member?(values[1])
        reverse["#{first[values[0]]}|#{second[values[1]]}"] = code
      elsif not first.member?(values[0])
        first_singletons.push([values[0], values[1], code])
      else
        second_singletons.push([values[1], values[0], code])
      end
    end

    first_singletons = first_singletons.sort_by{ |a| a[0] }
    second_singletons = second_singletons.sort_by{ |a| a[0] }

    row = []
    saved_stdout = $stdout
    File.open(outfile, 'w') do |file|
      header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
      $stdout = file
      # Assign each participating character its slot in compose_data.
      values = {}
      total = first_start = 1
      last = 0

      first.each do |code, value|
        values[code] = value + total
        last = code if code > last
      end
      total += n_first

      first_single_start = total
      first_singletons.each_with_index do |item, i|
        code = item[0]
        values[code] = i + total
        last = code if code > last
      end
      total += first_singletons.size

      second_start = total
      second.each do |code, value|
        values[code] = value + total
        last = code if code > last
      end
      total += n_second

      second_single_start = total
      second_singletons.each_with_index do |item, i|
        code = item[0]
        values[code] = i + total
        last = code if code > last
      end

      print <<EOL
/* Automatically generated file */

#ifndef #{header_h}
#define #{header_h}


#define COMPOSE_FIRST_START #{first_start}
#define COMPOSE_FIRST_SINGLE_START #{first_single_start}
#define COMPOSE_SECOND_START #{second_start}
#define COMPOSE_SECOND_SINGLE_START #{second_single_start}
#define COMPOSE_TABLE_LAST #{last / 256}
EOL

      print_table(data, 0, last, last, 2,
                  <<EOH, <<EOH1, nil){ |i| values.member?(i) ? values[i].to_s : '0' }


static const uint16_t compose_data[][256] = {
EOH


static const int16_t compose_table[COMPOSE_TABLE_LAST + 1] = {
EOH1

      print <<EOL


static const uint16_t compose_first_single[][2] = {
EOL
      first_singletons.each_with_index do |item, i|
        if item[1] > 0xffff or item[2] > 0xffff
          error("compose_first_single table field too short." +
                " Upgrade to unichar to fit values beyond 0xffff.")
        end
        printf("%s{ %#06x, %#06x },\n", data.indent, item[1], item[2])
      end
      print("};\n")

      print <<EOL


static const uint16_t compose_second_single[][2] = {
EOL
      second_singletons.each_with_index do |item, i|
        if item[1] > 0xffff or item[2] > 0xffff
          error("compose_second_single table field too short." +
                " Upgrade to unichar to fit values beyond 0xffff.")
        end
        printf("%s{ %#06x, %#06x },\n", data.indent, item[1], item[2])
      end
      print("};\n")

      print <<EOL


static const uint16_t compose_array[#{n_first}][#{n_second}] = {
EOL
      0.upto(n_first - 1) do |i|
        printf("%s{\n%s", data.indent, data.indent * 2)
        column = data.indent.width * 2
        0.upto(n_second - 1) do |j|
          if column + 8 > 79
            printf("\n%s", data.indent * 2)
            column = data.indent.width * 2
          end
          if reverse.member?("#{i}|#{j}")
            if reverse["#{i}|#{j}"] > 0xffff
              error("compose_array table field too short." +
                    " Upgrade to unichar to fit values beyond 0xffff.")
            end
            printf("0x%04x, ", reverse["#{i}|#{j}"])
          else
            # NOTE(review): reconstructed as an 8-column cell to match
            # "0x%04x, " and the unconditional column += 8 -- confirm
            # against the original file, whose spacing the diff collapsed.
            print("     0, ")
          end
          column += 8
        end
        printf("\n%s},\n", data.indent)
      end
      print("};\n")

      print <<EOL


#endif /* #{header_h} */
EOL
    end
    $stdout = saved_stdout
  end

  # Write break.h: the paged line-break property table.
  def print_line_break(data, outfile = 'break.h')
    row = []
    saved_stdout = $stdout
    File.open(outfile, 'w') do |file|
      header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
      $stdout = file
      print <<EOF
/* Automatically generated file */

#ifndef #{header_h}
#define #{header_h}

#define UNICODE_DATA_VERSION "#{UnicodeVersion}"

#define UNICODE_LAST_CHAR #{sprintf('0x%04x', data.last)}

#define UNICODE_MAX_TABLE_INDEX 10000

/*
 * The last code point that should be looked up in break_property_table_part1.
 */
#define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}

/*
 * The first code point that should be looked up in break_property_table_part2.
 */
#define UNICODE_FIRST_CHAR_PART2 0xe0000
EOF
      print_table(data, 0, @last_char_part1_i, data.last, 1,
                  <<EOH, <<EOH1, <<EOH2){ |i| BreakMappings[data.break_props[i]] }


static const int8_t break_property_data[][256] = {
EOH


/* U+0000 through U+#{@last_char_part1_X} */
static const int16_t break_property_table_part1[#{data.pages_before_e0000}] = {
EOH1


/* U+E0000 through U+#{sprintf('%04X', data.last)} */
static const int16_t break_property_table_part2[768] = {
EOH2

      print <<EOF


#endif /* #{header_h} */
EOF
    end
    $stdout = saved_stdout
  end
end
1036
+
1037
# Unicode version string (first command-line argument), embedded verbatim
# in the generated headers.
UnicodeVersion = ARGV[0]

# Driver: verify the input files exist, then run every parser over a shared
# CollectedData in dependency order, finishing with the Printer.
class Runner
  def main
    check_for_data_files(ARGV[1])
    data = CollectedData.new(ARGV[1], "\t")
    # Order matters: e.g. CompositionExclusions must precede UnicodeData,
    # and Printer consumes what all the others collected.
    [CompositionExclusions, UnicodeData, LineBreak,
     SpecialCasing, CaseFolding, BidiMirroring, Printer].each do |klass|
      klass.new.process(data)
    end
  end

  private

  # Abort unless every required Unicode data file is readable in +dir+.
  def check_for_data_files(dir)
    ['UnicodeData.txt', 'LineBreak.txt', 'SpecialCasing.txt', 'CaseFolding.txt',
     'CompositionExclusions.txt', 'BidiMirroring.txt'].each do |file|
      path = File.join(dir, file)
      unless File.readable?(path)
        error('missing required file: %s', path)
      end
    end
  end
end

Runner.new.main



# vim: set sts=2 sw=2: