u 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. data/README +38 -0
  2. data/Rakefile +64 -0
  3. data/ext/encoding/character/utf-8/break.c +25 -0
  4. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  5. data/ext/encoding/character/utf-8/data/character-tables.h +14358 -0
  6. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  7. data/ext/encoding/character/utf-8/data/decompose.h +10926 -0
  8. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1070 -0
  9. data/ext/encoding/character/utf-8/decompose.c +444 -0
  10. data/ext/encoding/character/utf-8/depend +65 -0
  11. data/ext/encoding/character/utf-8/extconf.rb +67 -0
  12. data/ext/encoding/character/utf-8/private.c +62 -0
  13. data/ext/encoding/character/utf-8/private.h +51 -0
  14. data/ext/encoding/character/utf-8/properties.c +1056 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +19 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_private.h +52 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  19. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  20. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  22. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  23. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  24. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  25. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  26. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  27. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  28. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  29. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  30. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  31. data/ext/encoding/character/utf-8/rb_utf_insert.c +48 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +332 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  35. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  36. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  37. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  38. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  39. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  40. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  41. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  43. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  44. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  45. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  46. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  47. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  48. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  49. data/ext/encoding/character/utf-8/tables.h +38 -0
  50. data/ext/encoding/character/utf-8/unicode.c +319 -0
  51. data/ext/encoding/character/utf-8/unicode.h +216 -0
  52. data/ext/encoding/character/utf-8/utf.c +1334 -0
  53. data/lib/encoding/character/utf-8.rb +201 -0
  54. data/lib/u.rb +16 -0
  55. data/lib/u/string.rb +185 -0
  56. data/lib/u/version.rb +5 -0
  57. data/test/unit/u.rb +5 -0
  58. data/test/unit/u/string.rb +91 -0
  59. metadata +174 -0
data/ext/encoding/character/utf-8/data/generate-unicode-data.rb
@@ -0,0 +1,1070 @@
+ #! /usr/bin/ruby -w
+ =begin
+ :contents: Generate Unicode table headers.
+ :arch-tag: 98c7456d-c7d9-4b40-9971-409428593ad5
+
+ Copyright (C) 2004 Nikolai Weibull <source@pcppopper.org>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ =end
+
+
+
+ def error(fmt, *args)
+   $stderr.printf("%s: %s\n", File.basename($0), sprintf(fmt, *args))
+   exit(1)
+ end
+
+ class File
+   def self.process(path)
+     begin
+       File.open(path) do |file|
+         file.each_line do |line|
+           next if line =~ /^(#|\s*$)/
+           yield line
+         end
+       end
+     rescue IOError => e
+       error("I/O error while processing input:\n" +
+             "  file: %s\n" +
+             "  error: %s\n", path, e.message)
+     end
+   end
+ end
+
+ class String
+   def escape
+     self.unpack('H*')[0].gsub(/(.{2})/, '\\x\1')
+   end
+
+   def width
+     self.gsub(/\t/, ' ' * 8).length
+   end
+ end
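+
+ # A rough illustration of what these helpers yield for a UTF-8 input
+ # (values derived from the definitions above):
+ #
+ #   "\xc3\xa9".escape   # => "\\xc3\\xa9" -- embeddable in a C literal
+ #   "\tfoo".width       # => 11, counting tabs as eight columns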
+
+ class Array
+   def verify_size(wanted, path, index)
+     unless wanted === self.size
+       error("entry doesn't contain the required %s fields:\n" +
+             "  file: %s\n" +
+             "  entry: %s\n" +
+             "  field count: %d\n",
+             wanted.to_s,
+             path,
+             (self.size > index) ? self[index] : 'N/A',
+             self.size)
+     end
+   end
+
+   def verify_field(index, code, path, raw_code, type, ccase)
+     if self[index].to_i(16) != code
+       error("entry has type %s but UCD_%s(%s) != %s:\n" +
+             "  file: %s\n" +
+             "  entry: %s\n",
+             type, ccase, raw_code, raw_code, path, raw_code)
+     end
+   end
+ end
+
+ class Hash
+   def enumerate_ordered
+     n = 0
+     self.keys.sort.each do |code|
+       if self[code] == 1
+         self.delete(code)
+         next
+       end
+       self[code] = n
+       n += 1
+     end
+     n
+   end
+ end
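+
+ # For instance, { 0x41 => 2, 0x42 => 1, 0x43 => 3 }.enumerate_ordered
+ # deletes the singleton 0x42, renumbers the remaining keys in sorted
+ # order to { 0x41 => 0, 0x43 => 1 }, and returns 2.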
+
+ # XXX: this is too memory consuming to keep like this. We need to split it up
+ # like the perl script does in hashes and arrays. Argh!
+ class UnicodeCodepoint
+   def initialize(code)
+     @code = code
+     @type = @value = @lower = @upper = @cclass = @compat = nil
+     @compositions = @decompositions = @break_props = nil
+   end
+
+   attr_accessor :code
+   attr_accessor :type, :value, :lower, :upper, :cclass, :compat
+   attr_accessor :compositions, :decompositions, :break_props
+ end
+
+ # XXX: cleanup
+ class CollectedData
+   def initialize(dir = '.', indent = "\t")
+     @dir = dir
+     @indent = indent
+     @cps = []
+
+     @excludes = nil
+
+     @pages_before_e0000 = 0
+     @last = 0x10ffff
+
+     @type = []
+     @value = []
+     @title_to_lower = {}
+     @title_to_upper = {}
+     @cclass = []
+     @decompose_compat = []
+     @compositions = {}
+     @decompositions = []
+
+     @break_props = []
+
+     @special_case_offsets = []
+     @special_cases = []
+
+     @casefold = []
+     @casefold_longest = -1
+
+     @bidimirror = []
+   end
+
+   attr_reader :dir
+   attr_reader :indent
+   attr_accessor :cps
+   attr_accessor :excludes
+   attr_accessor :pages_before_e0000
+   attr_reader :last
+   attr_accessor :type, :value, :title_to_lower, :title_to_upper, :cclass,
+                 :decompose_compat, :compositions, :decompositions
+   attr_accessor :break_props
+   attr_accessor :special_case_offsets
+   attr_accessor :special_cases
+   attr_accessor :casefold
+   attr_accessor :casefold_longest
+   attr_accessor :bidimirror
+ end
+
+ class CompositionExclusions
+   def process(data)
+     data.excludes = Hash.new
+     File.process(File.join(data.dir, 'CompositionExclusions.txt')) do |line|
+       data.excludes[line.chomp.sub(/^\s*(.*?)\s*(#.*)?$/, '\1').to_i(16)] = true
+     end
+   end
+ end
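+
+ # CompositionExclusions.txt carries one code point per line, e.g.
+ #
+ #   0958    # DEVANAGARI LETTER QA
+ #
+ # so data.excludes ends up keyed by integer code point (0x958 => true);
+ # comment-only lines are already skipped by File.process.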
+
+ class UnicodeData
+   CODE, NAME, CATEGORY, COMBINING_CLASSES, BIDI_CATEGORY,
+     DECOMPOSITION, DECIMAL_VALUE, DIGIT_VALUE, NUMERIC_VALUE, MIRRORED,
+     OLD_NAME, COMMENT, UPPER, LOWER, TITLE = (0..14).to_a
+
+   def process(data)
+     prev_code = -1
+     path = File.join(data.dir, 'UnicodeData.txt')
+     File.process(path) do |line|
+       fields = line.chomp.split(/;/, -1)
+       fields.verify_size(15, path, CODE)
+       code = fields[CODE].to_i(16)
+
+       if code >= 0xe0000 and prev_code < 0xe0000
+         data.pages_before_e0000 = (prev_code >> 8) + 1
+       end
+
+       if code > prev_code + 1
+         process_gap(data,
+                     prev_code + 1,
+                     code - 1,
+                     fields[NAME] =~ /Last>$/ ? fields : new_gap_fields)
+       end
+       process_one(data, code, fields)
+       prev_code = code
+     end
+     process_gap(data, prev_code + 1, 0x10ffff, new_gap_fields)
+   end
+
+   private
+
+   def new_gap_fields
+     ['', '', 'Cn', '0', '', '', '', '', '', '', '', '', '', '', '']
+   end
+
+   def process_gap(data, low, hi, fields)
+     low.upto(hi) do |i|
+       fields[CODE] = sprintf('%04x', i)
+       process_one(data, i, fields)
+     end
+   end
+
+   def process_one(data, code, fields)
+     # puts(code.to_s)
+     # data.cps[code] ||= UnicodeCodepoint.new(code)
+     data.type[code] = fields[CATEGORY]
+
+     # TODO: Why not process things like 'Nl'?
+     case data.type[code]
+     when 'Nd'
+       data.value[code] = fields[DECIMAL_VALUE].to_i
+     when 'Ll'
+       data.value[code] = fields[UPPER].to_i(16)
+     when 'Lu'
+       data.value[code] = fields[LOWER].to_i(16)
+     when 'Lt'
+       data.title_to_lower[code] = fields[LOWER].to_i(16)
+       data.title_to_upper[code] = fields[UPPER].to_i(16)
+     end
+
+     data.cclass[code] = fields[COMBINING_CLASSES]
+
+     unless fields[DECOMPOSITION] == ''
+       if fields[DECOMPOSITION] =~ /^\<.*\>\s*(.*)/
+         data.decompose_compat[code] = true
+         fields[DECOMPOSITION] = $1
+       else
+         data.decompose_compat[code] = false
+         unless data.excludes.include?(code)
+           data.compositions[code] = fields[DECOMPOSITION]
+         end
+       end
+       data.decompositions[code] = fields[DECOMPOSITION]
+     end
+   end
+ end
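+
+ # A sample UnicodeData.txt record as consumed by process_one:
+ #
+ #   00C5;LATIN CAPITAL LETTER A WITH RING ABOVE;Lu;0;L;0041 030A;;;;N;LATIN CAPITAL LETTER A RING;;;00E5;
+ #
+ # CATEGORY is 'Lu', so value[0xc5] becomes the lowercase mapping 0xe5;
+ # cclass[0xc5] is '0'; and the unbracketed DECOMPOSITION "0041 030A" is
+ # recorded both as a composition (U+00C5 is not composition-excluded)
+ # and as a decomposition.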
+
+ class LineBreak
+   BREAK_CODE, BREAK_PROPERTY = (0..1).to_a
+
+   def process(data)
+     prev_code = -1
+     path = File.join(data.dir, 'LineBreak.txt')
+     File.process(path) do |line|
+       fields = line.chomp.sub(/\s*#.*/, '').split(/;/, -1)
+       fields.verify_size(2, path, BREAK_CODE)
+
+       if fields[BREAK_CODE] =~ /([0-9A-F]{4,6})\.\.([0-9A-F]{4,6})/
+         start_code, end_code = $1.to_i(16), $2.to_i(16)
+       else
+         start_code = end_code = fields[BREAK_CODE].to_i(16)
+       end
+
+       if start_code > prev_code + 1
+         process_gap(data, prev_code + 1, start_code - 1)
+       end
+
+       start_code.upto(end_code) do |i|
+         data.break_props[i] = fields[BREAK_PROPERTY]
+       end
+
+       prev_code = end_code
+     end
+
+     process_gap(data, prev_code + 1, 0x10ffff)
+   end
+
+   private
+
+   def process_gap(data, low, hi)
+     low.upto(hi) do |i|
+       data.break_props[i] = (data.type[i] == 'Cn') ? 'XX' : 'AL'
+     end
+   end
+ end
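+
+ # LineBreak.txt entries name a single code point or a range, e.g.
+ #
+ #   0030..0039;NU     # DIGIT ZERO..DIGIT NINE
+ #
+ # assigning break property NU to U+0030 through U+0039; gaps default to
+ # 'XX' for unassigned code points and 'AL' for everything else, as above.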
+
+ class SpecialCasing
+   CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = (0..4).to_a
+
+   def initialize
+     @offset = 0
+   end
+
+   def process(data)
+     path = File.join(data.dir, 'SpecialCasing.txt')
+     File.process(path) do |line|
+       fields = line.chomp.sub(/\s*#.*/, '').split(/\s*;\s*/, -1)
+       fields.verify_size((5..6), path, CASE_CODE)
+       raw_code, code = fields[CASE_CODE], fields[CASE_CODE].to_i(16)
+       unless data.type[code].nil?
+         # We ignore conditional special cases
+         next if fields.size == 6
+
+         case data.type[code]
+         when 'Lu'
+           fields.verify_field(CASE_UPPER, code, path, raw_code, 'Lu', 'Upper')
+           add_special_case(data, code, data.value[code],
+                            fields[CASE_LOWER], fields[CASE_TITLE])
+         when 'Lt'
+           fields.verify_field(CASE_TITLE, code, path, raw_code, 'Lt', 'Title')
+           add_special_case(data, code, nil,
+                            fields[CASE_LOWER], fields[CASE_UPPER])
+         when 'Ll'
+           fields.verify_field(CASE_LOWER, code, path, raw_code, 'Ll', 'Lower')
+           add_special_case(data, code, data.value[code],
+                            fields[CASE_UPPER], fields[CASE_TITLE])
+         else
+           error("special case for non-alphabetic code point:\n" +
+                 "  file: %s\n" +
+                 "  type: %s\n" +
+                 "  code point/entry: %s\n",
+                 path, data.type[code], raw_code)
+         end
+       else
+         error("special case for code point which doesn't have a type:\n" +
+               "  file: %s\n" +
+               "  code point/entry: %d\n",
+               path, code)
+       end
+     end
+   end
+
+   private
+
+   def add_special_case(data, code, single, field1, field2)
+     values = [
+       single.nil? ? nil : [single],
+       field1.split(/\s+/).map{ |s| s.to_i(16) },
+       [0],
+       field2.split(/\s+/).map{ |s| s.to_i(16) },
+     ]
+     result = ''
+     values.each{ |value| result += value.pack('U*') unless value.nil? }
+
+     data.special_case_offsets.push(@offset)
+     data.value[code] = 0x1000000 + @offset
+     data.special_cases.push(result.escape)
+     @offset += 1 + result.length
+   end
+ end
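+
+ # A sample unconditional SpecialCasing.txt record (fields are code;
+ # lower; title; upper):
+ #
+ #   FB00; FB00; 0046 0066; 0046 0046; # LATIN SMALL LIGATURE FF
+ #
+ # U+FB00 is 'Ll', so the lowercase field must round-trip to the code
+ # point itself, and the packed record stores the single-character
+ # mapping, the upper form, a NUL, and the title form, with
+ # value[0xfb00] redirected into the special-case table.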
+
+ class CaseFolding
+   FOLDING_CODE, FOLDING_STATUS, FOLDING_MAPPING = (0..2).to_a
+
+   def process(data)
+     path = File.join(data.dir, 'CaseFolding.txt')
+     File.process(path) do |line|
+       fields = line.chomp.sub(/\s*#.*/, '').split(/\s*;\s*/, -1)
+       fields.verify_size(4, path, FOLDING_CODE)
+
+       # skip Simple and Turkic rules
+       next if fields[FOLDING_STATUS] =~ /^[ST]$/
+
+       raw_code, code = fields[FOLDING_CODE], fields[FOLDING_CODE].to_i(16)
+       values = fields[FOLDING_MAPPING].split(/\s+/).map{ |s| s.to_i(16) }
+       if values.size == 1 &&
+          !(!data.value[code].nil? && data.value[code] >= 0x1000000) &&
+          !data.type[code].nil?
+         case data.type[code]
+         when 'Ll'
+           lower = code
+         when 'Lt'
+           lower = data.title_to_lower[code]
+         when 'Lu'
+           lower = data.value[code]
+         else
+           lower = code
+         end
+         next if lower == values[0]
+       end
+
+       string = values.pack('U*')
+       if string.length + 1 > data.casefold_longest
+         data.casefold_longest = string.length + 1
+       end
+       data.casefold.push([code, string.escape])
+     end
+   end
+ end
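+
+ # A CaseFolding.txt record that survives the filters above:
+ #
+ #   00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
+ #
+ # U+00DF full-folds to "ss", which lowercasing alone cannot produce, so
+ # [0xdf, "\\x73\\x73"] lands in data.casefold for the casefold_table.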
+
+ class BidiMirroring
+   def process(data)
+     path = File.join(data.dir, 'BidiMirroring.txt')
+     File.process(path) do |line|
+       fields = line.chomp.sub(/\s*#.*/, '').split(/\s*;\s*/, -1)
+       fields.verify_size(2, path, 0)
+       data.bidimirror.push([fields[0].to_i(16), fields[1].to_i(16)])
+     end
+   end
+ end
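+
+ # BidiMirroring.txt pairs each mirrored character with its counterpart,
+ # e.g. "0028; 0029 # LEFT PARENTHESIS", collected here as [0x28, 0x29]
+ # for the bidi_mirroring_table emitted below.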
+
+ class Printer
+   def initialize
+     @index = 0
+   end
+
+   def process(data)
+     @last_char_part1_i = data.pages_before_e0000 * 256 - 1
+     @last_char_part1_x = sprintf('0x%04x', @last_char_part1_i)
+     @last_char_part1_X = sprintf('%04X', @last_char_part1_i)
+     print_tables(data)
+     print_decomp(data)
+     print_composition_table(data)
+     print_line_break(data)
+   end
+
+   private
+
+   # Map general category code onto symbolic name.
+   Mappings = {
+     # Normative.
+     'Lu' => 'UNICODE_UPPERCASE_LETTER',
+     'Ll' => 'UNICODE_LOWERCASE_LETTER',
+     'Lt' => 'UNICODE_TITLECASE_LETTER',
+     'Mn' => 'UNICODE_NON_SPACING_MARK',
+     'Mc' => 'UNICODE_COMBINING_MARK',
+     'Me' => 'UNICODE_ENCLOSING_MARK',
+     'Nd' => 'UNICODE_DECIMAL_NUMBER',
+     'Nl' => 'UNICODE_LETTER_NUMBER',
+     'No' => 'UNICODE_OTHER_NUMBER',
+     'Zs' => 'UNICODE_SPACE_SEPARATOR',
+     'Zl' => 'UNICODE_LINE_SEPARATOR',
+     'Zp' => 'UNICODE_PARAGRAPH_SEPARATOR',
+     'Cc' => 'UNICODE_CONTROL',
+     'Cf' => 'UNICODE_FORMAT',
+     'Cs' => 'UNICODE_SURROGATE',
+     'Co' => 'UNICODE_PRIVATE_USE',
+     'Cn' => 'UNICODE_UNASSIGNED',
+
+     # Informative.
+     'Lm' => 'UNICODE_MODIFIER_LETTER',
+     'Lo' => 'UNICODE_OTHER_LETTER',
+     'Pc' => 'UNICODE_CONNECT_PUNCTUATION',
+     'Pd' => 'UNICODE_DASH_PUNCTUATION',
+     'Ps' => 'UNICODE_OPEN_PUNCTUATION',
+     'Pe' => 'UNICODE_CLOSE_PUNCTUATION',
+     'Pi' => 'UNICODE_INITIAL_PUNCTUATION',
+     'Pf' => 'UNICODE_FINAL_PUNCTUATION',
+     'Po' => 'UNICODE_OTHER_PUNCTUATION',
+     'Sm' => 'UNICODE_MATH_SYMBOL',
+     'Sc' => 'UNICODE_CURRENCY_SYMBOL',
+     'Sk' => 'UNICODE_MODIFIER_SYMBOL',
+     'So' => 'UNICODE_OTHER_SYMBOL'
+   }
+
+   BreakMappings = {
+     'BK' => 'UNICODE_BREAK_MANDATORY',
+     'CR' => 'UNICODE_BREAK_CARRIAGE_RETURN',
+     'LF' => 'UNICODE_BREAK_LINE_FEED',
+     'CM' => 'UNICODE_BREAK_COMBINING_MARK',
+     'SG' => 'UNICODE_BREAK_SURROGATE',
+     'ZW' => 'UNICODE_BREAK_ZERO_WIDTH_SPACE',
+     'IN' => 'UNICODE_BREAK_INSEPARABLE',
+     'GL' => 'UNICODE_BREAK_NON_BREAKING_GLUE',
+     'CB' => 'UNICODE_BREAK_CONTINGENT',
+     'SP' => 'UNICODE_BREAK_SPACE',
+     'BA' => 'UNICODE_BREAK_AFTER',
+     'BB' => 'UNICODE_BREAK_BEFORE',
+     'B2' => 'UNICODE_BREAK_BEFORE_AND_AFTER',
+     'HY' => 'UNICODE_BREAK_HYPHEN',
+     'NS' => 'UNICODE_BREAK_NON_STARTER',
+     'OP' => 'UNICODE_BREAK_OPEN_PUNCTUATION',
+     'CL' => 'UNICODE_BREAK_CLOSE_PUNCTUATION',
+     'QU' => 'UNICODE_BREAK_QUOTATION',
+     'EX' => 'UNICODE_BREAK_EXCLAMATION',
+     'ID' => 'UNICODE_BREAK_IDEOGRAPHIC',
+     'NU' => 'UNICODE_BREAK_NUMERIC',
+     'IS' => 'UNICODE_BREAK_INFIX_SEPARATOR',
+     'SY' => 'UNICODE_BREAK_SYMBOL',
+     'AL' => 'UNICODE_BREAK_ALPHABETIC',
+     'PR' => 'UNICODE_BREAK_PREFIX',
+     'PO' => 'UNICODE_BREAK_POSTFIX',
+     'SA' => 'UNICODE_BREAK_COMPLEX_CONTEXT',
+     'AI' => 'UNICODE_BREAK_AMBIGUOUS',
+     'NL' => 'UNICODE_BREAK_NEXT_LINE',
+     'WJ' => 'UNICODE_BREAK_WORD_JOINER',
+     'XX' => 'UNICODE_BREAK_UNKNOWN',
+     'JL' => 'UNICODE_BREAK_HANGUL_L_JAMO',
+     'JV' => 'UNICODE_BREAK_HANGUL_V_JAMO',
+     'JT' => 'UNICODE_BREAK_HANGUL_T_JAMO',
+     'H2' => 'UNICODE_BREAK_HANGUL_LV_SYLLABLE',
+     'H3' => 'UNICODE_BREAK_HANGUL_LVT_SYLLABLE'
+   }
+
+   NOT_PRESENT_OFFSET = 65535
+
+   def print_table(data, low, mid, hi, size, header, part1_h, part2_h, &f)
+     @index = 0
+     rows = []
+     print(header)
+     low.step(hi, 256) do |i|
+       rows[i / 256] = print_row(data, i, size){ |c| f.call(c) }
+     end
+     print("\n};\n")
+     print(part1_h)
+     low.step(mid, 256) do |i|
+       printf("%s%s,\n", data.indent, rows[i / 256])
+     end
+     print("};\n")
+     if mid != hi
+       print(part2_h)
+       0xe0000.step(hi, 256) do |i|
+         printf("%s%s,\n", data.indent, rows[i / 256])
+       end
+       print("};\n")
+     end
+   end
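+
+   # The tables printed above are two-level: part1/part2 index arrays map
+   # each 256-code-point page either to a row of the data array or, when
+   # an entire page shares one value, to that value offset by
+   # UNICODE_MAX_TABLE_INDEX. A consumer's lookup thus reads roughly as
+   # in this sketch (the generated C headers encode the same logic):
+   #
+   #   def lookup(part1, part2, rows, ch)
+   #     page = ch <= LAST_CHAR_PART1 ? part1[ch >> 8]
+   #                                  : part2[(ch - 0xe0000) >> 8]
+   #     page >= MAX_TABLE_INDEX ? page - MAX_TABLE_INDEX
+   #                             : rows[page][ch & 0xff]
+   #   end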
+
+   def print_tables(data, outfile = 'character-tables.h')
+     row = []
+     saved_stdout = $stdout
+     File.open(outfile, 'w') do |file|
+       header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
+       $stdout = file
+       print <<EOF
+ /* Automatically generated file */
+
+ #ifndef #{header_h}
+ #define #{header_h}
+
+ #define UNICODE_DATA_VERSION "#{UnicodeVersion}"
+
+ #define UNICODE_LAST_CHAR #{sprintf('0x%04x', data.last)}
+
+ #define UNICODE_MAX_TABLE_INDEX 10000
+
+ #define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}
+
+ #define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}
+
+ #define UNICODE_FIRST_CHAR_PART2 0xe0000
+
+ #define UNICODE_SPECIAL_CASE_TABLE_START 0x1000000
+ EOF
+       print_table(data, 0, @last_char_part1_i, data.last, 1,
+                   <<EOH, <<EOH1, <<EOH2){ |i| Mappings[data.type[i]] }
+
+
+ static const char type_data[][256] = {
+ EOH
+
+
+ /* U+0000 through U+#{@last_char_part1_X} */
+ static const int16_t type_table_part1[#{data.pages_before_e0000}] = {
+ EOH1
+
+
+ /* U+E0000 through U+#{sprintf('%04X', data.last)} */
+ static const int16_t type_table_part2[768] = {
+ EOH2
+
+       print_table(data, 0, @last_char_part1_i, data.last, 4,
+                   <<EOH, <<EOH1, <<EOH2) { |i| data.value[i].nil? ? '0x0000' : sprintf('0x%04x', data.value[i]) }
+
+
+ static const unichar attr_data[][256] = {
+ EOH
+
+
+ /* U+0000 through U+#{@last_char_part1_X} */
+ static const int16_t attr_table_part1[#{data.pages_before_e0000}] = {
+ EOH1
+
+
+ /* U+E0000 through U+#{sprintf('%04X', data.last)} */
+ static const int16_t attr_table_part2[768] = {
+ EOH2
+
+       print <<EOF
+
+
+ static const unichar title_table[][3] = {
+ EOF
+       data.title_to_lower.keys.sort.each do |code|
+         printf("%s{ 0x%04x, 0x%04x, 0x%04x },\n", data.indent,
+                code, data.title_to_upper[code], data.title_to_lower[code])
+       end
+       print("};\n")
+
+       print_special_case_table(data)
+       print_case_fold_table(data)
+
+       print <<EOF
+ static const struct {
+ #{data.indent}unichar ch;
+ #{data.indent}unichar mirrored_ch;
+ } bidi_mirroring_table[] = {
+ EOF
+       data.bidimirror.each do |item|
+         printf("%s{ 0x%04x, 0x%04x },\n", data.indent, item[0], item[1])
+       end
+       print <<EOF
+ };
+
+ #endif /* #{header_h} */
+ EOF
+     end
+     $stdout = saved_stdout
+   end
+
+   def print_row(data, start, type_size)
+     flag = true
+     values = []
+     0.upto(255) do |i|
+       values[i] = yield(start + i)
+       flag = false if values[i] != values[0]
+     end
+     return values[0] + " + UNICODE_MAX_TABLE_INDEX" if flag
+
+     puts(',') if @index != 0
+     printf("%s{ /* page %d, index %d */\n%s",
+            data.indent, start / 256, @index, data.indent * 2)
+     column = data.indent.width * 2
+     start.upto(start + 255) do |i|
+       text = values[i - start]
+       if text.length + column + 2 > 79
+         printf("\n%s", data.indent * 2)
+         column = data.indent.width * 2
+       end
+
+       printf("%s, ", text)
+       column += text.width + 2
+     end
+
+     print("\n#{data.indent}}")
+     @index += 1
+     return sprintf("%d /* page %d */", @index - 1, start / 256)
+   end
+
+   def print_special_case_table(data)
+     print <<EOF
+
+
+ /*
+  * Table of special cases for case conversion; each record contains,
+  * first, the best single-character mapping to lowercase if Lu, or to
+  * uppercase if Ll, followed by the output mappings for the two cases
+  * other than that of the code point itself, in the order Ll, Lu, Lt,
+  * encoded in UTF-8, separated and terminated by a NUL character.
+  */
+ static const char special_case_table[] = {
+ EOF
+     data.special_cases.each_with_index do |sc, i|
+       printf(%Q< "%s\\0" /* offset %d */\n>, sc, data.special_case_offsets[i])
+     end
+     print <<EOF
+ };
+
+ EOF
+   end
+
+   def print_case_fold_table(data)
+     print <<EOF
+
+ /*
+  * Table of casefolding cases that can't be derived by lowercasing.
+  */
+ static const struct {
+ #{data.indent}uint16_t ch;
+ #{data.indent}char data[#{data.casefold_longest}];
+ } casefold_table[] = {
+ EOF
+     data.casefold.sort_by{ |a| a[0] }.each do |cf|
+       if cf[0] > 0xffff
+         error('casefold_table.ch field too short.' +
+               ' Upgrade to unichar to fit values beyond 0xffff.')
+       end
+       printf(%Q<%s{ 0x%04x, "%s" },\n>, data.indent, cf[0], cf[1])
+     end
+     print <<EOF
+ };
+ EOF
+   end
+
+   def print_decomp(data, outfile = 'decompose.h')
+     row = []
+     saved_stdout = $stdout
+     File.open(outfile, 'w') do |file|
+       header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
+       $stdout = file
+       print <<EOF
+ /* Automatically generated file */
+
+ #ifndef #{header_h}
+ #define #{header_h}
+
+
+ #define UNICODE_LAST_CHAR #{sprintf('0x%04x', data.last)}
+
+ #define UNICODE_MAX_TABLE_INDEX (0x110000 / 256)
+
+ #define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}
+
+ #define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}
+
+ #define UNICODE_FIRST_CHAR_PART2 0xe0000
+
+ #define UNICODE_NOT_PRESENT_OFFSET #{NOT_PRESENT_OFFSET}
+ EOF
+       print_table(data, 0, @last_char_part1_i, data.last, 1,
+                   <<EOH, <<EOH1, <<EOH2){ |i| data.cclass[i] }
+
+
+ static const uint8_t cclass_data[][256] = {
+ EOH
+
+
+ static const int16_t combining_class_table_part1[#{data.pages_before_e0000}] = {
+ EOH1
+
+
+ static const int16_t combining_class_table_part2[768] = {
+ EOH2
+
+       print <<EOL
+
+
+ static const struct {
+ #{data.indent}unichar ch;
+ #{data.indent}uint16_t canon_offset;
+ #{data.indent}uint16_t compat_offset;
+ } decomp_table[] = {
+ EOL
+       decomp_offsets = {}
+       decomp_string = ''
+       @decomp_string_offset = 0
+       0.upto(data.last) do |i|
+         unless data.decompositions[i].nil?
+           canon_decomp = data.decompose_compat[i] ?
+             nil : make_decomp(data, i, false)
+           compat_decomp = make_decomp(data, i, true)
+           if not canon_decomp.nil? and compat_decomp == canon_decomp
+             compat_decomp = nil
+           end
+           canon_offset = handle_decomp(canon_decomp, decomp_offsets,
+                                        decomp_string)
+           compat_offset = handle_decomp(compat_decomp, decomp_offsets,
+                                         decomp_string)
+
+           if @decomp_string_offset > NOT_PRESENT_OFFSET
+             error('decomposition string offset beyond not-present-offset,' +
+                   " upgrade value:\n" +
+                   "  offset: %d\n" +
+                   "  max: %d\n",
+                   @decomp_string_offset, NOT_PRESENT_OFFSET)
+           end
+           printf("%s{ 0x%04x, %s, %s },\n",
+                  data.indent, i, canon_offset, compat_offset)
+         end
+       end
+       print("\n};")
+
+       print <<EOL
+
+ static const char decomp_expansion_string[] = #{decomp_string};
+
+
+ #endif /* #{header_h} */
+ EOL
+     end
+     $stdout = saved_stdout
+   end
+
+   def expand_decomp(data, code, compat)
+     ary = []
+     data.decompositions[code].split(/ /).each do |item|
+       pos = item.to_i(16)
+       if not data.decompositions[pos].nil? and
+          (compat or not data.decompose_compat[pos])
+         ary.concat(expand_decomp(data, pos, compat))
+       else
+         ary.push(pos)
+       end
+     end
+     ary
+   end
+
+   def make_decomp(data, code, compat)
+     str = ''
+     expand_decomp(data, code, compat).each do |item|
+       str += item.is_a?(Array) ? item.flatten.pack('U') : [item].pack('U')
+     end
+     str
+   end
+
+   def handle_decomp(decomp, decomp_offsets, decomp_string)
+     offset = 'UNICODE_NOT_PRESENT_OFFSET'
+     unless decomp.nil?
+       if decomp_offsets.member?(decomp)
+         offset = decomp_offsets[decomp]
+       else
+         offset = @decomp_string_offset
+         decomp_offsets[decomp] = offset
+         decomp_string << ("\n  \"" + decomp.escape +
+                           "\\0\" /* offset #{offset} */")
+         @decomp_string_offset += decomp.length + 1
+       end
+     end
+     offset
+   end
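+
+   # handle_decomp interns each distinct expansion once: the first code
+   # point that decomposes to a given UTF-8 string pays for its bytes in
+   # decomp_expansion_string, and every later code point with the same
+   # expansion reuses the recorded offset instead of growing the string.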
+
+   def print_composition_table(data, outfile = 'compose.h')
+     first = Hash.new(0)
+     second = Hash.new(0)
+
+     data.compositions.each do |code, value|
+       values = value.split(/\s+/).map{ |s| s.to_i(16) }
+
+       # skip non-starters and single-character decompositions
+       if data.cclass[values[0]] != '0' or values.size == 1
+         data.compositions.delete(code)
+         next
+       end
+
+       if values.size != 2
+         error("decomposition of entry contains more than two elements:\n" +
+               "  entry: %d\n" +
+               "  elements: %d\n",
+               code, values.size)
+       end
+
+       first[values[0]] += 1
+     end
+
+     n_first = first.enumerate_ordered
+
+     data.compositions.each do |code, value|
+       values = value.split(/\s+/).map{ |s| s.to_i(16) }
+
+       second[values[1]] += 1 if first.member?(values[0])
+     end
+
+     n_second = second.enumerate_ordered
+
+     first_singletons = []
+     second_singletons = []
+     reverse = {}
+     data.compositions.each do |code, value|
+       values = value.split(/\s+/).map{ |s| s.to_i(16) }
+
+       if first.member?(values[0]) and second.member?(values[1])
+         reverse["#{first[values[0]]}|#{second[values[1]]}"] = code
+       elsif not first.member?(values[0])
+         first_singletons.push([values[0], values[1], code])
+       else
+         second_singletons.push([values[1], values[0], code])
+       end
+     end
+
+     first_singletons = first_singletons.sort_by{ |a| a[0] }
+     second_singletons = second_singletons.sort_by{ |a| a[0] }
+
+     row = []
+     saved_stdout = $stdout
+     File.open(outfile, 'w') do |file|
+       header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
+       $stdout = file
+       values = {}
+       total = first_start = 1
+       last = 0
+
+       first.each do |code, value|
+         values[code] = value + total
+         last = code if code > last
+       end
+       total += n_first
+
+       first_single_start = total
+       first_singletons.each_with_index do |item, i|
+         code = item[0]
+         values[code] = i + total
+         last = code if code > last
+       end
+       total += first_singletons.size
+
+       second_start = total
+       second.each do |code, value|
+         values[code] = value + total
+         last = code if code > last
+       end
+       total += n_second
+
+       second_single_start = total
+       second_singletons.each_with_index do |item, i|
+         code = item[0]
+         values[code] = i + total
+         last = code if code > last
+       end
+
+       print <<EOL
+ /* Automatically generated file */
+
+ #ifndef #{header_h}
+ #define #{header_h}
+
+
+ #define COMPOSE_FIRST_START #{first_start}
+ #define COMPOSE_FIRST_SINGLE_START #{first_single_start}
+ #define COMPOSE_SECOND_START #{second_start}
+ #define COMPOSE_SECOND_SINGLE_START #{second_single_start}
+ #define COMPOSE_TABLE_LAST #{last / 256}
+ EOL
+
+       print_table(data, 0, last, last, 2,
+                   <<EOH, <<EOH1, nil){ |i| values.member?(i) ? values[i].to_s : '0' }
+
+
+ static const uint16_t compose_data[][256] = {
+ EOH
+
+
+ static const int16_t compose_table[COMPOSE_TABLE_LAST + 1] = {
+ EOH1
+
+       print <<EOL
+
+
+ static const uint16_t compose_first_single[][2] = {
+ EOL
+       first_singletons.each_with_index do |item, i|
+         if item[1] > 0xffff or item[2] > 0xffff
+           error("compose_first_single table field too short." +
+                 " Upgrade to unichar to fit values beyond 0xffff.")
+         end
+         printf("%s{ %#06x, %#06x },\n", data.indent, item[1], item[2])
+       end
+       print("};\n")
+
+       print <<EOL
+
+
+ static const uint16_t compose_second_single[][2] = {
+ EOL
+       second_singletons.each_with_index do |item, i|
+         if item[1] > 0xffff or item[2] > 0xffff
+           error("compose_second_single table field too short." +
+                 " Upgrade to unichar to fit values beyond 0xffff.")
+         end
+         printf("%s{ %#06x, %#06x },\n", data.indent, item[1], item[2])
+       end
+       print("};\n")
+
+       print <<EOL
+
+
+ static const uint16_t compose_array[#{n_first}][#{n_second}] = {
+ EOL
+       0.upto(n_first - 1) do |i|
+         printf("%s{\n%s", data.indent, data.indent * 2)
+         column = data.indent.width * 2
+         0.upto(n_second - 1) do |j|
+           if column + 8 > 79
+             printf("\n%s", data.indent * 2)
+             column = data.indent.width * 2
+           end
+           if reverse.member?("#{i}|#{j}")
+             if reverse["#{i}|#{j}"] > 0xffff
+               error("compose_array table field too short." +
+                     " Upgrade to unichar to fit values beyond 0xffff.")
+             end
+             printf("0x%04x, ", reverse["#{i}|#{j}"])
+           else
+             print("     0, ")
+           end
+           column += 8
+         end
+         printf("\n%s},\n", data.indent)
+       end
+       print("};\n")
+
+       print <<EOL
+
+
+ #endif /* #{header_h} */
+ EOL
+     end
+     $stdout = saved_stdout
+   end
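+
+   # Lookup in the generated compose.h then goes roughly: map both
+   # characters through compose_data/compose_table; an index f with
+   # COMPOSE_FIRST_START <= f < COMPOSE_FIRST_SINGLE_START and an index s
+   # with COMPOSE_SECOND_START <= s < COMPOSE_SECOND_SINGLE_START select
+   # compose_array[f - COMPOSE_FIRST_START][s - COMPOSE_SECOND_START],
+   # while characters that compose with only a single partner are
+   # resolved through compose_first_single/compose_second_single.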
+
+   def print_line_break(data, outfile = 'break.h')
+     row = []
+     saved_stdout = $stdout
+     File.open(outfile, 'w') do |file|
+       header_h = outfile.upcase.gsub(/[^A-Z0-9]/, '_')
+       $stdout = file
+       print <<EOF
+ /* Automatically generated file */
+
+ #ifndef #{header_h}
+ #define #{header_h}
+
+ #define UNICODE_DATA_VERSION "#{UnicodeVersion}"
+
+ #define UNICODE_LAST_CHAR #{sprintf('0x%04x', data.last)}
+
+ #define UNICODE_MAX_TABLE_INDEX 10000
+
+ /*
+  * The last code point that should be looked up in break_property_table_part1.
+  */
+ #define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}
+
+ /*
+  * The first code point that should be looked up in break_property_table_part2.
+  */
+ #define UNICODE_FIRST_CHAR_PART2 0xe0000
+ EOF
+       print_table(data, 0, @last_char_part1_i, data.last, 1,
+                   <<EOH, <<EOH1, <<EOH2){ |i| BreakMappings[data.break_props[i]] }
+
+
+ static const int8_t break_property_data[][256] = {
+ EOH
+
+
+ /* U+0000 through U+#{@last_char_part1_X} */
+ static const int16_t break_property_table_part1[#{data.pages_before_e0000}] = {
+ EOH1
+
+
+ /* U+E0000 through U+#{sprintf('%04X', data.last)} */
+ static const int16_t break_property_table_part2[768] = {
+ EOH2
+
+       print <<EOF
+
+
+ #endif /* #{header_h} */
+ EOF
+     end
+     $stdout = saved_stdout
+   end
+ end
+
+ UnicodeVersion = ARGV[0]
+
+ class Runner
+   def main
+     check_for_data_files(ARGV[1])
+     data = CollectedData.new(ARGV[1], "\t")
+     [CompositionExclusions, UnicodeData, LineBreak,
+      SpecialCasing, CaseFolding, BidiMirroring, Printer].each do |klass|
+       klass.new.process(data)
+     end
+   end
+
+   private
+
+   def check_for_data_files(dir)
+     ['UnicodeData.txt', 'LineBreak.txt', 'SpecialCasing.txt', 'CaseFolding.txt',
+      'CompositionExclusions.txt', 'BidiMirroring.txt'].each do |file|
+       path = File.join(dir, file)
+       unless File.readable?(path)
+         error('missing required file: %s', path)
+       end
+     end
+   end
+ end
+
+ Runner.new.main
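+
+ # Typical invocation, with a placeholder Unicode version and UCD
+ # directory:
+ #
+ #   ruby generate-unicode-data.rb 4.1.0 /path/to/ucd
+ #
+ # This writes character-tables.h, decompose.h, compose.h, and break.h
+ # to the current working directory.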
+
+ # vim: set sts=2 sw=2: