ucode 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ucode/code_chart/extractor.rb +1 -9
- data/lib/ucode/code_chart/writer.rb +1 -1
- data/lib/ucode/commands/canonical_build.rb +4 -4
- data/lib/ucode/commands/universal_set.rb +5 -3
- data/lib/ucode/coordinator/enrichment/bidi.rb +35 -0
- data/lib/ucode/coordinator/enrichment/binary.rb +38 -0
- data/lib/ucode/coordinator/enrichment/casing.rb +55 -0
- data/lib/ucode/coordinator/enrichment/cjk.rb +49 -0
- data/lib/ucode/coordinator/enrichment/display.rb +36 -0
- data/lib/ucode/coordinator/enrichment/emoji.rb +36 -0
- data/lib/ucode/coordinator/enrichment/identity.rb +42 -0
- data/lib/ucode/coordinator/enrichment/indic.rb +32 -0
- data/lib/ucode/coordinator/enrichment/names.rb +63 -0
- data/lib/ucode/coordinator/enrichment/segmentation.rb +34 -0
- data/lib/ucode/coordinator/enrichment.rb +51 -0
- data/lib/ucode/coordinator/range_lookup.rb +65 -0
- data/lib/ucode/coordinator.rb +4 -276
- data/lib/ucode/glyphs/embedded_fonts/catalog.rb +32 -376
- data/lib/ucode/glyphs/embedded_fonts/codepoint_mapper.rb +130 -0
- data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +25 -124
- data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +0 -1
- data/lib/ucode/glyphs/embedded_fonts/pdf_indexer.rb +236 -0
- data/lib/ucode/glyphs/embedded_fonts/{source.rb → pdf_location.rb} +5 -5
- data/lib/ucode/glyphs/embedded_fonts/positional_matcher.rb +162 -0
- data/lib/ucode/glyphs/embedded_fonts/raw_font_descriptor.rb +24 -0
- data/lib/ucode/glyphs/embedded_fonts/renderer.rb +0 -2
- data/lib/ucode/glyphs/embedded_fonts/trace_correlator.rb +54 -168
- data/lib/ucode/glyphs/embedded_fonts/writer.rb +0 -4
- data/lib/ucode/glyphs/embedded_fonts.rb +5 -1
- data/lib/ucode/glyphs/resolver_factory.rb +45 -0
- data/lib/ucode/glyphs/sources/pillar1_embedded_tounicode.rb +1 -1
- data/lib/ucode/glyphs.rb +1 -0
- data/lib/ucode/version.rb +1 -1
- metadata +20 -3
data/lib/ucode/coordinator.rb
CHANGED
|
@@ -27,6 +27,8 @@ module Ucode
|
|
|
27
27
|
# subsets.
|
|
28
28
|
class Coordinator
|
|
29
29
|
autoload :Indices, "ucode/coordinator/indices"
|
|
30
|
+
autoload :RangeLookup, "ucode/coordinator/range_lookup"
|
|
31
|
+
autoload :Enrichment, "ucode/coordinator/enrichment"
|
|
30
32
|
|
|
31
33
|
ISO_SCRIPT_PROPERTY = "sc".freeze
|
|
32
34
|
private_constant :ISO_SCRIPT_PROPERTY
|
|
@@ -249,282 +251,8 @@ module Ucode
|
|
|
249
251
|
|
|
250
252
|
def enrich(cp, indices)
|
|
251
253
|
cp.plane_number = cp.cp >> 16
|
|
252
|
-
cp.block_id = find_in_range(cp.cp, indices.blocks)&.id
|
|
253
|
-
|
|
254
|
-
assign_script_extensions(cp, indices)
|
|
255
|
-
assign_age(cp, indices)
|
|
256
|
-
assign_bidi(cp, indices)
|
|
257
|
-
assign_casing(cp, indices)
|
|
258
|
-
assign_case_folding(cp, indices)
|
|
259
|
-
assign_binary_properties(cp, indices)
|
|
260
|
-
assign_names_list(cp, indices)
|
|
261
|
-
assign_name_aliases(cp, indices)
|
|
262
|
-
assign_standardized_variants(cp, indices)
|
|
263
|
-
assign_unihan(cp, indices)
|
|
264
|
-
assign_cjk_radical(cp, indices)
|
|
265
|
-
assign_display(cp, indices)
|
|
266
|
-
assign_break_segmentation(cp, indices)
|
|
267
|
-
assign_indic(cp, indices)
|
|
268
|
-
assign_hangul(cp, indices)
|
|
269
|
-
assign_emoji(cp, indices)
|
|
270
|
-
assign_extra_binary_properties(cp, indices)
|
|
271
|
-
end
|
|
272
|
-
|
|
273
|
-
def assign_script(cp, indices)
|
|
274
|
-
script = find_in_range(cp.cp, indices.scripts)
|
|
275
|
-
return unless script
|
|
276
|
-
|
|
277
|
-
cp.script_code = script.code || script.name
|
|
278
|
-
end
|
|
279
|
-
|
|
280
|
-
def assign_script_extensions(cp, indices)
|
|
281
|
-
tuples = indices.script_extensions[cp.cp]
|
|
282
|
-
return unless tuples && !tuples.empty?
|
|
283
|
-
|
|
284
|
-
tuples.each { |tuple| cp.script_extensions << tuple.script_code }
|
|
285
|
-
end
|
|
286
|
-
|
|
287
|
-
def assign_age(cp, indices)
|
|
288
|
-
record = indices.derived_age[cp.cp]
|
|
289
|
-
return unless record
|
|
290
|
-
|
|
291
|
-
cp.age = record.age
|
|
292
|
-
end
|
|
293
|
-
|
|
294
|
-
def assign_bidi(cp, indices)
|
|
295
|
-
mirroring = indices.bidi_mirroring[cp.cp]
|
|
296
|
-
brackets = indices.bidi_brackets[cp.cp]
|
|
297
|
-
return unless mirroring || brackets
|
|
298
|
-
|
|
299
|
-
cp.bidi ||= Models::CodePoint::Bidi.new
|
|
300
|
-
if mirroring
|
|
301
|
-
cp.bidi.mirroring_glyph_id = mirroring.mirrored_id
|
|
302
|
-
end
|
|
303
|
-
if brackets
|
|
304
|
-
cp.bidi.paired_bracket_type = brackets.type
|
|
305
|
-
cp.bidi.paired_bracket_id = brackets.paired_id
|
|
306
|
-
end
|
|
307
|
-
end
|
|
308
|
-
|
|
309
|
-
def assign_casing(cp, indices)
|
|
310
|
-
rules = indices.special_casing[cp.cp]
|
|
311
|
-
return unless rules && !rules.empty?
|
|
312
|
-
|
|
313
|
-
# NOTE: do not uniq the *_ids arrays — a mapping like U+00DF → "SS"
|
|
314
|
-
# legitimately contains two U+0053 entries and they must be
|
|
315
|
-
# preserved in order. Conditions, by contrast, are categorical
|
|
316
|
-
# tags (Final_Sigma, tr, After_I) and deduping them is correct.
|
|
317
|
-
cp.casing ||= Models::CodePoint::Casing.new
|
|
318
|
-
cp.casing.full_upper_ids = rules.flat_map(&:upper_ids)
|
|
319
|
-
cp.casing.full_lower_ids = rules.flat_map(&:lower_ids)
|
|
320
|
-
cp.casing.full_title_ids = rules.flat_map(&:title_ids)
|
|
321
|
-
cp.casing.conditions = rules.flat_map(&:conditions).uniq
|
|
322
|
-
end
|
|
323
|
-
|
|
324
|
-
def assign_case_folding(cp, indices)
|
|
325
|
-
rules = indices.case_folding[cp.cp]
|
|
326
|
-
return unless rules && !rules.empty?
|
|
327
|
-
|
|
328
|
-
cp.case_folding ||= Models::CodePoint::CaseFolding.new
|
|
329
|
-
rules.each do |rule|
|
|
330
|
-
case rule.status
|
|
331
|
-
when "C" then cp.case_folding.common_id = rule.mapping_ids.first
|
|
332
|
-
when "S" then cp.case_folding.simple_id = rule.mapping_ids.first
|
|
333
|
-
when "T" then cp.case_folding.turkic_id = rule.mapping_ids.first
|
|
334
|
-
when "F" then cp.case_folding.full_ids = rule.mapping_ids
|
|
335
|
-
end
|
|
336
|
-
end
|
|
337
|
-
end
|
|
338
|
-
|
|
339
|
-
def assign_binary_properties(cp, indices)
|
|
340
|
-
records = indices.binary_properties[cp.cp]
|
|
341
|
-
return unless records && !records.empty?
|
|
342
|
-
|
|
343
|
-
cp.binary_properties = records.map(&:property_short)
|
|
344
|
-
end
|
|
345
|
-
|
|
346
|
-
def assign_names_list(cp, indices)
|
|
347
|
-
entry = indices.names_list[cp.cp]
|
|
348
|
-
return unless entry
|
|
349
|
-
|
|
350
|
-
cp.names_list = entry
|
|
351
|
-
cp.relationships.concat(entry.cross_references)
|
|
352
|
-
cp.relationships.concat(entry.sample_sequences)
|
|
353
|
-
cp.relationships.concat(entry.compatibility_equivalents)
|
|
354
|
-
cp.relationships.concat(entry.informal_aliases)
|
|
355
|
-
cp.relationships.concat(entry.footnotes)
|
|
356
|
-
end
|
|
357
|
-
|
|
358
|
-
def assign_name_aliases(cp, indices)
|
|
359
|
-
aliases = indices.name_aliases[cp.cp]
|
|
360
|
-
return unless aliases && !aliases.empty?
|
|
361
|
-
|
|
362
|
-
aliases.each do |alias_record|
|
|
363
|
-
cp.relationships << Models::Relationship::InformalAlias.new(
|
|
364
|
-
description: alias_record.text,
|
|
365
|
-
source: "name_aliases"
|
|
366
|
-
)
|
|
367
|
-
end
|
|
368
|
-
end
|
|
369
|
-
|
|
370
|
-
def assign_standardized_variants(cp, indices)
|
|
371
|
-
variants = indices.standardized_variants[cp.id]
|
|
372
|
-
return unless variants && !variants.empty?
|
|
373
|
-
|
|
374
|
-
cp.standardized_variants = variants
|
|
375
|
-
variants.each do |variant|
|
|
376
|
-
cp.relationships << Models::Relationship::VariationSequence.new(
|
|
377
|
-
target_ids: [variant.base_id, variant.variation_selector_id],
|
|
378
|
-
description: variant.description,
|
|
379
|
-
contexts: variant.contexts,
|
|
380
|
-
source: "standardized_variants"
|
|
381
|
-
)
|
|
382
|
-
end
|
|
383
|
-
end
|
|
384
|
-
|
|
385
|
-
def assign_unihan(cp, indices)
|
|
386
|
-
entry = indices.unihan[cp.cp]
|
|
387
|
-
return unless entry
|
|
388
|
-
|
|
389
|
-
cp.unihan = entry
|
|
390
|
-
end
|
|
391
|
-
|
|
392
|
-
def assign_cjk_radical(cp, indices)
|
|
393
|
-
radicals = indices.cjk_radicals[cp.id]
|
|
394
|
-
return unless radicals && !radicals.empty?
|
|
395
|
-
|
|
396
|
-
radicals.each do |radical|
|
|
397
|
-
cp.relationships << Models::Relationship::CrossReference.new(
|
|
398
|
-
target_ids: [radical.cjk_radical_id],
|
|
399
|
-
description: "KangXi radical ##{radical.radical_number}",
|
|
400
|
-
source: "cjk_radicals"
|
|
401
|
-
)
|
|
402
|
-
end
|
|
403
|
-
end
|
|
404
|
-
|
|
405
|
-
# Display: East Asian Width, Line Break Class, Vertical Orientation.
|
|
406
|
-
# All three are range+value files, looked up via bsearch on sorted
|
|
407
|
-
# arrays of ExtractedProperties::Tuple.
|
|
408
|
-
def assign_display(cp, indices)
|
|
409
|
-
tuple = find_in_range(cp.cp, indices.line_break)
|
|
410
|
-
lb = tuple&.value
|
|
411
|
-
tuple = find_in_range(cp.cp, indices.east_asian_width)
|
|
412
|
-
eaw = tuple&.value
|
|
413
|
-
tuple = find_in_range(cp.cp, indices.vertical_orientation)
|
|
414
|
-
vo = tuple&.value
|
|
415
|
-
return if lb.nil? && eaw.nil? && vo.nil?
|
|
416
|
-
|
|
417
|
-
cp.display ||= Models::CodePoint::Display.new
|
|
418
|
-
cp.display.line_break_class = lb if lb
|
|
419
|
-
cp.display.east_asian_width = eaw if eaw
|
|
420
|
-
cp.display.vertical_orientation = vo if vo
|
|
421
|
-
end
|
|
422
|
-
|
|
423
|
-
# UAX #29 segmentation: Grapheme / Word / Sentence break class.
|
|
424
|
-
def assign_break_segmentation(cp, indices)
|
|
425
|
-
grapheme = find_in_range(cp.cp, indices.grapheme_break)&.value
|
|
426
|
-
word = find_in_range(cp.cp, indices.word_break)&.value
|
|
427
|
-
sentence = find_in_range(cp.cp, indices.sentence_break)&.value
|
|
428
|
-
return if grapheme.nil? && word.nil? && sentence.nil?
|
|
429
|
-
|
|
430
|
-
cp.break_segmentation ||= Models::CodePoint::BreakSegmentation.new
|
|
431
|
-
cp.break_segmentation.grapheme = grapheme if grapheme
|
|
432
|
-
cp.break_segmentation.word = word if word
|
|
433
|
-
cp.break_segmentation.sentence = sentence if sentence
|
|
434
|
-
end
|
|
435
|
-
|
|
436
|
-
def assign_indic(cp, indices)
|
|
437
|
-
positional = find_in_range(cp.cp, indices.indic_positional)&.value
|
|
438
|
-
syllabic = find_in_range(cp.cp, indices.indic_syllabic)&.value
|
|
439
|
-
return if positional.nil? && syllabic.nil?
|
|
440
|
-
|
|
441
|
-
cp.indic ||= Models::CodePoint::Indic.new
|
|
442
|
-
cp.indic.positional_category = positional if positional
|
|
443
|
-
cp.indic.syllabic_category = syllabic if syllabic
|
|
444
|
-
end
|
|
445
|
-
|
|
446
|
-
def assign_hangul(cp, indices)
|
|
447
|
-
tuple = find_in_range(cp.cp, indices.hangul_syllable_type)
|
|
448
|
-
return unless tuple
|
|
449
|
-
|
|
450
|
-
cp.hangul ||= Models::CodePoint::HangulSyllable.new
|
|
451
|
-
cp.hangul.type = tuple.value
|
|
452
|
-
end
|
|
453
|
-
|
|
454
|
-
# Emoji property bundle. Each Emoji_* property from emoji-data.txt
|
|
455
|
-
# flips the matching boolean on the Emoji sub-model.
|
|
456
|
-
def assign_emoji(cp, indices)
|
|
457
|
-
return unless find_in_range(cp.cp, indices.emoji_properties)
|
|
458
|
-
|
|
459
|
-
props = all_range_values(cp.cp, indices.emoji_properties)
|
|
460
|
-
return if props.empty?
|
|
461
|
-
|
|
462
|
-
cp.emoji ||= Models::CodePoint::Emoji.new
|
|
463
|
-
props.each do |prop|
|
|
464
|
-
case prop
|
|
465
|
-
when "Emoji" then cp.emoji.is_emoji = true
|
|
466
|
-
when "Emoji_Presentation" then cp.emoji.is_presentation_default = true
|
|
467
|
-
when "Emoji_Modifier" then cp.emoji.is_modifier = true
|
|
468
|
-
when "Emoji_Modifier_Base" then cp.emoji.is_base = true
|
|
469
|
-
when "Emoji_Component" then cp.emoji.is_component = true
|
|
470
|
-
when "Extended_Pictographic" then cp.emoji.is_extended_pictographic = true
|
|
471
|
-
end
|
|
472
|
-
end
|
|
473
|
-
end
|
|
474
|
-
|
|
475
|
-
# PropList.txt carries binary properties beyond what's in
|
|
476
|
-
# DerivedCoreProperties (White_Space, Hyphen, Variation_Selector,
|
|
477
|
-
# etc.). Merge into the same binary_properties list.
|
|
478
|
-
def assign_extra_binary_properties(cp, indices)
|
|
479
|
-
extras = all_range_values(cp.cp, indices.extra_binary_properties)
|
|
480
|
-
return if extras.empty?
|
|
481
|
-
|
|
482
|
-
cp.binary_properties.concat(extras)
|
|
483
|
-
cp.binary_properties.uniq!
|
|
484
|
-
end
|
|
485
|
-
|
|
486
|
-
# Returns every value whose range contains `cp` in a sorted tuple
|
|
487
|
-
# array. Most codepoint+property pairs match at most one range, but
|
|
488
|
-
# a codepoint can carry multiple binary properties from PropList or
|
|
489
|
-
# emoji-data, so we collect them all.
|
|
490
|
-
def all_range_values(cp, sorted_ranges)
|
|
491
|
-
return [] if sorted_ranges.nil? || sorted_ranges.empty?
|
|
492
|
-
|
|
493
|
-
values = []
|
|
494
|
-
sorted_ranges.each do |record|
|
|
495
|
-
next if cp < record.range_first
|
|
496
|
-
break if cp > record.range_last && record.range_first > cp
|
|
497
|
-
|
|
498
|
-
if cp >= record.range_first && cp <= record.range_last
|
|
499
|
-
values << record.value
|
|
500
|
-
end
|
|
501
|
-
end
|
|
502
|
-
values
|
|
503
|
-
end
|
|
504
|
-
|
|
505
|
-
# ---- Range lookup (bsearch) ----------------------------------------
|
|
506
|
-
|
|
507
|
-
# Finds the range-containing record in a sorted array via bsearch.
|
|
508
|
-
# Records respond to `range_first` and `range_last`.
|
|
509
|
-
#
|
|
510
|
-
# bsearch_index integer-mode convention: return -1 to search LEFT,
|
|
511
|
-
# +1 to search RIGHT, 0 for a match. `cp < range_first` means the
|
|
512
|
-
# target range lies in earlier (lower-indexed) records, so we return
|
|
513
|
-
# -1; `cp > range_last` means it lies in later records, so we return
|
|
514
|
-
# +1.
|
|
515
|
-
def find_in_range(cp, sorted_ranges)
|
|
516
|
-
return nil if sorted_ranges.nil? || sorted_ranges.empty?
|
|
517
|
-
|
|
518
|
-
idx = sorted_ranges.bsearch_index do |record|
|
|
519
|
-
if cp < record.range_first
|
|
520
|
-
-1
|
|
521
|
-
elsif cp > record.range_last
|
|
522
|
-
1
|
|
523
|
-
else
|
|
524
|
-
0
|
|
525
|
-
end
|
|
526
|
-
end
|
|
527
|
-
idx.nil? ? nil : sorted_ranges[idx]
|
|
254
|
+
cp.block_id = RangeLookup.find_in_range(cp.cp, indices.blocks)&.id
|
|
255
|
+
Enrichment.apply(cp, indices)
|
|
528
256
|
end
|
|
529
257
|
end
|
|
530
258
|
end
|