ucode 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/lib/ucode/code_chart/extractor.rb +1 -9
  3. data/lib/ucode/code_chart/writer.rb +1 -1
  4. data/lib/ucode/commands/canonical_build.rb +4 -4
  5. data/lib/ucode/commands/universal_set.rb +5 -3
  6. data/lib/ucode/coordinator/enrichment/bidi.rb +35 -0
  7. data/lib/ucode/coordinator/enrichment/binary.rb +38 -0
  8. data/lib/ucode/coordinator/enrichment/casing.rb +55 -0
  9. data/lib/ucode/coordinator/enrichment/cjk.rb +49 -0
  10. data/lib/ucode/coordinator/enrichment/display.rb +36 -0
  11. data/lib/ucode/coordinator/enrichment/emoji.rb +36 -0
  12. data/lib/ucode/coordinator/enrichment/identity.rb +42 -0
  13. data/lib/ucode/coordinator/enrichment/indic.rb +32 -0
  14. data/lib/ucode/coordinator/enrichment/names.rb +63 -0
  15. data/lib/ucode/coordinator/enrichment/segmentation.rb +34 -0
  16. data/lib/ucode/coordinator/enrichment.rb +51 -0
  17. data/lib/ucode/coordinator/range_lookup.rb +65 -0
  18. data/lib/ucode/coordinator.rb +4 -276
  19. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +32 -376
  20. data/lib/ucode/glyphs/embedded_fonts/codepoint_mapper.rb +130 -0
  21. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +25 -124
  22. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +0 -1
  23. data/lib/ucode/glyphs/embedded_fonts/pdf_indexer.rb +236 -0
  24. data/lib/ucode/glyphs/embedded_fonts/{source.rb → pdf_location.rb} +5 -5
  25. data/lib/ucode/glyphs/embedded_fonts/positional_matcher.rb +162 -0
  26. data/lib/ucode/glyphs/embedded_fonts/raw_font_descriptor.rb +24 -0
  27. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +0 -2
  28. data/lib/ucode/glyphs/embedded_fonts/trace_correlator.rb +54 -168
  29. data/lib/ucode/glyphs/embedded_fonts/writer.rb +0 -4
  30. data/lib/ucode/glyphs/embedded_fonts.rb +5 -1
  31. data/lib/ucode/glyphs/resolver_factory.rb +45 -0
  32. data/lib/ucode/glyphs/sources/pillar1_embedded_tounicode.rb +1 -1
  33. data/lib/ucode/glyphs.rb +1 -0
  34. data/lib/ucode/version.rb +1 -1
  35. metadata +20 -3
@@ -27,6 +27,8 @@ module Ucode
27
27
  # subsets.
28
28
  class Coordinator
29
29
  autoload :Indices, "ucode/coordinator/indices"
30
+ autoload :RangeLookup, "ucode/coordinator/range_lookup"
31
+ autoload :Enrichment, "ucode/coordinator/enrichment"
30
32
 
31
33
  ISO_SCRIPT_PROPERTY = "sc".freeze
32
34
  private_constant :ISO_SCRIPT_PROPERTY
@@ -249,282 +251,8 @@ module Ucode
249
251
 
250
252
  def enrich(cp, indices)
251
253
  cp.plane_number = cp.cp >> 16
252
- cp.block_id = find_in_range(cp.cp, indices.blocks)&.id
253
- assign_script(cp, indices)
254
- assign_script_extensions(cp, indices)
255
- assign_age(cp, indices)
256
- assign_bidi(cp, indices)
257
- assign_casing(cp, indices)
258
- assign_case_folding(cp, indices)
259
- assign_binary_properties(cp, indices)
260
- assign_names_list(cp, indices)
261
- assign_name_aliases(cp, indices)
262
- assign_standardized_variants(cp, indices)
263
- assign_unihan(cp, indices)
264
- assign_cjk_radical(cp, indices)
265
- assign_display(cp, indices)
266
- assign_break_segmentation(cp, indices)
267
- assign_indic(cp, indices)
268
- assign_hangul(cp, indices)
269
- assign_emoji(cp, indices)
270
- assign_extra_binary_properties(cp, indices)
271
- end
272
-
273
- def assign_script(cp, indices)
274
- script = find_in_range(cp.cp, indices.scripts)
275
- return unless script
276
-
277
- cp.script_code = script.code || script.name
278
- end
279
-
280
- def assign_script_extensions(cp, indices)
281
- tuples = indices.script_extensions[cp.cp]
282
- return unless tuples && !tuples.empty?
283
-
284
- tuples.each { |tuple| cp.script_extensions << tuple.script_code }
285
- end
286
-
287
- def assign_age(cp, indices)
288
- record = indices.derived_age[cp.cp]
289
- return unless record
290
-
291
- cp.age = record.age
292
- end
293
-
294
- def assign_bidi(cp, indices)
295
- mirroring = indices.bidi_mirroring[cp.cp]
296
- brackets = indices.bidi_brackets[cp.cp]
297
- return unless mirroring || brackets
298
-
299
- cp.bidi ||= Models::CodePoint::Bidi.new
300
- if mirroring
301
- cp.bidi.mirroring_glyph_id = mirroring.mirrored_id
302
- end
303
- if brackets
304
- cp.bidi.paired_bracket_type = brackets.type
305
- cp.bidi.paired_bracket_id = brackets.paired_id
306
- end
307
- end
308
-
309
- def assign_casing(cp, indices)
310
- rules = indices.special_casing[cp.cp]
311
- return unless rules && !rules.empty?
312
-
313
- # NOTE: do not uniq the *_ids arrays — a mapping like U+00DF → "SS"
314
- # legitimately contains two U+0053 entries and they must be
315
- # preserved in order. Conditions, by contrast, are categorical
316
- # tags (Final_Sigma, tr, After_I) and deduping them is correct.
317
- cp.casing ||= Models::CodePoint::Casing.new
318
- cp.casing.full_upper_ids = rules.flat_map(&:upper_ids)
319
- cp.casing.full_lower_ids = rules.flat_map(&:lower_ids)
320
- cp.casing.full_title_ids = rules.flat_map(&:title_ids)
321
- cp.casing.conditions = rules.flat_map(&:conditions).uniq
322
- end
323
-
324
- def assign_case_folding(cp, indices)
325
- rules = indices.case_folding[cp.cp]
326
- return unless rules && !rules.empty?
327
-
328
- cp.case_folding ||= Models::CodePoint::CaseFolding.new
329
- rules.each do |rule|
330
- case rule.status
331
- when "C" then cp.case_folding.common_id = rule.mapping_ids.first
332
- when "S" then cp.case_folding.simple_id = rule.mapping_ids.first
333
- when "T" then cp.case_folding.turkic_id = rule.mapping_ids.first
334
- when "F" then cp.case_folding.full_ids = rule.mapping_ids
335
- end
336
- end
337
- end
338
-
339
- def assign_binary_properties(cp, indices)
340
- records = indices.binary_properties[cp.cp]
341
- return unless records && !records.empty?
342
-
343
- cp.binary_properties = records.map(&:property_short)
344
- end
345
-
346
- def assign_names_list(cp, indices)
347
- entry = indices.names_list[cp.cp]
348
- return unless entry
349
-
350
- cp.names_list = entry
351
- cp.relationships.concat(entry.cross_references)
352
- cp.relationships.concat(entry.sample_sequences)
353
- cp.relationships.concat(entry.compatibility_equivalents)
354
- cp.relationships.concat(entry.informal_aliases)
355
- cp.relationships.concat(entry.footnotes)
356
- end
357
-
358
- def assign_name_aliases(cp, indices)
359
- aliases = indices.name_aliases[cp.cp]
360
- return unless aliases && !aliases.empty?
361
-
362
- aliases.each do |alias_record|
363
- cp.relationships << Models::Relationship::InformalAlias.new(
364
- description: alias_record.text,
365
- source: "name_aliases"
366
- )
367
- end
368
- end
369
-
370
- def assign_standardized_variants(cp, indices)
371
- variants = indices.standardized_variants[cp.id]
372
- return unless variants && !variants.empty?
373
-
374
- cp.standardized_variants = variants
375
- variants.each do |variant|
376
- cp.relationships << Models::Relationship::VariationSequence.new(
377
- target_ids: [variant.base_id, variant.variation_selector_id],
378
- description: variant.description,
379
- contexts: variant.contexts,
380
- source: "standardized_variants"
381
- )
382
- end
383
- end
384
-
385
- def assign_unihan(cp, indices)
386
- entry = indices.unihan[cp.cp]
387
- return unless entry
388
-
389
- cp.unihan = entry
390
- end
391
-
392
- def assign_cjk_radical(cp, indices)
393
- radicals = indices.cjk_radicals[cp.id]
394
- return unless radicals && !radicals.empty?
395
-
396
- radicals.each do |radical|
397
- cp.relationships << Models::Relationship::CrossReference.new(
398
- target_ids: [radical.cjk_radical_id],
399
- description: "KangXi radical ##{radical.radical_number}",
400
- source: "cjk_radicals"
401
- )
402
- end
403
- end
404
-
405
- # Display: East Asian Width, Line Break Class, Vertical Orientation.
406
- # All three are range+value files, looked up via bsearch on sorted
407
- # arrays of ExtractedProperties::Tuple.
408
- def assign_display(cp, indices)
409
- tuple = find_in_range(cp.cp, indices.line_break)
410
- lb = tuple&.value
411
- tuple = find_in_range(cp.cp, indices.east_asian_width)
412
- eaw = tuple&.value
413
- tuple = find_in_range(cp.cp, indices.vertical_orientation)
414
- vo = tuple&.value
415
- return if lb.nil? && eaw.nil? && vo.nil?
416
-
417
- cp.display ||= Models::CodePoint::Display.new
418
- cp.display.line_break_class = lb if lb
419
- cp.display.east_asian_width = eaw if eaw
420
- cp.display.vertical_orientation = vo if vo
421
- end
422
-
423
- # UAX #29 segmentation: Grapheme / Word / Sentence break class.
424
- def assign_break_segmentation(cp, indices)
425
- grapheme = find_in_range(cp.cp, indices.grapheme_break)&.value
426
- word = find_in_range(cp.cp, indices.word_break)&.value
427
- sentence = find_in_range(cp.cp, indices.sentence_break)&.value
428
- return if grapheme.nil? && word.nil? && sentence.nil?
429
-
430
- cp.break_segmentation ||= Models::CodePoint::BreakSegmentation.new
431
- cp.break_segmentation.grapheme = grapheme if grapheme
432
- cp.break_segmentation.word = word if word
433
- cp.break_segmentation.sentence = sentence if sentence
434
- end
435
-
436
- def assign_indic(cp, indices)
437
- positional = find_in_range(cp.cp, indices.indic_positional)&.value
438
- syllabic = find_in_range(cp.cp, indices.indic_syllabic)&.value
439
- return if positional.nil? && syllabic.nil?
440
-
441
- cp.indic ||= Models::CodePoint::Indic.new
442
- cp.indic.positional_category = positional if positional
443
- cp.indic.syllabic_category = syllabic if syllabic
444
- end
445
-
446
- def assign_hangul(cp, indices)
447
- tuple = find_in_range(cp.cp, indices.hangul_syllable_type)
448
- return unless tuple
449
-
450
- cp.hangul ||= Models::CodePoint::HangulSyllable.new
451
- cp.hangul.type = tuple.value
452
- end
453
-
454
- # Emoji property bundle. Each Emoji_* property from emoji-data.txt
455
- # flips the matching boolean on the Emoji sub-model.
456
- def assign_emoji(cp, indices)
457
- return unless find_in_range(cp.cp, indices.emoji_properties)
458
-
459
- props = all_range_values(cp.cp, indices.emoji_properties)
460
- return if props.empty?
461
-
462
- cp.emoji ||= Models::CodePoint::Emoji.new
463
- props.each do |prop|
464
- case prop
465
- when "Emoji" then cp.emoji.is_emoji = true
466
- when "Emoji_Presentation" then cp.emoji.is_presentation_default = true
467
- when "Emoji_Modifier" then cp.emoji.is_modifier = true
468
- when "Emoji_Modifier_Base" then cp.emoji.is_base = true
469
- when "Emoji_Component" then cp.emoji.is_component = true
470
- when "Extended_Pictographic" then cp.emoji.is_extended_pictographic = true
471
- end
472
- end
473
- end
474
-
475
- # PropList.txt carries binary properties beyond what's in
476
- # DerivedCoreProperties (White_Space, Hyphen, Variation_Selector,
477
- # etc.). Merge into the same binary_properties list.
478
- def assign_extra_binary_properties(cp, indices)
479
- extras = all_range_values(cp.cp, indices.extra_binary_properties)
480
- return if extras.empty?
481
-
482
- cp.binary_properties.concat(extras)
483
- cp.binary_properties.uniq!
484
- end
485
-
486
- # Returns every value whose range contains `cp` in a sorted tuple
487
- # array. Most codepoint+property pairs match at most one range, but
488
- # a codepoint can carry multiple binary properties from PropList or
489
- # emoji-data, so we collect them all.
490
- def all_range_values(cp, sorted_ranges)
491
- return [] if sorted_ranges.nil? || sorted_ranges.empty?
492
-
493
- values = []
494
- sorted_ranges.each do |record|
495
- next if cp < record.range_first
496
- break if cp > record.range_last && record.range_first > cp
497
-
498
- if cp >= record.range_first && cp <= record.range_last
499
- values << record.value
500
- end
501
- end
502
- values
503
- end
504
-
505
- # ---- Range lookup (bsearch) ----------------------------------------
506
-
507
- # Finds the range-containing record in a sorted array via bsearch.
508
- # Records respond to `range_first` and `range_last`.
509
- #
510
- # bsearch_index integer-mode convention: return -1 to search LEFT,
511
- # +1 to search RIGHT, 0 for a match. `cp < range_first` means the
512
- # target range lies in earlier (lower-indexed) records, so we return
513
- # -1; `cp > range_last` means it lies in later records, so we return
514
- # +1.
515
- def find_in_range(cp, sorted_ranges)
516
- return nil if sorted_ranges.nil? || sorted_ranges.empty?
517
-
518
- idx = sorted_ranges.bsearch_index do |record|
519
- if cp < record.range_first
520
- -1
521
- elsif cp > record.range_last
522
- 1
523
- else
524
- 0
525
- end
526
- end
527
- idx.nil? ? nil : sorted_ranges[idx]
254
+ cp.block_id = RangeLookup.find_in_range(cp.cp, indices.blocks)&.id
255
+ Enrichment.apply(cp, indices)
528
256
  end
529
257
  end
530
258
  end