ucode 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/lib/ucode/code_chart/extractor.rb +1 -9
  3. data/lib/ucode/code_chart/writer.rb +1 -1
  4. data/lib/ucode/commands/canonical_build.rb +4 -4
  5. data/lib/ucode/commands/universal_set.rb +5 -3
  6. data/lib/ucode/coordinator/enrichment/bidi.rb +35 -0
  7. data/lib/ucode/coordinator/enrichment/binary.rb +38 -0
  8. data/lib/ucode/coordinator/enrichment/casing.rb +55 -0
  9. data/lib/ucode/coordinator/enrichment/cjk.rb +49 -0
  10. data/lib/ucode/coordinator/enrichment/display.rb +36 -0
  11. data/lib/ucode/coordinator/enrichment/emoji.rb +36 -0
  12. data/lib/ucode/coordinator/enrichment/identity.rb +42 -0
  13. data/lib/ucode/coordinator/enrichment/indic.rb +32 -0
  14. data/lib/ucode/coordinator/enrichment/names.rb +63 -0
  15. data/lib/ucode/coordinator/enrichment/segmentation.rb +34 -0
  16. data/lib/ucode/coordinator/enrichment.rb +51 -0
  17. data/lib/ucode/coordinator/range_lookup.rb +65 -0
  18. data/lib/ucode/coordinator.rb +4 -276
  19. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +32 -376
  20. data/lib/ucode/glyphs/embedded_fonts/codepoint_mapper.rb +130 -0
  21. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +25 -124
  22. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +0 -1
  23. data/lib/ucode/glyphs/embedded_fonts/pdf_indexer.rb +236 -0
  24. data/lib/ucode/glyphs/embedded_fonts/{source.rb → pdf_location.rb} +5 -5
  25. data/lib/ucode/glyphs/embedded_fonts/positional_matcher.rb +162 -0
  26. data/lib/ucode/glyphs/embedded_fonts/raw_font_descriptor.rb +24 -0
  27. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +0 -2
  28. data/lib/ucode/glyphs/embedded_fonts/trace_correlator.rb +54 -168
  29. data/lib/ucode/glyphs/embedded_fonts/writer.rb +0 -4
  30. data/lib/ucode/glyphs/embedded_fonts.rb +5 -1
  31. data/lib/ucode/glyphs/resolver_factory.rb +45 -0
  32. data/lib/ucode/glyphs/sources/pillar1_embedded_tounicode.rb +1 -1
  33. data/lib/ucode/glyphs.rb +1 -0
  34. data/lib/ucode/version.rb +1 -1
  35. metadata +20 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 409561757912083c19e4044c0ed37129945bf6de53bc3b029d349e4a8f16f10f
4
- data.tar.gz: 85a06e0383587af4d8a88342974a58105423635b85212ecc7b1783268e6c5e2a
3
+ metadata.gz: a6674c0e1a8cab0ef2ea7782878be4571e68603843742de78a43da6a6499220c
4
+ data.tar.gz: 416b0965b6fa4e1e22ef9c431a7c27a16093ba90b708224b7e9c9e1f15ac2c49
5
5
  SHA512:
6
- metadata.gz: 85660ae16bbfa2632131888872ddebca9cdee45d26791837ddce2fa629e18a721c9701b023c5424e058165eac03d1d4e1d16bb2a6c0b582a8ef4c1e0104ecdf5
7
- data.tar.gz: 411de21c9c5f3e46b559752d54462f02f4110aa59e261ee3ea6c19383ad9383fd43f013e1107fe59b3e6c3b296f906a8c99e7087da5f9c85f2b13ba385447b95
6
+ metadata.gz: 361d385dd8b5bb04d3ce528ddda0bed64b8c5db46a1ef7e2f77c79ba5c4cf41f67235e8e55d5a38ec2957e1650f7fa4610b50722a299f2dfd67f900bd25e66c0
7
+ data.tar.gz: 1049d2b20c925da363339af6bcf1b98fd1d7fcf93a50adf9bde2240314eb272f89af6981a6604d8e7b5fa8b075b6c36deb68d42e691425f18fa70fca94a4286e
@@ -2,14 +2,6 @@
2
2
 
3
3
  require "pathname"
4
4
 
5
- require "ucode/error"
6
- require "ucode/glyphs/embedded_fonts/catalog"
7
- require "ucode/glyphs/embedded_fonts/renderer"
8
- require "ucode/glyphs/embedded_fonts/source"
9
- require "ucode/glyphs/resolver"
10
- require "ucode/glyphs/sources/pillar1_embedded_tounicode"
11
- require "ucode/glyphs/sources/tier1_real_font"
12
-
13
5
  module Ucode
14
6
  module CodeChart
15
7
  # Walks every assigned codepoint in a block and returns one
@@ -108,7 +100,7 @@ module Ucode
108
100
  end
109
101
 
110
102
  def embedded_pillar_sources
111
- embedded_source = Glyphs::EmbeddedFonts::Source.new(
103
+ embedded_source = Glyphs::EmbeddedFonts::PdfLocation.new(
112
104
  pdf: @pdf_path, cache_dir: @cache_dir,
113
105
  )
114
106
  catalog = Glyphs::EmbeddedFonts::Catalog.new(embedded_source)
@@ -47,7 +47,7 @@ module Ucode
47
47
  # @param ucd_version [String, nil] UCD version to stamp on
48
48
  # provenance. nil = resolved via {VersionResolver.resolve(nil)}.
49
49
  # @param cache_dir [Pathname, String, nil] font-stream cache
50
- # directory for the EmbeddedFonts::Source.
50
+ # directory for the EmbeddedFonts::PdfLocation.
51
51
  # @param now [Time, nil] timestamp override (for tests).
52
52
  # @param pillar3_source, tier1_sources: forwarded to the Extractor.
53
53
  def initialize(output_root:, pdf_path:, ucd_version: nil,
@@ -118,10 +118,10 @@ module Ucode
118
118
  end
119
119
 
120
120
  def build_resolver(version, source_config_path)
121
- database = Database.open(version)
122
- config = Glyphs::SourceConfig.new(path: source_config_path_or_default(source_config_path))
123
- builder = Glyphs::SourceBuilder.new(config: config, database: database)
124
- Glyphs::Resolver.new(sources: builder.tier1_sources(install: false))
121
+ Glyphs::ResolverFactory.build(
122
+ version: version,
123
+ source_config_path: source_config_path,
124
+ )
125
125
  end
126
126
 
127
127
  def source_config_path_or_default(path)
@@ -126,9 +126,11 @@ module Ucode
126
126
  end
127
127
 
128
128
  def build_resolver(_version, config_path, database)
129
- config = Glyphs::SourceConfig.new(path: config_path)
130
- builder = Glyphs::SourceBuilder.new(config: config, database: database)
131
- Glyphs::Resolver.new(sources: builder.tier1_sources(install: false))
129
+ Glyphs::ResolverFactory.build(
130
+ version: _version,
131
+ source_config_path: config_path,
132
+ database: database,
133
+ )
132
134
  end
133
135
 
134
136
  def codepoint_enum(version)
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  class Coordinator
    module Enrichment
      # Copies bidirectional data from the indices onto a code point:
      # the mirroring glyph and the paired-bracket type and partner.
      module Bidi
        # Fills `cp.bidi` from `indices.bidi_mirroring` and
        # `indices.bidi_brackets`, both looked up with `cp.cp`. When neither
        # index yields a record the code point is left untouched; otherwise
        # an existing `cp.bidi` is reused or a new Bidi sub-model is created.
        #
        # @param cp [Ucode::Models::CodePoint]
        # @param indices [Ucode::Coordinator::Indices]
        def self.enrich(cp, indices)
          mirror_record = indices.bidi_mirroring[cp.cp]
          bracket_record = indices.bidi_brackets[cp.cp]
          return if !mirror_record && !bracket_record

          bidi = (cp.bidi ||= Ucode::Models::CodePoint::Bidi.new)
          bidi.mirroring_glyph_id = mirror_record.mirrored_id if mirror_record
          return unless bracket_record

          bidi.paired_bracket_type = bracket_record.type
          bidi.paired_bracket_id = bracket_record.paired_id
        end
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  class Coordinator
    module Enrichment
      # Binary properties from two indices: `indices.binary_properties`
      # (core records, keyed by code point) and
      # `indices.extra_binary_properties` (range tuples). Both feed the
      # same `cp.binary_properties` list.
      module Binary
        # Replaces `cp.binary_properties` with the short names of the core
        # records when any exist, then appends every extra property whose
        # range contains `cp.cp` and removes duplicates in place.
        #
        # @param cp [Ucode::Models::CodePoint]
        # @param indices [Ucode::Coordinator::Indices]
        def self.enrich(cp, indices)
          core_records = indices.binary_properties[cp.cp]
          if core_records && !core_records.empty?
            cp.binary_properties = core_records.map { |record| record.property_short }
          end

          extra_names = RangeLookup.all_range_values(cp.cp, indices.extra_binary_properties)
          return if extra_names.empty?

          # Merge into whatever list the code point currently holds.
          cp.binary_properties.concat(extra_names)
          cp.binary_properties.uniq!
        end
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  class Coordinator
    module Enrichment
      # Special (full, possibly conditional) case mappings and case-folding
      # mappings for a code point.
      module Casing
        # Applies special-casing rules and then case-folding rules, each
        # looked up with `cp.cp`.
        #
        # @param cp [Ucode::Models::CodePoint]
        # @param indices [Ucode::Coordinator::Indices]
        def self.enrich(cp, indices)
          apply_special_casing(cp, indices.special_casing[cp.cp])
          apply_case_folding(cp, indices.case_folding[cp.cp])
        end

        # Writes the concatenated upper/lower/title mappings of all rules.
        #
        # The *_ids lists are deliberately NOT deduplicated: a mapping such
        # as U+00DF → "SS" contains U+0053 twice and both entries must stay
        # in order. Conditions are categorical tags, so duplicates there
        # are dropped.
        def self.apply_special_casing(cp, rules)
          return if !rules || rules.empty?

          casing = (cp.casing ||= Ucode::Models::CodePoint::Casing.new)
          casing.full_upper_ids = rules.flat_map { |rule| rule.upper_ids }
          casing.full_lower_ids = rules.flat_map { |rule| rule.lower_ids }
          casing.full_title_ids = rules.flat_map { |rule| rule.title_ids }
          casing.conditions = rules.flat_map { |rule| rule.conditions }.uniq
        end

        # Stores each folding rule in the slot named by its status:
        # "C", "S" and "T" keep only the first mapped id, "F" keeps the
        # whole list. Rules with any other status are ignored.
        def self.apply_case_folding(cp, rules)
          return if !rules || rules.empty?

          cp.case_folding ||= Ucode::Models::CodePoint::CaseFolding.new
          rules.each do |rule|
            folding = cp.case_folding
            case rule.status
            when "F" then folding.full_ids = rule.mapping_ids
            when "C" then folding.common_id = rule.mapping_ids.first
            when "S" then folding.simple_id = rule.mapping_ids.first
            when "T" then folding.turkic_id = rule.mapping_ids.first
            end
          end
        end

        private_class_method :apply_special_casing, :apply_case_folding
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  class Coordinator
    module Enrichment
      # CJK-specific enrichment: the Unihan entry, KangXi radical
      # cross-references, and the Hangul syllable type.
      module CJK
        # Runs the three CJK steps in order: Unihan, radicals, Hangul.
        #
        # @param cp [Ucode::Models::CodePoint]
        # @param indices [Ucode::Coordinator::Indices]
        def self.enrich(cp, indices)
          unihan_entry = indices.unihan[cp.cp]
          cp.unihan = unihan_entry if unihan_entry

          link_radicals(cp, indices.cjk_radicals[cp.id])

          syllable = RangeLookup.find_in_range(cp.cp, indices.hangul_syllable_type)
          return unless syllable

          cp.hangul ||= Ucode::Models::CodePoint::HangulSyllable.new
          cp.hangul.type = syllable.value
        end

        # Appends one CrossReference per radical record to
        # `cp.relationships`. Note the radical index is read with `cp.id`,
        # whereas the Unihan index is read with `cp.cp`.
        def self.link_radicals(cp, radicals)
          return if !radicals || radicals.empty?

          radicals.each do |radical|
            reference = Ucode::Models::Relationship::CrossReference.new(
              target_ids: [radical.cjk_radical_id],
              description: "KangXi radical ##{radical.radical_number}",
              source: "cjk_radicals",
            )
            cp.relationships << reference
          end
        end

        private_class_method :link_radicals
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  class Coordinator
    module Enrichment
      # Display layout properties: Line Break class, East Asian Width and
      # Vertical Orientation. Each comes from a sorted range+value array
      # that is searched through {RangeLookup.find_in_range}.
      module Display
        # Looks up the three values for `cp.cp` and writes the ones found
        # onto `cp.display`. When all three lookups return nil the code
        # point is left untouched.
        #
        # @param cp [Ucode::Models::CodePoint]
        # @param indices [Ucode::Coordinator::Indices]
        def self.enrich(cp, indices)
          sources = [indices.line_break, indices.east_asian_width, indices.vertical_orientation]
          line_break, width, orientation = sources.map do |ranges|
            RangeLookup.find_in_range(cp.cp, ranges)&.value
          end
          return if [line_break, width, orientation].all?(&:nil?)

          layout = (cp.display ||= Ucode::Models::CodePoint::Display.new)
          layout.line_break_class = line_break if line_break
          layout.east_asian_width = width if width
          layout.vertical_orientation = orientation if orientation
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  class Coordinator
    module Enrichment
      # Emoji property bundle. Each recognised property name found for a
      # code point in `indices.emoji_properties` flips the matching
      # boolean on the Emoji sub-model.
      module Emoji
        # Property name → boolean writer on the Emoji sub-model.
        # Names not listed here are ignored.
        FLAG_WRITERS = {
          "Emoji" => :is_emoji=,
          "Emoji_Presentation" => :is_presentation_default=,
          "Emoji_Modifier" => :is_modifier=,
          "Emoji_Modifier_Base" => :is_base=,
          "Emoji_Component" => :is_component=,
          "Extended_Pictographic" => :is_extended_pictographic=,
        }.freeze
        private_constant :FLAG_WRITERS

        class << self
          # Collects every property whose range contains `cp.cp` and sets
          # the corresponding flags on `cp.emoji`, creating the sub-model
          # on first use. Code points with no matching range are left
          # untouched.
          #
          # There is deliberately no `RangeLookup.find_in_range` pre-check:
          # that helper is a binary search for a single containing range,
          # while this index is read with `all_range_values` precisely
          # because ranges of different properties may overlap. On
          # overlapping ranges the binary search can miss a containing
          # range, which would drop real emoji flags. The `props.empty?`
          # guard already covers the "no match" case.
          #
          # @param cp [Ucode::Models::CodePoint]
          # @param indices [Ucode::Coordinator::Indices]
          def enrich(cp, indices)
            props = RangeLookup.all_range_values(cp.cp, indices.emoji_properties)
            return if props.empty?

            cp.emoji ||= Ucode::Models::CodePoint::Emoji.new
            props.each { |prop| apply_property(cp, prop) }
          end

          private

          # Turns on the flag registered for `prop`, if any.
          def apply_property(cp, prop)
            writer = FLAG_WRITERS[prop]
            cp.emoji.public_send(writer, true) if writer
          end
        end
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  class Coordinator
    module Enrichment
      # Basic identity properties: primary script, script extensions and
      # the age record of the code point.
      module Identity
        # Sets `script_code`, appends to `script_extensions` and sets
        # `age`, each only when the corresponding index has data for
        # `cp.cp`.
        #
        # @param cp [Ucode::Models::CodePoint]
        # @param indices [Ucode::Coordinator::Indices]
        def self.enrich(cp, indices)
          script = RangeLookup.find_in_range(cp.cp, indices.scripts)
          # Prefer the script's code; fall back to its name when absent.
          cp.script_code = (script.code || script.name) if script

          extensions = indices.script_extensions[cp.cp]
          if extensions && !extensions.empty?
            extensions.each_with_object(cp.script_extensions) do |tuple, codes|
              codes << tuple.script_code
            end
          end

          age_record = indices.derived_age[cp.cp]
          return unless age_record

          cp.age = age_record.age
        end
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  class Coordinator
    module Enrichment
      # Indic shaping categories: positional and syllabic.
      module Indic
        # Looks up both categories for `cp.cp` via
        # {RangeLookup.find_in_range} and writes the ones found onto
        # `cp.indic`. When both lookups return nil the code point is left
        # untouched.
        #
        # @param cp [Ucode::Models::CodePoint]
        # @param indices [Ucode::Coordinator::Indices]
        def self.enrich(cp, indices)
          position = RangeLookup.find_in_range(cp.cp, indices.indic_positional)&.value
          syllable = RangeLookup.find_in_range(cp.cp, indices.indic_syllabic)&.value
          return if [position, syllable].all?(&:nil?)

          shaping = (cp.indic ||= Ucode::Models::CodePoint::Indic.new)
          shaping.positional_category = position if position
          shaping.syllabic_category = syllable if syllable
        end
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  class Coordinator
    module Enrichment
      # Name annotations: the names-list entry and its relationship
      # lists, name aliases, and standardized variation sequences.
      module Names
        # Relationship lists copied from a names-list entry, in the order
        # they are appended to `cp.relationships`.
        ENTRY_RELATIONSHIP_LISTS = %i[
          cross_references
          sample_sequences
          compatibility_equivalents
          informal_aliases
          footnotes
        ].freeze
        private_constant :ENTRY_RELATIONSHIP_LISTS

        # Runs the three steps in order: names list, aliases, variants.
        #
        # @param cp [Ucode::Models::CodePoint]
        # @param indices [Ucode::Coordinator::Indices]
        def self.enrich(cp, indices)
          attach_names_list(cp, indices.names_list[cp.cp])
          attach_aliases(cp, indices.name_aliases[cp.cp])
          attach_variants(cp, indices.standardized_variants[cp.id])
        end

        # Stores the entry on the code point and appends each of its
        # relationship lists to `cp.relationships`.
        def self.attach_names_list(cp, entry)
          return unless entry

          cp.names_list = entry
          ENTRY_RELATIONSHIP_LISTS.each do |list_name|
            cp.relationships.concat(entry.public_send(list_name))
          end
        end

        # Appends one InformalAlias relationship per alias record.
        def self.attach_aliases(cp, aliases)
          return if !aliases || aliases.empty?

          aliases.each do |alias_record|
            informal = Ucode::Models::Relationship::InformalAlias.new(
              description: alias_record.text,
              source: "name_aliases",
            )
            cp.relationships << informal
          end
        end

        # Stores the variants and appends one VariationSequence
        # relationship per variant. Note this index is read with `cp.id`.
        def self.attach_variants(cp, variants)
          return if !variants || variants.empty?

          cp.standardized_variants = variants
          variants.each do |variant|
            sequence = Ucode::Models::Relationship::VariationSequence.new(
              target_ids: [variant.base_id, variant.variation_selector_id],
              description: variant.description,
              contexts: variant.contexts,
              source: "standardized_variants",
            )
            cp.relationships << sequence
          end
        end

        private_class_method :attach_names_list, :attach_aliases, :attach_variants
      end
    end
  end
end
+ end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  class Coordinator
    module Enrichment
      # UAX #29 text segmentation: Grapheme / Word / Sentence break class.
      module Segmentation
        # Looks up the three break classes for `cp.cp` via
        # {RangeLookup.find_in_range} and writes the ones found onto
        # `cp.break_segmentation`. When all three lookups return nil the
        # code point is left untouched.
        #
        # @param cp [Ucode::Models::CodePoint]
        # @param indices [Ucode::Coordinator::Indices]
        def self.enrich(cp, indices)
          value_for = ->(ranges) { RangeLookup.find_in_range(cp.cp, ranges)&.value }

          grapheme_class = value_for.call(indices.grapheme_break)
          word_class = value_for.call(indices.word_break)
          sentence_class = value_for.call(indices.sentence_break)
          return if [grapheme_class, word_class, sentence_class].all?(&:nil?)

          breaks = (cp.break_segmentation ||= Ucode::Models::CodePoint::BreakSegmentation.new)
          breaks.grapheme = grapheme_class if grapheme_class
          breaks.word = word_class if word_class
          breaks.sentence = sentence_class if sentence_class
        end
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  class Coordinator
    # Registry of per-codepoint enrichment concerns. Each module under
    # {Enrichment} owns one slice of the merge: Identity, Bidi, Casing,
    # Binary, Names, CJK, Display, Segmentation, Indic, Emoji.
    #
    # {REGISTRY} is an ordered, frozen Array of those modules and
    # {Enrichment.apply} calls `enrich(cp, indices)` on each in turn.
    # Adding a property means adding a module, its autoload line and one
    # entry in {REGISTRY}.
    #
    # Every module reads from the indices object and mutates the
    # CodePoint model; range lookups go through {RangeLookup}.
    module Enrichment
      autoload :Identity, "ucode/coordinator/enrichment/identity"
      autoload :Bidi, "ucode/coordinator/enrichment/bidi"
      autoload :Casing, "ucode/coordinator/enrichment/casing"
      autoload :Binary, "ucode/coordinator/enrichment/binary"
      autoload :Names, "ucode/coordinator/enrichment/names"
      autoload :CJK, "ucode/coordinator/enrichment/cjk"
      autoload :Display, "ucode/coordinator/enrichment/display"
      autoload :Segmentation, "ucode/coordinator/enrichment/segmentation"
      autoload :Indic, "ucode/coordinator/enrichment/indic"
      autoload :Emoji, "ucode/coordinator/enrichment/emoji"

      # The order is carried over from the original flat dispatch so that
      # output stays stable for diff comparisons. Names and CJK both
      # append to `cp.relationships`, so reordering them would change the
      # order of that list.
      REGISTRY = [
        Identity,
        Bidi,
        Casing,
        Binary,
        Names,
        CJK,
        Display,
        Segmentation,
        Indic,
        Emoji,
      ].freeze

      class << self
        # Runs every registered enrichment concern against `cp`, in
        # registry order.
        #
        # @param cp [Ucode::Models::CodePoint]
        # @param indices [Ucode::Coordinator::Indices]
        def apply(cp, indices)
          REGISTRY.each do |concern|
            concern.enrich(cp, indices)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  class Coordinator
    # Range lookups shared by the enrichment pipeline, kept outside
    # Coordinator so that {Enrichment} modules can call them without a
    # Coordinator instance. The methods only read their arguments.
    module RangeLookup
      module_function

      # Binary-searches a sorted array for the record whose range
      # contains `cp`. Records respond to `range_first` and `range_last`.
      #
      # @param cp [Integer]
      # @param sorted_ranges [Array, nil] sorted by range_first
      # @return [Object, nil] the record whose range contains cp, or nil
      #   when there is none or the array is nil/empty
      def find_in_range(cp, sorted_ranges)
        return nil if sorted_ranges.nil? || sorted_ranges.empty?

        sorted_ranges.bsearch { |record| compare_cp(cp, record) }
      end

      # Three-way comparison for find-any binary search: negative means
      # "look in earlier records", positive "look in later records",
      # zero "this record contains cp".
      #
      # @param cp [Integer]
      # @param record [#range_first, #range_last]
      # @return [Integer] -1, 0 or 1
      def compare_cp(cp, record)
        if cp < record.range_first
          -1
        elsif cp > record.range_last
          1
        else
          0
        end
      end

      # Collects the value of every record whose range contains `cp`.
      # Unlike {find_in_range} this tolerates overlapping ranges, which
      # occur when one array carries several properties.
      #
      # Because records are sorted by `range_first`, scanning stops at
      # the first record that starts after `cp`; earlier records that end
      # before `cp` are filtered out.
      #
      # @param cp [Integer]
      # @param sorted_ranges [Array, nil] sorted by range_first
      # @return [Array] values of every range containing cp
      def all_range_values(cp, sorted_ranges)
        return [] if sorted_ranges.nil? || sorted_ranges.empty?

        sorted_ranges
          .take_while { |record| record.range_first <= cp }
          .select { |record| record.range_last >= cp }
          .map { |record| record.value }
      end
    end
  end
end
+ end