RubyGems - ucode - Versions diffs - 0.1.0 - Mend

ucode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (228) hide show

checksums.yaml +7 -0
data/CLAUDE.md +211 -0
data/Gemfile +22 -0
data/Gemfile.lock +406 -0
data/README.md +469 -0
data/Rakefile +18 -0
data/TODO.new/00-README.md +66 -0
data/TODO.new/01-pillar-terminology-alignment.md +69 -0
data/TODO.new/02-audit-schema-design.md +255 -0
data/TODO.new/03-directory-output-spec.md +203 -0
data/TODO.new/04-fontist-org-contract.md +173 -0
data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
data/TODO.new/06-audit-namespace-skeleton.md +105 -0
data/TODO.new/07-audit-models-port.md +132 -0
data/TODO.new/08-extractors-cheap-port.md +113 -0
data/TODO.new/09-extractors-expensive-port.md +99 -0
data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
data/TODO.new/12-formatters-port.md +115 -0
data/TODO.new/13-directory-emitter.md +147 -0
data/TODO.new/14-html-face-browser.md +144 -0
data/TODO.new/15-html-library-browser.md +102 -0
data/TODO.new/16-cli-audit-subcommands.md +142 -0
data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
data/TODO.new/19-fontisan-docs-update.md +155 -0
data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
data/TODO.new/21-canonical-unicode17-build.md +148 -0
data/TODO.new/22-implementation-order.md +176 -0
data/UCODE_CHANGELOG.md +97 -0
data/exe/ucode +8 -0
data/lib/ucode/aggregator.rb +77 -0
data/lib/ucode/audit/block_aggregator.rb +90 -0
data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
data/lib/ucode/audit/context.rb +137 -0
data/lib/ucode/audit/discrepancy_detector.rb +213 -0
data/lib/ucode/audit/extractors/aggregations.rb +70 -0
data/lib/ucode/audit/extractors/base.rb +21 -0
data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
data/lib/ucode/audit/extractors/coverage.rb +55 -0
data/lib/ucode/audit/extractors/hinting.rb +199 -0
data/lib/ucode/audit/extractors/identity.rb +65 -0
data/lib/ucode/audit/extractors/licensing.rb +75 -0
data/lib/ucode/audit/extractors/metrics.rb +108 -0
data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
data/lib/ucode/audit/extractors/provenance.rb +34 -0
data/lib/ucode/audit/extractors/style.rb +88 -0
data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
data/lib/ucode/audit/extractors.rb +31 -0
data/lib/ucode/audit/plane_aggregator.rb +37 -0
data/lib/ucode/audit/registry.rb +63 -0
data/lib/ucode/audit/script_aggregator.rb +92 -0
data/lib/ucode/audit.rb +27 -0
data/lib/ucode/cache.rb +113 -0
data/lib/ucode/cli.rb +272 -0
data/lib/ucode/commands/build.rb +68 -0
data/lib/ucode/commands/cache.rb +46 -0
data/lib/ucode/commands/fetch.rb +62 -0
data/lib/ucode/commands/font_coverage.rb +57 -0
data/lib/ucode/commands/glyphs.rb +136 -0
data/lib/ucode/commands/lookup.rb +65 -0
data/lib/ucode/commands/parse.rb +62 -0
data/lib/ucode/commands/site.rb +33 -0
data/lib/ucode/commands.rb +19 -0
data/lib/ucode/config.rb +110 -0
data/lib/ucode/coordinator/indices.rb +34 -0
data/lib/ucode/coordinator.rb +397 -0
data/lib/ucode/database.rb +214 -0
data/lib/ucode/db_builder.rb +107 -0
data/lib/ucode/error.rb +96 -0
data/lib/ucode/fetch/code_charts.rb +57 -0
data/lib/ucode/fetch/http.rb +83 -0
data/lib/ucode/fetch/ucd_zip.rb +57 -0
data/lib/ucode/fetch/unihan_zip.rb +57 -0
data/lib/ucode/fetch.rb +14 -0
data/lib/ucode/glyphs/cell_extractor.rb +130 -0
data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
data/lib/ucode/glyphs/grid.rb +30 -0
data/lib/ucode/glyphs/grid_detector.rb +165 -0
data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
data/lib/ucode/glyphs/last_resort/source.rb +125 -0
data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
data/lib/ucode/glyphs/last_resort.rb +36 -0
data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
data/lib/ucode/glyphs/page_renderer.rb +221 -0
data/lib/ucode/glyphs/path_bbox.rb +62 -0
data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
data/lib/ucode/glyphs/real_fonts.rb +32 -0
data/lib/ucode/glyphs/writer.rb +250 -0
data/lib/ucode/glyphs.rb +27 -0
data/lib/ucode/index.rb +106 -0
data/lib/ucode/index_builder.rb +94 -0
data/lib/ucode/models/audit/audit_axis.rb +30 -0
data/lib/ucode/models/audit/audit_diff.rb +77 -0
data/lib/ucode/models/audit/audit_report.rb +137 -0
data/lib/ucode/models/audit/baseline.rb +32 -0
data/lib/ucode/models/audit/block_summary.rb +72 -0
data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
data/lib/ucode/models/audit/codepoint_range.rb +39 -0
data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
data/lib/ucode/models/audit/color_capabilities.rb +91 -0
data/lib/ucode/models/audit/discrepancy.rb +38 -0
data/lib/ucode/models/audit/duplicate_group.rb +23 -0
data/lib/ucode/models/audit/embedding_type.rb +81 -0
data/lib/ucode/models/audit/field_change.rb +28 -0
data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
data/lib/ucode/models/audit/gasp_range.rb +63 -0
data/lib/ucode/models/audit/hinting.rb +99 -0
data/lib/ucode/models/audit/library_summary.rb +40 -0
data/lib/ucode/models/audit/licensing.rb +48 -0
data/lib/ucode/models/audit/metrics.rb +111 -0
data/lib/ucode/models/audit/named_instance.rb +41 -0
data/lib/ucode/models/audit/opentype_layout.rb +38 -0
data/lib/ucode/models/audit/plane_summary.rb +31 -0
data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
data/lib/ucode/models/audit/script_features.rb +28 -0
data/lib/ucode/models/audit/script_summary.rb +54 -0
data/lib/ucode/models/audit/variation_detail.rb +42 -0
data/lib/ucode/models/audit.rb +50 -0
data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
data/lib/ucode/models/bidi_mirroring.rb +19 -0
data/lib/ucode/models/binary_property_assignment.rb +26 -0
data/lib/ucode/models/block.rb +36 -0
data/lib/ucode/models/case_folding_rule.rb +23 -0
data/lib/ucode/models/cjk_radical.rb +23 -0
data/lib/ucode/models/codepoint/bidi.rb +28 -0
data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
data/lib/ucode/models/codepoint/case_folding.rb +25 -0
data/lib/ucode/models/codepoint/casing.rb +32 -0
data/lib/ucode/models/codepoint/decomposition.rb +27 -0
data/lib/ucode/models/codepoint/display.rb +24 -0
data/lib/ucode/models/codepoint/emoji.rb +29 -0
data/lib/ucode/models/codepoint/hangul.rb +20 -0
data/lib/ucode/models/codepoint/identifier.rb +30 -0
data/lib/ucode/models/codepoint/indic.rb +20 -0
data/lib/ucode/models/codepoint/joining.rb +20 -0
data/lib/ucode/models/codepoint/normalization.rb +35 -0
data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
data/lib/ucode/models/codepoint.rb +122 -0
data/lib/ucode/models/name_alias.rb +21 -0
data/lib/ucode/models/named_sequence.rb +19 -0
data/lib/ucode/models/names_list_entry.rb +38 -0
data/lib/ucode/models/plane.rb +36 -0
data/lib/ucode/models/property_alias.rb +24 -0
data/lib/ucode/models/property_value_alias.rb +26 -0
data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
data/lib/ucode/models/relationship/cross_reference.rb +17 -0
data/lib/ucode/models/relationship/footnote.rb +24 -0
data/lib/ucode/models/relationship/informal_alias.rb +18 -0
data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
data/lib/ucode/models/relationship.rb +57 -0
data/lib/ucode/models/script.rb +41 -0
data/lib/ucode/models/special_casing_rule.rb +28 -0
data/lib/ucode/models/standardized_variant.rb +24 -0
data/lib/ucode/models/unihan_entry.rb +23 -0
data/lib/ucode/models.rb +47 -0
data/lib/ucode/parsers/auxiliary.rb +26 -0
data/lib/ucode/parsers/base.rb +137 -0
data/lib/ucode/parsers/bidi_brackets.rb +41 -0
data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
data/lib/ucode/parsers/blocks.rb +63 -0
data/lib/ucode/parsers/case_folding.rb +53 -0
data/lib/ucode/parsers/cjk_radicals.rb +102 -0
data/lib/ucode/parsers/derived_age.rb +59 -0
data/lib/ucode/parsers/derived_core_properties.rb +60 -0
data/lib/ucode/parsers/extracted_properties.rb +74 -0
data/lib/ucode/parsers/name_aliases.rb +44 -0
data/lib/ucode/parsers/named_sequences.rb +51 -0
data/lib/ucode/parsers/names_list.rb +250 -0
data/lib/ucode/parsers/property_aliases.rb +41 -0
data/lib/ucode/parsers/property_value_aliases.rb +46 -0
data/lib/ucode/parsers/script_extensions.rb +64 -0
data/lib/ucode/parsers/scripts.rb +60 -0
data/lib/ucode/parsers/special_casing.rb +62 -0
data/lib/ucode/parsers/standardized_variants.rb +56 -0
data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
data/lib/ucode/parsers/unicode_data.rb +268 -0
data/lib/ucode/parsers/unihan.rb +125 -0
data/lib/ucode/parsers.rb +35 -0
data/lib/ucode/range_entry.rb +58 -0
data/lib/ucode/repo/aggregate_writer.rb +364 -0
data/lib/ucode/repo/atomic_writes.rb +48 -0
data/lib/ucode/repo/codepoint_writer.rb +96 -0
data/lib/ucode/repo/paths.rb +122 -0
data/lib/ucode/repo.rb +22 -0
data/lib/ucode/site/config_emitter.rb +124 -0
data/lib/ucode/site/generator.rb +178 -0
data/lib/ucode/site/search_index.rb +68 -0
data/lib/ucode/site/template/.gitignore +4 -0
data/lib/ucode/site/template/.vitepress/config.ts +8 -0
data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
data/lib/ucode/site/template/char/[codepoint].md +13 -0
data/lib/ucode/site/template/components/BlockView.vue +57 -0
data/lib/ucode/site/template/components/CharView.vue +85 -0
data/lib/ucode/site/template/components/PlaneView.vue +56 -0
data/lib/ucode/site/template/components/SearchView.vue +66 -0
data/lib/ucode/site/template/index.md +25 -0
data/lib/ucode/site/template/package.json +18 -0
data/lib/ucode/site/template/search.md +9 -0
data/lib/ucode/site.rb +13 -0
data/lib/ucode/version.rb +5 -0
data/lib/ucode/version_resolver.rb +76 -0
data/lib/ucode.rb +74 -0
data/ucode.gemspec +56 -0
metadata +404 -0

data/lib/ucode/coordinator.rb ADDED Viewed

@@ -0,0 +1,397 @@
+# frozen_string_literal: true
+require "pathname"
+require "ucode/parsers"
+require "ucode/models"
+module Ucode
+  # Orchestrates the UCD + Unihan parsers and produces per-codepoint
+  # CodePoint records for a downstream sink (a writer, an aggregator,
+  # a database builder).
+  #
+  # **Streaming architecture**:
+  #
+  #   1. Indices pass — load every range/point file into memory, keyed
+  #      by codepoint (hash) or sorted by `range_first` (bsearch).
+  #      Peak memory is ~10 MB of indices, NOT 160 k CodePoints.
+  #
+  #   2. Stream pass — `UnicodeData.each_record` drives the main loop.
+  #      For each yielded CodePoint, the Coordinator merges in data from
+  #      the indices, then yields to the sink. CodePoints are GC'd
+  #      after the sink processes them.
+  #
+  # Every data file is OPTIONAL — if a file is missing (partial fetch,
+  # incremental run), the corresponding indices stay empty and the
+  # matching CodePoint fields stay at their defaults. This makes the
+  # Coordinator resilient against partial fixtures and lets users run
+  # subsets.
+  class Coordinator
+    autoload :Indices, "ucode/coordinator/indices"
+    ISO_SCRIPT_PROPERTY = "sc".freeze
+    private_constant :ISO_SCRIPT_PROPERTY
+    attr_reader :config
+    def initialize(config = Ucode.configuration)
+      @config = config
+    end
+    # Stream-driven build. Calls `block` once per assigned codepoint.
+    def build(ucd_dir:, unihan_dir:, &block)
+      each_codepoint(ucd_dir: ucd_dir, unihan_dir: unihan_dir, &block)
+    end
+    # Iterates one enriched CodePoint per assigned codepoint. Returns a
+    # lazy Enumerator when called without a block.
+    def each_codepoint(ucd_dir:, unihan_dir:)
+      return enum_for(:each_codepoint, ucd_dir: ucd_dir, unihan_dir: unihan_dir) unless block_given?
+      indices = build_indices(ucd_dir, unihan_dir)
+      each_with_indices(ucd_dir: ucd_dir, unihan_dir: unihan_dir, indices: indices) do |cp|
+        yield cp
+      end
+      nil
+    end
+    # Like #each_codepoint but yields `(indices, cp)` so callers that
+    # need the indices for a post-stream flush (e.g. ParseCommand) can
+    # reuse them instead of re-building. Returns an Enumerator when no
+    # block is given.
+    def each_codepoint_with_indices(ucd_dir:, unihan_dir:)
+      unless block_given?
+        return enum_for(:each_codepoint_with_indices, ucd_dir: ucd_dir, unihan_dir: unihan_dir)
+      end
+      indices = build_indices(ucd_dir, unihan_dir)
+      each_with_indices(ucd_dir: ucd_dir, unihan_dir: unihan_dir, indices: indices) do |cp|
+        yield indices, cp
+      end
+      nil
+    end
+    # Build (and return) the Coordinator::Indices for the given UCD +
+    # Unihan dirs. Useful when the caller needs the indices separately
+    # from the streaming pass (e.g. AggregateWriter#flush).
+    def indices_for(ucd_dir:, unihan_dir:)
+      build_indices(ucd_dir, unihan_dir)
+    end
+    private
+    def each_with_indices(ucd_dir:, unihan_dir:, indices:)
+      unicode_data_path = Pathname.new(ucd_dir).join("UnicodeData.txt")
+      Parsers::UnicodeData.each_record(unicode_data_path) do |cp|
+        enrich(cp, indices)
+        yield cp
+      end
+    end
+    def build_indices(ucd_dir, unihan_dir)
+      property_value_aliases = property_value_aliases_index(ucd_dir)
+      Indices.new(
+        blocks: range_index(ucd_dir, "Blocks.txt", Parsers::Blocks),
+        scripts: scripts_index(ucd_dir, property_value_aliases),
+        property_value_aliases: property_value_aliases,
+        derived_age: cp_index(ucd_dir, "DerivedAge.txt", Parsers::DerivedAge, :cp),
+        binary_properties: multi_cp_index(ucd_dir, "DerivedCoreProperties.txt",
+                                         Parsers::DerivedCoreProperties),
+        script_extensions: multi_cp_index(ucd_dir, "ScriptExtensions.txt",
+                                          Parsers::ScriptExtensions, :cp),
+        bidi_mirroring: cp_index(ucd_dir, "BidiMirroring.txt",
+                                 Parsers::BidiMirroring, :codepoint),
+        bidi_brackets: cp_index(ucd_dir, "BidiBrackets.txt",
+                                Parsers::BidiBrackets, :codepoint),
+        special_casing: multi_cp_index(ucd_dir, "SpecialCasing.txt",
+                                       Parsers::SpecialCasing),
+        # CaseFolding: one cp can carry C, F, S, and T statuses; the
+        # Coordinator buckets each row into CodePoint::CaseFolding by
+        # status, so the index holds an Array per cp.
+        case_folding: multi_cp_index(ucd_dir, "CaseFolding.txt",
+                                     Parsers::CaseFolding, :codepoint),
+        name_aliases: multi_cp_index(ucd_dir, "NameAliases.txt",
+                                     Parsers::NameAliases),
+        # CJKRadicals maps a canonical ideograph (e.g. U+4E00) to its
+        # KangXi radical; the lookup key is the ideograph_id ("U+XXXX"),
+        # not the radical_number or the cjk_radical_id.
+        cjk_radicals: multi_cp_index_by_id(ucd_dir, "CJKRadicals.txt",
+                                           Parsers::CjkRadicals, :ideograph_id),
+        standardized_variants: multi_cp_index_by_id(ucd_dir, "StandardizedVariants.txt",
+                                                    Parsers::StandardizedVariants, :base_id),
+        names_list: names_list_index(ucd_dir),
+        unihan: unihan_index(unihan_dir)
+      )
+    end
+    # ---- Index builders -------------------------------------------------
+    def range_index(ucd_dir, filename, parser)
+      path = Pathname.new(ucd_dir).join(filename)
+      return [] unless path.exist?
+      parser.each_record(path).to_a.sort_by(&:range_first)
+    end
+    # Builds the sorted Script array and resolves each Script's ISO 15924
+    # code in one pass, using the pre-computed property_value_aliases map.
+    # This avoids re-resolving the alias on every per-cp lookup (160k ×
+    # hash lookup vs ~one lookup per Script range).
+    def scripts_index(ucd_dir, property_value_aliases)
+      path = Pathname.new(ucd_dir).join("Scripts.txt")
+      return [] unless path.exist?
+      Parsers::Scripts.each_record(path).map do |script|
+        script.code = property_value_aliases[script.name]
+        script
+      end.sort_by(&:range_first)
+    end
+    # Indexes by integer codepoint for parsers whose record exposes a
+    # `codepoint` integer accessor (or any method returning Integer).
+    def cp_index(ucd_dir, filename, parser, key_method)
+      path = Pathname.new(ucd_dir).join(filename)
+      return {} unless path.exist?
+      parser.each_record(path).each_with_object({}) do |record, h|
+        h[record.public_send(key_method)] = record
+      end
+    end
+    # Multi-valued index by integer codepoint. Each cp maps to an array
+    # of records (e.g. one cp can have several binary properties, several
+    # script extensions, several SpecialCasing rules).
+    def multi_cp_index(ucd_dir, filename, parser, key_method = :codepoint)
+      path = Pathname.new(ucd_dir).join(filename)
+      return {} unless path.exist?
+      parser.each_record(path).each_with_object(Hash.new { |h, k| h[k] = [] }) do |record, h|
+        h[record.public_send(key_method)] << record
+      end
+    end
+    # Multi-valued index keyed by a "U+XXXX" string id (e.g. standardized
+    # variants are keyed by base_id).
+    def multi_cp_index_by_id(ucd_dir, filename, parser, key_method)
+      path = Pathname.new(ucd_dir).join(filename)
+      return {} unless path.exist?
+      parser.each_record(path).each_with_object(Hash.new { |h, k| h[k] = [] }) do |record, h|
+        h[record.public_send(key_method)] << record
+      end
+    end
+    def property_value_aliases_index(ucd_dir)
+      path = Pathname.new(ucd_dir).join("PropertyValueAliases.txt")
+      return {} unless path.exist?
+      Parsers::PropertyValueAliases.each_record(path).each_with_object({}) do |pva, h|
+        next unless pva.property == ISO_SCRIPT_PROPERTY
+        h[pva.long] = pva.short
+      end
+    end
+    def names_list_index(ucd_dir)
+      path = Pathname.new(ucd_dir).join("NamesList.txt")
+      return {} unless path.exist?
+      Parsers::NamesList.each_record(path).each_with_object({}) do |entry, h|
+        h[entry.codepoint] = entry
+      end
+    end
+    def unihan_index(unihan_dir)
+      return {} if unihan_dir.nil?
+      dir = Pathname.new(unihan_dir)
+      return {} unless dir.exist?
+      by_field = Hash.new { |h, k| h[k] = {} }
+      Parsers::Unihan.each_in_dir(dir) do |record|
+        by_field[record.cp][record.field] = record.field_values
+      end
+      by_field.transform_values { |fields| Models::UnihanEntry.new(fields: fields) }
+    end
+    # ---- Per-codepoint enrichment --------------------------------------
+    def enrich(cp, indices)
+      cp.plane_number = cp.cp >> 16
+      cp.block_id = find_in_range(cp.cp, indices.blocks)&.id
+      assign_script(cp, indices)
+      assign_script_extensions(cp, indices)
+      assign_age(cp, indices)
+      assign_bidi(cp, indices)
+      assign_casing(cp, indices)
+      assign_case_folding(cp, indices)
+      assign_binary_properties(cp, indices)
+      assign_names_list(cp, indices)
+      assign_name_aliases(cp, indices)
+      assign_standardized_variants(cp, indices)
+      assign_unihan(cp, indices)
+      assign_cjk_radical(cp, indices)
+    end
+    def assign_script(cp, indices)
+      script = find_in_range(cp.cp, indices.scripts)
+      return unless script
+      cp.script_code = script.code || script.name
+    end
+    def assign_script_extensions(cp, indices)
+      tuples = indices.script_extensions[cp.cp]
+      return unless tuples && !tuples.empty?
+      tuples.each { |tuple| cp.script_extensions << tuple.script_code }
+    end
+    def assign_age(cp, indices)
+      record = indices.derived_age[cp.cp]
+      return unless record
+      cp.age = record.age
+    end
+    def assign_bidi(cp, indices)
+      mirroring = indices.bidi_mirroring[cp.cp]
+      brackets = indices.bidi_brackets[cp.cp]
+      return unless mirroring || brackets
+      cp.bidi ||= Models::CodePoint::Bidi.new
+      if mirroring
+        cp.bidi.mirroring_glyph_id = mirroring.mirrored_id
+      end
+      if brackets
+        cp.bidi.paired_bracket_type = brackets.type
+        cp.bidi.paired_bracket_id = brackets.paired_id
+      end
+    end
+    def assign_casing(cp, indices)
+      rules = indices.special_casing[cp.cp]
+      return unless rules && !rules.empty?
+      # NOTE: do not uniq the *_ids arrays — a mapping like U+00DF → "SS"
+      # legitimately contains two U+0053 entries and they must be
+      # preserved in order. Conditions, by contrast, are categorical
+      # tags (Final_Sigma, tr, After_I) and deduping them is correct.
+      cp.casing ||= Models::CodePoint::Casing.new
+      cp.casing.full_upper_ids = rules.flat_map(&:upper_ids)
+      cp.casing.full_lower_ids = rules.flat_map(&:lower_ids)
+      cp.casing.full_title_ids = rules.flat_map(&:title_ids)
+      cp.casing.conditions = rules.flat_map(&:conditions).uniq
+    end
+    def assign_case_folding(cp, indices)
+      rules = indices.case_folding[cp.cp]
+      return unless rules && !rules.empty?
+      cp.case_folding ||= Models::CodePoint::CaseFolding.new
+      rules.each do |rule|
+        case rule.status
+        when "C" then cp.case_folding.common_id = rule.mapping_ids.first
+        when "S" then cp.case_folding.simple_id = rule.mapping_ids.first
+        when "T" then cp.case_folding.turkic_id = rule.mapping_ids.first
+        when "F" then cp.case_folding.full_ids = rule.mapping_ids
+        end
+      end
+    end
+    def assign_binary_properties(cp, indices)
+      records = indices.binary_properties[cp.cp]
+      return unless records && !records.empty?
+      cp.binary_properties = records.map(&:property_short)
+    end
+    def assign_names_list(cp, indices)
+      entry = indices.names_list[cp.cp]
+      return unless entry
+      cp.names_list = entry
+      cp.relationships.concat(entry.cross_references)
+      cp.relationships.concat(entry.sample_sequences)
+      cp.relationships.concat(entry.compatibility_equivalents)
+      cp.relationships.concat(entry.informal_aliases)
+      cp.relationships.concat(entry.footnotes)
+    end
+    def assign_name_aliases(cp, indices)
+      aliases = indices.name_aliases[cp.cp]
+      return unless aliases && !aliases.empty?
+      aliases.each do |alias_record|
+        cp.relationships << Models::Relationship::InformalAlias.new(
+          description: alias_record.text,
+          source: "name_aliases"
+        )
+      end
+    end
+    def assign_standardized_variants(cp, indices)
+      variants = indices.standardized_variants[cp.id]
+      return unless variants && !variants.empty?
+      cp.standardized_variants = variants
+      variants.each do |variant|
+        cp.relationships << Models::Relationship::VariationSequence.new(
+          target_ids: [variant.base_id, variant.variation_selector_id],
+          description: variant.description,
+          contexts: variant.contexts,
+          source: "standardized_variants"
+        )
+      end
+    end
+    def assign_unihan(cp, indices)
+      entry = indices.unihan[cp.cp]
+      return unless entry
+      cp.unihan = entry
+    end
+    def assign_cjk_radical(cp, indices)
+      radicals = indices.cjk_radicals[cp.id]
+      return unless radicals && !radicals.empty?
+      radicals.each do |radical|
+        cp.relationships << Models::Relationship::CrossReference.new(
+          target_ids: [radical.cjk_radical_id],
+          description: "KangXi radical ##{radical.radical_number}",
+          source: "cjk_radicals"
+        )
+      end
+    end
+    # ---- Range lookup (bsearch) ----------------------------------------
+    # Finds the range-containing record in a sorted array via bsearch.
+    # Records respond to `range_first` and `range_last`.
+    #
+    # bsearch_index integer-mode convention: return -1 to search LEFT,
+    # +1 to search RIGHT, 0 for a match. `cp < range_first` means the
+    # target range lies in earlier (lower-indexed) records, so we return
+    # -1; `cp > range_last` means it lies in later records, so we return
+    # +1.
+    def find_in_range(cp, sorted_ranges)
+      return nil if sorted_ranges.nil? || sorted_ranges.empty?
+      idx = sorted_ranges.bsearch_index do |record|
+        if cp < record.range_first
+          -1
+        elsif cp > record.range_last
+          1
+        else
+          0
+        end
+      end
+      idx.nil? ? nil : sorted_ranges[idx]
+    end
+  end
+end

data/lib/ucode/database.rb ADDED Viewed

@@ -0,0 +1,214 @@
+# frozen_string_literal: true
+require "sqlite3"
+require "ucode/cache"
+require "ucode/error"
+require "ucode/range_entry"
+module Ucode
+  # SQLite-backed UCD lookup index for one Unicode version.
+  #
+  # One Database instance = one `.sqlite3` file at
+  # `Cache.sqlite_path(version)`. The DB holds two range tables
+  # (`blocks` and `scripts`), each pre-coalesced during build.
+  #
+  # Why SQLite (alongside the YAML Index):
+  #
+  # - Persistent across processes — build once, reuse across runs.
+  # - Btree-indexed queries load only the requested rows.
+  # - Small on disk (~hundreds of KB after coalescing).
+  #
+  # Lifecycle:
+  #
+  #   Database.build(version)   # streams Coordinator output → SQLite
+  #   Database.open(version)    # opens existing SQLite (read-only)
+  #   Database.cached?(version) # checks for .sqlite3 file
+  #
+  class Database
+    SCHEMA_VERSION = "1"
+    BLOCKS_TABLE = "blocks"
+    SCRIPTS_TABLE = "scripts"
+    private_constant :BLOCKS_TABLE, :SCRIPTS_TABLE
+    class << self
+      # Open an existing database. Raises DatabaseMissingError if the
+      # file is absent, DatabaseSchemaError if the on-disk schema
+      # version does not match `SCHEMA_VERSION`.
+      # @param version [String]
+      # @return [Database]
+      def open(version)
+        path = Cache.sqlite_path(version)
+        unless path.exist?
+          raise DatabaseMissingError.new(
+            "No UCD SQLite cache for version #{version.inspect} at #{path}",
+            context: { version: version, path: path.to_s },
+          )
+        end
+        db = new(path.to_s)
+        db.verify_schema_version!
+        db
+      end
+      # Stream the Coordinator output for `version` into a new SQLite
+      # cache, then open it. Replaces any existing file.
+      # @param version [String]
+      # @return [Database]
+      def build(version)
+        DbBuilder.build(version)
+        open(version)
+      end
+      # True if a built SQLite cache exists for this version.
+      # @param version [String]
+      # @return [Boolean]
+      def cached?(version)
+        Cache.sqlite_path(version).exist?
+      end
+    end
+    # @param path [String] path to the .sqlite3 file
+    def initialize(path)
+      @db = SQLite3::Database.new(path, readonly: true, results_as_hash: true)
+      @db.busy_timeout = 5000
+    end
+    # @return [String] the UCD version this DB was built from.
+    def ucd_version
+      @ucd_version ||= meta("ucd_version")
+    end
+    # @return [String] the schema version recorded at build time.
+    def schema_version
+      @schema_version ||= meta("schema_version")
+    end
+    # Look up the block name covering `codepoint`. nil if not in any
+    # known block (typically: cp is unassigned or outside the source
+    # fixture).
+    # @param codepoint [Integer]
+    # @return [String, nil]
+    def lookup_block(codepoint)
+      lookup(BLOCKS_TABLE, codepoint)
+    end
+    # Look up the script name covering `codepoint`. nil if not in any
+    # known script.
+    # @param codepoint [Integer]
+    # @return [String, nil]
+    def lookup_script(codepoint)
+      lookup(SCRIPTS_TABLE, codepoint)
+    end
+    # Enumerate every range in the blocks table that overlaps the
+    # inclusive query range, sorted by first_cp.
+    # @param first [Integer]
+    # @param last [Integer]
+    # @return [Enumerator<RangeEntry>] if no block given
+    def each_block_overlapping(first, last, &block)
+      each_overlapping(BLOCKS_TABLE, first, last, &block)
+    end
+    # Enumerate every range in the scripts table that overlaps the
+    # inclusive query range, sorted by first_cp.
+    # @param first [Integer]
+    # @param last [Integer]
+    # @return [Enumerator<RangeEntry>] if no block given
+    def each_script_overlapping(first, last, &block)
+      each_overlapping(SCRIPTS_TABLE, first, last, &block)
+    end
+    # All block ranges, sorted by first_cp. Mostly useful in specs.
+    # @return [Array<RangeEntry>]
+    def block_entries
+      entries(BLOCKS_TABLE)
+    end
+    # All script ranges, sorted by first_cp. Mostly useful in specs.
+    # @return [Array<RangeEntry>]
+    def script_entries
+      entries(SCRIPTS_TABLE)
+    end
+    # Every block range that shares the given block name. Empty for an
+    # unknown name. Used by the audit BlockAggregator to derive a
+    # block's assigned-codepoint set and span without a separate
+    # canonical-range lookup.
+    # @param name [String] block name as stored (e.g. "Basic_Latin")
+    # @return [Array<RangeEntry>] sorted by first_cp
+    def block_ranges_by_name(name)
+      ranges_by_name(BLOCKS_TABLE, name)
+    end
+    # Every script range that shares the given script code. Empty for an
+    # unknown name. Used by the audit ScriptAggregator.
+    # @param name [String] ISO 15924 script code (e.g. "Latn")
+    # @return [Array<RangeEntry>] sorted by first_cp
+    def script_ranges_by_name(name)
+      ranges_by_name(SCRIPTS_TABLE, name)
+    end
+    # Close the underlying SQLite connection. Idempotent.
+    # @return [void]
+    def close
+      @db.close
+    end
+    # Raises DatabaseSchemaError if the on-disk schema version does
+    # not match `SCHEMA_VERSION`. Called by `.open`; exposed for
+    # consumers that hold a long-lived connection.
+    # @return [void]
+    def verify_schema_version!
+      return if schema_version == SCHEMA_VERSION
+      raise DatabaseSchemaError.new(
+        "SQLite schema mismatch: on-disk #{schema_version.inspect}, " \
+        "expected #{SCHEMA_VERSION.inspect}",
+        context: { on_disk: schema_version, expected: SCHEMA_VERSION },
+      )
+    end
+    private
+    def meta(key)
+      @db.get_first_value(
+        "SELECT value FROM schema_meta WHERE key = ?",
+        [key.to_s],
+      )
+    end
+    def lookup(table, codepoint)
+      @db.get_first_value(
+        "SELECT name FROM #{table} WHERE first_cp <= ? AND last_cp >= ? LIMIT 1",
+        [codepoint, codepoint],
+      )
+    end
+    def each_overlapping(table, first, last)
+      return enum_for(:each_overlapping, table, first, last) unless block_given?
+      @db.execute(
+        "SELECT first_cp, last_cp, name FROM #{table} " \
+        "WHERE first_cp <= ? AND last_cp >= ? ORDER BY first_cp",
+        [last, first],
+      ).each do |row|
+        yield RangeEntry.new(row["first_cp"], row["last_cp"], row["name"])
+      end
+    end
+    def entries(table)
+      @db.execute(
+        "SELECT first_cp, last_cp, name FROM #{table} ORDER BY first_cp",
+      ).map { |row| RangeEntry.new(row["first_cp"], row["last_cp"], row["name"]) }
+    end
+    def ranges_by_name(table, name)
+      @db.execute(
+        "SELECT first_cp, last_cp, name FROM #{table} " \
+        "WHERE name = ? ORDER BY first_cp",
+        [name],
+      ).map { |row| RangeEntry.new(row["first_cp"], row["last_cp"], row["name"]) }
+    end
+  end
+end