RubyGems - ucode - Versions diffs - 0.1.0 - Mend

ucode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (228) hide show

checksums.yaml +7 -0
data/CLAUDE.md +211 -0
data/Gemfile +22 -0
data/Gemfile.lock +406 -0
data/README.md +469 -0
data/Rakefile +18 -0
data/TODO.new/00-README.md +66 -0
data/TODO.new/01-pillar-terminology-alignment.md +69 -0
data/TODO.new/02-audit-schema-design.md +255 -0
data/TODO.new/03-directory-output-spec.md +203 -0
data/TODO.new/04-fontist-org-contract.md +173 -0
data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
data/TODO.new/06-audit-namespace-skeleton.md +105 -0
data/TODO.new/07-audit-models-port.md +132 -0
data/TODO.new/08-extractors-cheap-port.md +113 -0
data/TODO.new/09-extractors-expensive-port.md +99 -0
data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
data/TODO.new/12-formatters-port.md +115 -0
data/TODO.new/13-directory-emitter.md +147 -0
data/TODO.new/14-html-face-browser.md +144 -0
data/TODO.new/15-html-library-browser.md +102 -0
data/TODO.new/16-cli-audit-subcommands.md +142 -0
data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
data/TODO.new/19-fontisan-docs-update.md +155 -0
data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
data/TODO.new/21-canonical-unicode17-build.md +148 -0
data/TODO.new/22-implementation-order.md +176 -0
data/UCODE_CHANGELOG.md +97 -0
data/exe/ucode +8 -0
data/lib/ucode/aggregator.rb +77 -0
data/lib/ucode/audit/block_aggregator.rb +90 -0
data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
data/lib/ucode/audit/context.rb +137 -0
data/lib/ucode/audit/discrepancy_detector.rb +213 -0
data/lib/ucode/audit/extractors/aggregations.rb +70 -0
data/lib/ucode/audit/extractors/base.rb +21 -0
data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
data/lib/ucode/audit/extractors/coverage.rb +55 -0
data/lib/ucode/audit/extractors/hinting.rb +199 -0
data/lib/ucode/audit/extractors/identity.rb +65 -0
data/lib/ucode/audit/extractors/licensing.rb +75 -0
data/lib/ucode/audit/extractors/metrics.rb +108 -0
data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
data/lib/ucode/audit/extractors/provenance.rb +34 -0
data/lib/ucode/audit/extractors/style.rb +88 -0
data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
data/lib/ucode/audit/extractors.rb +31 -0
data/lib/ucode/audit/plane_aggregator.rb +37 -0
data/lib/ucode/audit/registry.rb +63 -0
data/lib/ucode/audit/script_aggregator.rb +92 -0
data/lib/ucode/audit.rb +27 -0
data/lib/ucode/cache.rb +113 -0
data/lib/ucode/cli.rb +272 -0
data/lib/ucode/commands/build.rb +68 -0
data/lib/ucode/commands/cache.rb +46 -0
data/lib/ucode/commands/fetch.rb +62 -0
data/lib/ucode/commands/font_coverage.rb +57 -0
data/lib/ucode/commands/glyphs.rb +136 -0
data/lib/ucode/commands/lookup.rb +65 -0
data/lib/ucode/commands/parse.rb +62 -0
data/lib/ucode/commands/site.rb +33 -0
data/lib/ucode/commands.rb +19 -0
data/lib/ucode/config.rb +110 -0
data/lib/ucode/coordinator/indices.rb +34 -0
data/lib/ucode/coordinator.rb +397 -0
data/lib/ucode/database.rb +214 -0
data/lib/ucode/db_builder.rb +107 -0
data/lib/ucode/error.rb +96 -0
data/lib/ucode/fetch/code_charts.rb +57 -0
data/lib/ucode/fetch/http.rb +83 -0
data/lib/ucode/fetch/ucd_zip.rb +57 -0
data/lib/ucode/fetch/unihan_zip.rb +57 -0
data/lib/ucode/fetch.rb +14 -0
data/lib/ucode/glyphs/cell_extractor.rb +130 -0
data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
data/lib/ucode/glyphs/grid.rb +30 -0
data/lib/ucode/glyphs/grid_detector.rb +165 -0
data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
data/lib/ucode/glyphs/last_resort/source.rb +125 -0
data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
data/lib/ucode/glyphs/last_resort.rb +36 -0
data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
data/lib/ucode/glyphs/page_renderer.rb +221 -0
data/lib/ucode/glyphs/path_bbox.rb +62 -0
data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
data/lib/ucode/glyphs/real_fonts.rb +32 -0
data/lib/ucode/glyphs/writer.rb +250 -0
data/lib/ucode/glyphs.rb +27 -0
data/lib/ucode/index.rb +106 -0
data/lib/ucode/index_builder.rb +94 -0
data/lib/ucode/models/audit/audit_axis.rb +30 -0
data/lib/ucode/models/audit/audit_diff.rb +77 -0
data/lib/ucode/models/audit/audit_report.rb +137 -0
data/lib/ucode/models/audit/baseline.rb +32 -0
data/lib/ucode/models/audit/block_summary.rb +72 -0
data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
data/lib/ucode/models/audit/codepoint_range.rb +39 -0
data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
data/lib/ucode/models/audit/color_capabilities.rb +91 -0
data/lib/ucode/models/audit/discrepancy.rb +38 -0
data/lib/ucode/models/audit/duplicate_group.rb +23 -0
data/lib/ucode/models/audit/embedding_type.rb +81 -0
data/lib/ucode/models/audit/field_change.rb +28 -0
data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
data/lib/ucode/models/audit/gasp_range.rb +63 -0
data/lib/ucode/models/audit/hinting.rb +99 -0
data/lib/ucode/models/audit/library_summary.rb +40 -0
data/lib/ucode/models/audit/licensing.rb +48 -0
data/lib/ucode/models/audit/metrics.rb +111 -0
data/lib/ucode/models/audit/named_instance.rb +41 -0
data/lib/ucode/models/audit/opentype_layout.rb +38 -0
data/lib/ucode/models/audit/plane_summary.rb +31 -0
data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
data/lib/ucode/models/audit/script_features.rb +28 -0
data/lib/ucode/models/audit/script_summary.rb +54 -0
data/lib/ucode/models/audit/variation_detail.rb +42 -0
data/lib/ucode/models/audit.rb +50 -0
data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
data/lib/ucode/models/bidi_mirroring.rb +19 -0
data/lib/ucode/models/binary_property_assignment.rb +26 -0
data/lib/ucode/models/block.rb +36 -0
data/lib/ucode/models/case_folding_rule.rb +23 -0
data/lib/ucode/models/cjk_radical.rb +23 -0
data/lib/ucode/models/codepoint/bidi.rb +28 -0
data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
data/lib/ucode/models/codepoint/case_folding.rb +25 -0
data/lib/ucode/models/codepoint/casing.rb +32 -0
data/lib/ucode/models/codepoint/decomposition.rb +27 -0
data/lib/ucode/models/codepoint/display.rb +24 -0
data/lib/ucode/models/codepoint/emoji.rb +29 -0
data/lib/ucode/models/codepoint/hangul.rb +20 -0
data/lib/ucode/models/codepoint/identifier.rb +30 -0
data/lib/ucode/models/codepoint/indic.rb +20 -0
data/lib/ucode/models/codepoint/joining.rb +20 -0
data/lib/ucode/models/codepoint/normalization.rb +35 -0
data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
data/lib/ucode/models/codepoint.rb +122 -0
data/lib/ucode/models/name_alias.rb +21 -0
data/lib/ucode/models/named_sequence.rb +19 -0
data/lib/ucode/models/names_list_entry.rb +38 -0
data/lib/ucode/models/plane.rb +36 -0
data/lib/ucode/models/property_alias.rb +24 -0
data/lib/ucode/models/property_value_alias.rb +26 -0
data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
data/lib/ucode/models/relationship/cross_reference.rb +17 -0
data/lib/ucode/models/relationship/footnote.rb +24 -0
data/lib/ucode/models/relationship/informal_alias.rb +18 -0
data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
data/lib/ucode/models/relationship.rb +57 -0
data/lib/ucode/models/script.rb +41 -0
data/lib/ucode/models/special_casing_rule.rb +28 -0
data/lib/ucode/models/standardized_variant.rb +24 -0
data/lib/ucode/models/unihan_entry.rb +23 -0
data/lib/ucode/models.rb +47 -0
data/lib/ucode/parsers/auxiliary.rb +26 -0
data/lib/ucode/parsers/base.rb +137 -0
data/lib/ucode/parsers/bidi_brackets.rb +41 -0
data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
data/lib/ucode/parsers/blocks.rb +63 -0
data/lib/ucode/parsers/case_folding.rb +53 -0
data/lib/ucode/parsers/cjk_radicals.rb +102 -0
data/lib/ucode/parsers/derived_age.rb +59 -0
data/lib/ucode/parsers/derived_core_properties.rb +60 -0
data/lib/ucode/parsers/extracted_properties.rb +74 -0
data/lib/ucode/parsers/name_aliases.rb +44 -0
data/lib/ucode/parsers/named_sequences.rb +51 -0
data/lib/ucode/parsers/names_list.rb +250 -0
data/lib/ucode/parsers/property_aliases.rb +41 -0
data/lib/ucode/parsers/property_value_aliases.rb +46 -0
data/lib/ucode/parsers/script_extensions.rb +64 -0
data/lib/ucode/parsers/scripts.rb +60 -0
data/lib/ucode/parsers/special_casing.rb +62 -0
data/lib/ucode/parsers/standardized_variants.rb +56 -0
data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
data/lib/ucode/parsers/unicode_data.rb +268 -0
data/lib/ucode/parsers/unihan.rb +125 -0
data/lib/ucode/parsers.rb +35 -0
data/lib/ucode/range_entry.rb +58 -0
data/lib/ucode/repo/aggregate_writer.rb +364 -0
data/lib/ucode/repo/atomic_writes.rb +48 -0
data/lib/ucode/repo/codepoint_writer.rb +96 -0
data/lib/ucode/repo/paths.rb +122 -0
data/lib/ucode/repo.rb +22 -0
data/lib/ucode/site/config_emitter.rb +124 -0
data/lib/ucode/site/generator.rb +178 -0
data/lib/ucode/site/search_index.rb +68 -0
data/lib/ucode/site/template/.gitignore +4 -0
data/lib/ucode/site/template/.vitepress/config.ts +8 -0
data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
data/lib/ucode/site/template/char/[codepoint].md +13 -0
data/lib/ucode/site/template/components/BlockView.vue +57 -0
data/lib/ucode/site/template/components/CharView.vue +85 -0
data/lib/ucode/site/template/components/PlaneView.vue +56 -0
data/lib/ucode/site/template/components/SearchView.vue +66 -0
data/lib/ucode/site/template/index.md +25 -0
data/lib/ucode/site/template/package.json +18 -0
data/lib/ucode/site/template/search.md +9 -0
data/lib/ucode/site.rb +13 -0
data/lib/ucode/version.rb +5 -0
data/lib/ucode/version_resolver.rb +76 -0
data/lib/ucode.rb +74 -0
data/ucode.gemspec +56 -0
metadata +404 -0

data/TODO.new/20-canonical-resolver-4-tier.md ADDED Viewed

@@ -0,0 +1,182 @@
+# 20 — Canonical 4-tier resolver
+## Goal
+Wire the 4-tier glyph sourcing strategy into Mode 1's per-codepoint
+canonical dataset writer. For each assigned codepoint, the resolver
+tries Tier 1 → Pillar 1 → Pillar 2 → Pillar 3 in order and uses the
+first tier that produces a glyph.
+Today Mode 1 has the pillars (1-3) implemented but no Tier 1 hook, no
+config-driven font selection per block, and no priority-ordered
+resolver. This TODO builds the resolver.
+## Files to create
+- `lib/ucode/glyphs/resolver.rb` — the priority-ordered resolver.
+- `lib/ucode/glyphs/source_config.rb` — block → preferred Tier 1 font
+  config table.
+- `lib/ucode/glyphs/sources/`
+  - `tier1_real_font.rb` — wraps the existing RealFonts pipeline as a
+    resolver source.
+  - `pillar1_embedded_tounicode.rb` — wraps `EmbeddedFonts::Catalog`.
+  - `pillar2_correlator.rb` — wraps `ContentStreamCorrelator`.
+  - `pillar3_last_resort.rb` — wraps `LastResort`.
+- `lib/ucode/glyphs/source.rb` — common interface (`#fetch(codepoint)
+  → Result or nil`).
+- Specs for resolver + each source wrapper.
+## Source interface
+```ruby
+class Ucode::Glyphs::Source
+  Result = Struct.new(:tier, :codepoint, :svg, :provenance, keyword_init: true)
+  # @param codepoint [Integer]
+  # @return [Result, nil] nil if this source cannot produce a glyph
+  def fetch(codepoint)
+    raise NotImplementedError
+  end
+  # @return [String] e.g. "tier-1:noto-sans-sidetic", "pillar-1:embedded",
+  #                   "pillar-2:correlated", "pillar-3:last-resort"
+  def provenance
+    raise NotImplementedError
+  end
+end
+```
+Each tier is a `Source` subclass. The resolver holds an ordered array
+of sources and returns the first non-nil result.
+## Resolver behavior
+```ruby
+class Ucode::Glyphs::Resolver
+  DEFAULT_ORDER = %i[tier1 pillar1 pillar2 pillar3].freeze
+  def initialize(sources:, order: DEFAULT_ORDER)
+    @sources_by_tier = sources.group_by(&:tier)
+    @order = order
+  end
+  def resolve(codepoint)
+    @order.each do |tier|
+      Array(@sources_by_tier[tier]).each do |source|
+        result = source.fetch(codepoint)
+        return result if result
+      end
+    end
+    nil
+  end
+end
+```
+Sources can be plural per tier (e.g. multiple Tier 1 fonts covering
+different blocks). The resolver tries them in declared order.
+## Source config
+The block → Tier 1 font mapping lives in a config file, populated
+from the baseline audit in TODO 05:
+```yaml
+# config/unicode17_tier1_fonts.yml
+tier1_fonts:
+  Sidetic:
+    - label=Lentariso
+    - noto-sans-sidetic
+  Beria_Erfe:
+    - label=Kedebideri
+  Tai_Yo:
+    - label=NotoSerifTaiYo
+  Tolong_Siki:
+    - noto-sans-tolong-siki
+  # ...
+  CJK_Unified_Ideographs_Extension_J:
+    - label=FSung-1
+    - label=FSung-2
+    # ... FSung-1 through FSung-X
+    - noto-sans-cjk-jp
+```
+Block names are written verbatim in their Unicode form. Each entry is
+either a fontist-resolvable name (which fontist finds and installs) or
+a `label=/path` pair for a direct path (matching the existing
+`FontLocator` convention).
+The config is loaded at resolver construction time. Each block entry
+expands to one or more `Sources::Tier1RealFont` instances.
+## Pillar sources
+The pillar sources don't need per-block config — they auto-discover
+from the Code Charts PDF and the Last Resort UFO:
+- `Sources::Pillar1EmbeddedTounicode`: initialized with the Code Charts
+  PDF path; serves any codepoint in `Catalog#codepoints`.
+- `Sources::Pillar2Correlator`: initialized with correlator configs
+  (per TODO `lib/ucode/glyphs/embedded_fonts/catalog.rb`'s
+  `correlator_configs:` registry).
+- `Sources::Pillar3LastResort`: initialized with the Last Resort UFO
+  path; serves any codepoint the UFO has a `.glif` for.
+## Integration with Repo::CodepointWriter
+Mode 1's existing `Ucode::Repo::CodepointWriter` is updated to use the
+resolver:
+```ruby
+repo_writer = Ucode::Repo::CodepointWriter.new(
+  output_root: Pathname.new("output"),
+  resolver: Ucode::Glyphs::Resolver.new(sources: resolver_sources),
+  # ...
+)
+Ucode::Coordinator.new.each_codepoint(ucd_dir:, unihan_dir:) do |cp|
+  repo_writer.write_codepoint(cp)  # internally calls resolver.resolve(cp)
+end
+```
+The writer records `provenance` in the per-codepoint `index.json`
+under a new field, so the dataset is debuggable:
+```json
+{
+  "codepoint": 10980,
+  "name": "SIDETIC LETTER A",
+  ...
+  "glyph": {
+    "svg_path": "glyph.svg",
+    "source": {
+      "tier": "tier-1",
+      "provenance": "tier-1:lentariso"
+    }
+  }
+}
+```
+## Acceptance
+- Resolver returns a `Result` for every codepoint in the Unicode 17
+  baseline (no nils for assigned codepoints — Pillar 3 always catches
+  the tail).
+- Provenance is recorded per codepoint; running stats show e.g.
+  "Tier 1: 150,000 codepoints, Pillar 1: 3,000, Pillar 2: 800,
+  Pillar 3: 1,500".
+- A codepoint with no Tier 1 font configured (e.g. a private specimen
+  block) falls through to Pillar 1-2-3 cleanly without errors.
+- Re-running with an updated Tier 1 config (e.g. a new font added for
+  Sidetic) re-resolves and rewrites only the affected codepoints.
+- All specs use real font fixtures (the existing
+  `spec/fixtures/fonts/`); no `double()`.
+- Rubocop clean.
+## References
+- Architecture: `docs/architecture.md` §"The 4-tier glyph sourcing strategy"
+- Existing Tier 1: `lib/ucode/glyphs/real_fonts/`
+- Existing Pillar 1: `lib/ucode/glyphs/embedded_fonts/catalog.rb`
+- Existing Pillar 2: `lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb`
+- Existing Pillar 3: `lib/ucode/glyphs/last_resort/`
+- Baseline data: `TODO.new/05-baseline-unicode17-coverage-audit.md`
+- Mode 1 writer: `lib/ucode/repo/codepoint_writer.rb`

data/TODO.new/21-canonical-unicode17-build.md ADDED Viewed

@@ -0,0 +1,148 @@
+# 21 — Canonical Unicode 17 dataset build
+## Goal
+Produce a complete Unicode 17 Mode 1 dataset end-to-end. Every assigned
+codepoint gets `index.json` (UCD properties, NamesList relationships,
+Unihan readings) + canonical `glyph.svg` (sourced via the 4-tier
+resolver from TODO 20).
+This is the integration test for the entire Mode 1 pipeline. It also
+produces the dataset that ships to consumers (Vitepress site,
+downloads, etc.).
+## Scope
+Run the full Mode 1 build against Unicode 17.0:
+```bash
+bin/ucode fetch ucd --version 17.0.0
+bin/ucode fetch unihan --version 17.0.0
+bin/ucode fetch charts --version 17.0.0
+bin/ucode parse --version 17.0.0
+bin/ucode glyphs --version 17.0.0 --include-glyphs
+bin/ucode site build    # optional: also build the Vitepress site
+```
+The deliverable is the `output/` tree plus a build-report.json
+summarizing what got built, what got skipped, and what failed.
+## Pre-conditions
+All of these must be in place before this TODO runs:
+1. PR #1 (`tier1-cmap-audit`) merged.
+2. TODOs 01, 05, 20 complete (pillar alignment, baseline audit, resolver).
+3. Tier 1 fonts downloaded into `data/fonts/` per the baseline audit's
+   recommendations (TODO 05 deliverable).
+4. Code Charts PDFs downloaded into `data/pdfs/` (per-block).
+5. Last Resort UFO cloned into `data/last-resort-font/`.
+## Build report
+The build emits `output/build-report.json`:
+```json
+{
+  "unicode_version": "17.0.0",
+  "ucode_version": "0.2.0",
+  "generated_at": "2026-07-01T12:00:00Z",
+  "totals": {
+    "codepoints_assigned": 150012,
+    "codepoints_built": 150012,
+    "codepoints_skipped": 0,
+    "codepoints_failed": 0
+  },
+  "by_tier": {
+    "tier-1": 150012,
+    "pillar-1": 3000,
+    "pillar-2": 800,
+    "pillar-3": 1500
+  },
+  "by_block": [
+    { "name": "Basic Latin", "assigned": 128, "built": 128,
+      "tier_breakdown": { "tier-1": 128 } },
+    { "name": "Sidetic", "assigned": 26, "built": 26,
+      "tier_breakdown": { "tier-1": 26 } },
+    ...
+  ],
+  "failures": []
+}
+```
+The `by_tier` counts overlap: a codepoint that was attempted via Tier 1
+but fell through to Pillar 1 is counted under both tiers. In contrast,
+each codepoint counts toward `built` exactly once, attributed to the
+tier that actually produced its glyph.
+## Validation
+After the build:
+1. **Completeness check**: every codepoint in the Unicode 17 baseline
+   has a `glyph.svg`. Any missing is a bug.
+2. **Schema check**: every `index.json` deserializes via
+   `Ucode::Models::CodePoint.from_hash`.
+3. **Provenance sanity**: no codepoint is missing the
+   `glyph.source.tier` field.
+4. **Sample inspection**: spot-check 20 codepoints across different
+   tiers and visually verify the SVG renders correctly (manual).
+5. **Block coverage**: per-block built count matches the baseline
+   audit's per-block coverage (TODO 05).
+## Performance targets
+- Total build time: under 4 hours on a single machine (target).
+  The 4,298 CJK Extension J codepoints dominate; parallelize via
+  `--parallel N` (default is `Ucode.configuration.parallel_workers`).
+- Disk usage: under 50 GB for the full Unicode 17 dataset (target).
+  Each codepoint's `index.json` averages ~3KB; glyph SVG averages
+  ~2KB. 150k codepoints × 5KB ≈ 750MB core data; rest is indexes,
+  relationships, manifest, site build.
+- Idempotency: re-running the build after a no-op source change
+  produces zero file writes (per `CLAUDE.md` idempotency rule).
+## Release gating
+The dataset produced by this TODO is what gets published. Before
+publishing:
+- All validation checks above pass.
+- Spot inspection by the user (sign-off required).
+- Build report summary committed to the repo for traceability:
+  `output/build-report.json` itself is gitignored (it lives under
+  `/output/`), so copy a summary into
+  `docs/build-reports/<date>-unicode17.md` for the permanent record.
+The published artifacts:
+- Static dataset: `output/` tarballed and uploaded to GitHub releases.
+- Vitepress site: built from `output/` and deployed to the site host.
+- Per-block PDFs and Last Resort UFO NOT included in the dataset
+  release — they're build inputs, not outputs.
+## Acceptance
+- Full Unicode 17 build completes without errors.
+- `output/build-report.json` shows `codepoints_built ==
+  codepoints_assigned` (zero failures, zero skips).
+- 10 random codepoints across different blocks have valid `glyph.svg`
+  files that render correctly.
+- Per-block tier breakdown matches the baseline audit (TODO 05).
+- Idempotency verified: re-running the build produces zero writes.
+- Dataset size and build time within targets (or documented
+  exceptions).
+## Out of scope
+- The audit migration (TODOs 06-19). Mode 1 doesn't depend on Mode 2.
+- The fontist.org data feed (separate effort; consumes Mode 2 audits).
+- Site deployment automation (separate effort).
+## References
+- Architecture: `docs/architecture.md` §"Mode 1 — canonical Unicode dataset"
+- Resolver: `TODO.new/20-canonical-resolver-4-tier.md`
+- Baseline data: `TODO.new/05-baseline-unicode17-coverage-audit.md`
+- Existing pipeline: `lib/ucode/repo/codepoint_writer.rb`,
+  `lib/ucode/coordinator.rb`
+- Build commands: `CLAUDE.md` §"Build / test commands"

data/TODO.new/22-implementation-order.md ADDED Viewed

@@ -0,0 +1,176 @@
+# 22 — Implementation order
+## Goal
+Sequence the TODOs in this directory so dependencies flow correctly
+and each track lands as a reviewable PR. Update this file when the
+sequence changes — it's the canonical answer to "what comes next".
+## Sequencing principles
+- **Schema and contract first.** Lock the data shape before porting
+  code that produces or consumes it. TODOs 01-04 land before any
+  porting TODO.
+- **Measure before optimizing.** TODO 05 (baseline audit) informs
+  TODO 20 (resolver config) and TODO 21 (build verification). It
+  doesn't block porting work — porting can start in parallel — but
+  its deliverable must exist before TODO 20 ships.
+- **One PR per TODO** unless tightly coupled. Each track is one
+  branch, one PR, one merge.
+- **Migration order: port → wire → cleanup.** Don't delete fontisan
+  code until ucode's equivalent is shipped and proven. TODOs 17-19
+  land only after TODOs 06-16 are merged and fontist.org has
+  validated the new contract.
+## Dependency graph
+```
+01 pillar-terminology-alignment ─── standalone, ship anytime
+02 audit-schema-design ────────────┐
+03 directory-output-spec ──────────┤
+04 fontist-org-contract ───────────┘
+                                   │
+                                   ▼
+05 baseline-unicode17-coverage-audit ───┐
+                                        │
+06 audit-namespace-skeleton ────────────┤
+                                        │
+07 audit-models-port ───────────────────┤
+                                        │
+08 extractors-cheap-port ───────────────┤
+                                        │
+09 extractors-expensive-port ───────────┤
+                                        │
+10 aggregations-ucd-rewrite ────────────┤
+                                        │
+11 differ-and-library-auditor-port ─────┤
+                                        │
+12 formatters-port ─────────────────────┤
+                                        │
+13 directory-emitter ───────────────────┤
+                                        │
+14 html-face-browser ───────────────────┤
+                                        │
+15 html-library-browser ────────────────┤
+                                        │
+16 cli-audit-subcommands ───────────────┘
+                                   │
+                                   ▼
+17 fontisan-cleanup-audit ──┐
+18 fontisan-cleanup-ucd  ───┴── after 16 validated in production
+19 fontisan-docs-update ──────  after 17 + 18
+20 canonical-resolver-4-tier ──── after 05 (needs baseline data)
+                                 │
+                                 ▼
+21 canonical-unicode17-build ──── after 20
+```
+## Recommended PR sequence
+### Track A — Alignment & contract (parallel-safe, ship first)
+- PR-A1: TODO 01 (pillar terminology). One commit. No deps.
+- PR-A2: TODOs 02 + 03 + 04 (schema, layout, contract). One PR; these
+  three define a single contract and are easier to review together.
+### Track B — Baseline measurement (parallel with Track A)
+- PR-B1: TODO 05 (baseline audit). Long-running — depends on
+  acquiring fonts, running cmaps, building the report. Can start
+  the moment PR #1 (`tier1-cmap-audit`) merges; doesn't block
+  Tracks C-D.
+### Track C — Audit migration (strict sequence)
+Each PR builds on the previous. Don't skip ahead.
+- PR-C1: TODOs 06 + 07 (skeleton + models). One PR. Pure data;
+  nothing runs yet.
+- PR-C2: TODO 08 (cheap extractors). Brief-mode audits work after
+  this.
+- PR-C3: TODO 09 (expensive extractors). Full-mode audits work, minus
+  aggregations.
+- PR-C4: TODO 10 (aggregations rewrite). Full audit produces real
+  coverage data.
+- PR-C5: TODOs 11 + 12 (differ + formatters). Diff and text output.
+- PR-C6: TODO 13 (directory emitter). JSON output to disk.
+- PR-C7: TODOs 14 + 15 (HTML browsers).
+- PR-C8: TODO 16 (CLI subcommands). End-user-facing.
+After PR-C8, ucode's audit is feature-complete and producing real
+data.
+### Track D — Fontisan cleanup (after Track C + production validation)
+- PR-D1: TODOs 17 + 18 + 19 (cleanup + docs). One PR per fontisan
+  repo; do this only after ucode's audit has been the source of
+  truth for at least one release cycle.
+### Track E — Canonical Mode 1 alignment (after Track B)
+- PR-E1: TODO 20 (4-tier resolver).
+- PR-E2: TODO 21 (Unicode 17 full build). The integration test.
+## Acceptance gates per PR
+Every PR in this directory must:
+- Pass GHA on Ruby 3.1, 3.2, 3.3, 3.4.
+- Pass `bundle exec rubocop` on new and modified files.
+- Pass `bundle exec rspec` for new and affected specs.
+- Add or update specs covering new behavior.
+- No `double()` in any spec.
+- No `def to_h` / `from_h` / `to_json` / `from_json` anywhere.
+- No AI attribution in commits, PRs, or docs.
+- Update `docs/architecture.md` if the architecture shifts.
+- Update this file (TODO 22) if the sequence changes.
+## Smoke tests per track
+After each milestone PR below merges, run a smoke test against a real fixture:
+- After PR-C2 (cheap extractors): `ucode audit font spec/fixtures/fonts/MonaSans-Regular.ttf --brief`
+  produces a face report with identity + coverage totals.
+- After PR-C4 (aggregations): same command without `--brief` produces
+  full block + script coverage for the fixture font.
+- After PR-C6 (emitter): `--output /tmp/audit-test/` writes the
+  directory tree; re-run produces zero writes.
+- After PR-C8 (CLI): full audit + library + compare + browser all
+  work end-to-end.
+- After PR-E2 (canonical build): full Unicode 17 dataset exists,
+  validation passes, build report committed.
+## Cross-cutting concerns
+### Performance
+Track ucode's parse + audit performance per release. Target: full
+Unicode 17 build under 4 hours; single-font audit under 5 seconds for
+typical Latin fonts, under 30 seconds for CJK. Document regressions in
+`docs/performance.md`.
+### Documentation
+Every user-facing PR (CLI changes, schema changes, output layout
+changes) updates:
+- `docs/architecture.md` if shape changes.
+- `docs/guide/` if user workflow changes.
+- `CHANGELOG.md` (new file — create if missing) for any
+  user-visible change.
+- `TODO.new/00-README.md` checkmark when a TODO completes.
+### Memory
+When this directory's work is done (all TODOs checked off), move the
+directory to `TODO.done/2026H2-audit-migration/` (or similar) so the
+next planning cycle starts with a clean `TODO.new/`. Don't delete —
+the historical record is valuable.
+## References
+- Architecture: `docs/architecture.md`
+- Global rules: `~/.claude/CLAUDE.md`, `CLAUDE.md`
+- Existing TODO structure: `TODO/` (v0.1 historical record)
+- Memory files: `/Users/mulgogi/.claude/projects/-Users-mulgogi-src-fontist-ucode/memory/`

data/UCODE_CHANGELOG.md ADDED Viewed

@@ -0,0 +1,97 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [Unreleased]
+## [0.1.0] - 2026-06-25
+### Highlights
+First public release. The JSON dataset pipeline, SQLite lookup index, and
+Vitepress site generator are production-ready. **SVG glyph extraction from
+the Code Charts PDFs is experimental and gated behind an opt-in flag** —
+see "Deferred" below.
+### Added
+- **Foundation**: `Ucode::Config`, `Ucode::Cache`, `Ucode::VersionResolver`,
+  `Ucode::Error` hierarchy with structured `context:` payloads.
+- **Fetchers**: `Ucode::Fetch::{UcdZip,UnihanZip,CodeCharts,Http}` with
+  retries, timeouts, and XDG-compliant cache layout.
+- **Models (lutaml-model)**: `Plane`, `Block`, `Script`, `CodePoint` with
+  nested sub-models (`Bidi`, `Casing`, `CaseFolding`, `Display`,
+  `Segmentation`, `Hangul`, `Indic`, `Emoji`, `Identifier`,
+  `Normalization`, `Joining`); polymorphic `Relationship` hierarchy
+  (`CrossReference`, `SeeAlso`, `CompatibilityEquivalent`,
+  `SampleSequence`, `InformalAlias`, `Footnote`, `VariationSequence`);
+  `UnihanEntry`, `NamedSequence`, `StandardizedVariant`, `CjkRadical`,
+  `SpecialCasingRule`, `CaseFoldingRule`, `BidiBracketPair`, `NameAlias`,
+  `PropertyAlias`, `PropertyValueAlias`.
+- **Parsers (streaming)**: one per UCD text file — `UnicodeData`,
+  `Blocks`, `Scripts`, `ScriptExtensions`, `PropertyAliases`,
+  `PropertyValueAliases`, `NameAliases`, `NamedSequences`,
+  `SpecialCasing`, `CaseFolding`, `BidiMirroring`, `BidiBrackets`,
+  `CjkRadicals`, `StandardizedVariants`, `NamesList` (state-machine),
+  `DerivedAge`, `DerivedCoreProperties`, `ExtractedProperties`,
+  `Auxiliary` (10 files), `Unihan` (8 files).
+- **Coordinator**: streaming single-pass enrichment, `Coordinator::Indices`
+  struct of every loaded index.
+- **Indices**: `Ucode::Index` (YAML bsearch, dependency-free),
+  `Ucode::Database` (SQLite, persistent), `Ucode::DbBuilder`,
+  `Ucode::IndexBuilder`, `Ucode::RangeEntry`.
+- **Aggregator**: `aggregate_blocks`, `aggregate_scripts` — pure
+  transformations over `Enumerable<Integer>` + `Index`.
+- **Repo writers**: `Repo::Paths` (path conventions),
+  `Repo::AtomicWrites` (byte-compared atomic writes),
+  `Repo::CodepointWriter` (streaming + threaded per-cp JSON),
+  `Repo::AggregateWriter` (planes, blocks, scripts, indexes,
+  relationships, enums, named sequences, manifest).
+- **Site**: `Site::Generator` (init + build), `Site::ConfigEmitter`
+  (`config.ts` from output tree), `Site::SearchIndex` (MiniSearch
+  payload), Vitepress template with Vue components (`PlaneView`,
+  `BlockView`, `CharView`, `SearchView`), dynamic `char/[codepoint]`
+  route.
+- **CLI**: `bin/ucode` Thor CLI with `fetch`, `parse`, `glyphs`,
+  `site`, `lookup`, `cache`, `build`, `version` subcommands. Each
+  command delegates to a pure `Commands::*Command` class.
+- **Docs**: `README.md`, `docs/FONTISAN_MIGRATION.md`.
+### Deferred (v0.2)
+- **Per-codepoint SVG glyph extraction is experimental.** The
+  `Ucode::Glyphs` pipeline shipped in v0.1 (`PdfFetcher`, `PageRenderer`,
+  `GridDetector`, `CellExtractor`, `Writer`, `MonolithPageMap`) is fully
+  implemented and tested, but the Code Charts PDFs composite the
+  cell-border decorations and the actual character outline into a single
+  glyph definition, so the current `CellExtractor` output includes both.
+  The CLI gates the step behind `--include-glyphs` (default off) and prints
+  a warning.
+- **v0.2 strategy — two pillars that bypass the cell extractor entirely:**
+  1. **Real character glyphs** are read straight from the subsetted fonts
+     embedded in `CodeCharts.pdf` (the `Uni*`/`UCS*`-prefixed per-block
+     fonts). Each font program contains only the character outline — the
+     cell-border decoration is page content, not part of the glyph — so
+     extracting the font stream + walking the ToUnicode CMap yields clean
+     per-codepoint SVGs without any post-processing of composite paths.
+  2. **Last Resort placeholders** (unassigned, noncharacter, PUA
+     codepoints) are rendered directly from the
+     [Last Resort Font](https://github.com/unicode-org/last-resort-font)
+     UFO source (380 `.glif` files + Format 13 `cmap`), matching the
+     placeholder box the Code Charts actually display.
+- The v0.1 cell-position resolution (`GridDetector` +
+  `CellExtractor#find_use_at`) is correct and is retained as the
+  authoritative cell→codepoint map; only the rendering path is replaced.
+### Tooling
+- `rubocop`, `rubocop-rspec`, `rubocop-performance`, `rubocop-rake` for
+  lint; `rspec` for tests; `simplecov` for coverage (94%+ line coverage,
+  80% minimum enforced).
+- 580+ specs covering every public API.
+[Unreleased]: https://github.com/fontist/ucode/compare/v0.1.0...HEAD
+[0.1.0]: https://github.com/fontist/ucode/releases/tag/v0.1.0

data/exe/ucode ADDED Viewed

@@ -0,0 +1,8 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

# Command-line launcher for the gem: puts the sibling `lib/` directory at
# the front of the load path so the checked-out sources are the ones that
# get required, then passes the raw argument vector to the CLI class.
lib_dir = File.expand_path("../lib", __dir__)
$LOAD_PATH.unshift(lib_dir)

require "ucode"

Ucode::Cli.start(ARGV)

data/lib/ucode/aggregator.rb ADDED Viewed

@@ -0,0 +1,77 @@
# frozen_string_literal: true

module Ucode
  # Coverage analysis over codepoint sets.
  #
  # Pure transformations: given a collection of codepoints and an
  # `Index` (blocks or scripts), return aggregated summaries. No I/O,
  # no mutation of inputs, no global state.
  #
  # Open/closed principle: new aggregation kinds (planes, categories, ...)
  # slot in as new methods without altering existing ones.
  module Aggregator
    # Summary of how many codepoints of one block are present in a
    # given input set. Plain Struct — Ruby's built-in `to_h` covers
    # any serialization needs.
    #
    # Members:
    # - `name`, `first_cp`, `last_cp`: copied from the index entry
    # - `total`: the entry's `size`
    # - `covered`: number of distinct input codepoints inside
    #   `first_cp..last_cp`
    # - `fill_ratio`: `covered / total` as a Float (0.0 when `total` is 0)
    # - `complete`: true when `covered == total`
    BlockSummary = Struct.new(
      :name,
      :first_cp,
      :last_cp,
      :total,
      :covered,
      :fill_ratio,
      :complete,
      keyword_init: true,
    )

    class << self
      # Builds one coverage summary per block.
      #
      # The input is de-duplicated before counting, so a codepoint that
      # appears several times is counted once. Without that, repeats would
      # inflate `covered`, push `fill_ratio` above 1.0 and make `complete`
      # unreliable.
      #
      # @param codepoints [Enumerable<Integer>] codepoints to measure; may be
      #   unsorted and may contain duplicates. The receiver is not modified.
      # @param blocks_index [Ucode::Index] any object whose `map` yields
      #   entries responding to `name`, `first_cp`, `last_cp` and `size`
      # @return [Array<BlockSummary>] one summary per block in the index,
      #   in the order the index yields its entries
      def aggregate_blocks(codepoints, blocks_index)
        sorted = codepoints.uniq.sort
        blocks_index.map { |entry| build_block_summary(entry, sorted) }
      end

      # Collects the scripts that the given codepoints belong to.
      #
      # Codepoints for which `scripts_index.lookup` returns nil (or false)
      # are skipped.
      #
      # @param codepoints [Enumerable<Integer>]
      # @param scripts_index [Ucode::Index] any object responding to
      #   `lookup(cp)`
      # @return [Array<String>] sorted unique script names covering the
      #   given codepoints
      def aggregate_scripts(codepoints, scripts_index)
        codepoints.filter_map { |cp| scripts_index.lookup(cp) }.uniq.sort
      end

      private

      # Everything below is an implementation detail of the public
      # aggregations above and is not callable from outside the module.

      # Builds the summary for a single index entry.
      #
      # @param entry [#name, #first_cp, #last_cp, #size]
      # @param sorted_cps [Array<Integer>] ascending, duplicate-free
      # @return [BlockSummary]
      def build_block_summary(entry, sorted_cps)
        covered = count_in_range(sorted_cps, entry.first_cp, entry.last_cp)
        total = entry.size
        BlockSummary.new(
          name: entry.name,
          first_cp: entry.first_cp,
          last_cp: entry.last_cp,
          total: total,
          covered: covered,
          # Guard the division so a zero-sized entry reports 0.0, not NaN.
          fill_ratio: total.zero? ? 0.0 : (covered.to_f / total),
          complete: covered == total,
        )
      end

      # Count of sorted cps in the inclusive [first, last] range, in O(log N)
      # via two binary searches.
      def count_in_range(sorted, first, last)
        upper_bound(sorted, last) - lower_bound(sorted, first)
      end

      # Index of the first cp >= value (or sorted.size if none).
      def lower_bound(sorted, value)
        sorted.bsearch_index { |cp| cp >= value } || sorted.size
      end

      # Index of the first cp > value (or sorted.size if none).
      def upper_bound(sorted, value)
        sorted.bsearch_index { |cp| cp > value } || sorted.size
      end
    end
  end
end