ucode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +211 -0
  3. data/Gemfile +22 -0
  4. data/Gemfile.lock +406 -0
  5. data/README.md +469 -0
  6. data/Rakefile +18 -0
  7. data/TODO.new/00-README.md +66 -0
  8. data/TODO.new/01-pillar-terminology-alignment.md +69 -0
  9. data/TODO.new/02-audit-schema-design.md +255 -0
  10. data/TODO.new/03-directory-output-spec.md +203 -0
  11. data/TODO.new/04-fontist-org-contract.md +173 -0
  12. data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
  13. data/TODO.new/06-audit-namespace-skeleton.md +105 -0
  14. data/TODO.new/07-audit-models-port.md +132 -0
  15. data/TODO.new/08-extractors-cheap-port.md +113 -0
  16. data/TODO.new/09-extractors-expensive-port.md +99 -0
  17. data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
  18. data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
  19. data/TODO.new/12-formatters-port.md +115 -0
  20. data/TODO.new/13-directory-emitter.md +147 -0
  21. data/TODO.new/14-html-face-browser.md +144 -0
  22. data/TODO.new/15-html-library-browser.md +102 -0
  23. data/TODO.new/16-cli-audit-subcommands.md +142 -0
  24. data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
  25. data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
  26. data/TODO.new/19-fontisan-docs-update.md +155 -0
  27. data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
  28. data/TODO.new/21-canonical-unicode17-build.md +148 -0
  29. data/TODO.new/22-implementation-order.md +176 -0
  30. data/UCODE_CHANGELOG.md +97 -0
  31. data/exe/ucode +8 -0
  32. data/lib/ucode/aggregator.rb +77 -0
  33. data/lib/ucode/audit/block_aggregator.rb +90 -0
  34. data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
  35. data/lib/ucode/audit/context.rb +137 -0
  36. data/lib/ucode/audit/discrepancy_detector.rb +213 -0
  37. data/lib/ucode/audit/extractors/aggregations.rb +70 -0
  38. data/lib/ucode/audit/extractors/base.rb +21 -0
  39. data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
  40. data/lib/ucode/audit/extractors/coverage.rb +55 -0
  41. data/lib/ucode/audit/extractors/hinting.rb +199 -0
  42. data/lib/ucode/audit/extractors/identity.rb +65 -0
  43. data/lib/ucode/audit/extractors/licensing.rb +75 -0
  44. data/lib/ucode/audit/extractors/metrics.rb +108 -0
  45. data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
  46. data/lib/ucode/audit/extractors/provenance.rb +34 -0
  47. data/lib/ucode/audit/extractors/style.rb +88 -0
  48. data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
  49. data/lib/ucode/audit/extractors.rb +31 -0
  50. data/lib/ucode/audit/plane_aggregator.rb +37 -0
  51. data/lib/ucode/audit/registry.rb +63 -0
  52. data/lib/ucode/audit/script_aggregator.rb +92 -0
  53. data/lib/ucode/audit.rb +27 -0
  54. data/lib/ucode/cache.rb +113 -0
  55. data/lib/ucode/cli.rb +272 -0
  56. data/lib/ucode/commands/build.rb +68 -0
  57. data/lib/ucode/commands/cache.rb +46 -0
  58. data/lib/ucode/commands/fetch.rb +62 -0
  59. data/lib/ucode/commands/font_coverage.rb +57 -0
  60. data/lib/ucode/commands/glyphs.rb +136 -0
  61. data/lib/ucode/commands/lookup.rb +65 -0
  62. data/lib/ucode/commands/parse.rb +62 -0
  63. data/lib/ucode/commands/site.rb +33 -0
  64. data/lib/ucode/commands.rb +19 -0
  65. data/lib/ucode/config.rb +110 -0
  66. data/lib/ucode/coordinator/indices.rb +34 -0
  67. data/lib/ucode/coordinator.rb +397 -0
  68. data/lib/ucode/database.rb +214 -0
  69. data/lib/ucode/db_builder.rb +107 -0
  70. data/lib/ucode/error.rb +96 -0
  71. data/lib/ucode/fetch/code_charts.rb +57 -0
  72. data/lib/ucode/fetch/http.rb +83 -0
  73. data/lib/ucode/fetch/ucd_zip.rb +57 -0
  74. data/lib/ucode/fetch/unihan_zip.rb +57 -0
  75. data/lib/ucode/fetch.rb +14 -0
  76. data/lib/ucode/glyphs/cell_extractor.rb +130 -0
  77. data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
  78. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
  79. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
  80. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
  81. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
  82. data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
  83. data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
  84. data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
  85. data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
  86. data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
  87. data/lib/ucode/glyphs/grid.rb +30 -0
  88. data/lib/ucode/glyphs/grid_detector.rb +165 -0
  89. data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
  90. data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
  91. data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
  92. data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
  93. data/lib/ucode/glyphs/last_resort/source.rb +125 -0
  94. data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
  95. data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
  96. data/lib/ucode/glyphs/last_resort.rb +36 -0
  97. data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
  98. data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
  99. data/lib/ucode/glyphs/page_renderer.rb +221 -0
  100. data/lib/ucode/glyphs/path_bbox.rb +62 -0
  101. data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
  102. data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
  103. data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
  104. data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
  105. data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
  106. data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
  107. data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
  108. data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
  109. data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
  110. data/lib/ucode/glyphs/real_fonts.rb +32 -0
  111. data/lib/ucode/glyphs/writer.rb +250 -0
  112. data/lib/ucode/glyphs.rb +27 -0
  113. data/lib/ucode/index.rb +106 -0
  114. data/lib/ucode/index_builder.rb +94 -0
  115. data/lib/ucode/models/audit/audit_axis.rb +30 -0
  116. data/lib/ucode/models/audit/audit_diff.rb +77 -0
  117. data/lib/ucode/models/audit/audit_report.rb +137 -0
  118. data/lib/ucode/models/audit/baseline.rb +32 -0
  119. data/lib/ucode/models/audit/block_summary.rb +72 -0
  120. data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
  121. data/lib/ucode/models/audit/codepoint_range.rb +39 -0
  122. data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
  123. data/lib/ucode/models/audit/color_capabilities.rb +91 -0
  124. data/lib/ucode/models/audit/discrepancy.rb +38 -0
  125. data/lib/ucode/models/audit/duplicate_group.rb +23 -0
  126. data/lib/ucode/models/audit/embedding_type.rb +81 -0
  127. data/lib/ucode/models/audit/field_change.rb +28 -0
  128. data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
  129. data/lib/ucode/models/audit/gasp_range.rb +63 -0
  130. data/lib/ucode/models/audit/hinting.rb +99 -0
  131. data/lib/ucode/models/audit/library_summary.rb +40 -0
  132. data/lib/ucode/models/audit/licensing.rb +48 -0
  133. data/lib/ucode/models/audit/metrics.rb +111 -0
  134. data/lib/ucode/models/audit/named_instance.rb +41 -0
  135. data/lib/ucode/models/audit/opentype_layout.rb +38 -0
  136. data/lib/ucode/models/audit/plane_summary.rb +31 -0
  137. data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
  138. data/lib/ucode/models/audit/script_features.rb +28 -0
  139. data/lib/ucode/models/audit/script_summary.rb +54 -0
  140. data/lib/ucode/models/audit/variation_detail.rb +42 -0
  141. data/lib/ucode/models/audit.rb +50 -0
  142. data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
  143. data/lib/ucode/models/bidi_mirroring.rb +19 -0
  144. data/lib/ucode/models/binary_property_assignment.rb +26 -0
  145. data/lib/ucode/models/block.rb +36 -0
  146. data/lib/ucode/models/case_folding_rule.rb +23 -0
  147. data/lib/ucode/models/cjk_radical.rb +23 -0
  148. data/lib/ucode/models/codepoint/bidi.rb +28 -0
  149. data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
  150. data/lib/ucode/models/codepoint/case_folding.rb +25 -0
  151. data/lib/ucode/models/codepoint/casing.rb +32 -0
  152. data/lib/ucode/models/codepoint/decomposition.rb +27 -0
  153. data/lib/ucode/models/codepoint/display.rb +24 -0
  154. data/lib/ucode/models/codepoint/emoji.rb +29 -0
  155. data/lib/ucode/models/codepoint/hangul.rb +20 -0
  156. data/lib/ucode/models/codepoint/identifier.rb +30 -0
  157. data/lib/ucode/models/codepoint/indic.rb +20 -0
  158. data/lib/ucode/models/codepoint/joining.rb +20 -0
  159. data/lib/ucode/models/codepoint/normalization.rb +35 -0
  160. data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
  161. data/lib/ucode/models/codepoint.rb +122 -0
  162. data/lib/ucode/models/name_alias.rb +21 -0
  163. data/lib/ucode/models/named_sequence.rb +19 -0
  164. data/lib/ucode/models/names_list_entry.rb +38 -0
  165. data/lib/ucode/models/plane.rb +36 -0
  166. data/lib/ucode/models/property_alias.rb +24 -0
  167. data/lib/ucode/models/property_value_alias.rb +26 -0
  168. data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
  169. data/lib/ucode/models/relationship/cross_reference.rb +17 -0
  170. data/lib/ucode/models/relationship/footnote.rb +24 -0
  171. data/lib/ucode/models/relationship/informal_alias.rb +18 -0
  172. data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
  173. data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
  174. data/lib/ucode/models/relationship.rb +57 -0
  175. data/lib/ucode/models/script.rb +41 -0
  176. data/lib/ucode/models/special_casing_rule.rb +28 -0
  177. data/lib/ucode/models/standardized_variant.rb +24 -0
  178. data/lib/ucode/models/unihan_entry.rb +23 -0
  179. data/lib/ucode/models.rb +47 -0
  180. data/lib/ucode/parsers/auxiliary.rb +26 -0
  181. data/lib/ucode/parsers/base.rb +137 -0
  182. data/lib/ucode/parsers/bidi_brackets.rb +41 -0
  183. data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
  184. data/lib/ucode/parsers/blocks.rb +63 -0
  185. data/lib/ucode/parsers/case_folding.rb +53 -0
  186. data/lib/ucode/parsers/cjk_radicals.rb +102 -0
  187. data/lib/ucode/parsers/derived_age.rb +59 -0
  188. data/lib/ucode/parsers/derived_core_properties.rb +60 -0
  189. data/lib/ucode/parsers/extracted_properties.rb +74 -0
  190. data/lib/ucode/parsers/name_aliases.rb +44 -0
  191. data/lib/ucode/parsers/named_sequences.rb +51 -0
  192. data/lib/ucode/parsers/names_list.rb +250 -0
  193. data/lib/ucode/parsers/property_aliases.rb +41 -0
  194. data/lib/ucode/parsers/property_value_aliases.rb +46 -0
  195. data/lib/ucode/parsers/script_extensions.rb +64 -0
  196. data/lib/ucode/parsers/scripts.rb +60 -0
  197. data/lib/ucode/parsers/special_casing.rb +62 -0
  198. data/lib/ucode/parsers/standardized_variants.rb +56 -0
  199. data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
  200. data/lib/ucode/parsers/unicode_data.rb +268 -0
  201. data/lib/ucode/parsers/unihan.rb +125 -0
  202. data/lib/ucode/parsers.rb +35 -0
  203. data/lib/ucode/range_entry.rb +58 -0
  204. data/lib/ucode/repo/aggregate_writer.rb +364 -0
  205. data/lib/ucode/repo/atomic_writes.rb +48 -0
  206. data/lib/ucode/repo/codepoint_writer.rb +96 -0
  207. data/lib/ucode/repo/paths.rb +122 -0
  208. data/lib/ucode/repo.rb +22 -0
  209. data/lib/ucode/site/config_emitter.rb +124 -0
  210. data/lib/ucode/site/generator.rb +178 -0
  211. data/lib/ucode/site/search_index.rb +68 -0
  212. data/lib/ucode/site/template/.gitignore +4 -0
  213. data/lib/ucode/site/template/.vitepress/config.ts +8 -0
  214. data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
  215. data/lib/ucode/site/template/char/[codepoint].md +13 -0
  216. data/lib/ucode/site/template/components/BlockView.vue +57 -0
  217. data/lib/ucode/site/template/components/CharView.vue +85 -0
  218. data/lib/ucode/site/template/components/PlaneView.vue +56 -0
  219. data/lib/ucode/site/template/components/SearchView.vue +66 -0
  220. data/lib/ucode/site/template/index.md +25 -0
  221. data/lib/ucode/site/template/package.json +18 -0
  222. data/lib/ucode/site/template/search.md +9 -0
  223. data/lib/ucode/site.rb +13 -0
  224. data/lib/ucode/version.rb +5 -0
  225. data/lib/ucode/version_resolver.rb +76 -0
  226. data/lib/ucode.rb +74 -0
  227. data/ucode.gemspec +56 -0
  228. metadata +404 -0
@@ -0,0 +1,182 @@
1
+ # 20 — Canonical 4-tier resolver
2
+
3
+ ## Goal
4
+
5
+ Wire the 4-tier glyph sourcing strategy into Mode 1's per-codepoint
6
+ canonical dataset writer. For each assigned codepoint, the resolver
7
+ tries Tier 1 → Pillar 1 → Pillar 2 → Pillar 3 in order and uses the
8
+ first tier that produces a glyph.
9
+
10
+ Today Mode 1 has the pillars (1-3) implemented but no Tier 1 hook, no
11
+ config-driven font selection per block, and no priority-ordered
12
+ resolver. This TODO builds the resolver.
13
+
14
+ ## Files to create
15
+
16
+ - `lib/ucode/glyphs/resolver.rb` — the priority-ordered resolver.
17
+ - `lib/ucode/glyphs/source_config.rb` — block → preferred Tier 1 font
18
+ config table.
19
+ - `lib/ucode/glyphs/sources/`
20
+ - `tier1_real_font.rb` — wraps the existing RealFonts pipeline as a
21
+ resolver source.
22
+ - `pillar1_embedded_tounicode.rb` — wraps `EmbeddedFonts::Catalog`.
23
+ - `pillar2_correlator.rb` — wraps `ContentStreamCorrelator`.
24
+ - `pillar3_last_resort.rb` — wraps `LastResort`.
25
+ - `lib/ucode/glyphs/source.rb` — common interface (`#fetch(codepoint)
26
+ → Result or nil`).
27
+ - Specs for resolver + each source wrapper.
28
+
29
+ ## Source interface
30
+
31
+ ```ruby
32
+ class Ucode::Glyphs::Source
33
+ Result = Struct.new(:tier, :codepoint, :svg, :provenance, keyword_init: true)
34
+
35
+ # @param codepoint [Integer]
36
+ # @return [Result, nil] nil if this source cannot produce a glyph
37
+ def fetch(codepoint)
38
+ raise NotImplementedError
39
+ end
40
+
41
+ # @return [String] e.g. "tier-1:noto-sans-sidetic", "pillar-1:embedded",
42
+ # "pillar-2:correlated", "pillar-3:last-resort"
43
+ def provenance
44
+ raise NotImplementedError
45
+ end
46
+ end
47
+ ```
48
+
49
+ Each tier is a `Source` subclass; because the resolver groups sources with `group_by(&:tier)`, each source must also expose `#tier`. The resolver holds an ordered array
50
+ of sources and returns the first non-nil result.
51
+
52
+ ## Resolver behavior
53
+
54
+ ```ruby
55
+ class Ucode::Glyphs::Resolver
56
+ DEFAULT_ORDER = %i[tier1 pillar1 pillar2 pillar3].freeze
57
+
58
+ def initialize(sources:, order: DEFAULT_ORDER)
59
+ @sources_by_tier = sources.group_by(&:tier)
60
+ @order = order
61
+ end
62
+
63
+ def resolve(codepoint)
64
+ @order.each do |tier|
65
+ Array(@sources_by_tier[tier]).each do |source|
66
+ result = source.fetch(codepoint)
67
+ return result if result
68
+ end
69
+ end
70
+ nil
71
+ end
72
+ end
73
+ ```
74
+
75
+ Sources can be plural per tier (e.g. multiple Tier 1 fonts covering
76
+ different blocks). The resolver tries them in declared order.
77
+
78
+ ## Source config
79
+
80
+ The block → Tier 1 font mapping lives in a config file, populated
81
+ from the baseline audit in TODO 05:
82
+
83
+ ```yaml
84
+ # config/unicode17_tier1_fonts.yml
85
+ tier1_fonts:
86
+ Sidetic:
87
+ - label=Lentariso
88
+ - noto-sans-sidetic
89
+ Beria_Erfe:
90
+ - label=Kedebideri
91
+ Tai_Yo:
92
+ - label=NotoSerifTaiYo
93
+ Tolong_Siki:
94
+ - noto-sans-tolong-siki
95
+ # ...
96
+ CJK_Unified_Ideographs_Extension_J:
97
+ - label=FSung-1
98
+ - label=FSung-2
99
+ # ... FSung-1 through FSung-X
100
+ - noto-sans-cjk-jp
101
+ ```
102
+
103
+ Block names use the original Unicode verbatim form. Each entry is a
104
+ fontist-resolvable name (fontist finds/installs) OR a `label=/path`
105
+ for direct paths (matches the existing `FontLocator` convention).
106
+
107
+ The config is loaded at resolver construction time. Each block entry
108
+ expands to one or more `Sources::Tier1RealFont` instances.
109
+
110
+ ## Pillar sources
111
+
112
+ The pillar sources don't need per-block config — they auto-discover
113
+ from the Code Charts PDF and the Last Resort UFO:
114
+
115
+ - `Sources::Pillar1EmbeddedTounicode`: initialized with the Code Charts
116
+ PDF path; serves any codepoint in `Catalog#codepoints`.
117
+ - `Sources::Pillar2Correlator`: initialized with correlator configs
118
+ (per `lib/ucode/glyphs/embedded_fonts/catalog.rb`'s
119
+ `correlator_configs:` registry).
120
+ - `Sources::Pillar3LastResort`: initialized with the Last Resort UFO
121
+ path; serves any codepoint the UFO has a `.glif` for.
122
+
123
+ ## Integration with Repo::CodepointWriter
124
+
125
+ Mode 1's existing `Ucode::Repo::CodepointWriter` is updated to use the
126
+ resolver:
127
+
128
+ ```ruby
129
+ repo_writer = Ucode::Repo::CodepointWriter.new(
130
+ output_root: Pathname.new("output"),
131
+ resolver: Ucode::Glyphs::Resolver.new(sources: resolver_sources),
132
+ # ...
133
+ )
134
+
135
+ Ucode::Coordinator.new.each_codepoint(ucd_dir:, unihan_dir:) do |cp|
136
+ repo_writer.write_codepoint(cp) # internally calls resolver.resolve(cp)
137
+ end
138
+ ```
139
+
140
+ The writer records `provenance` in the per-codepoint `index.json`
141
+ under a new field, so the dataset is debuggable:
142
+
143
+ ```json
144
+ {
145
+ "codepoint": 10980,
146
+ "name": "SIDETIC LETTER A",
147
+ ...
148
+ "glyph": {
149
+ "svg_path": "glyph.svg",
150
+ "source": {
151
+ "tier": "tier-1",
152
+ "provenance": "tier-1:lentariso"
153
+ }
154
+ }
155
+ }
156
+ ```
157
+
158
+ ## Acceptance
159
+
160
+ - Resolver returns a `Result` for every codepoint in the Unicode 17
161
+ baseline (no nils for assigned codepoints — Pillar 3 always catches
162
+ the tail).
163
+ - Provenance is recorded per codepoint; running stats show e.g.
164
+ "Tier 1: 150,000 codepoints, Pillar 1: 3,000, Pillar 2: 800,
165
+ Pillar 3: 1,500".
166
+ - A codepoint with no Tier 1 font configured (e.g. a private specimen
167
+ block) falls through to Pillar 1-2-3 cleanly without errors.
168
+ - Re-running with an updated Tier 1 config (e.g. a new font added for
169
+ Sidetic) re-resolves and rewrites only the affected codepoints.
170
+ - All specs use real font fixtures (the existing
171
+ `spec/fixtures/fonts/`); no `double()`.
172
+ - Rubocop clean.
173
+
174
+ ## References
175
+
176
+ - Architecture: `docs/architecture.md` §"The 4-tier glyph sourcing strategy"
177
+ - Existing Tier 1: `lib/ucode/glyphs/real_fonts/`
178
+ - Existing Pillar 1: `lib/ucode/glyphs/embedded_fonts/catalog.rb`
179
+ - Existing Pillar 2: `lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb`
180
+ - Existing Pillar 3: `lib/ucode/glyphs/last_resort/`
181
+ - Baseline data: `TODO.new/05-baseline-unicode17-coverage-audit.md`
182
+ - Mode 1 writer: `lib/ucode/repo/codepoint_writer.rb`
@@ -0,0 +1,148 @@
1
+ # 21 — Canonical Unicode 17 dataset build
2
+
3
+ ## Goal
4
+
5
+ Produce a complete Unicode 17 Mode 1 dataset end-to-end. Every assigned
6
+ codepoint gets `index.json` (UCD properties, NamesList relationships,
7
+ Unihan readings) + canonical `glyph.svg` (sourced via the 4-tier
8
+ resolver from TODO 20).
9
+
10
+ This is the integration test for the entire Mode 1 pipeline. It also
11
+ produces the dataset that ships to consumers (Vitepress site,
12
+ downloads, etc.).
13
+
14
+ ## Scope
15
+
16
+ Run the full Mode 1 build against Unicode 17.0:
17
+
18
+ ```bash
19
+ bin/ucode fetch ucd --version 17.0.0
20
+ bin/ucode fetch unihan --version 17.0.0
21
+ bin/ucode fetch charts --version 17.0.0
22
+ bin/ucode parse --version 17.0.0
23
+ bin/ucode glyphs --version 17.0.0 --include-glyphs
24
+ bin/ucode site build # optional: also build the Vitepress site
25
+ ```
26
+
27
+ The deliverable is the `output/` tree plus a `build-report.json`
28
+ summarizing what got built, what got skipped, and what failed.
29
+
30
+ ## Pre-conditions
31
+
32
+ All of these must be in place before this TODO runs:
33
+
34
+ 1. PR #1 (`tier1-cmap-audit`) merged.
35
+ 2. TODOs 01, 05, 20 complete (pillar alignment, baseline audit, resolver).
36
+ 3. Tier 1 fonts downloaded into `data/fonts/` per the baseline audit's
37
+ recommendations (TODO 05 deliverable).
38
+ 4. Code Charts PDFs downloaded into `data/pdfs/` (per-block).
39
+ 5. Last Resort UFO cloned into `data/last-resort-font/`.
40
+
41
+ ## Build report
42
+
43
+ The build emits `output/build-report.json`:
44
+
45
+ ```json
46
+ {
47
+ "unicode_version": "17.0.0",
48
+ "ucode_version": "0.2.0",
49
+ "generated_at": "2026-07-01T12:00:00Z",
50
+ "totals": {
51
+ "codepoints_assigned": 150012,
52
+ "codepoints_built": 150012,
53
+ "codepoints_skipped": 0,
54
+ "codepoints_failed": 0
55
+ },
56
+ "by_tier": {
57
+ "tier-1": 150012,
58
+ "pillar-1": 3000,
59
+ "pillar-2": 800,
60
+ "pillar-3": 1500
61
+ },
62
+ "by_block": [
63
+ { "name": "Basic Latin", "assigned": 128, "built": 128,
64
+ "tier_breakdown": { "tier-1": 128 } },
65
+ { "name": "Sidetic", "assigned": 26, "built": 26,
66
+ "tier_breakdown": { "tier-1": 26 } },
67
+ ...
68
+ ],
69
+ "failures": []
70
+ }
71
+ ```
72
+
73
+ The `by_tier` counts overlap (a codepoint that was attempted via Tier 1
74
+ but fell through to Pillar 1 is counted in both). Each codepoint counted
75
+ in `built` is attributed only to the tier that actually produced its glyph.
76
+
77
+ ## Validation
78
+
79
+ After the build:
80
+
81
+ 1. **Completeness check**: every codepoint in the Unicode 17 baseline
82
+ has a `glyph.svg`. Any missing is a bug.
83
+ 2. **Schema check**: every `index.json` deserializes via
84
+ `Ucode::Models::CodePoint.from_hash`.
85
+ 3. **Provenance sanity**: no codepoint is missing the
86
+ `glyph.source.tier` field.
87
+ 4. **Sample inspection**: spot-check 20 codepoints across different
88
+ tiers and visually verify the SVG renders correctly (manual).
89
+ 5. **Block coverage**: per-block built count matches the baseline
90
+ audit's per-block coverage (TODO 05).
91
+
92
+ ## Performance targets
93
+
94
+ - Total build time: under 4 hours on a single machine (target).
95
+ The 4,298 CJK Extension J codepoints dominate; parallelize via
96
+ `--parallel N` (default is `Ucode.configuration.parallel_workers`).
97
+ - Disk usage: under 50 GB for the full Unicode 17 dataset (target).
98
+ Each codepoint's `index.json` averages ~3KB; glyph SVG averages
99
+ ~2KB. 150k codepoints × 5KB ≈ 750MB core data; rest is indexes,
100
+ relationships, manifest, site build.
101
+ - Idempotency: re-running the build after a no-op source change
102
+ produces zero file writes (per `CLAUDE.md` idempotency rule).
103
+
104
+ ## Release gating
105
+
106
+ The dataset produced by this TODO is what gets published. Before
107
+ publishing:
108
+
109
+ - All validation checks above pass.
110
+ - Spot inspection by the user (sign-off required).
111
+ - Build report committed to the repo for traceability:
112
+ `output/build-report.json` (gitignored under `/output/`; copy a
113
+ summary into `docs/build-reports/<date>-unicode17.md` for the
114
+ permanent record).
115
+
116
+ The published artifacts:
117
+
118
+ - Static dataset: `output/` tarballed and uploaded to GitHub releases.
119
+ - Vitepress site: built from `output/` and deployed to the site host.
120
+ - Per-block PDFs and the Last Resort UFO are NOT included in the dataset
121
+ release — they're build inputs, not outputs.
122
+
123
+ ## Acceptance
124
+
125
+ - Full Unicode 17 build completes without errors.
126
+ - `output/build-report.json` shows `codepoints_built ==
127
+ codepoints_assigned` (zero failures, zero skips).
128
+ - 10 random codepoints across different blocks have valid `glyph.svg`
129
+ files that render correctly.
130
+ - Per-block tier breakdown matches the baseline audit (TODO 05).
131
+ - Idempotency verified: re-running the build produces zero writes.
132
+ - Dataset size and build time within targets (or documented
133
+ exceptions).
134
+
135
+ ## Out of scope
136
+
137
+ - The audit migration (TODOs 06-19). Mode 1 doesn't depend on Mode 2.
138
+ - The fontist.org data feed (separate effort; consumes Mode 2 audits).
139
+ - Site deployment automation (separate effort).
140
+
141
+ ## References
142
+
143
+ - Architecture: `docs/architecture.md` §"Mode 1 — canonical Unicode dataset"
144
+ - Resolver: `TODO.new/20-canonical-resolver-4-tier.md`
145
+ - Baseline data: `TODO.new/05-baseline-unicode17-coverage-audit.md`
146
+ - Existing pipeline: `lib/ucode/repo/codepoint_writer.rb`,
147
+ `lib/ucode/coordinator.rb`
148
+ - Build commands: `CLAUDE.md` §"Build / test commands"
@@ -0,0 +1,176 @@
1
+ # 22 — Implementation order
2
+
3
+ ## Goal
4
+
5
+ Sequence the TODOs in this directory so dependencies flow correctly
6
+ and each track lands as a reviewable PR. Update this file when the
7
+ sequence changes — it's the canonical answer to "what comes next".
8
+
9
+ ## Sequencing principles
10
+
11
+ - **Schema and contract first.** Lock the data shape before porting
12
+ code that produces or consumes it. TODOs 01-04 land before any
13
+ porting TODO.
14
+ - **Measure before optimizing.** TODO 05 (baseline audit) informs
15
+ TODO 20 (resolver config) and TODO 21 (build verification). It
16
+ doesn't block porting work — porting can start in parallel — but
17
+ its deliverable must exist before TODO 20 ships.
18
+ - **One PR per TODO** unless tightly coupled. Each track is one
19
+ branch, one PR, one merge.
20
+ - **Migration order: port → wire → cleanup.** Don't delete fontisan
21
+ code until ucode's equivalent is shipped and proven. TODOs 17-19
22
+ land only after TODOs 06-16 are merged and fontist.org has
23
+ validated the new contract.
24
+
25
+ ## Dependency graph
26
+
27
+ ```
28
+ 01 pillar-terminology-alignment ─── standalone, ship anytime
29
+ 02 audit-schema-design ────────────┐
30
+ 03 directory-output-spec ──────────┤
31
+ 04 fontist-org-contract ───────────┘
32
+
33
+
34
+ 05 baseline-unicode17-coverage-audit ───┐
35
+
36
+ 06 audit-namespace-skeleton ────────────┤
37
+
38
+ 07 audit-models-port ───────────────────┤
39
+
40
+ 08 extractors-cheap-port ───────────────┤
41
+
42
+ 09 extractors-expensive-port ───────────┤
43
+
44
+ 10 aggregations-ucd-rewrite ────────────┤
45
+
46
+ 11 differ-and-library-auditor-port ─────┤
47
+
48
+ 12 formatters-port ─────────────────────┤
49
+
50
+ 13 directory-emitter ───────────────────┤
51
+
52
+ 14 html-face-browser ───────────────────┤
53
+
54
+ 15 html-library-browser ────────────────┤
55
+
56
+ 16 cli-audit-subcommands ───────────────┘
57
+
58
+
59
+ 17 fontisan-cleanup-audit ──┐
60
+ 18 fontisan-cleanup-ucd ───┴── after 16 validated in production
61
+ 19 fontisan-docs-update ────── after 17 + 18
62
+
63
+ 20 canonical-resolver-4-tier ──── after 05 (needs baseline data)
64
+
65
+
66
+ 21 canonical-unicode17-build ──── after 20
67
+ ```
68
+
69
+ ## Recommended PR sequence
70
+
71
+ ### Track A — Alignment & contract (parallel-safe, ship first)
72
+
73
+ - PR-A1: TODO 01 (pillar terminology). One commit. No deps.
74
+ - PR-A2: TODOs 02 + 03 + 04 (schema, layout, contract). One PR; these
75
+ three define a single contract and are easier to review together.
76
+
77
+ ### Track B — Baseline measurement (parallel with Track A)
78
+
79
+ - PR-B1: TODO 05 (baseline audit). Long-running — depends on
80
+ acquiring fonts, running cmaps, building the report. Can start
81
+ the moment PR #1 (`tier1-cmap-audit`) merges; doesn't block
82
+ Tracks C-D.
83
+
84
+ ### Track C — Audit migration (strict sequence)
85
+
86
+ Each PR builds on the previous. Don't skip ahead.
87
+
88
+ - PR-C1: TODOs 06 + 07 (skeleton + models). One PR. Pure data;
89
+ nothing runs yet.
90
+ - PR-C2: TODO 08 (cheap extractors). Brief-mode audits work after
91
+ this.
92
+ - PR-C3: TODO 09 (expensive extractors). Full-mode audits work, minus
93
+ aggregations.
94
+ - PR-C4: TODO 10 (aggregations rewrite). Full audit produces real
95
+ coverage data.
96
+ - PR-C5: TODOs 11 + 12 (differ + formatters). Diff and text output.
97
+ - PR-C6: TODO 13 (directory emitter). JSON output to disk.
98
+ - PR-C7: TODOs 14 + 15 (HTML browsers).
99
+ - PR-C8: TODO 16 (CLI subcommands). End-user-facing.
100
+
101
+ After PR-C8, ucode's audit is feature-complete and producing real
102
+ data.
103
+
104
+ ### Track D — Fontisan cleanup (after Track C + production validation)
105
+
106
+ - PR-D1: TODOs 17 + 18 + 19 (cleanup + docs). One PR per fontisan
107
+ repo; do this only after ucode's audit has been the source of
108
+ truth for at least one release cycle.
109
+
110
+ ### Track E — Canonical Mode 1 alignment (after Track B)
111
+
112
+ - PR-E1: TODO 20 (4-tier resolver).
113
+ - PR-E2: TODO 21 (Unicode 17 full build). The integration test.
114
+
115
+ ## Acceptance gates per PR
116
+
117
+ Every PR for a TODO in this directory must:
118
+
119
+ - Pass GHA on Ruby 3.1, 3.2, 3.3, 3.4.
120
+ - Pass `bundle exec rubocop` on new and modified files.
121
+ - Pass `bundle exec rspec` for new and affected specs.
122
+ - Add or update specs covering new behavior.
123
+ - No `double()` in any spec.
124
+ - No `def to_h` / `from_h` / `to_json` / `from_json` anywhere.
125
+ - No AI attribution in commits, PRs, or docs.
126
+ - Update `docs/architecture.md` if the architecture shifts.
127
+ - Update this file (TODO 22) if the sequence changes.
128
+
129
+ ## Smoke tests per track
130
+
131
+ After each track merges, run a smoke test against a real fixture:
132
+
133
+ - After PR-C2 (cheap extractors): `ucode audit font spec/fixtures/fonts/MonaSans-Regular.ttf --brief`
134
+ produces a face report with identity + coverage totals.
135
+ - After PR-C4 (aggregations): same command without `--brief` produces
136
+ full block + script coverage for the fixture font.
137
+ - After PR-C6 (emitter): `--output /tmp/audit-test/` writes the
138
+ directory tree; re-run produces zero writes.
139
+ - After PR-C8 (CLI): full audit + library + compare + browser all
140
+ work end-to-end.
141
+ - After PR-E2 (canonical build): full Unicode 17 dataset exists,
142
+ validation passes, build report committed.
143
+
144
+ ## Cross-cutting concerns
145
+
146
+ ### Performance
147
+
148
+ Track ucode's parse + audit performance per release. Target: full
149
+ Unicode 17 build under 4 hours; single-font audit under 5 seconds for
150
+ typical Latin fonts, under 30 seconds for CJK. Document regressions in
151
+ `docs/performance.md`.
152
+
153
+ ### Documentation
154
+
155
+ Every user-facing PR (CLI changes, schema changes, output layout
156
+ changes) updates:
157
+
158
+ - `docs/architecture.md` if shape changes.
159
+ - `docs/guide/` if user workflow changes.
160
+ - `CHANGELOG.md` (new file — create if missing) for any
161
+ user-visible change.
162
+ - `TODO.new/00-README.md` checkmark when a TODO completes.
163
+
164
+ ### Memory
165
+
166
+ When this directory's work is done (all TODOs checked off), move the
167
+ directory to `TODO.done/2026H2-audit-migration/` (or similar) so the
168
+ next planning cycle starts with a clean `TODO.new/`. Don't delete —
169
+ the historical record is valuable.
170
+
171
+ ## References
172
+
173
+ - Architecture: `docs/architecture.md`
174
+ - Global rules: `~/.claude/CLAUDE.md`, `CLAUDE.md`
175
+ - Existing TODO structure: `TODO/` (v0.1 historical record)
176
+ - Memory files: `/Users/mulgogi/.claude/projects/-Users-mulgogi-src-fontist-ucode/memory/`
@@ -0,0 +1,97 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.1.0] - 2026-06-25
11
+
12
+ ### Highlights
13
+
14
+ First public release. The JSON dataset pipeline, SQLite lookup index, and
15
+ VitePress site generator are production-ready. **SVG glyph extraction from
16
+ the Code Charts PDFs is experimental and gated behind an opt-in flag** —
17
+ see "Deferred" below.
18
+
19
+ ### Added
20
+
21
+ - **Foundation**: `Ucode::Config`, `Ucode::Cache`, `Ucode::VersionResolver`,
22
+ `Ucode::Error` hierarchy with structured `context:` payloads.
23
+ - **Fetchers**: `Ucode::Fetch::{UcdZip,UnihanZip,CodeCharts,Http}` with
24
+ retries, timeouts, and XDG-compliant cache layout.
25
+ - **Models (lutaml-model)**: `Plane`, `Block`, `Script`, `CodePoint` with
26
+ nested sub-models (`Bidi`, `Casing`, `CaseFolding`, `Display`,
27
+ `Segmentation`, `Hangul`, `Indic`, `Emoji`, `Identifier`,
28
+ `Normalization`, `Joining`); polymorphic `Relationship` hierarchy
29
+ (`CrossReference`, `SeeAlso`, `CompatibilityEquivalent`,
30
+ `SampleSequence`, `InformalAlias`, `Footnote`, `VariationSequence`);
31
+ `UnihanEntry`, `NamedSequence`, `StandardizedVariant`, `CjkRadical`,
32
+ `SpecialCasingRule`, `CaseFoldingRule`, `BidiBracketPair`, `NameAlias`,
33
+ `PropertyAlias`, `PropertyValueAlias`.
34
+ - **Parsers (streaming)**: one per UCD text file — `UnicodeData`,
35
+ `Blocks`, `Scripts`, `ScriptExtensions`, `PropertyAliases`,
36
+ `PropertyValueAliases`, `NameAliases`, `NamedSequences`,
37
+ `SpecialCasing`, `CaseFolding`, `BidiMirroring`, `BidiBrackets`,
38
+ `CjkRadicals`, `StandardizedVariants`, `NamesList` (state-machine),
39
+ `DerivedAge`, `DerivedCoreProperties`, `ExtractedProperties`,
40
+ `Auxiliary` (10 files), `Unihan` (8 files).
41
+ - **Coordinator**: streaming single-pass enrichment, `Coordinator::Indices`
42
+ struct of every loaded index.
43
+ - **Indices**: `Ucode::Index` (YAML bsearch, dependency-free),
44
+ `Ucode::Database` (SQLite, persistent), `Ucode::DbBuilder`,
45
+ `Ucode::IndexBuilder`, `Ucode::RangeEntry`.
46
+ - **Aggregator**: `aggregate_blocks`, `aggregate_scripts` — pure
47
+ transformations over `Enumerable<Integer>` + `Index`.
48
+ - **Repo writers**: `Repo::Paths` (path conventions),
49
+ `Repo::AtomicWrites` (byte-compared atomic writes),
50
+ `Repo::CodepointWriter` (streaming + threaded per-cp JSON),
51
+ `Repo::AggregateWriter` (planes, blocks, scripts, indexes,
52
+ relationships, enums, named sequences, manifest).
53
+ - **Site**: `Site::Generator` (init + build), `Site::ConfigEmitter`
54
+ (`config.ts` from output tree), `Site::SearchIndex` (MiniSearch
55
+ payload), VitePress template with Vue components (`PlaneView`,
56
+ `BlockView`, `CharView`, `SearchView`), dynamic `char/[codepoint]`
57
+ route.
58
+ - **CLI**: `exe/ucode` Thor CLI with `fetch`, `parse`, `glyphs`,
59
+ `site`, `lookup`, `cache`, `build`, `version` subcommands. Each
60
+ command delegates to a pure `Commands::*Command` class.
61
+ - **Docs**: `README.md`, `docs/FONTISAN_MIGRATION.md`.
62
+
63
+ ### Deferred (v0.2)
64
+
65
+ - **Per-codepoint SVG glyph extraction is experimental.** The
66
+ `Ucode::Glyphs` pipeline shipped in v0.1 (`PdfFetcher`, `PageRenderer`,
67
+ `GridDetector`, `CellExtractor`, `Writer`, `MonolithPageMap`) is fully
68
+ implemented and tested, but the Code Charts PDFs composite the
69
+ cell-border decorations and the actual character outline into a single
70
+ glyph definition, so the current `CellExtractor` output includes both.
71
+ The CLI gates the step behind `--include-glyphs` (default off) and prints
72
+ a warning.
73
+ - **v0.2 strategy — two pillars that bypass the cell extractor entirely:**
74
+ 1. **Real character glyphs** are read straight from the subsetted fonts
75
+ embedded in `CodeCharts.pdf` (the `Uni*`/`UCS*`-prefixed per-block
76
+ fonts). Each font program contains only the character outline — the
77
+ cell-border decoration is page content, not part of the glyph — so
78
+ extracting the font stream + walking the ToUnicode CMap yields clean
79
+ per-codepoint SVGs without any post-processing of composite paths.
80
+ 2. **Last Resort placeholders** (unassigned, noncharacter, PUA
81
+ codepoints) are rendered directly from the
82
+ [Last Resort Font](https://github.com/unicode-org/last-resort-font)
83
+ UFO source (380 `.glif` files + Format 13 `cmap`), matching the
84
+ placeholder box the Code Charts actually display.
85
+ - The v0.1 cell-position resolution (`GridDetector` +
86
+ `CellExtractor#find_use_at`) is correct and is retained as the
87
+ authoritative cell→codepoint map; only the rendering path is replaced.
88
+
89
+ ### Tooling
90
+
91
+ - `rubocop`, `rubocop-rspec`, `rubocop-performance`, `rubocop-rake` for
92
+ lint; `rspec` for tests; `simplecov` for coverage (94%+ line coverage,
93
+ 80% minimum enforced).
94
+ - 580+ specs covering every public API.
95
+
96
+ [Unreleased]: https://github.com/fontist/ucode/compare/v0.1.0...HEAD
97
+ [0.1.0]: https://github.com/fontist/ucode/releases/tag/v0.1.0
data/exe/ucode ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
# Put the sibling lib/ directory at the front of the load path so that
# `require "ucode"` resolves to the library that sits next to this
# executable.
lib_dir = File.expand_path("../lib", __dir__)
$LOAD_PATH.unshift(lib_dir)

require "ucode"

# Hand the raw command-line arguments to the CLI entry point.
Ucode::Cli.start(ARGV)
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  # Coverage analysis over codepoint sets.
  #
  # Pure transformations: given a collection of codepoints and an
  # `Index` (blocks or scripts), return aggregated summaries. No I/O,
  # no mutation of inputs, no global state.
  #
  # OCP: new aggregation kinds (planes, categories, ...) slot in as
  # new methods without altering existing ones.
  module Aggregator
    # Summary of how many codepoints of one block are present in a
    # given input set. Plain Struct — Ruby's built-in `to_h` covers
    # any serialization needs.
    #
    # Members:
    # - `name`, `first_cp`, `last_cp` — copied from the index entry
    # - `total`      — the entry's `size`
    # - `covered`    — number of distinct input codepoints inside the
    #                  inclusive `first_cp..last_cp` range
    # - `fill_ratio` — `covered / total` as a Float (0.0 when `total` is 0)
    # - `complete`   — true when `covered == total`
    BlockSummary = Struct.new(
      :name,
      :first_cp,
      :last_cp,
      :total,
      :covered,
      :fill_ratio,
      :complete,
      keyword_init: true,
    )

    class << self
      # Builds one coverage summary per block in the index.
      #
      # Duplicate codepoints in the input are counted once, so a
      # repeated codepoint cannot inflate `covered` (and thereby push
      # `fill_ratio` above 1.0 or flip `complete` to false for a fully
      # covered block).
      #
      # @param codepoints [Enumerable<Integer>]
      # @param blocks_index [Ucode::Index]
      # @return [Array<BlockSummary>] one summary per block in the index,
      #   in the index's natural (first_cp) order
      def aggregate_blocks(codepoints, blocks_index)
        # `uniq` and `sort` both return new arrays, so the caller's
        # collection is left untouched. The sorted, de-duplicated array
        # is computed once and shared by every per-block binary search.
        sorted = codepoints.uniq.sort
        blocks_index.map { |entry| build_block_summary(entry, sorted) }
      end

      # Collects the names of the scripts that the given codepoints hit.
      # Codepoints for which the index lookup returns nil are skipped.
      #
      # @param codepoints [Enumerable<Integer>]
      # @param scripts_index [Ucode::Index]
      # @return [Array<String>] sorted unique script names covering the
      #   given codepoints
      def aggregate_scripts(codepoints, scripts_index)
        codepoints.filter_map { |cp| scripts_index.lookup(cp) }.uniq.sort
      end

      private

      # Builds the summary for a single index entry.
      #
      # @param entry [#name, #first_cp, #last_cp, #size] one block entry
      # @param sorted_cps [Array<Integer>] ascending, duplicate-free codepoints
      # @return [BlockSummary]
      def build_block_summary(entry, sorted_cps)
        covered = count_in_range(sorted_cps, entry.first_cp, entry.last_cp)
        total = entry.size
        BlockSummary.new(
          name: entry.name,
          first_cp: entry.first_cp,
          last_cp: entry.last_cp,
          total: total,
          covered: covered,
          # Guard the division so an empty block yields 0.0 instead of NaN.
          fill_ratio: total.zero? ? 0.0 : (covered.to_f / total),
          complete: covered == total,
        )
      end

      # Count of sorted cps in the inclusive [first, last] range, in O(log N).
      def count_in_range(sorted, first, last)
        upper_bound(sorted, last) - lower_bound(sorted, first)
      end

      # Index of the first cp >= value (or sorted.size if none).
      def lower_bound(sorted, value)
        sorted.bsearch_index { |cp| cp >= value } || sorted.size
      end

      # Index of the first cp > value (or sorted.size if none).
      def upper_bound(sorted, value)
        sorted.bsearch_index { |cp| cp > value } || sorted.size
      end
    end
  end
end