ucode 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +72 -0
  3. data/Gemfile.lock +2 -2
  4. data/TODO.full/00-README.md +116 -0
  5. data/TODO.full/01-panglyph-vision.md +112 -0
  6. data/TODO.full/02-panglyph-repo-bootstrap.md +184 -0
  7. data/TODO.full/03-panglyph-font-builder.md +201 -0
  8. data/TODO.full/04-panglyph-publish-pipeline.md +126 -0
  9. data/TODO.full/05-ucode-0-1-1-release.md +139 -0
  10. data/TODO.full/06-fontisan-remove-audit.md +142 -0
  11. data/TODO.full/07-fontisan-remove-ucd.md +125 -0
  12. data/TODO.full/08-archive-private-bin-build.md +143 -0
  13. data/TODO.full/09-archive-public-structure.md +164 -0
  14. data/TODO.full/10-fontist-org-woff-glyphs.md +131 -0
  15. data/TODO.full/11-fontist-org-audit-coverage.md +140 -0
  16. data/TODO.full/12-implementation-order.md +216 -0
  17. data/TODO.full/13-fontisan-font-writer-api.md +189 -0
  18. data/TODO.full/14-fontisan-table-writers.md +66 -0
  19. data/TODO.full/15-panglyph-builder-real.md +82 -0
  20. data/TODO.full/16-archive-public-sync-workflows.md +167 -0
  21. data/TODO.full/17-fontist-org-font-picker.md +73 -0
  22. data/TODO.full/18-comprehensive-spec-coverage.md +64 -0
  23. data/TODO.full/19-ucode-0-1-2-patch.md +32 -0
  24. data/TODO.full/20-fontisan-0-2-23-release.md +52 -0
  25. data/TODO.new/00-README.md +30 -0
  26. data/TODO.new/23-universal-glyph-set-source-map.md +312 -0
  27. data/TODO.new/24-universal-glyph-set-build.md +189 -0
  28. data/TODO.new/25-font-audit-against-universal-set.md +195 -0
  29. data/TODO.new/26-missing-glyph-reporter.md +189 -0
  30. data/TODO.new/27-fontist-org-consumer-integration.md +200 -0
  31. data/TODO.new/28-implementation-order-update.md +187 -0
  32. data/TODO.new/29-universal-set-curation-uc17.md +312 -0
  33. data/TODO.new/30-tier1-font-acquisition.md +241 -0
  34. data/TODO.new/31-universal-set-production-build.md +205 -0
  35. data/TODO.new/32-uc17-coverage-matrix.md +165 -0
  36. data/TODO.new/33-specialist-font-acquisition-refresh.md +138 -0
  37. data/TODO.new/34-pillar2-content-stream-correlator.md +147 -0
  38. data/TODO.new/35-universal-set-production-run.md +160 -0
  39. data/TODO.new/36-per-font-coverage-audit.md +145 -0
  40. data/TODO.new/37-coverage-highlight-reporter.md +125 -0
  41. data/TODO.new/38-fontist-org-glyph-consumer.md +141 -0
  42. data/TODO.new/39-implementation-order-update-32-38.md +258 -0
  43. data/TODO.new/40-archive-private-uses-ucode-audit.md +124 -0
  44. data/TODO.new/41-ucode-unicode-archive-bridge.md +160 -0
  45. data/config/specialist_fonts.yml +102 -0
  46. data/config/unicode17_tier1_fonts.yml +42 -0
  47. data/config/unicode17_universal_glyph_set.yml +293 -0
  48. data/lib/ucode/audit/block_aggregator.rb +57 -29
  49. data/lib/ucode/audit/browser/face_page.rb +128 -0
  50. data/lib/ucode/audit/browser/glyph_panel.rb +124 -0
  51. data/lib/ucode/audit/browser/library_page.rb +74 -0
  52. data/lib/ucode/audit/browser/missing_glyph_page.rb +87 -0
  53. data/lib/ucode/audit/browser/template.rb +47 -0
  54. data/lib/ucode/audit/browser/templates/face.css +200 -0
  55. data/lib/ucode/audit/browser/templates/face.html.erb +41 -0
  56. data/lib/ucode/audit/browser/templates/face.js +298 -0
  57. data/lib/ucode/audit/browser/templates/library.css +119 -0
  58. data/lib/ucode/audit/browser/templates/library.html.erb +42 -0
  59. data/lib/ucode/audit/browser/templates/library.js +99 -0
  60. data/lib/ucode/audit/browser/templates/missing_glyph_page.css +119 -0
  61. data/lib/ucode/audit/browser/templates/missing_glyph_page.html.erb +58 -0
  62. data/lib/ucode/audit/browser/templates/missing_glyph_page.js +2 -0
  63. data/lib/ucode/audit/browser.rb +32 -0
  64. data/lib/ucode/audit/context.rb +27 -1
  65. data/lib/ucode/audit/coverage_reference.rb +103 -0
  66. data/lib/ucode/audit/differ.rb +121 -0
  67. data/lib/ucode/audit/emitter/block_emitter.rb +52 -0
  68. data/lib/ucode/audit/emitter/codepoint_emitter.rb +87 -0
  69. data/lib/ucode/audit/emitter/collection_emitter.rb +80 -0
  70. data/lib/ucode/audit/emitter/face_directory.rb +212 -0
  71. data/lib/ucode/audit/emitter/glyph_emitter.rb +48 -0
  72. data/lib/ucode/audit/emitter/index_emitter.rb +149 -0
  73. data/lib/ucode/audit/emitter/library_emitter.rb +96 -0
  74. data/lib/ucode/audit/emitter/paths.rb +312 -0
  75. data/lib/ucode/audit/emitter/plane_emitter.rb +29 -0
  76. data/lib/ucode/audit/emitter/script_emitter.rb +29 -0
  77. data/lib/ucode/audit/emitter.rb +29 -0
  78. data/lib/ucode/audit/extractors/aggregations.rb +31 -2
  79. data/lib/ucode/audit/face_auditor.rb +86 -0
  80. data/lib/ucode/audit/formatters/audit_diff_text.rb +112 -0
  81. data/lib/ucode/audit/formatters/audit_text.rb +411 -0
  82. data/lib/ucode/audit/formatters/color.rb +48 -0
  83. data/lib/ucode/audit/formatters/library_summary_text.rb +98 -0
  84. data/lib/ucode/audit/formatters/text_formatter.rb +83 -0
  85. data/lib/ucode/audit/formatters.rb +23 -0
  86. data/lib/ucode/audit/library_aggregator.rb +86 -0
  87. data/lib/ucode/audit/library_auditor.rb +105 -0
  88. data/lib/ucode/audit/release/emitter.rb +152 -0
  89. data/lib/ucode/audit/release/face_card.rb +93 -0
  90. data/lib/ucode/audit/release/formula_audits.rb +50 -0
  91. data/lib/ucode/audit/release/library_index_builder.rb +78 -0
  92. data/lib/ucode/audit/release/manifest_builder.rb +127 -0
  93. data/lib/ucode/audit/release.rb +42 -0
  94. data/lib/ucode/audit/ucd_only_reference.rb +81 -0
  95. data/lib/ucode/audit/universal_set_reference.rb +136 -0
  96. data/lib/ucode/audit.rb +31 -0
  97. data/lib/ucode/cli.rb +339 -33
  98. data/lib/ucode/commands/audit/browser_command.rb +82 -0
  99. data/lib/ucode/commands/audit/collection_command.rb +103 -0
  100. data/lib/ucode/commands/audit/compare_command.rb +188 -0
  101. data/lib/ucode/commands/audit/font_command.rb +140 -0
  102. data/lib/ucode/commands/audit/library_command.rb +87 -0
  103. data/lib/ucode/commands/audit/reference_builder.rb +64 -0
  104. data/lib/ucode/commands/audit.rb +20 -0
  105. data/lib/ucode/commands/block_feed.rb +73 -0
  106. data/lib/ucode/commands/canonical_build.rb +138 -0
  107. data/lib/ucode/commands/fetch.rb +37 -1
  108. data/lib/ucode/commands/release.rb +115 -0
  109. data/lib/ucode/commands/universal_set.rb +211 -0
  110. data/lib/ucode/commands.rb +5 -0
  111. data/lib/ucode/coordinator/indices.rb +11 -0
  112. data/lib/ucode/coordinator.rb +138 -5
  113. data/lib/ucode/error.rb +30 -2
  114. data/lib/ucode/fetch/font_fetcher/result.rb +39 -0
  115. data/lib/ucode/fetch/font_fetcher.rb +16 -0
  116. data/lib/ucode/fetch/specialist_font_fetcher.rb +280 -0
  117. data/lib/ucode/fetch.rb +7 -3
  118. data/lib/ucode/glyphs/real_fonts/cmap_cache.rb +74 -0
  119. data/lib/ucode/glyphs/real_fonts.rb +1 -0
  120. data/lib/ucode/glyphs/resolver.rb +62 -0
  121. data/lib/ucode/glyphs/source.rb +48 -0
  122. data/lib/ucode/glyphs/source_builder.rb +61 -0
  123. data/lib/ucode/glyphs/source_config/coverage_assertion.rb +79 -0
  124. data/lib/ucode/glyphs/source_config/gap_report.rb +54 -0
  125. data/lib/ucode/glyphs/source_config.rb +104 -0
  126. data/lib/ucode/glyphs/sources/pillar1_embedded_tounicode.rb +63 -0
  127. data/lib/ucode/glyphs/sources/pillar3_last_resort.rb +51 -0
  128. data/lib/ucode/glyphs/sources/tier1_real_font.rb +104 -0
  129. data/lib/ucode/glyphs/sources.rb +20 -0
  130. data/lib/ucode/glyphs/universal_set/builder.rb +161 -0
  131. data/lib/ucode/glyphs/universal_set/coverage_report.rb +139 -0
  132. data/lib/ucode/glyphs/universal_set/idempotency.rb +86 -0
  133. data/lib/ucode/glyphs/universal_set/manifest_accumulator.rb +195 -0
  134. data/lib/ucode/glyphs/universal_set/manifest_writer.rb +61 -0
  135. data/lib/ucode/glyphs/universal_set/pre_build_check.rb +197 -0
  136. data/lib/ucode/glyphs/universal_set/validator.rb +204 -0
  137. data/lib/ucode/glyphs/universal_set.rb +45 -0
  138. data/lib/ucode/glyphs.rb +6 -0
  139. data/lib/ucode/models/audit/baseline.rb +6 -0
  140. data/lib/ucode/models/audit/block_summary.rb +7 -0
  141. data/lib/ucode/models/audit/codepoint_provenance.rb +39 -0
  142. data/lib/ucode/models/audit/release_face.rb +42 -0
  143. data/lib/ucode/models/audit/release_formula.rb +33 -0
  144. data/lib/ucode/models/audit/release_manifest.rb +43 -0
  145. data/lib/ucode/models/audit/release_universal_set.rb +37 -0
  146. data/lib/ucode/models/audit.rb +9 -0
  147. data/lib/ucode/models/block.rb +2 -0
  148. data/lib/ucode/models/build_report.rb +109 -0
  149. data/lib/ucode/models/codepoint/glyph.rb +42 -0
  150. data/lib/ucode/models/codepoint.rb +3 -0
  151. data/lib/ucode/models/glyph_source.rb +86 -0
  152. data/lib/ucode/models/glyph_source_map.rb +138 -0
  153. data/lib/ucode/models/specialist_font.rb +70 -0
  154. data/lib/ucode/models/specialist_font_manifest.rb +48 -0
  155. data/lib/ucode/models/unihan_entry.rb +81 -9
  156. data/lib/ucode/models/unihan_field.rb +21 -0
  157. data/lib/ucode/models/universal_set_entry.rb +47 -0
  158. data/lib/ucode/models/universal_set_manifest.rb +78 -0
  159. data/lib/ucode/models/validation_report.rb +99 -0
  160. data/lib/ucode/models.rb +9 -0
  161. data/lib/ucode/parsers/named_sequences.rb +5 -5
  162. data/lib/ucode/parsers/unihan.rb +50 -19
  163. data/lib/ucode/repo/aggregate_writer.rb +34 -2
  164. data/lib/ucode/repo/block_feed_emitter.rb +153 -0
  165. data/lib/ucode/repo/build_report_accumulator.rb +138 -0
  166. data/lib/ucode/repo/build_report_writer.rb +46 -0
  167. data/lib/ucode/repo/build_validator.rb +229 -0
  168. data/lib/ucode/repo/codepoint_writer.rb +50 -1
  169. data/lib/ucode/repo/paths.rb +8 -0
  170. data/lib/ucode/repo.rb +4 -0
  171. data/lib/ucode/version.rb +1 -1
  172. data/schema/block-feed.output.schema.yml +134 -0
  173. metadata +143 -2
  174. data/ucode.gemspec +0 -56
@@ -0,0 +1,258 @@
1
+ # 39 — Implementation order update (TODOs 32–38)
2
+
3
+ ## Goal
4
+
5
+ Sequence the remaining work for Part 1 (universal glyph set with
6
+ full UC17 coverage) and Part 2 (per-font audit + highlight) so
7
+ each PR is independently reviewable and the critical path is
8
+ short.
9
+
10
+ Extends [TODO 28](28-implementation-order-update.md) which sequenced
11
+ TODOs 23–31.
12
+
13
+ # 39 — Implementation order update (TODOs 32–41)
14
+
15
+ ## Goal
16
+
17
+ Sequence the remaining work for Part 1 (universal glyph set with
18
+ full UC17 coverage) and Part 2 (per-font audit + highlight) plus
19
+ the pipeline wiring (TODOs 40–41) so each PR is independently
20
+ reviewable and the critical path is short.
21
+
22
+ Extends [TODO 28](28-implementation-order-update.md) which sequenced
23
+ TODOs 23–31.
24
+
25
+ ## Critical path
26
+
27
+ ```
28
+ ┌─────────────────────┐
29
+ │ 32 Coverage matrix │ ← policy only; no deps
30
+ └──────────┬──────────┘
31
+
32
+ ┌────────────────┼────────────────┐
33
+ ▼ ▼
34
+ ┌─────────────────────┐ ┌──────────────────────┐
35
+ │ 33 Font acquisition │ │ 34 Pillar 2 │ ← parallel
36
+ │ (URLs + formulas) │ │ ContentStreamCorr. │
37
+ └──────────┬──────────┘ └──────────┬───────────┘
38
+ │ │
39
+ └────────────┬────────────────────┘
40
+
41
+ ┌─────────────────────────┐
42
+ │ 35 Production run │ ← end of Part 1
43
+ │ (universal set build) │
44
+ └────────────┬────────────┘
45
+
46
+ ┌────────────┴────────────┐
47
+ ▼ ▼
48
+ ┌──────────────────────┐ ┌──────────────────────┐
49
+ │ 41 Unicode artifacts │ │ 36 Per-font │
50
+ │ → archive-public │ │ coverage audit │
51
+ │ bridge │ └──────────┬───────────┘
52
+ └──────────┬───────────┘ │
53
+ │ ▼
54
+ │ ┌──────────────────────┐
55
+ │ │ 37 Highlight reporter │
56
+ │ └──────────┬───────────┘
57
+ ▼ │
58
+ ┌──────────────────────┐ │
59
+ │ 38 fontist.org glyph │←───────────────┘
60
+ │ consumer │
61
+ └──────────────────────┘
62
+
63
+ ┌──────────────────────────────────────┐
64
+ │ 40 fontist-archive-private │ ← can start any time;
65
+ │ bin/build uses ucode audit │ independent of 32–38
66
+ └──────────────────────────────────────┘
67
+ ```
68
+
69
+ ## Phase 1 — Policy + acquisition (blocking; A1 → A2 sequential, A3 in parallel)
70
+
71
+ ### Track A1 — Coverage matrix (TODO 32)
72
+
73
+ **Branch**: `audit/coverage-matrix`
74
+ **PR**: ucode/PR-XX
75
+
76
+ YAML-only change to `config/unicode17_universal_glyph_set.yml`.
77
+ Extends `Models::GlyphSourceMap` to accept `default_sources` at top
78
+ level. Adds per-block specialists with full provenance/rationale.
79
+
80
+ Acceptance: every block has a defined Tier 1 (or pillar 2 fallback
81
+ policy). Reviewer can sign off without font availability.
82
+
83
+ **Estimated**: 1–2 sessions. Mostly research + YAML writing.
84
+
85
+ ### Track A2 — Font acquisition refresh (TODO 33)
86
+
87
+ **Branch**: `audit/font-acquisition-refresh`
88
+ **PR**: ucode/PR-XX + 3+ fontist/formulas PRs
89
+
90
+ Depends on A1 (uses the curated specialist list). Two halves:
91
+
92
+ - **A2a — Direct URL fixes**: Lentariso, EgyptianText,
93
+ UniHieroglyphica, BabelStone, Symbola. Update
94
+ `specialist_fonts.yml`, verify sha256.
95
+ - **A2b — fontist formula PRs**: open upstream PRs for Noto Sans
96
+ CJK JP, Noto Sans Symbols, Noto Sans Symbols 2, Noto Music,
97
+ Noto Sans Sharada, Noto Sans Sidetic, Noto Sans Tolong Siki,
98
+ Noto Sans Tangut, Noto Sans Arabic, Noto Sans Telugu, Noto Sans
99
+ Kannada.
100
+
101
+ A2b blocks on external review (fontist maintainers). Until merged,
102
+ ucode falls back to direct notofonts.github.io URLs (Phase C of
103
+ TODO 33).
104
+
105
+ **Estimated**: 2–3 sessions for A2a; A2b is async (external PRs).
106
+
107
+ ### Track A3 — Pillar 2 ContentStreamCorrelator (TODO 34)
108
+
109
+ **Branch**: `audit/pillar2-correlator`
110
+ **PR**: ucode/PR-XX
111
+
112
+ Parallel to A1/A2 — no upstream deps. Generalizes `/tmp/correlate_v4.rb`
113
+ into `Ucode::Glyphs::EmbeddedFonts::ContentStreamCorrelator`. Patches
114
+ `Catalog#build_entry` to delegate when `tu_ref` is nil.
115
+
116
+ **Estimated**: 2 sessions. Algorithm is proven; needs generalization
117
+ + tests on Sidetic and Beria Erfe PDFs.
118
+
119
+ ## Phase 2 — Production build (sequential, blocked by Phase 1)
120
+
121
+ ### Track B1 — Universal set production run (TODO 35)
122
+
123
+ **Branch**: `audit/universal-set-production`
124
+ **PR**: ucode/PR-XX (manifest + sample of glyphs as fixtures; full
125
+ set is too big for git)
126
+
127
+ Blocked by A1, A2, A3 (need fonts + pillar 2 fallback). Runs
128
+ `ucode universal-set build 17.0.0` end-to-end. Emits manifest,
129
+ entries, glyphs, HTML browser.
130
+
131
+ **Estimated**: 1 session to run + validate + write summary doc.
132
+ Wall-clock for build itself: 30–60 minutes.
133
+
134
+ ## Phase 3 — Pipeline wiring (parallel after Phase 2)
135
+
136
+ ### Track B2 — fontist-archive-private bin/build refactor (TODO 40)
137
+
138
+ **Branch**: `audit/archive-private-uses-ucode` (in fontist-archive-private repo)
139
+ **PR**: fontist-archive-private/PR-XX
140
+
141
+ Independent of Phase 1/2 — can start any time. Swaps
142
+ `Fontisan::Commands::AuditCommand` for `ucode audit font`, removes
143
+ the UCD stub hack. After TODOs 35 and 41 land, adds the
144
+ `--reference-universal-set` flag so audits include coverage
145
+ comparison against the canonical glyphs.
146
+
147
+ **Estimated**: 1 session for Phase A (swap invocation + remove stub).
148
+ Phase B (universal-set reference) is a follow-up after TODO 41.
149
+
150
+ ### Track B3 — ucode Unicode artifacts → archive bridge (TODO 41)
151
+
152
+ **Branch**: `audit/unicode-archive-bridge` (in ucode repo + fontist-archive-public repo)
153
+ **PR**: ucode/PR-XX + fontist-archive-public/PR-XX
154
+
155
+ Blocked by B1 (universal set must exist to be bridged). Adds the
156
+ publish workflow in ucode, the `unicode/` directory in archive-public,
157
+ and the fetch-data.sh updates in fontist.org.
158
+
159
+ **Estimated**: 2 sessions. Workflow + sync scripts + verification.
160
+
161
+ ## Phase 4 — Consumer wiring (parallel after Phase 3)
162
+
163
+ ### Track C1 — Per-font coverage audit (TODO 36)
164
+
165
+ **Branch**: `audit/per-font-coverage`
166
+ **PR**: ucode/PR-XX
167
+
168
+ Blocked by B1 (universal set is the reference). Extends
169
+ `ucode audit font/library` with coverage section. Outputs JSON
170
+ per-font/per-block coverage stats. Once B2 lands, this audit is
171
+ also what `fontist-archive-private/bin/build` produces per formula.
172
+
173
+ **Estimated**: 2 sessions.
174
+
175
+ ### Track C2 — Coverage highlight reporter (TODO 37)
176
+
177
+ **Branch**: `audit/highlight-reporter`
178
+ **PR**: ucode/PR-XX
179
+
180
+ Blocked by C1 (consumes audit data). HTML visualizer with per-block
181
+ missing-glyph grids, comparison view, library heatmap.
182
+
183
+ **Estimated**: 2–3 sessions.
184
+
185
+ ### Track C3 — fontist.org glyph consumer (TODO 38)
186
+
187
+ **Branch**: `feat/fontist-org-glyph-consumer` (in fontist/fontist.github.io repo)
188
+ **PR**: fontist.github.io/PR-XX
189
+
190
+ Blocked by B3 (universal set must be in fontist-archive-public
191
+ under `unicode/`). Independent of C1/C2 (different consumer). Wires
192
+ `UnicodeCharPage.vue` to render universal-set SVGs + provenance
193
+ badge.
194
+
195
+ **Estimated**: 2 sessions.
196
+
197
+ ## Sequencing rules
198
+
199
+ 1. **PR-per-TODO.** No bundled PRs unless tightly coupled (e.g.,
200
+ A2a + A2b could land together if A2b's fontist PRs are still
201
+ in review).
202
+
203
+ 2. **A3 + B2 can run in parallel with Phase 1/2.** Both are pure
204
+ code work and don't touch the curated config.
205
+
206
+ 3. **C1 depends on B1, C3 on B3, and C2 on C1.** C3 is independent of
206
+ C1/C2, so it can land in any order relative to them once B3 merges.
208
+
209
+ 4. **External PRs (fontist/formulas) don't block ucode progress.**
210
+ Until they merge, ucode uses direct URLs as fallback. Once they
211
+ merge, ucode's config can switch back to `kind: fontist`.
212
+
213
+ 5. **Merging requires explicit user authorization per PR.** No
214
+ auto-merge.
215
+
216
+ ## Branch naming
217
+
218
+ Following the convention in TODO.new/00-README.md:
219
+
220
+ - ucode repo: `audit/<track-slug>` (e.g. `audit/coverage-matrix`)
221
+ - fontist.org repo: `feat/<track-slug>` or `fix/<track-slug>` as
222
+ appropriate
223
+ - fontist-archive-private repo: `audit/<track-slug>`
224
+ - fontist-archive-public repo: `audit/<track-slug>`
225
+
226
+ ## What's NOT in this plan
227
+
228
+ These items are out of scope for the current Part 1/Part 2 directive:
229
+
230
+ - **CI for periodic re-build**: when Unicode versions update,
231
+ regenerate the set. Belongs in a separate TODO once the
232
+ infrastructure stabilizes.
233
+ - **Real-time glyph extraction**: users extracting glyphs on
234
+ demand via ucode-as-a-service. Not in scope; the universal set
235
+ is pre-built.
236
+ - **Color emoji extraction**: Noto Color Emoji uses CBDT/CBLC
237
+ bitmap tables, not vector outlines. Out of scope for vector
238
+ extraction; would need separate TODO.
239
+ - **Glyph diffing across Unicode versions**: tracking how a
240
+ codepoint's official glyph changes between Unicode X.Y and X.Z.
241
+ Useful but separate.
242
+
243
+ ## Acceptance
244
+
245
+ - [ ] Every TODO 32–41 lists its branch + PR-per-TODO commitment
246
+ - [ ] Critical path is unambiguous: A1 → A2 → B1 → {B3, C1} → {C2, C3}
247
+ - [ ] Parallel tracks (A3, B2) identified explicitly
248
+ - [ ] External dependencies (fontist/formulas PRs) called out
249
+ - [ ] Out-of-scope items listed so they don't creep in
250
+
251
+ ## References
252
+
253
+ - [TODO 28](28-implementation-order-update.md) — prior sequencing (23–31)
254
+ - [TODO 32](32-uc17-coverage-matrix.md) — Part 1 start
255
+ - [TODO 35](35-universal-set-production-run.md) — Part 1 end
256
+ - [TODO 36](36-per-font-coverage-audit.md) — Part 2 start
257
+ - [TODO 40](40-archive-private-uses-ucode-audit.md) — pipeline wiring
258
+ - [TODO 41](41-ucode-unicode-archive-bridge.md) — publishing pipeline
@@ -0,0 +1,124 @@
1
+ # 40 — fontist-archive-private bin/build uses ucode audit
2
+
3
+ ## Goal
4
+
5
+ Refactor `fontist-archive-private/bin/build` so it invokes
6
+ `ucode audit font` instead of the dead `Fontisan::Commands::AuditCommand`
7
+ path. The current script (last touched when fontisan still owned audits)
8
+ has a UCD-stub hack at lines 13–21 that returns empty UCD data —
9
+ exactly the functionality ucode provides natively via its own UCD
10
+ parse + cache.
11
+
12
+ This TODO is the engineering work to make the architecture doc
13
+ (coverage-architecture.md §"Build Pipeline") match reality.
14
+
15
+ ## Why a separate TODO
16
+
17
+ The audit migration (TODOs 06–12) ported fontisan's audit subsystem
18
+ into ucode. The CLI command `ucode audit font <path>` produces the
19
+ same shape of YAML that `Fontisan::Commands::AuditCommand` used to.
20
+ But fontist-archive-private's `bin/build` was never updated to call
21
+ the new tool — it still requires `fontisan` and stubs UCD out.
22
+
23
+ Three problems with the current state:
24
+
25
+ 1. **Coverage aggregations are empty.** Every audit YAML currently
26
+ has `blocks: []`, `unicode_scripts: []` because the UCD stub
27
+ returns nil. Consumers (fontist.org's coverage browser) see
28
+ per-font cmap lists but no per-block fill ratios.
29
+
30
+ 2. **Two sources of truth for "what's in UCD."** fontisan used to
31
+ auto-download `ucd.all.flat.xml` (removed per CLAUDE.md); the
32
+ stub hack papers over the missing file. ucode has its own
33
+ authoritative UCD parse under `~/.cache/ucode/`.
34
+
35
+ 3. **No universal-set reference.** Even if the UCD stub were removed,
36
+ fontisan can't compare a font's cmap to the canonical universal
37
+ glyph set. ucode can (TODO 35 produces the set; TODO 36 adds the
38
+ comparison).
39
+
40
+ ## Scope
41
+
42
+ ### Phase A — Swap audit invocation
43
+
44
+ 1. Replace lines 100–115 of `bin/build`:
45
+
46
+ ```ruby
47
+ # OLD (broken — uses Fontisan::Commands::AuditCommand + UCD stub)
48
+ cmd = Fontisan::Commands::AuditCommand.new(face_path, font_index: font_index, no_codepoints: false)
49
+ report = cmd.run
50
+ File.write(audit_path, report.to_yaml)
51
+ ```
52
+
53
+ with:
54
+
55
+ ```ruby
56
+ # NEW — invoke ucode audit via CLI (shelling out keeps the build
57
+ # script decoupled from ucode's internal API)
58
+ system("ucode", "audit", "font", face_path,
59
+ "--font-index", font_index.to_s,
60
+ "--output", audit_path,
61
+ out: File::NULL, err: verbose ? $stderr : File::NULL)
62
+ ```
63
+
64
+ OR via the Ruby API if shell-out overhead becomes measurable:
65
+
66
+ ```ruby
67
+ Ucode::Commands::Audit::FontCommand.new.call(
68
+ path: face_path,
69
+ font_index: font_index,
70
+ output: audit_path,
71
+ )
72
+ ```
73
+
74
+ 2. **Remove the UCD stub hack** (lines 13–21 of `bin/build`). ucode
75
+ has its own UCD cache; no stub needed.
76
+
77
+ 3. **Update Gemfile** — add `ucode` gem, keep `fontisan` for
78
+ ConvertCommand (WOFF generation), keep `excavate` for archive
79
+ extraction.
80
+
81
+ ### Phase B — Universal-set reference (after TODO 35)
82
+
83
+ 4. When the universal set is published (TODO 35 + TODO 41 bridge),
84
+ pass `--reference-universal-set=<path>` to every `ucode audit font`
85
+ invocation. The audit YAML gains a `coverage` section comparing
86
+ the font's cmap to the canonical per-block codepoint lists.
87
+
88
+ 5. The universal set lives in fontist-archive-public under
89
+ `unicode/universal-glyph-set/` (per TODO 41). fontist-archive-private's
90
+ CI checks it out shallow before invoking bin/build, so the audit
91
+ can reference it.
92
+
93
+ ### Phase C — Cleanup
94
+
95
+ 6. Remove the `module Fontisan::Audit::Context` monkey-patch
96
+ entirely. Dead code once fontisan's AuditCommand is no longer
97
+ called.
98
+
99
+ 7. Update `coverage-architecture.md` examples to match the new
100
+ `ucode_version` field in the audit YAML schema (replaces
101
+ `fontisan_version` for the audit producer; fontisan_version may
102
+ still appear as the parser-layer version).
103
+
104
+ 8. Specs: add a test fixture — small formula YAML + small TTF →
105
+ assert bin/build produces an audit YAML with non-empty `blocks`
106
+ and the new `ucode_version` field.
107
+
108
+ ## Acceptance
109
+
110
+ - [ ] `bin/build` invokes `ucode audit font` (not Fontisan)
111
+ - [ ] UCD stub hack removed
112
+ - [ ] Audit YAMLs include populated `blocks:` and `unicode_scripts:`
113
+ (not empty arrays)
114
+ - [ ] Audit YAML carries `ucode_version` field
115
+ - [ ] Universal-set comparison lands when TODO 35 + TODO 41 are done
116
+ - [ ] At least one formula end-to-end (e.g. `google/abeezee`) produces
117
+ a complete audit YAML via the new path
118
+
119
+ ## References
120
+
121
+ - `fontist-archive-private/bin/build` — current implementation
122
+ - `fontist.org/coverage-architecture.md` — target architecture (updated)
123
+ - [TODO 36](36-per-font-coverage-audit.md) — consumes the new audit data
124
+ - [TODO 41](41-ucode-unicode-archive-bridge.md) — universal-set publishing
@@ -0,0 +1,160 @@
1
+ # 41 — ucode Unicode artifacts → fontist-archive-public bridge
2
+
3
+ ## Goal
4
+
5
+ Publish ucode's Unicode-only artifacts (universal glyph set, block-feed,
6
+ per-codepoint JSONs) into `fontist-archive-public/unicode/` so
7
+ fontist.org has ONE source of truth for both per-font data AND
8
+ per-codepoint data.
9
+
10
+ Mirrors the existing fontisan pattern: ucode's CI builds artifacts in
11
+ its own repo, then a sync workflow publishes them to
12
+ fontist-archive-public.
13
+
14
+ ## Why a separate TODO
15
+
16
+ Today ucode's output lives only in the ucode repo under `output/`
17
+ (gitignored — too big to commit). PR #44 in fontist.org added a
18
+ `fetch-data.sh --with-ucode` flag that pulls from
19
+ `raw.githubusercontent.com/fontist/ucode/main/docs/public/` — direct
20
+ fetch, bypassing the archive.
21
+
22
+ Two problems with the direct-fetch approach:
23
+
24
+ 1. **Source/build separation is blurred.** The ucode repo would have
25
+ to commit built artifacts under `docs/public/` (1.2 GB if we ship
26
+ per-codepoint JSONs). Repo bloat, git history grows linearly with
27
+ Unicode versions.
28
+
29
+ 2. **Inconsistent with fontist-archive pattern.** Per-font data goes
30
+ private → public archive → site. Per-codepoint data should follow
31
+ the same shape.
32
+
33
+ This TODO introduces `fontist-archive-public/unicode/` as the
34
+ canonical public location for ucode's Unicode artifacts.
35
+
36
+ ## Scope
37
+
38
+ ### Phase A — ucode CI publishes to fontist-archive-public
39
+
40
+ 1. New GHA workflow in `fontist/ucode`: `.github/workflows/publish-unicode-archive.yml`
41
+ Triggers on:
42
+ - Push to main (after parse + universal-set build succeeds)
43
+ - Manual dispatch (regenerate without rebuilding UCD)
44
+
45
+ 2. The workflow runs:
46
+ - `ucode fetch ucd <version>` + `ucode fetch unihan <version>` +
47
+ `ucode fetch charts <version>` + `ucode fetch fonts`
48
+ - `ucode parse <version>` → produces `output/`
49
+ - `ucode block-feed --ucode-output=./output --target=./output/block-feed`
50
+ - `ucode universal-set build <version>` (TODO 35) → produces
51
+ `output/universal_glyph_set/`
52
+ - Sync into `fontist-archive-public` via git:
53
+
54
+ ```yaml
55
+ - name: Sync to fontist-archive-public
56
+ run: |
57
+ git clone --depth 1 https://${GH_TOKEN}@github.com/fontist/fontist-archive-public archive
58
+ rsync -a --delete output/block-feed/ archive/unicode/block-feed/
59
+ rsync -a --delete output/universal_glyph_set/ archive/unicode/universal-glyph-set/
60
+ # Per-codepoint JSONs are 1.2GB total — too big for git LFS.
61
+ # Either: (a) sample for production (Basic Latin + CJK subset);
62
+ # Or: (b) push to a release artifact, not the repo itself.
63
+ # Decision: per-codepoint JSONs ship via GitHub Release assets
64
+ # attached to the workflow run, NOT committed to the repo.
65
+ cd archive
66
+ git config user.email "ucode-bot@fontist.org"
67
+ git config user.name "ucode-bot"
68
+ git commit -am "Sync Unicode data from ucode@${GITHUB_SHA}"
69
+ git push origin main
70
+ ```
71
+
72
+ 3. Per-codepoint JSONs (`output/blocks/<ID>/<U+XXXX>/index.json`) —
73
+ 1.2 GB total, too big for the repo. Publish as a Release asset
74
+ `.tar.zst` per Unicode version. fontist.org's fetch-data.sh
75
+ downloads + extracts on demand.
76
+
77
+ ### Phase B — fontist-archive-public structure
78
+
79
+ 4. The public archive gains a new top-level `unicode/` directory:
80
+
81
+ ```
82
+ fontist-archive-public/
83
+ ├── coverage/ # existing — per-font audit YAMLs
84
+ ├── fonts/ # existing — WOFF specimens
85
+ ├── fonts.json # existing
86
+ ├── unicode/ # NEW — ucode output
87
+ │ ├── block-feed/
88
+ │ │ ├── unicode-blocks.json
89
+ │ │ ├── unicode-version.json
90
+ │ │ └── unicode/blocks/<slug>.json
91
+ │ ├── universal-glyph-set/
92
+ │ │ ├── manifest.json
93
+ │ │ ├── entries/U+XXXX.json
94
+ │ │ └── glyphs/U+XXXX.svg
95
+ │ └── codepoints-{version}.tar.zst # release asset link
96
+ ├── bin/sync-from-private
97
+ └── .github/workflows/sync.yml
98
+ ```
99
+
100
+ 5. Update `fontist-archive-public/bin/sync-from-private` to also
101
+ accept the ucode sync (or add a separate `sync-from-ucode` script).
102
+ The sync workflow in fontist-archive-public triggers on:
103
+ - fontist-archive-private pushes (existing — coverage/woff sync)
104
+ - ucode publish workflow run (new — unicode/ sync)
105
+
106
+ ### Phase C — fontist.org fetch-data.sh
107
+
108
+ 6. Extend `scripts/fetch-data.sh` to also copy `unicode/` from
109
+ `fontist-archive-public`:
110
+
111
+ ```bash
112
+ log "copying unicode/block-feed/"
113
+ mkdir -p "$PUBLIC/unicode"
114
+ if [[ -d "$TMP/archive/unicode/block-feed" ]]; then
115
+ cp -r "$TMP/archive/unicode/block-feed/." "$PUBLIC/unicode/"
116
+ fi
117
+
118
+ log "copying unicode/universal-glyph-set/"
119
+ if [[ -d "$TMP/archive/unicode/universal-glyph-set" ]]; then
120
+ cp -r "$TMP/archive/unicode/universal-glyph-set/." "$PUBLIC/unicode/glyphs/"
121
+ fi
122
+ ```
123
+
124
+ 7. The `--with-ucode` flag from PR #44 becomes a no-op (or redirects
125
+ to a warning to update fetch-data.sh). All ucode data flows through
126
+ the archive.
127
+
128
+ 8. For per-codepoint JSONs (1.2 GB total, shipped as a `.tar.zst` Release asset): add a `--with-codepoints`
129
+ flag to fetch-data.sh. Default OFF — production doesn't need all
130
+ 299k JSONs; local dev can opt in. When ON, download the Release
131
+ asset, extract to `public/codepoints/`.
132
+
133
+ ### Phase D — Versioning
134
+
135
+ 9. `unicode/block-feed/unicode-version.json` (copied to `unicode/unicode-version.json`
136
+ by fetch-data.sh) records the UCD version. fontist.org reads this to display "Unicode 17.0.0 data, refreshed <date>".
137
+
138
+ 10. When a new Unicode version drops (UCD 18.0.0), ucode publishes a
139
+ NEW versioned directory:
140
+ `unicode/v18/block-feed/`, `unicode/v18/universal-glyph-set/`.
141
+ fontist.org can pin to a specific version.
142
+
143
+ ## Acceptance
144
+
145
+ - [ ] ucode GHA workflow runs end-to-end on push to main
146
+ - [ ] fontist-archive-public gains `unicode/block-feed/` and
147
+ `unicode/universal-glyph-set/`
148
+ - [ ] Per-codepoint JSONs ship as a Release asset (not in-repo)
149
+ - [ ] fontist.org `fetch-data.sh` copies `unicode/` from the archive
150
+ (no more direct raw.githubusercontent.com fetch)
151
+ - [ ] `unicode-version.json` reflects the current UCD version
152
+ - [ ] ucode repo stays lean (no built artifacts committed)
153
+
154
+ ## References
155
+
156
+ - [TODO 35](35-universal-set-production-run.md) — universal-set build
157
+ - [TODO 38](38-fontist-org-glyph-consumer.md) — consumer side
158
+ - [TODO 40](40-archive-private-uses-ucode-audit.md) — per-font audit pipeline
159
+ - `fontist.org/scripts/fetch-data.sh` — consumer (needs Phase C update)
160
+ - `fontist.org/coverage-architecture.md` — updated architecture
@@ -0,0 +1,102 @@
1
+ ---
2
+ # Specialist Tier 1 fonts not in fontist's formula index.
3
+ #
4
+ # Each entry has:
5
+ # label — human-readable provenance key (matches the labels used in
6
+ # config/unicode17_universal_glyph_set.yml)
7
+ # version — upstream version string
8
+ # license — OFL is the default. Anything else requires
9
+ # --allow-proprietary on `ucode fetch fonts`.
10
+ # url — canonical download URL. null = local-only (the
11
+ # user supplies the file at `path`).
12
+ # sha256 — null until the first successful `ucode fetch fonts`
13
+ # run; the fetcher computes it and writes it back
14
+ # here as a checkpoint. After that, mismatches raise
15
+ # Ucode::FontChecksumError.
16
+ # path — destination under data/fonts/ (relative) or absolute.
17
+ # Local-only entries may use ~ and shell globs.
18
+ # extract — true if `url` is a zip; the fetcher extracts only
19
+ # `extract_member` to `path`.
20
+ # extract_member — filename to pull out of the zip (required when
21
+ # extract: true).
22
+ # provenance — short citation; what this font covers and why we
23
+ # need it.
24
+ #
25
+ # URLs are the canonical sources named in TODO.new/30. The first
26
+ # `ucode fetch fonts` run verifies each URL resolves and records the
27
+ # SHA256 here as a checkpoint commit.
28
+ fonts:
29
+ - label: Lentariso
30
+ version: '1.033'
31
+ license: OFL
32
+ url: https://github.com/Bry10022/Lentariso/releases/download/1.033/Lentariso.otf
33
+ sha256: null
34
+ path: data/fonts/Lentariso.otf
35
+ extract: false
36
+ provenance: github.com/Bry10022/Lentariso — Imperial Aramaic, Phoenician, Sidetic
37
+
38
+ - label: Kedebideri
39
+ version: '3.001'
40
+ license: OFL
41
+ url: https://software.sil.org/downloads/r/kedebideri/Kedebideri-3.001.zip
42
+ sha256: null
43
+ path: data/fonts/Kedebideri-Regular.ttf
44
+ extract: true
45
+ extract_member: Kedebideri-Regular.ttf
46
+ provenance: SIL, first Unicode font for Beria Erfe
47
+
48
+ - label: NotoSerifTaiYo
49
+ version: draft-2025-09
50
+ license: OFL
51
+ url: https://translationcommons.org/wp-content/uploads/2025/09/NotoSerifTaiYo.ttf
52
+ sha256: null
53
+ path: data/fonts/NotoSerifTaiYo.ttf
54
+ extract: false
55
+ provenance: translationcommons.org, pre-release Noto variant for Tai Yo
56
+
57
+ - label: UniHieroglyphica
58
+ version: '16.0'
59
+ license: OFL
60
+ url: https://www.suignard.com/UniHieroglyphica/UniHieroglyphica-16.0.zip
61
+ sha256: null
62
+ path: data/fonts/UniHieroglyphica.ttf
63
+ extract: true
64
+ extract_member: UniHieroglyphica.ttf
65
+ provenance: suignard.com, authoritative for Egyptian Hieroglyphs
66
+
67
+ - label: EgyptianText
68
+ version: '1.0'
69
+ license: OFL
70
+ url: https://github.com/microsoft/font-tools/releases/download/v1.0/EgyptianText-Regular.ttf
71
+ sha256: null
72
+ path: data/fonts/EgyptianText-Regular.ttf
73
+ extract: false
74
+ provenance: microsoft/font-tools — Egyptian Hieroglyph Format Controls block
75
+
76
+ - label: BabelStonePseudographica
77
+ version: '2024-09-10'
78
+ license: OFL
79
+ url: https://www.babelstone.co.uk/Fonts/Download/BabelStonePseudographica.zip
80
+ sha256: null
81
+ path: data/fonts/BabelStonePseudographica.ttf
82
+ extract: true
83
+ extract_member: BabelStonePseudographica.ttf
84
+ provenance: BabelStone, partial Unicode 17 symbol coverage
85
+
86
+ - label: Symbola
87
+ version: '13.0'
88
+ license: OFL
89
+ url: https://dn-works.com/wp-content/uploads/2020/ufas/Symbola.zip
90
+ sha256: null
91
+ path: data/fonts/Symbola.ttf
92
+ extract: true
93
+ extract_member: Symbola.ttf
94
+ provenance: dn-works.com, broad Unicode symbol coverage
95
+
96
+ - label: FSung
97
+ version: '2024'
98
+ license: OFL
99
+ url: null
100
+ path: "~/Downloads/全宋體/FSung-*.ttf"
101
+ extract: false
102
+ provenance: Taiwan MOE 全宋體 — local-only, user-supplied. Covers CJK Unified Ideographs Extension J.
@@ -0,0 +1,42 @@
1
+ # Unicode 17.0 Tier 1 font mapping.
2
+ #
3
+ # Block → preferred Tier 1 real fonts. Each entry is a font specifier
4
+ # resolvable by Ucode::Glyphs::RealFonts::FontLocator:
5
+ #
6
+ # - "LabelName=/absolute/path/to/font.ttf" — direct path with a
7
+ # human label. Used for fonts not in fontist's formula index.
8
+ # - "fontist-formula-name" — resolved via `fontist find/install`.
9
+ #
10
+ # Block names use the original Unicode verbatim form, matching UCD
11
+ # Blocks.txt. Do not slugify.
12
+ #
13
+ # Populated from docs/unicode17-coverage-baseline.md. See that file
14
+ # for cmap-verified coverage numbers and pending font acquisitions.
15
+
16
+ tier1_fonts:
17
+ # ✅ 4298/4298 cmap-verified via FSung-3 (全宋體, locally available).
18
+ # FSung-3 also covers CJK Extension H at 100%.
19
+ CJK_Unified_Ideographs_Extension_J:
20
+ - FSung-3=/Users/mulgogi/Downloads/全宋體/FSung-3.ttf
21
+
22
+ # ✅ 4192/4192 cmap-verified via FSung-3.
23
+ CJK_Unified_Ideographs_Extension_H:
24
+ - FSung-3=/Users/mulgogi/Downloads/全宋體/FSung-3.ttf
25
+
26
+ # ✅ 88/88 cmap-verified via Noto Sans Adlam (Unicode 16 baseline).
27
+ # Unicode 17 reportedly adds 29 more; pending re-audit with updated font.
28
+ Adlam:
29
+ - NotoSansAdlam=spec/fixtures/fonts/NotoSansAdlam-Regular.ttf
30
+
31
+ # The blocks below have cmap-verified coverage from prior sessions
32
+ # (see docs/unicode17-coverage-baseline.md) but their fonts are not
33
+ # currently on disk. Uncomment after acquiring the fonts and
34
+ # replacing the placeholder path:
35
+ #
36
+ # Sidetic (26/26 via Lentariso ≥1.029):
37
+ # Sidetic:
38
+ # - Lentariso=/path/to/Lentariso-Re.ttf
39
+ #
40
+ # Beria Erfe (50/50 via Kedebideri 3.001):
41
+ # Beria_Erfe:
42
+ # - Kedebideri=/path/to/Kedebideri-Regular.ttf