ucode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +211 -0
  3. data/Gemfile +22 -0
  4. data/Gemfile.lock +406 -0
  5. data/README.md +469 -0
  6. data/Rakefile +18 -0
  7. data/TODO.new/00-README.md +66 -0
  8. data/TODO.new/01-pillar-terminology-alignment.md +69 -0
  9. data/TODO.new/02-audit-schema-design.md +255 -0
  10. data/TODO.new/03-directory-output-spec.md +203 -0
  11. data/TODO.new/04-fontist-org-contract.md +173 -0
  12. data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
  13. data/TODO.new/06-audit-namespace-skeleton.md +105 -0
  14. data/TODO.new/07-audit-models-port.md +132 -0
  15. data/TODO.new/08-extractors-cheap-port.md +113 -0
  16. data/TODO.new/09-extractors-expensive-port.md +99 -0
  17. data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
  18. data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
  19. data/TODO.new/12-formatters-port.md +115 -0
  20. data/TODO.new/13-directory-emitter.md +147 -0
  21. data/TODO.new/14-html-face-browser.md +144 -0
  22. data/TODO.new/15-html-library-browser.md +102 -0
  23. data/TODO.new/16-cli-audit-subcommands.md +142 -0
  24. data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
  25. data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
  26. data/TODO.new/19-fontisan-docs-update.md +155 -0
  27. data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
  28. data/TODO.new/21-canonical-unicode17-build.md +148 -0
  29. data/TODO.new/22-implementation-order.md +176 -0
  30. data/UCODE_CHANGELOG.md +97 -0
  31. data/exe/ucode +8 -0
  32. data/lib/ucode/aggregator.rb +77 -0
  33. data/lib/ucode/audit/block_aggregator.rb +90 -0
  34. data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
  35. data/lib/ucode/audit/context.rb +137 -0
  36. data/lib/ucode/audit/discrepancy_detector.rb +213 -0
  37. data/lib/ucode/audit/extractors/aggregations.rb +70 -0
  38. data/lib/ucode/audit/extractors/base.rb +21 -0
  39. data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
  40. data/lib/ucode/audit/extractors/coverage.rb +55 -0
  41. data/lib/ucode/audit/extractors/hinting.rb +199 -0
  42. data/lib/ucode/audit/extractors/identity.rb +65 -0
  43. data/lib/ucode/audit/extractors/licensing.rb +75 -0
  44. data/lib/ucode/audit/extractors/metrics.rb +108 -0
  45. data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
  46. data/lib/ucode/audit/extractors/provenance.rb +34 -0
  47. data/lib/ucode/audit/extractors/style.rb +88 -0
  48. data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
  49. data/lib/ucode/audit/extractors.rb +31 -0
  50. data/lib/ucode/audit/plane_aggregator.rb +37 -0
  51. data/lib/ucode/audit/registry.rb +63 -0
  52. data/lib/ucode/audit/script_aggregator.rb +92 -0
  53. data/lib/ucode/audit.rb +27 -0
  54. data/lib/ucode/cache.rb +113 -0
  55. data/lib/ucode/cli.rb +272 -0
  56. data/lib/ucode/commands/build.rb +68 -0
  57. data/lib/ucode/commands/cache.rb +46 -0
  58. data/lib/ucode/commands/fetch.rb +62 -0
  59. data/lib/ucode/commands/font_coverage.rb +57 -0
  60. data/lib/ucode/commands/glyphs.rb +136 -0
  61. data/lib/ucode/commands/lookup.rb +65 -0
  62. data/lib/ucode/commands/parse.rb +62 -0
  63. data/lib/ucode/commands/site.rb +33 -0
  64. data/lib/ucode/commands.rb +19 -0
  65. data/lib/ucode/config.rb +110 -0
  66. data/lib/ucode/coordinator/indices.rb +34 -0
  67. data/lib/ucode/coordinator.rb +397 -0
  68. data/lib/ucode/database.rb +214 -0
  69. data/lib/ucode/db_builder.rb +107 -0
  70. data/lib/ucode/error.rb +96 -0
  71. data/lib/ucode/fetch/code_charts.rb +57 -0
  72. data/lib/ucode/fetch/http.rb +83 -0
  73. data/lib/ucode/fetch/ucd_zip.rb +57 -0
  74. data/lib/ucode/fetch/unihan_zip.rb +57 -0
  75. data/lib/ucode/fetch.rb +14 -0
  76. data/lib/ucode/glyphs/cell_extractor.rb +130 -0
  77. data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
  78. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
  79. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
  80. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
  81. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
  82. data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
  83. data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
  84. data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
  85. data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
  86. data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
  87. data/lib/ucode/glyphs/grid.rb +30 -0
  88. data/lib/ucode/glyphs/grid_detector.rb +165 -0
  89. data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
  90. data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
  91. data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
  92. data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
  93. data/lib/ucode/glyphs/last_resort/source.rb +125 -0
  94. data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
  95. data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
  96. data/lib/ucode/glyphs/last_resort.rb +36 -0
  97. data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
  98. data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
  99. data/lib/ucode/glyphs/page_renderer.rb +221 -0
  100. data/lib/ucode/glyphs/path_bbox.rb +62 -0
  101. data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
  102. data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
  103. data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
  104. data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
  105. data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
  106. data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
  107. data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
  108. data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
  109. data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
  110. data/lib/ucode/glyphs/real_fonts.rb +32 -0
  111. data/lib/ucode/glyphs/writer.rb +250 -0
  112. data/lib/ucode/glyphs.rb +27 -0
  113. data/lib/ucode/index.rb +106 -0
  114. data/lib/ucode/index_builder.rb +94 -0
  115. data/lib/ucode/models/audit/audit_axis.rb +30 -0
  116. data/lib/ucode/models/audit/audit_diff.rb +77 -0
  117. data/lib/ucode/models/audit/audit_report.rb +137 -0
  118. data/lib/ucode/models/audit/baseline.rb +32 -0
  119. data/lib/ucode/models/audit/block_summary.rb +72 -0
  120. data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
  121. data/lib/ucode/models/audit/codepoint_range.rb +39 -0
  122. data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
  123. data/lib/ucode/models/audit/color_capabilities.rb +91 -0
  124. data/lib/ucode/models/audit/discrepancy.rb +38 -0
  125. data/lib/ucode/models/audit/duplicate_group.rb +23 -0
  126. data/lib/ucode/models/audit/embedding_type.rb +81 -0
  127. data/lib/ucode/models/audit/field_change.rb +28 -0
  128. data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
  129. data/lib/ucode/models/audit/gasp_range.rb +63 -0
  130. data/lib/ucode/models/audit/hinting.rb +99 -0
  131. data/lib/ucode/models/audit/library_summary.rb +40 -0
  132. data/lib/ucode/models/audit/licensing.rb +48 -0
  133. data/lib/ucode/models/audit/metrics.rb +111 -0
  134. data/lib/ucode/models/audit/named_instance.rb +41 -0
  135. data/lib/ucode/models/audit/opentype_layout.rb +38 -0
  136. data/lib/ucode/models/audit/plane_summary.rb +31 -0
  137. data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
  138. data/lib/ucode/models/audit/script_features.rb +28 -0
  139. data/lib/ucode/models/audit/script_summary.rb +54 -0
  140. data/lib/ucode/models/audit/variation_detail.rb +42 -0
  141. data/lib/ucode/models/audit.rb +50 -0
  142. data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
  143. data/lib/ucode/models/bidi_mirroring.rb +19 -0
  144. data/lib/ucode/models/binary_property_assignment.rb +26 -0
  145. data/lib/ucode/models/block.rb +36 -0
  146. data/lib/ucode/models/case_folding_rule.rb +23 -0
  147. data/lib/ucode/models/cjk_radical.rb +23 -0
  148. data/lib/ucode/models/codepoint/bidi.rb +28 -0
  149. data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
  150. data/lib/ucode/models/codepoint/case_folding.rb +25 -0
  151. data/lib/ucode/models/codepoint/casing.rb +32 -0
  152. data/lib/ucode/models/codepoint/decomposition.rb +27 -0
  153. data/lib/ucode/models/codepoint/display.rb +24 -0
  154. data/lib/ucode/models/codepoint/emoji.rb +29 -0
  155. data/lib/ucode/models/codepoint/hangul.rb +20 -0
  156. data/lib/ucode/models/codepoint/identifier.rb +30 -0
  157. data/lib/ucode/models/codepoint/indic.rb +20 -0
  158. data/lib/ucode/models/codepoint/joining.rb +20 -0
  159. data/lib/ucode/models/codepoint/normalization.rb +35 -0
  160. data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
  161. data/lib/ucode/models/codepoint.rb +122 -0
  162. data/lib/ucode/models/name_alias.rb +21 -0
  163. data/lib/ucode/models/named_sequence.rb +19 -0
  164. data/lib/ucode/models/names_list_entry.rb +38 -0
  165. data/lib/ucode/models/plane.rb +36 -0
  166. data/lib/ucode/models/property_alias.rb +24 -0
  167. data/lib/ucode/models/property_value_alias.rb +26 -0
  168. data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
  169. data/lib/ucode/models/relationship/cross_reference.rb +17 -0
  170. data/lib/ucode/models/relationship/footnote.rb +24 -0
  171. data/lib/ucode/models/relationship/informal_alias.rb +18 -0
  172. data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
  173. data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
  174. data/lib/ucode/models/relationship.rb +57 -0
  175. data/lib/ucode/models/script.rb +41 -0
  176. data/lib/ucode/models/special_casing_rule.rb +28 -0
  177. data/lib/ucode/models/standardized_variant.rb +24 -0
  178. data/lib/ucode/models/unihan_entry.rb +23 -0
  179. data/lib/ucode/models.rb +47 -0
  180. data/lib/ucode/parsers/auxiliary.rb +26 -0
  181. data/lib/ucode/parsers/base.rb +137 -0
  182. data/lib/ucode/parsers/bidi_brackets.rb +41 -0
  183. data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
  184. data/lib/ucode/parsers/blocks.rb +63 -0
  185. data/lib/ucode/parsers/case_folding.rb +53 -0
  186. data/lib/ucode/parsers/cjk_radicals.rb +102 -0
  187. data/lib/ucode/parsers/derived_age.rb +59 -0
  188. data/lib/ucode/parsers/derived_core_properties.rb +60 -0
  189. data/lib/ucode/parsers/extracted_properties.rb +74 -0
  190. data/lib/ucode/parsers/name_aliases.rb +44 -0
  191. data/lib/ucode/parsers/named_sequences.rb +51 -0
  192. data/lib/ucode/parsers/names_list.rb +250 -0
  193. data/lib/ucode/parsers/property_aliases.rb +41 -0
  194. data/lib/ucode/parsers/property_value_aliases.rb +46 -0
  195. data/lib/ucode/parsers/script_extensions.rb +64 -0
  196. data/lib/ucode/parsers/scripts.rb +60 -0
  197. data/lib/ucode/parsers/special_casing.rb +62 -0
  198. data/lib/ucode/parsers/standardized_variants.rb +56 -0
  199. data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
  200. data/lib/ucode/parsers/unicode_data.rb +268 -0
  201. data/lib/ucode/parsers/unihan.rb +125 -0
  202. data/lib/ucode/parsers.rb +35 -0
  203. data/lib/ucode/range_entry.rb +58 -0
  204. data/lib/ucode/repo/aggregate_writer.rb +364 -0
  205. data/lib/ucode/repo/atomic_writes.rb +48 -0
  206. data/lib/ucode/repo/codepoint_writer.rb +96 -0
  207. data/lib/ucode/repo/paths.rb +122 -0
  208. data/lib/ucode/repo.rb +22 -0
  209. data/lib/ucode/site/config_emitter.rb +124 -0
  210. data/lib/ucode/site/generator.rb +178 -0
  211. data/lib/ucode/site/search_index.rb +68 -0
  212. data/lib/ucode/site/template/.gitignore +4 -0
  213. data/lib/ucode/site/template/.vitepress/config.ts +8 -0
  214. data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
  215. data/lib/ucode/site/template/char/[codepoint].md +13 -0
  216. data/lib/ucode/site/template/components/BlockView.vue +57 -0
  217. data/lib/ucode/site/template/components/CharView.vue +85 -0
  218. data/lib/ucode/site/template/components/PlaneView.vue +56 -0
  219. data/lib/ucode/site/template/components/SearchView.vue +66 -0
  220. data/lib/ucode/site/template/index.md +25 -0
  221. data/lib/ucode/site/template/package.json +18 -0
  222. data/lib/ucode/site/template/search.md +9 -0
  223. data/lib/ucode/site.rb +13 -0
  224. data/lib/ucode/version.rb +5 -0
  225. data/lib/ucode/version_resolver.rb +76 -0
  226. data/lib/ucode.rb +74 -0
  227. data/ucode.gemspec +56 -0
  228. metadata +404 -0
@@ -0,0 +1,221 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "open3"
4
+ require "pathname"
5
+ require "tmpdir"
6
+
7
+ require "ucode/error"
8
+
9
module Ucode
  module Glyphs
    # Strategy interface for PDF-page-to-SVG rendering.
    #
    # Subclasses implement `renderer_name`, `binary_name`, and
    # `build_command`. The base class handles the availability check,
    # command execution, error handling, and the renderer registry.
    #
    # **OCP**: a new renderer is a new subclass file + one entry in
    # `KNOWN_RENDERERS`. The base class and existing renderers are not
    # modified.
    #
    # **Vector-only requirement**: every renderer here must emit SVG
    # `<path>` elements (vector data) for the Code Charts PDFs, not
    # raster images.
    class PageRenderer
      OUTPUT_FORMAT = :svg

      # Fixture used by `works?` to smoke-test renderers. Only the path is
      # computed at load time; its existence is checked inside `works?`, so
      # environments without spec assets (an installed gem) load fine.
      DEFAULT_SMOKE_FIXTURE =
        File.expand_path("../../../spec/fixtures/pdfs/basic_latin.pdf", __dir__)

      # Ordered list of known concrete renderer class names (as symbols),
      # most-preferred first. Resolved lazily via `const_get` in `all`, so
      # loading this file does not require the renderer files (each of
      # which requires this file to inherit from PageRenderer).
      KNOWN_RENDERERS = %i[
        MutoolRenderer
        Pdf2svgRenderer
        DvisvgmRenderer
        PdftocairoRenderer
      ].freeze
      private_constant :KNOWN_RENDERERS

      class << self
        # @return [Symbol] short identifier (e.g. :mutool)
        def renderer_name
          raise NotImplementedError
        end

        # @return [String, Symbol] the binary looked up on PATH
        def binary_name
          raise NotImplementedError
        end

        # @return [Symbol] always :svg for now; future formats (png, etc.)
        #   would warrant a separate renderer family.
        def output_format
          OUTPUT_FORMAT
        end

        # Build the argv for the renderer. Subclasses return an Array
        # suitable for `Open3.capture2e` (no shell interpolation).
        # @param pdf_path [Pathname, String]
        # @param page_num [Integer] 1-indexed
        # @param out_path [Pathname, String]
        # @return [Array<String>]
        def build_command(pdf_path, page_num, out_path)
          raise NotImplementedError
        end

        # Checks whether `binary_name` resolves to an executable file.
        #
        # The lookup is done in-process (no `which` subprocess, no
        # `/dev/null` redirection), so it also works where `which` is not
        # installed and always returns a strict Boolean. A `binary_name`
        # containing a path separator is checked as a path instead of
        # being searched on PATH.
        #
        # @return [Boolean] true if the binary is on PATH
        def available?
          binary = binary_name.to_s
          return false if binary.empty?
          return executable_file?(binary) if binary.include?(File::SEPARATOR)

          executable_on_path?(binary)
        end

        # Smoke-test the binary by actually rendering one page of the
        # fixture PDF AND verifying the output format is consumable by
        # the downstream `GridDetector` / `CellExtractor` pipeline.
        #
        # Three things can make a renderer unusable for this codebase:
        #   1. Binary not on PATH (`available?` catches this).
        #   2. Binary on PATH but silently broken (e.g. Ubuntu's
        #      `mupdf-tools` is built without LCMS, so `mutool` warns
        #      "ICC support is not available" and emits zero bytes for
        #      ICC-profiled PDFs).
        #   3. Binary works but emits a flat-path SVG that GridDetector
        #      can't parse (mutool's format: `<path id="font_X_Y">`
        #      directly in `<defs>`, no `<use>` references). The grid
        #      detector requires the `<g id="glyph-N-M">` + `<use>` form
        #      produced by pdftocairo / pdf2svg.
        #
        # This method is NOT memoized: every call re-runs the smoke
        # render. The per-process cache lives in `working`.
        #
        # When no fixture PDF is available (e.g. installed gem without
        # spec assets), degrades to `available?` — we can't smoke-test
        # without input, so we trust the binary's presence on PATH.
        #
        # @param fixture_pdf [String, Pathname] small one-page PDF used
        #   for the smoke render. Defaults to the project's
        #   `basic_latin.pdf` spec fixture.
        # @return [Boolean]
        def works?(fixture_pdf: DEFAULT_SMOKE_FIXTURE)
          return false unless available?
          return true unless File.exist?(fixture_pdf.to_s) # nothing to verify against; trust PATH

          smoke_render_ok?(fixture_pdf)
        end

        # Render one page of `pdf_path` to `out_path` as SVG.
        # @param pdf_path [Pathname, String]
        # @param page_num [Integer] 1-indexed
        # @param out_path [Pathname, String]
        # @return [Symbol] :ok on success
        # @raise [Ucode::PdfRenderError] on failure (non-zero exit,
        #   output file missing or empty, binary unavailable, or the
        #   command could not be spawned)
        def render(pdf_path, page_num, out_path)
          unless available?
            raise PdfRenderError.new(
              "binary '#{binary_name}' not available on PATH",
              context: { renderer: name, binary: binary_name },
            )
          end

          out = Pathname.new(out_path)
          out.dirname.mkpath

          cmd = build_command(Pathname.new(pdf_path), page_num, out)
          output, status = run_command(cmd)

          unless status.success? && out.exist? && out.size.positive?
            raise PdfRenderError.new(
              "render failed for page #{page_num} of #{pdf_path} via '#{binary_name}'",
              context: {
                renderer: name,
                binary: binary_name,
                exit_status: status.exitstatus,
                output: output,
              },
            )
          end

          :ok
        end

        # ---- Registry ----

        # @return [Array<Class>] every known concrete renderer
        def all
          @all ||= KNOWN_RENDERERS.map { |n| Ucode::Glyphs.const_get(n) }.freeze
        end

        # @return [Array<Class>] renderers whose binary is installed
        def available
          all.select(&:available?)
        end

        # @return [Array<Class>] renderers that pass `works?` with the
        #   default fixture. Computed on first call and cached until
        #   `reset_working_cache!`.
        def working
          return @working if @working

          @working = all.select(&:works?).freeze
        end

        # Clear the cached `working` list. Useful when the environment
        # changes (e.g. a binary is installed mid-process) or in tests.
        def reset_working_cache!
          @working = nil
        end

        # @param name [Symbol, String]
        # @return [Class, nil]
        def find(name)
          wanted = name.to_sym
          all.find { |r| r.renderer_name == wanted }
        end

        # @return [Class, nil] the first working renderer; when no
        #   renderer passes the smoke test, the first renderer whose
        #   binary is merely present on PATH. nil if nothing is installed.
        def default
          working.first || available.first
        end

        private

        # Spawns `cmd` and returns `[combined_output, status]`. Spawn
        # failures (binary vanished, not executable, ...) are converted to
        # PdfRenderError so callers only ever see the documented error.
        def run_command(cmd)
          Open3.capture2e(*cmd)
        rescue SystemCallError => e
          raise PdfRenderError.new(
            "could not execute '#{binary_name}': #{e.message}",
            context: {
              renderer: name,
              binary: binary_name,
              exit_status: nil,
              output: e.message,
            },
          )
        end

        # True when some PATH directory holds an executable named
        # `binary`. Empty PATH entries are skipped. On Windows the
        # PATHEXT suffixes are tried as well.
        def executable_on_path?(binary)
          suffixes = [""] + ENV.fetch("PATHEXT", "").split(";")
          dirs = ENV.fetch("PATH", "").split(File::PATH_SEPARATOR).reject(&:empty?)
          dirs.any? do |dir|
            suffixes.any? { |ext| executable_file?(File.join(dir, "#{binary}#{ext}")) }
          end
        end

        # Directories can carry the executable bit too, hence `file?`.
        def executable_file?(path)
          File.file?(path) && File.executable?(path)
        end

        # @param fixture_pdf [String] path to an existing PDF
        # @return [Boolean] true iff rendering page 1 produces an SVG
        #   with the `<g id="glyph-N-M">` + `<use>` form that
        #   `GridDetector` requires.
        def smoke_render_ok?(fixture_pdf)
          Dir.mktmpdir("renderer-smoke-") do |dir|
            out = File.join(dir, "smoke.svg")
            render(fixture_pdf, 1, out)
            svg_has_pipeline_format?(out)
          end
        rescue PdfRenderError
          false
        end

        # Cheap textual check: an `<svg` root, at least one `<use`, and a
        # `glyph-N-M` id somewhere in the file.
        def svg_has_pipeline_format?(out_path)
          return false unless File.exist?(out_path)
          return false unless File.size(out_path).positive?

          body = File.read(out_path)
          body.include?("<svg") &&
            body.include?("<use") &&
            body.match?(/id="glyph-\d+-\d+"/)
        end
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  module Glyphs
    # Estimates the axis-aligned bounding box of an SVG `<path>` `d`
    # attribute from the coordinates written in the path data. Control
    # points are included, so for line and Bezier segments the true curve
    # bbox is always contained within the estimate. For grid detection and
    # cell membership tests, the over-estimate is sufficient and avoids the
    # cost of a Bezier solver.
    #
    # The scan is command-aware:
    #   * `M`, `L`, `T`, `C`, `S`, `Q` operands are read as (x, y) pairs;
    #   * `H` operands are x values only, `V` operands are y values only;
    #   * `A` contributes only its end point (operands 6 and 7 of each
    #     group of seven) — radii, rotation and flags are not coordinates.
    #     The arc's bulge is therefore NOT covered by the estimate;
    #   * numbers that appear before any command letter are read as pairs;
    #   * a dangling operand without a partner is ignored.
    #
    # Only absolute coordinates are supported. Relative commands (lowercase
    # `m`, `l`, `c`, ...) are parsed with the same operand layout but their
    # values are taken as if they were absolute, which is wrong for real
    # relative data. If relative commands appear, convert the path with a
    # proper SVG path parser before calling this.
    module PathBbox
      # One SVG number: optional sign, digits with optional fraction or a
      # bare leading-dot fraction (`.5`), optional exponent.
      NUMBER = /[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?/.freeze

      # SVG path command letters. `e`/`E` is deliberately absent, so an
      # exponent can never be mistaken for a command.
      COMMAND = /[MLHVCSQTAZ]/i.freeze

      TOKEN = Regexp.union(COMMAND, NUMBER).freeze
      private_constant :COMMAND, :TOKEN

      # Bounding box; every member is nil when no coordinates were found.
      Result = Struct.new(:min_x, :min_y, :max_x, :max_y, keyword_init: true) do
        # @return [Float, nil] horizontal extent, nil for an empty box
        def width
          return nil if empty?

          max_x - min_x
        end

        # @return [Float, nil] vertical extent, nil for an empty box
        def height
          return nil if empty?

          max_y - min_y
        end

        # @return [Boolean] true when any bound is missing
        def empty?
          min_x.nil? || min_y.nil? || max_x.nil? || max_y.nil?
        end
      end

      class << self
        # @param path_d [String, nil] contents of a `<path d="...">`
        # @return [Result] bounds as Floats; an all-nil Result when the
        #   input is nil, empty, or holds no coordinates
        def estimate(path_d)
          return Result.new if path_d.nil? || path_d.empty?

          xs = []
          ys = []
          command = nil
          operands = []

          path_d.scan(TOKEN) do |token|
            if COMMAND.match?(token)
              # A new command letter closes the previous operand run.
              collect(command, operands, xs, ys)
              command = token.upcase
              operands = []
            else
              operands << token.to_f
            end
          end
          collect(command, operands, xs, ys)

          Result.new(min_x: xs.min, min_y: ys.min, max_x: xs.max, max_y: ys.max)
        end

        private

        # Appends the coordinates carried by one command's operands.
        def collect(command, operands, xs, ys)
          case command
          when "H" then xs.concat(operands)
          when "V" then ys.concat(operands)
          when "A"
            operands.each_slice(7) do |arc|
              next unless arc.length == 7

              xs << arc[5]
              ys << arc[6]
            end
          else
            operands.each_slice(2) do |x, y|
              next if y.nil? # dangling operand

              xs << x
              ys << y
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/glyphs/page_renderer"
4
+
5
module Ucode
  module Glyphs
    # Renderer backed by the `pdf2svg` command-line tool, which writes one
    # SVG file per requested page.
    #
    # Invocation shape: `pdf2svg <in.pdf> <out.svg> <page>`
    class Pdf2svgRenderer < PageRenderer
      # @return [Symbol] registry identifier of this renderer
      def self.renderer_name
        :pdf2svg
      end

      # @return [Symbol] executable looked up on PATH
      def self.binary_name
        :pdf2svg
      end

      # @param pdf_path [Pathname, String] input PDF
      # @param page_num [Integer] page to render
      # @param out_path [Pathname, String] SVG destination
      # @return [Array<String>] argv: binary, input, output, page
      def self.build_command(pdf_path, page_num, out_path)
        argv = ["pdf2svg"]
        argv << pdf_path.to_s
        argv << out_path.to_s
        argv << page_num.to_s
        argv
      end
    end
  end
end
@@ -0,0 +1,102 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+ require "open3"
5
+
6
+ require "ucode/cache"
7
+ require "ucode/fetch/code_charts"
8
+ require "ucode/glyphs/monolith_page_map"
9
+
10
module Ucode
  module Glyphs
    # Resolves a Unicode block to its source PDF on disk.
    #
    # Primary source: the per-block PDF cached at
    # `<cache>/<version>/pdfs/U<XXXX>.pdf` (downloaded from
    # `unicode.org/charts/PDF/` by `Ucode::Fetch::CodeCharts`).
    #
    # Fallback: slice the page range from the monolith `CodeCharts.pdf`
    # with `pdftk`. The page range is resolved by `MonolithPageMap`.
    class PdfFetcher
      # @param version [String] UCD version, used as the cache namespace.
      # @param monolith_path [String, Pathname, nil] path to the full
      #   `CodeCharts.pdf`. Pass nil to disable monolith fallback.
      # @param blocks [Array<Ucode::Models::Block>] required for monolith
      #   fallback — handed to `MonolithPageMap.load`.
      # @param page_map_cache [String, Pathname, nil] where to read/write
      #   the monolith page-map cache (handed to `MonolithPageMap.load`).
      def initialize(version, monolith_path: nil, blocks: [], page_map_cache: nil)
        @version = version
        @monolith_path = monolith_path && Pathname.new(monolith_path)
        @blocks = blocks
        @page_map_cache = page_map_cache
      end

      # Resolve the per-block PDF for `block_first_cp`, fetching from the
      # network if missing. Returns the local PDF path, or nil if the
      # block's PDF is unavailable (fetch failure + no monolith, monolith
      # lacks the requested block, or `pdftk` is missing / fails).
      #
      # NOTE(review): `force` only bypasses the cache-hit shortcut here; it
      # is not forwarded to `Fetch::CodeCharts.call` — confirm that the
      # fetcher overwrites an already-cached file.
      #
      # @param block_first_cp [Integer] first codepoint of the block;
      #   also the PDF's file-name slug (`U<HEX>.pdf`).
      # @param force [Boolean] re-download even if cached.
      # @return [Pathname, nil]
      def fetch(block_first_cp:, force: false)
        path = per_block_path(block_first_cp)
        return path if path.exist? && !force

        download(block_first_cp)
        return path if path.exist?

        slice_from_monolith(block_first_cp)
      end

      private

      # @return [Pathname] cache location of the block's PDF
      def per_block_path(block_first_cp)
        Cache.pdfs_dir(@version).join("U#{hex_slug(block_first_cp)}.pdf")
      end

      # Upper-case hex, zero-padded to at least four digits (0x41 -> "0041").
      def hex_slug(cp)
        cp.to_s(16).upcase.rjust(4, "0")
      end

      # Fetch failures (`Ucode::FetchError`) fall through to the monolith
      # fallback by returning nil. Anything else — programming errors such
      # as NoMethodError — is deliberately left to propagate.
      def download(block_first_cp)
        Fetch::CodeCharts.call(@version, block_first_cps: [block_first_cp])
      rescue Ucode::FetchError
        nil
      end

      # @return [Pathname, nil] sliced PDF, or nil when there is no
      #   monolith, no page-map entry, or an incomplete page range
      def slice_from_monolith(block_first_cp)
        return unless @monolith_path&.exist?

        entry = page_map[block_first_cp]
        return unless entry && entry.start_page && entry.end_page

        slice_pages(entry.start_page, entry.end_page, per_block_path(block_first_cp))
      end

      # Page map is loaded once per fetcher instance.
      def page_map
        @page_map ||= MonolithPageMap.load(
          monolith_path: @monolith_path,
          blocks: @blocks,
          cache_path: @page_map_cache,
        )
      end

      # Runs `pdftk <monolith> cat <start>-<end> output <out>`.
      # @return [Pathname, nil] `out_path` on a zero exit status; nil when
      #   pdftk exits non-zero or cannot be spawned at all (not installed
      #   or not executable), so `fetch` keeps its "nil when unavailable"
      #   contract instead of raising Errno::ENOENT.
      def slice_pages(start_page, end_page, out_path)
        out_path.dirname.mkpath
        cmd = ["pdftk", @monolith_path.to_s, "cat",
               "#{start_page}-#{end_page}", "output", out_path.to_s]
        begin
          _out, status = Open3.capture2e(*cmd)
        rescue Errno::ENOENT, Errno::EACCES
          return nil
        end
        status.success? ? out_path : nil
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/glyphs/page_renderer"
4
+
5
module Ucode
  module Glyphs
    # Renderer backed by Poppler's `pdftocairo` in SVG mode. On macOS the
    # tool comes with `brew install poppler`. Slower than `mutool` but
    # widely available.
    #
    # Invocation shape: `pdftocairo -svg -f <n> -l <n> <in.pdf> <out.svg>`
    #
    # Passing the same page number as both first (`-f`) and last (`-l`)
    # page limits the run to that single page.
    class PdftocairoRenderer < PageRenderer
      # @return [Symbol] registry identifier of this renderer
      def self.renderer_name
        :pdftocairo
      end

      # @return [Symbol] executable looked up on PATH
      def self.binary_name
        :pdftocairo
      end

      # @param pdf_path [Pathname, String] input PDF
      # @param page_num [Integer] page to render
      # @param out_path [Pathname, String] SVG destination
      # @return [Array<String>] argv for the single-page SVG export
      def self.build_command(pdf_path, page_num, out_path)
        first_page = ["-f", page_num.to_s]
        last_page = ["-l", page_num.to_s]
        files = [pdf_path.to_s, out_path.to_s]

        ["pdftocairo", "-svg", *first_page, *last_page, *files]
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "lutaml/model"
4
+
5
module Ucode
  module Glyphs
    module RealFonts
      # One block's row inside a {FontCoverageReport}.
      #
      # * `assigned`    — number of assigned codepoints in the block;
      # * `covered`     — how many of those the font maps;
      # * `missing_cps` — the assigned codepoints the font lacks, each as
      #   a `U+XXXX` string, so a reader of the report can list the gaps
      #   without walking the cmap again.
      class BlockCoverage < Lutaml::Model::Serializable
        attribute :name, :string
        attribute :first_cp, :integer
        attribute :last_cp, :integer
        attribute :assigned, :integer
        attribute :covered, :integer
        attribute :missing_cps, :string, collection: true, default: -> { [] }

        # Every attribute serialises under a key equal to its own name.
        key_value do
          %w[name first_cp last_cp assigned covered missing_cps].each do |key|
            map key, to: key.to_sym
          end
        end

        # @return [Float] covered / assigned rounded to four decimals;
        #   0.0 when `assigned` is nil or zero
        def fill_ratio
          total = assigned
          return 0.0 if total.nil? || total.zero?

          ratio = covered.to_f / total
          ratio.round(4)
        end

        # @return [Boolean] true only for a non-empty block whose assigned
        #   codepoints are all covered
        def complete?
          return false unless assigned.to_i.positive?

          covered == assigned
        end
      end
    end
  end
end
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+ require "time"
5
+
6
+ require "fontisan"
7
+
8
+ require_relative "block_coverage"
9
+ require_relative "font_coverage_report"
10
+ require_relative "unicode_17_blocks"
11
+
12
module Ucode
  module Glyphs
    module RealFonts
      # Produces a {FontCoverageReport} for a font file on disk.
      #
      # How the report is assembled:
      #
      #   1. fontisan's {Fontisan::Commands::AuditCommand} runs in brief
      #      mode and supplies the identity fields and the total
      #      codepoint / glyph counts.
      #   2. The font's cmap is read through fontisan to obtain the set
      #      of codepoints the font maps.
      #   3. Every block in {Unicode17Blocks}::ALL is expanded to its
      #      assigned codepoints and intersected with that set. The
      #      denominator (`assigned`) therefore comes from the curated
      #      ranges table, while the numerator (`covered`) and the
      #      `missing_cps` list come from the cmap.
      #
      # Design note carried over from the original author: the curated
      # table is used instead of fontisan's UCD database because that
      # database is a separate download and was found incomplete for
      # Unicode 17, and brief mode avoids the UCD dependency altogether.
      class CoverageAuditor
        UCD_VERSION = "17.0.0"

        # @param font_path [Pathname, String]
        # @return [FontCoverageReport]
        def audit(font_path)
          path = Pathname(font_path)
          summary = run_fontisan_audit(path)
          mapped = read_cmap_codepoints(path)
          rows = Unicode17Blocks::ALL.map { |block| build_block_coverage(block, mapped) }

          FontCoverageReport.new(**report_kwargs(path, summary, rows))
        end

        private

        # Runs fontisan's audit command with `audit_brief: true` and
        # returns whatever `#run` returns.
        def run_fontisan_audit(font_path)
          command = Fontisan::Commands::AuditCommand.new(
            font_path.to_s,
            ucd_version: UCD_VERSION,
            audit_brief: true,
          )
          command.run
        end

        # @return [Set<Integer>] keys of the cmap's unicode mappings; an
        #   empty set when the font has no cmap table
        def read_cmap_codepoints(font_path)
          cmap = Fontisan::FontLoader
                 .load(font_path.to_s)
                 .table(Fontisan::Constants::CMAP_TAG)

          cmap ? cmap.unicode_mappings.keys.to_set : Set.new
        end

        # Expands the block's assigned ranges and splits them by cmap
        # membership into one {BlockCoverage} row.
        def build_block_coverage(block, cmap_codepoints)
          assigned_cps = block.assigned_ranges.flat_map(&:to_a)
          covered = assigned_cps.select { |cp| cmap_codepoints.include?(cp) }
          row = {
            name: block.name,
            first_cp: block.first_cp,
            last_cp: block.last_cp,
            assigned: assigned_cps.length,
            covered: covered.length,
            missing_cps: missing_cps_for(assigned_cps, covered),
          }

          BlockCoverage.new(**row)
        end

        # @return [Array<String>] assigned-but-uncovered codepoints as
        #   `U+XXXX` (at least four hex digits, upper case)
        def missing_cps_for(assigned_cps, covered)
          uncovered = assigned_cps - covered
          uncovered.map { |cp| format("U+%04X", cp) }
        end

        # Identity portion of the report; the file name comes from the
        # path, everything else from the fontisan report.
        def identity_kwargs(font_path, fontisan_report)
          {
            source_file: font_path.basename.to_s,
            source_format: fontisan_report.source_format,
            family_name: fontisan_report.family_name,
            full_name: fontisan_report.full_name,
            postscript_name: fontisan_report.postscript_name,
            version: fontisan_report.version,
          }
        end

        # Totals and per-block rows.
        def count_kwargs(fontisan_report, blocks)
          {
            total_codepoints: fontisan_report.total_codepoints,
            total_glyphs: fontisan_report.total_glyphs,
            ucd_version: UCD_VERSION,
            blocks: blocks,
          }
        end

        # Identity + counts + a UTC ISO-8601 generation timestamp.
        def report_kwargs(font_path, fontisan_report, blocks)
          kwargs = identity_kwargs(font_path, fontisan_report)
          kwargs = kwargs.merge(count_kwargs(fontisan_report, blocks))
          kwargs.merge(generated_at: Time.now.utc.iso8601)
        end
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "lutaml/model"
4
+
5
+ require_relative "block_coverage"
6
+
7
module Ucode
  module Glyphs
    module RealFonts
      # Serializable coverage report for one font face, as produced by
      # {CoverageAuditor}. It carries three groups of data:
      #
      # * identity — `source_file`, `source_format`, the name fields and
      #   `version`, so a reader knows which font the file describes;
      # * totals — `total_codepoints`, `total_glyphs`, plus the
      #   `ucd_version` and `generated_at` stamp of the run;
      # * detail — one {BlockCoverage} row per audited block. The original
      #   author restricts these to the Unicode 17 new blocks, treating
      #   older blocks as noise for this audit.
      class FontCoverageReport < Lutaml::Model::Serializable
        attribute :generated_at, :string
        attribute :source_file, :string
        attribute :source_format, :string
        attribute :family_name, :string
        attribute :full_name, :string
        attribute :postscript_name, :string
        attribute :version, :string
        attribute :total_codepoints, :integer
        attribute :total_glyphs, :integer
        attribute :ucd_version, :string
        attribute :blocks, BlockCoverage, collection: true, default: -> { [] }

        # Every attribute serialises under a key equal to its own name,
        # in declaration order.
        key_value do
          %w[
            generated_at
            source_file
            source_format
            family_name
            full_name
            postscript_name
            version
            total_codepoints
            total_glyphs
            ucd_version
            blocks
          ].each { |key| map key, to: key.to_sym }
        end
      end
    end
  end
end