ucode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +211 -0
  3. data/Gemfile +22 -0
  4. data/Gemfile.lock +406 -0
  5. data/README.md +469 -0
  6. data/Rakefile +18 -0
  7. data/TODO.new/00-README.md +66 -0
  8. data/TODO.new/01-pillar-terminology-alignment.md +69 -0
  9. data/TODO.new/02-audit-schema-design.md +255 -0
  10. data/TODO.new/03-directory-output-spec.md +203 -0
  11. data/TODO.new/04-fontist-org-contract.md +173 -0
  12. data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
  13. data/TODO.new/06-audit-namespace-skeleton.md +105 -0
  14. data/TODO.new/07-audit-models-port.md +132 -0
  15. data/TODO.new/08-extractors-cheap-port.md +113 -0
  16. data/TODO.new/09-extractors-expensive-port.md +99 -0
  17. data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
  18. data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
  19. data/TODO.new/12-formatters-port.md +115 -0
  20. data/TODO.new/13-directory-emitter.md +147 -0
  21. data/TODO.new/14-html-face-browser.md +144 -0
  22. data/TODO.new/15-html-library-browser.md +102 -0
  23. data/TODO.new/16-cli-audit-subcommands.md +142 -0
  24. data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
  25. data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
  26. data/TODO.new/19-fontisan-docs-update.md +155 -0
  27. data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
  28. data/TODO.new/21-canonical-unicode17-build.md +148 -0
  29. data/TODO.new/22-implementation-order.md +176 -0
  30. data/UCODE_CHANGELOG.md +97 -0
  31. data/exe/ucode +8 -0
  32. data/lib/ucode/aggregator.rb +77 -0
  33. data/lib/ucode/audit/block_aggregator.rb +90 -0
  34. data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
  35. data/lib/ucode/audit/context.rb +137 -0
  36. data/lib/ucode/audit/discrepancy_detector.rb +213 -0
  37. data/lib/ucode/audit/extractors/aggregations.rb +70 -0
  38. data/lib/ucode/audit/extractors/base.rb +21 -0
  39. data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
  40. data/lib/ucode/audit/extractors/coverage.rb +55 -0
  41. data/lib/ucode/audit/extractors/hinting.rb +199 -0
  42. data/lib/ucode/audit/extractors/identity.rb +65 -0
  43. data/lib/ucode/audit/extractors/licensing.rb +75 -0
  44. data/lib/ucode/audit/extractors/metrics.rb +108 -0
  45. data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
  46. data/lib/ucode/audit/extractors/provenance.rb +34 -0
  47. data/lib/ucode/audit/extractors/style.rb +88 -0
  48. data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
  49. data/lib/ucode/audit/extractors.rb +31 -0
  50. data/lib/ucode/audit/plane_aggregator.rb +37 -0
  51. data/lib/ucode/audit/registry.rb +63 -0
  52. data/lib/ucode/audit/script_aggregator.rb +92 -0
  53. data/lib/ucode/audit.rb +27 -0
  54. data/lib/ucode/cache.rb +113 -0
  55. data/lib/ucode/cli.rb +272 -0
  56. data/lib/ucode/commands/build.rb +68 -0
  57. data/lib/ucode/commands/cache.rb +46 -0
  58. data/lib/ucode/commands/fetch.rb +62 -0
  59. data/lib/ucode/commands/font_coverage.rb +57 -0
  60. data/lib/ucode/commands/glyphs.rb +136 -0
  61. data/lib/ucode/commands/lookup.rb +65 -0
  62. data/lib/ucode/commands/parse.rb +62 -0
  63. data/lib/ucode/commands/site.rb +33 -0
  64. data/lib/ucode/commands.rb +19 -0
  65. data/lib/ucode/config.rb +110 -0
  66. data/lib/ucode/coordinator/indices.rb +34 -0
  67. data/lib/ucode/coordinator.rb +397 -0
  68. data/lib/ucode/database.rb +214 -0
  69. data/lib/ucode/db_builder.rb +107 -0
  70. data/lib/ucode/error.rb +96 -0
  71. data/lib/ucode/fetch/code_charts.rb +57 -0
  72. data/lib/ucode/fetch/http.rb +83 -0
  73. data/lib/ucode/fetch/ucd_zip.rb +57 -0
  74. data/lib/ucode/fetch/unihan_zip.rb +57 -0
  75. data/lib/ucode/fetch.rb +14 -0
  76. data/lib/ucode/glyphs/cell_extractor.rb +130 -0
  77. data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
  78. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
  79. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
  80. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
  81. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
  82. data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
  83. data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
  84. data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
  85. data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
  86. data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
  87. data/lib/ucode/glyphs/grid.rb +30 -0
  88. data/lib/ucode/glyphs/grid_detector.rb +165 -0
  89. data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
  90. data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
  91. data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
  92. data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
  93. data/lib/ucode/glyphs/last_resort/source.rb +125 -0
  94. data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
  95. data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
  96. data/lib/ucode/glyphs/last_resort.rb +36 -0
  97. data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
  98. data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
  99. data/lib/ucode/glyphs/page_renderer.rb +221 -0
  100. data/lib/ucode/glyphs/path_bbox.rb +62 -0
  101. data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
  102. data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
  103. data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
  104. data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
  105. data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
  106. data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
  107. data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
  108. data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
  109. data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
  110. data/lib/ucode/glyphs/real_fonts.rb +32 -0
  111. data/lib/ucode/glyphs/writer.rb +250 -0
  112. data/lib/ucode/glyphs.rb +27 -0
  113. data/lib/ucode/index.rb +106 -0
  114. data/lib/ucode/index_builder.rb +94 -0
  115. data/lib/ucode/models/audit/audit_axis.rb +30 -0
  116. data/lib/ucode/models/audit/audit_diff.rb +77 -0
  117. data/lib/ucode/models/audit/audit_report.rb +137 -0
  118. data/lib/ucode/models/audit/baseline.rb +32 -0
  119. data/lib/ucode/models/audit/block_summary.rb +72 -0
  120. data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
  121. data/lib/ucode/models/audit/codepoint_range.rb +39 -0
  122. data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
  123. data/lib/ucode/models/audit/color_capabilities.rb +91 -0
  124. data/lib/ucode/models/audit/discrepancy.rb +38 -0
  125. data/lib/ucode/models/audit/duplicate_group.rb +23 -0
  126. data/lib/ucode/models/audit/embedding_type.rb +81 -0
  127. data/lib/ucode/models/audit/field_change.rb +28 -0
  128. data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
  129. data/lib/ucode/models/audit/gasp_range.rb +63 -0
  130. data/lib/ucode/models/audit/hinting.rb +99 -0
  131. data/lib/ucode/models/audit/library_summary.rb +40 -0
  132. data/lib/ucode/models/audit/licensing.rb +48 -0
  133. data/lib/ucode/models/audit/metrics.rb +111 -0
  134. data/lib/ucode/models/audit/named_instance.rb +41 -0
  135. data/lib/ucode/models/audit/opentype_layout.rb +38 -0
  136. data/lib/ucode/models/audit/plane_summary.rb +31 -0
  137. data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
  138. data/lib/ucode/models/audit/script_features.rb +28 -0
  139. data/lib/ucode/models/audit/script_summary.rb +54 -0
  140. data/lib/ucode/models/audit/variation_detail.rb +42 -0
  141. data/lib/ucode/models/audit.rb +50 -0
  142. data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
  143. data/lib/ucode/models/bidi_mirroring.rb +19 -0
  144. data/lib/ucode/models/binary_property_assignment.rb +26 -0
  145. data/lib/ucode/models/block.rb +36 -0
  146. data/lib/ucode/models/case_folding_rule.rb +23 -0
  147. data/lib/ucode/models/cjk_radical.rb +23 -0
  148. data/lib/ucode/models/codepoint/bidi.rb +28 -0
  149. data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
  150. data/lib/ucode/models/codepoint/case_folding.rb +25 -0
  151. data/lib/ucode/models/codepoint/casing.rb +32 -0
  152. data/lib/ucode/models/codepoint/decomposition.rb +27 -0
  153. data/lib/ucode/models/codepoint/display.rb +24 -0
  154. data/lib/ucode/models/codepoint/emoji.rb +29 -0
  155. data/lib/ucode/models/codepoint/hangul.rb +20 -0
  156. data/lib/ucode/models/codepoint/identifier.rb +30 -0
  157. data/lib/ucode/models/codepoint/indic.rb +20 -0
  158. data/lib/ucode/models/codepoint/joining.rb +20 -0
  159. data/lib/ucode/models/codepoint/normalization.rb +35 -0
  160. data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
  161. data/lib/ucode/models/codepoint.rb +122 -0
  162. data/lib/ucode/models/name_alias.rb +21 -0
  163. data/lib/ucode/models/named_sequence.rb +19 -0
  164. data/lib/ucode/models/names_list_entry.rb +38 -0
  165. data/lib/ucode/models/plane.rb +36 -0
  166. data/lib/ucode/models/property_alias.rb +24 -0
  167. data/lib/ucode/models/property_value_alias.rb +26 -0
  168. data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
  169. data/lib/ucode/models/relationship/cross_reference.rb +17 -0
  170. data/lib/ucode/models/relationship/footnote.rb +24 -0
  171. data/lib/ucode/models/relationship/informal_alias.rb +18 -0
  172. data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
  173. data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
  174. data/lib/ucode/models/relationship.rb +57 -0
  175. data/lib/ucode/models/script.rb +41 -0
  176. data/lib/ucode/models/special_casing_rule.rb +28 -0
  177. data/lib/ucode/models/standardized_variant.rb +24 -0
  178. data/lib/ucode/models/unihan_entry.rb +23 -0
  179. data/lib/ucode/models.rb +47 -0
  180. data/lib/ucode/parsers/auxiliary.rb +26 -0
  181. data/lib/ucode/parsers/base.rb +137 -0
  182. data/lib/ucode/parsers/bidi_brackets.rb +41 -0
  183. data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
  184. data/lib/ucode/parsers/blocks.rb +63 -0
  185. data/lib/ucode/parsers/case_folding.rb +53 -0
  186. data/lib/ucode/parsers/cjk_radicals.rb +102 -0
  187. data/lib/ucode/parsers/derived_age.rb +59 -0
  188. data/lib/ucode/parsers/derived_core_properties.rb +60 -0
  189. data/lib/ucode/parsers/extracted_properties.rb +74 -0
  190. data/lib/ucode/parsers/name_aliases.rb +44 -0
  191. data/lib/ucode/parsers/named_sequences.rb +51 -0
  192. data/lib/ucode/parsers/names_list.rb +250 -0
  193. data/lib/ucode/parsers/property_aliases.rb +41 -0
  194. data/lib/ucode/parsers/property_value_aliases.rb +46 -0
  195. data/lib/ucode/parsers/script_extensions.rb +64 -0
  196. data/lib/ucode/parsers/scripts.rb +60 -0
  197. data/lib/ucode/parsers/special_casing.rb +62 -0
  198. data/lib/ucode/parsers/standardized_variants.rb +56 -0
  199. data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
  200. data/lib/ucode/parsers/unicode_data.rb +268 -0
  201. data/lib/ucode/parsers/unihan.rb +125 -0
  202. data/lib/ucode/parsers.rb +35 -0
  203. data/lib/ucode/range_entry.rb +58 -0
  204. data/lib/ucode/repo/aggregate_writer.rb +364 -0
  205. data/lib/ucode/repo/atomic_writes.rb +48 -0
  206. data/lib/ucode/repo/codepoint_writer.rb +96 -0
  207. data/lib/ucode/repo/paths.rb +122 -0
  208. data/lib/ucode/repo.rb +22 -0
  209. data/lib/ucode/site/config_emitter.rb +124 -0
  210. data/lib/ucode/site/generator.rb +178 -0
  211. data/lib/ucode/site/search_index.rb +68 -0
  212. data/lib/ucode/site/template/.gitignore +4 -0
  213. data/lib/ucode/site/template/.vitepress/config.ts +8 -0
  214. data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
  215. data/lib/ucode/site/template/char/[codepoint].md +13 -0
  216. data/lib/ucode/site/template/components/BlockView.vue +57 -0
  217. data/lib/ucode/site/template/components/CharView.vue +85 -0
  218. data/lib/ucode/site/template/components/PlaneView.vue +56 -0
  219. data/lib/ucode/site/template/components/SearchView.vue +66 -0
  220. data/lib/ucode/site/template/index.md +25 -0
  221. data/lib/ucode/site/template/package.json +18 -0
  222. data/lib/ucode/site/template/search.md +9 -0
  223. data/lib/ucode/site.rb +13 -0
  224. data/lib/ucode/version.rb +5 -0
  225. data/lib/ucode/version_resolver.rb +76 -0
  226. data/lib/ucode.rb +74 -0
  227. data/ucode.gemspec +56 -0
  228. metadata +404 -0
@@ -0,0 +1,95 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+
5
+ require "fontist"
6
+
7
+ module Ucode
8
+ module Glyphs
9
+ module RealFonts
10
# Resolves a user-provided font specifier to a concrete file
# path on disk. Resolution order:
#
#   1. Direct file path — returned if it exists. Useful for
#      local checkouts (e.g. a developer's clone of Lentariso).
#   2. `Fontist::Font.find(name)` — returns the already-installed
#      font path if fontist has it on disk.
#   3. `Fontist::Font.install(name)` — downloads + installs the
#      font via the fontist formula index.
#
# Fontist is the canonical discovery layer for the fontist
# ecosystem. We never reach into other package managers or
# hardcode URLs here — formulas live in fontist/formulas.
class FontLocator
  LocateResult = Struct.new(:name, :path, :via, keyword_init: true)

  # @param spec [String] either a file path or a fontist formula
  #   name (case-insensitive). A `name=path` form is also
  #   accepted so a CLI can name the font whatever the user
  #   wants without depending on the formula's family name.
  #   A spec that is itself an existing path is always treated
  #   as a path, even when it contains "=".
  # @param install [Boolean] if true and the font is not on
  #   disk, attempt `Fontist::Font.install`. Default: true.
  # @return [LocateResult] `via` is one of :direct, :fontist_find,
  #   :fontist_install.
  # @raise [Errno::ENOENT] if path does not exist and fontist
  #   cannot resolve the name.
  def locate(spec, install: true)
    name, path = split_spec(spec)
    return result(name, path, :direct) if path && File.exist?(path)

    via_fontist = find_via_fontist(name, install: install)
    return via_fontist if via_fontist

    raise Errno::ENOENT, "Font not found: #{spec}"
  end

  private

  # Splits `name=path` into its two halves. A spec without "=" (or one
  # that names an existing file verbatim) is used as both name and path.
  def split_spec(spec)
    spec = spec.to_s
    return [spec, spec] if !spec.include?("=") || File.exist?(spec)

    name, path = spec.split("=", 2)
    [name.strip, path.strip]
  end

  # Tries an on-disk fontist lookup first, then (optionally) an install.
  # Returns nil when neither yields a path.
  def find_via_fontist(name, install:)
    found = first_path(safe_fontist_lookup { Fontist::Font.find(name) })
    return result(name, found, :fontist_find) if found
    return nil unless install

    installed = first_path(install_via_fontist(name))
    installed && result(name, installed, :fontist_install)
  end

  # Fontist calls may hand back a single path, a list of paths, or nil.
  # Normalise to the first usable path so `Pathname()` never receives
  # an Array or nil.
  def first_path(paths)
    Array(paths).compact.first
  end

  def install_via_fontist(name)
    Fontist::Font.install(
      name,
      confirmation: "yes",
      hide_licenses: true,
    )
  rescue Fontist::Errors::UnsupportedFontError,
         Fontist::Errors::FontNotFoundError
    nil
  end

  # `Fontist::Font.find` raises `UnsupportedFontError` when the
  # name isn't in the formula index — that's a "not found"
  # outcome for our purposes, not an exceptional control-flow
  # event. Translate to nil so the caller can fall through to
  # the install-or-fail branch.
  def safe_fontist_lookup
    yield
  rescue Fontist::Errors::UnsupportedFontError, Fontist::Errors::FontNotFoundError
    nil
  end

  def result(name, path, via)
    LocateResult.new(name: name, path: Pathname(path), via: via)
  end
end
93
+ end
94
+ end
95
+ end
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ module Glyphs
5
+ module RealFonts
6
# One Unicode block the audit cares about, together with the explicit
# codepoint ranges treated as "assigned" inside it.
#
# Where the ranges come from (highest priority first):
#   1. Unicode 17.0 `Blocks.txt` — block name + first/last cp.
#   2. The per-block code-chart legend on unicode.org — published
#      assigned-codepoint count.
#   3. Direct inspection of a known-good font (fontisan audit) —
#      confirms at least the assigned count when a font has 100%
#      coverage.
#
# When only a count is published, the assigned range is approximated
# as "first codepoint of the block, extended by the count". Reserved
# slots in the middle of a block may therefore be counted as assigned,
# and `missing_cps` over-reports by those slots. Exact ranges are a
# follow-up once UCD 17.0 text files are integrated into the dataset.
#
# Block names are the verbatim UCD block name (`Blocks.txt` field 2),
# never slugified.
Block = Struct.new(:name, :first_cp, :last_cp, :assigned_ranges,
                   keyword_init: true) do
  # @param codepoint [Integer]
  # @return [Boolean] whether `codepoint` lies in first_cp..last_cp
  def covers?(codepoint)
    codepoint.between?(first_cp, last_cp)
  end
end

module Unicode17Blocks
  # NOTE(review): several rows below do not look like Unicode 17.0
  # `Blocks.txt` rows and should be confirmed before the audit output
  # is trusted — e.g. block ranges normally end on an xxxF boundary but
  # "Tai Yo" ends at U+1E6F3; "Supplemental Arrows-C" is placed at
  # U+1CF00; and U+1FA70..U+1FAFF, U+1D200..U+1D24F and
  # U+31350..U+323AF appear to be the ranges of other, older blocks.
  # The data is intentionally left untouched here.
  #
  # Row layout: [name, first_cp, last_cp, assigned_ranges].
  ALL = [
    # 26 assigned (verified via Lentariso: covers U+10940..U+10959 exactly).
    ["Sidetic", 0x10940, 0x1095F, [0x10940..0x10959]],
    # 8 assigned.
    ["Sharada Supplement", 0x11B60, 0x11B7F, [0x11B60..0x11B67]],
    # 54 assigned (letters + digits; ranges approximate).
    ["Tolong Siki", 0x11DB0, 0x11DEF, [0x11DB0..0x11DE5]],
    # 50 assigned across two runs (U+16EB9-U+16EBA reserved — verified
    # via Kedebideri).
    ["Beria Erfe", 0x16EA0, 0x16EDF, [0x16EA0..0x16EB8, 0x16EBB..0x16ED3]],
    # Full block range; published as 52 codepoints in the UCD 17.0
    # block list.
    ["Tai Yo", 0x1E6C0, 0x1E6F3, [0x1E6C0..0x1E6F3]],
    # 9 assigned (approximate; U+1CC00..U+1CC08).
    ["Symbols for Legacy Computing Supplement", 0x1CC00, 0x1CCFF, [0x1CC00..0x1CC08]],
    # 9 assigned (U+1CF00..U+1CF08).
    ["Supplemental Arrows-C", 0x1CF00, 0x1CFCF, [0x1CF00..0x1CF08]],
    # 4 new in Unicode 17.
    ["Alchemical Symbols", 0x1F740, 0x1F77F, [0x1F740..0x1F743]],
    # Published as 34 assigned in Unicode 17; ranges approximate.
    ["Miscellaneous Symbols Supplement", 0x1FA70, 0x1FAFF, [0x1FA70..0x1FA91]],
    # Znamenny Notation additions in Unicode 17. Range approximate.
    ["Musical Symbols Supplement", 0x1D200, 0x1D24F, [0x1D200..0x1D245]],
    # 4,293 assigned per UCD 17.0. Audit uses the published block
    # range; the assigned set may extend slightly past U+323AF in some
    # distributions.
    ["CJK Unified Ideographs Extension J", 0x31350, 0x323AF, [0x31350..0x323AF]],
  ].map do |name, first_cp, last_cp, assigned_ranges|
    Block.new(name: name, first_cp: first_cp, last_cp: last_cp,
              assigned_ranges: assigned_ranges)
  end.freeze

  # Iterates over every known block, in table order.
  def self.each(&block)
    ALL.each(&block)
  end

  # @param codepoint [Integer]
  # @return [Block, nil] the first block whose range contains `codepoint`
  def self.for_codepoint(codepoint)
    ALL.find do |candidate|
      codepoint >= candidate.first_cp && codepoint <= candidate.last_cp
    end
  end
end
102
+ end
103
+ end
104
+ end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
require "fileutils"
require "json"
require "pathname"

require_relative "font_coverage_report"
7
+
8
+ module Ucode
9
+ module Glyphs
10
+ module RealFonts
11
# Persists a {FontCoverageReport} as a JSON file under
# `<output_root>/font_coverage/`. One file per audited face; the
# filename is derived from the report's `source_file` so the
# source and the report are trivially correlated.
class Writer
  DEFAULT_OUTPUT_DIR = "font_coverage"

  # Basename used when the report carries no usable label.
  FALLBACK_LABEL = "font"
  private_constant :FALLBACK_LABEL

  # @param output_root [Pathname, String] parent directory; the
  #   `font_coverage/` subdirectory is created inside it.
  def initialize(output_root)
    @output_root = Pathname(output_root)
  end

  # Serialises `report.to_hash` as pretty-printed JSON (with a trailing
  # newline), creating the target directory if needed.
  #
  # @param report [FontCoverageReport]
  # @return [Pathname] path of the written file, rooted at
  #   `output_root` (absolute only when `output_root` is)
  def write(report)
    path = target_path(report)
    path.dirname.mkpath
    path.write("#{JSON.pretty_generate(report.to_hash)}\n")
    path
  end

  private

  def target_path(report)
    base = safe_basename(source_label(report))
    @output_root.join(DEFAULT_OUTPUT_DIR, "#{base}.json")
  end

  # First non-blank of source_file / postscript_name. A blank
  # source_file must not shadow a usable postscript_name, otherwise
  # the report would land in a file literally named ".json".
  def source_label(report)
    label = [report.source_file, report.postscript_name]
            .find { |candidate| !candidate.to_s.strip.empty? }
    label || FALLBACK_LABEL
  end

  # Drops directory and extension, then replaces every character
  # outside [A-Za-z0-9._-] with "_" so the label is a safe filename.
  def safe_basename(name)
    base = File.basename(name, ".*").gsub(/[^A-Za-z0-9._-]/, "_")
    base.empty? ? FALLBACK_LABEL : base
  end
end
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ module Glyphs
5
# Tier-1 glyph sourcing — real font cmaps.
#
# If a real OpenType/TrueType font covers a Unicode 17 block, its
# cmap is walked and glyph outlines are lifted straight from the
# font's `glyf`/`CFF ` table. That yields higher-fidelity SVGs than
# vector-extracting from the Code Charts PDF, which composites
# chart-grid chrome into the same glyph. Tier 1 is therefore the
# preferred source; the Code Charts PDF pillars (1: ToUnicode,
# 2: positional correlation, 3: Last Resort) are fallbacks for
# codepoints that no real font covers.
#
# Division of labour: font discovery is done by **fontist**
# (`Fontist::Font.find` / `install`); font parsing, audit and outline
# extraction are done by **fontisan**
# (`Fontisan::Commands::AuditCommand`, `Fontisan::OutlineExtractor`).
# Both gems live in the fontist org and fontist already depends on
# fontisan. No other Ruby font-parsing library is permitted.
module RealFonts
  # Constant => feature path; each constant is loaded lazily on first use.
  {
    Unicode17Blocks: "ucode/glyphs/real_fonts/unicode_17_blocks",
    BlockCoverage: "ucode/glyphs/real_fonts/block_coverage",
    FontCoverageReport: "ucode/glyphs/real_fonts/font_coverage_report",
    FontLocator: "ucode/glyphs/real_fonts/font_locator",
    CoverageAuditor: "ucode/glyphs/real_fonts/coverage_auditor",
    Writer: "ucode/glyphs/real_fonts/writer",
  }.each do |const_name, feature|
    autoload const_name, feature
  end
end
31
+ end
32
+ end
@@ -0,0 +1,250 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+ require "thread"
5
+ require "tmpdir"
6
+ require "nokogiri"
7
+
8
+ require "ucode/error"
9
+ require "ucode/glyphs/page_renderer"
10
+ require "ucode/glyphs/grid_detector"
11
+ require "ucode/glyphs/cell_extractor"
12
+ require "ucode/repo/atomic_writes"
13
+ require "ucode/repo/paths"
14
+
15
+ module Ucode
16
+ module Glyphs
17
# Writes `glyph.svg` for every codepoint in a block by orchestrating
# the per-block pipeline: render PDF page → detect grid → extract
# each cell → write atomic file.
#
# The Writer is **page-driven**: the caller hands it a `page_map`
# (`{ page_num => first_cp_on_that_page }`) so the writer knows what
# codepoint each detected cell anchor corresponds to. This is the
# one piece of state the Writer can't derive on its own — pdftocairo
# converts the row's codepoint labels to outlined glyphs, so they
# aren't readable as text.
#
# **Idempotent**: re-runs are no-ops via `Repo::AtomicWrites` (byte
# comparison; same content is skipped). Safe to re-run on the whole
# output tree.
#
# **Atomic**: writes go through `<path>.tmp` + rename. A crash mid-
# write leaves either the old file or no file, never a truncated one.
#
# **Placeholder for assigned codepoints with no glyph**: when a
# codepoint is listed in `block.codepoint_ids` but no glyph.svg exists
# for it after all pages were processed, a small placeholder SVG is
# written so the site can render a "no official glyph" badge. Counted
# in the tally as `placeholder`.
#
# **Pure-ish**: takes a renderer instance (defaults to the first
# available system renderer); it is injectable for tests. The only
# I/O is the renderer, the writer's output_root, and a scratch
# temp directory per rendered page.
class Writer
  include Repo::AtomicWrites

  PLACEHOLDER_VIEW_BOX_SIZE = 100
  private_constant :PLACEHOLDER_VIEW_BOX_SIZE

  # @param output_root [String, Pathname]
  # @param renderer [Ucode::Glyphs::PageRenderer] concrete renderer class
  # @param parallel_workers [Integer] worker pool size for #write_all
  def initialize(output_root:, renderer: PageRenderer.default, parallel_workers: 4)
    @output_root = Pathname.new(output_root)
    @renderer = renderer
    @parallel_workers = parallel_workers
  end

  # Process every page in `page_map`, writing glyph.svg for each
  # codepoint that (a) falls inside the block's range and (b) has a
  # detectable glyph on the page. Afterwards, placeholders are written
  # for assigned codepoints that still have no glyph.svg.
  #
  # @param block [Ucode::Models::Block]
  # @param pdf_path [String, Pathname, nil]
  # @param page_map [Hash{Integer => Integer}, nil] page_num => first cp
  #   on that page; nil is treated as "no pages"
  # @param strict [Boolean] raise GlyphError when the PDF is missing.
  #   When false, a missing PDF yields a tally with `no_grid` set and
  #   placeholders for assigned cps. Pages on which no grid is detected
  #   never raise; they are only counted in `no_grid`.
  #   NOTE(review): the previous doc also promised a raise when no grid
  #   is detected on any page; the code does not do that — confirm
  #   which contract is intended.
  # @return [Hash] tally { written: N, skipped: N, empty: N,
  #   placeholder: N, no_grid: N }
  # @raise [Ucode::GlyphError] if `strict` and the PDF does not exist
  def write_block(block:, pdf_path:, page_map:, strict: false)
    unless pdf_path && Pathname.new(pdf_path).exist?
      raise_missing_pdf!(block, pdf_path) if strict
      return placeholder_pass(block, no_grid_tally)
    end

    tally = zero_tally
    (page_map || {}).each do |page_num, first_cp|
      merge_tally!(tally, write_page(block: block, pdf_path: pdf_path,
                                     page_num: page_num, first_cp: first_cp))
    end
    placeholder_pass(block, tally)
  end

  # Render one page, detect its grid, write every cell whose codepoint
  # falls inside `block`'s range.
  #
  # @param block [Ucode::Models::Block]
  # @param pdf_path [String, Pathname]
  # @param page_num [Integer] 1-based PDF page number
  # @param first_cp [Integer] codepoint of the grid's top-left cell
  # @return [Hash] tally; `no_grid: 1` when the page could not be
  #   rendered or no grid was detected
  def write_page(block:, pdf_path:, page_num:, first_cp:)
    svg_doc = render_page(pdf_path, page_num)
    return no_grid_tally unless svg_doc

    grid = GridDetector.detect(svg_doc, block_first_cp: first_cp)
    return no_grid_tally unless grid

    counts = zero_tally
    extractor = CellExtractor.new(svg_doc)
    grid.rows.times do |row|
      grid.columns.times do |col|
        cp = grid.codepoint_at(row, col)
        next unless cp && block.covers?(cp)

        cell_svg = extractor.extract(grid, cp)
        if cell_svg.nil?
          counts[:empty] += 1
          next
        end

        # write_glyph is falsy when the file already had this content.
        written = write_glyph(block, cp, cell_svg)
        counts[written ? :written : :skipped] += 1
      end
    end
    counts
  end

  # Drain a list of block-spec hashes through the worker pool.
  # Each spec has the same shape as #write_block's kwargs:
  #
  #   { block:, pdf_path:, page_map: }
  #
  # With `parallel_workers <= 1` the specs are processed inline on the
  # calling thread.
  #
  # @param specs [Array<Hash>]
  # @return [Hash] aggregated tally across all blocks
  def write_all(specs)
    return drain_inline(specs) if @parallel_workers <= 1

    drain_threaded(specs)
  end

  private

  def zero_tally
    { written: 0, skipped: 0, empty: 0, placeholder: 0, no_grid: 0 }
  end

  def no_grid_tally
    zero_tally.tap { |h| h[:no_grid] = 1 }
  end

  def merge_tally!(acc, other)
    other.each { |k, v| acc[k] = (acc[k] || 0) + v }
  end

  def drain_inline(specs)
    specs.each_with_object(zero_tally) do |spec, tally|
      merge_tally!(tally, write_block(**spec))
    end
  end

  # Fan the specs out over a fixed pool of threads. The queue is filled
  # and closed up front, so `pop` returns nil once it is drained and
  # each worker exits on its own.
  def drain_threaded(specs)
    queue = Queue.new
    specs.each { |spec| queue << spec }
    queue.close

    mutex = Mutex.new
    tally = zero_tally
    # No point in starting more threads than there are specs.
    worker_count = [@parallel_workers, specs.size].min

    workers = Array.new(worker_count) do
      Thread.new do
        while (spec = queue.pop)
          result = write_block(**spec)
          mutex.synchronize { merge_tally!(tally, result) }
        end
      end
    end

    join_all(workers)
    tally
  end

  # Wait for every worker before surfacing a failure, so no thread is
  # still writing into the output tree when the error reaches the
  # caller. The first failure (in worker order) is re-raised.
  def join_all(workers)
    failures = workers.filter_map do |worker|
      worker.join
      nil
    rescue StandardError => e
      e
    end
    raise failures.first unless failures.empty?
  end

  # Renders one PDF page into a scratch directory and parses the SVG.
  # @return [Nokogiri::XML::Document, nil] nil when rendering failed
  def render_page(pdf_path, page_num)
    Dir.mktmpdir do |dir|
      out = File.join(dir, "p#{page_num}.svg")
      next nil unless rendered?(pdf_path, page_num, out)

      Nokogiri::XML(File.read(out))
    end
  end

  # Graceful degradation: a broken renderer (e.g. mutool on a host
  # without LCMS) counts as "not rendered" → no_grid → placeholders
  # downstream.
  def rendered?(pdf_path, page_num, out)
    @renderer.render(Pathname.new(pdf_path), page_num, out) == :ok && File.exist?(out)
  rescue Ucode::PdfRenderError
    false
  end

  def write_glyph(block, codepoint, cell_svg)
    cp_id = Repo::Paths.cp_id(codepoint)
    path = Repo::Paths.codepoint_glyph_path(@output_root, block.id, cp_id)
    write_atomic(path, serialize_svg(cell_svg))
  end

  # For every assigned codepoint in the block that doesn't already
  # have a glyph.svg on disk, write a placeholder.
  def placeholder_pass(block, tally)
    return tally if block.codepoint_ids.nil? || block.codepoint_ids.empty?

    block.codepoint_ids.each do |cp_id|
      cp = cp_id_to_int(cp_id)
      next unless cp
      next unless block.covers?(cp)

      path = Repo::Paths.codepoint_glyph_path(@output_root, block.id, cp_id)
      next if path.exist?

      if write_atomic(path, placeholder_svg_payload)
        tally[:placeholder] = (tally[:placeholder] || 0) + 1
      end
    end
    tally
  end

  # Parses a "U+XXXX" id. Returns nil for anything that is not a
  # "U+"-prefixed hex string, so a malformed id is skipped instead of
  # silently becoming codepoint 0.
  def cp_id_to_int(cp_id)
    return nil unless cp_id.is_a?(String) && cp_id.start_with?("U+")

    Integer(cp_id[2..], 16, exception: false)
  end

  def placeholder_svg_payload
    size = PLACEHOLDER_VIEW_BOX_SIZE
    # A simple dashed square + text marker so the site can render
    # an obvious "no official glyph" badge without needing extra state.
    <<~SVG
      <?xml version="1.0" encoding="UTF-8"?>
      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 #{size} #{size}" width="#{size}" height="#{size}">
        <rect x="1" y="1" width="#{size - 2}" height="#{size - 2}" fill="none" stroke="#999" stroke-width="1" stroke-dasharray="4 4"/>
        <text x="#{size / 2}" y="#{size / 2}" font-family="sans-serif" font-size="14" text-anchor="middle" dominant-baseline="middle" fill="#999">no glyph</text>
      </svg>
    SVG
  end

  def serialize_svg(doc)
    doc.to_xml.strip
  end

  def raise_missing_pdf!(block, pdf_path)
    raise Ucode::GlyphError.new(
      "no PDF available for block '#{block.id}'",
      context: { block_id: block.id, pdf_path: pdf_path&.to_s },
    )
  end
end
249
+ end
250
+ end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
# Glyphs — turns Code Charts PDF pages into per-codepoint SVGs.
#
# Pipeline: fetch per-block PDF → render to SVG → detect grid → extract
# cell → normalize viewBox → write glyph.svg.
#
# Vector extraction only. NEVER run OCR.
module Glyphs
  # Constant => feature path; each constant is loaded lazily on first use.
  {
    PdfFetcher: "ucode/glyphs/pdf_fetcher",
    PageRenderer: "ucode/glyphs/page_renderer",
    MutoolRenderer: "ucode/glyphs/mutool_renderer",
    Pdf2svgRenderer: "ucode/glyphs/pdf2svg_renderer",
    DvisvgmRenderer: "ucode/glyphs/dvisvgm_renderer",
    PdftocairoRenderer: "ucode/glyphs/pdftocairo_renderer",
    Grid: "ucode/glyphs/grid",
    PathBbox: "ucode/glyphs/path_bbox",
    GridDetector: "ucode/glyphs/grid_detector",
    CellExtractor: "ucode/glyphs/cell_extractor",
    MonolithPageMap: "ucode/glyphs/monolith_page_map",
    Writer: "ucode/glyphs/writer",
    LastResort: "ucode/glyphs/last_resort",
    EmbeddedFonts: "ucode/glyphs/embedded_fonts",
    RealFonts: "ucode/glyphs/real_fonts",
  }.each do |const_name, feature|
    autoload const_name, feature
  end
end
27
+ end
@@ -0,0 +1,106 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+ require "yaml"
5
+
6
+ module Ucode
7
# Sorted, run-length-encoded lookup table over Unicode codepoints.
#
# One Index answers "what <thing> does codepoint N belong to?" for one
# property (block, or script). Lookup is O(log N) via binary search
# over the sorted entries.
#
# Two ways to construct:
#   - `Index.from_triples([[first, last, name], ...])`
#   - `Index.load(path)` from a YAML file previously written by `#save`.
#
# The YAML form is the dependency-free alternative to SQLite — same
# query API, simpler ops. Pick whichever fits the deployment.
class Index
  include Enumerable

  attr_reader :entries

  # @param entries [Array<RangeEntry>] any order; stored sorted using
  #   the entries' own `<=>`. The binary searches below assume the
  #   sorted ranges do not overlap.
  def initialize(entries)
    @entries = entries.sort
  end

  # Yields each entry in sorted order.
  def each(&block)
    @entries.each(&block)
  end

  # @return [Integer] number of ranges in the index
  def size
    @entries.size
  end

  # @param codepoint [Integer]
  # @return [String, nil] the name of the range covering `codepoint`, or nil
  def lookup(codepoint)
    idx = bsearch_index(codepoint)
    idx && @entries[idx].name
  end

  # Enumerate every range whose [first_cp, last_cp] overlaps the
  # inclusive query range. Returns an Enumerator when called without
  # a block.
  # @param first [Integer]
  # @param last [Integer]
  # @return [Enumerator<RangeEntry>, nil] nil when a block was given
  def each_overlapping(first, last)
    return enum_for(:each_overlapping, first, last) unless block_given?

    start_idx = bsearch_first_overlap(first)
    return if start_idx.nil?

    # Walk by index from the first candidate instead of copying the
    # tail of the array.
    start_idx.upto(@entries.size - 1) do |idx|
      entry = @entries[idx]
      break if entry.first_cp > last

      yield entry if entry.last_cp >= first
    end
    nil
  end

  # Serialize to a YAML file.
  # @param path [String, Pathname]
  # @return [void]
  def save(path)
    File.open(path, "w") do |file|
      YAML.dump(@entries.map(&:to_h), file)
    end
  end

  # Load from a YAML file previously written by #save.
  #
  # Uses `YAML.safe_load` so the file can only contain plain data.
  # Symbols are permitted, so rows keyed by symbols still round-trip.
  # An empty file loads as an empty index.
  #
  # @param path [String, Pathname]
  # @return [Index]
  def self.load(path)
    rows = File.open(path, "r:bom|utf-8") do |file|
      YAML.safe_load(file, permitted_classes: [Symbol], filename: path.to_s)
    end
    new((rows || []).map { |row| RangeEntry.from_h(row) })
  end

  # Build an Index from raw [first_cp, last_cp, name] triples.
  # @param triples [Array<Array(Integer, Integer, String)>]
  # @return [Index]
  def self.from_triples(triples)
    new(triples.map { |first, last, name| RangeEntry.new(first, last, name) })
  end

  private

  # bsearch_index integer-mode convention: -1 = search LEFT, +1 = RIGHT,
  # 0 = match. See Coordinator#find_in_range for the same convention.
  def bsearch_index(codepoint)
    @entries.bsearch_index do |entry|
      if codepoint < entry.first_cp
        -1
      elsif codepoint > entry.last_cp
        1
      else
        0
      end
    end
  end

  # Boolean-mode bsearch: first entry whose `last_cp >= first`.
  def bsearch_first_overlap(first)
    @entries.bsearch_index { |entry| entry.last_cp >= first }
  end
end
106
+ end