ucode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +211 -0
  3. data/Gemfile +22 -0
  4. data/Gemfile.lock +406 -0
  5. data/README.md +469 -0
  6. data/Rakefile +18 -0
  7. data/TODO.new/00-README.md +66 -0
  8. data/TODO.new/01-pillar-terminology-alignment.md +69 -0
  9. data/TODO.new/02-audit-schema-design.md +255 -0
  10. data/TODO.new/03-directory-output-spec.md +203 -0
  11. data/TODO.new/04-fontist-org-contract.md +173 -0
  12. data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
  13. data/TODO.new/06-audit-namespace-skeleton.md +105 -0
  14. data/TODO.new/07-audit-models-port.md +132 -0
  15. data/TODO.new/08-extractors-cheap-port.md +113 -0
  16. data/TODO.new/09-extractors-expensive-port.md +99 -0
  17. data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
  18. data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
  19. data/TODO.new/12-formatters-port.md +115 -0
  20. data/TODO.new/13-directory-emitter.md +147 -0
  21. data/TODO.new/14-html-face-browser.md +144 -0
  22. data/TODO.new/15-html-library-browser.md +102 -0
  23. data/TODO.new/16-cli-audit-subcommands.md +142 -0
  24. data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
  25. data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
  26. data/TODO.new/19-fontisan-docs-update.md +155 -0
  27. data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
  28. data/TODO.new/21-canonical-unicode17-build.md +148 -0
  29. data/TODO.new/22-implementation-order.md +176 -0
  30. data/UCODE_CHANGELOG.md +97 -0
  31. data/exe/ucode +8 -0
  32. data/lib/ucode/aggregator.rb +77 -0
  33. data/lib/ucode/audit/block_aggregator.rb +90 -0
  34. data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
  35. data/lib/ucode/audit/context.rb +137 -0
  36. data/lib/ucode/audit/discrepancy_detector.rb +213 -0
  37. data/lib/ucode/audit/extractors/aggregations.rb +70 -0
  38. data/lib/ucode/audit/extractors/base.rb +21 -0
  39. data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
  40. data/lib/ucode/audit/extractors/coverage.rb +55 -0
  41. data/lib/ucode/audit/extractors/hinting.rb +199 -0
  42. data/lib/ucode/audit/extractors/identity.rb +65 -0
  43. data/lib/ucode/audit/extractors/licensing.rb +75 -0
  44. data/lib/ucode/audit/extractors/metrics.rb +108 -0
  45. data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
  46. data/lib/ucode/audit/extractors/provenance.rb +34 -0
  47. data/lib/ucode/audit/extractors/style.rb +88 -0
  48. data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
  49. data/lib/ucode/audit/extractors.rb +31 -0
  50. data/lib/ucode/audit/plane_aggregator.rb +37 -0
  51. data/lib/ucode/audit/registry.rb +63 -0
  52. data/lib/ucode/audit/script_aggregator.rb +92 -0
  53. data/lib/ucode/audit.rb +27 -0
  54. data/lib/ucode/cache.rb +113 -0
  55. data/lib/ucode/cli.rb +272 -0
  56. data/lib/ucode/commands/build.rb +68 -0
  57. data/lib/ucode/commands/cache.rb +46 -0
  58. data/lib/ucode/commands/fetch.rb +62 -0
  59. data/lib/ucode/commands/font_coverage.rb +57 -0
  60. data/lib/ucode/commands/glyphs.rb +136 -0
  61. data/lib/ucode/commands/lookup.rb +65 -0
  62. data/lib/ucode/commands/parse.rb +62 -0
  63. data/lib/ucode/commands/site.rb +33 -0
  64. data/lib/ucode/commands.rb +19 -0
  65. data/lib/ucode/config.rb +110 -0
  66. data/lib/ucode/coordinator/indices.rb +34 -0
  67. data/lib/ucode/coordinator.rb +397 -0
  68. data/lib/ucode/database.rb +214 -0
  69. data/lib/ucode/db_builder.rb +107 -0
  70. data/lib/ucode/error.rb +96 -0
  71. data/lib/ucode/fetch/code_charts.rb +57 -0
  72. data/lib/ucode/fetch/http.rb +83 -0
  73. data/lib/ucode/fetch/ucd_zip.rb +57 -0
  74. data/lib/ucode/fetch/unihan_zip.rb +57 -0
  75. data/lib/ucode/fetch.rb +14 -0
  76. data/lib/ucode/glyphs/cell_extractor.rb +130 -0
  77. data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
  78. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
  79. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
  80. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
  81. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
  82. data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
  83. data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
  84. data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
  85. data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
  86. data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
  87. data/lib/ucode/glyphs/grid.rb +30 -0
  88. data/lib/ucode/glyphs/grid_detector.rb +165 -0
  89. data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
  90. data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
  91. data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
  92. data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
  93. data/lib/ucode/glyphs/last_resort/source.rb +125 -0
  94. data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
  95. data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
  96. data/lib/ucode/glyphs/last_resort.rb +36 -0
  97. data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
  98. data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
  99. data/lib/ucode/glyphs/page_renderer.rb +221 -0
  100. data/lib/ucode/glyphs/path_bbox.rb +62 -0
  101. data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
  102. data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
  103. data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
  104. data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
  105. data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
  106. data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
  107. data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
  108. data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
  109. data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
  110. data/lib/ucode/glyphs/real_fonts.rb +32 -0
  111. data/lib/ucode/glyphs/writer.rb +250 -0
  112. data/lib/ucode/glyphs.rb +27 -0
  113. data/lib/ucode/index.rb +106 -0
  114. data/lib/ucode/index_builder.rb +94 -0
  115. data/lib/ucode/models/audit/audit_axis.rb +30 -0
  116. data/lib/ucode/models/audit/audit_diff.rb +77 -0
  117. data/lib/ucode/models/audit/audit_report.rb +137 -0
  118. data/lib/ucode/models/audit/baseline.rb +32 -0
  119. data/lib/ucode/models/audit/block_summary.rb +72 -0
  120. data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
  121. data/lib/ucode/models/audit/codepoint_range.rb +39 -0
  122. data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
  123. data/lib/ucode/models/audit/color_capabilities.rb +91 -0
  124. data/lib/ucode/models/audit/discrepancy.rb +38 -0
  125. data/lib/ucode/models/audit/duplicate_group.rb +23 -0
  126. data/lib/ucode/models/audit/embedding_type.rb +81 -0
  127. data/lib/ucode/models/audit/field_change.rb +28 -0
  128. data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
  129. data/lib/ucode/models/audit/gasp_range.rb +63 -0
  130. data/lib/ucode/models/audit/hinting.rb +99 -0
  131. data/lib/ucode/models/audit/library_summary.rb +40 -0
  132. data/lib/ucode/models/audit/licensing.rb +48 -0
  133. data/lib/ucode/models/audit/metrics.rb +111 -0
  134. data/lib/ucode/models/audit/named_instance.rb +41 -0
  135. data/lib/ucode/models/audit/opentype_layout.rb +38 -0
  136. data/lib/ucode/models/audit/plane_summary.rb +31 -0
  137. data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
  138. data/lib/ucode/models/audit/script_features.rb +28 -0
  139. data/lib/ucode/models/audit/script_summary.rb +54 -0
  140. data/lib/ucode/models/audit/variation_detail.rb +42 -0
  141. data/lib/ucode/models/audit.rb +50 -0
  142. data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
  143. data/lib/ucode/models/bidi_mirroring.rb +19 -0
  144. data/lib/ucode/models/binary_property_assignment.rb +26 -0
  145. data/lib/ucode/models/block.rb +36 -0
  146. data/lib/ucode/models/case_folding_rule.rb +23 -0
  147. data/lib/ucode/models/cjk_radical.rb +23 -0
  148. data/lib/ucode/models/codepoint/bidi.rb +28 -0
  149. data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
  150. data/lib/ucode/models/codepoint/case_folding.rb +25 -0
  151. data/lib/ucode/models/codepoint/casing.rb +32 -0
  152. data/lib/ucode/models/codepoint/decomposition.rb +27 -0
  153. data/lib/ucode/models/codepoint/display.rb +24 -0
  154. data/lib/ucode/models/codepoint/emoji.rb +29 -0
  155. data/lib/ucode/models/codepoint/hangul.rb +20 -0
  156. data/lib/ucode/models/codepoint/identifier.rb +30 -0
  157. data/lib/ucode/models/codepoint/indic.rb +20 -0
  158. data/lib/ucode/models/codepoint/joining.rb +20 -0
  159. data/lib/ucode/models/codepoint/normalization.rb +35 -0
  160. data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
  161. data/lib/ucode/models/codepoint.rb +122 -0
  162. data/lib/ucode/models/name_alias.rb +21 -0
  163. data/lib/ucode/models/named_sequence.rb +19 -0
  164. data/lib/ucode/models/names_list_entry.rb +38 -0
  165. data/lib/ucode/models/plane.rb +36 -0
  166. data/lib/ucode/models/property_alias.rb +24 -0
  167. data/lib/ucode/models/property_value_alias.rb +26 -0
  168. data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
  169. data/lib/ucode/models/relationship/cross_reference.rb +17 -0
  170. data/lib/ucode/models/relationship/footnote.rb +24 -0
  171. data/lib/ucode/models/relationship/informal_alias.rb +18 -0
  172. data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
  173. data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
  174. data/lib/ucode/models/relationship.rb +57 -0
  175. data/lib/ucode/models/script.rb +41 -0
  176. data/lib/ucode/models/special_casing_rule.rb +28 -0
  177. data/lib/ucode/models/standardized_variant.rb +24 -0
  178. data/lib/ucode/models/unihan_entry.rb +23 -0
  179. data/lib/ucode/models.rb +47 -0
  180. data/lib/ucode/parsers/auxiliary.rb +26 -0
  181. data/lib/ucode/parsers/base.rb +137 -0
  182. data/lib/ucode/parsers/bidi_brackets.rb +41 -0
  183. data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
  184. data/lib/ucode/parsers/blocks.rb +63 -0
  185. data/lib/ucode/parsers/case_folding.rb +53 -0
  186. data/lib/ucode/parsers/cjk_radicals.rb +102 -0
  187. data/lib/ucode/parsers/derived_age.rb +59 -0
  188. data/lib/ucode/parsers/derived_core_properties.rb +60 -0
  189. data/lib/ucode/parsers/extracted_properties.rb +74 -0
  190. data/lib/ucode/parsers/name_aliases.rb +44 -0
  191. data/lib/ucode/parsers/named_sequences.rb +51 -0
  192. data/lib/ucode/parsers/names_list.rb +250 -0
  193. data/lib/ucode/parsers/property_aliases.rb +41 -0
  194. data/lib/ucode/parsers/property_value_aliases.rb +46 -0
  195. data/lib/ucode/parsers/script_extensions.rb +64 -0
  196. data/lib/ucode/parsers/scripts.rb +60 -0
  197. data/lib/ucode/parsers/special_casing.rb +62 -0
  198. data/lib/ucode/parsers/standardized_variants.rb +56 -0
  199. data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
  200. data/lib/ucode/parsers/unicode_data.rb +268 -0
  201. data/lib/ucode/parsers/unihan.rb +125 -0
  202. data/lib/ucode/parsers.rb +35 -0
  203. data/lib/ucode/range_entry.rb +58 -0
  204. data/lib/ucode/repo/aggregate_writer.rb +364 -0
  205. data/lib/ucode/repo/atomic_writes.rb +48 -0
  206. data/lib/ucode/repo/codepoint_writer.rb +96 -0
  207. data/lib/ucode/repo/paths.rb +122 -0
  208. data/lib/ucode/repo.rb +22 -0
  209. data/lib/ucode/site/config_emitter.rb +124 -0
  210. data/lib/ucode/site/generator.rb +178 -0
  211. data/lib/ucode/site/search_index.rb +68 -0
  212. data/lib/ucode/site/template/.gitignore +4 -0
  213. data/lib/ucode/site/template/.vitepress/config.ts +8 -0
  214. data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
  215. data/lib/ucode/site/template/char/[codepoint].md +13 -0
  216. data/lib/ucode/site/template/components/BlockView.vue +57 -0
  217. data/lib/ucode/site/template/components/CharView.vue +85 -0
  218. data/lib/ucode/site/template/components/PlaneView.vue +56 -0
  219. data/lib/ucode/site/template/components/SearchView.vue +66 -0
  220. data/lib/ucode/site/template/index.md +25 -0
  221. data/lib/ucode/site/template/package.json +18 -0
  222. data/lib/ucode/site/template/search.md +9 -0
  223. data/lib/ucode/site.rb +13 -0
  224. data/lib/ucode/version.rb +5 -0
  225. data/lib/ucode/version_resolver.rb +76 -0
  226. data/lib/ucode.rb +74 -0
  227. data/ucode.gemspec +56 -0
  228. metadata +404 -0
@@ -0,0 +1,165 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+
5
+ require "ucode/glyphs/grid"
6
+ require "ucode/glyphs/path_bbox"
7
+
8
module Ucode
  module Glyphs
    # Detects the chart grid in a Code Charts PDF page rendered to SVG.
    #
    # The PDF page produced by pdftocairo / pdf2svg / dvisvgm contains
    # every visible element (title, block name, row labels, codepoint
    # digits, and the actual character glyphs) as positioned `<use>`
    # references into a `<defs>` block of named glyph outlines. The
    # character cells we want to extract correspond to glyphs whose
    # bounding box is larger than every label or digit font on the
    # page — the chart's character samples are drawn at a larger size
    # than any of the surrounding text.
    #
    # Algorithm (as implemented below):
    #   1. Collect every `<use>` whose href names a
    #      `glyph-<set>-<index>` id, recording its x/y position.
    #   2. Walk `<defs>`, estimating each glyph's bbox as the union of
    #      the `PathBbox` estimates of its `<path>` children.
    #   3. Classify a glyph as "character-sized" when its width and
    #      height both reach `CharSizeThreshold`. This is meant to
    #      exclude title, row-label, and digit glyphs while keeping
    #      every character sample, including pages that mix several
    #      character fonts.
    #   4. Keep only the `<use>`s that reference a character-sized
    #      glyph; these are the cell origins.
    #   5. Cluster the Y values of ALL cell origins into rows and,
    #      independently, the X values of ALL cell origins into
    #      columns.
    #   6. Return a `Grid` value object anchored at the smallest
    #      column/row coordinate, with uniform pitches derived from the
    #      median spacing between adjacent cluster starts.
    #
    # NOTE: columns are clustered globally, not per row, and no row is
    # discarded for having an unusual number of cells. Any oversized
    # header/footer glyph that passes the size threshold therefore
    # contributes a row/column to the result.
    #
    # This is pure (no I/O). The detector takes a parsed Nokogiri
    # document and returns a `Grid`.
    class GridDetector
      # Minimum bbox width AND height for a glyph to count as a
      # character sample. Fixed value; not configurable by callers.
      CharSizeThreshold = 8.0
      # Largest gap between two consecutive sorted coordinates that
      # still places them in the same row/column cluster.
      ClusterEpsilon = 15.0
      private_constant :CharSizeThreshold, :ClusterEpsilon

      class << self
        # @param doc [Nokogiri::XML::Document]
        # @param block_first_cp [Integer] first codepoint of the block;
        #   stored on the Grid so callers can map codepoint ↔ cell.
        # @return [Ucode::Glyphs::Grid, nil] nil if no character grid
        #   could be detected.
        def detect(doc, block_first_cp:)
          uses = collect_uses(doc)
          return nil if uses.empty?

          char_glyph_ids = char_sized_glyph_ids(doc)
          return nil if char_glyph_ids.empty?

          cell_uses = uses.select { |use| char_glyph_ids.key?(use.glyph_id) }
          return nil if cell_uses.empty?

          build_grid(cell_uses, block_first_cp)
        end

        private

        # Position of one `<use>` plus the glyph it references.
        # `set_id` is the first numeric component of the glyph id.
        UsePosition = Struct.new(:x, :y, :glyph_id, :set_id, keyword_init: true)

        # Every `<use>` pointing at a `glyph-N-M` definition. A missing
        # x/y attribute reads as 0.0 (nil.to_f).
        def collect_uses(doc)
          doc.css("use").each_with_object([]) do |node, uses|
            href = node["xlink:href"] || node["href"] || ""
            glyph_id = href.sub(/\A#/, "")
            match = glyph_id.match(/\Aglyph-(\d+)-(\d+)\z/)
            next unless match

            uses << UsePosition.new(
              x: node["x"].to_f,
              y: node["y"].to_f,
              glyph_id: glyph_id,
              set_id: match[1].to_i,
            )
          end
        end

        # @return [Hash{String=>true}] ids of the glyph definitions
        #   whose combined path bbox is character-sized
        def char_sized_glyph_ids(doc)
          doc.css("defs g[id^='glyph-']").each_with_object({}) do |g, acc|
            id = g["id"]
            # Regexp#match? tolerates a nil id, unlike String#match?.
            next unless /\Aglyph-\d+-\d+\z/.match?(id)

            paths = g.css("path")
            next if paths.empty?

            # reduce without a seed yields nil when every path bbox
            # was empty, which the guard below skips.
            bbox = paths.map { |p| PathBbox.estimate(p["d"]) }
                        .reject(&:empty?)
                        .reduce { |a, b| union_bbox(a, b) }
            next unless bbox

            acc[id] = true if char_sized?(bbox)
          end
        end

        # Smallest bbox enclosing both `a` and `b`.
        def union_bbox(a, b)
          PathBbox::Result.new(
            min_x: [a.min_x, b.min_x].min,
            min_y: [a.min_y, b.min_y].min,
            max_x: [a.max_x, b.max_x].max,
            max_y: [a.max_y, b.max_y].max,
          )
        end

        def char_sized?(bbox)
          bbox.width >= CharSizeThreshold && bbox.height >= CharSizeThreshold
        end

        # Median of `values`; 0.0 for an empty list. An even-sized list
        # yields the Float mean of the two middle elements.
        def median(values)
          return 0.0 if values.empty?

          sorted = values.sort
          mid = sorted.size / 2
          sorted.size.even? ? (sorted[mid - 1] + sorted[mid]) / 2.0 : sorted[mid]
        end

        # Builds the Grid from the cell origins. Rows and columns are
        # clustered independently over the full set of uses.
        def build_grid(cell_uses, block_first_cp)
          row_clusters = cluster_by_value(cell_uses, :y)
          return nil if row_clusters.empty?

          column_clusters = cluster_by_value(cell_uses, :x)
          return nil if column_clusters.empty?

          column_starts = column_clusters.map { |c| c.map(&:x).min }.sort
          row_starts = row_clusters.map { |c| c.map(&:y).min }.sort

          Grid.new(
            origin_x: column_starts.first,
            origin_y: row_starts.first,
            column_pitch: median_pitch(column_starts),
            row_pitch: median_pitch(row_starts),
            columns: column_starts.size,
            rows: row_starts.size,
            block_first_cp: block_first_cp,
          )
        end

        # Groups `items` by `attr`: after sorting, an item joins the
        # current cluster while its value is within `ClusterEpsilon`
        # of the PREVIOUS item's value (chained comparison, not a
        # comparison against the cluster's first value).
        #
        # @return [Array<Array>] clusters in ascending order of `attr`
        def cluster_by_value(items, attr)
          sorted = items.sort_by { |i| i.public_send(attr) }
          clusters = []
          sorted.each do |item|
            value = item.public_send(attr)
            if clusters.empty? || (value - clusters.last[:max]).abs > ClusterEpsilon
              clusters << { max: value, items: [item] }
            else
              clusters.last[:max] = value
              clusters.last[:items] << item
            end
          end
          clusters.map { |c| c[:items] }
        end

        # Median distance between neighbouring values of an ascending
        # list; 0.0 when fewer than two values are given.
        def median_pitch(sorted_values)
          return 0.0 if sorted_values.size < 2

          pitches = sorted_values.each_cons(2).map { |a, b| b - a }
          median(pitches)
        end
      end
    end
  end
end
@@ -0,0 +1,96 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+
5
+ require "ucode/error"
6
+
7
module Ucode
  module Glyphs
    module LastResort
      # Parses the Last Resort Font `cmap-f13.ttx` once into a flat
      # `{codepoint_int => glyph_name}` lookup.
      #
      # Per the original author's notes, the Format 13 cmap has
      # 1,114,112 entries (every codepoint from U+0000 to U+10FFFF)
      # and the parsed Hash costs roughly 80 MB. Each entry looks like:
      #
      #   <map code="0x0" name="lastresortlatin"/>
      #
      # Only `<map>` children of `/ttFont/cmap/cmap_format_13` are
      # read; other cmap formats and the platform/encoding attributes
      # are ignored.
      #
      # The index is built lazily on first access and memoized per
      # instance. This class has no caching hook of its own: a
      # long-running process that wants to reuse the index should hold
      # on to the instance (or to the frozen Hash from {#to_h}).
      class CmapIndex
        CodeAttr = "code"
        private_constant :CodeAttr

        NameAttr = "name"
        private_constant :NameAttr

        # Parse the cmap file at `path` and return a frozen Hash.
        #
        # @param path [String, Pathname, #to_path] cmap-f13.ttx path
        # @return [Hash{Integer=>String}] codepoint → glyph name
        def self.parse(path)
          new(path).to_h
        end

        # @param path [String, Pathname, #to_path] cmap-f13.ttx path
        def initialize(path)
          @path = Pathname.new(path)
        end

        # @return [Hash{Integer=>String}] frozen codepoint → glyph name
        def to_h
          @to_h ||= build_index.freeze
        end

        # @param codepoint [Integer]
        # @return [String, nil] glyph name or nil if no entry
        def [](codepoint)
          to_h[codepoint]
        end

        # @return [Boolean]
        def key?(codepoint)
          to_h.key?(codepoint)
        end

        # @return [Integer] number of entries
        def size
          to_h.size
        end

        private

        # Reads the whole file and walks the Format 13 `<map>` nodes.
        # Entries with an unparseable code or a blank name are skipped;
        # a later entry for the same code overwrites an earlier one.
        def build_index
          doc = Nokogiri::XML(@path.read) do |config|
            config.noblanks.strict
          end
          index = {}
          doc.xpath("/ttFont/cmap/cmap_format_13/map").each do |node|
            code = parse_code(node[CodeAttr])
            name = node[NameAttr]
            next if code.nil? || name.nil? || name.empty?

            index[code] = name
          end
          index
        end

        # Converts a TTX `code` attribute (hex, optional `0x`/`0X`
        # prefix) to an Integer.
        #
        # Uses the strict `Integer()` conversion so malformed values
        # yield nil and the entry is skipped. The lenient
        # `String#to_i(16)` would turn garbage into 0 and silently
        # overwrite the U+0000 entry.
        #
        # @param raw [String, nil]
        # @return [Integer, nil] nil when blank or not valid hex
        def parse_code(raw)
          return nil if raw.nil? || raw.empty?

          digits = raw.start_with?("0x", "0X") ? raw[2..] : raw
          Integer(digits, 16, exception: false)
        end
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+
5
module Ucode
  module Glyphs
    module LastResort
      # Glyph-name → `.glif` basename lookup backed by the UFO
      # `contents.plist`.
      #
      # The plist holds alternating `<key>` / `<string>` children under
      # `/plist/dict`:
      #
      #   <dict>
      #     <key>lastresortlatin</key>
      #     <string>lastresortlatin.glif</string>
      #     ...
      #   </dict>
      #
      # The file is read and parsed at most once per instance: the
      # resulting Hash is built on first access, frozen, and reused for
      # every later lookup.
      class Contents
        KeyEl = "key"
        private_constant :KeyEl

        StringEl = "string"
        private_constant :StringEl

        # One-shot helper: parse `path` and hand back the frozen Hash.
        #
        # @param path [String, Pathname, #to_path] contents.plist path
        # @return [Hash{String=>String}] glyph name → glif basename
        def self.parse(path)
          new(path).to_h
        end

        # @param path [String, Pathname, #to_path] contents.plist path
        def initialize(path)
          @path = Pathname.new(path)
        end

        # @return [Hash{String=>String}] frozen glyph name → glif basename
        def to_h
          @to_h ||= build_index.freeze
        end

        # @param glyph_name [String]
        # @return [String, nil] glif basename (e.g. "lastresortlatin.glif")
        def [](glyph_name)
          to_h[glyph_name]
        end

        # @return [Boolean]
        def key?(glyph_name)
          to_h.key?(glyph_name)
        end

        private

        # Walks the dict children two at a time and records every
        # (`<key>`, `<string>`) pair; any other pairing is skipped.
        def build_index
          xml = Nokogiri::XML(@path.read) { |config| config.noblanks.strict }
          mapping = {}
          xml.xpath("/plist/dict/*").each_slice(2) do |name_node, file_node|
            next if name_node.name != KeyEl
            # A trailing unpaired key leaves file_node nil.
            next if file_node.nil? || file_node.name != StringEl

            mapping[name_node.text] = file_node.text
          end
          mapping
        end
      end
    end
  end
end
@@ -0,0 +1,124 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+
5
+ require "ucode/error"
6
+
7
module Ucode
  module Glyphs
    module LastResort
      # Parses one UFO `.glif` outline file into a {Glif::Outline} value
      # object: advance width + list of contours, each contour being a
      # list of {Glif::Point}s.
      #
      # UFO point semantics:
      #
      # * `type="move"`   — on-curve; starts a new contour.
      # * `type="line"`   — on-curve; straight line from previous.
      # * `type="curve"`  — on-curve; cubic Bezier. The preceding 1–2
      #                     points with no `type` are off-curve control
      #                     points.
      # * `type="qcurve"` — on-curve; quadratic Bezier. Preceding N
      #                     points with no `type` are off-curve controls.
      # * no `type`       — off-curve control point.
      #
      # Contours are stored exactly as listed in the file; no closing
      # point is appended here. Per the original design notes, {Svg}
      # adds the closing `Z` when emitting SVG path data.
      #
      # All coordinates are in font units. Integral values are returned
      # as Integer (the Last Resort UFO only uses integers); fractional
      # values are kept as Float rather than truncated.
      module Glif
        # Single outline point. `kind` is one of `:offcurve`, `:move`,
        # `:line`, `:curve`, `:qcurve`.
        Point = Struct.new(:x, :y, :kind, keyword_init: true) do
          def on_curve?
            kind != :offcurve
          end
        end

        # One contour — an ordered list of {Point}s.
        Contour = Struct.new(:points, keyword_init: true)

        # Parsed outline value object.
        Outline = Struct.new(:advance, :contours, keyword_init: true) do
          # Bounding box over ALL points, off-curve controls included
          # (so it can be looser than the true curve extent).
          #
          # @return [Hash{Symbol=>Numeric}, nil] nil when there are no
          #   points at all
          def bbox
            points = contours.flat_map(&:points)
            return nil if points.empty?

            xs = points.map(&:x)
            ys = points.map(&:y)
            { min_x: xs.min, min_y: ys.min, max_x: xs.max, max_y: ys.max }
          end
        end

        # @param path [String, Pathname, #to_path] `.glif` file path
        # @return [Outline]
        def self.read(path)
          parse(Pathname.new(path))
        end

        # @param path [String, Pathname, #to_path, #read] `.glif` file
        #   path; anything that does not already respond to `read` is
        #   wrapped in a Pathname first, so a plain String works as
        #   documented.
        # @return [Outline]
        # @raise [Ucode::GlyphError] when the document has no `<glyph>`
        #   element or a point carries an unknown `type`
        def self.parse(path)
          readable = path.respond_to?(:read) ? path : Pathname.new(path)
          doc = Nokogiri::XML(readable.read) do |config|
            config.noblanks.strict
          end
          glyph = doc.at_xpath("/glyph") || doc.at_xpath("//glyph")
          raise Ucode::GlyphError, "not a UFO .glif file: #{path}" unless glyph

          advance = parse_advance(glyph)
          contours = parse_contours(glyph)
          Outline.new(advance: advance, contours: contours)
        end

        class << self
          private

          # Advance width from `<advance width="…"/>`; 0 when the
          # element or the attribute is absent.
          def parse_advance(glyph)
            node = glyph.at_xpath("advance")
            return 0 unless node

            parse_number(node["width"])
          end

          # One {Contour} per `<contour>` under `<outline>`; an empty
          # list when the glyph has no `<outline>`.
          def parse_contours(glyph)
            outline_node = glyph.at_xpath("outline")
            return [] unless outline_node

            outline_node.xpath("contour").map do |contour_node|
              points = contour_node.xpath("point").map do |point_node|
                Point.new(
                  x: parse_number(point_node["x"]),
                  y: parse_number(point_node["y"]),
                  kind: parse_kind(point_node["type"]),
                )
              end
              Contour.new(points: points)
            end
          end

          # Converts a numeric attribute to a number without losing a
          # fractional part.
          #
          # * missing attribute        → 0
          # * integer text ("12")      → Integer
          # * integral float ("12.0")  → Integer, so integer-only files
          #                              produce the same values as
          #                              before
          # * fractional float ("1.5") → Float
          # * anything else            → lenient `String#to_i`, as before
          #
          # @param raw [String, nil]
          # @return [Integer, Float]
          def parse_number(raw)
            return 0 if raw.nil?

            integer = Integer(raw, 10, exception: false)
            return integer if integer

            float = Float(raw, exception: false)
            # Non-finite values (e.g. "1e999") cannot be used as coordinates.
            return raw.to_i unless float&.finite?

            float.to_i == float ? float.to_i : float
          end

          # Maps the `type` attribute to a {Point} kind.
          def parse_kind(type)
            case type
            when nil then :offcurve
            when "move" then :move
            when "line" then :line
            when "curve" then :curve
            when "qcurve" then :qcurve
            else
              raise Ucode::GlyphError, "unknown glif point type: #{type.inspect}"
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/error"
4
+ require "ucode/glyphs/last_resort/cmap_index"
5
+ require "ucode/glyphs/last_resort/contents"
6
+ require "ucode/glyphs/last_resort/glif"
7
+ require "ucode/glyphs/last_resort/svg"
8
+
9
module Ucode
  module Glyphs
    module LastResort
      # Renders one codepoint's Last Resort glyph by walking four
      # stages in order:
      #
      #   cmap (codepoint → glyph name)
      #   → contents (glyph name → .glif basename)
      #   → glif (file → outline)
      #   → svg (outline → SVG document)
      #
      # The {CmapIndex} and {Contents} objects are created on first use
      # and memoized on the Renderer, so repeated {#render} calls on one
      # instance share them.
      #
      # {#render} answers `nil` (rather than raising) when the codepoint
      # has no cmap entry, the glyph name has no contents entry, or the
      # `.glif` file is not on disk. Callers decide whether to log or to
      # fall back to a generic placeholder.
      class Renderer
        # Outcome of rendering one codepoint.
        Result = Struct.new(:codepoint, :glyph_name, :svg, keyword_init: true) do
          def ok?
            !svg.nil?
          end
        end

        # @param source [Source]
        def initialize(source)
          @source = source
        end

        # @param codepoint [Integer]
        # @return [Result, nil] nil when the codepoint isn't in the cmap
        #   or the named glyph is missing from disk
        def render(codepoint)
          name = cmap[codepoint]
          # contents is only consulted once a glyph name is known.
          file = contents[name] if name
          return nil unless file

          glif_file = @source.glif_path(file)
          return nil unless glif_file.exist?

          document = Svg.new(Glif.read(glif_file), codepoint: codepoint).to_s
          Result.new(codepoint: codepoint, glyph_name: name, svg: document)
        end

        # @return [CmapIndex]
        def cmap
          @cmap ||= CmapIndex.new(@source.cmap_path)
        end

        # @return [Contents]
        def contents
          @contents ||= Contents.new(@source.contents_path)
        end
      end
    end
  end
end
@@ -0,0 +1,125 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+
5
+ require "ucode/error"
6
+
7
module Ucode
  module Glyphs
    module LastResort
      # Locates the Last Resort Font UFO source on disk.
      #
      # Resolution order (first match wins):
      #
      #   1. Explicit `root:` argument (used as given, even if it does
      #      not look like a UFO root; validation then reports what is
      #      missing).
      #   2. `UCODE_LAST_RESORT_FONT_ROOT` environment variable, if the
      #      directory exists and contains `font.ufo/glyphs`.
      #   3. Conventional path `../../external/unicode/
      #      last-resort-font` relative to the gem root, under the same
      #      condition.
      #
      # NOTE: this class does not consult `Ucode::Config`; callers that
      # hold a configured root should pass it as `root:`.
      #
      # The UFO must contain:
      #
      #   * `cmap-f13.ttx` — Format 13 cmap (cp → glyph name).
      #   * `font.ufo/glyphs/contents.plist` — glyph name → .glif file.
      #   * `font.ufo/glyphs/*.glif` — outline files (only the directory
      #     itself is checked here).
      #
      # If any required artifact is missing, the constructor raises
      # {Ucode::LastResortMissingError} with a `context:` payload
      # holding the resolved root (`:resolved_root`), the env var name
      # (`:env_var`) and the relative paths that are absent
      # (`:missing`).
      class Source
        attr_reader :root, :cmap_path, :glyphs_dir, :contents_path

        # Expected layout inside the UFO root.
        CMAP_REL = "cmap-f13.ttx"
        private_constant :CMAP_REL

        GLYPHS_REL = "font.ufo/glyphs"
        private_constant :GLYPHS_REL

        CONTENTS_REL = "font.ufo/glyphs/contents.plist"
        private_constant :CONTENTS_REL

        # Name of the environment variable consulted for the root.
        ENV_VAR = "UCODE_LAST_RESORT_FONT_ROOT"
        private_constant :ENV_VAR

        # @param root [String, Pathname, nil] explicit UFO root
        # @param env [Hash{String=>String}] env var source (defaults to ENV)
        # @param gem_root [String, Pathname, nil] gem root for the
        #   conventional fallback (defaults to the directory holding
        #   `lib/ucode`); injectable for tests
        # @raise [Ucode::LastResortMissingError] if no root could be
        #   resolved or a required artifact is missing at the resolved
        #   root
        def initialize(root: nil, env: ENV, gem_root: nil)
          @root = resolve_root(root, env, gem_root)
          validate!
        end

        # @return [Boolean] true if all required artifacts are present
        def available?
          missing_artifacts.empty?
        end

        # Path to a specific `.glif` file by basename. Does NOT verify
        # the file exists; callers resolve via {Contents} first.
        #
        # @param basename [String] e.g. "lastresortlatin.glif"
        # @return [Pathname]
        def glif_path(basename)
          @glyphs_dir.join(basename)
        end

        private

        # @return [Pathname, nil] nil when no candidate qualifies
        def resolve_root(explicit, env, gem_root)
          return Pathname.new(explicit).expand_path if explicit

          candidates = []
          env_val = env[ENV_VAR]
          # Expanded like the explicit root, so a relative value is
          # pinned to the working directory at construction time.
          candidates << Pathname.new(env_val).expand_path if env_val && !env_val.empty?
          candidates << conventional_path(gem_root)
          candidates.find { |c| c.exist? && looks_like_ufo_root?(c) }
        end

        def conventional_path(gem_root)
          base = gem_root ? Pathname.new(gem_root) : default_gem_root
          # gem_root is the project root (e.g. /.../fontist/ucode).
          # The Last Resort Font is conventionally checked out at
          # <workspace>/external/unicode/last-resort-font, where
          # <workspace> is two levels up from the gem root.
          base.expand_path.parent.parent.join("external", "unicode", "last-resort-font")
        end

        def default_gem_root
          # __dir__ = lib/ucode/glyphs/last_resort. Four `..` get us back
          # to the project root (the directory containing `lib/`).
          Pathname.new(__dir__).join("..", "..", "..", "..")
        end

        def looks_like_ufo_root?(path)
          path.join("font.ufo", "glyphs").directory?
        end

        # Derives the artifact paths from the root and raises if the
        # root is unresolved or any artifact is absent.
        def validate!
          raise_missing([CMAP_REL, GLYPHS_REL, CONTENTS_REL]) if @root.nil?

          @cmap_path = @root.join(CMAP_REL)
          @glyphs_dir = @root.join(GLYPHS_REL)
          @contents_path = @root.join(CONTENTS_REL)

          missing = missing_artifacts
          raise_missing(missing) unless missing.empty?
        end

        # @return [Array<String>] relative paths of the required
        #   artifacts that do not exist under the root
        def missing_artifacts
          {
            CMAP_REL => @cmap_path,
            GLYPHS_REL => @glyphs_dir,
            CONTENTS_REL => @contents_path,
          }.reject { |_rel, path| path.exist? }.keys
        end

        # @param missing [Array<String>] relative paths found absent
        def raise_missing(missing)
          raise Ucode::LastResortMissingError.new(
            "Last Resort Font UFO source not found",
            context: {
              resolved_root: @root&.to_s,
              env_var: ENV_VAR,
              missing: missing,
            },
          )
        end
      end
    end
  end
end