ucode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +211 -0
  3. data/Gemfile +22 -0
  4. data/Gemfile.lock +406 -0
  5. data/README.md +469 -0
  6. data/Rakefile +18 -0
  7. data/TODO.new/00-README.md +66 -0
  8. data/TODO.new/01-pillar-terminology-alignment.md +69 -0
  9. data/TODO.new/02-audit-schema-design.md +255 -0
  10. data/TODO.new/03-directory-output-spec.md +203 -0
  11. data/TODO.new/04-fontist-org-contract.md +173 -0
  12. data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
  13. data/TODO.new/06-audit-namespace-skeleton.md +105 -0
  14. data/TODO.new/07-audit-models-port.md +132 -0
  15. data/TODO.new/08-extractors-cheap-port.md +113 -0
  16. data/TODO.new/09-extractors-expensive-port.md +99 -0
  17. data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
  18. data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
  19. data/TODO.new/12-formatters-port.md +115 -0
  20. data/TODO.new/13-directory-emitter.md +147 -0
  21. data/TODO.new/14-html-face-browser.md +144 -0
  22. data/TODO.new/15-html-library-browser.md +102 -0
  23. data/TODO.new/16-cli-audit-subcommands.md +142 -0
  24. data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
  25. data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
  26. data/TODO.new/19-fontisan-docs-update.md +155 -0
  27. data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
  28. data/TODO.new/21-canonical-unicode17-build.md +148 -0
  29. data/TODO.new/22-implementation-order.md +176 -0
  30. data/UCODE_CHANGELOG.md +97 -0
  31. data/exe/ucode +8 -0
  32. data/lib/ucode/aggregator.rb +77 -0
  33. data/lib/ucode/audit/block_aggregator.rb +90 -0
  34. data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
  35. data/lib/ucode/audit/context.rb +137 -0
  36. data/lib/ucode/audit/discrepancy_detector.rb +213 -0
  37. data/lib/ucode/audit/extractors/aggregations.rb +70 -0
  38. data/lib/ucode/audit/extractors/base.rb +21 -0
  39. data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
  40. data/lib/ucode/audit/extractors/coverage.rb +55 -0
  41. data/lib/ucode/audit/extractors/hinting.rb +199 -0
  42. data/lib/ucode/audit/extractors/identity.rb +65 -0
  43. data/lib/ucode/audit/extractors/licensing.rb +75 -0
  44. data/lib/ucode/audit/extractors/metrics.rb +108 -0
  45. data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
  46. data/lib/ucode/audit/extractors/provenance.rb +34 -0
  47. data/lib/ucode/audit/extractors/style.rb +88 -0
  48. data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
  49. data/lib/ucode/audit/extractors.rb +31 -0
  50. data/lib/ucode/audit/plane_aggregator.rb +37 -0
  51. data/lib/ucode/audit/registry.rb +63 -0
  52. data/lib/ucode/audit/script_aggregator.rb +92 -0
  53. data/lib/ucode/audit.rb +27 -0
  54. data/lib/ucode/cache.rb +113 -0
  55. data/lib/ucode/cli.rb +272 -0
  56. data/lib/ucode/commands/build.rb +68 -0
  57. data/lib/ucode/commands/cache.rb +46 -0
  58. data/lib/ucode/commands/fetch.rb +62 -0
  59. data/lib/ucode/commands/font_coverage.rb +57 -0
  60. data/lib/ucode/commands/glyphs.rb +136 -0
  61. data/lib/ucode/commands/lookup.rb +65 -0
  62. data/lib/ucode/commands/parse.rb +62 -0
  63. data/lib/ucode/commands/site.rb +33 -0
  64. data/lib/ucode/commands.rb +19 -0
  65. data/lib/ucode/config.rb +110 -0
  66. data/lib/ucode/coordinator/indices.rb +34 -0
  67. data/lib/ucode/coordinator.rb +397 -0
  68. data/lib/ucode/database.rb +214 -0
  69. data/lib/ucode/db_builder.rb +107 -0
  70. data/lib/ucode/error.rb +96 -0
  71. data/lib/ucode/fetch/code_charts.rb +57 -0
  72. data/lib/ucode/fetch/http.rb +83 -0
  73. data/lib/ucode/fetch/ucd_zip.rb +57 -0
  74. data/lib/ucode/fetch/unihan_zip.rb +57 -0
  75. data/lib/ucode/fetch.rb +14 -0
  76. data/lib/ucode/glyphs/cell_extractor.rb +130 -0
  77. data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
  78. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
  79. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
  80. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
  81. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
  82. data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
  83. data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
  84. data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
  85. data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
  86. data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
  87. data/lib/ucode/glyphs/grid.rb +30 -0
  88. data/lib/ucode/glyphs/grid_detector.rb +165 -0
  89. data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
  90. data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
  91. data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
  92. data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
  93. data/lib/ucode/glyphs/last_resort/source.rb +125 -0
  94. data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
  95. data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
  96. data/lib/ucode/glyphs/last_resort.rb +36 -0
  97. data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
  98. data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
  99. data/lib/ucode/glyphs/page_renderer.rb +221 -0
  100. data/lib/ucode/glyphs/path_bbox.rb +62 -0
  101. data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
  102. data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
  103. data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
  104. data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
  105. data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
  106. data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
  107. data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
  108. data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
  109. data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
  110. data/lib/ucode/glyphs/real_fonts.rb +32 -0
  111. data/lib/ucode/glyphs/writer.rb +250 -0
  112. data/lib/ucode/glyphs.rb +27 -0
  113. data/lib/ucode/index.rb +106 -0
  114. data/lib/ucode/index_builder.rb +94 -0
  115. data/lib/ucode/models/audit/audit_axis.rb +30 -0
  116. data/lib/ucode/models/audit/audit_diff.rb +77 -0
  117. data/lib/ucode/models/audit/audit_report.rb +137 -0
  118. data/lib/ucode/models/audit/baseline.rb +32 -0
  119. data/lib/ucode/models/audit/block_summary.rb +72 -0
  120. data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
  121. data/lib/ucode/models/audit/codepoint_range.rb +39 -0
  122. data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
  123. data/lib/ucode/models/audit/color_capabilities.rb +91 -0
  124. data/lib/ucode/models/audit/discrepancy.rb +38 -0
  125. data/lib/ucode/models/audit/duplicate_group.rb +23 -0
  126. data/lib/ucode/models/audit/embedding_type.rb +81 -0
  127. data/lib/ucode/models/audit/field_change.rb +28 -0
  128. data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
  129. data/lib/ucode/models/audit/gasp_range.rb +63 -0
  130. data/lib/ucode/models/audit/hinting.rb +99 -0
  131. data/lib/ucode/models/audit/library_summary.rb +40 -0
  132. data/lib/ucode/models/audit/licensing.rb +48 -0
  133. data/lib/ucode/models/audit/metrics.rb +111 -0
  134. data/lib/ucode/models/audit/named_instance.rb +41 -0
  135. data/lib/ucode/models/audit/opentype_layout.rb +38 -0
  136. data/lib/ucode/models/audit/plane_summary.rb +31 -0
  137. data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
  138. data/lib/ucode/models/audit/script_features.rb +28 -0
  139. data/lib/ucode/models/audit/script_summary.rb +54 -0
  140. data/lib/ucode/models/audit/variation_detail.rb +42 -0
  141. data/lib/ucode/models/audit.rb +50 -0
  142. data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
  143. data/lib/ucode/models/bidi_mirroring.rb +19 -0
  144. data/lib/ucode/models/binary_property_assignment.rb +26 -0
  145. data/lib/ucode/models/block.rb +36 -0
  146. data/lib/ucode/models/case_folding_rule.rb +23 -0
  147. data/lib/ucode/models/cjk_radical.rb +23 -0
  148. data/lib/ucode/models/codepoint/bidi.rb +28 -0
  149. data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
  150. data/lib/ucode/models/codepoint/case_folding.rb +25 -0
  151. data/lib/ucode/models/codepoint/casing.rb +32 -0
  152. data/lib/ucode/models/codepoint/decomposition.rb +27 -0
  153. data/lib/ucode/models/codepoint/display.rb +24 -0
  154. data/lib/ucode/models/codepoint/emoji.rb +29 -0
  155. data/lib/ucode/models/codepoint/hangul.rb +20 -0
  156. data/lib/ucode/models/codepoint/identifier.rb +30 -0
  157. data/lib/ucode/models/codepoint/indic.rb +20 -0
  158. data/lib/ucode/models/codepoint/joining.rb +20 -0
  159. data/lib/ucode/models/codepoint/normalization.rb +35 -0
  160. data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
  161. data/lib/ucode/models/codepoint.rb +122 -0
  162. data/lib/ucode/models/name_alias.rb +21 -0
  163. data/lib/ucode/models/named_sequence.rb +19 -0
  164. data/lib/ucode/models/names_list_entry.rb +38 -0
  165. data/lib/ucode/models/plane.rb +36 -0
  166. data/lib/ucode/models/property_alias.rb +24 -0
  167. data/lib/ucode/models/property_value_alias.rb +26 -0
  168. data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
  169. data/lib/ucode/models/relationship/cross_reference.rb +17 -0
  170. data/lib/ucode/models/relationship/footnote.rb +24 -0
  171. data/lib/ucode/models/relationship/informal_alias.rb +18 -0
  172. data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
  173. data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
  174. data/lib/ucode/models/relationship.rb +57 -0
  175. data/lib/ucode/models/script.rb +41 -0
  176. data/lib/ucode/models/special_casing_rule.rb +28 -0
  177. data/lib/ucode/models/standardized_variant.rb +24 -0
  178. data/lib/ucode/models/unihan_entry.rb +23 -0
  179. data/lib/ucode/models.rb +47 -0
  180. data/lib/ucode/parsers/auxiliary.rb +26 -0
  181. data/lib/ucode/parsers/base.rb +137 -0
  182. data/lib/ucode/parsers/bidi_brackets.rb +41 -0
  183. data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
  184. data/lib/ucode/parsers/blocks.rb +63 -0
  185. data/lib/ucode/parsers/case_folding.rb +53 -0
  186. data/lib/ucode/parsers/cjk_radicals.rb +102 -0
  187. data/lib/ucode/parsers/derived_age.rb +59 -0
  188. data/lib/ucode/parsers/derived_core_properties.rb +60 -0
  189. data/lib/ucode/parsers/extracted_properties.rb +74 -0
  190. data/lib/ucode/parsers/name_aliases.rb +44 -0
  191. data/lib/ucode/parsers/named_sequences.rb +51 -0
  192. data/lib/ucode/parsers/names_list.rb +250 -0
  193. data/lib/ucode/parsers/property_aliases.rb +41 -0
  194. data/lib/ucode/parsers/property_value_aliases.rb +46 -0
  195. data/lib/ucode/parsers/script_extensions.rb +64 -0
  196. data/lib/ucode/parsers/scripts.rb +60 -0
  197. data/lib/ucode/parsers/special_casing.rb +62 -0
  198. data/lib/ucode/parsers/standardized_variants.rb +56 -0
  199. data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
  200. data/lib/ucode/parsers/unicode_data.rb +268 -0
  201. data/lib/ucode/parsers/unihan.rb +125 -0
  202. data/lib/ucode/parsers.rb +35 -0
  203. data/lib/ucode/range_entry.rb +58 -0
  204. data/lib/ucode/repo/aggregate_writer.rb +364 -0
  205. data/lib/ucode/repo/atomic_writes.rb +48 -0
  206. data/lib/ucode/repo/codepoint_writer.rb +96 -0
  207. data/lib/ucode/repo/paths.rb +122 -0
  208. data/lib/ucode/repo.rb +22 -0
  209. data/lib/ucode/site/config_emitter.rb +124 -0
  210. data/lib/ucode/site/generator.rb +178 -0
  211. data/lib/ucode/site/search_index.rb +68 -0
  212. data/lib/ucode/site/template/.gitignore +4 -0
  213. data/lib/ucode/site/template/.vitepress/config.ts +8 -0
  214. data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
  215. data/lib/ucode/site/template/char/[codepoint].md +13 -0
  216. data/lib/ucode/site/template/components/BlockView.vue +57 -0
  217. data/lib/ucode/site/template/components/CharView.vue +85 -0
  218. data/lib/ucode/site/template/components/PlaneView.vue +56 -0
  219. data/lib/ucode/site/template/components/SearchView.vue +66 -0
  220. data/lib/ucode/site/template/index.md +25 -0
  221. data/lib/ucode/site/template/package.json +18 -0
  222. data/lib/ucode/site/template/search.md +9 -0
  223. data/lib/ucode/site.rb +13 -0
  224. data/lib/ucode/version.rb +5 -0
  225. data/lib/ucode/version_resolver.rb +76 -0
  226. data/lib/ucode.rb +74 -0
  227. data/ucode.gemspec +56 -0
  228. metadata +404 -0
@@ -0,0 +1,372 @@
1
+ # frozen_string_literal: true
2
+
3
require "open3"
require "pathname"
require "tempfile"

require_relative "../../error"
require_relative "font_entry"
require_relative "tounicode"
9
+
10
+ module Ucode
11
+ module Glyphs
12
+ module EmbeddedFonts
13
+ # Walks the Code Charts PDF once and builds a global
14
+ # `{codepoint => FontEntry}` index.
15
+ #
16
+ # Discovery uses `mutool info` for the font list (one line per
17
+ # page-font), then `mutool show -g` to fetch the Type0 font dicts,
18
+ # their descendant CIDFont dicts, and the FontDescriptors — all in
19
+ # a handful of batched subprocess calls rather than one per font.
20
+ #
21
+ # For each Type0 font we then fetch its ToUnicode CMap stream
22
+ # (one `mutool show -b -o <tmpfile>` per font — these can't be
23
+ # batched because each is a separate stream) and parse it into a
24
+ # `{cid => codepoint}` map. With `/CIDToGIDMap /Identity` (the
25
+ # only form we currently support), `gid == cid`, so the per-font
26
+ # map is directly `{codepoint => gid}`.
27
+ #
28
+ # When multiple fonts cover the same codepoint (which happens for
29
+ # a handful of codepoints that appear in multiple blocks), the
30
+ # first font discovered wins. The discovery order follows the
31
+ # `mutool info` listing, which is page-major, so the earlier
32
+ # block's font wins — the expected behavior for the Code Charts.
33
class Catalog
  # One Type0 line of the `mutool info` font listing. Captures the base
  # font name (1) and the font's PDF object number (2).
  TYPE0_LINE = /Type0\s+'([^']+)'\s+\S+\s+\((\d+)\s+0\s+R\)/

  # One line of `mutool show -g` output: `<id> 0 obj <dict body>`.
  OBJECT_LINE = /^(\d+)\s+0\s+obj\s+(.*)$/

  # Field name => pattern used to pull that field's value straight out
  # of a dictionary body. Name-valued fields accept both `/Key/Name`
  # and `/Key /Name`; reference-valued fields capture the object number.
  DICT_FIELDS = {
    "BaseFont" => %r{/BaseFont\s*/([^\s/<>]+)},
    "DescendantFonts" => %r{/DescendantFonts\s*\[\s*(\d+)\s+0\s+R\s*\]},
    "ToUnicode" => %r{/ToUnicode\s+(\d+)\s+0\s+R},
    "FontDescriptor" => %r{/FontDescriptor\s+(\d+)\s+0\s+R},
    "FontFile2" => %r{/FontFile2\s+(\d+)\s+0\s+R},
    "FontFile3" => %r{/FontFile3\s+(\d+)\s+0\s+R},
    "CIDToGIDMap" => %r{/CIDToGIDMap(?:\s*/([^\s/<>]+)|\s+(\d+)\s+0\s+R)},
  }.freeze

  private_constant :TYPE0_LINE, :OBJECT_LINE, :DICT_FIELDS

  # @param source [Source] must respond to `pdf_to_s` (the PDF path
  #   handed to every `mutool` invocation)
  # @param correlator_configs [Hash{Integer=>ContentStreamCorrelator::Config}]
  #   maps a Type0 font's PDF object ID to the config to use when the
  #   font has no /ToUnicode CMap. Empty by default, in which case such
  #   fonts are skipped.
  def initialize(source, correlator_configs: {})
    @source = source
    @correlator_configs = correlator_configs
    @index = nil
  end

  # @return [Hash{Integer=>FontEntry}] frozen codepoint => entry map,
  #   built on first access and memoized
  def index
    @index ||= build_index.freeze
  end

  # @param codepoint [Integer]
  # @return [FontEntry, nil]
  def lookup(codepoint)
    index[codepoint]
  end

  # @return [Array<Integer>] every codepoint this PDF covers
  def codepoints
    index.keys
  end

  # @return [Integer] number of codepoints covered
  def size
    index.size
  end

  # @return [Integer] number of Type0 fonts that yielded a usable
  #   entry (fonts skipped by {#build_entry} are not counted)
  def font_count
    font_entries.size
  end

  # @return [Array<FontEntry>] one entry per usable Type0 font, in
  #   discovery order; memoized
  def font_entries
    @font_entries ||= build_font_entries
  end

  private

  # Merge every entry's codepoints into one map. The first entry to
  # claim a codepoint keeps it.
  def build_index
    font_entries.each_with_object({}) do |entry, idx|
      entry.codepoints.each { |cp| idx[cp] ||= entry }
    end
  end

  # Step 1: parse `mutool info` for the Type0 font list.
  # Step 2: batch `mutool show -g` to get the Type0 dicts.
  # Step 3: batch `mutool show -g` for the descendant CIDFont dicts.
  # Step 4: batch `mutool show -g` for the FontDescriptors.
  # Step 5: for each font, build its entry (ToUnicode or correlator).
  #
  # @return [Array<FontEntry>]
  def build_font_entries
    type0_refs = discover_type0_fonts
    return [] if type0_refs.empty?

    type0_dicts = fetch_objects(type0_refs.keys)
    descendant_refs = type0_refs.keys.filter_map do |font_obj_id|
      # `dig` tolerates a font object missing from the batch output.
      first_ref(type0_dicts.dig(font_obj_id, "DescendantFonts"))
    end

    descendant_dicts = fetch_objects(descendant_refs)
    fontdesc_refs = descendant_dicts.values.filter_map do |dict|
      first_ref(dict["FontDescriptor"])
    end
    fontdesc_dicts = fetch_objects(fontdesc_refs)

    # Walk again, now with all dicts in hand, and build entries.
    type0_refs.filter_map do |font_obj_id, base_font|
      build_entry(
        font_obj_id: font_obj_id,
        base_font: base_font,
        type0_dict: type0_dicts[font_obj_id],
        descendant_dicts: descendant_dicts,
        fontdesc_dicts: fontdesc_dicts,
      )
    end
  end

  # Run `mutool info` and extract the Type0 fonts from its report.
  #
  # @return [Hash{Integer=>String}] `{font_obj_id => base_font}` in
  #   first-seen order
  # @raise [Ucode::EmbeddedFontsMissingError] when mutool exits non-zero
  def discover_type0_fonts
    out, err, status = Open3.capture3("mutool", "info", @source.pdf_to_s)
    unless status.success?
      raise Ucode::EmbeddedFontsMissingError,
            "mutool info failed: #{(out + err).strip}"
    end

    # Both streams are scanned so the listing is found on whichever
    # one mutool wrote it to.
    parse_type0_listing(out + err)
  end

  # Pure parser for the `mutool info` font listing.
  # Format per line:
  # `\t<page>\t(<page_obj> 0 R):\tType0 '<name>' <enc> (<font_obj> 0 R)`
  #
  # @param text [String]
  # @return [Hash{Integer=>String}] `{font_obj_id => base_font}`; a font
  #   listed on several pages keeps its first occurrence
  def parse_type0_listing(text)
    text.each_line.with_object({}) do |line, fonts|
      next unless line.include?("Type0")

      m = line.match(TYPE0_LINE)
      next unless m

      font_obj_id = m[2].to_i
      # Hash insertion order gives first-seen semantics without a Set.
      fonts[font_obj_id] = m[1] unless fonts.key?(font_obj_id)
    end
  end

  # Batch `mutool show -g` for many object numbers at once.
  #
  # @param obj_ids [Array<Integer>]
  # @return [Hash{Integer=>Hash{String=>String}}] `{obj_id => fields}`
  # @raise [Ucode::EmbeddedFontsMissingError] when mutool exits non-zero
  def fetch_objects(obj_ids)
    return {} if obj_ids.empty?

    out, err, status = Open3.capture3(
      "mutool", "show", "-g", @source.pdf_to_s, *obj_ids.map(&:to_s)
    )
    unless status.success?
      raise Ucode::EmbeddedFontsMissingError,
            "mutool show failed: #{err.strip}"
    end

    parse_grep_output(out)
  end

  # Parse `mutool show -g` output: one `<id> 0 obj <<...>>` per line.
  # Lines that do not look like an object header are ignored.
  #
  # @param text [String]
  # @return [Hash{Integer=>Hash{String=>String}}]
  def parse_grep_output(text)
    text.each_line.with_object({}) do |line, objects|
      m = line.match(OBJECT_LINE)
      objects[m[1].to_i] = parse_dict(m[2]) if m
    end
  end

  # We don't try to fully parse the PDF dict grammar. Instead each field
  # we need is matched directly in the dict body via {DICT_FIELDS}.
  # Fields that are absent are omitted from the result.
  #
  # @param body [String, nil]
  # @return [Hash{String=>String}]
  def parse_dict(body)
    text = body.to_s
    DICT_FIELDS.transform_values { |pattern| field_match(text, pattern) }.compact
  end

  # @return [String, nil] the first non-nil capture of `regex` in
  #   `body`, or nil when the pattern does not match
  def field_match(body, regex)
    m = body.match(regex)
    m && m.captures.compact.first
  end

  # Cast a captured integer string into an Integer, tolerant of nil
  # and the empty string.
  #
  # @param value [String, nil]
  # @return [Integer, nil]
  def first_ref(value)
    return nil if value.nil? || value.empty?

    Integer(value)
  end

  # Assemble the FontEntry for one Type0 font. Returns nil (font is
  # skipped) when any link of the chain is missing: the Type0 dict
  # itself, its descendant, the FontDescriptor, an embedded font file,
  # a supported CIDToGIDMap, or a non-empty codepoint map.
  #
  # @return [FontEntry, nil]
  def build_entry(font_obj_id:, base_font:, type0_dict:,
                  descendant_dicts:, fontdesc_dicts:)
    # The batch fetch may not have returned this object at all.
    type0 = type0_dict || {}
    desc_ref = first_ref(type0["DescendantFonts"])
    tu_ref = first_ref(type0["ToUnicode"])
    return nil unless desc_ref

    desc_dict = descendant_dicts[desc_ref] || {}
    fd_dict = fontdesc_for(desc_dict, fontdesc_dicts)
    return nil unless fd_dict

    fontfile_obj_id, fontfile_kind = resolve_fontfile(fd_dict)
    return nil unless fontfile_obj_id

    cid_map_kind = resolve_cid_to_gid(desc_dict)
    return nil unless cid_map_kind

    cp_to_gid = build_codepoint_to_gid(
      font_obj_id: font_obj_id,
      tu_ref: tu_ref,
      cid_map_kind: cid_map_kind,
    )
    return nil if cp_to_gid.empty?

    FontEntry.new(
      base_font: base_font,
      font_obj_id: font_obj_id,
      fontfile_obj_id: fontfile_obj_id,
      fontfile_kind: fontfile_kind,
      tounicode_obj_id: tu_ref,
      cid_to_gid_map: cid_map_kind,
      codepoint_to_gid: cp_to_gid.freeze,
      source: @source,
    )
  end

  # @return [Hash, nil] the FontDescriptor dict the descendant font
  #   points at, or nil when it has no reference or it was not fetched
  def fontdesc_for(desc_dict, fontdesc_dicts)
    fd_ref = first_ref(desc_dict["FontDescriptor"])
    fd_ref && fontdesc_dicts[fd_ref]
  end

  # Preferred path: parse the /ToUnicode CMap. Fallback: when no
  # /ToUnicode is present, consult the correlator_configs registry.
  # Returns an empty hash when neither path produces a map (the
  # caller treats that as "skip this font").
  #
  # @return [Hash{Integer=>Integer}] codepoint => gid
  def build_codepoint_to_gid(font_obj_id:, tu_ref:, cid_map_kind:)
    return {} unless cid_map_kind == :identity
    return codepoint_map_from_tounicode(tu_ref) if tu_ref

    codepoint_map_from_correlator(font_obj_id)
  end

  def codepoint_map_from_tounicode(tu_ref)
    cmap_text = fetch_tounicode(tu_ref)
    build_codepoint_map(ToUnicode.parse(cmap_text), :identity)
  end

  def codepoint_map_from_correlator(font_obj_id)
    config = @correlator_configs[font_obj_id]
    return {} unless config

    svg = render_pages(config.page_numbers)
    ContentStreamCorrelator.new(config).correlate(svg)
  end

  # @return [Array(Integer, Symbol), nil] `[obj_id, :ttf]` for
  #   /FontFile2, `[obj_id, :cff]` for /FontFile3, nil when neither
  #   key is present
  def resolve_fontfile(fd_dict)
    if fd_dict.key?("FontFile2")
      [first_ref(fd_dict["FontFile2"]), :ttf]
    elsif fd_dict.key?("FontFile3")
      [first_ref(fd_dict["FontFile3"]), :cff]
    end
  end

  # parse_dict captures the name without the leading slash, so
  # "/Identity" comes through as "Identity". A stream-form map is
  # captured as the object number and is not supported yet.
  #
  # @return [Symbol, nil] :identity, or nil for absent/unsupported maps
  def resolve_cid_to_gid(desc_dict)
    :identity if desc_dict["CIDToGIDMap"].to_s == "Identity"
  end

  # Dump one stream object to a temp file via `mutool show -b -o` and
  # return its bytes tagged as UTF-8.
  #
  # @raise [Ucode::EmbeddedFontsMissingError] when mutool fails or
  #   cannot be executed
  def fetch_tounicode(obj_id)
    Tempfile.create("ucode-tounicode") do |tmp|
      # Close our handle first; mutool writes to the path itself.
      tmp.close
      ok = system("mutool", "show", "-o", tmp.path, "-b",
                  @source.pdf_to_s, obj_id.to_s,
                  out: File::NULL, err: File::NULL)
      unless ok
        raise Ucode::EmbeddedFontsMissingError,
              "mutool show failed for ToUnicode obj=#{obj_id}"
      end

      File.binread(tmp.path).force_encoding("UTF-8")
    end
  end

  # Render the given PDF pages to SVG markup captured from mutool's
  # stdout, for {ContentStreamCorrelator#correlate}.
  #
  # @param page_numbers [Array<Integer>, nil]
  # @return [String] "" when no pages were requested
  # @raise [Ucode::EmbeddedFontsMissingError] when mutool exits non-zero
  def render_pages(page_numbers)
    return "" if page_numbers.nil? || page_numbers.empty?

    out, err, status = run_mutool_draw(page_numbers)
    unless status.success?
      raise Ucode::EmbeddedFontsMissingError,
            "mutool draw failed: #{err.strip}"
    end

    out
  end

  def run_mutool_draw(page_numbers)
    Open3.capture3(
      "mutool", "draw", "-F", "svg",
      @source.pdf_to_s,
      *page_numbers.map(&:to_s)
    )
  end

  # Invert `{cid => codepoint}` into `{codepoint => gid}`.
  # With /CIDToGIDMap /Identity, gid == cid. When several CIDs map to
  # the same codepoint, the last one iterated is kept.
  def build_codepoint_map(cid_to_cp, cid_map_kind)
    return {} if cid_to_cp.empty? || cid_map_kind != :identity

    cid_to_cp.each_with_object({}) { |(cid, cp), map| map[cp] = cid }
  end
end
370
+ end
371
+ end
372
+ end
@@ -0,0 +1,228 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ module Glyphs
5
+ module EmbeddedFonts
6
+ # Pillar 2 fallback: build a `{codepoint => gid}` map for a Type0
7
+ # font whose PDF object graph has no `/ToUnicode` CMap stream.
8
+ #
9
+ # The Code Charts draw every chart cell as a `<use>` element that
10
+ # references the font's GID via an `href` of the form
11
+ # `#font_<font_obj_id>_<gid>`. The chart also prints the row +
12
+ # column codepoint labels using one or more "label" fonts (small
13
+ # Latin glyphs) that show the hex codepoint as text. By clustering
14
+ # the labels positionally (Y-bucket for the row, X-bucket for the
15
+ # column) we recover the codepoint each cluster represents, then
16
+ # match each cluster positionally to the specimen glyph at the
17
+ # same Y/X position.
18
+ #
19
+ # The algorithm generalizes the Tai Yo correlator that was tested
20
+ # against `data/pdfs/U1E6C0.pdf` (50/52 specimen codepoints
21
+ # matched, with the two missing being layout edge cases). The
22
+ # bucket sizes are configurable because some blocks use a tighter
23
+ # grid than others.
24
+ #
25
+ # Inputs are deliberately pure: a string of SVG markup plus a
26
+ # {Config}. The catalog is responsible for sourcing the SVG (by
27
+ # rendering the relevant PDF page(s) via `mutool draw -F svg`) and
28
+ # for knowing which font_obj_ids are labels vs specimen on that
29
+ # page. That keeps this class trivially testable with synthetic
30
+ # SVG fixtures.
31
class ContentStreamCorrelator
  # Per-font / per-block configuration.
  #
  # @!attribute label_font_ids [Array<Integer>] Type0 font object
  #   IDs whose glyphs print the hex codepoint labels on the page.
  # @!attribute specimen_font_id [Integer] Type0 font object ID
  #   whose glyphs are the specimens we want to attribute.
  # @!attribute page_numbers [Array<Integer>] PDF page numbers whose
  #   content streams reference the specimen font.
  # @!attribute y_bucket [Float] vertical clustering granularity;
  #   falls back to {DEFAULT_Y_BUCKET} when nil.
  # @!attribute x_bucket [Float] horizontal clustering granularity;
  #   falls back to {DEFAULT_X_BUCKET} when nil.
  Config = Struct.new(
    :label_font_ids,
    :specimen_font_id,
    :page_numbers,
    :y_bucket,
    :x_bucket,
    keyword_init: true,
  )

  DEFAULT_Y_BUCKET = 1.5
  DEFAULT_X_BUCKET = 50.0

  # Value object for a parsed `<use>` element. Public so specs can
  # construct fixtures without re-implementing the parser shape.
  Use = Struct.new(:font_id, :gid, :text, :x, :y, keyword_init: true)

  # Highest valid Unicode scalar value; anything larger decoded from
  # a label cluster cannot be a codepoint.
  MAX_CODEPOINT = 0x10FFFF

  # A self-closing `<use .../>` element; capture 1 is its attribute text.
  USE_ELEMENT = %r{<use ([^/>]*?)/>}

  # The e/f (translation) terms of a six-term `matrix(...)` transform.
  # Whitespace around the separators and exponent-form numbers are
  # accepted in addition to the compact `a,b,c,d,e,f` form.
  MATRIX_ORIGIN =
    /matrix\(\s*[^,]+,[^,]+,[^,]+,[^,]+,\s*([-+\d.eE]+)\s*,\s*([-+\d.eE]+)\s*\)/

  # Glyph reference of the form `#font_<font_obj_id>_<gid>`.
  FONT_HREF = /#font_(\d+)_(\d+)\z/

  # A joined label cluster must be 4-6 hex digits to count as a codepoint.
  HEX_LABEL = /\A[0-9A-Fa-f]{4,6}\z/

  private_constant :MAX_CODEPOINT, :USE_ELEMENT, :MATRIX_ORIGIN,
                   :FONT_HREF, :HEX_LABEL

  # @param config [Config]
  def initialize(config)
    @config = config
    @y_bucket = config.y_bucket || DEFAULT_Y_BUCKET
    @x_bucket = config.x_bucket || DEFAULT_X_BUCKET
  end

  # @param svg [String] rendered PDF page(s) as SVG markup. May
  #   contain multiple `<svg>` documents concatenated; the scan is
  #   element-based so either shape works.
  # @return [Hash{Integer=>Integer}] codepoint => gid. Empty if
  #   no clusters could be matched.
  def correlate(svg)
    uses = parse_uses(svg)
    return {} if uses.empty?

    partition_and_map(uses)
  end

  private

  def partition_and_map(uses)
    labels, specimens = partition_uses(uses)
    return {} if labels.empty? || specimens.empty?

    cp_per_cluster = decode_label_clusters(labels)
    return {} if cp_per_cluster.empty?

    build_mapping(cp_per_cluster, group_rows(specimens))
  end

  # Split uses into label glyphs and specimen glyphs by font id.
  # A nil `label_font_ids` is treated as "no label fonts".
  #
  # @return [Array(Array<Use>, Array<Use>)] `[labels, specimens]`
  def partition_uses(uses)
    label_ids = Array(@config.label_font_ids)
    labels = uses.select { |u| label_ids.include?(u.font_id) }
    specimens = uses.select { |u| u.font_id == @config.specimen_font_id }
    [labels, specimens]
  end

  # Match `<use .../>` elements and pull out the font_obj_id and
  # gid from the href, plus the transform matrix's e and f terms
  # (used as the X/Y origin). Elements lacking either are dropped.
  def parse_uses(svg)
    svg.scan(USE_ELEMENT).filter_map { |(attrs)| use_from_attrs(attrs) }
  end

  def use_from_attrs(attrs)
    font_match = match_font_ref(attrs)
    return nil unless font_match

    origin = attrs.match(MATRIX_ORIGIN)
    return nil unless origin

    build_use(attrs, font_match, origin)
  end

  def match_font_ref(attrs)
    href = extract_href(attrs)
    href && href.match(FONT_HREF)
  end

  def build_use(attrs, font_match, transform)
    Use.new(
      font_id: font_match[1].to_i,
      gid: font_match[2].to_i,
      text: attrs[/data-text="([^"]*)"/, 1] || "",
      x: transform[1].to_f,
      y: transform[2].to_f,
    )
  end

  # Prefer the namespaced attribute, fall back to plain `href`.
  def extract_href(attrs)
    attrs[/xlink:href="([^"]+)"/, 1] || attrs[/href="([^"]+)"/, 1]
  end

  # Cluster label uses by quantized (Y, X) position, then decode each
  # cluster's joined text as a hex codepoint.
  #
  # @return [Hash{Array(Float, Float)=>Integer}] `[y_bucket, x_bucket]`
  #   => codepoint
  def decode_label_clusters(labels)
    decode_each_cluster(bucket_labels_by_position(labels))
  end

  def bucket_labels_by_position(labels)
    labels.each_with_object(Hash.new { |h, k| h[k] = [] }) do |label, clusters|
      key = [bucket(label.y, @y_bucket), bucket(label.x, @x_bucket)]
      clusters[key] << label
    end
  end

  # Members are sorted by X so the joined text reads left-to-right.
  # Clusters whose text is not 4-6 hex digits, or whose value lies
  # beyond U+10FFFF, are discarded.
  def decode_each_cluster(clusters)
    clusters.each_with_object({}) do |(key, members), decoded|
      text = members.sort_by(&:x).map { |m| decode_entities(m.text) }.join
      next unless HEX_LABEL.match?(text)

      codepoint = text.to_i(16)
      next if codepoint > MAX_CODEPOINT

      decoded[key] = codepoint
    end
  end

  # Group uses by Y-bucket; each row is sorted by X so positional
  # matching is straightforward.
  def group_rows(uses)
    rows = uses.group_by { |u| bucket(u.y, @y_bucket) }
    rows.each_value { |row| row.sort_by!(&:x) }
    rows
  end

  # Within each Y-row, the rightmost label cluster is paired with the
  # rightmost specimen glyph. The preceding label clusters (if any)
  # are paired index-by-index with the preceding specimen glyphs in
  # the same row. Rows are processed in ascending Y-bucket order, so a
  # later row's assignment replaces an earlier one for the same
  # codepoint.
  def build_mapping(cp_per_cluster, specimen_rows)
    cp_rows = group_cps_by_row(cp_per_cluster)
    cp_rows.keys.sort.each_with_object({}) do |yb, mapping|
      assign_row(mapping, cp_rows[yb], specimen_rows.fetch(yb, []))
    end
  end

  def assign_row(mapping, cps, glyphs)
    return if cps.empty? || glyphs.empty?

    mapping[cps.last] = glyphs.last.gid
    assign_xrefs(mapping, cps[0...-1], glyphs[0...-1])
  end

  def assign_xrefs(mapping, xref_cps, xref_glyphs)
    xref_cps.each_with_index do |cp, i|
      glyph = xref_glyphs[i]
      mapping[cp] = glyph.gid if glyph
    end
  end

  # @return [Hash{Float=>Array<Integer>}] Y-bucket => codepoints
  #   ordered by their X-bucket
  def group_cps_by_row(cp_per_cluster)
    rows = Hash.new { |h, k| h[k] = [] }
    cp_per_cluster.each do |(yb, xb), cp|
      rows[yb] << [cp, xb]
    end
    rows.transform_values { |pairs| pairs.sort_by { |_, xb| xb }.map(&:first) }
  end

  # Snap `value` to the nearest multiple of `size`.
  def bucket(value, size)
    (value / size).round * size
  end

  # Expand hexadecimal character references (`&#x41;`) in data-text.
  def decode_entities(text)
    text.gsub(/&#x([0-9a-fA-F]+);/) do
      [Regexp.last_match(1).to_i(16)].pack("U")
    end
  end
end
226
+ end
227
+ end
228
+ end