ucode 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e2e60f073662cc78885f8ee6a0333a2307a8b894ddb9789b1e206f6ae63d25e3
4
- data.tar.gz: 50b3984de26589d0aab193250e9c8dd3f56fd0d10a4a77a21ffdcb236c38c737
3
+ metadata.gz: 409561757912083c19e4044c0ed37129945bf6de53bc3b029d349e4a8f16f10f
4
+ data.tar.gz: 85a06e0383587af4d8a88342974a58105423635b85212ecc7b1783268e6c5e2a
5
5
  SHA512:
6
- metadata.gz: 88d9ef3df0f99af9b3897cd429666092d48a373d9c60d3c83a6e06a59404e0523c736a8d971fbb6d6d29c9b4b80d9610cad54a64cda439326e19a80670d67ba9
7
- data.tar.gz: 8146f6984defddb5b45204fe7a8852cd3051bf1474584eddc7489e5ec9fd4900d48452ac5462a1c6c2ac9b8213823ace68cebe6c5b5176dd0c25c922f8f1d4ba
6
+ metadata.gz: 85660ae16bbfa2632131888872ddebca9cdee45d26791837ddce2fa629e18a721c9701b023c5424e058165eac03d1d4e1d16bb2a6c0b582a8ef4c1e0104ecdf5
7
+ data.tar.gz: 411de21c9c5f3e46b559752d54462f02f4110aa59e261ee3ea6c19383ad9383fd43f013e1107fe59b3e6c3b296f906a8c99e7087da5f9c85f2b13ba385447b95
data/Rakefile CHANGED
@@ -26,4 +26,4 @@ begin
26
26
  rescue LoadError
27
27
  end
28
28
 
29
- task default: %i[spec rubocop]
29
+ task default: %i[spec rubocop]
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  unicode_version: 17.0.0
3
- ucode_version: 0.2.1
3
+ ucode_version: 0.2.3
4
4
  generated_at: '2026-06-28T00:00:00Z'
5
5
  default_sources:
6
6
  - kind: fontist
data/lib/ucode/cli.rb CHANGED
@@ -109,34 +109,6 @@ module Ucode
109
109
  puts JSON.pretty_generate(result)
110
110
  end
111
111
 
112
- # ─────────────── glyphs ───────────────
113
- desc "glyphs [VERSION]", "Extract per-codepoint SVGs from Code Charts PDFs (experimental)"
114
- long_desc <<~LONG
115
- EXPERIMENTAL in v0.1. The cell extractor currently includes cell-border
116
- decorations alongside the actual character outline, so the output is not
117
- yet suitable for end-user display. Opt in with --include-glyphs to run
118
- the pipeline anyway; otherwise it returns a skipped payload.
119
- LONG
120
- option :to, type: :string, default: "./output"
121
- option :block, type: :array, desc: "Limit to these block ids"
122
- option :force, type: :boolean, default: false
123
- option :monolith, type: :string, default: "CodeCharts.pdf",
124
- desc: "Path to CodeCharts.pdf for fallback slicing"
125
- option :include_glyphs, type: :boolean, default: false,
126
- desc: "Opt into the experimental v0.1 pipeline"
127
- def glyphs(version = nil)
128
- result = Commands::GlyphsCommand.new.call(
129
- VersionResolver.resolve(version),
130
- output_root: options[:to],
131
- block_filter: options[:block],
132
- force: options[:force],
133
- monolith_path: options[:monolith],
134
- include_glyphs: options[:include_glyphs],
135
- warn: $stderr,
136
- )
137
- puts JSON.pretty_generate(result)
138
- end
139
-
140
112
  # ─────────────── site ───────────────
141
113
  class Site < Thor
142
114
  desc "init", "Copy the Vitepress scaffold into site/"
@@ -345,22 +317,16 @@ module Ucode
345
317
  subcommand "cache", Cache
346
318
 
347
319
  # ─────────────── build ───────────────
348
- desc "build [VERSION]", "Full pipeline: fetch + parse + (optional) glyphs + site"
320
+ desc "build [VERSION]", "Full pipeline: fetch + parse + site"
349
321
  option :to, type: :string, default: "./output"
350
322
  option :site, type: :string, default: nil, desc: "Build the site here (skipped if nil)"
351
- option :monolith, type: :string, default: "CodeCharts.pdf"
352
323
  option :force_fetch, type: :boolean, default: false
353
- option :include_glyphs, type: :boolean, default: false,
354
- desc: "Opt into the experimental v0.1 glyph step"
355
324
  def build(version = nil)
356
325
  result = Commands::BuildCommand.new.call(
357
326
  version,
358
327
  output_root: options[:to],
359
328
  site_root: options[:site],
360
- monolith_path: options[:monolith],
361
329
  force_fetch: options[:force_fetch],
362
- include_glyphs: options[:include_glyphs],
363
- warn: $stderr,
364
330
  )
365
331
  puts JSON.pretty_generate(result)
366
332
  end
@@ -8,36 +8,23 @@ require "ucode/version_resolver"
8
8
  module Ucode
9
9
  module Commands
10
10
  # `ucode build` — full pipeline: fetch (ucd + unihan + charts) →
11
- # parse → (optional) glyphs → (optional) site. Resumable: each step
12
- # is idempotent and safe to re-run.
11
+ # parse → site. Resumable: each step is idempotent and safe to re-run.
13
12
  #
14
13
  # Resolves the version intent once at the top and threads the
15
- # resolved string through every sub-command. See Candidate 4 of the
16
- # 2026-06-29 architecture review.
17
- #
18
- # **Glyph step is opt-in as of v0.1** because the SVG cell extractor
19
- # is still experimental. Pass `include_glyphs: true` to enable it;
20
- # otherwise the glyphs step is recorded as skipped.
14
+ # resolved string through every sub-command.
21
15
  class BuildCommand
22
16
  # @param version_intent [nil, :default, :latest, String]
23
17
  # @param output_root [String, Pathname]
24
18
  # @param site_root [String, Pathname, nil] if nil, skip site build
25
- # @param monolith_path [String, Pathname, nil] CodeCharts.pdf fallback
26
19
  # @param force_fetch [Boolean] re-download sources
27
- # @param include_glyphs [Boolean] opt into the experimental glyph
28
- # step (default false)
29
- # @param warn [IO, nil] forwarded to GlyphsCommand when enabled
30
20
  # @return [Hash] aggregated step results
31
21
  def call(version_intent, output_root:, site_root: nil,
32
- monolith_path: nil, force_fetch: false,
33
- include_glyphs: false, warn: nil)
22
+ force_fetch: false)
34
23
  version = VersionResolver.resolve(version_intent)
35
24
  steps = {}
36
25
 
37
26
  steps[:fetch] = run_fetch(version, force: force_fetch)
38
27
  steps[:parse] = ParseCommand.new.call(version, output_root: output_root)
39
- steps[:glyphs] = run_glyphs(version, output_root, monolith_path,
40
- include_glyphs: include_glyphs, warn: warn)
41
28
  steps[:site] = run_site(output_root, site_root) if site_root
42
29
 
43
30
  { version: version, steps: steps }
@@ -54,16 +41,6 @@ module Ucode
54
41
  }
55
42
  end
56
43
 
57
- def run_glyphs(version, output_root, monolith_path, include_glyphs:, warn:)
58
- GlyphsCommand.new.call(
59
- version,
60
- output_root: output_root,
61
- monolith_path: monolith_path || "CodeCharts.pdf",
62
- include_glyphs: include_glyphs,
63
- warn: warn,
64
- )
65
- end
66
-
67
44
  def run_site(output_root, site_root)
68
45
  SiteCommand.new.build(output_root: output_root, site_root: site_root)
69
46
  end
@@ -20,10 +20,7 @@ module Ucode
20
20
  # `index.json` + `glyph.svg` atomically, accumulate per-tier +
21
21
  # per-block stats, and emit `output/build-report.json`.
22
22
  #
23
- # This is the v0.2 replacement for the v0.1 cell-extractor pipeline
24
- # in {GlyphsCommand}. The two coexist until the v0.1 pipeline is
25
- # removed (TODOs 17-19); CanonicalBuildCommand is the path forward
26
- # for production dataset runs.
23
+ # This is the production path for dataset runs.
27
24
  #
28
25
  # == Pre-conditions (per TODO 21)
29
26
  #
@@ -9,7 +9,6 @@ module Ucode
9
9
  module Commands
10
10
  autoload :FetchCommand, "ucode/commands/fetch"
11
11
  autoload :ParseCommand, "ucode/commands/parse"
12
- autoload :GlyphsCommand, "ucode/commands/glyphs"
13
12
  autoload :SiteCommand, "ucode/commands/site"
14
13
  autoload :LookupCommand, "ucode/commands/lookup"
15
14
  autoload :CacheCommand, "ucode/commands/cache"
data/lib/ucode/error.rb CHANGED
@@ -24,8 +24,6 @@ module Ucode
24
24
  # │ ├── Ucode::DatabaseSchemaError
25
25
  # │ └── Ucode::UnknownVersionError
26
26
  # └── Ucode::GlyphError
27
- # ├── Ucode::PdfRenderError
28
- # ├── Ucode::GridDetectionError
29
27
  # ├── Ucode::LastResortMissingError
30
28
  # ├── Ucode::EmbeddedFontsMissingError
31
29
  # └── Ucode::UniversalSetPreBuildError
@@ -104,12 +102,6 @@ module Ucode
104
102
  # Glyph pipeline failures.
105
103
  class GlyphError < Error; end
106
104
 
107
- # PDF → SVG rendering failure.
108
- class PdfRenderError < GlyphError; end
109
-
110
- # Grid detection couldn't anchor on codepoint labels.
111
- class GridDetectionError < GlyphError; end
112
-
113
105
  # The Last Resort Font UFO source cannot be located or is missing a
114
106
  # required artifact (cmap-f13.ttx, font.ufo/glyphs/, contents.plist).
115
107
  class LastResortMissingError < GlyphError; end
@@ -249,6 +249,7 @@ module Ucode
249
249
  font_obj_id: font_obj_id,
250
250
  tu_ref: tu_ref,
251
251
  cid_map_kind: cid_map_kind,
252
+ base_font: base_font,
252
253
  )
253
254
  return nil if cp_to_gid.empty?
254
255
 
@@ -275,14 +276,23 @@ module Ucode
275
276
  # when no /ToUnicode is present, consult the correlator_configs
276
277
  # registry — if the user supplied a config for this font, render
277
278
  # the relevant page(s) to SVG and run positional correlation.
278
- # Returns an empty hash when neither path produces a map (the
279
- # caller treats that as "skip this font").
280
- def build_codepoint_to_gid(font_obj_id:, tu_ref:, cid_map_kind:)
279
+ # Pillar-2b fallback: when there is no caller-supplied config either,
280
+ # auto-detect via `mutool trace`: parse the structured text
281
+ # trace to build `{codepoint => gid}` from hex labels + specimen
282
+ # positions. Returns an empty hash when none of the paths
283
+ # produce a map (the caller treats that as "skip this font").
284
+ def build_codepoint_to_gid(font_obj_id:, tu_ref:, cid_map_kind:,
285
+ base_font: nil)
281
286
  return {} if cid_map_kind != :identity
282
287
 
283
288
  return codepoint_map_from_tounicode(tu_ref) if tu_ref
284
289
 
285
- codepoint_map_from_correlator(font_obj_id)
290
+ map = codepoint_map_from_correlator(font_obj_id)
291
+ return map unless map.empty?
292
+
293
+ return {} unless base_font
294
+
295
+ codepoint_map_from_trace(base_font, font_obj_id)
286
296
  end
287
297
 
288
298
  def codepoint_map_from_tounicode(tu_ref)
@@ -298,6 +308,73 @@ module Ucode
298
308
  ContentStreamCorrelator.new(config).correlate(svg)
299
309
  end
300
310
 
311
+ # Pillar-2b: auto-detect codepoint → GID via `mutool trace`.
312
+ # For CID-keyed fonts without /ToUnicode and without a
313
+ # caller-supplied correlator config, trace every page of the
314
+ # PDF and positionally match hex labels to specimen glyphs.
315
+ # `mutool info` only reports the first page per font, so tracing
316
+ # all pages is simpler and catches every chart page.
317
+ #
318
+ # Each page is correlated independently to prevent cross-page
319
+ # position interference (page coordinate systems overlap, so
320
+ # a label on page 3 could wrongly match a specimen on page 2).
321
+ # First match wins when a codepoint appears on multiple pages.
322
+ def codepoint_map_from_trace(base_font, _font_obj_id)
323
+ return {} unless font_appears_in_pdf?(base_font)
324
+
325
+ runner = TraceRunner.new(@source.pdf_path)
326
+ correlator = TraceCorrelator.new(specimen_font_name: base_font)
327
+
328
+ (1..page_count).each_with_object({}) do |page, mapping|
329
+ glyphs = runner.trace([page])
330
+ page_mapping = correlator.correlate(glyphs)
331
+ page_mapping.each do |cp, gid|
332
+ mapping[cp] ||= gid
333
+ end
334
+ end
335
+ end
336
+
337
+ def font_appears_in_pdf?(base_font)
338
+ font_entries_cache.key?(base_font)
339
+ end
340
+
341
+ # Lazy cache of {base_font => true} — which fonts `mutool info`
342
+ # reports in this PDF. We only need the key set, not page numbers,
343
+ # because {codepoint_map_from_trace} traces all pages regardless.
344
+ def font_entries_cache
345
+ @font_entries_cache ||= begin
346
+ result = {}
347
+ mutool_info_text.each_line do |line|
348
+ next unless line.include?("Type0")
349
+
350
+ font_match = line.match(/Type0\s+'([^']+)'/)
351
+ next unless font_match
352
+
353
+ result[font_match[1]] = true
354
+ end
355
+ result
356
+ end
357
+ end
358
+
359
+ # Total pages in the PDF, parsed from `mutool info`'s
360
+ # `Pages: N` line. Falls back to a page count of 1 if parsing
361
+ # fails (so we still trace at least the first page).
362
+ def page_count
363
+ @page_count ||= begin
364
+ m = mutool_info_text.match(/^Pages:\s+(\d+)/)
365
+ m ? m[1].to_i : 1
366
+ end
367
+ end
368
+
369
+ def mutool_info_text
370
+ @mutool_info_text ||= run_mutool_info
371
+ end
372
+
373
+ def run_mutool_info
374
+ out, err, status = Open3.capture3("mutool", "info", @source.pdf_to_s)
375
+ status.success? ? out + err : ""
376
+ end
377
+
301
378
  def resolve_fontfile(fd_dict)
302
379
  if fd_dict.key?("FontFile2")
303
380
  [first_ref(fd_dict["FontFile2"]), :ttf]
@@ -0,0 +1,230 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ module Glyphs
5
+ module EmbeddedFonts
6
+ # Correlates specimen glyphs (CID font without `/ToUnicode`) to
7
+ # their Unicode codepoints via positional matching against hex
8
+ # codepoint labels on the same chart page.
9
+ #
10
+ # The Unicode Code Charts use two layouts:
11
+ #
12
+ # 1. **List layout** (chart pages): the hex codepoint label (e.g.
13
+ # "10D75") is printed to the LEFT of the specimen glyph at the
14
+ # same Y baseline.
15
+ #
16
+ # 2. **Grid layout** (summary pages): the hex codepoint label is
17
+ # printed directly ABOVE the specimen glyph (~12 pt higher on
18
+ # Y, same X).
19
+ #
20
+ # Both layouts are handled by matching each specimen to the
21
+ # nearest valid label cluster by Euclidean distance, with a
22
+ # maximum match radius that excludes far-away header/footer text.
23
+ #
24
+ # The codepoint labels in every Unicode Code Charts PDF are set
25
+ # in a single dedicated label font (typically ArialNarrow).
26
+ # Character names, headers, and footers use other fonts. To avoid
27
+ # false matches from hex chars in those texts, the correlator
28
+ # auto-detects the label font as the non-specimen font whose
29
+ # hex-char glyphs most often lie close to specimen glyphs.
30
+ #
31
+ # Matching is greedy one-to-one: each GID and each codepoint is
32
+ # assigned at most once, so a specimen that sits between two
33
+ # labels only claims the closer one.
34
+ #
35
+ # Pure logic — no I/O. The caller passes pre-parsed TraceGlyph
36
+ # arrays (typically from {TraceRunner} + {TraceParser}).
37
+ class TraceCorrelator
38
+ DEFAULT_Y_BUCKET = 1.0
39
+ private_constant :DEFAULT_Y_BUCKET
40
+
41
+ # Adjacent label chars within one codepoint label are ~4-6 pt
42
+ # apart on X. Different columns are ~30+ pt apart. 10 pt
43
+ # cleanly separates within-label from between-column gaps.
44
+ X_GAP_THRESHOLD = 10.0
45
+ private_constant :X_GAP_THRESHOLD
46
+
47
+ # Maximum valid Unicode codepoint. Filters out false labels
48
+ # that form hex strings from character-name fragments.
49
+ UNICODE_MAX = 0x10FFFF
50
+ private_constant :UNICODE_MAX
51
+
52
+ # Maximum Euclidean distance from a specimen to its matching
53
+ # label cluster. List-layout labels are ~21 pt to the left;
54
+ # grid-layout labels are ~12 pt above. Header/footer text is
55
+ # always > 30 pt away from any specimen.
56
+ MAX_MATCH_DISTANCE = 30.0
57
+ private_constant :MAX_MATCH_DISTANCE
58
+
59
+ # @param specimen_font_name [String] the BaseFont name of the
60
+ # CID font whose glyphs need correlation
61
+ def initialize(specimen_font_name:)
62
+ @specimen_font_name = specimen_font_name
63
+ @y_bucket = DEFAULT_Y_BUCKET
64
+ end
65
+
66
+ # @param trace_glyphs [Array<TraceGlyph>]
67
+ # @return [Hash{Integer=>Integer}] codepoint => gid
68
+ def correlate(trace_glyphs)
69
+ specimens = trace_glyphs.select { |g| g.font_name == @specimen_font_name }
70
+ return {} if specimens.empty?
71
+
72
+ label_font = detect_label_font(trace_glyphs)
73
+ return {} unless label_font
74
+
75
+ labels = trace_glyphs.select { |g| label_glyph?(g, label_font) }
76
+ return {} if labels.empty?
77
+
78
+ clusters = build_label_clusters(labels)
79
+ return {} if clusters.empty?
80
+
81
+ build_mapping(specimens, clusters)
82
+ end
83
+
84
+ private
85
+
86
+ # The label font is the non-specimen font whose hex-char glyphs
87
+ # appear most often in close proximity to specimen glyphs.
88
+ # Code Charts dedicate one small font to the codepoint labels;
89
+ # body text, headers, and character names use other fonts that
90
+ # may also contain hex chars but are not co-located with
91
+ # specimens (e.g. the index page has thousands of hex chars in
92
+ # MyriadPro-Light but zero specimens).
93
+ LABEL_PROXIMITY_RADIUS = 50.0
94
+ private_constant :LABEL_PROXIMITY_RADIUS
95
+
96
+ def detect_label_font(trace_glyphs)
97
+ specimens = trace_glyphs.select { |g| g.font_name == @specimen_font_name }
98
+ return nil if specimens.empty?
99
+
100
+ non_specimen_hex = non_specimen_hex_glyphs(trace_glyphs)
101
+ return nil if non_specimen_hex.empty?
102
+
103
+ counts = proximity_counts(specimens, non_specimen_hex)
104
+ return nil if counts.empty?
105
+
106
+ counts.max_by { |_, n| n }.first
107
+ end
108
+
109
+ def non_specimen_hex_glyphs(trace_glyphs)
110
+ trace_glyphs.select do |g|
111
+ g.font_name != @specimen_font_name &&
112
+ g.unicode&.match?(/\A[0-9A-Fa-f]\z/)
113
+ end
114
+ end
115
+
116
+ def proximity_counts(specimens, candidates)
117
+ counts = Hash.new(0)
118
+ radius_sq = LABEL_PROXIMITY_RADIUS * LABEL_PROXIMITY_RADIUS
119
+ specimens.each do |spec|
120
+ candidates.each do |g|
121
+ counts[g.font_name] += 1 if within_radius?(spec, g, radius_sq)
122
+ end
123
+ end
124
+ counts
125
+ end
126
+
127
+ def within_radius?(spec, glyph, radius_sq)
128
+ dx = spec.x - glyph.x
129
+ dy = spec.y - glyph.y
130
+ dx * dx + dy * dy < radius_sq
131
+ end
132
+
133
+ def label_glyph?(glyph, label_font)
134
+ glyph.font_name == label_font &&
135
+ glyph.unicode&.match?(/\A[0-9A-Fa-f]\z/)
136
+ end
137
+
138
+ # Cluster labels by Y (row), then by X gap (column within row).
139
+ # Returns a flat array of {x:, y:, codepoint:} clusters.
140
+ def build_label_clusters(labels)
141
+ by_y = labels.group_by { |g| quantize(g.y, @y_bucket) }
142
+ by_y.flat_map { |(_, glyphs)| clusters_from_row(glyphs) }
143
+ end
144
+
145
+ def clusters_from_row(glyphs)
146
+ cluster_by_x_gap(glyphs.sort_by(&:x)).filter_map { |cluster| build_cluster(cluster) }
147
+ end
148
+
149
+ def build_cluster(cluster)
150
+ hex = cluster.map(&:unicode).join
151
+ return nil unless hex.match?(/\A[0-9A-Fa-f]{4,6}\z/)
152
+
153
+ cp = hex.to_i(16)
154
+ return nil unless cp <= UNICODE_MAX
155
+
156
+ {
157
+ x: cluster.sum(&:x) / cluster.size,
158
+ y: cluster.first.y,
159
+ codepoint: cp,
160
+ }
161
+ end
162
+
163
+ def cluster_by_x_gap(sorted_glyphs)
164
+ clusters = []
165
+ current = []
166
+
167
+ sorted_glyphs.each do |g|
168
+ if current.empty? || (g.x - current.last.x).abs < X_GAP_THRESHOLD
169
+ current << g
170
+ else
171
+ clusters << current if current.size > 1
172
+ current = [g]
173
+ end
174
+ end
175
+ clusters << current if current.size > 1
176
+ clusters
177
+ end
178
+
179
+ # Greedy one-to-one matching: each GID and each codepoint is
180
+ # assigned at most once. Candidate pairs are sorted by distance
181
+ # so the closest specimen-label pair always wins.
182
+ def build_mapping(specimens, clusters)
183
+ candidates = Array.new(clusters.size) { |ci| specimen_distances(specimens, clusters, ci) }
184
+
185
+ assigned_gids = Set.new
186
+ assigned_cps = Set.new
187
+ mapping = {}
188
+
189
+ pairs_by_distance(candidates).each do |spec_idx, cluster_idx, dist|
190
+ next if dist > MAX_MATCH_DISTANCE
191
+
192
+ spec = specimens[spec_idx]
193
+ cluster = clusters[cluster_idx]
194
+ next if assigned_gids.include?(spec.gid)
195
+ next if assigned_cps.include?(cluster[:codepoint])
196
+
197
+ assigned_gids << spec.gid
198
+ assigned_cps << cluster[:codepoint]
199
+ mapping[cluster[:codepoint]] = spec.gid
200
+ end
201
+
202
+ mapping
203
+ end
204
+
205
+ def specimen_distances(specimens, clusters, cluster_idx)
206
+ cluster = clusters[cluster_idx]
207
+ specimens.each_with_index.map do |spec, spec_idx|
208
+ [spec_idx, cluster_idx, distance(spec, cluster)]
209
+ end
210
+ end
211
+
212
+ def pairs_by_distance(candidates)
213
+ candidates.flatten(1).sort_by { |_, _, dist| dist }
214
+ end
215
+
216
+ def distance(spec, cluster)
217
+ dx = spec.x - cluster[:x]
218
+ dy = spec.y - cluster[:y]
219
+ Math.sqrt(dx * dx + dy * dy)
220
+ end
221
+
222
+ def quantize(value, bucket_size)
223
+ return nil if value.nil?
224
+
225
+ (value / bucket_size).round * bucket_size
226
+ end
227
+ end
228
+ end
229
+ end
230
+ end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ module Glyphs
5
+ module EmbeddedFonts
6
+ # Value object for one glyph emitted by `mutool trace`.
7
+ #
8
+ # Each `<g>` element in the trace XML maps to one TraceGlyph:
9
+ #
10
+ # <g unicode="�" glyph="174" x="237.06" y="673.92" adv=".62"/>
11
+ #
12
+ # The `font_name` is inherited from the enclosing `<span>`:
13
+ #
14
+ # <span font="GPJAHB+WolofGaraySansSerif" ...>
15
+ # <g .../>
16
+ # </span>
17
+ TraceGlyph = Struct.new(
18
+ :font_name,
19
+ :gid,
20
+ :x,
21
+ :y,
22
+ :unicode,
23
+ keyword_init: true,
24
+ )
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+
5
+ module Ucode
6
+ module Glyphs
7
+ module EmbeddedFonts
8
+ # Parses the XML output of `mutool trace <pdf> <page>` into an
9
+ # array of {TraceGlyph} instances.
10
+ #
11
+ # The trace XML uses a flat `<span font="...">` → `<g glyph="..."
12
+ # x="..." y="..." unicode="..."/>` structure. Nokogiri walks
13
+ # the tree; the parser maps each `<g>` to a TraceGlyph,
14
+ # inheriting the font_name from the enclosing span.
15
+ #
16
+ # Pure function — no I/O, no PDF access. Callers inject the XML
17
+ # string (typically from {TraceRunner}).
18
+ module TraceParser
19
+ class << self
20
+ # @param xml [String] raw mutool trace XML
21
+ # @return [Array<TraceGlyph>] one per `<g>` element; empty
22
+ # if the XML is empty or has no `<g>` elements
23
+ def parse(xml)
24
+ return [] if xml.nil? || xml.strip.empty?
25
+
26
+ doc = Nokogiri::XML(xml)
27
+ doc.css("span").flat_map { |span| glyphs_in_span(span) }
28
+ end
29
+
30
+ private
31
+
32
+ def glyphs_in_span(span)
33
+ font_name = span[:font]
34
+ span.css("g").map { |g| build_glyph(font_name, g) }
35
+ end
36
+
37
+ def build_glyph(font_name, g)
38
+ TraceGlyph.new(
39
+ font_name: font_name,
40
+ gid: g[:glyph]&.to_i,
41
+ x: g[:x]&.to_f,
42
+ y: g[:y]&.to_f,
43
+ unicode: g[:unicode],
44
+ )
45
+ end
46
+ end
47
+ end
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "open3"
4
+ require "pathname"
5
+
6
+ require "ucode/error"
7
+
8
+ module Ucode
9
+ module Glyphs
10
+ module EmbeddedFonts
11
+ # Thin I/O wrapper around `mutool trace <pdf> <page>`.
12
+ #
13
+ # Runs mutool on the given pages, captures the XML output,
14
+ # delegates parsing to {TraceParser}, and returns a flat
15
+ # `Array<TraceGlyph>` across all pages.
16
+ #
17
+ # The only class in the trace pipeline that touches the
18
+ # filesystem / spawns subprocesses. Everything else
19
+ # (parser, correlator) is pure.
20
+ class TraceRunner
21
+ # @param pdf_path [Pathname, String]
22
+ def initialize(pdf_path)
23
+ @pdf_path = Pathname.new(pdf_path)
24
+ end
25
+
26
+ # @param page_numbers [Array<Integer>] 1-based PDF page numbers
27
+ # @return [Array<TraceGlyph>]
28
+ def trace(page_numbers)
29
+ page_numbers.flat_map { |page| trace_page(page) }
30
+ end
31
+
32
+ private
33
+
34
+ def trace_page(page)
35
+ xml = run_mutool(page)
36
+ TraceParser.parse(xml)
37
+ end
38
+
39
+ def run_mutool(page)
40
+ out, err, status = Open3.capture3(
41
+ "mutool", "trace", @pdf_path.to_s, page.to_s,
42
+ )
43
+ unless status.success?
44
+ raise Ucode::EmbeddedFontsMissingError,
45
+ "mutool trace failed: #{(out + err).strip}"
46
+ end
47
+
48
+ out + err
49
+ end
50
+ end
51
+ end
52
+ end
53
+ end
@@ -42,6 +42,10 @@ module Ucode
42
42
  autoload :Catalog, "ucode/glyphs/embedded_fonts/catalog"
43
43
  autoload :ContentStreamCorrelator,
44
44
  "ucode/glyphs/embedded_fonts/content_stream_correlator"
45
+ autoload :TraceGlyph, "ucode/glyphs/embedded_fonts/trace_glyph"
46
+ autoload :TraceParser, "ucode/glyphs/embedded_fonts/trace_parser"
47
+ autoload :TraceCorrelator, "ucode/glyphs/embedded_fonts/trace_correlator"
48
+ autoload :TraceRunner, "ucode/glyphs/embedded_fonts/trace_runner"
45
49
  autoload :Svg, "ucode/glyphs/embedded_fonts/svg"
46
50
  autoload :Renderer, "ucode/glyphs/embedded_fonts/renderer"
47
51
  autoload :Writer, "ucode/glyphs/embedded_fonts/writer"