ucode 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/lib/ucode/code_chart/extractor.rb +1 -9
  3. data/lib/ucode/code_chart/writer.rb +1 -1
  4. data/lib/ucode/commands/canonical_build.rb +4 -4
  5. data/lib/ucode/commands/universal_set.rb +5 -3
  6. data/lib/ucode/coordinator/enrichment/bidi.rb +35 -0
  7. data/lib/ucode/coordinator/enrichment/binary.rb +38 -0
  8. data/lib/ucode/coordinator/enrichment/casing.rb +55 -0
  9. data/lib/ucode/coordinator/enrichment/cjk.rb +49 -0
  10. data/lib/ucode/coordinator/enrichment/display.rb +36 -0
  11. data/lib/ucode/coordinator/enrichment/emoji.rb +36 -0
  12. data/lib/ucode/coordinator/enrichment/identity.rb +42 -0
  13. data/lib/ucode/coordinator/enrichment/indic.rb +32 -0
  14. data/lib/ucode/coordinator/enrichment/names.rb +63 -0
  15. data/lib/ucode/coordinator/enrichment/segmentation.rb +34 -0
  16. data/lib/ucode/coordinator/enrichment.rb +51 -0
  17. data/lib/ucode/coordinator/range_lookup.rb +65 -0
  18. data/lib/ucode/coordinator.rb +4 -276
  19. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +32 -376
  20. data/lib/ucode/glyphs/embedded_fonts/codepoint_mapper.rb +130 -0
  21. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +25 -124
  22. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +0 -1
  23. data/lib/ucode/glyphs/embedded_fonts/pdf_indexer.rb +236 -0
  24. data/lib/ucode/glyphs/embedded_fonts/{source.rb → pdf_location.rb} +5 -5
  25. data/lib/ucode/glyphs/embedded_fonts/positional_matcher.rb +162 -0
  26. data/lib/ucode/glyphs/embedded_fonts/raw_font_descriptor.rb +24 -0
  27. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +0 -2
  28. data/lib/ucode/glyphs/embedded_fonts/trace_correlator.rb +54 -168
  29. data/lib/ucode/glyphs/embedded_fonts/writer.rb +0 -4
  30. data/lib/ucode/glyphs/embedded_fonts.rb +5 -1
  31. data/lib/ucode/glyphs/resolver_factory.rb +45 -0
  32. data/lib/ucode/glyphs/sources/pillar1_embedded_tounicode.rb +1 -1
  33. data/lib/ucode/glyphs.rb +1 -0
  34. data/lib/ucode/version.rb +1 -1
  35. metadata +20 -3
@@ -1,42 +1,26 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "open3"
4
- require "pathname"
5
-
6
- require_relative "../../error"
7
- require_relative "font_entry"
8
- require_relative "tounicode"
9
-
10
3
  module Ucode
11
4
  module Glyphs
12
5
  module EmbeddedFonts
13
- # Walks the Code Charts PDF once and builds a global
14
- # `{codepoint => FontEntry}` index.
6
+ # Composes {PdfIndexer} + {CodepointMapper} to build a global
7
+ # `{codepoint => FontEntry}` index from a Code Charts PDF.
15
8
  #
16
- # Discovery uses `mutool info` for the font list (one line per
17
- # page-font), then `mutool show -g` to fetch the Type0 font dicts,
18
- # their descendant CIDFont dicts, and the FontDescriptors — all in
19
- # a handful of batched subprocess calls rather than one per font.
9
+ # Responsibilities split cleanly:
20
10
  #
21
- # For each Type0 font we then fetch its ToUnicode CMap stream
22
- # (one `mutool show -b -o <tmpfile>` per font — these can't be
23
- # batched because each is a separate stream) and parse it into a
24
- # `{cid => codepoint}` map. With `/CIDToGIDMap /Identity` (the
25
- # only form we currently support), `gid == cid`, so the per-font
26
- # map is directly `{codepoint => gid}`.
11
+ # * {PdfIndexer} — subprocess + dict parsing → Array<RawFontDescriptor>
12
+ # * {CodepointMapper} — 3-path codepoint→GID strategy → {cp => gid}
13
+ # * {Catalog} (this class) — composes both into FontEntry objects
14
+ # and exposes the public lookup interface
27
15
  #
28
- # When multiple fonts cover the same codepoint (which happens for
29
- # a handful of codepoints that appear in multiple blocks), the
30
- # first font discovered wins. The discovery order follows the
31
- # `mutool info` listing, which is page-major, so the earlier
32
- # block's font wins — the expected behavior for the Code Charts.
16
+ # When multiple fonts cover the same codepoint, the first font
17
+ # discovered wins. Discovery order follows mutool info's page-major
18
+ # listing, so earlier blocks' fonts win — the expected behavior.
33
19
  class Catalog
34
- # @param source [Source]
20
+ # @param source [PdfLocation]
35
21
  # @param correlator_configs [Hash{Integer=>ContentStreamCorrelator::Config}]
36
22
  # maps a Type0 font's PDF object ID to the pillar-2 config to
37
- # use when the font has no /ToUnicode CMap. Empty by default
38
- # — fonts without ToUnicode and without a config are skipped
39
- # (the v0.1 behavior).
23
+ # use when the font has no /ToUnicode CMap. Empty by default.
40
24
  def initialize(source, correlator_configs: {})
41
25
  @source = source
42
26
  @correlator_configs = correlator_configs
@@ -64,7 +48,7 @@ module Ucode
64
48
  index.size
65
49
  end
66
50
 
67
- # @return [Integer] number of Type0 fonts discovered
51
+ # @return [Integer] number of Type0 fonts with non-empty maps
68
52
  def font_count
69
53
  font_entries.size
70
54
  end
@@ -86,363 +70,35 @@ module Ucode
86
70
  idx
87
71
  end
88
72
 
89
- # Step 1: parse `mutool info` for the Type0 font list.
90
- # Step 2: batch `mutool show -g` to get the Type0 dicts.
91
- # Step 3: batch `mutool show -g` for the descendant CIDFont dicts.
92
- # Step 4: batch `mutool show -g` for the FontDescriptors.
93
- # Step 5: for each font, fetch + parse the ToUnicode CMap.
94
73
  def build_font_entries
95
- type0_refs = discover_type0_fonts
96
- return [] if type0_refs.empty?
97
-
98
- type0_dicts = fetch_objects(type0_refs.keys)
99
- descendant_refs = []
100
- tounicode_refs = []
101
- type0_refs.each_key do |font_obj_id|
102
- d = type0_dicts[font_obj_id] || {}
103
- desc_ref = first_ref(d["DescendantFonts"])
104
- tu_ref = first_ref(d["ToUnicode"])
105
- descendant_refs << desc_ref if desc_ref
106
- tounicode_refs << tu_ref if tu_ref
107
- end
108
-
109
- descendant_dicts = fetch_objects(descendant_refs)
110
- fontdesc_refs = []
111
- descendant_dicts.each_value do |d|
112
- fd_ref = first_ref(d["FontDescriptor"])
113
- fontdesc_refs << fd_ref if fd_ref
114
- end
115
-
116
- fontdesc_dicts = fetch_objects(fontdesc_refs)
117
-
118
- # Walk again, now with all dicts in hand, and build entries.
119
- entries = []
120
- type0_refs.each do |font_obj_id, base_font|
121
- entry = build_entry(
122
- font_obj_id: font_obj_id,
123
- base_font: base_font,
124
- type0_dict: type0_dicts[font_obj_id],
125
- descendant_dicts: descendant_dicts,
126
- fontdesc_dicts: fontdesc_dicts,
74
+ indexer.raw_descriptors.filter_map do |desc|
75
+ cp_to_gid = mapper.map(desc)
76
+ next nil if cp_to_gid.empty?
77
+
78
+ FontEntry.new(
79
+ base_font: desc.base_font,
80
+ font_obj_id: desc.font_obj_id,
81
+ fontfile_obj_id: desc.fontfile_obj_id,
82
+ fontfile_kind: desc.fontfile_kind,
83
+ tounicode_obj_id: desc.tounicode_ref,
84
+ cid_to_gid_map: desc.cid_map_kind,
85
+ codepoint_to_gid: cp_to_gid.freeze,
86
+ source: @source,
127
87
  )
128
- entries << entry if entry
129
- end
130
- entries
131
- end
132
-
133
- # Parse `mutool info` output for Type0 fonts.
134
- # Format per line: `\t<page>\t(<page_obj> 0 R):\tType0 '<name>' <enc> (<font_obj> 0 R)`
135
- # Returns `{font_obj_id => base_font}` preserving first-seen order.
136
- def discover_type0_fonts
137
- # `mutool info` writes its report to STDERR, not STDOUT.
138
- out, err, status = Open3.capture3("mutool", "info", @source.pdf_to_s)
139
- unless status.success?
140
- raise Ucode::EmbeddedFontsMissingError,
141
- "mutool info failed: #{(out + err).strip}"
142
- end
143
-
144
- text = out + err
145
- result = {}
146
- seen = Set.new
147
- text.each_line do |line|
148
- next unless line.include?("Type0")
149
-
150
- # Font lines look like: "<page>\t(<pageobj> 0 R):\tType0 '<base>' <enc> (<fontobj> 0 R)"
151
- m = line.match(/Type0\s+'([^']+)'\s+\S+\s+\((\d+)\s+0\s+R\)/)
152
- next unless m
153
-
154
- base_font = m[1]
155
- font_obj_id = m[2].to_i
156
- next if seen.include?(font_obj_id)
157
-
158
- seen << font_obj_id
159
- result[font_obj_id] = base_font
160
- end
161
- result
162
- end
163
-
164
- # Batch `mutool show -g` for many object numbers at once.
165
- # Returns `{obj_id => parsed_dict_hash}`.
166
- def fetch_objects(obj_ids)
167
- return {} if obj_ids.empty?
168
-
169
- args = ["mutool", "show", "-g",
170
- @source.pdf_to_s].concat(obj_ids.map(&:to_s))
171
- out, err, status = Open3.capture3(*args)
172
- unless status.success?
173
- raise Ucode::EmbeddedFontsMissingError,
174
- "mutool show failed: #{err.strip}"
175
- end
176
-
177
- parse_grep_output(out)
178
- end
179
-
180
- # Parse the `mutool show -g` output: one `<id> 0 obj <<...>>` per line.
181
- # The dictionary body is a flat string of `/Key value` pairs;
182
- # value can be a number, name, string, array, or nested dict.
183
- # We extract a small set of keys we care about and represent
184
- # their values as strings (caller uses helpers like first_ref).
185
- def parse_grep_output(text)
186
- result = {}
187
- text.each_line do |line|
188
- m = line.match(/^(\d+)\s+0\s+obj\s+(.*)$/)
189
- next unless m
190
-
191
- obj_id = m[1].to_i
192
- result[obj_id] = parse_dict(m[2])
193
88
  end
194
- result
195
- end
196
-
197
- # We don't try to fully parse the PDF dict grammar. Instead we
198
- # regex each field we need directly out of the dict body. This
199
- # is robust to `<<...>>`/`[...]` nesting and to `/Key/Value`
200
- # pairs (no whitespace) that break naive whitespace-split parsers.
201
- def parse_dict(body)
202
- body = body.to_s
203
- {
204
- "BaseFont" => field_match(body, %r{/BaseFont/([^\s/<>]+)}),
205
- "DescendantFonts" => field_match(body,
206
- %r{/DescendantFonts\s*\[\s*(\d+)\s+0\s+R\s*\]}),
207
- "ToUnicode" => field_match(body, %r{/ToUnicode\s+(\d+)\s+0\s+R}),
208
- "FontDescriptor" => field_match(body,
209
- %r{/FontDescriptor\s+(\d+)\s+0\s+R}),
210
- "FontFile2" => field_match(body, %r{/FontFile2\s+(\d+)\s+0\s+R}),
211
- "FontFile3" => field_match(body, %r{/FontFile3\s+(\d+)\s+0\s+R}),
212
- "CIDToGIDMap" => field_match(body,
213
- %r{/CIDToGIDMap(?:/([^\s/<>]+)|\s+(\d+)\s+0\s+R)}),
214
- }.compact
215
- end
216
-
217
- def field_match(body, regex)
218
- m = body.match(regex)
219
- return nil unless m
220
-
221
- m.captures.compact.first
222
89
  end
223
90
 
224
- # Cast a captured integer string into an Integer, tolerant of nil.
225
- # {parse_dict}'s regexes already extract just the digit run.
226
- def first_ref(value)
227
- return nil if value.nil? || value.empty?
228
-
229
- Integer(value)
91
+ def indexer
92
+ @indexer ||= PdfIndexer.new(source: @source)
230
93
  end
231
94
 
232
- def build_entry(font_obj_id:, base_font:, type0_dict:,
233
- descendant_dicts:, fontdesc_dicts:)
234
- desc_ref = first_ref(type0_dict["DescendantFonts"])
235
- tu_ref = first_ref(type0_dict["ToUnicode"])
236
- return nil unless desc_ref
237
-
238
- desc_dict = descendant_dicts[desc_ref] || {}
239
- fd_dict = fontdesc_for(desc_dict, fontdesc_dicts)
240
- return nil unless fd_dict
241
-
242
- fontfile_obj_id, fontfile_kind = resolve_fontfile(fd_dict)
243
- return nil unless fontfile_obj_id
244
-
245
- cid_map_kind = resolve_cid_to_gid(desc_dict)
246
- return nil unless cid_map_kind
247
-
248
- cp_to_gid = build_codepoint_to_gid(
249
- font_obj_id: font_obj_id,
250
- tu_ref: tu_ref,
251
- cid_map_kind: cid_map_kind,
252
- base_font: base_font,
253
- )
254
- return nil if cp_to_gid.empty?
255
-
256
- FontEntry.new(
257
- base_font: base_font,
258
- font_obj_id: font_obj_id,
259
- fontfile_obj_id: fontfile_obj_id,
260
- fontfile_kind: fontfile_kind,
261
- tounicode_obj_id: tu_ref,
262
- cid_to_gid_map: cid_map_kind,
263
- codepoint_to_gid: cp_to_gid.freeze,
95
+ def mapper
96
+ @mapper ||= CodepointMapper.new(
264
97
  source: @source,
98
+ correlator_configs: @correlator_configs,
99
+ indexer: indexer,
265
100
  )
266
101
  end
267
-
268
- def fontdesc_for(desc_dict, fontdesc_dicts)
269
- fd_ref = first_ref(desc_dict["FontDescriptor"])
270
- return nil unless fd_ref
271
-
272
- fontdesc_dicts[fd_ref]
273
- end
274
-
275
- # Tier-1 path: parse the /ToUnicode CMap. Pillar-2 fallback:
276
- # when no /ToUnicode is present, consult the correlator_configs
277
- # registry — if the user supplied a config for this font, render
278
- # the relevant page(s) to SVG and run positional correlation.
279
- # Pillar-2b fallback: when no caller-supplied config either,
280
- # auto-detect via `mutool trace` — parse the structured text
281
- # trace to build `{codepoint => gid}` from hex labels + specimen
282
- # positions. Returns an empty hash when none of the paths
283
- # produce a map (the caller treats that as "skip this font").
284
- def build_codepoint_to_gid(font_obj_id:, tu_ref:, cid_map_kind:,
285
- base_font: nil)
286
- return {} if cid_map_kind != :identity
287
-
288
- return codepoint_map_from_tounicode(tu_ref) if tu_ref
289
-
290
- map = codepoint_map_from_correlator(font_obj_id)
291
- return map unless map.empty?
292
-
293
- return {} unless base_font
294
-
295
- codepoint_map_from_trace(base_font, font_obj_id)
296
- end
297
-
298
- def codepoint_map_from_tounicode(tu_ref)
299
- cmap_text = fetch_tounicode(tu_ref)
300
- build_codepoint_map(ToUnicode.parse(cmap_text), :identity)
301
- end
302
-
303
- def codepoint_map_from_correlator(font_obj_id)
304
- config = @correlator_configs[font_obj_id]
305
- return {} unless config
306
-
307
- svg = render_pages(config.page_numbers)
308
- ContentStreamCorrelator.new(config).correlate(svg)
309
- end
310
-
311
- # Pillar-2b: auto-detect codepoint → GID via `mutool trace`.
312
- # For CID-keyed fonts without /ToUnicode and without a
313
- # caller-supplied correlator config, trace every page of the
314
- # PDF and positionally match hex labels to specimen glyphs.
315
- # `mutool info` only reports the first page per font, so tracing
316
- # all pages is simpler and catches every chart page.
317
- #
318
- # Each page is correlated independently to prevent cross-page
319
- # position interference (page coordinate systems overlap, so
320
- # a label on page 3 could wrongly match a specimen on page 2).
321
- # First match wins when a codepoint appears on multiple pages.
322
- def codepoint_map_from_trace(base_font, _font_obj_id)
323
- return {} unless font_appears_in_pdf?(base_font)
324
-
325
- runner = TraceRunner.new(@source.pdf_path)
326
- correlator = TraceCorrelator.new(specimen_font_name: base_font)
327
-
328
- (1..page_count).each_with_object({}) do |page, mapping|
329
- glyphs = runner.trace([page])
330
- page_mapping = correlator.correlate(glyphs)
331
- page_mapping.each do |cp, gid|
332
- mapping[cp] ||= gid
333
- end
334
- end
335
- end
336
-
337
- def font_appears_in_pdf?(base_font)
338
- font_entries_cache.key?(base_font)
339
- end
340
-
341
- # Lazy cache of {base_font => true} — which fonts `mutool info`
342
- # reports in this PDF. We only need the key set, not page numbers,
343
- # because {codepoint_map_from_trace} traces all pages regardless.
344
- def font_entries_cache
345
- @font_entries_cache ||= begin
346
- result = {}
347
- mutool_info_text.each_line do |line|
348
- next unless line.include?("Type0")
349
-
350
- font_match = line.match(/Type0\s+'([^']+)'/)
351
- next unless font_match
352
-
353
- result[font_match[1]] = true
354
- end
355
- result
356
- end
357
- end
358
-
359
- # Total pages in the PDF, parsed from `mutool info`'s
360
- # `Pages: N` line. Falls back to the first font page if parsing
361
- # fails (so we still try at least one page).
362
- def page_count
363
- @page_count ||= begin
364
- m = mutool_info_text.match(/^Pages:\s+(\d+)/)
365
- m ? m[1].to_i : 1
366
- end
367
- end
368
-
369
- def mutool_info_text
370
- @mutool_info_text ||= run_mutool_info
371
- end
372
-
373
- def run_mutool_info
374
- out, err, status = Open3.capture3("mutool", "info", @source.pdf_to_s)
375
- status.success? ? out + err : ""
376
- end
377
-
378
- def resolve_fontfile(fd_dict)
379
- if fd_dict.key?("FontFile2")
380
- [first_ref(fd_dict["FontFile2"]), :ttf]
381
- elsif fd_dict.key?("FontFile3")
382
- [first_ref(fd_dict["FontFile3"]), :cff]
383
- end
384
- end
385
-
386
- def resolve_cid_to_gid(desc_dict)
387
- raw = desc_dict["CIDToGIDMap"]
388
- return nil if raw.nil?
389
-
390
- # parse_dict captures the name without the leading slash, so
391
- # "/Identity" comes through as "Identity". A stream-form map
392
- # is captured as the integer obj id — not supported yet.
393
- if raw.to_s == "Identity"
394
- :identity
395
- end
396
- end
397
-
398
- def fetch_tounicode(obj_id)
399
- Tempfile.create("ucode-tounicode") do |tmp|
400
- tmp.close
401
- ok = system("mutool", "show", "-o", tmp.path, "-b",
402
- @source.pdf_to_s, obj_id.to_s,
403
- out: File::NULL, err: File::NULL)
404
- unless ok
405
- raise Ucode::EmbeddedFontsMissingError,
406
- "mutool show failed for ToUnicode obj=#{obj_id}"
407
- end
408
-
409
- File.binread(tmp.path).force_encoding("UTF-8")
410
- end
411
- end
412
-
413
- # Render the given 1-based PDF pages to a single SVG string
414
- # suitable for {ContentStreamCorrelator#correlate}. Each page
415
- # is a separate `<svg>...</svg>` document; the correlator's
416
- # `<use>` regex tolerates either a single concatenated blob or
417
- # multiple documents. Output is captured from mutool's stdout.
418
- def render_pages(page_numbers)
419
- return "" if page_numbers.nil? || page_numbers.empty?
420
-
421
- out, err, status = run_mutool_draw(page_numbers)
422
- unless status.success?
423
- raise Ucode::EmbeddedFontsMissingError,
424
- "mutool draw failed: #{err.strip}"
425
- end
426
-
427
- out
428
- end
429
-
430
- def run_mutool_draw(page_numbers)
431
- Open3.capture3(
432
- "mutool", "draw", "-F", "svg",
433
- @source.pdf_to_s,
434
- *page_numbers.map(&:to_s)
435
- )
436
- end
437
-
438
- def build_codepoint_map(cid_to_cp, cid_map_kind)
439
- return {} if cid_to_cp.empty? || cid_map_kind != :identity
440
-
441
- # With /CIDToGIDMap /Identity, gid == cid.
442
- cid_to_cp.each_with_object({}) do |(cid, cp), h|
443
- h[cp] = cid
444
- end
445
- end
446
102
  end
447
103
  end
448
104
  end
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "open3"
4
+ require "pathname"
5
+ require "tempfile"
6
+
7
+ module Ucode
8
+ module Glyphs
9
+ module EmbeddedFonts
10
+ # Resolves codepoint → GID for one Type0 font via a 3-path strategy:
11
+ #
12
+ # 1. **ToUnicode CMap** — the font's `/ToUnicode` stream (Tier 1
13
+ # for pillar 1). Parsed by {ToUnicode}.
14
+ # 2. **Caller-supplied correlator config** (pillar 2) — render the
15
+ # font's pages to SVG and run {ContentStreamCorrelator}.
16
+ # 3. **Auto-detect via mutool trace** (pillar 2b) — trace every
17
+ # page and run {TraceCorrelator} positionally.
18
+ #
19
+ # Each path returns a `{codepoint => gid}` map. First non-empty
20
+ # result wins; the strategy stops there.
21
+ #
22
+ # Pure strategy orchestration — does NOT parse the PDF object graph
23
+ # (that's {PdfIndexer}'s job). Takes a {RawFontDescriptor} + the
24
+ # shared {PdfIndexer} (for page_count + font_appears? queries used
25
+ # by the trace fallback).
26
+ class CodepointMapper
27
+ # @param source [PdfLocation]
28
+ # @param correlator_configs [Hash{Integer=>ContentStreamCorrelator::Config}]
29
+ # caller-supplied pillar-2 configs, keyed by font_obj_id
30
+ # @param indexer [PdfIndexer] for page_count + font_appears? queries
31
+ def initialize(source:, correlator_configs:, indexer:)
32
+ @source = source
33
+ @correlator_configs = correlator_configs
34
+ @indexer = indexer
35
+ end
36
+
37
+ # @param descriptor [RawFontDescriptor]
38
+ # @return [Hash{Integer=>Integer}] codepoint => gid; empty when
39
+ # no strategy produces a map
40
+ def map(descriptor)
41
+ return {} unless descriptor.cid_map_kind == :identity
42
+
43
+ from_tounicode = map_from_tounicode(descriptor.tounicode_ref)
44
+ return from_tounicode unless from_tounicode.empty?
45
+
46
+ from_correlator = map_from_correlator(descriptor.font_obj_id)
47
+ return from_correlator unless from_correlator.empty?
48
+
49
+ map_from_trace(descriptor.base_font)
50
+ end
51
+
52
+ private
53
+
54
+ # ---- Strategy 1: /ToUnicode CMap --------------------------------
55
+
56
+ def map_from_tounicode(tu_ref)
57
+ return {} unless tu_ref
58
+
59
+ cmap_text = fetch_tounicode(tu_ref)
60
+ cid_to_cp = ToUnicode.parse(cmap_text)
61
+ build_codepoint_map(cid_to_cp)
62
+ end
63
+
64
+ def build_codepoint_map(cid_to_cp)
65
+ cid_to_cp.each_with_object({}) do |(cid, cp), h|
66
+ h[cp] = cid
67
+ end
68
+ end
69
+
70
+ def fetch_tounicode(obj_id)
71
+ Tempfile.create("ucode-tounicode") do |tmp|
72
+ tmp.close
73
+ ok = system("mutool", "show", "-o", tmp.path, "-b",
74
+ @source.pdf_to_s, obj_id.to_s,
75
+ out: File::NULL, err: File::NULL)
76
+ unless ok
77
+ raise Ucode::EmbeddedFontsMissingError,
78
+ "mutool show failed for ToUnicode obj=#{obj_id}"
79
+ end
80
+
81
+ File.binread(tmp.path).force_encoding("UTF-8")
82
+ end
83
+ end
84
+
85
+ # ---- Strategy 2: caller-supplied correlator config --------------
86
+
87
+ def map_from_correlator(font_obj_id)
88
+ config = @correlator_configs[font_obj_id]
89
+ return {} unless config
90
+
91
+ svg = render_pages(config.page_numbers)
92
+ ContentStreamCorrelator.new(config).correlate(svg)
93
+ end
94
+
95
+ def render_pages(page_numbers)
96
+ return "" if page_numbers.nil? || page_numbers.empty?
97
+
98
+ out, err, status = Open3.capture3(
99
+ "mutool", "draw", "-F", "svg",
100
+ @source.pdf_to_s,
101
+ *page_numbers.map(&:to_s),
102
+ )
103
+ unless status.success?
104
+ raise Ucode::EmbeddedFontsMissingError,
105
+ "mutool draw failed: #{err.strip}"
106
+ end
107
+
108
+ out
109
+ end
110
+
111
+ # ---- Strategy 3: auto-detect via mutool trace --------------------
112
+
113
+ def map_from_trace(base_font)
114
+ return {} unless @indexer.font_appears?(base_font)
115
+
116
+ runner = TraceRunner.new(@source.pdf_path)
117
+ correlator = TraceCorrelator.new(specimen_font_name: base_font)
118
+
119
+ (1..@indexer.page_count).each_with_object({}) do |page, mapping|
120
+ glyphs = runner.trace([page])
121
+ page_mapping = correlator.correlate(glyphs)
122
+ page_mapping.each do |cp, gid|
123
+ mapping[cp] ||= gid
124
+ end
125
+ end
126
+ end
127
+ end
128
+ end
129
+ end
130
+ end