ucode 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ucode/code_chart/extractor.rb +1 -9
- data/lib/ucode/code_chart/writer.rb +1 -1
- data/lib/ucode/commands/canonical_build.rb +4 -4
- data/lib/ucode/commands/universal_set.rb +5 -3
- data/lib/ucode/coordinator/enrichment/bidi.rb +35 -0
- data/lib/ucode/coordinator/enrichment/binary.rb +38 -0
- data/lib/ucode/coordinator/enrichment/casing.rb +55 -0
- data/lib/ucode/coordinator/enrichment/cjk.rb +49 -0
- data/lib/ucode/coordinator/enrichment/display.rb +36 -0
- data/lib/ucode/coordinator/enrichment/emoji.rb +36 -0
- data/lib/ucode/coordinator/enrichment/identity.rb +42 -0
- data/lib/ucode/coordinator/enrichment/indic.rb +32 -0
- data/lib/ucode/coordinator/enrichment/names.rb +63 -0
- data/lib/ucode/coordinator/enrichment/segmentation.rb +34 -0
- data/lib/ucode/coordinator/enrichment.rb +51 -0
- data/lib/ucode/coordinator/range_lookup.rb +65 -0
- data/lib/ucode/coordinator.rb +4 -276
- data/lib/ucode/glyphs/embedded_fonts/catalog.rb +32 -376
- data/lib/ucode/glyphs/embedded_fonts/codepoint_mapper.rb +130 -0
- data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +25 -124
- data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +0 -1
- data/lib/ucode/glyphs/embedded_fonts/pdf_indexer.rb +236 -0
- data/lib/ucode/glyphs/embedded_fonts/{source.rb → pdf_location.rb} +5 -5
- data/lib/ucode/glyphs/embedded_fonts/positional_matcher.rb +162 -0
- data/lib/ucode/glyphs/embedded_fonts/raw_font_descriptor.rb +24 -0
- data/lib/ucode/glyphs/embedded_fonts/renderer.rb +0 -2
- data/lib/ucode/glyphs/embedded_fonts/trace_correlator.rb +54 -168
- data/lib/ucode/glyphs/embedded_fonts/writer.rb +0 -4
- data/lib/ucode/glyphs/embedded_fonts.rb +5 -1
- data/lib/ucode/glyphs/resolver_factory.rb +45 -0
- data/lib/ucode/glyphs/sources/pillar1_embedded_tounicode.rb +1 -1
- data/lib/ucode/glyphs.rb +1 -0
- data/lib/ucode/version.rb +1 -1
- metadata +20 -3
|
@@ -1,42 +1,26 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require "open3"
|
|
4
|
-
require "pathname"
|
|
5
|
-
|
|
6
|
-
require_relative "../../error"
|
|
7
|
-
require_relative "font_entry"
|
|
8
|
-
require_relative "tounicode"
|
|
9
|
-
|
|
10
3
|
module Ucode
|
|
11
4
|
module Glyphs
|
|
12
5
|
module EmbeddedFonts
|
|
13
|
-
#
|
|
14
|
-
# `{codepoint => FontEntry}` index.
|
|
6
|
+
# Composes {PdfIndexer} + {CodepointMapper} to build a global
|
|
7
|
+
# `{codepoint => FontEntry}` index from a Code Charts PDF.
|
|
15
8
|
#
|
|
16
|
-
#
|
|
17
|
-
# page-font), then `mutool show -g` to fetch the Type0 font dicts,
|
|
18
|
-
# their descendant CIDFont dicts, and the FontDescriptors — all in
|
|
19
|
-
# a handful of batched subprocess calls rather than one per font.
|
|
9
|
+
# Responsibilities split cleanly:
|
|
20
10
|
#
|
|
21
|
-
#
|
|
22
|
-
#
|
|
23
|
-
#
|
|
24
|
-
#
|
|
25
|
-
# only form we currently support), `gid == cid`, so the per-font
|
|
26
|
-
# map is directly `{codepoint => gid}`.
|
|
11
|
+
# * {PdfIndexer} — subprocess + dict parsing → Array<RawFontDescriptor>
|
|
12
|
+
# * {CodepointMapper} — 3-path codepoint→GID strategy → {cp => gid}
|
|
13
|
+
# * {Catalog} (this class) — composes both into FontEntry objects
|
|
14
|
+
# and exposes the public lookup interface
|
|
27
15
|
#
|
|
28
|
-
# When multiple fonts cover the same codepoint
|
|
29
|
-
#
|
|
30
|
-
#
|
|
31
|
-
# `mutool info` listing, which is page-major, so the earlier
|
|
32
|
-
# block's font wins — the expected behavior for the Code Charts.
|
|
16
|
+
# When multiple fonts cover the same codepoint, the first font
|
|
17
|
+
# discovered wins. Discovery order follows mutool info's page-major
|
|
18
|
+
# listing, so earlier blocks' fonts win — the expected behavior.
|
|
33
19
|
class Catalog
|
|
34
|
-
# @param source [
|
|
20
|
+
# @param source [PdfLocation]
|
|
35
21
|
# @param correlator_configs [Hash{Integer=>ContentStreamCorrelator::Config}]
|
|
36
22
|
# maps a Type0 font's PDF object ID to the pillar-2 config to
|
|
37
|
-
# use when the font has no /ToUnicode CMap. Empty by default
|
|
38
|
-
# — fonts without ToUnicode and without a config are skipped
|
|
39
|
-
# (the v0.1 behavior).
|
|
23
|
+
# use when the font has no /ToUnicode CMap. Empty by default.
|
|
40
24
|
def initialize(source, correlator_configs: {})
|
|
41
25
|
@source = source
|
|
42
26
|
@correlator_configs = correlator_configs
|
|
@@ -64,7 +48,7 @@ module Ucode
|
|
|
64
48
|
index.size
|
|
65
49
|
end
|
|
66
50
|
|
|
67
|
-
# @return [Integer] number of Type0 fonts
|
|
51
|
+
# @return [Integer] number of Type0 fonts with non-empty maps
|
|
68
52
|
def font_count
|
|
69
53
|
font_entries.size
|
|
70
54
|
end
|
|
@@ -86,363 +70,35 @@ module Ucode
|
|
|
86
70
|
idx
|
|
87
71
|
end
|
|
88
72
|
|
|
89
|
-
# Step 1: parse `mutool info` for the Type0 font list.
|
|
90
|
-
# Step 2: batch `mutool show -g` to get the Type0 dicts.
|
|
91
|
-
# Step 3: batch `mutool show -g` for the descendant CIDFont dicts.
|
|
92
|
-
# Step 4: batch `mutool show -g` for the FontDescriptors.
|
|
93
|
-
# Step 5: for each font, fetch + parse the ToUnicode CMap.
|
|
94
73
|
def build_font_entries
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
descendant_dicts = fetch_objects(descendant_refs)
|
|
110
|
-
fontdesc_refs = []
|
|
111
|
-
descendant_dicts.each_value do |d|
|
|
112
|
-
fd_ref = first_ref(d["FontDescriptor"])
|
|
113
|
-
fontdesc_refs << fd_ref if fd_ref
|
|
114
|
-
end
|
|
115
|
-
|
|
116
|
-
fontdesc_dicts = fetch_objects(fontdesc_refs)
|
|
117
|
-
|
|
118
|
-
# Walk again, now with all dicts in hand, and build entries.
|
|
119
|
-
entries = []
|
|
120
|
-
type0_refs.each do |font_obj_id, base_font|
|
|
121
|
-
entry = build_entry(
|
|
122
|
-
font_obj_id: font_obj_id,
|
|
123
|
-
base_font: base_font,
|
|
124
|
-
type0_dict: type0_dicts[font_obj_id],
|
|
125
|
-
descendant_dicts: descendant_dicts,
|
|
126
|
-
fontdesc_dicts: fontdesc_dicts,
|
|
74
|
+
indexer.raw_descriptors.filter_map do |desc|
|
|
75
|
+
cp_to_gid = mapper.map(desc)
|
|
76
|
+
next nil if cp_to_gid.empty?
|
|
77
|
+
|
|
78
|
+
FontEntry.new(
|
|
79
|
+
base_font: desc.base_font,
|
|
80
|
+
font_obj_id: desc.font_obj_id,
|
|
81
|
+
fontfile_obj_id: desc.fontfile_obj_id,
|
|
82
|
+
fontfile_kind: desc.fontfile_kind,
|
|
83
|
+
tounicode_obj_id: desc.tounicode_ref,
|
|
84
|
+
cid_to_gid_map: desc.cid_map_kind,
|
|
85
|
+
codepoint_to_gid: cp_to_gid.freeze,
|
|
86
|
+
source: @source,
|
|
127
87
|
)
|
|
128
|
-
entries << entry if entry
|
|
129
|
-
end
|
|
130
|
-
entries
|
|
131
|
-
end
|
|
132
|
-
|
|
133
|
-
# Parse `mutool info` output for Type0 fonts.
|
|
134
|
-
# Format per line: `\t<page>\t(<page_obj> 0 R):\tType0 '<name>' <enc> (<font_obj> 0 R)`
|
|
135
|
-
# Returns `{font_obj_id => base_font}` preserving first-seen order.
|
|
136
|
-
def discover_type0_fonts
|
|
137
|
-
# `mutool info` writes its report to STDERR, not STDOUT.
|
|
138
|
-
out, err, status = Open3.capture3("mutool", "info", @source.pdf_to_s)
|
|
139
|
-
unless status.success?
|
|
140
|
-
raise Ucode::EmbeddedFontsMissingError,
|
|
141
|
-
"mutool info failed: #{(out + err).strip}"
|
|
142
|
-
end
|
|
143
|
-
|
|
144
|
-
text = out + err
|
|
145
|
-
result = {}
|
|
146
|
-
seen = Set.new
|
|
147
|
-
text.each_line do |line|
|
|
148
|
-
next unless line.include?("Type0")
|
|
149
|
-
|
|
150
|
-
# Font lines look like: "<page>\t(<pageobj> 0 R):\tType0 '<base>' <enc> (<fontobj> 0 R)"
|
|
151
|
-
m = line.match(/Type0\s+'([^']+)'\s+\S+\s+\((\d+)\s+0\s+R\)/)
|
|
152
|
-
next unless m
|
|
153
|
-
|
|
154
|
-
base_font = m[1]
|
|
155
|
-
font_obj_id = m[2].to_i
|
|
156
|
-
next if seen.include?(font_obj_id)
|
|
157
|
-
|
|
158
|
-
seen << font_obj_id
|
|
159
|
-
result[font_obj_id] = base_font
|
|
160
|
-
end
|
|
161
|
-
result
|
|
162
|
-
end
|
|
163
|
-
|
|
164
|
-
# Batch `mutool show -g` for many object numbers at once.
|
|
165
|
-
# Returns `{obj_id => parsed_dict_hash}`.
|
|
166
|
-
def fetch_objects(obj_ids)
|
|
167
|
-
return {} if obj_ids.empty?
|
|
168
|
-
|
|
169
|
-
args = ["mutool", "show", "-g",
|
|
170
|
-
@source.pdf_to_s].concat(obj_ids.map(&:to_s))
|
|
171
|
-
out, err, status = Open3.capture3(*args)
|
|
172
|
-
unless status.success?
|
|
173
|
-
raise Ucode::EmbeddedFontsMissingError,
|
|
174
|
-
"mutool show failed: #{err.strip}"
|
|
175
|
-
end
|
|
176
|
-
|
|
177
|
-
parse_grep_output(out)
|
|
178
|
-
end
|
|
179
|
-
|
|
180
|
-
# Parse the `mutool show -g` output: one `<id> 0 obj <<...>>` per line.
|
|
181
|
-
# The dictionary body is a flat string of `/Key value` pairs;
|
|
182
|
-
# value can be a number, name, string, array, or nested dict.
|
|
183
|
-
# We extract a small set of keys we care about and represent
|
|
184
|
-
# their values as strings (caller uses helpers like first_ref).
|
|
185
|
-
def parse_grep_output(text)
|
|
186
|
-
result = {}
|
|
187
|
-
text.each_line do |line|
|
|
188
|
-
m = line.match(/^(\d+)\s+0\s+obj\s+(.*)$/)
|
|
189
|
-
next unless m
|
|
190
|
-
|
|
191
|
-
obj_id = m[1].to_i
|
|
192
|
-
result[obj_id] = parse_dict(m[2])
|
|
193
88
|
end
|
|
194
|
-
result
|
|
195
|
-
end
|
|
196
|
-
|
|
197
|
-
# We don't try to fully parse the PDF dict grammar. Instead we
|
|
198
|
-
# regex each field we need directly out of the dict body. This
|
|
199
|
-
# is robust to `<<...>>`/`[...]` nesting and to `/Key/Value`
|
|
200
|
-
# pairs (no whitespace) that break naive whitespace-split parsers.
|
|
201
|
-
def parse_dict(body)
|
|
202
|
-
body = body.to_s
|
|
203
|
-
{
|
|
204
|
-
"BaseFont" => field_match(body, %r{/BaseFont/([^\s/<>]+)}),
|
|
205
|
-
"DescendantFonts" => field_match(body,
|
|
206
|
-
%r{/DescendantFonts\s*\[\s*(\d+)\s+0\s+R\s*\]}),
|
|
207
|
-
"ToUnicode" => field_match(body, %r{/ToUnicode\s+(\d+)\s+0\s+R}),
|
|
208
|
-
"FontDescriptor" => field_match(body,
|
|
209
|
-
%r{/FontDescriptor\s+(\d+)\s+0\s+R}),
|
|
210
|
-
"FontFile2" => field_match(body, %r{/FontFile2\s+(\d+)\s+0\s+R}),
|
|
211
|
-
"FontFile3" => field_match(body, %r{/FontFile3\s+(\d+)\s+0\s+R}),
|
|
212
|
-
"CIDToGIDMap" => field_match(body,
|
|
213
|
-
%r{/CIDToGIDMap(?:/([^\s/<>]+)|\s+(\d+)\s+0\s+R)}),
|
|
214
|
-
}.compact
|
|
215
|
-
end
|
|
216
|
-
|
|
217
|
-
def field_match(body, regex)
|
|
218
|
-
m = body.match(regex)
|
|
219
|
-
return nil unless m
|
|
220
|
-
|
|
221
|
-
m.captures.compact.first
|
|
222
89
|
end
|
|
223
90
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
def first_ref(value)
|
|
227
|
-
return nil if value.nil? || value.empty?
|
|
228
|
-
|
|
229
|
-
Integer(value)
|
|
91
|
+
def indexer
|
|
92
|
+
@indexer ||= PdfIndexer.new(source: @source)
|
|
230
93
|
end
|
|
231
94
|
|
|
232
|
-
def
|
|
233
|
-
|
|
234
|
-
desc_ref = first_ref(type0_dict["DescendantFonts"])
|
|
235
|
-
tu_ref = first_ref(type0_dict["ToUnicode"])
|
|
236
|
-
return nil unless desc_ref
|
|
237
|
-
|
|
238
|
-
desc_dict = descendant_dicts[desc_ref] || {}
|
|
239
|
-
fd_dict = fontdesc_for(desc_dict, fontdesc_dicts)
|
|
240
|
-
return nil unless fd_dict
|
|
241
|
-
|
|
242
|
-
fontfile_obj_id, fontfile_kind = resolve_fontfile(fd_dict)
|
|
243
|
-
return nil unless fontfile_obj_id
|
|
244
|
-
|
|
245
|
-
cid_map_kind = resolve_cid_to_gid(desc_dict)
|
|
246
|
-
return nil unless cid_map_kind
|
|
247
|
-
|
|
248
|
-
cp_to_gid = build_codepoint_to_gid(
|
|
249
|
-
font_obj_id: font_obj_id,
|
|
250
|
-
tu_ref: tu_ref,
|
|
251
|
-
cid_map_kind: cid_map_kind,
|
|
252
|
-
base_font: base_font,
|
|
253
|
-
)
|
|
254
|
-
return nil if cp_to_gid.empty?
|
|
255
|
-
|
|
256
|
-
FontEntry.new(
|
|
257
|
-
base_font: base_font,
|
|
258
|
-
font_obj_id: font_obj_id,
|
|
259
|
-
fontfile_obj_id: fontfile_obj_id,
|
|
260
|
-
fontfile_kind: fontfile_kind,
|
|
261
|
-
tounicode_obj_id: tu_ref,
|
|
262
|
-
cid_to_gid_map: cid_map_kind,
|
|
263
|
-
codepoint_to_gid: cp_to_gid.freeze,
|
|
95
|
+
def mapper
|
|
96
|
+
@mapper ||= CodepointMapper.new(
|
|
264
97
|
source: @source,
|
|
98
|
+
correlator_configs: @correlator_configs,
|
|
99
|
+
indexer: indexer,
|
|
265
100
|
)
|
|
266
101
|
end
|
|
267
|
-
|
|
268
|
-
def fontdesc_for(desc_dict, fontdesc_dicts)
|
|
269
|
-
fd_ref = first_ref(desc_dict["FontDescriptor"])
|
|
270
|
-
return nil unless fd_ref
|
|
271
|
-
|
|
272
|
-
fontdesc_dicts[fd_ref]
|
|
273
|
-
end
|
|
274
|
-
|
|
275
|
-
# Tier-1 path: parse the /ToUnicode CMap. Pillar-2 fallback:
|
|
276
|
-
# when no /ToUnicode is present, consult the correlator_configs
|
|
277
|
-
# registry — if the user supplied a config for this font, render
|
|
278
|
-
# the relevant page(s) to SVG and run positional correlation.
|
|
279
|
-
# Pillar-2b fallback: when no caller-supplied config either,
|
|
280
|
-
# auto-detect via `mutool trace` — parse the structured text
|
|
281
|
-
# trace to build `{codepoint => gid}` from hex labels + specimen
|
|
282
|
-
# positions. Returns an empty hash when none of the paths
|
|
283
|
-
# produce a map (the caller treats that as "skip this font").
|
|
284
|
-
def build_codepoint_to_gid(font_obj_id:, tu_ref:, cid_map_kind:,
|
|
285
|
-
base_font: nil)
|
|
286
|
-
return {} if cid_map_kind != :identity
|
|
287
|
-
|
|
288
|
-
return codepoint_map_from_tounicode(tu_ref) if tu_ref
|
|
289
|
-
|
|
290
|
-
map = codepoint_map_from_correlator(font_obj_id)
|
|
291
|
-
return map unless map.empty?
|
|
292
|
-
|
|
293
|
-
return {} unless base_font
|
|
294
|
-
|
|
295
|
-
codepoint_map_from_trace(base_font, font_obj_id)
|
|
296
|
-
end
|
|
297
|
-
|
|
298
|
-
def codepoint_map_from_tounicode(tu_ref)
|
|
299
|
-
cmap_text = fetch_tounicode(tu_ref)
|
|
300
|
-
build_codepoint_map(ToUnicode.parse(cmap_text), :identity)
|
|
301
|
-
end
|
|
302
|
-
|
|
303
|
-
def codepoint_map_from_correlator(font_obj_id)
|
|
304
|
-
config = @correlator_configs[font_obj_id]
|
|
305
|
-
return {} unless config
|
|
306
|
-
|
|
307
|
-
svg = render_pages(config.page_numbers)
|
|
308
|
-
ContentStreamCorrelator.new(config).correlate(svg)
|
|
309
|
-
end
|
|
310
|
-
|
|
311
|
-
# Pillar-2b: auto-detect codepoint → GID via `mutool trace`.
|
|
312
|
-
# For CID-keyed fonts without /ToUnicode and without a
|
|
313
|
-
# caller-supplied correlator config, trace every page of the
|
|
314
|
-
# PDF and positionally match hex labels to specimen glyphs.
|
|
315
|
-
# `mutool info` only reports the first page per font, so tracing
|
|
316
|
-
# all pages is simpler and catches every chart page.
|
|
317
|
-
#
|
|
318
|
-
# Each page is correlated independently to prevent cross-page
|
|
319
|
-
# position interference (page coordinate systems overlap, so
|
|
320
|
-
# a label on page 3 could wrongly match a specimen on page 2).
|
|
321
|
-
# First match wins when a codepoint appears on multiple pages.
|
|
322
|
-
def codepoint_map_from_trace(base_font, _font_obj_id)
|
|
323
|
-
return {} unless font_appears_in_pdf?(base_font)
|
|
324
|
-
|
|
325
|
-
runner = TraceRunner.new(@source.pdf_path)
|
|
326
|
-
correlator = TraceCorrelator.new(specimen_font_name: base_font)
|
|
327
|
-
|
|
328
|
-
(1..page_count).each_with_object({}) do |page, mapping|
|
|
329
|
-
glyphs = runner.trace([page])
|
|
330
|
-
page_mapping = correlator.correlate(glyphs)
|
|
331
|
-
page_mapping.each do |cp, gid|
|
|
332
|
-
mapping[cp] ||= gid
|
|
333
|
-
end
|
|
334
|
-
end
|
|
335
|
-
end
|
|
336
|
-
|
|
337
|
-
def font_appears_in_pdf?(base_font)
|
|
338
|
-
font_entries_cache.key?(base_font)
|
|
339
|
-
end
|
|
340
|
-
|
|
341
|
-
# Lazy cache of {base_font => true} — which fonts `mutool info`
|
|
342
|
-
# reports in this PDF. We only need the key set, not page numbers,
|
|
343
|
-
# because {codepoint_map_from_trace} traces all pages regardless.
|
|
344
|
-
def font_entries_cache
|
|
345
|
-
@font_entries_cache ||= begin
|
|
346
|
-
result = {}
|
|
347
|
-
mutool_info_text.each_line do |line|
|
|
348
|
-
next unless line.include?("Type0")
|
|
349
|
-
|
|
350
|
-
font_match = line.match(/Type0\s+'([^']+)'/)
|
|
351
|
-
next unless font_match
|
|
352
|
-
|
|
353
|
-
result[font_match[1]] = true
|
|
354
|
-
end
|
|
355
|
-
result
|
|
356
|
-
end
|
|
357
|
-
end
|
|
358
|
-
|
|
359
|
-
# Total pages in the PDF, parsed from `mutool info`'s
|
|
360
|
-
# `Pages: N` line. Falls back to the first font page if parsing
|
|
361
|
-
# fails (so we still try at least one page).
|
|
362
|
-
def page_count
|
|
363
|
-
@page_count ||= begin
|
|
364
|
-
m = mutool_info_text.match(/^Pages:\s+(\d+)/)
|
|
365
|
-
m ? m[1].to_i : 1
|
|
366
|
-
end
|
|
367
|
-
end
|
|
368
|
-
|
|
369
|
-
def mutool_info_text
|
|
370
|
-
@mutool_info_text ||= run_mutool_info
|
|
371
|
-
end
|
|
372
|
-
|
|
373
|
-
def run_mutool_info
|
|
374
|
-
out, err, status = Open3.capture3("mutool", "info", @source.pdf_to_s)
|
|
375
|
-
status.success? ? out + err : ""
|
|
376
|
-
end
|
|
377
|
-
|
|
378
|
-
def resolve_fontfile(fd_dict)
|
|
379
|
-
if fd_dict.key?("FontFile2")
|
|
380
|
-
[first_ref(fd_dict["FontFile2"]), :ttf]
|
|
381
|
-
elsif fd_dict.key?("FontFile3")
|
|
382
|
-
[first_ref(fd_dict["FontFile3"]), :cff]
|
|
383
|
-
end
|
|
384
|
-
end
|
|
385
|
-
|
|
386
|
-
def resolve_cid_to_gid(desc_dict)
|
|
387
|
-
raw = desc_dict["CIDToGIDMap"]
|
|
388
|
-
return nil if raw.nil?
|
|
389
|
-
|
|
390
|
-
# parse_dict captures the name without the leading slash, so
|
|
391
|
-
# "/Identity" comes through as "Identity". A stream-form map
|
|
392
|
-
# is captured as the integer obj id — not supported yet.
|
|
393
|
-
if raw.to_s == "Identity"
|
|
394
|
-
:identity
|
|
395
|
-
end
|
|
396
|
-
end
|
|
397
|
-
|
|
398
|
-
def fetch_tounicode(obj_id)
|
|
399
|
-
Tempfile.create("ucode-tounicode") do |tmp|
|
|
400
|
-
tmp.close
|
|
401
|
-
ok = system("mutool", "show", "-o", tmp.path, "-b",
|
|
402
|
-
@source.pdf_to_s, obj_id.to_s,
|
|
403
|
-
out: File::NULL, err: File::NULL)
|
|
404
|
-
unless ok
|
|
405
|
-
raise Ucode::EmbeddedFontsMissingError,
|
|
406
|
-
"mutool show failed for ToUnicode obj=#{obj_id}"
|
|
407
|
-
end
|
|
408
|
-
|
|
409
|
-
File.binread(tmp.path).force_encoding("UTF-8")
|
|
410
|
-
end
|
|
411
|
-
end
|
|
412
|
-
|
|
413
|
-
# Render the given 1-based PDF pages to a single SVG string
|
|
414
|
-
# suitable for {ContentStreamCorrelator#correlate}. Each page
|
|
415
|
-
# is a separate `<svg>...</svg>` document; the correlator's
|
|
416
|
-
# `<use>` regex tolerates either a single concatenated blob or
|
|
417
|
-
# multiple documents. Output is captured from mutool's stdout.
|
|
418
|
-
def render_pages(page_numbers)
|
|
419
|
-
return "" if page_numbers.nil? || page_numbers.empty?
|
|
420
|
-
|
|
421
|
-
out, err, status = run_mutool_draw(page_numbers)
|
|
422
|
-
unless status.success?
|
|
423
|
-
raise Ucode::EmbeddedFontsMissingError,
|
|
424
|
-
"mutool draw failed: #{err.strip}"
|
|
425
|
-
end
|
|
426
|
-
|
|
427
|
-
out
|
|
428
|
-
end
|
|
429
|
-
|
|
430
|
-
def run_mutool_draw(page_numbers)
|
|
431
|
-
Open3.capture3(
|
|
432
|
-
"mutool", "draw", "-F", "svg",
|
|
433
|
-
@source.pdf_to_s,
|
|
434
|
-
*page_numbers.map(&:to_s)
|
|
435
|
-
)
|
|
436
|
-
end
|
|
437
|
-
|
|
438
|
-
def build_codepoint_map(cid_to_cp, cid_map_kind)
|
|
439
|
-
return {} if cid_to_cp.empty? || cid_map_kind != :identity
|
|
440
|
-
|
|
441
|
-
# With /CIDToGIDMap /Identity, gid == cid.
|
|
442
|
-
cid_to_cp.each_with_object({}) do |(cid, cp), h|
|
|
443
|
-
h[cp] = cid
|
|
444
|
-
end
|
|
445
|
-
end
|
|
446
102
|
end
|
|
447
103
|
end
|
|
448
104
|
end
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "open3"
|
|
4
|
+
require "pathname"
|
|
5
|
+
require "tempfile"
|
|
6
|
+
|
|
7
|
+
module Ucode
|
|
8
|
+
module Glyphs
|
|
9
|
+
module EmbeddedFonts
|
|
10
|
+
# Resolves codepoint → GID for one Type0 font via a 3-path strategy:
|
|
11
|
+
#
|
|
12
|
+
# 1. **ToUnicode CMap** — the font's `/ToUnicode` stream (Tier 1
|
|
13
|
+
# for pillar 1). Parsed by {ToUnicode}.
|
|
14
|
+
# 2. **Caller-supplied correlator config** (pillar 2) — render the
|
|
15
|
+
# font's pages to SVG and run {ContentStreamCorrelator}.
|
|
16
|
+
# 3. **Auto-detect via mutool trace** (pillar 2b) — trace every
|
|
17
|
+
# page and run {TraceCorrelator} positionally.
|
|
18
|
+
#
|
|
19
|
+
# Each path returns a `{codepoint => gid}` map. First non-empty
|
|
20
|
+
# result wins; the strategy stops there.
|
|
21
|
+
#
|
|
22
|
+
# Pure strategy orchestration — does NOT parse the PDF object graph
|
|
23
|
+
# (that's {PdfIndexer}'s job). Takes a {RawFontDescriptor} + the
|
|
24
|
+
# shared {PdfIndexer} (for page_count + font_appears? queries used
|
|
25
|
+
# by the trace fallback).
|
|
26
|
+
class CodepointMapper
|
|
27
|
+
# @param source [PdfLocation]
|
|
28
|
+
# @param correlator_configs [Hash{Integer=>ContentStreamCorrelator::Config}]
|
|
29
|
+
# caller-supplied pillar-2 configs, keyed by font_obj_id
|
|
30
|
+
# @param indexer [PdfIndexer] for page_count + font_appears? queries
|
|
31
|
+
def initialize(source:, correlator_configs:, indexer:)
|
|
32
|
+
@source = source
|
|
33
|
+
@correlator_configs = correlator_configs
|
|
34
|
+
@indexer = indexer
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
# @param descriptor [RawFontDescriptor]
|
|
38
|
+
# @return [Hash{Integer=>Integer}] codepoint => gid; empty when
|
|
39
|
+
# no strategy produces a map
|
|
40
|
+
def map(descriptor)
|
|
41
|
+
return {} unless descriptor.cid_map_kind == :identity
|
|
42
|
+
|
|
43
|
+
from_tounicode = map_from_tounicode(descriptor.tounicode_ref)
|
|
44
|
+
return from_tounicode unless from_tounicode.empty?
|
|
45
|
+
|
|
46
|
+
from_correlator = map_from_correlator(descriptor.font_obj_id)
|
|
47
|
+
return from_correlator unless from_correlator.empty?
|
|
48
|
+
|
|
49
|
+
map_from_trace(descriptor.base_font)
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
private
|
|
53
|
+
|
|
54
|
+
# ---- Strategy 1: /ToUnicode CMap --------------------------------
|
|
55
|
+
|
|
56
|
+
def map_from_tounicode(tu_ref)
|
|
57
|
+
return {} unless tu_ref
|
|
58
|
+
|
|
59
|
+
cmap_text = fetch_tounicode(tu_ref)
|
|
60
|
+
cid_to_cp = ToUnicode.parse(cmap_text)
|
|
61
|
+
build_codepoint_map(cid_to_cp)
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
def build_codepoint_map(cid_to_cp)
|
|
65
|
+
cid_to_cp.each_with_object({}) do |(cid, cp), h|
|
|
66
|
+
h[cp] = cid
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
def fetch_tounicode(obj_id)
|
|
71
|
+
Tempfile.create("ucode-tounicode") do |tmp|
|
|
72
|
+
tmp.close
|
|
73
|
+
ok = system("mutool", "show", "-o", tmp.path, "-b",
|
|
74
|
+
@source.pdf_to_s, obj_id.to_s,
|
|
75
|
+
out: File::NULL, err: File::NULL)
|
|
76
|
+
unless ok
|
|
77
|
+
raise Ucode::EmbeddedFontsMissingError,
|
|
78
|
+
"mutool show failed for ToUnicode obj=#{obj_id}"
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
File.binread(tmp.path).force_encoding("UTF-8")
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
# ---- Strategy 2: caller-supplied correlator config --------------
|
|
86
|
+
|
|
87
|
+
def map_from_correlator(font_obj_id)
|
|
88
|
+
config = @correlator_configs[font_obj_id]
|
|
89
|
+
return {} unless config
|
|
90
|
+
|
|
91
|
+
svg = render_pages(config.page_numbers)
|
|
92
|
+
ContentStreamCorrelator.new(config).correlate(svg)
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
def render_pages(page_numbers)
|
|
96
|
+
return "" if page_numbers.nil? || page_numbers.empty?
|
|
97
|
+
|
|
98
|
+
out, err, status = Open3.capture3(
|
|
99
|
+
"mutool", "draw", "-F", "svg",
|
|
100
|
+
@source.pdf_to_s,
|
|
101
|
+
*page_numbers.map(&:to_s),
|
|
102
|
+
)
|
|
103
|
+
unless status.success?
|
|
104
|
+
raise Ucode::EmbeddedFontsMissingError,
|
|
105
|
+
"mutool draw failed: #{err.strip}"
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
out
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
# ---- Strategy 3: auto-detect via mutool trace --------------------
|
|
112
|
+
|
|
113
|
+
def map_from_trace(base_font)
|
|
114
|
+
return {} unless @indexer.font_appears?(base_font)
|
|
115
|
+
|
|
116
|
+
runner = TraceRunner.new(@source.pdf_path)
|
|
117
|
+
correlator = TraceCorrelator.new(specimen_font_name: base_font)
|
|
118
|
+
|
|
119
|
+
(1..@indexer.page_count).each_with_object({}) do |page, mapping|
|
|
120
|
+
glyphs = runner.trace([page])
|
|
121
|
+
page_mapping = correlator.correlate(glyphs)
|
|
122
|
+
page_mapping.each do |cp, gid|
|
|
123
|
+
mapping[cp] ||= gid
|
|
124
|
+
end
|
|
125
|
+
end
|
|
126
|
+
end
|
|
127
|
+
end
|
|
128
|
+
end
|
|
129
|
+
end
|
|
130
|
+
end
|