ucode 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +1 -1
- data/config/unicode17_universal_glyph_set.yml +1 -1
- data/lib/ucode/cli.rb +1 -35
- data/lib/ucode/commands/build.rb +3 -26
- data/lib/ucode/commands/canonical_build.rb +1 -4
- data/lib/ucode/commands.rb +0 -1
- data/lib/ucode/error.rb +0 -8
- data/lib/ucode/glyphs/embedded_fonts/catalog.rb +81 -4
- data/lib/ucode/glyphs/embedded_fonts/trace_correlator.rb +230 -0
- data/lib/ucode/glyphs/embedded_fonts/trace_glyph.rb +27 -0
- data/lib/ucode/glyphs/embedded_fonts/trace_parser.rb +50 -0
- data/lib/ucode/glyphs/embedded_fonts/trace_runner.rb +53 -0
- data/lib/ucode/glyphs/embedded_fonts.rb +4 -0
- data/lib/ucode/glyphs/pdf_fetcher.rb +7 -50
- data/lib/ucode/glyphs.rb +4 -14
- data/lib/ucode/version.rb +1 -1
- data/lib/ucode.rb +0 -2
- metadata +6 -15
- data/lib/ucode/commands/glyphs.rb +0 -94
- data/lib/ucode/glyphs/cell_extractor.rb +0 -130
- data/lib/ucode/glyphs/dvisvgm_renderer.rb +0 -29
- data/lib/ucode/glyphs/grid.rb +0 -30
- data/lib/ucode/glyphs/grid_detector.rb +0 -165
- data/lib/ucode/glyphs/monolith_page_map.rb +0 -181
- data/lib/ucode/glyphs/mutool_renderer.rb +0 -28
- data/lib/ucode/glyphs/page_renderer.rb +0 -234
- data/lib/ucode/glyphs/path_bbox.rb +0 -62
- data/lib/ucode/glyphs/pdf2svg_renderer.rb +0 -26
- data/lib/ucode/glyphs/pdftocairo_renderer.rb +0 -32
- data/lib/ucode/glyphs/pipeline.rb +0 -105
- data/lib/ucode/glyphs/writer.rb +0 -250
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 409561757912083c19e4044c0ed37129945bf6de53bc3b029d349e4a8f16f10f
|
|
4
|
+
data.tar.gz: 85a06e0383587af4d8a88342974a58105423635b85212ecc7b1783268e6c5e2a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 85660ae16bbfa2632131888872ddebca9cdee45d26791837ddce2fa629e18a721c9701b023c5424e058165eac03d1d4e1d16bb2a6c0b582a8ef4c1e0104ecdf5
|
|
7
|
+
data.tar.gz: 411de21c9c5f3e46b559752d54462f02f4110aa59e261ee3ea6c19383ad9383fd43f013e1107fe59b3e6c3b296f906a8c99e7087da5f9c85f2b13ba385447b95
|
data/Rakefile
CHANGED
data/lib/ucode/cli.rb
CHANGED
|
@@ -109,34 +109,6 @@ module Ucode
|
|
|
109
109
|
puts JSON.pretty_generate(result)
|
|
110
110
|
end
|
|
111
111
|
|
|
112
|
-
# ─────────────── glyphs ───────────────
|
|
113
|
-
desc "glyphs [VERSION]", "Extract per-codepoint SVGs from Code Charts PDFs (experimental)"
|
|
114
|
-
long_desc <<~LONG
|
|
115
|
-
EXPERIMENTAL in v0.1. The cell extractor currently includes cell-border
|
|
116
|
-
decorations alongside the actual character outline, so the output is not
|
|
117
|
-
yet suitable for end-user display. Opt in with --include-glyphs to run
|
|
118
|
-
the pipeline anyway; otherwise it returns a skipped payload.
|
|
119
|
-
LONG
|
|
120
|
-
option :to, type: :string, default: "./output"
|
|
121
|
-
option :block, type: :array, desc: "Limit to these block ids"
|
|
122
|
-
option :force, type: :boolean, default: false
|
|
123
|
-
option :monolith, type: :string, default: "CodeCharts.pdf",
|
|
124
|
-
desc: "Path to CodeCharts.pdf for fallback slicing"
|
|
125
|
-
option :include_glyphs, type: :boolean, default: false,
|
|
126
|
-
desc: "Opt into the experimental v0.1 pipeline"
|
|
127
|
-
def glyphs(version = nil)
|
|
128
|
-
result = Commands::GlyphsCommand.new.call(
|
|
129
|
-
VersionResolver.resolve(version),
|
|
130
|
-
output_root: options[:to],
|
|
131
|
-
block_filter: options[:block],
|
|
132
|
-
force: options[:force],
|
|
133
|
-
monolith_path: options[:monolith],
|
|
134
|
-
include_glyphs: options[:include_glyphs],
|
|
135
|
-
warn: $stderr,
|
|
136
|
-
)
|
|
137
|
-
puts JSON.pretty_generate(result)
|
|
138
|
-
end
|
|
139
|
-
|
|
140
112
|
# ─────────────── site ───────────────
|
|
141
113
|
class Site < Thor
|
|
142
114
|
desc "init", "Copy the Vitepress scaffold into site/"
|
|
@@ -345,22 +317,16 @@ module Ucode
|
|
|
345
317
|
subcommand "cache", Cache
|
|
346
318
|
|
|
347
319
|
# ─────────────── build ───────────────
|
|
348
|
-
desc "build [VERSION]", "Full pipeline: fetch + parse +
|
|
320
|
+
desc "build [VERSION]", "Full pipeline: fetch + parse + site"
|
|
349
321
|
option :to, type: :string, default: "./output"
|
|
350
322
|
option :site, type: :string, default: nil, desc: "Build the site here (skipped if nil)"
|
|
351
|
-
option :monolith, type: :string, default: "CodeCharts.pdf"
|
|
352
323
|
option :force_fetch, type: :boolean, default: false
|
|
353
|
-
option :include_glyphs, type: :boolean, default: false,
|
|
354
|
-
desc: "Opt into the experimental v0.1 glyph step"
|
|
355
324
|
def build(version = nil)
|
|
356
325
|
result = Commands::BuildCommand.new.call(
|
|
357
326
|
version,
|
|
358
327
|
output_root: options[:to],
|
|
359
328
|
site_root: options[:site],
|
|
360
|
-
monolith_path: options[:monolith],
|
|
361
329
|
force_fetch: options[:force_fetch],
|
|
362
|
-
include_glyphs: options[:include_glyphs],
|
|
363
|
-
warn: $stderr,
|
|
364
330
|
)
|
|
365
331
|
puts JSON.pretty_generate(result)
|
|
366
332
|
end
|
data/lib/ucode/commands/build.rb
CHANGED
|
@@ -8,36 +8,23 @@ require "ucode/version_resolver"
|
|
|
8
8
|
module Ucode
|
|
9
9
|
module Commands
|
|
10
10
|
# `ucode build` — full pipeline: fetch (ucd + unihan + charts) →
|
|
11
|
-
# parse →
|
|
12
|
-
# is idempotent and safe to re-run.
|
|
11
|
+
# parse → site. Resumable: each step is idempotent and safe to re-run.
|
|
13
12
|
#
|
|
14
13
|
# Resolves the version intent once at the top and threads the
|
|
15
|
-
# resolved string through every sub-command.
|
|
16
|
-
# 2026-06-29 architecture review.
|
|
17
|
-
#
|
|
18
|
-
# **Glyph step is opt-in as of v0.1** because the SVG cell extractor
|
|
19
|
-
# is still experimental. Pass `include_glyphs: true` to enable it;
|
|
20
|
-
# otherwise the glyphs step is recorded as skipped.
|
|
14
|
+
# resolved string through every sub-command.
|
|
21
15
|
class BuildCommand
|
|
22
16
|
# @param version_intent [nil, :default, :latest, String]
|
|
23
17
|
# @param output_root [String, Pathname]
|
|
24
18
|
# @param site_root [String, Pathname, nil] if nil, skip site build
|
|
25
|
-
# @param monolith_path [String, Pathname, nil] CodeCharts.pdf fallback
|
|
26
19
|
# @param force_fetch [Boolean] re-download sources
|
|
27
|
-
# @param include_glyphs [Boolean] opt into the experimental glyph
|
|
28
|
-
# step (default false)
|
|
29
|
-
# @param warn [IO, nil] forwarded to GlyphsCommand when enabled
|
|
30
20
|
# @return [Hash] aggregated step results
|
|
31
21
|
def call(version_intent, output_root:, site_root: nil,
|
|
32
|
-
|
|
33
|
-
include_glyphs: false, warn: nil)
|
|
22
|
+
force_fetch: false)
|
|
34
23
|
version = VersionResolver.resolve(version_intent)
|
|
35
24
|
steps = {}
|
|
36
25
|
|
|
37
26
|
steps[:fetch] = run_fetch(version, force: force_fetch)
|
|
38
27
|
steps[:parse] = ParseCommand.new.call(version, output_root: output_root)
|
|
39
|
-
steps[:glyphs] = run_glyphs(version, output_root, monolith_path,
|
|
40
|
-
include_glyphs: include_glyphs, warn: warn)
|
|
41
28
|
steps[:site] = run_site(output_root, site_root) if site_root
|
|
42
29
|
|
|
43
30
|
{ version: version, steps: steps }
|
|
@@ -54,16 +41,6 @@ module Ucode
|
|
|
54
41
|
}
|
|
55
42
|
end
|
|
56
43
|
|
|
57
|
-
def run_glyphs(version, output_root, monolith_path, include_glyphs:, warn:)
|
|
58
|
-
GlyphsCommand.new.call(
|
|
59
|
-
version,
|
|
60
|
-
output_root: output_root,
|
|
61
|
-
monolith_path: monolith_path || "CodeCharts.pdf",
|
|
62
|
-
include_glyphs: include_glyphs,
|
|
63
|
-
warn: warn,
|
|
64
|
-
)
|
|
65
|
-
end
|
|
66
|
-
|
|
67
44
|
# Delegates the site step to SiteCommand#build.
#
# @param output_root passed through as `output_root:`
# @param site_root passed through as `site_root:`
# @return whatever SiteCommand#build returns
def run_site(output_root, site_root)
  site_command = SiteCommand.new
  site_command.build(output_root: output_root, site_root: site_root)
end
|
|
@@ -20,10 +20,7 @@ module Ucode
|
|
|
20
20
|
# `index.json` + `glyph.svg` atomically, accumulate per-tier +
|
|
21
21
|
# per-block stats, and emit `output/build-report.json`.
|
|
22
22
|
#
|
|
23
|
-
# This is the
|
|
24
|
-
# in {GlyphsCommand}. The two coexist until the v0.1 pipeline is
|
|
25
|
-
# removed (TODOs 17-19); CanonicalBuildCommand is the path forward
|
|
26
|
-
# for production dataset runs.
|
|
23
|
+
# This is the production path for dataset runs.
|
|
27
24
|
#
|
|
28
25
|
# == Pre-conditions (per TODO 21)
|
|
29
26
|
#
|
data/lib/ucode/commands.rb
CHANGED
|
@@ -9,7 +9,6 @@ module Ucode
|
|
|
9
9
|
module Commands
|
|
10
10
|
autoload :FetchCommand, "ucode/commands/fetch"
|
|
11
11
|
autoload :ParseCommand, "ucode/commands/parse"
|
|
12
|
-
autoload :GlyphsCommand, "ucode/commands/glyphs"
|
|
13
12
|
autoload :SiteCommand, "ucode/commands/site"
|
|
14
13
|
autoload :LookupCommand, "ucode/commands/lookup"
|
|
15
14
|
autoload :CacheCommand, "ucode/commands/cache"
|
data/lib/ucode/error.rb
CHANGED
|
@@ -24,8 +24,6 @@ module Ucode
|
|
|
24
24
|
# │ ├── Ucode::DatabaseSchemaError
|
|
25
25
|
# │ └── Ucode::UnknownVersionError
|
|
26
26
|
# └── Ucode::GlyphError
|
|
27
|
-
# ├── Ucode::PdfRenderError
|
|
28
|
-
# ├── Ucode::GridDetectionError
|
|
29
27
|
# ├── Ucode::LastResortMissingError
|
|
30
28
|
# ├── Ucode::EmbeddedFontsMissingError
|
|
31
29
|
# └── Ucode::UniversalSetPreBuildError
|
|
@@ -104,12 +102,6 @@ module Ucode
|
|
|
104
102
|
# Glyph pipeline failures.
|
|
105
103
|
class GlyphError < Error; end
|
|
106
104
|
|
|
107
|
-
# PDF → SVG rendering failure.
|
|
108
|
-
class PdfRenderError < GlyphError; end
|
|
109
|
-
|
|
110
|
-
# Grid detection couldn't anchor on codepoint labels.
|
|
111
|
-
class GridDetectionError < GlyphError; end
|
|
112
|
-
|
|
113
105
|
# The Last Resort Font UFO source cannot be located or is missing a
|
|
114
106
|
# required artifact (cmap-f13.ttx, font.ufo/glyphs/, contents.plist).
|
|
115
107
|
class LastResortMissingError < GlyphError; end
|
|
@@ -249,6 +249,7 @@ module Ucode
|
|
|
249
249
|
font_obj_id: font_obj_id,
|
|
250
250
|
tu_ref: tu_ref,
|
|
251
251
|
cid_map_kind: cid_map_kind,
|
|
252
|
+
base_font: base_font,
|
|
252
253
|
)
|
|
253
254
|
return nil if cp_to_gid.empty?
|
|
254
255
|
|
|
@@ -275,14 +276,23 @@ module Ucode
|
|
|
275
276
|
# when no /ToUnicode is present, consult the correlator_configs
|
|
276
277
|
# registry — if the user supplied a config for this font, render
|
|
277
278
|
# the relevant page(s) to SVG and run positional correlation.
|
|
278
|
-
#
|
|
279
|
-
#
|
|
280
|
-
|
|
279
|
+
# Pillar-2b fallback: when no caller-supplied config either,
|
|
280
|
+
# auto-detect via `mutool trace` — parse the structured text
|
|
281
|
+
# trace to build `{codepoint => gid}` from hex labels + specimen
|
|
282
|
+
# positions. Returns an empty hash when none of the paths
|
|
283
|
+
# produce a map (the caller treats that as "skip this font").
|
|
284
|
+
def build_codepoint_to_gid(font_obj_id:, tu_ref:, cid_map_kind:,
                           base_font: nil)
  # Only identity CID maps are handled; anything else yields no map.
  return {} unless cid_map_kind == :identity

  if tu_ref
    # A /ToUnicode reference takes precedence over every fallback.
    codepoint_map_from_tounicode(tu_ref)
  else
    correlated = codepoint_map_from_correlator(font_obj_id)
    if !correlated.empty?
      correlated
    elsif base_font
      # Last resort: trace-based lookup, which needs the font name.
      codepoint_map_from_trace(base_font, font_obj_id)
    else
      {}
    end
  end
end
|
|
287
297
|
|
|
288
298
|
def codepoint_map_from_tounicode(tu_ref)
|
|
@@ -298,6 +308,73 @@ module Ucode
|
|
|
298
308
|
ContentStreamCorrelator.new(config).correlate(svg)
|
|
299
309
|
end
|
|
300
310
|
|
|
311
|
+
# Pillar-2b: auto-detect codepoint → GID via `mutool trace`.
|
|
312
|
+
# For CID-keyed fonts without /ToUnicode and without a
|
|
313
|
+
# caller-supplied correlator config, trace every page of the
|
|
314
|
+
# PDF and positionally match hex labels to specimen glyphs.
|
|
315
|
+
# `mutool info` only reports the first page per font, so tracing
|
|
316
|
+
# all pages is simpler and catches every chart page.
|
|
317
|
+
#
|
|
318
|
+
# Each page is correlated independently to prevent cross-page
|
|
319
|
+
# position interference (page coordinate systems overlap, so
|
|
320
|
+
# a label on page 3 could wrongly match a specimen on page 2).
|
|
321
|
+
# First match wins when a codepoint appears on multiple pages.
|
|
322
|
+
def codepoint_map_from_trace(base_font, _font_obj_id)
  # Nothing to correlate when the font is not listed for this PDF.
  return {} unless font_appears_in_pdf?(base_font)

  tracer = TraceRunner.new(@source.pdf_path)
  matcher = TraceCorrelator.new(specimen_font_name: base_font)
  merged = {}

  1.upto(page_count) do |page_number|
    # One page per correlate call, so positions from different pages
    # are never compared with each other.
    per_page = matcher.correlate(tracer.trace([page_number]))
    per_page.each do |codepoint, gid|
      # Keep the value from the earliest page that supplied one.
      merged[codepoint] = gid unless merged[codepoint]
    end
  end

  merged
end
|
|
336
|
+
|
|
337
|
+
# @param base_font [String] font name to look up
# @return [Boolean] true when `base_font` is a key of the font cache
def font_appears_in_pdf?(base_font)
  font_entries_cache.include?(base_font)
end
|
|
340
|
+
|
|
341
|
+
# Memoized `{base_font => true}` hash of the fonts that appear on a
# `Type0 '<name>'` line of the `mutool info` output. Only the key set is
# used; page numbers are not recorded because
# {codepoint_map_from_trace} walks every page anyway.
def font_entries_cache
  @font_entries_cache ||= begin
    type0_names = mutool_info_text.each_line.filter_map do |info_line|
      # The capture is nil for lines without a quoted Type0 font name.
      info_line[/Type0\s+'([^']+)'/, 1]
    end
    type0_names.to_h { |font_name| [font_name, true] }
  end
end
|
|
358
|
+
|
|
359
|
+
# Number of pages in the PDF, read from the `Pages: N` line of the
# `mutool info` output and memoized. Defaults to 1 when no such line
# is found, so at least one page is still attempted.
#
# @return [Integer]
def page_count
  return @page_count if @page_count

  pages_line = /^Pages:\s+(\d+)/.match(mutool_info_text)
  @page_count = pages_line ? pages_line[1].to_i : 1
end
|
|
368
|
+
|
|
369
|
+
# Text returned by {run_mutool_info}, computed on first use and then
# reused for every later call.
def mutool_info_text
  return @mutool_info_text if @mutool_info_text

  @mutool_info_text = run_mutool_info
end
|
|
372
|
+
|
|
373
|
+
# Runs `mutool info` on the source PDF and captures its output.
#
# @return [String] stdout followed by stderr when the command exits
#   successfully; an empty string when it exits with a failure status
def run_mutool_info
  # Read the path through `pdf_path`, the same accessor the trace
  # fallback uses on @source (the previous `pdf_to_s` call did not match
  # it), and stringify it for the subprocess argv.
  out, err, status = Open3.capture3("mutool", "info", @source.pdf_path.to_s)
  status.success? ? out + err : ""
end
|
|
377
|
+
|
|
301
378
|
def resolve_fontfile(fd_dict)
|
|
302
379
|
if fd_dict.key?("FontFile2")
|
|
303
380
|
[first_ref(fd_dict["FontFile2"]), :ttf]
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
|
|
4
|
+
module Glyphs
|
|
5
|
+
module EmbeddedFonts
|
|
6
|
+
# Correlates specimen glyphs (CID font without `/ToUnicode`) to
|
|
7
|
+
# their Unicode codepoints via positional matching against hex
|
|
8
|
+
# codepoint labels on the same chart page.
|
|
9
|
+
#
|
|
10
|
+
# The Unicode Code Charts use two layouts:
|
|
11
|
+
#
|
|
12
|
+
# 1. **List layout** (chart pages): the hex codepoint label (e.g.
|
|
13
|
+
# "10D75") is printed to the LEFT of the specimen glyph at the
|
|
14
|
+
# same Y baseline.
|
|
15
|
+
#
|
|
16
|
+
# 2. **Grid layout** (summary pages): the hex codepoint label is
|
|
17
|
+
# printed directly ABOVE the specimen glyph (~12 pt higher on
|
|
18
|
+
# Y, same X).
|
|
19
|
+
#
|
|
20
|
+
# Both layouts are handled by matching each specimen to the
|
|
21
|
+
# nearest valid label cluster by Euclidean distance, with a
|
|
22
|
+
# maximum match radius that excludes far-away header/footer text.
|
|
23
|
+
#
|
|
24
|
+
# The codepoint labels in every Unicode Code Charts PDF are set
|
|
25
|
+
# in a single dedicated label font (typically ArialNarrow).
|
|
26
|
+
# Character names, headers, and footers use other fonts. To avoid
|
|
27
|
+
# false matches from hex chars in those texts, the correlator
|
|
28
|
+
# auto-detects the label font as the non-specimen font that
|
|
29
|
+
# contributes the most hex-char glyphs in close proximity to specimens.
|
|
30
|
+
#
|
|
31
|
+
# Matching is greedy one-to-one: each GID and each codepoint is
|
|
32
|
+
# assigned at most once, so a specimen that sits between two
|
|
33
|
+
# labels only claims the closer one.
|
|
34
|
+
#
|
|
35
|
+
# Pure logic — no I/O. The caller passes pre-parsed TraceGlyph
|
|
36
|
+
# arrays (typically from {TraceRunner} + {TraceParser}).
|
|
37
|
+
class TraceCorrelator
  # Bucket size used to group label glyphs into rows by their Y value.
  DEFAULT_Y_BUCKET = 1.0
  private_constant :DEFAULT_Y_BUCKET

  # Adjacent label chars within one codepoint label are ~4-6 pt
  # apart on X. Different columns are ~30+ pt apart. 10 pt
  # cleanly separates within-label from between-column gaps.
  X_GAP_THRESHOLD = 10.0
  private_constant :X_GAP_THRESHOLD

  # Maximum valid Unicode codepoint. Filters out false labels
  # that form hex strings from character-name fragments.
  UNICODE_MAX = 0x10FFFF
  private_constant :UNICODE_MAX

  # Maximum Euclidean distance from a specimen to its matching
  # label cluster. List-layout labels are ~21 pt to the left;
  # grid-layout labels are ~12 pt above.
  MAX_MATCH_DISTANCE = 30.0
  private_constant :MAX_MATCH_DISTANCE

  # Radius around a specimen inside which a hex-char glyph counts as a
  # vote for its font being the label font.
  LABEL_PROXIMITY_RADIUS = 50.0
  private_constant :LABEL_PROXIMITY_RADIUS

  # One hex digit: the unit a label glyph carries in `unicode`.
  HEX_DIGIT = /\A[0-9A-Fa-f]\z/
  private_constant :HEX_DIGIT

  # A complete label: 4 to 6 hex digits.
  HEX_LABEL = /\A[0-9A-Fa-f]{4,6}\z/
  private_constant :HEX_LABEL

  # @param specimen_font_name [String] the BaseFont name of the
  #   CID font whose glyphs need correlation
  def initialize(specimen_font_name:)
    @specimen_font_name = specimen_font_name
    @y_bucket = DEFAULT_Y_BUCKET
  end

  # Matches specimen glyphs to nearby hex labels.
  #
  # Glyphs without an x or y coordinate, and specimens without a gid,
  # are ignored: they cannot take part in positional matching and
  # would otherwise raise inside the distance arithmetic.
  #
  # @param trace_glyphs [Array<TraceGlyph>] objects responding to
  #   font_name, gid, x, y and unicode
  # @return [Hash{Integer=>Integer}] codepoint => gid
  def correlate(trace_glyphs)
    placed = trace_glyphs.select { |g| g.x && g.y }
    specimens = placed.select { |g| g.font_name == @specimen_font_name && g.gid }
    return {} if specimens.empty?

    label_font = detect_label_font(placed, specimens)
    return {} unless label_font

    labels = placed.select { |g| label_glyph?(g, label_font) }
    clusters = build_label_clusters(labels)
    return {} if clusters.empty?

    build_mapping(specimens, clusters)
  end

  private

  # The label font is the non-specimen font whose hex-char glyphs fall
  # most often within LABEL_PROXIMITY_RADIUS of a specimen glyph.
  #
  # @return [String, nil] nil when no candidate lies near any specimen
  def detect_label_font(glyphs, specimens)
    candidates = non_specimen_hex_glyphs(glyphs)
    return nil if candidates.empty?

    proximity_counts(specimens, candidates).max_by { |_font, hits| hits }&.first
  end

  def non_specimen_hex_glyphs(glyphs)
    glyphs.select do |g|
      g.font_name != @specimen_font_name && hex_digit?(g)
    end
  end

  # Counts, per font, the (specimen, candidate) pairs that lie within
  # the proximity radius of each other.
  def proximity_counts(specimens, candidates)
    counts = Hash.new(0)
    radius_sq = LABEL_PROXIMITY_RADIUS * LABEL_PROXIMITY_RADIUS
    specimens.each do |spec|
      candidates.each do |g|
        counts[g.font_name] += 1 if within_radius?(spec, g, radius_sq)
      end
    end
    counts
  end

  def within_radius?(spec, glyph, radius_sq)
    dx = spec.x - glyph.x
    dy = spec.y - glyph.y
    dx * dx + dy * dy < radius_sq
  end

  def label_glyph?(glyph, label_font)
    glyph.font_name == label_font && hex_digit?(glyph)
  end

  def hex_digit?(glyph)
    glyph.unicode&.match?(HEX_DIGIT)
  end

  # Cluster labels by Y (row), then by X gap (column within row).
  # Returns a flat array of {x:, y:, codepoint:} clusters.
  def build_label_clusters(labels)
    by_y = labels.group_by { |g| quantize(g.y, @y_bucket) }
    by_y.flat_map { |(_, glyphs)| clusters_from_row(glyphs) }
  end

  def clusters_from_row(glyphs)
    cluster_by_x_gap(glyphs.sort_by(&:x)).filter_map { |cluster| build_cluster(cluster) }
  end

  # Turns one run of label glyphs into a cluster hash, or nil when the
  # joined text is not a 4-6 digit hex value inside the Unicode range.
  def build_cluster(cluster)
    hex = cluster.map(&:unicode).join
    return nil unless hex.match?(HEX_LABEL)

    cp = hex.to_i(16)
    return nil unless cp <= UNICODE_MAX

    {
      x: cluster.sum(&:x) / cluster.size,
      y: cluster.first.y,
      codepoint: cp,
    }
  end

  # Splits X-sorted glyphs wherever the gap to the previous glyph
  # reaches X_GAP_THRESHOLD. Single-glyph runs are discarded.
  def cluster_by_x_gap(sorted_glyphs)
    clusters = []
    current = []

    sorted_glyphs.each do |g|
      if current.empty? || (g.x - current.last.x).abs < X_GAP_THRESHOLD
        current << g
      else
        clusters << current if current.size > 1
        current = [g]
      end
    end
    clusters << current if current.size > 1
    clusters
  end

  # Greedy one-to-one matching: each GID and each codepoint is
  # assigned at most once, and pairs are visited closest-first so the
  # closest specimen-label pair always wins.
  def build_mapping(specimens, clusters)
    claimed_gids = {}
    mapping = {}

    candidate_pairs(specimens, clusters).each do |_dist, _spec_idx, _cluster_idx, spec, cluster|
      codepoint = cluster[:codepoint]
      # `mapping` doubles as the record of codepoints already taken.
      next if claimed_gids.key?(spec.gid) || mapping.key?(codepoint)

      claimed_gids[spec.gid] = true
      mapping[codepoint] = spec.gid
    end

    mapping
  end

  # All specimen/cluster pairs within MAX_MATCH_DISTANCE, ordered by
  # distance. Out-of-range pairs are dropped before sorting, and equal
  # distances are ordered by specimen then cluster index so the result
  # does not depend on the sort's handling of ties.
  def candidate_pairs(specimens, clusters)
    pairs = []
    clusters.each_with_index do |cluster, cluster_idx|
      specimens.each_with_index do |spec, spec_idx|
        dist = distance(spec, cluster)
        pairs << [dist, spec_idx, cluster_idx, spec, cluster] if dist <= MAX_MATCH_DISTANCE
      end
    end
    pairs.sort_by { |dist, spec_idx, cluster_idx| [dist, spec_idx, cluster_idx] }
  end

  def distance(spec, cluster)
    dx = spec.x - cluster[:x]
    dy = spec.y - cluster[:y]
    Math.sqrt(dx * dx + dy * dy)
  end

  # Rounds `value` to the nearest multiple of `bucket_size`.
  def quantize(value, bucket_size)
    (value / bucket_size).round * bucket_size
  end
end
|
|
228
|
+
end
|
|
229
|
+
end
|
|
230
|
+
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
|
|
4
|
+
module Glyphs
|
|
5
|
+
module EmbeddedFonts
|
|
6
|
+
# Plain data record for a single glyph reported by `mutool trace`.
#
# A `<g>` element of the trace XML supplies the glyph-level fields:
#
#   <g unicode="..." glyph="174" x="237.06" y="673.92" adv=".62"/>
#
# while `font_name` is taken from the `<span>` that wraps it:
#
#   <span font="GPJAHB+WolofGaraySansSerif" ...>
#     <g .../>
#   </span>
#
# Instances are built with keyword arguments (`keyword_init: true`).
TraceGlyph = Struct.new(:font_name, :gid, :x, :y, :unicode, keyword_init: true)
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
|
|
5
|
+
module Ucode
|
|
6
|
+
module Glyphs
|
|
7
|
+
module EmbeddedFonts
|
|
8
|
+
# Parses the XML output of `mutool trace <pdf> <page>` into an
|
|
9
|
+
# array of {TraceGlyph} instances.
|
|
10
|
+
#
|
|
11
|
+
# The trace XML uses a flat `<span font="...">` → `<g glyph="..."
|
|
12
|
+
# x="..." y="..." unicode="..."/>` structure. Nokogiri walks
|
|
13
|
+
# the tree; the parser maps each `<g>` to a TraceGlyph,
|
|
14
|
+
# inheriting the font_name from the enclosing span.
|
|
15
|
+
#
|
|
16
|
+
# Pure function — no I/O, no PDF access. Callers inject the XML
|
|
17
|
+
# string (typically from {TraceRunner}).
|
|
18
|
+
module TraceParser
  # Converts trace XML into glyph records, one per `<g>` found inside
  # a `<span>`.
  #
  # @param xml [String, nil] raw mutool trace XML
  # @return [Array<TraceGlyph>] empty when `xml` is nil or blank, or
  #   when no `<g>` elements are found
  def self.parse(xml)
    return [] if xml.nil? || xml.strip.empty?

    spans = Nokogiri::XML(xml).css("span")
    spans.flat_map { |span| glyphs_in_span(span) }
  end

  # Builds the glyphs of one span; they all share the span's `font`.
  def self.glyphs_in_span(span)
    span_font = span[:font]
    span.css("g").map { |node| build_glyph(span_font, node) }
  end
  private_class_method :glyphs_in_span

  # Numeric attributes are converted only when present, so a missing
  # attribute stays nil instead of becoming 0.
  def self.build_glyph(font_name, node)
    glyph_id = node[:glyph]
    x_attr = node[:x]
    y_attr = node[:y]

    TraceGlyph.new(
      font_name: font_name,
      gid: glyph_id&.to_i,
      x: x_attr&.to_f,
      y: y_attr&.to_f,
      unicode: node[:unicode],
    )
  end
  private_class_method :build_glyph
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "open3"
|
|
4
|
+
require "pathname"
|
|
5
|
+
|
|
6
|
+
require "ucode/error"
|
|
7
|
+
|
|
8
|
+
module Ucode
|
|
9
|
+
module Glyphs
|
|
10
|
+
module EmbeddedFonts
|
|
11
|
+
# Thin I/O wrapper around `mutool trace <pdf> <page>`.
|
|
12
|
+
#
|
|
13
|
+
# Runs mutool on the given pages, captures the XML output,
|
|
14
|
+
# delegates parsing to {TraceParser}, and returns a flat
|
|
15
|
+
# `Array<TraceGlyph>` across all pages.
|
|
16
|
+
#
|
|
17
|
+
# The only class in the trace pipeline that touches the
|
|
18
|
+
# filesystem / spawns subprocesses. Everything upstream
|
|
19
|
+
# (parser, correlator) is pure.
|
|
20
|
+
class TraceRunner
  # @param pdf_path [Pathname, String]
  def initialize(pdf_path)
    @pdf_path = Pathname.new(pdf_path)
  end

  # Traces each requested page and concatenates the parsed glyphs.
  #
  # @param page_numbers [Array<Integer>] 1-based PDF page numbers
  # @return [Array<TraceGlyph>]
  # @raise [Ucode::EmbeddedFontsMissingError] when `mutool trace`
  #   exits with a failure status for any page
  def trace(page_numbers)
    page_numbers.flat_map { |page| trace_page(page) }
  end

  private

  def trace_page(page)
    TraceParser.parse(run_mutool(page))
  end

  # Runs `mutool trace <pdf> <page>` and returns its stdout.
  #
  # Only stdout is handed to the parser: appending stderr would put
  # any warning text after the XML and leave the parser to recover
  # from a malformed document. stderr is still included in the error
  # message when the command fails.
  def run_mutool(page)
    out, err, status = Open3.capture3(
      "mutool", "trace", @pdf_path.to_s, page.to_s,
    )
    return out if status.success?

    raise Ucode::EmbeddedFontsMissingError,
          "mutool trace failed: #{(out + err).strip}"
  end
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
@@ -42,6 +42,10 @@ module Ucode
|
|
|
42
42
|
autoload :Catalog, "ucode/glyphs/embedded_fonts/catalog"
|
|
43
43
|
autoload :ContentStreamCorrelator,
|
|
44
44
|
"ucode/glyphs/embedded_fonts/content_stream_correlator"
|
|
45
|
+
autoload :TraceGlyph, "ucode/glyphs/embedded_fonts/trace_glyph"
|
|
46
|
+
autoload :TraceParser, "ucode/glyphs/embedded_fonts/trace_parser"
|
|
47
|
+
autoload :TraceCorrelator, "ucode/glyphs/embedded_fonts/trace_correlator"
|
|
48
|
+
autoload :TraceRunner, "ucode/glyphs/embedded_fonts/trace_runner"
|
|
45
49
|
autoload :Svg, "ucode/glyphs/embedded_fonts/svg"
|
|
46
50
|
autoload :Renderer, "ucode/glyphs/embedded_fonts/renderer"
|
|
47
51
|
autoload :Writer, "ucode/glyphs/embedded_fonts/writer"
|