ucode 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +1 -1
- data/config/unicode17_universal_glyph_set.yml +1 -1
- data/lib/ucode/cli.rb +1 -35
- data/lib/ucode/commands/build.rb +3 -26
- data/lib/ucode/commands/canonical_build.rb +1 -4
- data/lib/ucode/commands.rb +0 -1
- data/lib/ucode/error.rb +0 -8
- data/lib/ucode/glyphs/embedded_fonts/catalog.rb +81 -4
- data/lib/ucode/glyphs/embedded_fonts/trace_correlator.rb +230 -0
- data/lib/ucode/glyphs/embedded_fonts/trace_glyph.rb +27 -0
- data/lib/ucode/glyphs/embedded_fonts/trace_parser.rb +50 -0
- data/lib/ucode/glyphs/embedded_fonts/trace_runner.rb +53 -0
- data/lib/ucode/glyphs/embedded_fonts.rb +4 -0
- data/lib/ucode/glyphs/pdf_fetcher.rb +7 -50
- data/lib/ucode/glyphs.rb +4 -14
- data/lib/ucode/version.rb +1 -1
- data/lib/ucode.rb +0 -2
- metadata +6 -15
- data/lib/ucode/commands/glyphs.rb +0 -94
- data/lib/ucode/glyphs/cell_extractor.rb +0 -130
- data/lib/ucode/glyphs/dvisvgm_renderer.rb +0 -29
- data/lib/ucode/glyphs/grid.rb +0 -30
- data/lib/ucode/glyphs/grid_detector.rb +0 -165
- data/lib/ucode/glyphs/monolith_page_map.rb +0 -181
- data/lib/ucode/glyphs/mutool_renderer.rb +0 -28
- data/lib/ucode/glyphs/page_renderer.rb +0 -234
- data/lib/ucode/glyphs/path_bbox.rb +0 -62
- data/lib/ucode/glyphs/pdf2svg_renderer.rb +0 -26
- data/lib/ucode/glyphs/pdftocairo_renderer.rb +0 -32
- data/lib/ucode/glyphs/pipeline.rb +0 -105
- data/lib/ucode/glyphs/writer.rb +0 -250
|
@@ -1,181 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "pathname"
|
|
4
|
-
require "json"
|
|
5
|
-
require "open3"
|
|
6
|
-
|
|
7
|
-
module Ucode
|
|
8
|
-
module Glyphs
|
|
9
|
-
# Maps a Unicode block's first codepoint to its page range inside the
|
|
10
|
-
# monolith `CodeCharts.pdf` by parsing the PDF's bookmark outline and
|
|
11
|
-
# matching each bookmark title to a Block.name from `Blocks.txt`.
|
|
12
|
-
#
|
|
13
|
-
# Each chart cluster printed by the Unicode Consortium is a single
|
|
14
|
-
# bookmark entry:
|
|
15
|
-
#
|
|
16
|
-
# BookmarkTitle: Greek and Coptic
|
|
17
|
-
# BookmarkLevel: 1
|
|
18
|
-
# BookmarkPageNumber: 415
|
|
19
|
-
#
|
|
20
|
-
# The cluster title usually equals a Block.name verbatim, but a few
|
|
21
|
-
# clusters carry a heading that prepends "C0 Controls and " /
|
|
22
|
-
# "C1 Controls and " to the block name. We resolve both forms.
|
|
23
|
-
#
|
|
24
|
-
# End-page of a cluster is one page before the next cluster's start
|
|
25
|
-
# page (last cluster's end-page is the PDF's last page).
|
|
26
|
-
#
|
|
27
|
-
# The map is cached as JSON at `data/codecharts_page_map.json` so
|
|
28
|
-
# we don't re-scan the 3,156-page monolith on every run.
|
|
29
|
-
class MonolithPageMap
  # Patterns for the two `pdftk dump_data` lines this class reads; every
  # other line of the dump is ignored.
  BookmarkTitleRegex = /BookmarkTitle:\s*(.+)/.freeze
  BookmarkPageRegex = /BookmarkPageNumber:\s*(\d+)/.freeze
  private_constant :BookmarkTitleRegex, :BookmarkPageRegex

  # The Unicode charts print these multi-block clusters as a single
  # chart page (the C0/C1 control chars are drawn alongside their
  # block's other characters). Each cluster title maps to the single
  # block it belongs to.
  ClusterPrefixes = [
    "C0 Controls and ",
    "C1 Controls and ",
  ].freeze
  private_constant :ClusterPrefixes

  # One block's page range inside the monolith PDF.
  MapEntry = Struct.new(:first_cp, :start_page, :end_page, keyword_init: true)

  class << self
    # Build the map by parsing the monolith's outline and matching
    # each bookmark title to a Block.
    #
    # @param monolith_path [String, Pathname]
    # @param blocks [Array<Ucode::Models::Block>] the parsed Blocks table
    # @return [Hash{Integer => MapEntry}] keyed by block.range_first;
    #   empty when the outline could not be read or nothing matched
    def build(monolith_path:, blocks:)
      name_to_first_cp = blocks.each_with_object({}) do |block, names|
        names[block.name] = block.range_first
      end
      total_pages = page_count(monolith_path)
      entries = parse_bookmarks(dump_bookmarks(monolith_path), name_to_first_cp)
      attach_end_pages(entries, total_pages)
      entries.each_with_object({}) do |entry, map|
        map[entry.first_cp] = entry
      end
    end

    # Pure: parse a `pdftk dump_data` string into a list of
    # MapEntry rows (without end_pages). Exposed for unit tests
    # and any caller that already has the dump cached.
    #
    # A title is paired with the next page-number line that follows it;
    # bookmarks whose title does not resolve to a block are skipped.
    #
    # @param dump [String] the raw `pdftk dump_data` output
    # @param name_to_first_cp [Hash{String => Integer}]
    # @return [Array<MapEntry>] sorted by start_page
    def parse_bookmarks(dump, name_to_first_cp)
      entries = []
      current_title = nil
      dump.each_line do |line|
        case line
        when BookmarkTitleRegex
          current_title = Regexp.last_match(1).strip
        when BookmarkPageRegex
          page = Regexp.last_match(1).to_i
          cp = resolve_first_cp(current_title, name_to_first_cp)
          entries << MapEntry.new(first_cp: cp, start_page: page) if cp
          # A title is consumed by the first page number after it.
          current_title = nil
        end
      end
      entries.sort_by(&:start_page)
    end

    # Pure: attach end_pages by sorting entries and assigning each
    # entry's end to one page before the next entry's start.
    #
    # @param entries [Array<MapEntry>]
    # @param total_pages [Integer, nil] page count of the source PDF;
    #   the last entry's end_page is set to this (nil when unknown).
    # @return [Array<MapEntry>] the same entries (mutated with
    #   end_pages), in start_page order.
    def attach_end_pages(entries, total_pages = nil)
      sorted = entries.sort_by(&:start_page)
      sorted.each_with_index do |entry, index|
        following = sorted[index + 1]
        entry.end_page = following ? following.start_page - 1 : total_pages
      end
      sorted
    end

    # Load from cache, or build and cache.
    #
    # An unparseable cache file is treated as a cache miss and rebuilt,
    # and an empty map is never written to the cache: an empty result
    # is what a failed `pdftk` run produces, and persisting it would
    # make that failure permanent for every later run.
    #
    # @param monolith_path [String, Pathname]
    # @param blocks [Array<Ucode::Models::Block>]
    # @param cache_path [String, Pathname, nil] nil disables caching
    # @return [Hash{Integer => MapEntry}]
    def load(monolith_path:, blocks:, cache_path: nil)
      cache = cache_path && Pathname.new(cache_path)
      cached = read_cache(cache)
      return cached if cached

      map = build(monolith_path: monolith_path, blocks: blocks)
      write_cache(map, cache) if cache && !map.empty?
      map
    end

    # Look up a block's page range by its first cp.
    # @param map [Hash{Integer => MapEntry}]
    # @param block_first_cp [Integer]
    # @return [MapEntry, nil]
    def range_for(map, block_first_cp)
      map[block_first_cp]
    end

    # ---- I/O helpers (impure) --------------------------------------

    # Run `pdftk <pdf> dump_data` and return its combined output.
    #
    # @param monolith_path [String, Pathname]
    # @return [String] the dump, or "" when pdftk exits non-zero or
    #   cannot be started at all (e.g. not installed)
    def dump_bookmarks(monolith_path)
      out, status = Open3.capture2e("pdftk", monolith_path.to_s, "dump_data")
      status.success? ? out : ""
    rescue SystemCallError
      # Spawning failed (binary missing, not executable, ...): treat it
      # the same as a failing exit status.
      ""
    end

    # Read the page count from the `Pages:` line of `pdfinfo <pdf>`.
    #
    # @param monolith_path [String, Pathname]
    # @return [Integer, nil] nil when pdfinfo fails, cannot be started,
    #   or prints no `Pages:` line
    def page_count(monolith_path)
      out, status = Open3.capture2e("pdfinfo", monolith_path.to_s)
      return nil unless status.success?

      match = out.match(/^Pages:\s+(\d+)/)
      match ? match[1].to_i : nil
    rescue SystemCallError
      nil
    end

    private

    # Resolve a bookmark title to a block's first codepoint, trying the
    # title verbatim first and then with a cluster prefix removed.
    #
    # @param title [String, nil]
    # @param name_to_first_cp [Hash{String => Integer}]
    # @return [Integer, nil]
    def resolve_first_cp(title, name_to_first_cp)
      return nil unless title
      return name_to_first_cp[title] if name_to_first_cp.key?(title)

      ClusterPrefixes.each do |prefix|
        next unless title.start_with?(prefix)

        stripped = title.delete_prefix(prefix)
        return name_to_first_cp[stripped] if name_to_first_cp.key?(stripped)
      end

      nil
    end

    # @param cache [Pathname, nil]
    # @return [Hash{Integer => MapEntry}, nil] nil when there is no
    #   cache file or its content is not valid JSON
    def read_cache(cache)
      return nil unless cache&.exist?

      load_from_json(cache.read)
    rescue JSON::ParserError
      nil
    end

    # Serialize the map as a JSON array of rows, creating the cache
    # directory when needed.
    #
    # @param map [Hash{Integer => MapEntry}]
    # @param cache_path [Pathname]
    def write_cache(map, cache_path)
      payload = map.values.map do |entry|
        { "first_cp" => entry.first_cp,
          "start_page" => entry.start_page,
          "end_page" => entry.end_page }
      end
      cache_path.dirname.mkpath
      cache_path.write(JSON.pretty_generate(payload))
    end

    # Inverse of {write_cache}.
    #
    # @param json [String]
    # @return [Hash{Integer => MapEntry}]
    # @raise [JSON::ParserError] when `json` is not valid JSON
    def load_from_json(json)
      payload = JSON.parse(json)
      payload.each_with_object({}) do |row, map|
        entry = MapEntry.new(first_cp: row["first_cp"],
                             start_page: row["start_page"],
                             end_page: row["end_page"])
        map[entry.first_cp] = entry
      end
    end
  end
end
|
|
180
|
-
end
|
|
181
|
-
end
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "ucode/glyphs/page_renderer"
|
|
4
|
-
|
|
5
|
-
module Ucode
|
|
6
|
-
module Glyphs
|
|
7
|
-
# `mutool draw` from MuPDF — typically the fastest and cleanest.
|
|
8
|
-
# Emits one `<svg>` per page with `<path>` vector data.
|
|
9
|
-
#
|
|
10
|
-
# Command: `mutool draw -F svg -o <out.svg> <in.pdf> <page>`
|
|
11
|
-
class MutoolRenderer < PageRenderer
  # @return [Symbol] short identifier for this renderer
  def self.renderer_name
    :mutool
  end

  # @return [Symbol] name of the executable this renderer runs
  def self.binary_name
    :mutool
  end

  # Argv for `mutool draw -F svg -o <out.svg> <in.pdf> <page>`.
  #
  # @param pdf_path [Pathname, String]
  # @param page_num [Integer]
  # @param out_path [Pathname, String]
  # @return [Array<String>]
  def self.build_command(pdf_path, page_num, out_path)
    target = out_path.to_s
    source = pdf_path.to_s
    ["mutool", "draw", "-F", "svg", "-o", target, source, page_num.to_s]
  end
end
|
|
27
|
-
end
|
|
28
|
-
end
|
|
@@ -1,234 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "open3"
|
|
4
|
-
require "pathname"
|
|
5
|
-
require "tmpdir"
|
|
6
|
-
|
|
7
|
-
require "ucode/error"
|
|
8
|
-
|
|
9
|
-
module Ucode
|
|
10
|
-
module Glyphs
|
|
11
|
-
# Strategy interface for PDF-page-to-SVG rendering.
|
|
12
|
-
#
|
|
13
|
-
# Subclasses implement `renderer_name`, `binary_name`, and
|
|
14
|
-
# `build_command`. The base class handles availability check,
|
|
15
|
-
# command execution, error handling, and the renderer registry.
|
|
16
|
-
#
|
|
17
|
-
# **OCP**: a new renderer is a new subclass file + one entry in
|
|
18
|
-
# `KNOWN_RENDERERS`. The base class and existing renderers are not
|
|
19
|
-
# modified.
|
|
20
|
-
#
|
|
21
|
-
# **Vector-only requirement**: every renderer here must emit SVG
|
|
22
|
-
# `<path>` elements (vector data) for the Code Charts PDFs, not
|
|
23
|
-
# raster images. Callers verify this via `path_count` on the output.
|
|
24
|
-
class PageRenderer
  OUTPUT_FORMAT = :svg

  # Fixture used by `works?` to smoke-test renderers. Only the path is
  # computed at load time; the file's existence is checked when `works?`
  # runs, so missing-fixture environments (installed gem without spec
  # assets) don't fail at load time.
  DEFAULT_SMOKE_FIXTURE =
    File.expand_path("../../../spec/fixtures/pdfs/basic_latin.pdf", __dir__)

  # Ordered list of known concrete renderer class names (as symbols),
  # most-preferred first. Resolved lazily via `const_get` so that
  # loading any one renderer does not eagerly load all of them — this
  # avoids a circular require (each renderer file requires this file
  # to inherit from PageRenderer).
  KNOWN_RENDERERS = %i[
    MutoolRenderer
    Pdf2svgRenderer
    DvisvgmRenderer
    PdftocairoRenderer
  ].freeze
  private_constant :KNOWN_RENDERERS

  class << self
    # @return [Symbol] short identifier (e.g. :mutool)
    def renderer_name
      raise NotImplementedError
    end

    # @return [String, Symbol] the binary looked up on PATH
    def binary_name
      raise NotImplementedError
    end

    # @return [Symbol] always :svg for now; future formats (png, etc.)
    #   would warrant a separate renderer family.
    def output_format
      OUTPUT_FORMAT
    end

    # Build the argv for the renderer. Subclasses return an Array
    # suitable for `Open3.capture2e` (no shell interpolation).
    # @param pdf_path [Pathname, String]
    # @param page_num [Integer] 1-indexed
    # @param out_path [Pathname, String]
    # @return [Array<String>]
    def build_command(pdf_path, page_num, out_path)
      raise NotImplementedError
    end

    # Whether `binary_name` resolves to an executable file.
    #
    # The lookup walks PATH in-process instead of shelling out to
    # `which`/`where`: `render` calls this before every page, so a
    # subprocess per call is wasteful, and the result no longer depends
    # on a lookup tool being installed or on a shell parsing the name.
    #
    # @return [Boolean] true if the binary is on PATH (or, when
    #   `binary_name` contains a path separator, exists at that path)
    def available?
      !executable_path(binary_name.to_s).nil?
    end

    # Smoke-test the binary by actually rendering one page of the
    # fixture PDF AND verifying the output format is consumable by
    # the downstream `GridDetector` / `CellExtractor` pipeline.
    #
    # Three things can make a renderer unusable for this codebase:
    #   1. Binary not on PATH (`available?` catches this).
    #   2. Binary on PATH but silently broken (e.g. Ubuntu's
    #      `mupdf-tools` is built without LCMS, so `mutool` warns
    #      "ICC support is not available" and emits zero bytes for
    #      ICC-profiled PDFs).
    #   3. Binary works but emits a flat-path SVG that GridDetector
    #      can't parse (mutool's format: `<path id="font_X_Y">`
    #      directly in `<defs>`, no `<use>` references). The grid
    #      detector requires the `<g id="glyph-N-M">` + `<use>` form
    #      produced by pdftocairo / pdf2svg.
    #
    # This method renders on every call; the per-process caching lives
    # in `working`, which stores the list of renderers that passed.
    #
    # When no fixture PDF is available (e.g. installed gem without
    # spec assets), degrades to `available?` — we can't smoke-test
    # without input, so we trust the binary's presence on PATH.
    #
    # @param fixture_pdf [String, Pathname] small one-page PDF used
    #   for the smoke render. Defaults to the project's
    #   `basic_latin.pdf` spec fixture.
    # @return [Boolean]
    def works?(fixture_pdf: DEFAULT_SMOKE_FIXTURE)
      return false unless available?
      return true unless File.exist?(fixture_pdf.to_s) # nothing to verify against; trust PATH

      smoke_render_ok?(fixture_pdf)
    end

    # Render one page of `pdf_path` to `out_path` as SVG. The output
    # directory is created when missing.
    # @param pdf_path [Pathname, String]
    # @param page_num [Integer] 1-indexed
    # @param out_path [Pathname, String]
    # @return [Symbol] :ok on success
    # @raise [Ucode::PdfRenderError] on failure (non-zero exit,
    #   output file missing or empty, or binary unavailable)
    def render(pdf_path, page_num, out_path)
      unless available?
        raise PdfRenderError.new(
          "binary '#{binary_name}' not available on PATH",
          context: { renderer: name, binary: binary_name },
        )
      end

      out = Pathname.new(out_path)
      out.dirname.mkpath

      cmd = build_command(Pathname.new(pdf_path), page_num, out)
      output, status = Open3.capture2e(*cmd)

      # A zero exit status is not enough: some builds exit cleanly
      # while writing nothing, so the file must exist and be non-empty.
      unless status.success? && out.exist? && out.size.positive?
        raise PdfRenderError.new(
          "render failed for page #{page_num} of #{pdf_path} via '#{binary_name}'",
          context: {
            renderer: name,
            binary: binary_name,
            exit_status: status.exitstatus,
            output: output,
          },
        )
      end

      :ok
    end

    # ---- Registry ----

    # @return [Array<Class>] every known concrete renderer
    def all
      @all ||= KNOWN_RENDERERS.map { |n| Ucode::Glyphs.const_get(n) }.freeze
    end

    # @return [Array<Class>] renderers whose binary is installed
    def available
      all.select(&:available?)
    end

    # @return [Array<Class>] renderers that actually produce SVG in
    #   the format `GridDetector` consumes (smoke-tested once per
    #   process via `works?`, then cached). Subset of `available`.
    def working
      return @working if @working

      @working = all.select(&:works?).freeze
    end

    # Clear the cached `working` list. Useful when the environment
    # changes (e.g. a binary is installed mid-process) or in tests.
    def reset_working_cache!
      @working = nil
    end

    # @param name [Symbol, String]
    # @return [Class, nil]
    def find(name)
      wanted = name.to_sym
      all.find { |renderer| renderer.renderer_name == wanted }
    end

    # @return [Class, nil] the first renderer that passed the smoke
    #   test; when none passed, the first renderer whose binary is
    #   installed. nil if nothing is installed.
    def default
      working.first || available.first
    end

    private

    # Locate an executable the way a shell would.
    #
    # @param name [String] bare command name, or a path containing a
    #   separator (checked as-is, without consulting PATH)
    # @return [String, nil] path of the first executable regular file
    #   found, nil when there is none
    def executable_path(name)
      return nil if name.empty?

      has_separator = name.include?(File::SEPARATOR) ||
                      (File::ALT_SEPARATOR && name.include?(File::ALT_SEPARATOR))
      bases =
        if has_separator
          [name]
        else
          ENV.fetch("PATH", "").split(File::PATH_SEPARATOR)
             .reject(&:empty?)
             .map { |dir| File.join(dir, name) }
        end

      bases.each do |base|
        executable_suffixes.each do |suffix|
          candidate = "#{base}#{suffix}"
          # Directories are "executable" too, so require a regular file.
          return candidate if File.file?(candidate) && File.executable?(candidate)
        end
      end
      nil
    end

    # @return [Array<String>] filename suffixes to try; on Windows the
    #   PATHEXT extensions in addition to the bare name
    def executable_suffixes
      return [""] unless Gem.win_platform?

      [""] + ENV.fetch("PATHEXT", ".COM;.EXE;.BAT;.CMD").split(";")
    end

    # @param fixture_pdf [String] path to an existing PDF
    # @return [Boolean] true iff rendering page 1 produces an SVG
    #   with the `<g id="glyph-N-M">` + `<use>` form that
    #   `GridDetector` requires.
    def smoke_render_ok?(fixture_pdf)
      Dir.mktmpdir("renderer-smoke-") do |dir|
        out = File.join(dir, "smoke.svg")
        begin
          render(fixture_pdf, 1, out)
        rescue PdfRenderError
          break false
        end
        svg_has_pipeline_format?(out)
      end
    end

    # @param out_path [String] path of a rendered SVG
    # @return [Boolean] true when the file is non-empty and contains an
    #   `<svg`, a `<use`, and an `id="glyph-N-M"` attribute
    def svg_has_pipeline_format?(out_path)
      return false unless File.exist?(out_path)
      return false unless File.size(out_path).positive?

      body = File.read(out_path)
      body.include?("<svg") &&
        body.include?("<use") &&
        body.match?(/id="glyph-\d+-\d+"/)
    end
  end
end
|
|
233
|
-
end
|
|
234
|
-
end
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Ucode
|
|
4
|
-
module Glyphs
|
|
5
|
-
# Estimates the axis-aligned bounding box of an SVG `<path>` `d`
|
|
6
|
-
# attribute by scanning every numeric coordinate pair in the path
|
|
7
|
-
# data. This is a conservative over-estimate: control points and
|
|
8
|
-
# implicit vertices are included, so the true curve bbox is always
|
|
9
|
-
# contained within the estimate. For grid detection and cell
|
|
10
|
-
# membership tests, the over-estimate is sufficient and avoids the
|
|
11
|
-
# cost of a Bezier solver.
|
|
12
|
-
#
|
|
13
|
-
# Only absolute coordinates are returned. Relative commands (lowercase
|
|
14
|
-
# `m`, `l`, `c`, …) are NOT supported — Code Charts SVGs from every
|
|
15
|
-
# supported renderer (pdftocairo, pdf2svg, dvisvgm, mutool) emit
|
|
16
|
-
# absolute commands. If relative commands appear, parse them via a
|
|
17
|
-
# proper SVG path parser before calling this.
|
|
18
|
-
module PathBbox
  # One numeric token of SVG path data: optional sign, then either
  # digits with an optional fraction or a bare fraction (`.5`), then an
  # optional exponent. The bare-fraction form matters for compact path
  # data such as `M.5-.25`, where it would otherwise be read as `5`.
  NUMBER = /-?(?:\d+(?:\.\d+)?|\.\d+)(?:[eE][-+]?\d+)?/.freeze

  # Axis-aligned bounding box. All four fields are nil when no
  # coordinates were found.
  Result = Struct.new(:min_x, :min_y, :max_x, :max_y, keyword_init: true) do
    # @return [Float, nil] nil when the box is empty
    def width
      return nil if empty?

      max_x - min_x
    end

    # @return [Float, nil] nil when the box is empty
    def height
      return nil if empty?

      max_y - min_y
    end

    # @return [Boolean] true when any bound is missing
    def empty?
      min_x.nil? || min_y.nil? || max_x.nil? || max_y.nil?
    end
  end

  class << self
    # Estimate the bounding box of a path's `d` attribute by pairing up
    # every number in it as (x, y), in order of appearance.
    #
    # When the data holds an odd count of numbers (for example because
    # of a single-coordinate `H`/`V` command), the unpaired trailing
    # value is counted as an x and contributes no y, instead of making
    # the min/max computation fail on a nil.
    #
    # @param path_d [String, nil] SVG path data
    # @return [Result] empty when `path_d` is nil, blank, or has no
    #   complete coordinate pair
    def estimate(path_d)
      return Result.new if path_d.nil? || path_d.empty?

      numbers = path_d.scan(NUMBER).map(&:to_f)
      return Result.new if numbers.empty?

      xs = []
      ys = []
      numbers.each_slice(2) do |x, y|
        xs << x
        ys << y unless y.nil?
      end

      min_x, max_x = xs.minmax
      min_y, max_y = ys.minmax
      Result.new(min_x: min_x, min_y: min_y, max_x: max_x, max_y: max_y)
    end
  end
end
|
|
61
|
-
end
|
|
62
|
-
end
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "ucode/glyphs/page_renderer"
|
|
4
|
-
|
|
5
|
-
module Ucode
|
|
6
|
-
module Glyphs
|
|
7
|
-
# `pdf2svg` — simple, widely available. One SVG per page.
|
|
8
|
-
#
|
|
9
|
-
# Command: `pdf2svg <in.pdf> <out.svg> <page>`
|
|
10
|
-
class Pdf2svgRenderer < PageRenderer
  # @return [Symbol] short identifier for this renderer
  def self.renderer_name
    :pdf2svg
  end

  # @return [Symbol] name of the executable this renderer runs
  def self.binary_name
    :pdf2svg
  end

  # Argv for `pdf2svg <in.pdf> <out.svg> <page>`.
  #
  # @param pdf_path [Pathname, String]
  # @param page_num [Integer]
  # @param out_path [Pathname, String]
  # @return [Array<String>]
  def self.build_command(pdf_path, page_num, out_path)
    arguments = [pdf_path, out_path, page_num].map(&:to_s)
    ["pdf2svg", *arguments]
  end
end
|
|
25
|
-
end
|
|
26
|
-
end
|
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "ucode/glyphs/page_renderer"
|
|
4
|
-
|
|
5
|
-
module Ucode
|
|
6
|
-
module Glyphs
|
|
7
|
-
# `pdftocairo -svg` from the Poppler suite. Available on macOS via
|
|
8
|
-
# `brew install poppler`. Slower than `mutool` but widely available.
|
|
9
|
-
#
|
|
10
|
-
# Command: `pdftocairo -svg -f <n> -l <n> <in.pdf> <out.svg>`
|
|
11
|
-
#
|
|
12
|
-
# The `-f`/`-l` pair restricts rendering to one page (first/last).
|
|
13
|
-
class PdftocairoRenderer < PageRenderer
  # @return [Symbol] short identifier for this renderer
  def self.renderer_name
    :pdftocairo
  end

  # @return [Symbol] name of the executable this renderer runs
  def self.binary_name
    :pdftocairo
  end

  # Argv for `pdftocairo -svg -f <n> -l <n> <in.pdf> <out.svg>`.
  # The same page number is passed as both first (`-f`) and last (`-l`).
  #
  # @param pdf_path [Pathname, String]
  # @param page_num [Integer]
  # @param out_path [Pathname, String]
  # @return [Array<String>]
  def self.build_command(pdf_path, page_num, out_path)
    first_page = page_num.to_s
    last_page = page_num.to_s
    ["pdftocairo", "-svg", "-f", first_page, "-l", last_page, pdf_path.to_s, out_path.to_s]
  end
end
|
|
31
|
-
end
|
|
32
|
-
end
|
|
@@ -1,105 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "pathname"
|
|
4
|
-
|
|
5
|
-
require "ucode/cache"
|
|
6
|
-
require "ucode/glyphs/pdf_fetcher"
|
|
7
|
-
require "ucode/glyphs/writer"
|
|
8
|
-
require "ucode/parsers"
|
|
9
|
-
|
|
10
|
-
module Ucode
|
|
11
|
-
module Glyphs
|
|
12
|
-
# Assembles the per-block specs that {Glyphs::Writer#write_all} drains.
|
|
13
|
-
#
|
|
14
|
-
# Owns three pieces of orchestration that {Commands::GlyphsCommand}
|
|
15
|
-
# used to carry inline:
|
|
16
|
-
#
|
|
17
|
-
# - block loading from {Cache.ucd_dir}/Blocks.txt (with an optional
|
|
18
|
-
# block filter)
|
|
19
|
-
# - PDF fetcher construction (with monolith fallback)
|
|
20
|
-
# - the per-block page-map heuristic (per-block PDFs are page 1 =
|
|
21
|
-
# title, page 2 = first chart page starting at the block's first
|
|
22
|
-
# codepoint; true for most BMP blocks; multi-page blocks need a
|
|
23
|
-
# richer resolver — mismatches yield placeholder SVGs only, never
|
|
24
|
-
# wrong glyphs)
|
|
25
|
-
#
|
|
26
|
-
# The Command stays a thin wrapper that prints the experimental
|
|
27
|
-
# warning and wires the writer. See Candidate 3 of the 2026-06-29
|
|
28
|
-
# architecture review.
|
|
29
|
-
class Pipeline
  # Path to the monolith fallback file when no per-block PDF is on
  # disk yet. Overridable for tests.
  DEFAULT_MONOLITH_PATH = "CodeCharts.pdf"
  # Cache path for the page-map corpus. Overridable for tests.
  DEFAULT_PAGE_MAP_CACHE = "data/codecharts_page_map.json"

  # One unit of work: a block, the PDF that holds its chart, and a
  # map from PDF page number to the first codepoint drawn on that page.
  Spec = Struct.new(:block, :pdf_path, :page_map, keyword_init: true)

  # @param version [String] resolved UCD version (callers must
  #   resolve via {VersionResolver.resolve} first)
  # @param block_filter [Array<String>, nil] block ids to limit to;
  #   nil or empty = every block
  # @param monolith_path [String, Pathname, nil] fallback monolith;
  #   nil disables the fallback
  # @param page_map_cache [String, Pathname] cache for the page map
  def initialize(version:, block_filter: nil,
                 monolith_path: DEFAULT_MONOLITH_PATH,
                 page_map_cache: DEFAULT_PAGE_MAP_CACHE)
    @version = version
    @block_filter = block_filter
    @monolith_path = monolith_path
    @page_map_cache = page_map_cache
  end

  # Load every block from the cached Blocks.txt (filtered by
  # `@block_filter` when set) and pair each one with a fetched PDF
  # path and a page map. Blocks whose PDF cannot be fetched are
  # silently dropped — the placeholder pass downstream covers them.
  #
  # @param force [Boolean] re-fetch PDFs even when cached
  # @return [Array<Spec>]
  def build_specs(force: false)
    blocks = load_blocks
    fetcher = build_fetcher(blocks)
    blocks.map { |block| spec_for(block, fetcher, force) }.compact
  end

  private

  # @return [Array] parsed block records, narrowed to `@block_filter`
  #   when one was given; empty when Blocks.txt is not cached
  def load_blocks
    path = Cache.ucd_dir(@version).join("Blocks.txt")
    return [] unless path.exist?

    all = Parsers::Blocks.each_record(path).to_a
    return all if @block_filter.nil? || @block_filter.empty?

    wanted = @block_filter.to_set
    all.select { |block| wanted.include?(block.id) }
  end

  # Construct the fetcher, handing it the monolith only when one was
  # configured AND the file is actually on disk.
  #
  # @param blocks [Array] the blocks the fetcher will be asked about
  # @return [PdfFetcher]
  def build_fetcher(blocks)
    monolith = @monolith_path && Pathname.new(@monolith_path)
    # `&.` keeps a nil monolith_path (fallback disabled) from raising
    # NoMethodError on `exist?`.
    monolith = nil unless monolith&.exist?

    PdfFetcher.new(
      @version,
      monolith_path: monolith,
      blocks: blocks,
      page_map_cache: @page_map_cache,
    )
  end

  # @return [Spec, nil] nil when the fetcher has no PDF for the block
  def spec_for(block, fetcher, force)
    pdf_path = fetcher.fetch(block_first_cp: block.range_first, force: force)
    return nil unless pdf_path

    Spec.new(block: block, pdf_path: pdf_path, page_map: page_map_for(block))
  end

  # Per-block PDFs are page 1 = title, page 2 = first chart page
  # starting at the block's first codepoint. True for most BMP
  # blocks; multi-page blocks (CJK) need a richer resolver.
  def page_map_for(block)
    { 2 => block.range_first }
  end
end
|
|
104
|
-
end
|
|
105
|
-
end
|