ucode 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +1 -1
- data/config/unicode17_universal_glyph_set.yml +1 -1
- data/lib/ucode/cli.rb +1 -35
- data/lib/ucode/commands/build.rb +3 -26
- data/lib/ucode/commands/canonical_build.rb +1 -4
- data/lib/ucode/commands.rb +0 -1
- data/lib/ucode/error.rb +0 -8
- data/lib/ucode/glyphs/embedded_fonts/catalog.rb +81 -4
- data/lib/ucode/glyphs/embedded_fonts/trace_correlator.rb +230 -0
- data/lib/ucode/glyphs/embedded_fonts/trace_glyph.rb +27 -0
- data/lib/ucode/glyphs/embedded_fonts/trace_parser.rb +50 -0
- data/lib/ucode/glyphs/embedded_fonts/trace_runner.rb +53 -0
- data/lib/ucode/glyphs/embedded_fonts.rb +4 -0
- data/lib/ucode/glyphs/pdf_fetcher.rb +7 -50
- data/lib/ucode/glyphs.rb +4 -14
- data/lib/ucode/version.rb +1 -1
- data/lib/ucode.rb +0 -2
- metadata +6 -15
- data/lib/ucode/commands/glyphs.rb +0 -94
- data/lib/ucode/glyphs/cell_extractor.rb +0 -130
- data/lib/ucode/glyphs/dvisvgm_renderer.rb +0 -29
- data/lib/ucode/glyphs/grid.rb +0 -30
- data/lib/ucode/glyphs/grid_detector.rb +0 -165
- data/lib/ucode/glyphs/monolith_page_map.rb +0 -181
- data/lib/ucode/glyphs/mutool_renderer.rb +0 -28
- data/lib/ucode/glyphs/page_renderer.rb +0 -234
- data/lib/ucode/glyphs/path_bbox.rb +0 -62
- data/lib/ucode/glyphs/pdf2svg_renderer.rb +0 -26
- data/lib/ucode/glyphs/pdftocairo_renderer.rb +0 -32
- data/lib/ucode/glyphs/pipeline.rb +0 -105
- data/lib/ucode/glyphs/writer.rb +0 -250
data/lib/ucode/glyphs/writer.rb
DELETED
|
@@ -1,250 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "pathname"
|
|
4
|
-
require "thread"
|
|
5
|
-
require "tmpdir"
|
|
6
|
-
require "nokogiri"
|
|
7
|
-
|
|
8
|
-
require "ucode/error"
|
|
9
|
-
require "ucode/glyphs/page_renderer"
|
|
10
|
-
require "ucode/glyphs/grid_detector"
|
|
11
|
-
require "ucode/glyphs/cell_extractor"
|
|
12
|
-
require "ucode/repo/atomic_writes"
|
|
13
|
-
require "ucode/repo/paths"
|
|
14
|
-
|
|
15
|
-
module Ucode
|
|
16
|
-
module Glyphs
|
|
17
|
-
# Writes `glyph.svg` for every codepoint in a block by orchestrating
|
|
18
|
-
# the per-block pipeline: render PDF page → detect grid → extract
|
|
19
|
-
# each cell → write atomic file.
|
|
20
|
-
#
|
|
21
|
-
# The Writer is **page-driven**: the caller hands it a `page_map`
|
|
22
|
-
# (`{ page_num => first_cp_on_that_page }`) so the writer knows what
|
|
23
|
-
# codepoint each detected cell anchor corresponds to. This is the
|
|
24
|
-
# one piece of state the Writer can't derive on its own — pdftocairo
|
|
25
|
-
# converts the row's codepoint labels to outlined glyphs, so they
|
|
26
|
-
# aren't readable as text.
|
|
27
|
-
#
|
|
28
|
-
# **Idempotent**: re-runs are no-ops via `Repo::AtomicWrites` (byte
|
|
29
|
-
# comparison; same content is skipped). Safe to re-run on the whole
|
|
30
|
-
# output tree.
|
|
31
|
-
#
|
|
32
|
-
# **Atomic**: writes go through `<path>.tmp` + rename. A crash mid-
|
|
33
|
-
# write leaves either the old file or no file, never a truncated one.
|
|
34
|
-
#
|
|
35
|
-
# **Placeholder for assigned codepoints with no glyph**: when a
|
|
36
|
-
# codepoint is listed in `block.codepoint_ids` but no cell is found
|
|
37
|
-
# on any rendered page, a small placeholder SVG is written so the
|
|
38
|
-
# site can render a "no official glyph" badge. Counted in the tally
|
|
39
|
-
# as `placeholder`.
|
|
40
|
-
#
|
|
41
|
-
# **Pure-ish**: takes a renderer instance (defaults to the first
|
|
42
|
-
# available system renderer) and a fetcher; both are injectable for
|
|
43
|
-
# tests. The only I/O is the renderer, the writer's output_root, and
|
|
44
|
-
# any optional cache.
|
|
45
|
-
class Writer
|
|
46
|
-
include Repo::AtomicWrites
|
|
47
|
-
|
|
48
|
-
PlaceholderViewBoxSize = 100
|
|
49
|
-
private_constant :PlaceholderViewBoxSize
|
|
50
|
-
|
|
51
|
-
# Builds a writer bound to one output tree.
#
# @param output_root [String, Pathname] root directory; wrapped in a Pathname
# @param renderer [Ucode::Glyphs::PageRenderer] concrete renderer; defaults
#   to `PageRenderer.default`
# @param parallel_workers [Integer] worker pool size stored for #write_all
def initialize(output_root:, renderer: PageRenderer.default, parallel_workers: 4)
  @output_root, @renderer, @parallel_workers =
    Pathname.new(output_root), renderer, parallel_workers
end
|
|
59
|
-
|
|
60
|
-
# Process every page in `page_map` through #write_page, then fill in
# placeholders for the block via #placeholder_pass.
#
# @param block [Ucode::Models::Block]
# @param pdf_path [String, Pathname, nil]
# @param page_map [Hash{Integer => Integer}] page_num => first cp on that page
# @param strict [Boolean] raise GlyphError when `pdf_path` is nil or does
#   not exist on disk. Pages on which no grid is detected never raise,
#   even in strict mode; they are only counted under `no_grid`.
# @return [Hash] tally { written: N, skipped: N, empty: N,
#   placeholder: N, no_grid: N }
# @raise [Ucode::GlyphError] when `strict` is true and the PDF is missing
def write_block(block:, pdf_path:, page_map:, strict: false)
  unless pdf_path && Pathname.new(pdf_path).exist?
    raise_missing_pdf!(block, pdf_path) if strict
    # Non-strict: record the missing PDF as a single no-grid event (same
    # helper #write_page uses) and still emit placeholders.
    return placeholder_pass(block, no_grid_tally)
  end

  tally = page_map.each_with_object(zero_tally) do |(page_num, first_cp), acc|
    merge_tally!(acc, write_page(block: block, pdf_path: pdf_path,
                                 page_num: page_num, first_cp: first_cp))
  end
  placeholder_pass(block, tally)
end
|
|
85
|
-
|
|
86
|
-
# Render a single PDF page, locate its glyph grid, and write one glyph
# file per grid cell whose codepoint the block covers.
#
# @param block [Ucode::Models::Block]
# @param pdf_path [String, Pathname]
# @param page_num [Integer] 1-based PDF page number
# @param first_cp [Integer] codepoint of the grid's top-left cell
# @return [Hash] tally; the no-grid tally when the page cannot be
#   rendered or no grid is detected on it
def write_page(block:, pdf_path:, page_num:, first_cp:)
  page_doc = render_page(pdf_path, page_num)
  return no_grid_tally unless page_doc

  layout = GridDetector.detect(page_doc, block_first_cp: first_cp)
  return no_grid_tally unless layout

  tally = zero_tally
  cells = CellExtractor.new(page_doc)
  # Row-major walk over the detected grid.
  (0...layout.rows).each do |row|
    (0...layout.columns).each do |col|
      codepoint = layout.codepoint_at(row, col)
      next if !codepoint || !block.covers?(codepoint)

      glyph = cells.extract(layout, codepoint)
      bucket =
        if glyph.nil?
          :empty
        elsif write_glyph(block, codepoint, glyph)
          :written
        else
          :skipped
        end
      tally[bucket] += 1
    end
  end
  tally
end
|
|
120
|
-
|
|
121
|
-
# Run a list of block specs, either inline or through the thread pool
# depending on the configured worker count.
#
# The specs are handed unchanged to the drain helper that is chosen:
# the inline one when at most one worker is configured, the threaded
# one otherwise.
#
# @param specs [Array<Hash>] block-spec hashes, e.g.
#   { block:, pdf_path:, page_map: }
# @return [Hash] aggregated tally across all blocks
def write_all(specs)
  single_threaded = @parallel_workers <= 1
  if single_threaded
    drain_inline(specs)
  else
    drain_threaded(specs)
  end
end
|
|
133
|
-
|
|
134
|
-
private
|
|
135
|
-
|
|
136
|
-
# Fresh tally hash with every counter at zero. A new Hash is built on
# each call, so callers may mutate the result freely.
#
# @return [Hash{Symbol => Integer}]
def zero_tally
  %i[written skipped empty placeholder no_grid].to_h { |counter| [counter, 0] }
end
|
|
139
|
-
|
|
140
|
-
# Tally describing a single "no grid" event: all counters zero except
# `no_grid`, which is 1.
#
# @return [Hash{Symbol => Integer}]
def no_grid_tally
  tally = zero_tally
  tally[:no_grid] = 1
  tally
end
|
|
143
|
-
|
|
144
|
-
# Adds every counter of `other` into `acc`, in place. Counters missing
# from `acc` start from zero.
#
# @param acc [Hash] accumulator, mutated
# @param other [Hash] tally to add
# @return [Hash] `other` (the receiver of the iteration), not `acc`
def merge_tally!(acc, other)
  other.each_pair do |counter, amount|
    running = acc[counter] || 0
    acc[counter] = running + amount
  end
end
|
|
147
|
-
|
|
148
|
-
# Processes the specs one after another on the calling thread and sums
# their tallies.
#
# @param specs [Array<Hash>] keyword hashes splatted into #write_block
# @return [Hash] aggregated tally
def drain_inline(specs)
  totals = zero_tally
  specs.each do |spec|
    block_tally = write_block(**spec)
    merge_tally!(totals, block_tally)
  end
  totals
end
|
|
153
|
-
|
|
154
|
-
# Processes the specs on a pool of `@parallel_workers` threads and sums
# their tallies.
#
# All specs are queued up front and the queue is then closed, so each
# worker simply pops until the queue reports exhaustion (nil). Updates
# to the shared tally are serialized through a mutex.
#
# @param specs [Array<Hash>] keyword hashes splatted into #write_block
# @return [Hash] aggregated tally
def drain_threaded(specs)
  totals = zero_tally
  totals_lock = Mutex.new

  jobs = Queue.new
  specs.each { |spec| jobs << spec }
  jobs.close

  pool = Array.new(@parallel_workers) do
    Thread.new do
      until (spec = jobs.pop).nil?
        outcome = write_block(**spec)
        totals_lock.synchronize { merge_tally!(totals, outcome) }
      end
    end
  end

  pool.each(&:join)
  totals
end
|
|
176
|
-
|
|
177
|
-
# Renders one PDF page to SVG inside a throwaway directory and parses
# the result.
#
# @param pdf_path [String, Pathname]
# @param page_num [Integer]
# @return [Object, nil] the value of `Nokogiri::XML` for the rendered
#   file, or nil when the renderer raises Ucode::PdfRenderError, does
#   not report `:ok`, or leaves no output file behind
def render_page(pdf_path, page_num)
  Dir.mktmpdir do |scratch|
    svg_file = File.join(scratch, "p#{page_num}.svg")

    # Graceful degradation: a broken renderer is treated like a failed
    # render, which callers turn into no_grid / placeholders.
    status =
      begin
        @renderer.render(Pathname.new(pdf_path), page_num, svg_file)
      rescue Ucode::PdfRenderError
        nil
      end

    rendered = status == :ok && File.exist?(svg_file)
    rendered ? Nokogiri::XML(File.read(svg_file)) : nil
  end
end
|
|
192
|
-
|
|
193
|
-
# Serializes one extracted cell and writes it to the glyph path derived
# from the output root, the block id and the codepoint id.
#
# @param block [#id]
# @param codepoint [Integer]
# @param cell_svg [#to_xml] extracted cell document
# @return [Object] whatever `write_atomic` returns (used by the caller
#   as a truthy "written" / falsy "skipped" flag)
def write_glyph(block, codepoint, cell_svg)
  glyph_id = Repo::Paths.cp_id(codepoint)
  target = Repo::Paths.codepoint_glyph_path(@output_root, block.id, glyph_id)
  payload = serialize_svg(cell_svg)
  write_atomic(target, payload)
end
|
|
198
|
-
|
|
199
|
-
# Writes a placeholder glyph for every codepoint id listed on the block
# that parses, is covered by the block, and has no glyph file on disk
# yet. Each placeholder actually written bumps `tally[:placeholder]`.
#
# @param block [#codepoint_ids, #covers?, #id]
# @param tally [Hash] mutated in place
# @return [Hash] the same `tally` object
def placeholder_pass(block, tally)
  ids = block.codepoint_ids
  return tally if ids.nil? || ids.empty?

  ids.each do |glyph_id|
    codepoint = cp_id_to_int(glyph_id)
    next if codepoint.nil? || !block.covers?(codepoint)

    target = Repo::Paths.codepoint_glyph_path(@output_root, block.id, glyph_id)
    # Never overwrite an existing glyph file (real or placeholder).
    next if target.exist?
    next unless write_atomic(target, placeholder_svg_payload)

    tally[:placeholder] = (tally[:placeholder] || 0) + 1
  end
  tally
end
|
|
218
|
-
|
|
219
|
-
# Converts a codepoint id of the form "U+<hex digits>" to its integer
# value.
#
# The id is validated as a whole: `String#to_i(16)` on its own would
# silently turn malformed input ("U+", "U+ZZZZ") into 0 — i.e. U+0000 —
# and would ignore trailing garbage ("U+0041xyz").
#
# @param cp_id [Object] candidate id
# @return [Integer, nil] the codepoint, or nil when `cp_id` is not a
#   String made of "U+" followed only by one or more hex digits
def cp_id_to_int(cp_id)
  return nil unless cp_id.is_a?(String) && cp_id.match?(/\AU\+\h+\z/)

  cp_id[2..].to_i(16)
end
|
|
224
|
-
|
|
225
|
-
# SVG document used when a codepoint has no official glyph: a dashed
# square with a "no glyph" label, so a consumer can show an obvious
# badge without extra state. All geometry derives from
# PlaceholderViewBoxSize.
#
# @return [String] the SVG source, newline-terminated
def placeholder_svg_payload
  side = PlaceholderViewBoxSize
  inset = side - 2   # square is drawn 1 unit inside the view box
  centre = side / 2  # label anchor, both axes
  <<~SVG
    <?xml version="1.0" encoding="UTF-8"?>
    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 #{side} #{side}" width="#{side}" height="#{side}">
    <rect x="1" y="1" width="#{inset}" height="#{inset}" fill="none" stroke="#999" stroke-width="1" stroke-dasharray="4 4"/>
    <text x="#{centre}" y="#{centre}" font-family="sans-serif" font-size="14" text-anchor="middle" dominant-baseline="middle" fill="#999">no glyph</text>
    </svg>
  SVG
end
|
|
237
|
-
|
|
238
|
-
# Serializes a document to XML text with leading and trailing
# whitespace removed.
#
# @param doc [#to_xml]
# @return [String]
def serialize_svg(doc)
  markup = doc.to_xml
  markup.strip
end
|
|
241
|
-
|
|
242
|
-
# Raises the error used when a block has no usable PDF.
#
# @param block [#id]
# @param pdf_path [String, Pathname, nil] the path that was looked up;
#   recorded in the error context as a String (nil stays nil)
# @raise [Ucode::GlyphError] always
def raise_missing_pdf!(block, pdf_path)
  details = { block_id: block.id, pdf_path: pdf_path&.to_s }
  error = Ucode::GlyphError.new("no PDF available for block '#{block.id}'", context: details)
  raise error
end
|
|
248
|
-
end
|
|
249
|
-
end
|
|
250
|
-
end
|