ucode 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,250 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "pathname"
4
- require "thread"
5
- require "tmpdir"
6
- require "nokogiri"
7
-
8
- require "ucode/error"
9
- require "ucode/glyphs/page_renderer"
10
- require "ucode/glyphs/grid_detector"
11
- require "ucode/glyphs/cell_extractor"
12
- require "ucode/repo/atomic_writes"
13
- require "ucode/repo/paths"
14
-
15
module Ucode
  module Glyphs
    # Writes `glyph.svg` for every codepoint in a block by orchestrating
    # the per-block pipeline: render PDF page → detect grid → extract
    # each cell → write the file through `write_atomic`.
    #
    # The Writer is **page-driven**: the caller hands it a `page_map`
    # (`{ page_num => first_cp_on_that_page }`) so the writer knows what
    # codepoint each detected cell anchor corresponds to. This is the
    # one piece of state the Writer can't derive on its own — pdftocairo
    # converts the row's codepoint labels to outlined glyphs, so they
    # aren't readable as text.
    #
    # **Idempotent / atomic writes**: every file goes through
    # `Repo::AtomicWrites#write_atomic`. This class relies on that helper
    # returning a truthy value only when it actually wrote something; a
    # falsy result is tallied as `skipped`.
    #
    # **Placeholder for assigned codepoints with no glyph**: when a
    # codepoint is listed in `block.codepoint_ids` but no glyph file
    # exists after all pages were processed, a small placeholder SVG is
    # written so the site can render a "no official glyph" badge. Counted
    # in the tally as `placeholder`.
    #
    # **Injectable renderer**: the renderer is passed to the constructor
    # (defaults to `PageRenderer.default`) so tests can substitute a fake.
    class Writer
      include Repo::AtomicWrites

      # Edge length of the square placeholder SVG (used for viewBox,
      # width and height).
      PLACEHOLDER_VIEW_BOX_SIZE = 100
      private_constant :PLACEHOLDER_VIEW_BOX_SIZE

      # Strict shape of a codepoint id: "U+" followed by hex digits only.
      CP_ID_PATTERN = /\AU\+(\h+)\z/
      private_constant :CP_ID_PATTERN

      # @param output_root [String, Pathname] root directory glyph paths are
      #   resolved against
      # @param renderer [#render] object called as
      #   `render(pdf_pathname, page_num, out_path)`
      # @param parallel_workers [Integer] worker pool size for #write_all
      def initialize(output_root:, renderer: PageRenderer.default, parallel_workers: 4)
        @output_root = Pathname.new(output_root)
        @renderer = renderer
        @parallel_workers = parallel_workers
      end

      # Process every page in `page_map`, writing glyph.svg for each
      # codepoint that (a) falls inside the block's range and (b) has a
      # detectable glyph on the page. Afterwards, placeholders are written
      # for assigned codepoints that still have no glyph file.
      #
      # @param block [Ucode::Models::Block]
      # @param pdf_path [String, Pathname, nil]
      # @param page_map [Hash{Integer => Integer}] page_num => first cp on that page
      # @param strict [Boolean] raise GlyphError when the PDF is missing.
      #   When false, a missing PDF yields a tally with `no_grid` set and
      #   placeholders for assigned cps. Pages on which no grid is detected
      #   never raise; they are only counted under `no_grid`.
      # @return [Hash] tally { written: N, skipped: N, empty: N,
      #   placeholder: N, no_grid: N }
      # @raise [Ucode::GlyphError] if `strict` and the PDF does not exist
      def write_block(block:, pdf_path:, page_map:, strict: false)
        unless pdf_path && Pathname.new(pdf_path).exist?
          raise_missing_pdf!(block, pdf_path) if strict
          return placeholder_pass(block, no_grid_tally)
        end

        tally = page_map.each_with_object(zero_tally) do |(page_num, first_cp), acc|
          merge_tally!(acc, write_page(block: block, pdf_path: pdf_path,
                                       page_num: page_num, first_cp: first_cp))
        end
        placeholder_pass(block, tally)
      end

      # Render one page, detect its grid, write every cell whose codepoint
      # falls inside `block`'s range.
      #
      # @param block [Ucode::Models::Block]
      # @param pdf_path [String, Pathname]
      # @param page_num [Integer] 1-based PDF page number
      # @param first_cp [Integer] codepoint of the grid's top-left cell
      # @return [Hash] tally; `no_grid` is 1 when the page could not be
      #   rendered or no grid was detected
      def write_page(block:, pdf_path:, page_num:, first_cp:)
        svg_doc = render_page(pdf_path, page_num)
        return no_grid_tally unless svg_doc

        grid = GridDetector.detect(svg_doc, block_first_cp: first_cp)
        return no_grid_tally unless grid

        extractor = CellExtractor.new(svg_doc)
        counts = zero_tally
        grid.rows.times do |row|
          grid.columns.times do |col|
            cp = grid.codepoint_at(row, col)
            next unless cp && block.covers?(cp)

            counts[write_cell(block, extractor, grid, cp)] += 1
          end
        end
        counts
      end

      # Drain a list of block-spec hashes through the worker pool.
      # Each spec has the same shape as #write_block's kwargs:
      #
      #   { block:, pdf_path:, page_map: }
      #
      # Runs inline (no threads) when the pool size is 1 or less.
      #
      # @param specs [Array<Hash>]
      # @return [Hash] aggregated tally across all blocks
      def write_all(specs)
        return drain_inline(specs) if @parallel_workers <= 1

        drain_threaded(specs)
      end

      private

      # Fresh tally with every counter at zero.
      def zero_tally
        { written: 0, skipped: 0, empty: 0, placeholder: 0, no_grid: 0 }
      end

      # Tally recording a single "no grid" event (missing PDF, failed
      # render, or no grid detected on a page).
      def no_grid_tally
        zero_tally.tap { |h| h[:no_grid] = 1 }
      end

      # Adds every counter of `other` into `acc` in place.
      def merge_tally!(acc, other)
        other.each { |k, v| acc[k] = (acc[k] || 0) + v }
      end

      # Extracts and writes a single cell.
      #
      # @return [Symbol] the tally key to bump: :empty when the extractor
      #   found nothing, otherwise :written or :skipped depending on
      #   whether write_atomic reported a write
      def write_cell(block, extractor, grid, codepoint)
        cell_svg = extractor.extract(grid, codepoint)
        return :empty if cell_svg.nil?

        write_glyph(block, codepoint, cell_svg) ? :written : :skipped
      end

      # Sequential variant of #write_all.
      def drain_inline(specs)
        specs.each_with_object(zero_tally) do |spec, tally|
          merge_tally!(tally, write_block(**spec))
        end
      end

      # Threaded variant of #write_all: workers pop specs from a shared
      # queue and merge their per-block tallies under a mutex. One `nil`
      # sentinel per worker tells it to stop. An exception raised inside a
      # worker surfaces to the caller from `join`.
      def drain_threaded(specs)
        queue = Queue.new
        mutex = Mutex.new
        tally = zero_tally

        workers = Array.new(@parallel_workers) do
          Thread.new do
            loop do
              spec = queue.pop
              break if spec.nil?

              result = write_block(**spec)
              mutex.synchronize { merge_tally!(tally, result) }
            end
          end
        end

        specs.each { |spec| queue << spec }
        @parallel_workers.times { queue << nil }
        workers.each(&:join)
        tally
      end

      # Renders `page_num` of the PDF to an SVG inside a temporary
      # directory and parses it.
      #
      # @return [Nokogiri::XML::Document, nil] nil when the renderer raised
      #   PdfRenderError, returned anything other than :ok, or produced no
      #   output file
      def render_page(pdf_path, page_num)
        # Every exit uses `next` so the method's result is always the
        # value of the mktmpdir block.
        Dir.mktmpdir do |dir|
          out = File.join(dir, "p#{page_num}.svg")
          begin
            result = @renderer.render(Pathname.new(pdf_path), page_num, out)
          rescue Ucode::PdfRenderError
            # Graceful degradation: a broken renderer (e.g. mutool on a
            # host without LCMS) yields no_grid → placeholders downstream.
            next nil
          end
          next nil unless result == :ok && File.exist?(out)

          Nokogiri::XML(File.read(out))
        end
      end

      # Writes the extracted cell to the codepoint's glyph path.
      #
      # @return whatever write_atomic returns (truthy is tallied as written)
      def write_glyph(block, codepoint, cell_svg)
        cp_id = Repo::Paths.cp_id(codepoint)
        path = Repo::Paths.codepoint_glyph_path(@output_root, block.id, cp_id)
        write_atomic(path, serialize_svg(cell_svg))
      end

      # For every assigned codepoint in the block that doesn't already
      # have a glyph.svg on disk, write a placeholder. Ids that are not
      # well-formed "U+<hex>" strings or fall outside the block are skipped.
      #
      # @return [Hash] the same `tally`, with `placeholder` incremented
      def placeholder_pass(block, tally)
        cp_ids = block.codepoint_ids
        return tally if cp_ids.nil? || cp_ids.empty?

        cp_ids.each do |cp_id|
          cp = cp_id_to_int(cp_id)
          next unless cp && block.covers?(cp)

          path = Repo::Paths.codepoint_glyph_path(@output_root, block.id, cp_id)
          next if path.exist?
          next unless write_atomic(path, placeholder_svg_payload)

          tally[:placeholder] = (tally[:placeholder] || 0) + 1
        end
        tally
      end

      # Parses a "U+XXXX" id into its integer codepoint.
      #
      # The whole string must match "U+" followed by hex digits. A plain
      # `to_i(16)` would silently turn malformed ids such as "U+" or
      # "U+ZZZZ" into 0 (U+0000) instead of rejecting them.
      #
      # @param cp_id [Object]
      # @return [Integer, nil] nil when `cp_id` is not a well-formed id
      def cp_id_to_int(cp_id)
        return nil unless cp_id.is_a?(String)

        match = CP_ID_PATTERN.match(cp_id)
        match && match[1].to_i(16)
      end

      # SVG source of the "no glyph" placeholder.
      def placeholder_svg_payload
        size = PLACEHOLDER_VIEW_BOX_SIZE
        # A simple dashed square + text marker so the site can render
        # an obvious "no official glyph" badge without needing extra state.
        <<~SVG
          <?xml version="1.0" encoding="UTF-8"?>
          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 #{size} #{size}" width="#{size}" height="#{size}">
            <rect x="1" y="1" width="#{size - 2}" height="#{size - 2}" fill="none" stroke="#999" stroke-width="1" stroke-dasharray="4 4"/>
            <text x="#{size / 2}" y="#{size / 2}" font-family="sans-serif" font-size="14" text-anchor="middle" dominant-baseline="middle" fill="#999">no glyph</text>
          </svg>
        SVG
      end

      # Serializes an extracted cell document to the string written to disk.
      def serialize_svg(doc)
        doc.to_xml.strip
      end

      # @raise [Ucode::GlyphError] always
      def raise_missing_pdf!(block, pdf_path)
        raise Ucode::GlyphError.new(
          "no PDF available for block '#{block.id}'",
          context: { block_id: block.id, pdf_path: pdf_path&.to_s },
        )
      end
    end
  end
end