ucode 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/TODO.extract-code-chart/01-pdf-fetch-validation.md +80 -0
- data/TODO.extract-code-chart/02-block-name-resolver.md +68 -0
- data/TODO.extract-code-chart/03-codechart-namespace.md +82 -0
- data/TODO.extract-code-chart/04-codechart-extractor.md +154 -0
- data/TODO.extract-code-chart/05-provenance-and-sidecar.md +147 -0
- data/TODO.extract-code-chart/06-codechart-writer.md +134 -0
- data/TODO.extract-code-chart/07-codechart-cli.md +135 -0
- data/TODO.extract-code-chart/08-specs.md +87 -0
- data/lib/ucode/cli.rb +99 -0
- data/lib/ucode/code_chart/extractor.rb +122 -0
- data/lib/ucode/code_chart/provenance.rb +81 -0
- data/lib/ucode/code_chart/sidecar.rb +52 -0
- data/lib/ucode/code_chart/writer.rb +128 -0
- data/lib/ucode/code_chart.rb +39 -0
- data/lib/ucode/error.rb +11 -0
- data/lib/ucode/fetch/code_charts.rb +1 -1
- data/lib/ucode/fetch/http.rb +84 -14
- data/lib/ucode/parsers/blocks.rb +34 -0
- data/lib/ucode/version.rb +1 -1
- data/lib/ucode.rb +3 -0
- metadata +15 -2
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# TODO 07 — CodeChart CLI
|
|
2
|
+
|
|
3
|
+
## Status
|
|
4
|
+
|
|
5
|
+
Pending. Depends on TODO 06 (Writer), TODO 04 (Extractor),
|
|
6
|
+
TODO 02 (block name resolver).
|
|
7
|
+
|
|
8
|
+
## Goal
|
|
9
|
+
|
|
10
|
+
`ucode code-chart fetch | extract | list` — the REQ (R4) commands.
|
|
11
|
+
Thin Thor wrappers that delegate to the existing `CodeChart::*`
|
|
12
|
+
modules. No orchestration logic in the CLI; every command is a
|
|
13
|
+
single delegation.
|
|
14
|
+
|
|
15
|
+
## Files
|
|
16
|
+
|
|
17
|
+
- `lib/ucode/cli.rb` — add the `CodeChartCmd` Thor subcommand class.
|
|
18
|
+
- `spec/ucode/cli_spec.rb` (extend existing) — verify the new
|
|
19
|
+
subcommand wires up.
|
|
20
|
+
|
|
21
|
+
## Design
|
|
22
|
+
|
|
23
|
+
### Subcommand shape
|
|
24
|
+
|
|
25
|
+
```ruby
|
|
26
|
+
class Cli < Thor
|
|
27
|
+
# …existing commands…
|
|
28
|
+
|
|
29
|
+
class CodeChartCmd < Thor
|
|
30
|
+
desc "fetch --block BLOCK", "Download the Code Charts PDF for a block"
|
|
31
|
+
option :block, type: :string, required: true,
|
|
32
|
+
desc: "Block identifier (e.g. Sidetic)"
|
|
33
|
+
def fetch
|
|
34
|
+
puts JSON.pretty_generate(
|
|
35
|
+
Commands::FetchCommand.new.fetch_charts(
|
|
36
|
+
VersionResolver.resolve(nil),
|
|
37
|
+
block_first_cps: [block_first_cp!(options[:block])],
|
|
38
|
+
),
|
|
39
|
+
)
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
desc "extract --block BLOCK --to DIR", "Extract per-codepoint SVG + provenance sidecars"
|
|
43
|
+
option :block, type: :string, required: true,
|
|
44
|
+
desc: "Block identifier (e.g. Sidetic)"
|
|
45
|
+
option :to, type: :string, required: true,
|
|
46
|
+
desc: "Output directory"
|
|
47
|
+
def extract
|
|
48
|
+
# ...
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
desc "list", "List blocks that have Code Charts PDFs available locally"
|
|
52
|
+
def list
|
|
53
|
+
# ...
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
desc "code-chart", "Extract per-codepoint SVG glyphs from Unicode Code Charts PDFs"
|
|
58
|
+
subcommand "code-chart", CodeChartCmd
|
|
59
|
+
end
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### `extract` flow
|
|
63
|
+
|
|
64
|
+
```ruby
|
|
65
|
+
def extract
|
|
66
|
+
Ucode::Commands::FetchCommand.new.fetch_charts(
|
|
67
|
+
VersionResolver.resolve(nil),
|
|
68
|
+
block_first_cps: [block_first_cp!(options[:block])],
|
|
69
|
+
)
|
|
70
|
+
blocks_txt = Ucode::Cache.ucd_dir(VersionResolver.resolve(nil)).join("Blocks.txt")
|
|
71
|
+
block = Parsers::Blocks.find_by_name(blocks_txt, options[:block]) or
|
|
72
|
+
raise Thor::Error, "Unknown block: #{options[:block].inspect}"
|
|
73
|
+
pdf = Ucode::Glyphs::PdfFetcher.new(
|
|
74
|
+
VersionResolver.resolve(nil),
|
|
75
|
+
monolith_path: nil,
|
|
76
|
+
blocks: [block],
|
|
77
|
+
).fetch(block_first_cp: block.range_first, force: false) or
|
|
78
|
+
raise Thor::Error, "PDF unavailable for block #{options[:block]}"
|
|
79
|
+
|
|
80
|
+
writer = Ucode::CodeChart::Writer.new(
|
|
81
|
+
output_root: Pathname.new(options[:to]),
|
|
82
|
+
pdf_path: pdf,
|
|
83
|
+
blocks_txt: blocks_txt,
|
|
84
|
+
)
|
|
85
|
+
summary = writer.write(block)
|
|
86
|
+
puts JSON.pretty_generate(summary.to_h.compact)
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
def block_first_cp!(block_id)
|
|
90
|
+
cache = Ucode::Cache.ucd_dir(VersionResolver.resolve(nil))
|
|
91
|
+
block = Ucode::Parsers::Blocks.find_by_name(cache.join("Blocks.txt"), block_id)
|
|
92
|
+
raise Thor::Error, "Unknown block: #{block_id.inspect}" unless block
|
|
93
|
+
block.range_first
|
|
94
|
+
end
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Why `Ucode::Commands::FetchCommand.new.fetch_charts` for fetch
|
|
98
|
+
|
|
99
|
+
`fetch_charts` is the existing CLI hook for "download a Code Charts
|
|
100
|
+
PDF for these block first-cps." We just call it with the block's
|
|
101
|
+
first cp. Reuse, don't reimplement.
|
|
102
|
+
|
|
103
|
+
### Why resolve version once at the top of `extract`
|
|
104
|
+
|
|
105
|
+
Per Candidate 4 of the architecture review (`refactor/build-context-resolve-version-once`,
|
|
106
|
+
merged): every CLI method resolves the version once and threads it
|
|
107
|
+
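through. This CLI method does the same — one call per invocation. (Note: the `extract` flow sketch above calls `VersionResolver.resolve(nil)` several times, including inside `block_first_cp!`; the implementation should bind the result to a local once and thread it through.)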
|
|
108
|
+
|
|
109
|
+
### Why no separate `Commands::CodeChartCommand` class
|
|
110
|
+
|
|
111
|
+
Following the existing pattern (e.g. `Cli::Audit` calls
|
|
112
|
+
`Commands::Audit::*Command` *only* when the logic is non-trivial).
|
|
113
|
+
The CodeChart commands are trivial delegations — a one-liner each.
|
|
114
|
+
The CLI methods call `CodeChart::Writer` and `CodeChart::Extractor`
|
|
115
|
+
directly. Adding a `Commands::CodeChartCommand` class would be
|
|
116
|
+
indirection without a payoff.
|
|
117
|
+
|
|
118
|
+
If the extract logic grows (e.g. progress reporting, partial
|
|
119
|
+
extraction), extract it into a Command class at that point.
|
|
120
|
+
|
|
121
|
+
## Acceptance
|
|
122
|
+
|
|
123
|
+
- `ucode code-chart fetch --block Sidetic` downloads the PDF.
|
|
124
|
+
- `ucode code-chart extract --block Sidetic --to /tmp/s/` extracts
|
|
125
|
+
to the given directory.
|
|
126
|
+
- `ucode code-chart list` prints available blocks.
|
|
127
|
+
- Unknown block names produce a clean error, not a stack trace.
|
|
128
|
+
- The CLI matches the REQ's signature exactly.
|
|
129
|
+
|
|
130
|
+
## Out of scope
|
|
131
|
+
|
|
132
|
+
- `--version` flag (the REQ doesn't specify, and existing commands
|
|
133
|
+
default to the configured version).
|
|
134
|
+
- `--format svg|glif` (the REQ specifies SVG; `.glif` output is a
|
|
135
|
+
future extension).
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# TODO 08 — CodeChart specs
|
|
2
|
+
|
|
3
|
+
## Status
|
|
4
|
+
|
|
5
|
+
Pending. Depends on TODOs 01–07.
|
|
6
|
+
|
|
7
|
+
## Goal
|
|
8
|
+
|
|
9
|
+
Comprehensive spec coverage for every new module. Per the project
|
|
10
|
+
rules: real model instances (no doubles), tight focused tests,
|
|
11
|
+
behavior assertions (not interaction counts).
|
|
12
|
+
|
|
13
|
+
## Files
|
|
14
|
+
|
|
15
|
+
Already enumerated per TODO. Final spec coverage:
|
|
16
|
+
|
|
17
|
+
- `spec/ucode/fetch/code_charts_spec.rb` — happy path + HTTP 4xx +
|
|
18
|
+
wrong content-type + non-PDF body. (TODO 01)
|
|
19
|
+
- `spec/ucode/parsers/blocks_spec.rb` (extend) — `find_by_name`
|
|
20
|
+
happy path + nil on miss. (TODO 02)
|
|
21
|
+
- `spec/ucode/code_chart/extractor_spec.rb` — constructor invariants,
|
|
22
|
+
Resolver wiring, integration test against fixture PDF. (TODO 04)
|
|
23
|
+
- `spec/ucode/code_chart/provenance_spec.rb` — value object
|
|
24
|
+
construction + `to_h` schema. (TODO 05)
|
|
25
|
+
- `spec/ucode/code_chart/sidecar_spec.rb` — write sidecar, idempotent
|
|
26
|
+
re-write. (TODO 05)
|
|
27
|
+
- `spec/ucode/code_chart/writer_spec.rb` — full lifecycle:
|
|
28
|
+
extract → write → summary. Idempotent re-run produces byte-identical
|
|
29
|
+
files. (TODO 06)
|
|
30
|
+
- `spec/ucode/cli_spec.rb` (extend) — verify `ucode code-chart fetch`,
|
|
31
|
+
`extract`, `list` wire up. (TODO 07)
|
|
32
|
+
|
|
33
|
+
## Design
|
|
34
|
+
|
|
35
|
+
### Fixture strategy
|
|
36
|
+
|
|
37
|
+
The existing `spec/fixtures/pdfs/basic_latin.pdf` is the only PDF
|
|
38
|
+
fixture in the repo. It's tiny and validates the integration path.
|
|
39
|
+
The Sidetic + Egyptian Ext-B PDFs are large (whole-block) and would
|
|
40
|
+
inflate the repo. The integration spec uses `basic_latin.pdf` to
|
|
41
|
+
exercise the full pipeline; per-codepoint assertions cover
|
|
42
|
+
representative cases.
|
|
43
|
+
|
|
44
|
+
If Sidetic-specific behavior must be tested, a smaller fixture PDF
|
|
45
|
+
cropped to ~5 codepoints would be the right tool — out of scope for
|
|
46
|
+
this TODO.
|
|
47
|
+
|
|
48
|
+
### No doubles policy
|
|
49
|
+
|
|
50
|
+
The project's `~/.claude/CLAUDE.md` rule: no doubles. All specs use
|
|
51
|
+
real instances:
|
|
52
|
+
- `Ucode::Models::Block.new(...)` for test blocks.
|
|
53
|
+
- A temp directory + real `Blocks.txt` text for parser specs.
|
|
54
|
+
- The real `Ucode::Glyphs::Resolver` for extractor specs.
|
|
55
|
+
|
|
56
|
+
### Idempotency assertion pattern
|
|
57
|
+
|
|
58
|
+
`Writer#write` idempotency is asserted via byte-equality:
|
|
59
|
+
|
|
60
|
+
```ruby
|
|
61
|
+
first_run = writer.write(block)
|
|
62
|
+
first_size = File.stat(svg_path).size
|
|
63
|
+
sleep 0.01 # ensure mtime changes would be detectable
|
|
64
|
+
second_run = writer.write(block)
|
|
65
|
+
second_size = File.stat(svg_path).size
|
|
66
|
+
expect(second_size).to eq(first_size)
|
|
67
|
+
expect(File.read(svg_path)).to eq(expected_svg_bytes)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
This is the existing pattern from `spec/ucode/repo/aggregate_writer_spec.rb`
|
|
71
|
+
(idempotency spec there). Reuse.
|
|
72
|
+
|
|
73
|
+
## Acceptance
|
|
74
|
+
|
|
75
|
+
- `bundle exec rspec spec/ucode/code_chart/ spec/ucode/fetch/code_charts_spec.rb spec/ucode/parsers/blocks_spec.rb`
|
|
76
|
+
passes 100%.
|
|
77
|
+
- Coverage for the new files is ≥ 95% (well above the project's per-file
|
|
78
|
+
floor of 30% + the overall 80% minimum).
|
|
79
|
+
- No doubles are introduced (verify with `grep -r "double(" spec/ucode/code_chart/`).
|
|
80
|
+
- The integration spec exercises both the Extractor and Writer
|
|
81
|
+
together end-to-end.
|
|
82
|
+
|
|
83
|
+
## Out of scope
|
|
84
|
+
|
|
85
|
+
- Performance benchmarks — separate concern.
|
|
86
|
+
- Sidetic-specific fixtures — requires PDF curation beyond the
|
|
87
|
+
scope of this feature.
|
data/lib/ucode/cli.rb
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
require "thor"
|
|
4
4
|
|
|
5
5
|
require "ucode/commands"
|
|
6
|
+
require "ucode/code_chart"
|
|
6
7
|
require "ucode/version_resolver"
|
|
7
8
|
|
|
8
9
|
module Ucode
|
|
@@ -157,6 +158,104 @@ module Ucode
|
|
|
157
158
|
desc "site", "Generate the Vitepress site"
|
|
158
159
|
subcommand "site", Site
|
|
159
160
|
|
|
161
|
+
# ─────────────── code-chart ───────────────
|
|
162
|
+
# Extract per-codepoint SVG glyphs from a Unicode Code Charts PDF.
|
|
163
|
+
# One folder per block under --to, with <U+XXXX>.svg + .json pairs.
|
|
164
|
+
class CodeChartCmd < Thor
  desc "fetch --block BLOCK [VERSION]", "Download the Code Charts PDF for a block"
  option :block, type: :string, required: true,
                 desc: "Block identifier (e.g. Sidetic, Basic_Latin)"
  # Downloads the Code Charts PDF for one block and prints the
  # `fetch_charts` result as pretty-printed JSON.
  #
  # The version is resolved exactly once and threaded through both the
  # block lookup and the download.
  #
  # @param version [String, nil] passed to VersionResolver.resolve
  def fetch(version = nil)
    with_codechart_errors do
      version_str = VersionResolver.resolve(version)
      block_first_cp = resolve_block_first_cp!(options[:block], version_str)
      result = Commands::FetchCommand.new.fetch_charts(
        version_str,
        block_first_cps: [block_first_cp],
      )
      puts JSON.pretty_generate(result)
    end
  end

  desc "extract --block BLOCK --to DIR [VERSION]",
       "Extract per-codepoint SVG + provenance sidecars from a Code Charts PDF"
  option :block, type: :string, required: true,
                 desc: "Block identifier (e.g. Sidetic)"
  option :to, type: :string, required: true,
              desc: "Output directory (will contain <block_id>/<U+XXXX>.svg + .json)"
  # Fetches the block's PDF, runs CodeChart::Writer over it and prints
  # the writer's summary (nil entries dropped) as pretty-printed JSON.
  #
  # @param version [String, nil] passed to VersionResolver.resolve
  # @raise [Thor::Error] when any Ucode::Error is raised underneath,
  #   including the CodeChartNotFoundError raised here when the PDF
  #   fetcher returns nothing
  def extract(version = nil)
    with_codechart_errors do
      version_str = VersionResolver.resolve(version)
      block = resolve_block!(options[:block], version_str)
      first_cp = block.range_first

      # Download step. The original note describes this as idempotent
      # (re-runs skip when the PDF is already cached).
      Commands::FetchCommand.new.fetch_charts(version_str, block_first_cps: [first_cp])

      pdf = Ucode::Glyphs::PdfFetcher.new(version_str).fetch(block_first_cp: first_cp)
      unless pdf
        raise Ucode::CodeChartNotFoundError.new(
          "Code Charts PDF unavailable for block #{block.id.inspect}",
          context: { block_id: block.id, version: version_str },
        )
      end

      summary = Ucode::CodeChart::Writer.new(
        output_root: Pathname.new(options[:to]),
        pdf_path: pdf,
        ucd_version: version_str,
      ).write(block)
      puts JSON.pretty_generate(summary.to_h.compact)
    end
  end

  desc "list", "List cached Code Charts PDFs under the version's cache"
  # Prints the basename of every entry in the resolved version's PDF
  # cache directory, sorted, or a placeholder line when the directory
  # is missing or empty.
  #
  # Runs inside the same error bridge as `fetch` and `extract`, so a
  # Ucode::Error raised while resolving the version is reported as a
  # clean Thor error rather than a stack trace.
  def list
    with_codechart_errors do
      pdfs_dir = Ucode::Cache.pdfs_dir(VersionResolver.resolve(nil))
      files = pdfs_dir.exist? ? pdfs_dir.children.sort : []
      if files.empty?
        puts "(no cached Code Charts PDFs)"
      else
        files.each { |file| puts file.basename.to_s }
      end
    end
  end

  private

  # Looks a block up in the cached Blocks.txt of an already-resolved
  # version and returns the block record (not just its first codepoint).
  #
  # @param block_id [String] block identifier as given on the command line
  # @param version_str [String] version already passed through
  #   VersionResolver.resolve; it is not resolved again here
  # @return the value of Ucode::Parsers::Blocks.find_by_id!
  # @raise whatever find_by_id! raises on a miss (the original note
  #   names Ucode::UnknownBlockError — confirm in Parsers::Blocks)
  def resolve_block!(block_id, version_str)
    blocks_txt = Ucode::Cache.ucd_dir(version_str).join("Blocks.txt")
    Ucode::Parsers::Blocks.find_by_id!(blocks_txt, block_id)
  end

  # Convenience wrapper: first codepoint of the block named `block_id`.
  #
  # @param block_id [String]
  # @param version_str [String] already-resolved version
  def resolve_block_first_cp!(block_id, version_str)
    resolve_block!(block_id, version_str).range_first
  end

  # Convert semantic Ucode errors into Thor errors so Thor's dispatch
  # prints the message cleanly instead of a stack trace. Thor's `start`
  # rescues only `Thor::Error`; without this bridge, any `Ucode::Error`
  # subclass propagates as an uncaught exception.
  #
  # @yield the command body
  # @raise [Thor::Error] carrying the original error's message
  def with_codechart_errors
    yield
  rescue Ucode::Error => e
    raise Thor::Error, e.message
  end
end
|
|
248
|
+
|
|
249
|
+
# Register the subcommand under the underscored method name
|
|
250
|
+
# (`code_chart`). Thor's `normalize_command_name` converts the
|
|
251
|
+
# user's hyphenated form (`code-chart`) to the underscored form
|
|
252
|
+
# before lookup, so `ucode code-chart <cmd>` dispatches correctly.
|
|
253
|
+
# `desc` first registers the method as a Thor command so the
|
|
254
|
+
# dispatch table has an entry; `subcommand` then attaches the
|
|
255
|
+
# CodeChartCmd class to it.
|
|
256
|
+
desc "code_chart <command>", "Extract SVG glyphs from Unicode Code Charts PDFs"
|
|
257
|
+
subcommand "code_chart", CodeChartCmd
|
|
258
|
+
|
|
160
259
|
# ─────────────── lookup ───────────────
|
|
161
260
|
class Lookup < Thor
|
|
162
261
|
desc "block CODEPOINT", "Block name covering CODEPOINT (integer or 0xNNNN)"
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "pathname"
|
|
4
|
+
|
|
5
|
+
require "ucode/error"
|
|
6
|
+
require "ucode/glyphs/embedded_fonts/catalog"
|
|
7
|
+
require "ucode/glyphs/embedded_fonts/renderer"
|
|
8
|
+
require "ucode/glyphs/embedded_fonts/source"
|
|
9
|
+
require "ucode/glyphs/resolver"
|
|
10
|
+
require "ucode/glyphs/sources/pillar1_embedded_tounicode"
|
|
11
|
+
require "ucode/glyphs/sources/tier1_real_font"
|
|
12
|
+
|
|
13
|
+
module Ucode
  module CodeChart
    # Walks every codepoint in a block's range and collects one
    # {Result} for each codepoint the resolver returns an SVG for.
    #
    # The class does not implement extraction itself: it assembles a
    # list of glyph sources and hands them to {Ucode::Glyphs::Resolver},
    # which decides which source serves a codepoint. The source list is,
    # in order:
    #
    # 1. any caller-supplied Tier 1 sources,
    # 2. one Pillar 1 source built from the block's Code Charts PDF
    #    (EmbeddedFonts::Source -> Catalog -> Renderer), always present,
    # 3. the caller-supplied Pillar 3 source, when given.
    #
    # Tier 1 and Pillar 3 are injected rather than constructed here so
    # that the Extractor never has to build them itself (the original
    # note cites environments where the Last Resort UFO is not checked
    # out).
    class Extractor
      # Outcome for a single codepoint.
      Result = Struct.new(:codepoint, :svg, :tier, :provenance, keyword_init: true)

      # @param block [Ucode::Models::Block] block whose range is walked;
      #   must respond to `range_first` and `range_last`
      # @param pdf_path [Pathname, String] the block's Code Charts PDF.
      #   The Extractor never downloads it.
      # @param cache_dir [Pathname, String, nil] forwarded to
      #   EmbeddedFonts::Source as `cache_dir:`; nil is passed through
      #   unchanged
      # @param tier1_sources [Array<Ucode::Glyphs::Source>, nil]
      #   optional sources placed ahead of Pillar 1; nil means none
      # @param pillar3_source [Ucode::Glyphs::Source, nil] optional
      #   source placed after Pillar 1; nil means none
      def initialize(block:, pdf_path:, cache_dir: nil,
                     tier1_sources: nil, pillar3_source: nil)
        @block          = block
        @pdf_path       = Pathname.new(pdf_path)
        @cache_dir      = cache_dir && Pathname.new(cache_dir)
        @tier1_sources  = tier1_sources || []
        @pillar3_source = pillar3_source
      end

      # @return [Array<Result>] results in ascending codepoint order.
      #   Codepoints for which the resolver returns nil, or a result
      #   without an SVG, contribute nothing.
      def extract
        resolver = build_resolver
        [].tap do |found|
          each_codepoint do |cp|
            hit = resolver.resolve(cp)
            next unless hit&.svg

            found << Result.new(
              codepoint: cp,
              svg: hit.svg,
              tier: hit.tier,
              provenance: hit.provenance,
            )
          end
        end
      end

      private

      # Hands every codepoint from `range_first` through `range_last`
      # (inclusive, ascending) to the block; returns an Enumerator when
      # called without one. The whole range is walked, assigned or not:
      # filtering is left to the resolver, so a codepoint that no
      # source can serve simply produces no Result in #extract.
      def each_codepoint(&visitor)
        return enum_for(:each_codepoint) unless visitor

        (@block.range_first..@block.range_last).each(&visitor)
      end

      # Builds the Resolver over Tier 1 + Pillar 1 (+ Pillar 3). The
      # resolver's `order:` is the list of distinct `tier` values in
      # source order.
      def build_resolver
        chain = @tier1_sources + embedded_pillar_sources
        chain.push(@pillar3_source) if @pillar3_source
        Glyphs::Resolver.new(sources: chain, order: chain.map(&:tier).uniq)
      end

      # @return [Array] a one-element list holding the Pillar 1 source
      #   backed by this block's PDF
      def embedded_pillar_sources
        renderer = Glyphs::EmbeddedFonts::Renderer.new(
          Glyphs::EmbeddedFonts::Catalog.new(
            Glyphs::EmbeddedFonts::Source.new(pdf: @pdf_path, cache_dir: @cache_dir),
          ),
        )
        [Glyphs::Sources::Pillar1EmbeddedTounicode.new(renderer: renderer)]
      end
    end
  end
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "digest"
|
|
4
|
+
require "time"
|
|
5
|
+
|
|
6
|
+
require "ucode/version"
|
|
7
|
+
|
|
8
|
+
module Ucode
  module CodeChart
    # Per-codepoint provenance value object — the fields written to the
    # sidecar JSON next to each extracted SVG (REQ R5 per the original
    # note).
    #
    # The Struct's member order is also the key order of `to_h`, and
    # therefore of the generated JSON, so adding or reordering fields
    # here changes the sidecar layout.
    Provenance = Struct.new(
      :codepoint,          # "U+XXXX" string (see .build)
      :block,              # block id
      :source_pdf_url,     # URL of the block's Code Charts PDF
      :source_pdf_sha256,  # hex digest of the local PDF, "" when unavailable
      :ucd_version,
      :extracted_at,       # ISO 8601 timestamp in UTC
      :extractor_version,  # Ucode::VERSION at construction time
      keyword_init: true,
    )

    # Computes the source PDF's URL from a block's first codepoint.
    #
    # The slug is the codepoint in uppercase hex, zero-padded to at
    # least four digits and never further: U+0041 -> "U0041.pdf",
    # U+10920 -> "U10920.pdf", U+100000 -> "U100000.pdf". This matches
    # the file names unicode.org publishes; padding supplementary-plane
    # codepoints to six digits ("U010920.pdf") names a file that does
    # not exist there.
    # NOTE(review): confirm Ucode::Fetch::CodeCharts builds the same slug.
    #
    # @param block_first_cp [Integer]
    # @return [String]
    def self.code_chart_url(block_first_cp)
      slug = format("%04X", block_first_cp)
      "#{Ucode.configuration.charts_base_url}/U#{slug}.pdf"
    end

    # Builds a Provenance from the inputs the Writer has on hand.
    #
    # `extracted_at` defaults to the current time, so two runs produce
    # identical provenance only when the caller passes the same `now`.
    # Whatever time is used is rendered in UTC; a `now` given in a
    # local zone is converted on a copy and left unmodified.
    #
    # The PDF hash is computed on every call.
    #
    # @param block [Ucode::Models::Block] must respond to `id` and
    #   `range_first`
    # @param codepoint [Integer]
    # @param ucd_version [String]
    # @param pdf_path [Pathname, String]
    # @param now [Time, nil] timestamp override (e.g. for tests)
    # @return [Provenance]
    def self.build(block:, codepoint:, ucd_version:, pdf_path:, now: nil)
      stamp = now || Time.now
      # getutc returns a converted copy; Time#utc would mutate the
      # caller's object in place.
      stamp = stamp.getutc if stamp.respond_to?(:getutc)

      Provenance.new(
        codepoint: format("U+%04X", codepoint),
        block: block.id,
        source_pdf_url: code_chart_url(block.range_first),
        source_pdf_sha256: sha256_of(pdf_path),
        ucd_version: ucd_version,
        extracted_at: stamp.iso8601,
        extractor_version: Ucode::VERSION,
      )
    end

    # SHA-256 of a file on disk.
    #
    # @param path [Pathname, String]
    # @return [String] hex digest, or "" when `path` is not a regular
    #   file (missing, or a directory) — callers decide how to treat a
    #   missing hash
    def self.sha256_of(path)
      return "" unless File.file?(path)

      Digest::SHA256.file(path).hexdigest
    end
  end
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "pathname"
|
|
5
|
+
|
|
6
|
+
require "ucode/repo/atomic_writes"
|
|
7
|
+
|
|
8
|
+
module Ucode
  module CodeChart
    # Persists a {Provenance} as the sidecar JSON that sits beside its
    # SVG.
    #
    # Location: `<output_root>/<codepoint>.json`, where `<codepoint>`
    # is the provenance's `codepoint` string (e.g. "U+10920").
    #
    # The payload is `JSON.pretty_generate(provenance.to_h)` plus a
    # trailing newline. Keys appear in the order `to_h` yields them —
    # stdlib JSON does not sort keys — so identical provenance values
    # always serialize to identical bytes.
    #
    # Writing goes through {Ucode::Repo::AtomicWrites#write_atomic}.
    # Per the original note, a re-write of byte-identical content is a
    # no-op there — confirm in AtomicWrites.
    class Sidecar
      include Ucode::Repo::AtomicWrites

      # @param output_root [Pathname, String] directory the SVG and
      #   sidecar live in. (The original note says parent directories
      #   are created on demand — presumably by write_atomic; confirm.)
      def initialize(output_root:)
        @output_root = Pathname.new(output_root)
      end

      # Serializes `provenance` and writes it to its sidecar path.
      #
      # @param provenance [Ucode::CodeChart::Provenance]
      # @return [Pathname] the sidecar path that was written
      def write(provenance)
        path_for_id(provenance.codepoint).tap do |target|
          write_atomic(target, "#{JSON.pretty_generate(provenance.to_h)}\n")
        end
      end

      # Pure path computation; touches nothing on disk.
      #
      # @param codepoint_id [String] e.g. "U+10920"
      # @return [Pathname] where that codepoint's sidecar lives
      def path_for_id(codepoint_id)
        @output_root.join("#{codepoint_id}.json")
      end
    end
  end
end
|