ucode 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,128 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "digest"
4
+ require "pathname"
5
+
6
+ require "ucode/code_chart/extractor"
7
+ require "ucode/code_chart/provenance"
8
+ require "ucode/code_chart/sidecar"
9
+ require "ucode/error"
10
+ require "ucode/version_resolver"
11
+
12
+ module Ucode
13
+ module CodeChart
14
+ # Orchestrates extraction + provenance sidecar writing for one
15
+ # block. The Writer is the **only thing that touches disk** in the
16
+ # CodeChart namespace; everything else is composition.
17
+ #
18
+ # Output layout (per block):
19
+ #
20
+ # <output_root>/<block_id>/<U+XXXX>.svg
21
+ # <output_root>/<block_id>/<U+XXXX>.json # provenance sidecar
22
+ #
23
+ # One folder per block keeps each block's output self-contained
24
+ # and discoverable — a downstream consumer (fontisan) can iterate
25
+ # a block's folder without scanning the whole tree.
26
+ #
27
+ # Idempotent: re-running `write` on the same inputs produces
28
+ # byte-identical files (SVGs via content check; sidecars via
29
+ # {Ucode::Repo::AtomicWrites#write_atomic}'s canonical-JSON
30
+ # byte-equality). The {Summary} tally counts every extracted
31
+ # codepoint; it does not distinguish "first run" writes from no-op re-writes.
32
+ class Writer
33
+ # Per-block run summary. Returned from {#write}; built with keyword arguments.
34
+ Summary = Struct.new(
35
+ :block, # the block's id (`block.id`), not the Block object itself
36
+ :codepoints_extracted, # `results.size` of what the Extractor returned
37
+ :svgs_written, # incremented once per result, even when the SVG write was skipped
38
+ :sidecars_written, # incremented once per result, after `sidecar.write`
39
+ :pdf_sha256, # value returned by `CodeChart.sha256_of(@pdf_path)`
40
+ keyword_init: true,
41
+ )
42
+
43
+ # @param output_root [Pathname, String] parent directory. The
43
+ # `<block_id>/` subdirectory is created inside it.
44
+ # @param pdf_path [Pathname, String] Code Charts PDF (already
45
+ # downloaded by the caller; Writer doesn't fetch).
46
+ # @param ucd_version [String, nil] UCD version to stamp on
47
+ # provenance. nil = resolved via {VersionResolver.resolve(nil)}.
48
+ # @param cache_dir [Pathname, String, nil] font-stream cache
49
+ # directory for the EmbeddedFonts::Source.
50
+ # @param now [Time, nil] timestamp override (for tests).
51
+ # @param pillar3_source forwarded unchanged to the Extractor, as is `tier1_sources` (both default to nil).
52
+ def initialize(output_root:, pdf_path:, ucd_version: nil,
53
+ cache_dir: nil, now: nil,
54
+ pillar3_source: nil, tier1_sources: nil)
55
+ @output_root = Pathname.new(output_root) # accepts a String or a Pathname
56
+ @pdf_path = Pathname.new(pdf_path)
57
+ @ucd_version = ucd_version || VersionResolver.resolve(nil) # resolver is only consulted when no version is given
58
+ @cache_dir = cache_dir && Pathname.new(cache_dir) # stays nil when no cache_dir is given
59
+ @now = now
60
+ @pillar3_source = pillar3_source
61
+ @tier1_sources = tier1_sources
62
+ end
64
+
65
+ # Extracts every codepoint in `block` and writes `<block_id>/<cp>.svg`
66
+ # + `<block_id>/<cp>.json` under `@output_root` (the block directory is created if missing). Returns a
67
+ # {Summary}; its svgs_written/sidecars_written count every result, even when a write was skipped.
68
+ # NOTE(review): code_chart.rb in this diff only declares autoloads, so `CodeChart.sha256_of`/`CodeChart.build` must be defined elsewhere (provenance.rb?) — confirm.
69
+ # @param block [Ucode::Models::Block]
70
+ # @return [Summary]
71
+ def write(block)
72
+ block_dir = @output_root.join(block.id)
73
+ block_dir.mkpath
74
+
75
+ pdf_sha = CodeChart.sha256_of(@pdf_path) # computed once per call; only used for the Summary below
76
+
77
+ sidecar = Sidecar.new(output_root: block_dir)
78
+ extractor = Extractor.new(
79
+ block: block,
80
+ pdf_path: @pdf_path,
81
+ cache_dir: @cache_dir,
82
+ pillar3_source: @pillar3_source,
83
+ tier1_sources: @tier1_sources,
84
+ )
85
+
86
+ results = extractor.extract
87
+ svgs = 0
88
+ sidecars = 0
89
+ results.each do |result|
90
+ write_svg(block_dir, result)
91
+ svgs += 1 # counted even when write_svg skipped an identical file
92
+ provenance = CodeChart.build( # receives pdf_path, not pdf_sha — TODO confirm it does not re-hash the PDF per codepoint
93
+ block: block, codepoint: result.codepoint,
94
+ ucd_version: @ucd_version, pdf_path: @pdf_path,
95
+ now: @now,
96
+ )
97
+ sidecar.write(provenance)
98
+ sidecars += 1 # counted regardless of what sidecar.write returned
99
+ end
100
+
101
+ Summary.new(
102
+ block: block.id,
103
+ codepoints_extracted: results.size,
104
+ svgs_written: svgs,
105
+ sidecars_written: sidecars,
106
+ pdf_sha256: pdf_sha,
107
+ )
108
+ end
109
+
110
+ private
111
+
112
+ # Writes one SVG to `<block_dir>/<U+XXXX>.svg`, skipping the write when the existing content
113
+ # is byte-identical (so mtime is preserved on idempotent
114
+ # re-runs — the Sidecar uses `Repo::AtomicWrites` for the same
115
+ # reason but at a different layer). Returns nil when skipped, otherwise the result of Pathname#write.
116
+ def write_svg(block_dir, result)
117
+ path = block_dir.join("#{format_cp(result.codepoint)}.svg")
118
+ return if path.exist? && path.read == result.svg
119
+
120
+ path.write(result.svg)
121
+ end
122
+
123
+ def format_cp(codepoint) # U+ followed by uppercase hex, zero-padded on the left to at least 4 digits (65 -> U+0041)
124
+ "U+#{codepoint.to_s(16).upcase.rjust(4, '0')}"
125
+ end
126
+ end
127
+ end
128
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ # CodeChart — per-codepoint SVG glyph extraction from Unicode Code
5
+ # Charts PDFs.
6
+ #
7
+ # The "Code Chart donor" use case (essenfont consumer): for blocks
8
+ # where no OFL real-font covers the glyphs (Sidetic in Unicode 17,
9
+ # Egyptian Hieroglyphs Extended-B), the only canonical source is
10
+ # the Unicode Consortium's Code Chart PDF. This namespace turns one
11
+ # such PDF into a tree of standalone SVG files plus provenance
12
+ # sidecar JSON.
13
+ #
14
+ # ## Architecture (MECE)
15
+ #
16
+ # Every concern has exactly one home:
17
+ #
18
+ # * **Block metadata** (range + assigned codepoints) — Parsers::Blocks
19
+ # * **PDF download + cache** — Fetch::CodeCharts + Glyphs::PdfFetcher
20
+ # * **PDF object-graph walk + font extraction** — Glyphs::EmbeddedFonts::*
21
+ # * **Tier selection (Pillar 1 / 2 / 3)** — Glyphs::Resolver
22
+ # * **SVG conversion + y-flip + viewBox** — Glyphs::EmbeddedFonts::Svg
23
+ # * **Provenance schema** — CodeChart::Provenance (this namespace)
24
+ # * **Sidecar JSON write** — CodeChart::Sidecar (this namespace)
25
+ # * **Per-block orchestration + idempotent disk write** — CodeChart::Writer
26
+ # * **CLI dispatch** — Cli::CodeChartCmd
27
+ #
28
+ # CodeChart::* is the feature-facing namespace. It does not
29
+ # implement extraction, font parsing, or PDF I/O — it composes
30
+ # the existing infrastructure. Replacing the implementation
31
+ # (e.g. a future pure-Ruby PDF parser per ADR-0001) does not
32
+ # change the public API.
33
+ module CodeChart
34
+ autoload :Extractor, "ucode/code_chart/extractor"
35
+ autoload :Provenance, "ucode/code_chart/provenance"
36
+ autoload :Sidecar, "ucode/code_chart/sidecar"
37
+ autoload :Writer, "ucode/code_chart/writer"
38
+ end
39
+ end
data/lib/ucode/error.rb CHANGED
@@ -97,6 +97,10 @@ module Ucode
97
97
  # Version string not in Config.known_versions.
98
98
  class UnknownVersionError < LookupError; end
99
99
 
100
+ # Block identifier not present in the cached Blocks.txt. Carries the
101
+ # offending id and the path searched in `context:`.
102
+ class UnknownBlockError < LookupError; end
103
+
100
104
  # Glyph pipeline failures.
101
105
  class GlyphError < Error; end
102
106
 
@@ -114,6 +118,13 @@ module Ucode
114
118
  # `mutool` is not installed on the PATH.
115
119
  class EmbeddedFontsMissingError < GlyphError; end
116
120
 
121
+ # The Code Charts PDF for a requested block cannot be obtained: the
122
+ # network returned 4xx/5xx, the response wasn't application/pdf, or
123
+ # the body didn't start with the `%PDF` magic. Fires at fetch time, unlike
124
+ # {EmbeddedFontsMissingError} (file already on disk but we can't open it).
125
+ # NOTE(review): Http.get retries "HTTP <code>" failures and then raises NetworkError, so the 4xx/5xx case may never surface as this class — confirm.
126
+ class CodeChartNotFoundError < GlyphError; end
127
+
117
128
  # Pre-build validation failed for a universal-set build. The
118
129
  # context carries the failing checks so the CLI can render a
119
130
  # useful diagnostic without re-running them. Distinct from
@@ -29,7 +29,7 @@ module Ucode
29
29
  next if dest.exist? && !force
30
30
 
31
31
  url = "#{Ucode.configuration.charts_base_url}/#{filename}"
32
- Http.get(url, dest: dest)
32
+ Http.get(url, dest: dest, validate: :pdf)
33
33
  downloaded += 1
34
34
  end
35
35
  downloaded
@@ -23,9 +23,17 @@ module Ucode
23
23
  # directory is created if absent.
24
24
  # @param retries [Integer, nil] override Config.http_retries.
25
25
  # @param timeout [Integer, nil] override Config.http_timeout.
26
+ # @param validate [Symbol, nil] when `:pdf`, after a successful
27
+ # download verify (a) Content-Type starts with `application/pdf`
28
+ # and (b) the first 4 bytes of the body are `%PDF`. Raises
29
+ # {Ucode::CodeChartNotFoundError} with the offending header
30
+ # value in `context:` on failure. nil = no validation (the
31
+ # default for non-PDF callers like UcdZip and UnihanZip).
26
32
  # @return [Pathname] destination path on success.
27
33
  # @raise [Ucode::NetworkError] if all retries fail.
28
- def get(url, dest:, retries: nil, timeout: nil)
34
+ # @raise [Ucode::CodeChartNotFoundError] when `validate: :pdf`
35
+ # and the response fails content validation.
36
+ def get(url, dest:, retries: nil, timeout: nil, validate: nil)
29
37
  uri = url.is_a?(URI) ? url : URI(url)
30
38
  destination = Pathname.new(dest)
31
39
  destination.dirname.mkpath
@@ -36,15 +44,21 @@ module Ucode
36
44
 
37
45
  last_error = nil
38
46
  (attempts + 1).times do |attempt|
39
- return stream_to(uri, destination, read_timeout)
40
- rescue StandardError => e
41
- last_error = e
42
- sleep_for = backoff_sequence[attempt] || backoff_sequence.last
43
- Ucode.configuration.logger&.warn do
44
- "Http GET #{uri} failed (attempt #{attempt + 1}/#{attempts + 1}): " \
45
- "#{e.class}: #{e.message}; retrying in #{sleep_for}s"
47
+ begin
48
+ response = stream_to(uri, destination, read_timeout)
49
+ validate_response!(validate, response, destination) if validate
50
+ return destination
51
+ rescue ValidationFailure => e
52
+ raise e.cause
53
+ rescue StandardError => e
54
+ last_error = e
55
+ sleep_for = backoff_sequence[attempt] || backoff_sequence.last
56
+ Ucode.configuration.logger&.warn do
57
+ "Http GET #{uri} failed (attempt #{attempt + 1}/#{attempts + 1}): " \
58
+ "#{e.class}: #{e.message}; retrying in #{sleep_for}s"
59
+ end
60
+ sleep(sleep_for)
46
61
  end
47
- sleep(sleep_for)
48
62
  end
49
63
 
50
64
  raise Ucode::NetworkError.new(
@@ -55,19 +69,34 @@ module Ucode
55
69
 
56
70
  private
57
71
 
72
+ # Internal carrier for a validation failure inside a retry
73
+ # attempt. `get` rescues it ahead of the generic StandardError branch and
74
+ # raises the wrapped error (`e.cause`) immediately, so a content-validation
75
+ # failure is not retried the way transport errors are.
76
+ class ValidationFailure < StandardError
77
+ attr_reader :cause # the wrapped error; this reader replaces the inherited Exception#cause
78
+
79
+ def initialize(cause)
80
+ @cause = cause
81
+ super(cause.message) # the carrier reuses the wrapped error's message
82
+ end
83
+ end
84
+
58
85
  def stream_to(uri, destination, read_timeout)
86
+ response = nil
59
87
  Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https",
60
88
  read_timeout: read_timeout) do |http|
61
89
  request = Net::HTTP::Get.new(uri)
62
- http.request(request) do |response|
63
- unless response.is_a?(Net::HTTPSuccess)
64
- raise "HTTP #{response.code} #{response.message}"
90
+ http.request(request) do |r|
91
+ unless r.is_a?(Net::HTTPSuccess)
92
+ raise "HTTP #{r.code} #{r.message}"
65
93
  end
66
94
 
67
- write_body(response, destination)
95
+ write_body(r, destination)
96
+ response = r
68
97
  end
69
98
  end
70
- destination
99
+ response or raise "no response received"
71
100
  end
72
101
 
73
102
  def write_body(response, destination)
@@ -77,6 +106,47 @@ module Ucode
77
106
  end
78
107
  File.rename(partial.to_s, destination.to_s)
79
108
  end
109
+
110
+ # Dispatches post-download validation by mode; only `:pdf` (Content-Type and magic bytes) is supported.
111
+ # Raises ValidationFailure carrying a CodeChartNotFoundError so
112
+ # the retry loop in `get` doesn't re-attempt a download that's
113
+ # structurally invalid (only the transport is retriable). NOTE(review): the ArgumentError for an unknown mode is raised inside `get`'s retry block, so it looks like it is retried and reported as NetworkError — confirm.
114
+ def validate_response!(mode, response, destination)
115
+ case mode
116
+ when :pdf then validate_pdf!(response, destination)
117
+ else raise ArgumentError, "unknown validate mode: #{mode.inspect}"
118
+ end
119
+ end
120
+
121
+ PDF_CONTENT_TYPE_PREFIX = "application/pdf"
122
+ PDF_MAGIC = "%PDF"
123
+ private_constant :PDF_CONTENT_TYPE_PREFIX, :PDF_MAGIC
124
+
125
+ def validate_pdf!(response, destination) # raises ValidationFailure unless both the header and the file look like a PDF
126
+ content_type = response["Content-Type"].to_s # a missing header becomes an empty string
127
+ unless content_type.start_with?(PDF_CONTENT_TYPE_PREFIX) # case-sensitive prefix match; trailing parameters still pass
128
+ raise ValidationFailure.new(
129
+ Ucode::CodeChartNotFoundError.new(
130
+ "expected Content-Type application/pdf, got #{content_type.inspect}",
131
+ context: { url: response.uri.to_s, content_type: content_type },
132
+ ),
133
+ )
134
+ end
135
+
136
+ # Re-open the destination file and peek at the first 4 bytes.
137
+ # The response body has already been written to disk by
138
+ # `stream_to`; we don't re-read from the response (which is
139
+ # consumed by then). NOTE(review): a failed check leaves the invalid file at `destination` — confirm that callers which skip existing files cope with that.
140
+ magic = File.open(destination, "rb") { |f| f.read(4) } # nil when the file is empty
141
+ unless magic == PDF_MAGIC
142
+ raise ValidationFailure.new(
143
+ Ucode::CodeChartNotFoundError.new(
144
+ "expected %PDF magic bytes, got #{magic.inspect}",
145
+ context: { url: response.uri.to_s, magic: magic },
146
+ ),
147
+ )
148
+ end
149
+ end
80
150
  end
81
151
  end
82
152
  end
@@ -37,6 +37,40 @@ module Ucode
37
37
  nil
38
38
  end
39
39
 
40
+ # Resolves a block by its identifier (the underscored form of
41
+ # the block name, e.g. "Basic_Latin", "Egyptian_Hieroglyphs_Extended-B").
42
+ # Streams `Blocks.txt` once and short-circuits on first match —
43
+ # callers don't need to walk the whole ~340-block file.
44
+ #
45
+ # @param path [Pathname, String] path to a Blocks.txt
46
+ # @param id [String, nil] block identifier (matches `Models::Block#id`); compared with `==`, so the match is exact
47
+ # @return [Models::Block, nil] the block, or nil when no block
48
+ # has the given id (also nil, without reading the file, for a nil or empty id)
49
+ def find_by_id(path, id)
50
+ return nil if id.nil? || id.empty?
51
+
52
+ each_record(path) do |block|
53
+ return block if block.id == id # returns from find_by_id itself, ending the iteration early
54
+ end
55
+ nil
56
+ end
57
+
58
+ # Same as {find_by_id} but raises {Ucode::UnknownBlockError} on
59
+ # miss. Use this in callers that can't recover from a missing
60
+ # block (CLI commands, extractors that need a block to proceed).
61
+ #
62
+ # @param path [Pathname, String] path to a Blocks.txt
63
+ # @param id [String] block identifier
64
+ # @return [Models::Block]
65
+ # @raise [Ucode::UnknownBlockError] when no block matches (including a nil or empty id); context carries block_id and blocks_txt
66
+ def find_by_id!(path, id)
67
+ find_by_id(path, id) or # low-precedence `or` as control flow: raise only when the lookup returned nil
68
+ raise Ucode::UnknownBlockError.new(
69
+ "unknown Unicode block: #{id.inspect}",
70
+ context: { block_id: id, blocks_txt: path.to_s },
71
+ )
72
+ end
73
+
40
74
  private
41
75
 
42
76
  def build_block(range, name)
data/lib/ucode/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ucode
4
- VERSION = "0.2.0"
4
+ VERSION = "0.2.1"
5
5
  end
data/lib/ucode.rb CHANGED
@@ -30,11 +30,13 @@ module Ucode
30
30
  autoload :DatabaseMissingError, "ucode/error"
31
31
  autoload :DatabaseSchemaError, "ucode/error"
32
32
  autoload :UnknownVersionError, "ucode/error"
33
+ autoload :UnknownBlockError, "ucode/error"
33
34
  autoload :GlyphError, "ucode/error"
34
35
  autoload :PdfRenderError, "ucode/error"
35
36
  autoload :GridDetectionError, "ucode/error"
36
37
  autoload :LastResortMissingError, "ucode/error"
37
38
  autoload :EmbeddedFontsMissingError, "ucode/error"
39
+ autoload :CodeChartNotFoundError, "ucode/error"
38
40
 
39
41
  # Infrastructure
40
42
  autoload :Cache, "ucode/cache"
@@ -54,6 +56,7 @@ module Ucode
54
56
  autoload :Repo, "ucode/repo"
55
57
  autoload :Glyphs, "ucode/glyphs"
56
58
  autoload :Audit, "ucode/audit"
59
+ autoload :CodeChart, "ucode/code_chart"
57
60
  autoload :Site, "ucode/site"
58
61
  autoload :Commands, "ucode/commands"
59
62
  autoload :Cli, "ucode/cli"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ucode
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-06-29 00:00:00.000000000 Z
11
+ date: 2026-06-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: base64
@@ -156,6 +156,14 @@ files:
156
156
  - Gemfile
157
157
  - README.md
158
158
  - Rakefile
159
+ - TODO.extract-code-chart/01-pdf-fetch-validation.md
160
+ - TODO.extract-code-chart/02-block-name-resolver.md
161
+ - TODO.extract-code-chart/03-codechart-namespace.md
162
+ - TODO.extract-code-chart/04-codechart-extractor.md
163
+ - TODO.extract-code-chart/05-provenance-and-sidecar.md
164
+ - TODO.extract-code-chart/06-codechart-writer.md
165
+ - TODO.extract-code-chart/07-codechart-cli.md
166
+ - TODO.extract-code-chart/08-specs.md
159
167
  - TODO.full/00-README.md
160
168
  - TODO.full/01-panglyph-vision.md
161
169
  - TODO.full/02-panglyph-repo-bootstrap.md
@@ -295,6 +303,11 @@ files:
295
303
  - lib/ucode/audit/universal_set_reference.rb
296
304
  - lib/ucode/cache.rb
297
305
  - lib/ucode/cli.rb
306
+ - lib/ucode/code_chart.rb
307
+ - lib/ucode/code_chart/extractor.rb
308
+ - lib/ucode/code_chart/provenance.rb
309
+ - lib/ucode/code_chart/sidecar.rb
310
+ - lib/ucode/code_chart/writer.rb
298
311
  - lib/ucode/commands.rb
299
312
  - lib/ucode/commands/audit.rb
300
313
  - lib/ucode/commands/audit/browser_command.rb