ucode 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ucode/audit/reference_factory.rb +66 -0
- data/lib/ucode/audit.rb +1 -0
- data/lib/ucode/cli.rb +35 -16
- data/lib/ucode/commands/audit.rb +0 -1
- data/lib/ucode/commands/build.rb +4 -0
- data/lib/ucode/commands/canonical_build.rb +2 -3
- data/lib/ucode/commands/fetch.rb +12 -14
- data/lib/ucode/commands/glyphs.rb +25 -67
- data/lib/ucode/commands/lookup.rb +11 -11
- data/lib/ucode/commands/parse.rb +7 -5
- data/lib/ucode/commands/release.rb +0 -1
- data/lib/ucode/commands/universal_set.rb +10 -14
- data/lib/ucode/coordinator/indices.rb +38 -2
- data/lib/ucode/glyphs/pipeline.rb +106 -0
- data/lib/ucode/glyphs.rb +1 -0
- data/lib/ucode/repo/aggregate_writer.rb +60 -298
- data/lib/ucode/repo/writers/blocks_writer.rb +73 -0
- data/lib/ucode/repo/writers/enums_writer.rb +38 -0
- data/lib/ucode/repo/writers/indexes_writer.rb +53 -0
- data/lib/ucode/repo/writers/manifest_writer.rb +78 -0
- data/lib/ucode/repo/writers/named_sequences_writer.rb +47 -0
- data/lib/ucode/repo/writers/planes_writer.rb +82 -0
- data/lib/ucode/repo/writers/relationships_writer.rb +71 -0
- data/lib/ucode/repo/writers/scripts_writer.rb +54 -0
- data/lib/ucode/repo/writers.rb +20 -0
- data/lib/ucode/repo.rb +1 -0
- data/lib/ucode/version.rb +1 -1
- data/ucode.gemspec +56 -0
- metadata +18 -5
- data/Gemfile.lock +0 -406
- data/lib/ucode/commands/audit/reference_builder.rb +0 -64
|
@@ -39,7 +39,7 @@ module Ucode
|
|
|
39
39
|
# (in `lib/ucode/cli.rb`) is responsible only for argument
|
|
40
40
|
# parsing and dispatch.
|
|
41
41
|
class BuildCommand
|
|
42
|
-
# @param
|
|
42
|
+
# @param version [String] resolved UCD version
|
|
43
43
|
# @param output_root [String, Pathname] directory that will
|
|
44
44
|
# hold `manifest.json`, `glyphs/`, `reports/`.
|
|
45
45
|
# @param source_config_path [String, Pathname, nil] override
|
|
@@ -58,10 +58,9 @@ module Ucode
|
|
|
58
58
|
# resolver and don't have a real source config on disk.
|
|
59
59
|
# @return [Hash] { version:, manifest_path:, totals:,
|
|
60
60
|
# by_tier:, coverage:, validation: }
|
|
61
|
-
def call(
|
|
61
|
+
def call(version, output_root:, source_config_path: nil,
|
|
62
62
|
resolver: nil, block_filter: nil,
|
|
63
63
|
parallel_workers: default_workers, skip_pre_check: false)
|
|
64
|
-
version = VersionResolver.resolve(version_intent)
|
|
65
64
|
root = Pathname.new(output_root)
|
|
66
65
|
|
|
67
66
|
config_path = source_config_path_or_default(source_config_path)
|
|
@@ -144,7 +143,7 @@ module Ucode
|
|
|
144
143
|
# checks (config loads, fonts present, coverage assertion runs)
|
|
145
144
|
# without starting the 4-hour build.
|
|
146
145
|
class PreCheckCommand
|
|
147
|
-
# @param
|
|
146
|
+
# @param version [String] resolved UCD version
|
|
148
147
|
# @param source_config_path [String, Pathname, nil]
|
|
149
148
|
# @param cmaps [#covers?] injectable; defaults to
|
|
150
149
|
# RealFonts::CmapCache.
|
|
@@ -153,9 +152,8 @@ module Ucode
|
|
|
153
152
|
# @return [Ucode::Glyphs::UniversalSet::PreBuildReport]
|
|
154
153
|
# @raise [Ucode::UniversalSetPreBuildError] when missing_fonts
|
|
155
154
|
# is non-empty or the config fails to load.
|
|
156
|
-
def call(
|
|
155
|
+
def call(version, source_config_path: nil, cmaps: nil,
|
|
157
156
|
font_locator: nil)
|
|
158
|
-
version = VersionResolver.resolve(version_intent)
|
|
159
157
|
database = Database.open(version)
|
|
160
158
|
config_path = source_config_path || Glyphs::SourceConfig::DEFAULT_PATH
|
|
161
159
|
|
|
@@ -171,12 +169,11 @@ module Ucode
|
|
|
171
169
|
# shape (or regenerating reports after a model change) without
|
|
172
170
|
# re-running the build.
|
|
173
171
|
class ReportCommand
|
|
174
|
-
# @param
|
|
172
|
+
# @param version [String] resolved UCD version
|
|
175
173
|
# @param output_root [String, Pathname] directory holding
|
|
176
174
|
# `manifest.json`.
|
|
177
175
|
# @return [Hash] the {CoverageReport#emit} payload.
|
|
178
|
-
def call(
|
|
179
|
-
version = VersionResolver.resolve(version_intent)
|
|
176
|
+
def call(version, output_root:)
|
|
180
177
|
root = Pathname.new(output_root)
|
|
181
178
|
manifest_path = root.join("manifest.json")
|
|
182
179
|
raise Ucode::Error, "manifest not found at #{manifest_path}" unless manifest_path.exist?
|
|
@@ -196,12 +193,11 @@ module Ucode
|
|
|
196
193
|
# totals_reconcile, provenance_complete).
|
|
197
194
|
class ValidateCommand
|
|
198
195
|
# @param output_root [String, Pathname]
|
|
199
|
-
# @param
|
|
200
|
-
#
|
|
201
|
-
#
|
|
196
|
+
# @param version [String, nil] resolved UCD version, used only
|
|
197
|
+
# to stamp the report's unicode_version when the manifest's
|
|
198
|
+
# recorded value is missing.
|
|
202
199
|
# @return [Hash] the {Validator#validate} payload.
|
|
203
|
-
def call(output_root,
|
|
204
|
-
version = version_intent && VersionResolver.resolve(version_intent)
|
|
200
|
+
# Run the universal-set validator against an existing output tree.
#
# `version` is forwarded unchanged as the validator's
# `unicode_version:`; resolving it is the caller's job.
def call(output_root, version: nil)
  validator = Glyphs::UniversalSet::Validator.new(output_root, unicode_version: version)
  validator.validate
end
|
|
@@ -2,6 +2,22 @@
|
|
|
2
2
|
|
|
3
3
|
module Ucode
|
|
4
4
|
class Coordinator
|
|
5
|
+
# Pairs of (output-file-slug, indices-field) for every per-property
|
|
6
|
+
# relationship table the Repo writes. Each field holds a Hash whose
|
|
7
|
+
# values are Records (or Arrays of Records). The Repo iterates
|
|
8
|
+
# `Indices#each_relationship` instead of reaching into the Struct by
|
|
9
|
+
# field name (see Candidate 1 of the 2026-06-29 architecture review).
|
|
10
|
+
RELATIONSHIPS = [
|
|
11
|
+
["special_casing", :special_casing],
|
|
12
|
+
["case_folding", :case_folding],
|
|
13
|
+
["bidi_mirroring", :bidi_mirroring],
|
|
14
|
+
["bidi_brackets", :bidi_brackets],
|
|
15
|
+
["cjk_radicals", :cjk_radicals],
|
|
16
|
+
["standardized_variants", :standardized_variants],
|
|
17
|
+
["name_aliases", :name_aliases],
|
|
18
|
+
].freeze
|
|
19
|
+
private_constant :RELATIONSHIPS
|
|
20
|
+
|
|
5
21
|
# Bag of pre-built indices consumed by the per-codepoint enrichment
|
|
6
22
|
# pass. Every field is a frozen-shaped collection that is read-only
|
|
7
23
|
# after `build_indices` returns: range files land in sorted Arrays
|
|
@@ -12,6 +28,10 @@ module Ucode
|
|
|
12
28
|
# call reads as a self-documenting catalogue of every parsed file —
|
|
13
29
|
# adding a new index is one keyword arg here, one builder call in
|
|
14
30
|
# `Coordinator#build_indices`, and one assignment in `#enrich`.
|
|
31
|
+
#
|
|
32
|
+
# The relationship enumerator (`#each_relationship`) is the seam the
|
|
33
|
+
# Repo uses to write per-property relationship tables without knowing
|
|
34
|
+
# which Struct field carries which data.
|
|
15
35
|
Indices = Struct.new(
|
|
16
36
|
:blocks,
|
|
17
37
|
:scripts,
|
|
@@ -40,6 +60,22 @@ module Ucode
|
|
|
40
60
|
:emoji_properties,
|
|
41
61
|
:extra_binary_properties,
|
|
42
62
|
keyword_init: true,
|
|
43
|
-
)
|
|
63
|
+
) do
|
|
64
|
+
# Yield (slug, records) for each relationship table. The seam
|
|
65
|
+
# between "what the Coordinator indexed" and "what the Repo writes"
|
|
66
|
+
# lives here — Repo never names a Struct field directly.
|
|
67
|
+
#
|
|
68
|
+
# @yieldparam slug [String] output file slug under
|
|
69
|
+
# `output/relationships/`
|
|
70
|
+
# @yieldparam records [Hash<Integer|String, Record|Array<Record>>]
|
|
71
|
+
# @return [Enumerator] when no block is given
|
|
72
|
+
# Walk every (slug, field) pair in RELATIONSHIPS and hand the slug
# together with the value of that field to the caller's block.
# Without a block, an Enumerator over the same pairs is returned.
def each_relationship(&block)
  return enum_for(__method__) unless block

  RELATIONSHIPS.each do |(slug, field)|
    records = public_send(field)
    block.call(slug, records)
  end
end
|
|
79
|
+
end
|
|
44
80
|
end
|
|
45
|
-
end
|
|
81
|
+
end
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "pathname"
|
|
4
|
+
require "set"
|
|
5
|
+
|
|
6
|
+
require "ucode/cache"
|
|
7
|
+
require "ucode/glyphs/pdf_fetcher"
|
|
8
|
+
require "ucode/glyphs/writer"
|
|
9
|
+
require "ucode/parsers"
|
|
10
|
+
|
|
11
|
+
module Ucode
|
|
12
|
+
module Glyphs
|
|
13
|
+
# Assembles the per-block specs that {Glyphs::Writer#write_all} drains.
|
|
14
|
+
#
|
|
15
|
+
# Owns three pieces of orchestration that {Commands::GlyphsCommand}
|
|
16
|
+
# used to carry inline:
|
|
17
|
+
#
|
|
18
|
+
# - block loading from {Cache.ucd_dir}/Blocks.txt (with an optional
|
|
19
|
+
# block filter)
|
|
20
|
+
# - PDF fetcher construction (with monolith fallback)
|
|
21
|
+
# - the per-block page-map heuristic (per-block PDFs are page 1 =
|
|
22
|
+
# title, page 2 = first chart page starting at the block's first
|
|
23
|
+
# codepoint; true for most BMP blocks; multi-page blocks need a
|
|
24
|
+
# richer resolver — mismatches yield placeholder SVGs only, never
|
|
25
|
+
# wrong glyphs)
|
|
26
|
+
#
|
|
27
|
+
# The Command stays a thin wrapper that prints the experimental
|
|
28
|
+
# warning and wires the writer. See Candidate 3 of the 2026-06-29
|
|
29
|
+
# architecture review.
|
|
30
|
+
class Pipeline
  # Path to the monolith fallback file when no per-block PDF is on
  # disk yet. Overridable for tests.
  DEFAULT_MONOLITH_PATH = "CodeCharts.pdf"
  # Cache path for the page-map corpus. Overridable for tests.
  DEFAULT_PAGE_MAP_CACHE = "data/codecharts_page_map.json"

  # One unit of work: a block, the PDF fetched for it, and the
  # page-number => first-codepoint map used to read that PDF.
  Spec = Struct.new(:block, :pdf_path, :page_map, keyword_init: true)

  # @param version [String] resolved UCD version (callers must
  #   resolve via {VersionResolver.resolve} first)
  # @param block_filter [Array<String>, nil] block ids to limit to;
  #   nil (or empty) = every block
  # @param monolith_path [String, Pathname, nil] fallback monolith;
  #   nil disables the fallback
  # @param page_map_cache [String, Pathname] cache for the page map
  def initialize(version:, block_filter: nil,
                 monolith_path: DEFAULT_MONOLITH_PATH,
                 page_map_cache: DEFAULT_PAGE_MAP_CACHE)
    @version = version
    @block_filter = block_filter
    @monolith_path = monolith_path
    @page_map_cache = page_map_cache
  end

  # Load every block from the cached Blocks.txt (filtered by
  # `@block_filter` when set) and pair each one with a fetched PDF
  # path and a page map. Blocks whose PDF cannot be fetched are
  # silently dropped — the placeholder pass downstream covers them.
  #
  # @param force [Boolean] re-fetch PDFs even when cached
  # @return [Array<Spec>]
  def build_specs(force: false)
    blocks = load_blocks
    fetcher = build_fetcher(blocks)
    blocks.map { |block| spec_for(block, fetcher, force) }.compact
  end

  private

  # Read the block records for `@version`. Returns an empty Array
  # when Blocks.txt is not in the cache, and the unfiltered list when
  # no (or an empty) block filter was given.
  def load_blocks
    path = Cache.ucd_dir(@version).join("Blocks.txt")
    return [] unless path.exist?

    all = Parsers::Blocks.each_record(path).to_a
    return all unless @block_filter && !@block_filter.empty?

    filter_set = @block_filter.to_set
    all.select { |block| filter_set.include?(block.id) }
  end

  # Construct the PDF fetcher. The monolith fallback is only handed
  # over when it was configured AND the file exists on disk.
  def build_fetcher(blocks)
    monolith = @monolith_path ? Pathname.new(@monolith_path) : nil
    # `monolith_path: nil` is a documented input, so the existence
    # check must be nil-safe; a bare `monolith.exist?` raised
    # NoMethodError in that case.
    monolith = nil unless monolith&.exist?
    PdfFetcher.new(
      @version,
      monolith_path: monolith,
      blocks: blocks,
      page_map_cache: @page_map_cache,
    )
  end

  # Build the Spec for one block, or nil when the fetcher returns no
  # PDF path for it.
  def spec_for(block, fetcher, force)
    pdf_path = fetcher.fetch(block_first_cp: block.range_first, force: force)
    return nil unless pdf_path

    Spec.new(block: block, pdf_path: pdf_path, page_map: page_map_for(block))
  end

  # Per-block PDFs are page 1 = title, page 2 = first chart page
  # starting at the block's first codepoint. True for most BMP
  # blocks; multi-page blocks (CJK) need a richer resolver.
  def page_map_for(block)
    { 2 => block.range_first }
  end
end
|
|
105
|
+
end
|
|
106
|
+
end
|
data/lib/ucode/glyphs.rb
CHANGED
|
@@ -20,6 +20,7 @@ module Ucode
|
|
|
20
20
|
autoload :CellExtractor, "ucode/glyphs/cell_extractor"
|
|
21
21
|
autoload :MonolithPageMap, "ucode/glyphs/monolith_page_map"
|
|
22
22
|
autoload :Writer, "ucode/glyphs/writer"
|
|
23
|
+
autoload :Pipeline, "ucode/glyphs/pipeline"
|
|
23
24
|
autoload :LastResort, "ucode/glyphs/last_resort"
|
|
24
25
|
autoload :EmbeddedFonts, "ucode/glyphs/embedded_fonts"
|
|
25
26
|
autoload :RealFonts, "ucode/glyphs/real_fonts"
|
|
@@ -1,12 +1,15 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "pathname"
|
|
4
|
-
require "json"
|
|
5
|
-
require "time"
|
|
6
4
|
|
|
7
|
-
require "ucode/
|
|
8
|
-
require "ucode/repo/
|
|
9
|
-
require "ucode/repo/
|
|
5
|
+
require "ucode/repo/writers/planes_writer"
|
|
6
|
+
require "ucode/repo/writers/blocks_writer"
|
|
7
|
+
require "ucode/repo/writers/scripts_writer"
|
|
8
|
+
require "ucode/repo/writers/indexes_writer"
|
|
9
|
+
require "ucode/repo/writers/relationships_writer"
|
|
10
|
+
require "ucode/repo/writers/enums_writer"
|
|
11
|
+
require "ucode/repo/writers/named_sequences_writer"
|
|
12
|
+
require "ucode/repo/writers/manifest_writer"
|
|
10
13
|
|
|
11
14
|
module Ucode
|
|
12
15
|
module Repo
|
|
@@ -14,67 +17,30 @@ module Ucode
|
|
|
14
17
|
#
|
|
15
18
|
# output/planes/<n>.json
|
|
16
19
|
# output/blocks/<ID>.json
|
|
17
|
-
# output/blocks/index.json
|
|
20
|
+
# output/blocks/index.json
|
|
18
21
|
# output/scripts/<code>.json
|
|
19
|
-
# output/index/names.json
|
|
20
|
-
# output/index/labels.json
|
|
21
|
-
# output/index/codepoint_to_block.json
|
|
22
|
-
# output/relationships/*.json
|
|
23
|
-
# output/enums.json
|
|
22
|
+
# output/index/names.json
|
|
23
|
+
# output/index/labels.json
|
|
24
|
+
# output/index/codepoint_to_block.json
|
|
25
|
+
# output/relationships/*.json
|
|
26
|
+
# output/enums.json
|
|
24
27
|
# output/named_sequences/<slug>.json
|
|
25
28
|
# output/manifest.json
|
|
26
29
|
#
|
|
27
|
-
# **Single pass**: callers feed one CodePoint at a time via `#add
|
|
28
|
-
#
|
|
29
|
-
#
|
|
30
|
+
# **Single pass**: callers feed one CodePoint at a time via `#add`,
|
|
31
|
+
# which folds into the streaming accumulators. `#flush` then
|
|
32
|
+
# composes eight per-concern writer classes (one per output kind)
|
|
33
|
+
# and runs them in order. Adding a new aggregate = adding one
|
|
34
|
+
# writer class + one line here. See Candidate 5 of the 2026-06-29
|
|
35
|
+
# architecture review.
|
|
30
36
|
#
|
|
31
37
|
# **MECE**:
|
|
32
38
|
# - paths: `Repo::Paths`
|
|
33
39
|
# - atomic writes: `Repo::AtomicWrites`
|
|
34
|
-
# - stream aggregation: this class
|
|
40
|
+
# - stream aggregation: this class (the `#add` half)
|
|
41
|
+
# - per-concern writers: `Repo::Writers::*`
|
|
35
42
|
# - serialization: lutaml-model `to_yaml_hash` / `to_json`
|
|
36
43
|
class AggregateWriter
|
|
37
|
-
include AtomicWrites
|
|
38
|
-
|
|
39
|
-
# Static metadata for the 17 Unicode planes. Planes 4–13 are
|
|
40
|
-
# unassigned in Unicode 17; their entries use placeholder names.
|
|
41
|
-
PLANE_TABLE = {
|
|
42
|
-
0 => ["Basic Multilingual Plane", "BMP"],
|
|
43
|
-
1 => ["Supplementary Multilingual Plane", "SMP"],
|
|
44
|
-
2 => ["Supplementary Ideographic Plane", "SIP"],
|
|
45
|
-
3 => ["Tertiary Ideographic Plane", "TIP"],
|
|
46
|
-
4 => ["Unassigned Plane 4", "—"],
|
|
47
|
-
5 => ["Unassigned Plane 5", "—"],
|
|
48
|
-
6 => ["Unassigned Plane 6", "—"],
|
|
49
|
-
7 => ["Unassigned Plane 7", "—"],
|
|
50
|
-
8 => ["Unassigned Plane 8", "—"],
|
|
51
|
-
9 => ["Unassigned Plane 9", "—"],
|
|
52
|
-
10 => ["Unassigned Plane 10", "—"],
|
|
53
|
-
11 => ["Unassigned Plane 11", "—"],
|
|
54
|
-
12 => ["Unassigned Plane 12", "—"],
|
|
55
|
-
13 => ["Unassigned Plane 13", "—"],
|
|
56
|
-
14 => ["Supplementary Special-purpose Plane", "SSP"],
|
|
57
|
-
15 => ["Supplementary Private Use Area-A", "SPUA-A"],
|
|
58
|
-
16 => ["Supplementary Private Use Area-B", "SPUA-B"],
|
|
59
|
-
}.freeze
|
|
60
|
-
private_constant :PLANE_TABLE
|
|
61
|
-
|
|
62
|
-
# Coordinator::Indices fields paired with the file slug used
|
|
63
|
-
# under `output/relationships/`. Each field is a Hash<Integer,
|
|
64
|
-
# Record> or Hash<Integer, Array<Record>>.
|
|
65
|
-
RELATIONSHIP_SOURCES = {
|
|
66
|
-
special_casing: "special_casing",
|
|
67
|
-
case_folding: "case_folding",
|
|
68
|
-
bidi_mirroring: "bidi_mirroring",
|
|
69
|
-
bidi_brackets: "bidi_brackets",
|
|
70
|
-
cjk_radicals: "cjk_radicals",
|
|
71
|
-
standardized_variants: "standardized_variants",
|
|
72
|
-
name_aliases: "name_aliases",
|
|
73
|
-
}.freeze
|
|
74
|
-
private_constant :RELATIONSHIP_SOURCES
|
|
75
|
-
|
|
76
|
-
attr_reader :codepoint_count
|
|
77
|
-
|
|
78
44
|
# @param output_root [String, Pathname]
|
|
79
45
|
def initialize(output_root)
|
|
80
46
|
@output_root = Pathname.new(output_root)
|
|
@@ -87,6 +53,8 @@ module Ucode
|
|
|
87
53
|
@codepoint_count = 0
|
|
88
54
|
end
|
|
89
55
|
|
|
56
|
+
attr_reader :codepoint_count
|
|
57
|
+
|
|
90
58
|
# Fold one CodePoint into the stream accumulators. No-ops if the
|
|
91
59
|
# cp has no block_id (it has no home in the output tree).
|
|
92
60
|
# @param cp [Ucode::Models::CodePoint]
|
|
@@ -107,10 +75,8 @@ module Ucode
|
|
|
107
75
|
@codepoint_count += 1
|
|
108
76
|
end
|
|
109
77
|
|
|
110
|
-
#
|
|
111
|
-
#
|
|
112
|
-
# `sc` subset of PropertyValueAliases; the full alias tables and
|
|
113
|
-
# the named sequences are passed through from the CLI/parsers).
|
|
78
|
+
# Compose the eight per-concern writers, run them in order, and
|
|
79
|
+
# return the total number of files written.
|
|
114
80
|
#
|
|
115
81
|
# @param ucd_version [String]
|
|
116
82
|
# @param indices [Ucode::Coordinator::Indices]
|
|
@@ -118,19 +84,40 @@ module Ucode
|
|
|
118
84
|
# @param property_value_aliases [Array<Ucode::Models::PropertyValueAlias>]
|
|
119
85
|
# @param named_sequences [Array<Ucode::Models::NamedSequence>]
|
|
120
86
|
# @param glyph_count [Integer]
|
|
121
|
-
# @return [Integer]
|
|
87
|
+
# @return [Integer]
|
|
122
88
|
def flush(ucd_version:, indices:, property_aliases: [],
          property_value_aliases: [], named_sequences: [], glyph_count: 0)
  # Build the ordered writer list first, then total up what each
  # writer's #write returns.
  composed = writers(ucd_version, indices, property_aliases,
                     property_value_aliases, named_sequences, glyph_count)
  composed.sum { |writer| writer.write }
end
|
|
93
|
+
|
|
94
|
+
# @api private — exposed for testing.
|
|
95
|
+
# Instantiate the eight per-concern writers in the order they run:
# planes, blocks, scripts, indexes, relationships, enums, named
# sequences, manifest. Every writer receives the same output root.
def writers(ucd_version, indices, property_aliases,
            property_value_aliases, named_sequences, glyph_count)
  root = @output_root

  planes = Writers::PlanesWriter.new(output_root: root, blocks: indices.blocks)
  blocks = Writers::BlocksWriter.new(
    output_root: root,
    blocks: indices.blocks,
    block_codepoint_ids: @block_codepoint_ids,
    block_ages: @block_ages,
  )
  scripts = Writers::ScriptsWriter.new(
    output_root: root,
    scripts: indices.scripts,
    script_codepoint_ids: @script_codepoint_ids,
  )
  indexes = Writers::IndexesWriter.new(
    output_root: root,
    names: @names_index,
    labels: @labels_index,
    cp_to_block: @cp_to_block,
  )
  relationships = Writers::RelationshipsWriter.new(output_root: root, indices: indices)
  enums = Writers::EnumsWriter.new(
    output_root: root,
    property_aliases: property_aliases,
    property_value_aliases: property_value_aliases,
  )
  sequences = Writers::NamedSequencesWriter.new(
    output_root: root,
    named_sequences: named_sequences,
  )
  manifest = Writers::ManifestWriter.new(
    output_root: root,
    ucd_version: ucd_version,
    codepoint_count: @codepoint_count,
    glyph_count: glyph_count,
  )

  [planes, blocks, scripts, indexes, relationships, enums, sequences, manifest]
end
|
|
135
122
|
|
|
136
123
|
private
|
|
@@ -151,9 +138,7 @@ module Ucode
|
|
|
151
138
|
|
|
152
139
|
# Per-block `age` is the earliest DerivedAge of any codepoint in
|
|
153
140
|
# the block, compared as a Gem::Version. Stored as the original
|
|
154
|
-
# string (e.g. "1.1", "17.0.0").
|
|
155
|
-
# block has an age (rare — only happens for entirely-reserved
|
|
156
|
-
# blocks, which the parser excludes anyway).
|
|
141
|
+
# string (e.g. "1.1", "17.0.0").
|
|
157
142
|
def track_block_age(cp)
|
|
158
143
|
return if cp.age.nil? || cp.age.empty?
|
|
159
144
|
|
|
@@ -168,229 +153,6 @@ module Ucode
|
|
|
168
153
|
# Pick the earlier of two age strings, ordering them as
# Gem::Version values. When neither is strictly earlier, `b` wins.
def min_age(a, b)
  return a if Gem::Version.new(a) < Gem::Version.new(b)

  b
end
|
|
171
|
-
|
|
172
|
-
# ---- Plane files -------------------------------------------------
|
|
173
|
-
|
|
174
|
-
def write_planes(blocks)
|
|
175
|
-
plane_block_ids = group_block_ids_by_plane(blocks)
|
|
176
|
-
count = 0
|
|
177
|
-
(0..16).each do |n|
|
|
178
|
-
path = Paths.plane_metadata_path(@output_root, n)
|
|
179
|
-
count += 1 if write_atomic(path, plane_payload(n, plane_block_ids[n] || []))
|
|
180
|
-
end
|
|
181
|
-
count
|
|
182
|
-
end
|
|
183
|
-
|
|
184
|
-
def group_block_ids_by_plane(blocks)
|
|
185
|
-
blocks.each_with_object(Hash.new { |h, k| h[k] = [] }) do |block, h|
|
|
186
|
-
h[block.plane_number] << block.id
|
|
187
|
-
end
|
|
188
|
-
end
|
|
189
|
-
|
|
190
|
-
def plane_payload(plane_number, block_ids)
|
|
191
|
-
name, abbrev = PLANE_TABLE.fetch(plane_number)
|
|
192
|
-
range_first = plane_number * 0x10000
|
|
193
|
-
range_last = range_first + 0xFFFF
|
|
194
|
-
to_pretty_json(
|
|
195
|
-
"number" => plane_number,
|
|
196
|
-
"name" => name,
|
|
197
|
-
"abbrev" => abbrev,
|
|
198
|
-
"range_first" => range_first,
|
|
199
|
-
"range_last" => range_last,
|
|
200
|
-
"block_ids" => block_ids,
|
|
201
|
-
)
|
|
202
|
-
end
|
|
203
|
-
|
|
204
|
-
# ---- Block files -------------------------------------------------
|
|
205
|
-
|
|
206
|
-
def write_blocks(blocks)
|
|
207
|
-
count = blocks.sum do |block|
|
|
208
|
-
block.age = @block_ages[block.id]
|
|
209
|
-
path = Paths.block_metadata_path(@output_root, block.id)
|
|
210
|
-
write_atomic(path, block_payload(block)) ? 1 : 0
|
|
211
|
-
end
|
|
212
|
-
count + write_blocks_index(blocks)
|
|
213
|
-
end
|
|
214
|
-
|
|
215
|
-
def write_blocks_index(blocks)
|
|
216
|
-
path = Paths.blocks_index_path(@output_root)
|
|
217
|
-
summary = blocks.map do |block|
|
|
218
|
-
{
|
|
219
|
-
"id" => block.id,
|
|
220
|
-
"name" => block.name,
|
|
221
|
-
"first_cp" => block.range_first,
|
|
222
|
-
"last_cp" => block.range_last,
|
|
223
|
-
"plane_number" => block.plane_number,
|
|
224
|
-
"age" => @block_ages[block.id],
|
|
225
|
-
}
|
|
226
|
-
end
|
|
227
|
-
write_atomic(path, to_pretty_json(summary)) ? 1 : 0
|
|
228
|
-
end
|
|
229
|
-
|
|
230
|
-
def block_payload(block)
|
|
231
|
-
to_pretty_json(
|
|
232
|
-
"id" => block.id,
|
|
233
|
-
"name" => block.name,
|
|
234
|
-
"range_first" => block.range_first,
|
|
235
|
-
"range_last" => block.range_last,
|
|
236
|
-
"plane_number" => block.plane_number,
|
|
237
|
-
"age" => @block_ages[block.id],
|
|
238
|
-
"codepoint_ids" => (@block_codepoint_ids[block.id] || []),
|
|
239
|
-
)
|
|
240
|
-
end
|
|
241
|
-
|
|
242
|
-
# ---- Script files ------------------------------------------------
|
|
243
|
-
|
|
244
|
-
def write_scripts(scripts)
|
|
245
|
-
count = 0
|
|
246
|
-
scripts.group_by(&:code).each do |code, ranges|
|
|
247
|
-
next if code.nil? || code.empty?
|
|
248
|
-
|
|
249
|
-
path = Paths.script_metadata_path(@output_root, code)
|
|
250
|
-
count += 1 if write_atomic(path, script_payload(code, ranges))
|
|
251
|
-
end
|
|
252
|
-
count
|
|
253
|
-
end
|
|
254
|
-
|
|
255
|
-
def script_payload(code, ranges)
|
|
256
|
-
to_pretty_json(
|
|
257
|
-
"code" => code,
|
|
258
|
-
"name" => ranges.first&.name,
|
|
259
|
-
"range_first" => ranges.map(&:range_first).min,
|
|
260
|
-
"range_last" => ranges.map(&:range_last).max,
|
|
261
|
-
"codepoint_ids" => (@script_codepoint_ids[code] || []),
|
|
262
|
-
)
|
|
263
|
-
end
|
|
264
|
-
|
|
265
|
-
# ---- Lookup indexes ---------------------------------------------
|
|
266
|
-
|
|
267
|
-
def write_indexes
|
|
268
|
-
count = 0
|
|
269
|
-
count += 1 if write_atomic(Paths.names_index_path(@output_root), to_pretty_json(@names_index))
|
|
270
|
-
count += 1 if write_atomic(Paths.labels_index_path(@output_root), to_pretty_json(@labels_index))
|
|
271
|
-
count += 1 if write_atomic(codepoint_to_block_path, to_pretty_json(@cp_to_block))
|
|
272
|
-
count
|
|
273
|
-
end
|
|
274
|
-
|
|
275
|
-
def codepoint_to_block_path
|
|
276
|
-
Pathname(@output_root).join("index", "codepoint_to_block.json")
|
|
277
|
-
end
|
|
278
|
-
|
|
279
|
-
# ---- Relationships ----------------------------------------------
|
|
280
|
-
|
|
281
|
-
def write_relationships(indices)
|
|
282
|
-
RELATIONSHIP_SOURCES.sum do |field, slug|
|
|
283
|
-
records = indices.public_send(field)
|
|
284
|
-
write_relationship_file(slug, records)
|
|
285
|
-
end
|
|
286
|
-
end
|
|
287
|
-
|
|
288
|
-
def write_relationship_file(slug, records)
|
|
289
|
-
return 0 if records.nil? || records.empty?
|
|
290
|
-
|
|
291
|
-
path = Pathname(@output_root).join("relationships", "#{slug}.json")
|
|
292
|
-
write_atomic(path, relationship_payload(records)) ? 1 : 0
|
|
293
|
-
end
|
|
294
|
-
|
|
295
|
-
# records is Hash<Integer, Record>, Hash<Integer, Array<Record>>,
|
|
296
|
-
# Hash<String, Record>, or Hash<String, Array<Record>>.
|
|
297
|
-
# Output: { "U+XXXX" => record.to_yaml_hash, ... } or
|
|
298
|
-
# { "U+XXXX" => [record.to_yaml_hash, ...], ... }
|
|
299
|
-
def relationship_payload(records)
|
|
300
|
-
payload = records.each_with_object({}) do |(key, value), h|
|
|
301
|
-
h[key_to_cp_id(key)] = serialize_value(value)
|
|
302
|
-
end
|
|
303
|
-
to_pretty_json(payload)
|
|
304
|
-
end
|
|
305
|
-
|
|
306
|
-
# Indices that are keyed by Integer codepoint (most of them) get
|
|
307
|
-
# formatted into "U+XXXX". Indices keyed by string ids already
|
|
308
|
-
# (cjk_radicals by ideograph_id, standardized_variants by base_id)
|
|
309
|
-
# are passed through verbatim.
|
|
310
|
-
def key_to_cp_id(key)
|
|
311
|
-
key.is_a?(Integer) ? Paths.cp_id(key) : key
|
|
312
|
-
end
|
|
313
|
-
|
|
314
|
-
def serialize_value(value)
|
|
315
|
-
return value.map { |v| serialize_one(v) } if value.is_a?(Array)
|
|
316
|
-
|
|
317
|
-
serialize_one(value)
|
|
318
|
-
end
|
|
319
|
-
|
|
320
|
-
def serialize_one(record)
|
|
321
|
-
record.to_yaml_hash
|
|
322
|
-
end
|
|
323
|
-
|
|
324
|
-
# ---- Enums -------------------------------------------------------
|
|
325
|
-
|
|
326
|
-
def write_enums(property_aliases, property_value_aliases)
|
|
327
|
-
path = Pathname(@output_root).join("enums.json")
|
|
328
|
-
payload = {
|
|
329
|
-
"properties" => property_aliases.map(&:to_yaml_hash),
|
|
330
|
-
"property_values" => property_value_aliases.map(&:to_yaml_hash),
|
|
331
|
-
}
|
|
332
|
-
write_atomic(path, to_pretty_json(payload)) ? 1 : 0
|
|
333
|
-
end
|
|
334
|
-
|
|
335
|
-
# ---- Named sequences --------------------------------------------
|
|
336
|
-
|
|
337
|
-
def write_named_sequences(named_sequences)
|
|
338
|
-
return 0 if named_sequences.nil? || named_sequences.empty?
|
|
339
|
-
|
|
340
|
-
dir = Pathname(@output_root).join("named_sequences")
|
|
341
|
-
named_sequences.sum do |ns|
|
|
342
|
-
path = dir.join("#{slug_for(ns)}.json")
|
|
343
|
-
write_atomic(path, ns.to_json(pretty: true)) ? 1 : 0
|
|
344
|
-
end
|
|
345
|
-
end
|
|
346
|
-
|
|
347
|
-
# Slug derived from the name: downcase, non-alphanumerics → "_".
|
|
348
|
-
def slug_for(named_sequence)
|
|
349
|
-
named_sequence.name
|
|
350
|
-
.downcase
|
|
351
|
-
.gsub(/[^a-z0-9]+/, "_")
|
|
352
|
-
.gsub(/^_+|_+$/, "")
|
|
353
|
-
end
|
|
354
|
-
|
|
355
|
-
# ---- Manifest ---------------------------------------------------
|
|
356
|
-
|
|
357
|
-
# Fields that define the manifest's semantic content. When these
|
|
358
|
-
# match the existing manifest on disk, we preserve the old
|
|
359
|
-
# `generated_at` so that re-runs are byte-idempotent (no rewrite
|
|
360
|
-
# unless something actually changed).
|
|
361
|
-
MANIFEST_CONTENT_KEYS = %w[
|
|
362
|
-
ucd_version codepoint_count glyph_count schema_version
|
|
363
|
-
].freeze
|
|
364
|
-
private_constant :MANIFEST_CONTENT_KEYS
|
|
365
|
-
|
|
366
|
-
def write_manifest(ucd_version:, glyph_count:)
|
|
367
|
-
path = Paths.manifest_path(@output_root)
|
|
368
|
-
content = {
|
|
369
|
-
"ucd_version" => ucd_version,
|
|
370
|
-
"codepoint_count" => @codepoint_count,
|
|
371
|
-
"glyph_count" => glyph_count,
|
|
372
|
-
"schema_version" => "1",
|
|
373
|
-
}
|
|
374
|
-
ts = preserved_or_new_timestamp(path, content)
|
|
375
|
-
payload = content.merge("generated_at" => ts)
|
|
376
|
-
write_atomic(path, to_pretty_json(payload)) ? 1 : 0
|
|
377
|
-
end
|
|
378
|
-
|
|
379
|
-
def preserved_or_new_timestamp(path, content)
|
|
380
|
-
existing = read_manifest(path)
|
|
381
|
-
return Time.now.utc.iso8601 unless existing
|
|
382
|
-
|
|
383
|
-
unchanged = MANIFEST_CONTENT_KEYS.all? { |k| existing[k] == content[k] }
|
|
384
|
-
unchanged ? existing["generated_at"] : Time.now.utc.iso8601
|
|
385
|
-
end
|
|
386
|
-
|
|
387
|
-
def read_manifest(path)
|
|
388
|
-
return nil unless path.exist?
|
|
389
|
-
|
|
390
|
-
JSON.parse(path.read)
|
|
391
|
-
rescue JSON::ParserError
|
|
392
|
-
nil
|
|
393
|
-
end
|
|
394
156
|
end
|
|
395
157
|
end
|
|
396
|
-
end
|
|
158
|
+
end
|