ucode 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +72 -0
- data/Gemfile.lock +2 -2
- data/TODO.full/00-README.md +116 -0
- data/TODO.full/01-panglyph-vision.md +112 -0
- data/TODO.full/02-panglyph-repo-bootstrap.md +184 -0
- data/TODO.full/03-panglyph-font-builder.md +201 -0
- data/TODO.full/04-panglyph-publish-pipeline.md +126 -0
- data/TODO.full/05-ucode-0-1-1-release.md +139 -0
- data/TODO.full/06-fontisan-remove-audit.md +142 -0
- data/TODO.full/07-fontisan-remove-ucd.md +125 -0
- data/TODO.full/08-archive-private-bin-build.md +143 -0
- data/TODO.full/09-archive-public-structure.md +164 -0
- data/TODO.full/10-fontist-org-woff-glyphs.md +131 -0
- data/TODO.full/11-fontist-org-audit-coverage.md +140 -0
- data/TODO.full/12-implementation-order.md +216 -0
- data/TODO.full/13-fontisan-font-writer-api.md +189 -0
- data/TODO.full/14-fontisan-table-writers.md +66 -0
- data/TODO.full/15-panglyph-builder-real.md +82 -0
- data/TODO.full/16-archive-public-sync-workflows.md +167 -0
- data/TODO.full/17-fontist-org-font-picker.md +73 -0
- data/TODO.full/18-comprehensive-spec-coverage.md +64 -0
- data/TODO.full/19-ucode-0-1-2-patch.md +32 -0
- data/TODO.full/20-fontisan-0-2-23-release.md +52 -0
- data/TODO.new/00-README.md +30 -0
- data/TODO.new/23-universal-glyph-set-source-map.md +312 -0
- data/TODO.new/24-universal-glyph-set-build.md +189 -0
- data/TODO.new/25-font-audit-against-universal-set.md +195 -0
- data/TODO.new/26-missing-glyph-reporter.md +189 -0
- data/TODO.new/27-fontist-org-consumer-integration.md +200 -0
- data/TODO.new/28-implementation-order-update.md +187 -0
- data/TODO.new/29-universal-set-curation-uc17.md +312 -0
- data/TODO.new/30-tier1-font-acquisition.md +241 -0
- data/TODO.new/31-universal-set-production-build.md +205 -0
- data/TODO.new/32-uc17-coverage-matrix.md +165 -0
- data/TODO.new/33-specialist-font-acquisition-refresh.md +138 -0
- data/TODO.new/34-pillar2-content-stream-correlator.md +147 -0
- data/TODO.new/35-universal-set-production-run.md +160 -0
- data/TODO.new/36-per-font-coverage-audit.md +145 -0
- data/TODO.new/37-coverage-highlight-reporter.md +125 -0
- data/TODO.new/38-fontist-org-glyph-consumer.md +141 -0
- data/TODO.new/39-implementation-order-update-32-38.md +258 -0
- data/TODO.new/40-archive-private-uses-ucode-audit.md +124 -0
- data/TODO.new/41-ucode-unicode-archive-bridge.md +160 -0
- data/config/specialist_fonts.yml +102 -0
- data/config/unicode17_tier1_fonts.yml +42 -0
- data/config/unicode17_universal_glyph_set.yml +293 -0
- data/lib/ucode/audit/block_aggregator.rb +57 -29
- data/lib/ucode/audit/browser/face_page.rb +128 -0
- data/lib/ucode/audit/browser/glyph_panel.rb +124 -0
- data/lib/ucode/audit/browser/library_page.rb +74 -0
- data/lib/ucode/audit/browser/missing_glyph_page.rb +87 -0
- data/lib/ucode/audit/browser/template.rb +47 -0
- data/lib/ucode/audit/browser/templates/face.css +200 -0
- data/lib/ucode/audit/browser/templates/face.html.erb +41 -0
- data/lib/ucode/audit/browser/templates/face.js +298 -0
- data/lib/ucode/audit/browser/templates/library.css +119 -0
- data/lib/ucode/audit/browser/templates/library.html.erb +42 -0
- data/lib/ucode/audit/browser/templates/library.js +99 -0
- data/lib/ucode/audit/browser/templates/missing_glyph_page.css +119 -0
- data/lib/ucode/audit/browser/templates/missing_glyph_page.html.erb +58 -0
- data/lib/ucode/audit/browser/templates/missing_glyph_page.js +2 -0
- data/lib/ucode/audit/browser.rb +32 -0
- data/lib/ucode/audit/context.rb +27 -1
- data/lib/ucode/audit/coverage_reference.rb +103 -0
- data/lib/ucode/audit/differ.rb +121 -0
- data/lib/ucode/audit/emitter/block_emitter.rb +52 -0
- data/lib/ucode/audit/emitter/codepoint_emitter.rb +87 -0
- data/lib/ucode/audit/emitter/collection_emitter.rb +80 -0
- data/lib/ucode/audit/emitter/face_directory.rb +212 -0
- data/lib/ucode/audit/emitter/glyph_emitter.rb +48 -0
- data/lib/ucode/audit/emitter/index_emitter.rb +149 -0
- data/lib/ucode/audit/emitter/library_emitter.rb +96 -0
- data/lib/ucode/audit/emitter/paths.rb +312 -0
- data/lib/ucode/audit/emitter/plane_emitter.rb +29 -0
- data/lib/ucode/audit/emitter/script_emitter.rb +29 -0
- data/lib/ucode/audit/emitter.rb +29 -0
- data/lib/ucode/audit/extractors/aggregations.rb +31 -2
- data/lib/ucode/audit/face_auditor.rb +86 -0
- data/lib/ucode/audit/formatters/audit_diff_text.rb +112 -0
- data/lib/ucode/audit/formatters/audit_text.rb +411 -0
- data/lib/ucode/audit/formatters/color.rb +48 -0
- data/lib/ucode/audit/formatters/library_summary_text.rb +98 -0
- data/lib/ucode/audit/formatters/text_formatter.rb +83 -0
- data/lib/ucode/audit/formatters.rb +23 -0
- data/lib/ucode/audit/library_aggregator.rb +86 -0
- data/lib/ucode/audit/library_auditor.rb +105 -0
- data/lib/ucode/audit/release/emitter.rb +152 -0
- data/lib/ucode/audit/release/face_card.rb +93 -0
- data/lib/ucode/audit/release/formula_audits.rb +50 -0
- data/lib/ucode/audit/release/library_index_builder.rb +78 -0
- data/lib/ucode/audit/release/manifest_builder.rb +127 -0
- data/lib/ucode/audit/release.rb +42 -0
- data/lib/ucode/audit/ucd_only_reference.rb +81 -0
- data/lib/ucode/audit/universal_set_reference.rb +136 -0
- data/lib/ucode/audit.rb +31 -0
- data/lib/ucode/cli.rb +339 -33
- data/lib/ucode/commands/audit/browser_command.rb +82 -0
- data/lib/ucode/commands/audit/collection_command.rb +103 -0
- data/lib/ucode/commands/audit/compare_command.rb +188 -0
- data/lib/ucode/commands/audit/font_command.rb +140 -0
- data/lib/ucode/commands/audit/library_command.rb +87 -0
- data/lib/ucode/commands/audit/reference_builder.rb +64 -0
- data/lib/ucode/commands/audit.rb +20 -0
- data/lib/ucode/commands/block_feed.rb +73 -0
- data/lib/ucode/commands/canonical_build.rb +138 -0
- data/lib/ucode/commands/fetch.rb +37 -1
- data/lib/ucode/commands/release.rb +115 -0
- data/lib/ucode/commands/universal_set.rb +211 -0
- data/lib/ucode/commands.rb +5 -0
- data/lib/ucode/coordinator/indices.rb +11 -0
- data/lib/ucode/coordinator.rb +138 -5
- data/lib/ucode/error.rb +30 -2
- data/lib/ucode/fetch/font_fetcher/result.rb +39 -0
- data/lib/ucode/fetch/font_fetcher.rb +16 -0
- data/lib/ucode/fetch/specialist_font_fetcher.rb +280 -0
- data/lib/ucode/fetch.rb +7 -3
- data/lib/ucode/glyphs/real_fonts/cmap_cache.rb +74 -0
- data/lib/ucode/glyphs/real_fonts.rb +1 -0
- data/lib/ucode/glyphs/resolver.rb +62 -0
- data/lib/ucode/glyphs/source.rb +48 -0
- data/lib/ucode/glyphs/source_builder.rb +61 -0
- data/lib/ucode/glyphs/source_config/coverage_assertion.rb +79 -0
- data/lib/ucode/glyphs/source_config/gap_report.rb +54 -0
- data/lib/ucode/glyphs/source_config.rb +104 -0
- data/lib/ucode/glyphs/sources/pillar1_embedded_tounicode.rb +63 -0
- data/lib/ucode/glyphs/sources/pillar3_last_resort.rb +51 -0
- data/lib/ucode/glyphs/sources/tier1_real_font.rb +104 -0
- data/lib/ucode/glyphs/sources.rb +20 -0
- data/lib/ucode/glyphs/universal_set/builder.rb +161 -0
- data/lib/ucode/glyphs/universal_set/coverage_report.rb +139 -0
- data/lib/ucode/glyphs/universal_set/idempotency.rb +86 -0
- data/lib/ucode/glyphs/universal_set/manifest_accumulator.rb +195 -0
- data/lib/ucode/glyphs/universal_set/manifest_writer.rb +61 -0
- data/lib/ucode/glyphs/universal_set/pre_build_check.rb +197 -0
- data/lib/ucode/glyphs/universal_set/validator.rb +204 -0
- data/lib/ucode/glyphs/universal_set.rb +45 -0
- data/lib/ucode/glyphs.rb +6 -0
- data/lib/ucode/models/audit/baseline.rb +6 -0
- data/lib/ucode/models/audit/block_summary.rb +7 -0
- data/lib/ucode/models/audit/codepoint_provenance.rb +39 -0
- data/lib/ucode/models/audit/release_face.rb +42 -0
- data/lib/ucode/models/audit/release_formula.rb +33 -0
- data/lib/ucode/models/audit/release_manifest.rb +43 -0
- data/lib/ucode/models/audit/release_universal_set.rb +37 -0
- data/lib/ucode/models/audit.rb +9 -0
- data/lib/ucode/models/block.rb +2 -0
- data/lib/ucode/models/build_report.rb +109 -0
- data/lib/ucode/models/codepoint/glyph.rb +42 -0
- data/lib/ucode/models/codepoint.rb +3 -0
- data/lib/ucode/models/glyph_source.rb +86 -0
- data/lib/ucode/models/glyph_source_map.rb +138 -0
- data/lib/ucode/models/specialist_font.rb +70 -0
- data/lib/ucode/models/specialist_font_manifest.rb +48 -0
- data/lib/ucode/models/unihan_entry.rb +81 -9
- data/lib/ucode/models/unihan_field.rb +21 -0
- data/lib/ucode/models/universal_set_entry.rb +47 -0
- data/lib/ucode/models/universal_set_manifest.rb +78 -0
- data/lib/ucode/models/validation_report.rb +99 -0
- data/lib/ucode/models.rb +9 -0
- data/lib/ucode/parsers/named_sequences.rb +5 -5
- data/lib/ucode/parsers/unihan.rb +50 -19
- data/lib/ucode/repo/aggregate_writer.rb +34 -2
- data/lib/ucode/repo/block_feed_emitter.rb +153 -0
- data/lib/ucode/repo/build_report_accumulator.rb +138 -0
- data/lib/ucode/repo/build_report_writer.rb +46 -0
- data/lib/ucode/repo/build_validator.rb +229 -0
- data/lib/ucode/repo/codepoint_writer.rb +50 -1
- data/lib/ucode/repo/paths.rb +8 -0
- data/lib/ucode/repo.rb +4 -0
- data/lib/ucode/version.rb +1 -1
- data/schema/block-feed.output.schema.yml +134 -0
- metadata +143 -2
- data/ucode.gemspec +0 -56
|
@@ -2,21 +2,93 @@
|
|
|
2
2
|
|
|
3
3
|
require "lutaml/model"
|
|
4
4
|
|
|
5
|
+
require "ucode/models/unihan_field"
|
|
6
|
+
|
|
5
7
|
module Ucode
|
|
6
8
|
module Models
|
|
7
|
-
# Unihan dictionary data for CJK codepoints
|
|
8
|
-
#
|
|
9
|
-
#
|
|
9
|
+
# Unihan dictionary data for CJK codepoints, grouped into the 8
|
|
10
|
+
# categories defined by the Unihan standard. Each category
|
|
11
|
+
# corresponds to one Unihan file:
|
|
12
|
+
#
|
|
13
|
+
# Unihan_DictionaryIndices.txt → dictionary_indices
|
|
14
|
+
# Unihan_DictionaryLikeData.txt → dictionary_like_data
|
|
15
|
+
# Unihan_IRGSources.txt → irg_sources
|
|
16
|
+
# Unihan_NumericValues.txt → numeric_values
|
|
17
|
+
# Unihan_RadicalStrokeCounts.txt → radical_stroke_counts
|
|
18
|
+
# Unihan_Readings.txt → readings
|
|
19
|
+
# Unihan_Variants.txt → variants
|
|
20
|
+
# Unihan_OtherMappings.txt → other_mappings
|
|
10
21
|
#
|
|
11
|
-
#
|
|
12
|
-
# is
|
|
13
|
-
#
|
|
14
|
-
#
|
|
22
|
+
# Each category attribute is a collection of {UnihanField} records.
|
|
23
|
+
# Category is set at parse time from the source filename (via
|
|
24
|
+
# `FILE_TO_CATEGORY`) — Unicode does not reorganize files across
|
|
25
|
+
# versions, so this is stable without per-field hardcoding.
|
|
15
26
|
class UnihanEntry < Lutaml::Model::Serializable
  # Symbol → attribute name. Mirrors the 8 Unihan files; every
  # category is stored in an attribute of the same name.
  CATEGORIES = %i[
    dictionary_indices
    dictionary_like_data
    irg_sources
    numeric_values
    radical_stroke_counts
    readings
    variants
    other_mappings
  ].map { |category| [category, category] }.to_h.freeze

  # Filename → category symbol. Used by the parser to bucket
  # records without callers needing to know the mapping.
  FILE_TO_CATEGORY = {
    "Unihan_DictionaryIndices.txt" => :dictionary_indices,
    "Unihan_DictionaryLikeData.txt" => :dictionary_like_data,
    "Unihan_IRGSources.txt" => :irg_sources,
    "Unihan_NumericValues.txt" => :numeric_values,
    "Unihan_RadicalStrokeCounts.txt" => :radical_stroke_counts,
    "Unihan_Readings.txt" => :readings,
    "Unihan_Variants.txt" => :variants,
    "Unihan_OtherMappings.txt" => :other_mappings,
  }.freeze

  # One collection of UnihanField records per category, declared in
  # CATEGORIES order and empty by default.
  CATEGORIES.each_value do |attr_name|
    attribute attr_name, UnihanField, collection: true, default: -> { [] }
  end

  # Appends one field to the bucket for `category`. Used by the
  # Coordinator when streaming records from the parser. A category
  # that is not a key of CATEGORIES is ignored: nothing is stored
  # and nil is returned.
  #
  # @param category [Symbol] one of CATEGORIES keys
  # @param name [String] e.g. "kMandarin"
  # @param values [Array<String>] space-split values from Unihan
  def add(category, name, values)
    return unless CATEGORIES.key?(category)

    bucket = public_send(CATEGORIES[category])
    bucket << UnihanField.new(name: name, values: values)
  end

  # True if any category has data.
  def any?
    CATEGORIES.each_key do |category|
      return true unless public_send(category).empty?
    end
    false
  end

  # All fields across every category, flattened to {name => values}.
  # Iteration helper for consumers that want a flat view (search
  # indexing, downstream filtering). When the same field name occurs
  # more than once, the last occurrence wins.
  #
  # @return [Hash{String => Array<String>}]
  def all_fields
    flat = {}
    CATEGORIES.each_key do |category|
      public_send(category).each do |field|
        flat[field.name] = field.values
      end
    end
    flat
  end

  key_value do
    CATEGORIES.each_pair { |symbol, attr_name| map attr_name, to: symbol }
  end
end
|
|
22
94
|
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "lutaml/model"
|
|
4
|
+
|
|
5
|
+
module Ucode
|
|
6
|
+
module Models
|
|
7
|
+
# One Unihan field assignment: a k-field name plus its space-split
|
|
8
|
+
# values. e.g. `kMandarin → ["jìng"]`, `kHanyuPinyin → ["64047.030:jìng"]`.
|
|
9
|
+
# The values list is uniform across all Unihan fields — even single-valued
|
|
10
|
+
# ones are arrays, which simplifies consumer logic.
|
|
11
|
+
class UnihanField < Lutaml::Model::Serializable
  # The k-field name, e.g. "kMandarin".
  attribute :name, :string
  # The field's values; always a list, empty by default.
  attribute :values, :string, collection: true, default: -> { [] }

  # Wire keys are identical to the attribute names.
  key_value do
    %w[name values].each { |key| map key, to: key.to_sym }
  end
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "lutaml/model"
|
|
4
|
+
|
|
5
|
+
module Ucode
|
|
6
|
+
module Models
|
|
7
|
+
# One row in a {UniversalSetManifest}. Records the resolved glyph
|
|
8
|
+
# for a single codepoint: which tier produced it, which source
|
|
9
|
+
# font, and a stable content hash + size so downstream consumers
|
|
10
|
+
# can detect changes without re-reading the SVG.
|
|
11
|
+
#
|
|
12
|
+
# Wire shape (one entry per assigned codepoint in the manifest's
|
|
13
|
+
# `entries:` array):
|
|
14
|
+
#
|
|
15
|
+
# {
|
|
16
|
+
# "codepoint": 65,
|
|
17
|
+
# "id": "U+0041",
|
|
18
|
+
# "tier": "tier-1",
|
|
19
|
+
# "source": "noto-sans",
|
|
20
|
+
# "svg_sha256": "abc...",
|
|
21
|
+
# "svg_size_bytes": 412
|
|
22
|
+
# }
|
|
23
|
+
#
|
|
24
|
+
# `source` is the source identifier extracted from the resolver
|
|
25
|
+
# {Ucode::Glyphs::Source::Result#provenance} — i.e. the part after
|
|
26
|
+
# the `tier:` prefix ("noto-sans" for "tier-1:noto-sans"). This is
|
|
27
|
+
# what audits (TODO 25) group by when answering "how many
|
|
28
|
+
# codepoints does font X cover in this set?".
|
|
29
|
+
class UniversalSetEntry < Lutaml::Model::Serializable
  attribute :codepoint, :integer
  %i[id tier source svg_sha256].each { |field| attribute field, :string }
  attribute :svg_size_bytes, :integer, default: 0

  # Wire keys are identical to the attribute names and are emitted in
  # declaration order.
  key_value do
    %w[codepoint id tier source svg_sha256 svg_size_bytes].each do |key|
      map key, to: key.to_sym
    end
  end
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "lutaml/model"
|
|
4
|
+
|
|
5
|
+
require "ucode/models/universal_set_entry"
|
|
6
|
+
|
|
7
|
+
module Ucode
|
|
8
|
+
module Models
|
|
9
|
+
# Manifest emitted at the end of a universal glyph set build
|
|
10
|
+
# (TODO 24). The single index into the set: every codepoint that
|
|
11
|
+
# was attempted gets one {UniversalSetEntry}, and the totals +
|
|
12
|
+
# per-tier rollups let consumers (audits, fontist.org) answer
|
|
13
|
+
# "what does this set cover?" without reading every SVG.
|
|
14
|
+
#
|
|
15
|
+
# Wire shape:
|
|
16
|
+
#
|
|
17
|
+
# {
|
|
18
|
+
# "unicode_version": "17.0.0",
|
|
19
|
+
# "ucode_version": "0.2.0",
|
|
20
|
+
# "generated_at": "2026-06-28T00:00:00Z",
|
|
21
|
+
# "source_config_sha256": "abc...",
|
|
22
|
+
# "totals": {
|
|
23
|
+
# "codepoints_assigned": 150012,
|
|
24
|
+
# "codepoints_built": 150012,
|
|
25
|
+
# "codepoints_skipped": 0,
|
|
26
|
+
# "codepoints_failed": 0
|
|
27
|
+
# },
|
|
28
|
+
# "by_tier": {
|
|
29
|
+
# "tier-1": 147512, "pillar-1": 800,
|
|
30
|
+
# "pillar-2": 200, "pillar-3": 1500
|
|
31
|
+
# },
|
|
32
|
+
# "entries": [ { ... UniversalSetEntry ... }, ... ]
|
|
33
|
+
# }
|
|
34
|
+
#
|
|
35
|
+
# `source_config_sha256` pins which Tier 1 source map produced
|
|
36
|
+
# this set. Audits use it to detect drift between the reference
|
|
37
|
+
# set and the config they were validated against.
|
|
38
|
+
#
|
|
39
|
+
# This class is passive — accumulation logic lives in
|
|
40
|
+
# {Ucode::Glyphs::UniversalSet::ManifestAccumulator}; this class
|
|
41
|
+
# only describes the wire shape and handles (de)serialization via
|
|
42
|
+
# lutaml-model.
|
|
43
|
+
class UniversalSetManifest < Lutaml::Model::Serializable
  # Total counts for one build run. Every counter defaults to 0.
  class Totals < Lutaml::Model::Serializable
    %i[
      codepoints_assigned
      codepoints_built
      codepoints_skipped
      codepoints_failed
    ].each { |counter| attribute counter, :integer, default: 0 }

    key_value do
      %w[
        codepoints_assigned
        codepoints_built
        codepoints_skipped
        codepoints_failed
      ].each { |key| map key, to: key.to_sym }
    end
  end

  %i[unicode_version ucode_version generated_at source_config_sha256].each do |field|
    attribute field, :string
  end
  attribute :totals, Totals
  attribute :by_tier, :hash, default: -> { {} }
  attribute :entries, UniversalSetEntry, collection: true, default: -> { [] }

  # Wire keys are identical to the attribute names and are emitted in
  # declaration order.
  key_value do
    %w[
      unicode_version
      ucode_version
      generated_at
      source_config_sha256
      totals
      by_tier
      entries
    ].each { |key| map key, to: key.to_sym }
  end
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "lutaml/model"
|
|
4
|
+
|
|
5
|
+
module Ucode
|
|
6
|
+
module Models
|
|
7
|
+
# Post-build validation report (TODO 21 §Validation). Emitted as
|
|
8
|
+
# `output/validation-report.json` by {Ucode::Repo::BuildValidator}
|
|
9
|
+
# after a canonical build run. Records the outcome of the four
|
|
10
|
+
# automated validation checks:
|
|
11
|
+
#
|
|
12
|
+
# 1. `completeness` — every codepoint folder has both
|
|
13
|
+
# `index.json` and `glyph.svg`.
|
|
14
|
+
# 2. `schema` — every `index.json` deserializes via
|
|
15
|
+
# `Ucode::Models::CodePoint.from_hash`.
|
|
16
|
+
# 3. `provenance_sanity` — every deserialized CodePoint carries
|
|
17
|
+
# a non-nil `glyph.source.tier`.
|
|
18
|
+
# 4. `block_coverage` — per-block built count matches the
|
|
19
|
+
# baseline (skipped when no baseline is supplied).
|
|
20
|
+
#
|
|
21
|
+
# The fifth TODO 21 check (sample inspection) is manual and out
|
|
22
|
+
# of scope for the automated validator.
|
|
23
|
+
#
|
|
24
|
+
# Like {BuildReport}, this model is passive: the accumulation
|
|
25
|
+
# logic lives in {Ucode::Repo::BuildValidator}; this class only
|
|
26
|
+
# describes the wire shape and handles (de)serialization.
|
|
27
|
+
class ValidationReport < Lutaml::Model::Serializable
  # Aggregate pass/fail counts for the run. Every counter defaults to 0.
  class Totals < Lutaml::Model::Serializable
    %i[codepoints_checked failures checks_run checks_passed].each do |counter|
      attribute counter, :integer, default: 0
    end

    key_value do
      %w[codepoints_checked failures checks_run checks_passed].each do |key|
        map key, to: key.to_sym
      end
    end
  end

  # Per-check summary. `status` is one of `passed` / `failed` /
  # `skipped`. `total` is the number of codepoints the check
  # evaluated against (0 for `skipped`). `failures` is the count
  # of recorded failures for this check.
  class CheckSummary < Lutaml::Model::Serializable
    STATUS_PASSED = "passed"
    STATUS_FAILED = "failed"
    STATUS_SKIPPED = "skipped"

    %i[name status].each { |field| attribute field, :string }
    %i[total failures].each { |counter| attribute counter, :integer, default: 0 }

    key_value do
      %w[name status total failures].each { |key| map key, to: key.to_sym }
    end
  end

  # One failure record. `codepoint` is the integer codepoint (or
  # nil for structural failures like block_coverage); `block` is
  # the verbatim block id (folder name); `check` names the check
  # that produced this failure; `message` is a free-form
  # human-readable explanation.
  class Failure < Lutaml::Model::Serializable
    attribute :codepoint, :integer
    %i[block check message].each { |field| attribute field, :string }

    key_value do
      %w[codepoint block check message].each { |key| map key, to: key.to_sym }
    end
  end

  %i[unicode_version generated_at].each { |field| attribute field, :string }
  attribute :totals, Totals
  attribute :checks, CheckSummary, collection: true, default: -> { [] }
  attribute :failures, Failure, collection: true, default: -> { [] }

  # Wire keys are identical to the attribute names and are emitted in
  # declaration order.
  key_value do
    %w[unicode_version generated_at totals checks failures].each do |key|
      map key, to: key.to_sym
    end
  end
end
|
|
98
|
+
end
|
|
99
|
+
end
|
data/lib/ucode/models.rb
CHANGED
|
@@ -31,6 +31,7 @@ module Ucode
|
|
|
31
31
|
autoload :Script, "ucode/models/script"
|
|
32
32
|
autoload :CodePoint, "ucode/models/codepoint"
|
|
33
33
|
autoload :UnihanEntry, "ucode/models/unihan_entry"
|
|
34
|
+
autoload :UnihanField, "ucode/models/unihan_field"
|
|
34
35
|
autoload :NamesListEntry, "ucode/models/names_list_entry"
|
|
35
36
|
autoload :NameAlias, "ucode/models/name_alias"
|
|
36
37
|
autoload :NamedSequence, "ucode/models/named_sequence"
|
|
@@ -43,5 +44,13 @@ module Ucode
|
|
|
43
44
|
autoload :BinaryPropertyAssignment, "ucode/models/binary_property_assignment"
|
|
44
45
|
autoload :Relationship, "ucode/models/relationship"
|
|
45
46
|
autoload :Audit, "ucode/models/audit"
|
|
47
|
+
autoload :BuildReport, "ucode/models/build_report"
|
|
48
|
+
autoload :ValidationReport, "ucode/models/validation_report"
|
|
49
|
+
autoload :GlyphSource, "ucode/models/glyph_source"
|
|
50
|
+
autoload :GlyphSourceMap, "ucode/models/glyph_source_map"
|
|
51
|
+
autoload :SpecialistFont, "ucode/models/specialist_font"
|
|
52
|
+
autoload :SpecialistFontManifest, "ucode/models/specialist_font_manifest"
|
|
53
|
+
autoload :UniversalSetEntry, "ucode/models/universal_set_entry"
|
|
54
|
+
autoload :UniversalSetManifest, "ucode/models/universal_set_manifest"
|
|
46
55
|
end
|
|
47
56
|
end
|
|
@@ -8,10 +8,10 @@ module Ucode
|
|
|
8
8
|
# Parses `NamedSequences.txt` — named multi-codepoint sequences.
|
|
9
9
|
#
|
|
10
10
|
# Format (UAX #44):
|
|
11
|
-
# cp1 cp2 cp3
|
|
11
|
+
# Name; cp1 cp2 cp3 ...
|
|
12
12
|
#
|
|
13
|
-
# The first field is
|
|
14
|
-
#
|
|
13
|
+
# The first field is the human-readable name; the second is a
|
|
14
|
+
# space-separated list of hex codepoints.
|
|
15
15
|
class NamedSequences < Base
|
|
16
16
|
class << self
|
|
17
17
|
# Yields one NamedSequence per non-comment line. Returns a lazy
|
|
@@ -23,8 +23,8 @@ module Ucode
|
|
|
23
23
|
fields = line.fields
|
|
24
24
|
next if fields.length < 2
|
|
25
25
|
|
|
26
|
-
|
|
27
|
-
|
|
26
|
+
name = fields[0]
|
|
27
|
+
sequence_field = fields[1]
|
|
28
28
|
next if name.nil? || name.empty?
|
|
29
29
|
|
|
30
30
|
yield Models::NamedSequence.new(
|
data/lib/ucode/parsers/unihan.rb
CHANGED
|
@@ -37,12 +37,32 @@ module Ucode
|
|
|
37
37
|
Unihan_OtherMappings.txt
|
|
38
38
|
].freeze
|
|
39
39
|
|
|
40
|
+
# Filename → category symbol. The parser tags every Record
|
|
41
|
+
# with the category derived from its source file, so consumers
|
|
42
|
+
# (Coordinator → UnihanEntry) don't need to know the mapping.
|
|
43
|
+
# Unicode does not reorganize files across versions, so this
|
|
44
|
+
# mapping is stable without per-field hardcoding.
|
|
45
|
+
FILE_TO_CATEGORY = {
|
|
46
|
+
"Unihan_DictionaryIndices.txt" => :dictionary_indices,
|
|
47
|
+
"Unihan_DictionaryLikeData.txt" => :dictionary_like_data,
|
|
48
|
+
"Unihan_IRGSources.txt" => :irg_sources,
|
|
49
|
+
"Unihan_NumericValues.txt" => :numeric_values,
|
|
50
|
+
"Unihan_RadicalStrokeCounts.txt" => :radical_stroke_counts,
|
|
51
|
+
"Unihan_Readings.txt" => :readings,
|
|
52
|
+
"Unihan_Variants.txt" => :variants,
|
|
53
|
+
"Unihan_OtherMappings.txt" => :other_mappings,
|
|
54
|
+
}.freeze
|
|
55
|
+
|
|
40
56
|
# Stream record: one Unihan line. Internal pipeline data — a Struct
|
|
41
57
|
# avoids lutaml-model ceremony for transient values. The final
|
|
42
58
|
# `UnihanEntry` model carries the merged, persisted shape. The
|
|
43
59
|
# member is `field_values` (not `values`) to avoid overriding
|
|
44
60
|
# `Struct#values` (the array of all member values).
|
|
45
|
-
|
|
61
|
+
#
|
|
62
|
+
# `category` is the symbol UnihanEntry uses to bucket the field
|
|
63
|
+
# into its category attribute (readings / variants / etc.). Set
|
|
64
|
+
# by `each_in_dir` from the source filename via FILE_TO_CATEGORY.
|
|
65
|
+
Record = Struct.new(:cp, :field, :field_values, :category, keyword_init: true) do
|
|
46
66
|
def cp_id
|
|
47
67
|
format("U+%04X", cp)
|
|
48
68
|
end
|
|
@@ -50,25 +70,16 @@ module Ucode
|
|
|
50
70
|
|
|
51
71
|
class << self
|
|
52
72
|
# Yields one Record per non-comment line in a single Unihan file.
|
|
53
|
-
#
|
|
54
|
-
|
|
55
|
-
|
|
73
|
+
# The category comes from `filename:` (default: the basename of `path`);
|
|
74
|
+
# unknown names give a nil category. Returns a lazy Enumerator when no block is given.
|
|
75
|
+
def each_record(path, filename: nil)
|
|
76
|
+
return enum_for(:each_record, path, filename: filename) unless block_given?
|
|
56
77
|
|
|
57
78
|
path_str = path.to_s
|
|
58
|
-
|
|
79
|
+
category = FILE_TO_CATEGORY.fetch(filename || File.basename(path_str), nil)
|
|
59
80
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
line = raw.chomp
|
|
63
|
-
next if line.empty? || line.start_with?("#")
|
|
64
|
-
|
|
65
|
-
begin
|
|
66
|
-
yield parse_line(line)
|
|
67
|
-
rescue MalformedLineError => e
|
|
68
|
-
e.context[:file] ||= path_str
|
|
69
|
-
e.context[:line] ||= lineno
|
|
70
|
-
raise
|
|
71
|
-
end
|
|
81
|
+
each_line_with_lineno(path_str) do |line, lineno|
|
|
82
|
+
yield tagged_record(line, category, path_str, lineno)
|
|
72
83
|
end
|
|
73
84
|
|
|
74
85
|
nil
|
|
@@ -76,7 +87,8 @@ module Ucode
|
|
|
76
87
|
|
|
77
88
|
# Iterates every known Unihan file in `dir`, yielding one Record
|
|
78
89
|
# per data line across all files. Missing files are silently
|
|
79
|
-
# skipped (incremental runs, partial downloads).
|
|
90
|
+
# skipped (incremental runs, partial downloads). Each Record
|
|
91
|
+
# carries its category so callers don't need to re-derive it.
|
|
80
92
|
def each_in_dir(dir)
|
|
81
93
|
return enum_for(:each_in_dir, dir) unless block_given?
|
|
82
94
|
|
|
@@ -85,7 +97,7 @@ module Ucode
|
|
|
85
97
|
path = dir_path.join(filename)
|
|
86
98
|
next unless path.exist?
|
|
87
99
|
|
|
88
|
-
each_record(path) { |record| yield record }
|
|
100
|
+
each_record(path, filename: filename) { |record| yield record }
|
|
89
101
|
end
|
|
90
102
|
|
|
91
103
|
nil
|
|
@@ -93,6 +105,25 @@ module Ucode
|
|
|
93
105
|
|
|
94
106
|
private
|
|
95
107
|
|
|
108
|
+
# Streams the data lines of one Unihan file.
#
# Blank lines and lines starting with `#` are skipped but still
# counted, so the yielded number is the 1-based physical line number
# in the file (useful for error context).
#
# The file is always decoded as UTF-8, the encoding of the Unihan
# data files, instead of the process's default external encoding.
# Without this, a C/POSIX locale tags every line as US-ASCII and
# non-ASCII values (e.g. "jìng") become invalid strings.
#
# @param path_str [String] path of the file to read
# @yieldparam line [String] the line without its line terminator
# @yieldparam lineno [Integer] 1-based line number of `line`
def each_line_with_lineno(path_str)
  lineno = 0
  File.foreach(path_str, encoding: "UTF-8") do |raw|
    lineno += 1
    line = raw.chomp
    next if line.empty? || line.start_with?("#")

    yield line, lineno
  end
end
|
|
118
|
+
|
|
119
|
+
# Parses one data line and stamps the resulting record with
# `category`. If the line is malformed, the error's context is
# filled in with the file path and line number (unless already
# set) and the error is re-raised.
def tagged_record(line, category, path_str, lineno)
  record = parse_line(line)
  record.category = category
  record
rescue MalformedLineError => error
  error.context[:file] ||= path_str
  error.context[:line] ||= lineno
  raise
end
|
|
126
|
+
|
|
96
127
|
# Parses one TAB-separated Unihan data line into a Record. The
|
|
97
128
|
# `split("\t", 3)` limit preserves any tabs inside the value
|
|
98
129
|
# (defensive — real Unihan data does not contain them).
|
|
@@ -17,7 +17,7 @@ module Ucode
|
|
|
17
17
|
# output/blocks/index.json (block index)
|
|
18
18
|
# output/scripts/<code>.json
|
|
19
19
|
# output/index/names.json (cp_id → name)
|
|
20
|
-
# output/index/labels.json (cp_id → {name, gc, sc})
|
|
20
|
+
# output/index/labels.json (cp_id → {name, gc, sc, cc, bc, mir})
|
|
21
21
|
# output/index/codepoint_to_block.json (cp_id → block_id)
|
|
22
22
|
# output/relationships/*.json (per-property tables)
|
|
23
23
|
# output/enums.json (property aliases + value aliases)
|
|
@@ -79,6 +79,7 @@ module Ucode
|
|
|
79
79
|
def initialize(output_root)
|
|
80
80
|
@output_root = Pathname.new(output_root)
|
|
81
81
|
@block_codepoint_ids = Hash.new { |h, k| h[k] = [] }
|
|
82
|
+
@block_ages = Hash.new { |h, k| h[k] = nil }
|
|
82
83
|
@script_codepoint_ids = Hash.new { |h, k| h[k] = [] }
|
|
83
84
|
@names_index = {}
|
|
84
85
|
@labels_index = {}
|
|
@@ -94,6 +95,7 @@ module Ucode
|
|
|
94
95
|
return if cp.block_id.nil?
|
|
95
96
|
|
|
96
97
|
@block_codepoint_ids[cp.block_id] << cp.id
|
|
98
|
+
track_block_age(cp)
|
|
97
99
|
if cp.script_code
|
|
98
100
|
@script_codepoint_ids[cp.script_code] << cp.id
|
|
99
101
|
end
|
|
@@ -136,10 +138,37 @@ module Ucode
|
|
|
136
138
|
# ---- Per-codepoint accumulator helpers ---------------------------
|
|
137
139
|
|
|
138
140
|
# Builds the compact label record for one codepoint, as stored in
# the labels index.
#
# Keys: "name", "gc" (general category), "sc" (script code),
# "cc" (combining class), "bc" (bidi class) and "mir". Entries whose
# value is nil are dropped, so "mir" only appears (as true) when the
# codepoint's bidi data reports it as mirrored, and "bc"/"mir" are
# both absent when `cp.bidi` is nil.
#
# @param cp [#name, #general_category, #script_code, #combining_class, #bidi]
# @return [Hash{String => Object}] label without nil-valued entries
def build_label(cp)
  bidi = cp.bidi
  {
    "name" => cp.name,
    "gc" => cp.general_category,
    "sc" => cp.script_code,
    "cc" => cp.combining_class,
    "bc" => bidi&.bidi_class,
    "mir" => (bidi&.is_mirrored ? true : nil),
  }.compact
end
|
|
142
151
|
|
|
152
|
+
# Per-block `age` is the earliest DerivedAge of any codepoint in
# the block, compared as a Gem::Version. Stored as the original
# string (e.g. "1.1", "17.0.0"). nil when no codepoint in the
# block has an age (rare — only happens for entirely-reserved
# blocks, which the parser excludes anyway).
#
# Codepoints whose age is nil or empty leave the stored value
# untouched.
def track_block_age(cp)
  age = cp.age
  return if age.nil? || age.empty?

  block_id = cp.block_id
  known = @block_ages[block_id]
  @block_ages[block_id] = known.nil? ? age : min_age(known, age)
end
|
|
167
|
+
|
|
168
|
+
# Returns whichever of the two age strings is the earlier version
# when compared as Gem::Version. When neither is strictly earlier,
# `b` is returned.
def min_age(a, b)
  return a if Gem::Version.new(a) < Gem::Version.new(b)

  b
end
|
|
171
|
+
|
|
143
172
|
# ---- Plane files -------------------------------------------------
|
|
144
173
|
|
|
145
174
|
def write_planes(blocks)
|
|
@@ -176,6 +205,7 @@ module Ucode
|
|
|
176
205
|
|
|
177
206
|
def write_blocks(blocks)
|
|
178
207
|
count = blocks.sum do |block|
|
|
208
|
+
block.age = @block_ages[block.id]
|
|
179
209
|
path = Paths.block_metadata_path(@output_root, block.id)
|
|
180
210
|
write_atomic(path, block_payload(block)) ? 1 : 0
|
|
181
211
|
end
|
|
@@ -191,6 +221,7 @@ module Ucode
|
|
|
191
221
|
"first_cp" => block.range_first,
|
|
192
222
|
"last_cp" => block.range_last,
|
|
193
223
|
"plane_number" => block.plane_number,
|
|
224
|
+
"age" => @block_ages[block.id],
|
|
194
225
|
}
|
|
195
226
|
end
|
|
196
227
|
write_atomic(path, to_pretty_json(summary)) ? 1 : 0
|
|
@@ -203,6 +234,7 @@ module Ucode
|
|
|
203
234
|
"range_first" => block.range_first,
|
|
204
235
|
"range_last" => block.range_last,
|
|
205
236
|
"plane_number" => block.plane_number,
|
|
237
|
+
"age" => @block_ages[block.id],
|
|
206
238
|
"codepoint_ids" => (@block_codepoint_ids[block.id] || []),
|
|
207
239
|
)
|
|
208
240
|
end
|