ucode 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +72 -0
  3. data/Gemfile.lock +2 -2
  4. data/TODO.full/00-README.md +116 -0
  5. data/TODO.full/01-panglyph-vision.md +112 -0
  6. data/TODO.full/02-panglyph-repo-bootstrap.md +184 -0
  7. data/TODO.full/03-panglyph-font-builder.md +201 -0
  8. data/TODO.full/04-panglyph-publish-pipeline.md +126 -0
  9. data/TODO.full/05-ucode-0-1-1-release.md +139 -0
  10. data/TODO.full/06-fontisan-remove-audit.md +142 -0
  11. data/TODO.full/07-fontisan-remove-ucd.md +125 -0
  12. data/TODO.full/08-archive-private-bin-build.md +143 -0
  13. data/TODO.full/09-archive-public-structure.md +164 -0
  14. data/TODO.full/10-fontist-org-woff-glyphs.md +131 -0
  15. data/TODO.full/11-fontist-org-audit-coverage.md +140 -0
  16. data/TODO.full/12-implementation-order.md +216 -0
  17. data/TODO.full/13-fontisan-font-writer-api.md +189 -0
  18. data/TODO.full/14-fontisan-table-writers.md +66 -0
  19. data/TODO.full/15-panglyph-builder-real.md +82 -0
  20. data/TODO.full/16-archive-public-sync-workflows.md +167 -0
  21. data/TODO.full/17-fontist-org-font-picker.md +73 -0
  22. data/TODO.full/18-comprehensive-spec-coverage.md +64 -0
  23. data/TODO.full/19-ucode-0-1-2-patch.md +32 -0
  24. data/TODO.full/20-fontisan-0-2-23-release.md +52 -0
  25. data/TODO.new/00-README.md +30 -0
  26. data/TODO.new/23-universal-glyph-set-source-map.md +312 -0
  27. data/TODO.new/24-universal-glyph-set-build.md +189 -0
  28. data/TODO.new/25-font-audit-against-universal-set.md +195 -0
  29. data/TODO.new/26-missing-glyph-reporter.md +189 -0
  30. data/TODO.new/27-fontist-org-consumer-integration.md +200 -0
  31. data/TODO.new/28-implementation-order-update.md +187 -0
  32. data/TODO.new/29-universal-set-curation-uc17.md +312 -0
  33. data/TODO.new/30-tier1-font-acquisition.md +241 -0
  34. data/TODO.new/31-universal-set-production-build.md +205 -0
  35. data/TODO.new/32-uc17-coverage-matrix.md +165 -0
  36. data/TODO.new/33-specialist-font-acquisition-refresh.md +138 -0
  37. data/TODO.new/34-pillar2-content-stream-correlator.md +147 -0
  38. data/TODO.new/35-universal-set-production-run.md +160 -0
  39. data/TODO.new/36-per-font-coverage-audit.md +145 -0
  40. data/TODO.new/37-coverage-highlight-reporter.md +125 -0
  41. data/TODO.new/38-fontist-org-glyph-consumer.md +141 -0
  42. data/TODO.new/39-implementation-order-update-32-38.md +258 -0
  43. data/TODO.new/40-archive-private-uses-ucode-audit.md +124 -0
  44. data/TODO.new/41-ucode-unicode-archive-bridge.md +160 -0
  45. data/config/specialist_fonts.yml +102 -0
  46. data/config/unicode17_tier1_fonts.yml +42 -0
  47. data/config/unicode17_universal_glyph_set.yml +293 -0
  48. data/lib/ucode/audit/block_aggregator.rb +57 -29
  49. data/lib/ucode/audit/browser/face_page.rb +128 -0
  50. data/lib/ucode/audit/browser/glyph_panel.rb +124 -0
  51. data/lib/ucode/audit/browser/library_page.rb +74 -0
  52. data/lib/ucode/audit/browser/missing_glyph_page.rb +87 -0
  53. data/lib/ucode/audit/browser/template.rb +47 -0
  54. data/lib/ucode/audit/browser/templates/face.css +200 -0
  55. data/lib/ucode/audit/browser/templates/face.html.erb +41 -0
  56. data/lib/ucode/audit/browser/templates/face.js +298 -0
  57. data/lib/ucode/audit/browser/templates/library.css +119 -0
  58. data/lib/ucode/audit/browser/templates/library.html.erb +42 -0
  59. data/lib/ucode/audit/browser/templates/library.js +99 -0
  60. data/lib/ucode/audit/browser/templates/missing_glyph_page.css +119 -0
  61. data/lib/ucode/audit/browser/templates/missing_glyph_page.html.erb +58 -0
  62. data/lib/ucode/audit/browser/templates/missing_glyph_page.js +2 -0
  63. data/lib/ucode/audit/browser.rb +32 -0
  64. data/lib/ucode/audit/context.rb +27 -1
  65. data/lib/ucode/audit/coverage_reference.rb +103 -0
  66. data/lib/ucode/audit/differ.rb +121 -0
  67. data/lib/ucode/audit/emitter/block_emitter.rb +52 -0
  68. data/lib/ucode/audit/emitter/codepoint_emitter.rb +87 -0
  69. data/lib/ucode/audit/emitter/collection_emitter.rb +80 -0
  70. data/lib/ucode/audit/emitter/face_directory.rb +212 -0
  71. data/lib/ucode/audit/emitter/glyph_emitter.rb +48 -0
  72. data/lib/ucode/audit/emitter/index_emitter.rb +149 -0
  73. data/lib/ucode/audit/emitter/library_emitter.rb +96 -0
  74. data/lib/ucode/audit/emitter/paths.rb +312 -0
  75. data/lib/ucode/audit/emitter/plane_emitter.rb +29 -0
  76. data/lib/ucode/audit/emitter/script_emitter.rb +29 -0
  77. data/lib/ucode/audit/emitter.rb +29 -0
  78. data/lib/ucode/audit/extractors/aggregations.rb +31 -2
  79. data/lib/ucode/audit/face_auditor.rb +86 -0
  80. data/lib/ucode/audit/formatters/audit_diff_text.rb +112 -0
  81. data/lib/ucode/audit/formatters/audit_text.rb +411 -0
  82. data/lib/ucode/audit/formatters/color.rb +48 -0
  83. data/lib/ucode/audit/formatters/library_summary_text.rb +98 -0
  84. data/lib/ucode/audit/formatters/text_formatter.rb +83 -0
  85. data/lib/ucode/audit/formatters.rb +23 -0
  86. data/lib/ucode/audit/library_aggregator.rb +86 -0
  87. data/lib/ucode/audit/library_auditor.rb +105 -0
  88. data/lib/ucode/audit/release/emitter.rb +152 -0
  89. data/lib/ucode/audit/release/face_card.rb +93 -0
  90. data/lib/ucode/audit/release/formula_audits.rb +50 -0
  91. data/lib/ucode/audit/release/library_index_builder.rb +78 -0
  92. data/lib/ucode/audit/release/manifest_builder.rb +127 -0
  93. data/lib/ucode/audit/release.rb +42 -0
  94. data/lib/ucode/audit/ucd_only_reference.rb +81 -0
  95. data/lib/ucode/audit/universal_set_reference.rb +136 -0
  96. data/lib/ucode/audit.rb +31 -0
  97. data/lib/ucode/cli.rb +339 -33
  98. data/lib/ucode/commands/audit/browser_command.rb +82 -0
  99. data/lib/ucode/commands/audit/collection_command.rb +103 -0
  100. data/lib/ucode/commands/audit/compare_command.rb +188 -0
  101. data/lib/ucode/commands/audit/font_command.rb +140 -0
  102. data/lib/ucode/commands/audit/library_command.rb +87 -0
  103. data/lib/ucode/commands/audit/reference_builder.rb +64 -0
  104. data/lib/ucode/commands/audit.rb +20 -0
  105. data/lib/ucode/commands/block_feed.rb +73 -0
  106. data/lib/ucode/commands/canonical_build.rb +138 -0
  107. data/lib/ucode/commands/fetch.rb +37 -1
  108. data/lib/ucode/commands/release.rb +115 -0
  109. data/lib/ucode/commands/universal_set.rb +211 -0
  110. data/lib/ucode/commands.rb +5 -0
  111. data/lib/ucode/coordinator/indices.rb +11 -0
  112. data/lib/ucode/coordinator.rb +138 -5
  113. data/lib/ucode/error.rb +30 -2
  114. data/lib/ucode/fetch/font_fetcher/result.rb +39 -0
  115. data/lib/ucode/fetch/font_fetcher.rb +16 -0
  116. data/lib/ucode/fetch/specialist_font_fetcher.rb +280 -0
  117. data/lib/ucode/fetch.rb +7 -3
  118. data/lib/ucode/glyphs/real_fonts/cmap_cache.rb +74 -0
  119. data/lib/ucode/glyphs/real_fonts.rb +1 -0
  120. data/lib/ucode/glyphs/resolver.rb +62 -0
  121. data/lib/ucode/glyphs/source.rb +48 -0
  122. data/lib/ucode/glyphs/source_builder.rb +61 -0
  123. data/lib/ucode/glyphs/source_config/coverage_assertion.rb +79 -0
  124. data/lib/ucode/glyphs/source_config/gap_report.rb +54 -0
  125. data/lib/ucode/glyphs/source_config.rb +104 -0
  126. data/lib/ucode/glyphs/sources/pillar1_embedded_tounicode.rb +63 -0
  127. data/lib/ucode/glyphs/sources/pillar3_last_resort.rb +51 -0
  128. data/lib/ucode/glyphs/sources/tier1_real_font.rb +104 -0
  129. data/lib/ucode/glyphs/sources.rb +20 -0
  130. data/lib/ucode/glyphs/universal_set/builder.rb +161 -0
  131. data/lib/ucode/glyphs/universal_set/coverage_report.rb +139 -0
  132. data/lib/ucode/glyphs/universal_set/idempotency.rb +86 -0
  133. data/lib/ucode/glyphs/universal_set/manifest_accumulator.rb +195 -0
  134. data/lib/ucode/glyphs/universal_set/manifest_writer.rb +61 -0
  135. data/lib/ucode/glyphs/universal_set/pre_build_check.rb +197 -0
  136. data/lib/ucode/glyphs/universal_set/validator.rb +204 -0
  137. data/lib/ucode/glyphs/universal_set.rb +45 -0
  138. data/lib/ucode/glyphs.rb +6 -0
  139. data/lib/ucode/models/audit/baseline.rb +6 -0
  140. data/lib/ucode/models/audit/block_summary.rb +7 -0
  141. data/lib/ucode/models/audit/codepoint_provenance.rb +39 -0
  142. data/lib/ucode/models/audit/release_face.rb +42 -0
  143. data/lib/ucode/models/audit/release_formula.rb +33 -0
  144. data/lib/ucode/models/audit/release_manifest.rb +43 -0
  145. data/lib/ucode/models/audit/release_universal_set.rb +37 -0
  146. data/lib/ucode/models/audit.rb +9 -0
  147. data/lib/ucode/models/block.rb +2 -0
  148. data/lib/ucode/models/build_report.rb +109 -0
  149. data/lib/ucode/models/codepoint/glyph.rb +42 -0
  150. data/lib/ucode/models/codepoint.rb +3 -0
  151. data/lib/ucode/models/glyph_source.rb +86 -0
  152. data/lib/ucode/models/glyph_source_map.rb +138 -0
  153. data/lib/ucode/models/specialist_font.rb +70 -0
  154. data/lib/ucode/models/specialist_font_manifest.rb +48 -0
  155. data/lib/ucode/models/unihan_entry.rb +81 -9
  156. data/lib/ucode/models/unihan_field.rb +21 -0
  157. data/lib/ucode/models/universal_set_entry.rb +47 -0
  158. data/lib/ucode/models/universal_set_manifest.rb +78 -0
  159. data/lib/ucode/models/validation_report.rb +99 -0
  160. data/lib/ucode/models.rb +9 -0
  161. data/lib/ucode/parsers/named_sequences.rb +5 -5
  162. data/lib/ucode/parsers/unihan.rb +50 -19
  163. data/lib/ucode/repo/aggregate_writer.rb +34 -2
  164. data/lib/ucode/repo/block_feed_emitter.rb +153 -0
  165. data/lib/ucode/repo/build_report_accumulator.rb +138 -0
  166. data/lib/ucode/repo/build_report_writer.rb +46 -0
  167. data/lib/ucode/repo/build_validator.rb +229 -0
  168. data/lib/ucode/repo/codepoint_writer.rb +50 -1
  169. data/lib/ucode/repo/paths.rb +8 -0
  170. data/lib/ucode/repo.rb +4 -0
  171. data/lib/ucode/version.rb +1 -1
  172. data/schema/block-feed.output.schema.yml +134 -0
  173. metadata +143 -2
  174. data/ucode.gemspec +0 -56
@@ -2,21 +2,93 @@
2
2
 
3
3
  require "lutaml/model"
4
4
 
5
+ require "ucode/models/unihan_field"
6
+
5
7
  module Ucode
6
8
  module Models
7
- # Unihan dictionary data for CJK codepoints. Flat-hash design: every
8
- # `kFoo` field is a key in `fields`, with array values (Unihan fields
9
- # are space-separated lists; uniform arrays simplify the shape).
9
+ # Unihan dictionary data for CJK codepoints, grouped into the 8
10
+ # categories defined by the Unihan standard. Each category
11
+ # corresponds to one Unihan file:
12
+ #
13
+ # Unihan_DictionaryIndices.txt → dictionary_indices
14
+ # Unihan_DictionaryLikeData.txt → dictionary_like_data
15
+ # Unihan_IRGSources.txt → irg_sources
16
+ # Unihan_NumericValues.txt → numeric_values
17
+ # Unihan_RadicalStrokeCounts.txt → radical_stroke_counts
18
+ # Unihan_Readings.txt → readings
19
+ # Unihan_Variants.txt → variants
20
+ # Unihan_OtherMappings.txt → other_mappings
10
21
  #
11
- # The semantic grouping (readings / radicals / variants / sources / etc.)
12
- # is a presentation concern, derived client-side by prefix. The data
13
- # model stays open — Unihan adds fields across versions, and the hash
14
- # absorbs additions without model changes.
22
+ # Each category attribute is a collection of {UnihanField} records.
23
+ # Category is set at parse time from the source filename (via
24
+ # `FILE_TO_CATEGORY`) — Unicode does not reorganize files across
25
+ # versions, so this is stable without per-field hardcoding.
15
26
  class UnihanEntry < Lutaml::Model::Serializable
16
- attribute :fields, :hash, default: -> { {} }
27
+ # Symbol → attribute name. Mirrors the 8 Unihan files.
28
+ CATEGORIES = {
29
+ dictionary_indices: :dictionary_indices,
30
+ dictionary_like_data: :dictionary_like_data,
31
+ irg_sources: :irg_sources,
32
+ numeric_values: :numeric_values,
33
+ radical_stroke_counts: :radical_stroke_counts,
34
+ readings: :readings,
35
+ variants: :variants,
36
+ other_mappings: :other_mappings,
37
+ }.freeze
38
+
39
+ # Filename → category symbol. Used by the parser to bucket
40
+ # records without callers needing to know the mapping.
41
+ FILE_TO_CATEGORY = {
42
+ "Unihan_DictionaryIndices.txt" => :dictionary_indices,
43
+ "Unihan_DictionaryLikeData.txt" => :dictionary_like_data,
44
+ "Unihan_IRGSources.txt" => :irg_sources,
45
+ "Unihan_NumericValues.txt" => :numeric_values,
46
+ "Unihan_RadicalStrokeCounts.txt" => :radical_stroke_counts,
47
+ "Unihan_Readings.txt" => :readings,
48
+ "Unihan_Variants.txt" => :variants,
49
+ "Unihan_OtherMappings.txt" => :other_mappings,
50
+ }.freeze
51
+
52
+ attribute :dictionary_indices, UnihanField, collection: true, default: -> { [] }
53
+ attribute :dictionary_like_data, UnihanField, collection: true, default: -> { [] }
54
+ attribute :irg_sources, UnihanField, collection: true, default: -> { [] }
55
+ attribute :numeric_values, UnihanField, collection: true, default: -> { [] }
56
+ attribute :radical_stroke_counts, UnihanField, collection: true, default: -> { [] }
57
+ attribute :readings, UnihanField, collection: true, default: -> { [] }
58
+ attribute :variants, UnihanField, collection: true, default: -> { [] }
59
+ attribute :other_mappings, UnihanField, collection: true, default: -> { [] }
60
+
61
+ # Pushes a field into the right category bucket. Used by the
62
+ # Coordinator when streaming records from the parser.
63
+ #
64
+ # @param category [Symbol] one of CATEGORIES keys
65
+ # @param name [String] e.g. "kMandarin"
66
+ # @param values [Array<String>] space-split values from Unihan
67
+ def add(category, name, values)
68
+ attr_name = CATEGORIES.fetch(category) { return }
69
+ public_send(attr_name) << UnihanField.new(name: name, values: values)
70
+ end
71
+
72
+ # True if any category has data.
73
+ def any?
74
+ CATEGORIES.keys.any? { |sym| !public_send(sym).empty? }
75
+ end
76
+
77
+ # All fields across every category, flattened to {name => values}.
78
+ # Iteration helper for consumers that want a flat view (search
79
+ # indexing, downstream filtering).
80
+ #
81
+ # @return [Hash{String => Array<String>}]
82
+ def all_fields
83
+ CATEGORIES.keys.each_with_object({}) do |sym, h|
84
+ public_send(sym).each { |f| h[f.name] = f.values }
85
+ end
86
+ end
17
87
 
18
88
  key_value do
19
- map "fields", to: :fields
89
+ CATEGORIES.each do |symbol, attr_name|
90
+ map attr_name, to: symbol
91
+ end
20
92
  end
21
93
  end
22
94
  end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "lutaml/model"
4
+
5
+ module Ucode
6
+ module Models
7
+ # One Unihan field assignment: a k-field name plus its space-split
8
+ # values. e.g. `kMandarin → ["jìng"]`, `kHanyuPinyin → ["64047.030:jìng"]`.
9
+ # The values list is uniform across all Unihan fields — even single-valued
10
+ # ones are arrays, which simplifies consumer logic.
11
+ class UnihanField < Lutaml::Model::Serializable
12
+ attribute :name, :string
13
+ attribute :values, :string, collection: true, default: -> { [] }
14
+
15
+ key_value do
16
+ map "name", to: :name
17
+ map "values", to: :values
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "lutaml/model"
4
+
5
+ module Ucode
6
+ module Models
7
+ # One row in a {UniversalSetManifest}. Records the resolved glyph
8
+ # for a single codepoint: which tier produced it, which source
9
+ # font, and a stable content hash + size so downstream consumers
10
+ # can detect changes without re-reading the SVG.
11
+ #
12
+ # Wire shape (one entry per assigned codepoint in the manifest's
13
+ # `entries:` array):
14
+ #
15
+ # {
16
+ # "codepoint": 65,
17
+ # "id": "U+0041",
18
+ # "tier": "tier-1",
19
+ # "source": "noto-sans",
20
+ # "svg_sha256": "abc...",
21
+ # "svg_size_bytes": 412
22
+ # }
23
+ #
24
+ # `source` is the source identifier extracted from the resolver
25
+ # {Ucode::Glyphs::Source::Result#provenance} — i.e. the part after
26
+ # the `tier:` prefix ("noto-sans" for "tier-1:noto-sans"). This is
27
+ # what audits (TODO 25) group by when answering "how many
28
+ # codepoints does font X cover in this set?".
29
+ class UniversalSetEntry < Lutaml::Model::Serializable
30
+ attribute :codepoint, :integer
31
+ attribute :id, :string
32
+ attribute :tier, :string
33
+ attribute :source, :string
34
+ attribute :svg_sha256, :string
35
+ attribute :svg_size_bytes, :integer, default: 0
36
+
37
+ key_value do
38
+ map "codepoint", to: :codepoint
39
+ map "id", to: :id
40
+ map "tier", to: :tier
41
+ map "source", to: :source
42
+ map "svg_sha256", to: :svg_sha256
43
+ map "svg_size_bytes", to: :svg_size_bytes
44
+ end
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "lutaml/model"
4
+
5
+ require "ucode/models/universal_set_entry"
6
+
7
+ module Ucode
8
+ module Models
9
+ # Manifest emitted at the end of a universal glyph set build
10
+ # (TODO 24). The single index into the set: every codepoint that
11
+ # was attempted gets one {UniversalSetEntry}, and the totals +
12
+ # per-tier rollups let consumers (audits, fontist.org) answer
13
+ # "what does this set cover?" without reading every SVG.
14
+ #
15
+ # Wire shape:
16
+ #
17
+ # {
18
+ # "unicode_version": "17.0.0",
19
+ # "ucode_version": "0.2.0",
20
+ # "generated_at": "2026-06-28T00:00:00Z",
21
+ # "source_config_sha256": "abc...",
22
+ # "totals": {
23
+ # "codepoints_assigned": 150012,
24
+ # "codepoints_built": 150012,
25
+ # "codepoints_skipped": 0,
26
+ # "codepoints_failed": 0
27
+ # },
28
+ # "by_tier": {
29
+ # "tier-1": 148512, "pillar-1": 800,
30
+ # "pillar-2": 200, "pillar-3": 1500
31
+ # },
32
+ # "entries": [ { ... UniversalSetEntry ... }, ... ]
33
+ # }
34
+ #
35
+ # `source_config_sha256` pins which Tier 1 source map produced
36
+ # this set. Audits use it to detect drift between the reference
37
+ # set and the config they were validated against.
38
+ #
39
+ # This class is passive — accumulation logic lives in
40
+ # {Ucode::Glyphs::UniversalSet::ManifestAccumulator}; this class
41
+ # only describes the wire shape and handles (de)serialization via
42
+ # lutaml-model.
43
+ class UniversalSetManifest < Lutaml::Model::Serializable
44
+ # Total counts for one build run.
45
+ class Totals < Lutaml::Model::Serializable
46
+ attribute :codepoints_assigned, :integer, default: 0
47
+ attribute :codepoints_built, :integer, default: 0
48
+ attribute :codepoints_skipped, :integer, default: 0
49
+ attribute :codepoints_failed, :integer, default: 0
50
+
51
+ key_value do
52
+ map "codepoints_assigned", to: :codepoints_assigned
53
+ map "codepoints_built", to: :codepoints_built
54
+ map "codepoints_skipped", to: :codepoints_skipped
55
+ map "codepoints_failed", to: :codepoints_failed
56
+ end
57
+ end
58
+
59
+ attribute :unicode_version, :string
60
+ attribute :ucode_version, :string
61
+ attribute :generated_at, :string
62
+ attribute :source_config_sha256, :string
63
+ attribute :totals, Totals
64
+ attribute :by_tier, :hash, default: -> { {} }
65
+ attribute :entries, UniversalSetEntry, collection: true, default: -> { [] }
66
+
67
+ key_value do
68
+ map "unicode_version", to: :unicode_version
69
+ map "ucode_version", to: :ucode_version
70
+ map "generated_at", to: :generated_at
71
+ map "source_config_sha256", to: :source_config_sha256
72
+ map "totals", to: :totals
73
+ map "by_tier", to: :by_tier
74
+ map "entries", to: :entries
75
+ end
76
+ end
77
+ end
78
+ end
@@ -0,0 +1,99 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "lutaml/model"
4
+
5
+ module Ucode
6
+ module Models
7
+ # Post-build validation report (TODO 21 §Validation). Emitted as
8
+ # `output/validation-report.json` by {Ucode::Repo::BuildValidator}
9
+ # after a canonical build run. Records the outcome of the four
10
+ # automated validation checks:
11
+ #
12
+ # 1. `completeness` — every codepoint folder has both
13
+ # `index.json` and `glyph.svg`.
14
+ # 2. `schema` — every `index.json` deserializes via
15
+ # `Ucode::Models::CodePoint.from_hash`.
16
+ # 3. `provenance_sanity` — every deserialized CodePoint carries
17
+ # a non-nil `glyph.source.tier`.
18
+ # 4. `block_coverage` — per-block built count matches the
19
+ # baseline (skipped when no baseline is supplied).
20
+ #
21
+ # The fifth TODO 21 check (sample inspection) is manual and out
22
+ # of scope for the automated validator.
23
+ #
24
+ # Like {BuildReport}, this model is passive: the accumulation
25
+ # logic lives in {Ucode::Repo::BuildValidator}; this class only
26
+ # describes the wire shape and handles (de)serialization.
27
+ class ValidationReport < Lutaml::Model::Serializable
28
+ # Aggregate pass/fail counts for the run.
29
+ class Totals < Lutaml::Model::Serializable
30
+ attribute :codepoints_checked, :integer, default: 0
31
+ attribute :failures, :integer, default: 0
32
+ attribute :checks_run, :integer, default: 0
33
+ attribute :checks_passed, :integer, default: 0
34
+
35
+ key_value do
36
+ map "codepoints_checked", to: :codepoints_checked
37
+ map "failures", to: :failures
38
+ map "checks_run", to: :checks_run
39
+ map "checks_passed", to: :checks_passed
40
+ end
41
+ end
42
+
43
+ # Per-check summary. `status` is one of `passed` / `failed` /
44
+ # `skipped`. `total` is the number of codepoints the check
45
+ # evaluated against (0 for `skipped`). `failures` is the count
46
+ # of recorded failures for this check.
47
+ class CheckSummary < Lutaml::Model::Serializable
48
+ STATUS_PASSED = "passed"
49
+ STATUS_FAILED = "failed"
50
+ STATUS_SKIPPED = "skipped"
51
+
52
+ attribute :name, :string
53
+ attribute :status, :string
54
+ attribute :total, :integer, default: 0
55
+ attribute :failures, :integer, default: 0
56
+
57
+ key_value do
58
+ map "name", to: :name
59
+ map "status", to: :status
60
+ map "total", to: :total
61
+ map "failures", to: :failures
62
+ end
63
+ end
64
+
65
+ # One failure record. `codepoint` is the integer codepoint (or
66
+ # nil for structural failures like block_coverage); `block` is
67
+ # the verbatim block id (folder name); `check` names the check
68
+ # that produced this failure; `message` is a free-form
69
+ # human-readable explanation.
70
+ class Failure < Lutaml::Model::Serializable
71
+ attribute :codepoint, :integer
72
+ attribute :block, :string
73
+ attribute :check, :string
74
+ attribute :message, :string
75
+
76
+ key_value do
77
+ map "codepoint", to: :codepoint
78
+ map "block", to: :block
79
+ map "check", to: :check
80
+ map "message", to: :message
81
+ end
82
+ end
83
+
84
+ attribute :unicode_version, :string
85
+ attribute :generated_at, :string
86
+ attribute :totals, Totals
87
+ attribute :checks, CheckSummary, collection: true, default: -> { [] }
88
+ attribute :failures, Failure, collection: true, default: -> { [] }
89
+
90
+ key_value do
91
+ map "unicode_version", to: :unicode_version
92
+ map "generated_at", to: :generated_at
93
+ map "totals", to: :totals
94
+ map "checks", to: :checks
95
+ map "failures", to: :failures
96
+ end
97
+ end
98
+ end
99
+ end
data/lib/ucode/models.rb CHANGED
@@ -31,6 +31,7 @@ module Ucode
31
31
  autoload :Script, "ucode/models/script"
32
32
  autoload :CodePoint, "ucode/models/codepoint"
33
33
  autoload :UnihanEntry, "ucode/models/unihan_entry"
34
+ autoload :UnihanField, "ucode/models/unihan_field"
34
35
  autoload :NamesListEntry, "ucode/models/names_list_entry"
35
36
  autoload :NameAlias, "ucode/models/name_alias"
36
37
  autoload :NamedSequence, "ucode/models/named_sequence"
@@ -43,5 +44,13 @@ module Ucode
43
44
  autoload :BinaryPropertyAssignment, "ucode/models/binary_property_assignment"
44
45
  autoload :Relationship, "ucode/models/relationship"
45
46
  autoload :Audit, "ucode/models/audit"
47
+ autoload :BuildReport, "ucode/models/build_report"
48
+ autoload :ValidationReport, "ucode/models/validation_report"
49
+ autoload :GlyphSource, "ucode/models/glyph_source"
50
+ autoload :GlyphSourceMap, "ucode/models/glyph_source_map"
51
+ autoload :SpecialistFont, "ucode/models/specialist_font"
52
+ autoload :SpecialistFontManifest, "ucode/models/specialist_font_manifest"
53
+ autoload :UniversalSetEntry, "ucode/models/universal_set_entry"
54
+ autoload :UniversalSetManifest, "ucode/models/universal_set_manifest"
46
55
  end
47
56
  end
@@ -8,10 +8,10 @@ module Ucode
8
8
  # Parses `NamedSequences.txt` — named multi-codepoint sequences.
9
9
  #
10
10
  # Format (UAX #44):
11
- # cp1 cp2 cp3 ...; Name
11
+ # Name; cp1 cp2 cp3 ...
12
12
  #
13
- # The first field is a space-separated list of hex codepoints; the
14
- # second is the human-readable name.
13
+ # The first field is the human-readable name; the second is a
14
+ # space-separated list of hex codepoints.
15
15
  class NamedSequences < Base
16
16
  class << self
17
17
  # Yields one NamedSequence per non-comment line. Returns a lazy
@@ -23,8 +23,8 @@ module Ucode
23
23
  fields = line.fields
24
24
  next if fields.length < 2
25
25
 
26
- sequence_field = fields[0]
27
- name = fields[1]
26
+ name = fields[0]
27
+ sequence_field = fields[1]
28
28
  next if name.nil? || name.empty?
29
29
 
30
30
  yield Models::NamedSequence.new(
@@ -37,12 +37,32 @@ module Ucode
37
37
  Unihan_OtherMappings.txt
38
38
  ].freeze
39
39
 
40
+ # Filename → category symbol. The parser tags every Record
41
+ # with the category derived from its source file, so consumers
42
+ # (Coordinator → UnihanEntry) don't need to know the mapping.
43
+ # Unicode does not reorganize files across versions, so this
44
+ # mapping is stable without per-field hardcoding.
45
+ FILE_TO_CATEGORY = {
46
+ "Unihan_DictionaryIndices.txt" => :dictionary_indices,
47
+ "Unihan_DictionaryLikeData.txt" => :dictionary_like_data,
48
+ "Unihan_IRGSources.txt" => :irg_sources,
49
+ "Unihan_NumericValues.txt" => :numeric_values,
50
+ "Unihan_RadicalStrokeCounts.txt" => :radical_stroke_counts,
51
+ "Unihan_Readings.txt" => :readings,
52
+ "Unihan_Variants.txt" => :variants,
53
+ "Unihan_OtherMappings.txt" => :other_mappings,
54
+ }.freeze
55
+
40
56
  # Stream record: one Unihan line. Internal pipeline data — a Struct
41
57
  # avoids lutaml-model ceremony for transient values. The final
42
58
  # `UnihanEntry` model carries the merged, persisted shape. The
43
59
  # member is `field_values` (not `values`) to avoid overriding
44
60
  # `Struct#values` (the array of all member values).
45
- Record = Struct.new(:cp, :field, :field_values, keyword_init: true) do
61
+ #
62
+ # `category` is the symbol UnihanEntry uses to bucket the field
63
+ # into its category attribute (readings / variants / etc.). Set
64
+ # by `each_in_dir` from the source filename via FILE_TO_CATEGORY.
65
+ Record = Struct.new(:cp, :field, :field_values, :category, keyword_init: true) do
46
66
  def cp_id
47
67
  format("U+%04X", cp)
48
68
  end
@@ -50,25 +70,16 @@ module Ucode
50
70
 
51
71
  class << self
52
72
  # Yields one Record per non-comment line in a single Unihan file.
53
- # Returns a lazy Enumerator when no block is given.
54
- def each_record(path)
55
- return enum_for(:each_record, path) unless block_given?
73
+ # The category is looked up from `filename`, or from the basename of
74
+ # `path` when omitted. Returns a lazy Enumerator when no block is given.
75
+ def each_record(path, filename: nil)
76
+ return enum_for(:each_record, path, filename: filename) unless block_given?
56
77
 
57
78
  path_str = path.to_s
58
- lineno = 0
79
+ category = FILE_TO_CATEGORY.fetch(filename || File.basename(path_str), nil)
59
80
 
60
- File.foreach(path_str) do |raw|
61
- lineno += 1
62
- line = raw.chomp
63
- next if line.empty? || line.start_with?("#")
64
-
65
- begin
66
- yield parse_line(line)
67
- rescue MalformedLineError => e
68
- e.context[:file] ||= path_str
69
- e.context[:line] ||= lineno
70
- raise
71
- end
81
+ each_line_with_lineno(path_str) do |line, lineno|
82
+ yield tagged_record(line, category, path_str, lineno)
72
83
  end
73
84
 
74
85
  nil
@@ -76,7 +87,8 @@ module Ucode
76
87
 
77
88
  # Iterates every known Unihan file in `dir`, yielding one Record
78
89
  # per data line across all files. Missing files are silently
79
- # skipped (incremental runs, partial downloads).
90
+ # skipped (incremental runs, partial downloads). Each Record
91
+ # carries its category so callers don't need to re-derive it.
80
92
  def each_in_dir(dir)
81
93
  return enum_for(:each_in_dir, dir) unless block_given?
82
94
 
@@ -85,7 +97,7 @@ module Ucode
85
97
  path = dir_path.join(filename)
86
98
  next unless path.exist?
87
99
 
88
- each_record(path) { |record| yield record }
100
+ each_record(path, filename: filename) { |record| yield record }
89
101
  end
90
102
 
91
103
  nil
@@ -93,6 +105,25 @@ module Ucode
93
105
 
94
106
  private
95
107
 
108
+ def each_line_with_lineno(path_str)
109
+ lineno = 0
110
+ File.foreach(path_str) do |raw|
111
+ lineno += 1
112
+ line = raw.chomp
113
+ next if line.empty? || line.start_with?("#")
114
+
115
+ yield line, lineno
116
+ end
117
+ end
118
+
119
+ def tagged_record(line, category, path_str, lineno)
120
+ parse_line(line).tap { |r| r.category = category }
121
+ rescue MalformedLineError => e
122
+ e.context[:file] ||= path_str
123
+ e.context[:line] ||= lineno
124
+ raise
125
+ end
126
+
96
127
  # Parses one TAB-separated Unihan data line into a Record. The
97
128
  # `split("\t", 3)` limit preserves any tabs inside the value
98
129
  # (defensive — real Unihan data does not contain them).
@@ -17,7 +17,7 @@ module Ucode
17
17
  # output/blocks/index.json (block index)
18
18
  # output/scripts/<code>.json
19
19
  # output/index/names.json (cp_id → name)
20
- # output/index/labels.json (cp_id → {name, gc, sc})
20
+ # output/index/labels.json (cp_id → {name, gc, sc, cc, bc, mir})
21
21
  # output/index/codepoint_to_block.json (cp_id → block_id)
22
22
  # output/relationships/*.json (per-property tables)
23
23
  # output/enums.json (property aliases + value aliases)
@@ -79,6 +79,7 @@ module Ucode
79
79
  def initialize(output_root)
80
80
  @output_root = Pathname.new(output_root)
81
81
  @block_codepoint_ids = Hash.new { |h, k| h[k] = [] }
82
+ @block_ages = Hash.new { |h, k| h[k] = nil }
82
83
  @script_codepoint_ids = Hash.new { |h, k| h[k] = [] }
83
84
  @names_index = {}
84
85
  @labels_index = {}
@@ -94,6 +95,7 @@ module Ucode
94
95
  return if cp.block_id.nil?
95
96
 
96
97
  @block_codepoint_ids[cp.block_id] << cp.id
98
+ track_block_age(cp)
97
99
  if cp.script_code
98
100
  @script_codepoint_ids[cp.script_code] << cp.id
99
101
  end
@@ -136,10 +138,37 @@ module Ucode
136
138
  # ---- Per-codepoint accumulator helpers ---------------------------
137
139
 
138
140
  def build_label(cp)
139
- label = { "name" => cp.name, "gc" => cp.general_category, "sc" => cp.script_code }
141
+ label = {
142
+ "name" => cp.name,
143
+ "gc" => cp.general_category,
144
+ "sc" => cp.script_code,
145
+ "cc" => cp.combining_class,
146
+ "bc" => cp.bidi&.bidi_class,
147
+ "mir" => cp.bidi&.is_mirrored ? true : nil,
148
+ }
140
149
  label.reject { |_, v| v.nil? }
141
150
  end
142
151
 
152
+ # Per-block `age` is the earliest DerivedAge of any codepoint in
153
+ # the block, compared as a Gem::Version. Stored as the original
154
+ # string (e.g. "1.1", "17.0.0"). nil when no codepoint in the
155
+ # block has an age (rare — only happens for entirely-reserved
156
+ # blocks, which the parser excludes anyway).
157
+ def track_block_age(cp)
158
+ return if cp.age.nil? || cp.age.empty?
159
+
160
+ current = @block_ages[cp.block_id]
161
+ @block_ages[cp.block_id] = if current.nil?
162
+ cp.age
163
+ else
164
+ min_age(current, cp.age)
165
+ end
166
+ end
167
+
168
+ def min_age(a, b)
169
+ Gem::Version.new(a) < Gem::Version.new(b) ? a : b
170
+ end
171
+
143
172
  # ---- Plane files -------------------------------------------------
144
173
 
145
174
  def write_planes(blocks)
@@ -176,6 +205,7 @@ module Ucode
176
205
 
177
206
  def write_blocks(blocks)
178
207
  count = blocks.sum do |block|
208
+ block.age = @block_ages[block.id]
179
209
  path = Paths.block_metadata_path(@output_root, block.id)
180
210
  write_atomic(path, block_payload(block)) ? 1 : 0
181
211
  end
@@ -191,6 +221,7 @@ module Ucode
191
221
  "first_cp" => block.range_first,
192
222
  "last_cp" => block.range_last,
193
223
  "plane_number" => block.plane_number,
224
+ "age" => @block_ages[block.id],
194
225
  }
195
226
  end
196
227
  write_atomic(path, to_pretty_json(summary)) ? 1 : 0
@@ -203,6 +234,7 @@ module Ucode
203
234
  "range_first" => block.range_first,
204
235
  "range_last" => block.range_last,
205
236
  "plane_number" => block.plane_number,
237
+ "age" => @block_ages[block.id],
206
238
  "codepoint_ids" => (@block_codepoint_ids[block.id] || []),
207
239
  )
208
240
  end