ucode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +211 -0
  3. data/Gemfile +22 -0
  4. data/Gemfile.lock +406 -0
  5. data/README.md +469 -0
  6. data/Rakefile +18 -0
  7. data/TODO.new/00-README.md +66 -0
  8. data/TODO.new/01-pillar-terminology-alignment.md +69 -0
  9. data/TODO.new/02-audit-schema-design.md +255 -0
  10. data/TODO.new/03-directory-output-spec.md +203 -0
  11. data/TODO.new/04-fontist-org-contract.md +173 -0
  12. data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
  13. data/TODO.new/06-audit-namespace-skeleton.md +105 -0
  14. data/TODO.new/07-audit-models-port.md +132 -0
  15. data/TODO.new/08-extractors-cheap-port.md +113 -0
  16. data/TODO.new/09-extractors-expensive-port.md +99 -0
  17. data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
  18. data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
  19. data/TODO.new/12-formatters-port.md +115 -0
  20. data/TODO.new/13-directory-emitter.md +147 -0
  21. data/TODO.new/14-html-face-browser.md +144 -0
  22. data/TODO.new/15-html-library-browser.md +102 -0
  23. data/TODO.new/16-cli-audit-subcommands.md +142 -0
  24. data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
  25. data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
  26. data/TODO.new/19-fontisan-docs-update.md +155 -0
  27. data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
  28. data/TODO.new/21-canonical-unicode17-build.md +148 -0
  29. data/TODO.new/22-implementation-order.md +176 -0
  30. data/UCODE_CHANGELOG.md +97 -0
  31. data/exe/ucode +8 -0
  32. data/lib/ucode/aggregator.rb +77 -0
  33. data/lib/ucode/audit/block_aggregator.rb +90 -0
  34. data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
  35. data/lib/ucode/audit/context.rb +137 -0
  36. data/lib/ucode/audit/discrepancy_detector.rb +213 -0
  37. data/lib/ucode/audit/extractors/aggregations.rb +70 -0
  38. data/lib/ucode/audit/extractors/base.rb +21 -0
  39. data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
  40. data/lib/ucode/audit/extractors/coverage.rb +55 -0
  41. data/lib/ucode/audit/extractors/hinting.rb +199 -0
  42. data/lib/ucode/audit/extractors/identity.rb +65 -0
  43. data/lib/ucode/audit/extractors/licensing.rb +75 -0
  44. data/lib/ucode/audit/extractors/metrics.rb +108 -0
  45. data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
  46. data/lib/ucode/audit/extractors/provenance.rb +34 -0
  47. data/lib/ucode/audit/extractors/style.rb +88 -0
  48. data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
  49. data/lib/ucode/audit/extractors.rb +31 -0
  50. data/lib/ucode/audit/plane_aggregator.rb +37 -0
  51. data/lib/ucode/audit/registry.rb +63 -0
  52. data/lib/ucode/audit/script_aggregator.rb +92 -0
  53. data/lib/ucode/audit.rb +27 -0
  54. data/lib/ucode/cache.rb +113 -0
  55. data/lib/ucode/cli.rb +272 -0
  56. data/lib/ucode/commands/build.rb +68 -0
  57. data/lib/ucode/commands/cache.rb +46 -0
  58. data/lib/ucode/commands/fetch.rb +62 -0
  59. data/lib/ucode/commands/font_coverage.rb +57 -0
  60. data/lib/ucode/commands/glyphs.rb +136 -0
  61. data/lib/ucode/commands/lookup.rb +65 -0
  62. data/lib/ucode/commands/parse.rb +62 -0
  63. data/lib/ucode/commands/site.rb +33 -0
  64. data/lib/ucode/commands.rb +19 -0
  65. data/lib/ucode/config.rb +110 -0
  66. data/lib/ucode/coordinator/indices.rb +34 -0
  67. data/lib/ucode/coordinator.rb +397 -0
  68. data/lib/ucode/database.rb +214 -0
  69. data/lib/ucode/db_builder.rb +107 -0
  70. data/lib/ucode/error.rb +96 -0
  71. data/lib/ucode/fetch/code_charts.rb +57 -0
  72. data/lib/ucode/fetch/http.rb +83 -0
  73. data/lib/ucode/fetch/ucd_zip.rb +57 -0
  74. data/lib/ucode/fetch/unihan_zip.rb +57 -0
  75. data/lib/ucode/fetch.rb +14 -0
  76. data/lib/ucode/glyphs/cell_extractor.rb +130 -0
  77. data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
  78. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
  79. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
  80. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
  81. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
  82. data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
  83. data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
  84. data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
  85. data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
  86. data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
  87. data/lib/ucode/glyphs/grid.rb +30 -0
  88. data/lib/ucode/glyphs/grid_detector.rb +165 -0
  89. data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
  90. data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
  91. data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
  92. data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
  93. data/lib/ucode/glyphs/last_resort/source.rb +125 -0
  94. data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
  95. data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
  96. data/lib/ucode/glyphs/last_resort.rb +36 -0
  97. data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
  98. data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
  99. data/lib/ucode/glyphs/page_renderer.rb +221 -0
  100. data/lib/ucode/glyphs/path_bbox.rb +62 -0
  101. data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
  102. data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
  103. data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
  104. data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
  105. data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
  106. data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
  107. data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
  108. data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
  109. data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
  110. data/lib/ucode/glyphs/real_fonts.rb +32 -0
  111. data/lib/ucode/glyphs/writer.rb +250 -0
  112. data/lib/ucode/glyphs.rb +27 -0
  113. data/lib/ucode/index.rb +106 -0
  114. data/lib/ucode/index_builder.rb +94 -0
  115. data/lib/ucode/models/audit/audit_axis.rb +30 -0
  116. data/lib/ucode/models/audit/audit_diff.rb +77 -0
  117. data/lib/ucode/models/audit/audit_report.rb +137 -0
  118. data/lib/ucode/models/audit/baseline.rb +32 -0
  119. data/lib/ucode/models/audit/block_summary.rb +72 -0
  120. data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
  121. data/lib/ucode/models/audit/codepoint_range.rb +39 -0
  122. data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
  123. data/lib/ucode/models/audit/color_capabilities.rb +91 -0
  124. data/lib/ucode/models/audit/discrepancy.rb +38 -0
  125. data/lib/ucode/models/audit/duplicate_group.rb +23 -0
  126. data/lib/ucode/models/audit/embedding_type.rb +81 -0
  127. data/lib/ucode/models/audit/field_change.rb +28 -0
  128. data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
  129. data/lib/ucode/models/audit/gasp_range.rb +63 -0
  130. data/lib/ucode/models/audit/hinting.rb +99 -0
  131. data/lib/ucode/models/audit/library_summary.rb +40 -0
  132. data/lib/ucode/models/audit/licensing.rb +48 -0
  133. data/lib/ucode/models/audit/metrics.rb +111 -0
  134. data/lib/ucode/models/audit/named_instance.rb +41 -0
  135. data/lib/ucode/models/audit/opentype_layout.rb +38 -0
  136. data/lib/ucode/models/audit/plane_summary.rb +31 -0
  137. data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
  138. data/lib/ucode/models/audit/script_features.rb +28 -0
  139. data/lib/ucode/models/audit/script_summary.rb +54 -0
  140. data/lib/ucode/models/audit/variation_detail.rb +42 -0
  141. data/lib/ucode/models/audit.rb +50 -0
  142. data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
  143. data/lib/ucode/models/bidi_mirroring.rb +19 -0
  144. data/lib/ucode/models/binary_property_assignment.rb +26 -0
  145. data/lib/ucode/models/block.rb +36 -0
  146. data/lib/ucode/models/case_folding_rule.rb +23 -0
  147. data/lib/ucode/models/cjk_radical.rb +23 -0
  148. data/lib/ucode/models/codepoint/bidi.rb +28 -0
  149. data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
  150. data/lib/ucode/models/codepoint/case_folding.rb +25 -0
  151. data/lib/ucode/models/codepoint/casing.rb +32 -0
  152. data/lib/ucode/models/codepoint/decomposition.rb +27 -0
  153. data/lib/ucode/models/codepoint/display.rb +24 -0
  154. data/lib/ucode/models/codepoint/emoji.rb +29 -0
  155. data/lib/ucode/models/codepoint/hangul.rb +20 -0
  156. data/lib/ucode/models/codepoint/identifier.rb +30 -0
  157. data/lib/ucode/models/codepoint/indic.rb +20 -0
  158. data/lib/ucode/models/codepoint/joining.rb +20 -0
  159. data/lib/ucode/models/codepoint/normalization.rb +35 -0
  160. data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
  161. data/lib/ucode/models/codepoint.rb +122 -0
  162. data/lib/ucode/models/name_alias.rb +21 -0
  163. data/lib/ucode/models/named_sequence.rb +19 -0
  164. data/lib/ucode/models/names_list_entry.rb +38 -0
  165. data/lib/ucode/models/plane.rb +36 -0
  166. data/lib/ucode/models/property_alias.rb +24 -0
  167. data/lib/ucode/models/property_value_alias.rb +26 -0
  168. data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
  169. data/lib/ucode/models/relationship/cross_reference.rb +17 -0
  170. data/lib/ucode/models/relationship/footnote.rb +24 -0
  171. data/lib/ucode/models/relationship/informal_alias.rb +18 -0
  172. data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
  173. data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
  174. data/lib/ucode/models/relationship.rb +57 -0
  175. data/lib/ucode/models/script.rb +41 -0
  176. data/lib/ucode/models/special_casing_rule.rb +28 -0
  177. data/lib/ucode/models/standardized_variant.rb +24 -0
  178. data/lib/ucode/models/unihan_entry.rb +23 -0
  179. data/lib/ucode/models.rb +47 -0
  180. data/lib/ucode/parsers/auxiliary.rb +26 -0
  181. data/lib/ucode/parsers/base.rb +137 -0
  182. data/lib/ucode/parsers/bidi_brackets.rb +41 -0
  183. data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
  184. data/lib/ucode/parsers/blocks.rb +63 -0
  185. data/lib/ucode/parsers/case_folding.rb +53 -0
  186. data/lib/ucode/parsers/cjk_radicals.rb +102 -0
  187. data/lib/ucode/parsers/derived_age.rb +59 -0
  188. data/lib/ucode/parsers/derived_core_properties.rb +60 -0
  189. data/lib/ucode/parsers/extracted_properties.rb +74 -0
  190. data/lib/ucode/parsers/name_aliases.rb +44 -0
  191. data/lib/ucode/parsers/named_sequences.rb +51 -0
  192. data/lib/ucode/parsers/names_list.rb +250 -0
  193. data/lib/ucode/parsers/property_aliases.rb +41 -0
  194. data/lib/ucode/parsers/property_value_aliases.rb +46 -0
  195. data/lib/ucode/parsers/script_extensions.rb +64 -0
  196. data/lib/ucode/parsers/scripts.rb +60 -0
  197. data/lib/ucode/parsers/special_casing.rb +62 -0
  198. data/lib/ucode/parsers/standardized_variants.rb +56 -0
  199. data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
  200. data/lib/ucode/parsers/unicode_data.rb +268 -0
  201. data/lib/ucode/parsers/unihan.rb +125 -0
  202. data/lib/ucode/parsers.rb +35 -0
  203. data/lib/ucode/range_entry.rb +58 -0
  204. data/lib/ucode/repo/aggregate_writer.rb +364 -0
  205. data/lib/ucode/repo/atomic_writes.rb +48 -0
  206. data/lib/ucode/repo/codepoint_writer.rb +96 -0
  207. data/lib/ucode/repo/paths.rb +122 -0
  208. data/lib/ucode/repo.rb +22 -0
  209. data/lib/ucode/site/config_emitter.rb +124 -0
  210. data/lib/ucode/site/generator.rb +178 -0
  211. data/lib/ucode/site/search_index.rb +68 -0
  212. data/lib/ucode/site/template/.gitignore +4 -0
  213. data/lib/ucode/site/template/.vitepress/config.ts +8 -0
  214. data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
  215. data/lib/ucode/site/template/char/[codepoint].md +13 -0
  216. data/lib/ucode/site/template/components/BlockView.vue +57 -0
  217. data/lib/ucode/site/template/components/CharView.vue +85 -0
  218. data/lib/ucode/site/template/components/PlaneView.vue +56 -0
  219. data/lib/ucode/site/template/components/SearchView.vue +66 -0
  220. data/lib/ucode/site/template/index.md +25 -0
  221. data/lib/ucode/site/template/package.json +18 -0
  222. data/lib/ucode/site/template/search.md +9 -0
  223. data/lib/ucode/site.rb +13 -0
  224. data/lib/ucode/version.rb +5 -0
  225. data/lib/ucode/version_resolver.rb +76 -0
  226. data/lib/ucode.rb +74 -0
  227. data/ucode.gemspec +56 -0
  228. metadata +404 -0
@@ -0,0 +1,364 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+ require "json"
5
+ require "time"
6
+
7
+ require "ucode/models"
8
+ require "ucode/repo/atomic_writes"
9
+ require "ucode/repo/paths"
10
+
11
+ module Ucode
12
+ module Repo
13
+ # Writes every aggregate JSON file under `output/`:
14
+ #
15
+ # output/planes/<n>.json
16
+ # output/blocks/<ID>/index.json
17
+ # output/blocks/index.json (block index)
18
+ # output/scripts/<code>.json
19
+ # output/index/names.json (cp_id → name)
20
+ # output/index/labels.json (cp_id → {name, gc, sc})
21
+ # output/index/codepoint_to_block.json (cp_id → block_id)
22
+ # output/relationships/*.json (per-property tables)
23
+ # output/enums.json (property aliases + value aliases)
24
+ # output/named_sequences/<slug>.json
25
+ # output/manifest.json
26
+ #
27
+ # **Single pass**: callers feed one CodePoint at a time via `#add`;
28
+ # `#flush` writes all derived files using the Coordinator's indices
29
+ # for the static tables (relationships, enums, named sequences).
30
+ #
31
+ # **MECE**:
32
+ # - paths: `Repo::Paths`
33
+ # - atomic writes: `Repo::AtomicWrites`
34
+ # - stream aggregation: this class
35
+ # - serialization: lutaml-model `to_yaml_hash` / `to_json`
36
+ class AggregateWriter
37
+ include AtomicWrites
38
+
39
+ # Static metadata for the 17 Unicode planes, keyed by plane number; each
40
+ # value is [name, abbrev]. Planes 4–13 are unassigned in Unicode 17 and use placeholder names.
41
+ PLANE_TABLE = {
42
+ 0 => ["Basic Multilingual Plane", "BMP"],
43
+ 1 => ["Supplementary Multilingual Plane", "SMP"],
44
+ 2 => ["Supplementary Ideographic Plane", "SIP"],
45
+ 3 => ["Tertiary Ideographic Plane", "TIP"],
46
+ 4 => ["Unassigned Plane 4", "—"],
47
+ 5 => ["Unassigned Plane 5", "—"],
48
+ 6 => ["Unassigned Plane 6", "—"],
49
+ 7 => ["Unassigned Plane 7", "—"],
50
+ 8 => ["Unassigned Plane 8", "—"],
51
+ 9 => ["Unassigned Plane 9", "—"],
52
+ 10 => ["Unassigned Plane 10", "—"],
53
+ 11 => ["Unassigned Plane 11", "—"],
54
+ 12 => ["Unassigned Plane 12", "—"],
55
+ 13 => ["Unassigned Plane 13", "—"],
56
+ 14 => ["Supplementary Special-purpose Plane", "SSP"],
57
+ 15 => ["Supplementary Private Use Area-A", "SPUA-A"],
58
+ 16 => ["Supplementary Private Use Area-B", "SPUA-B"],
59
+ }.freeze
60
+ private_constant :PLANE_TABLE
61
+
62
+ # Coordinator::Indices fields paired with the file slug used
63
+ # under `output/relationships/`. Each field is a Hash keyed by an
64
+ # Integer codepoint or a String id, mapping to a Record or Array<Record>.
65
+ RELATIONSHIP_SOURCES = {
66
+ special_casing: "special_casing",
67
+ case_folding: "case_folding",
68
+ bidi_mirroring: "bidi_mirroring",
69
+ bidi_brackets: "bidi_brackets",
70
+ cjk_radicals: "cjk_radicals",
71
+ standardized_variants: "standardized_variants",
72
+ name_aliases: "name_aliases",
73
+ }.freeze
74
+ private_constant :RELATIONSHIP_SOURCES
75
+
76
+ attr_reader :codepoint_count
77
+
78
# Start with empty accumulators; nothing is written until `#flush`.
# @param output_root [String, Pathname] root directory of the output tree
def initialize(output_root)
  @output_root = Pathname.new(output_root)
  @codepoint_count = 0

  # Flat lookup tables keyed by codepoint id.
  @names_index = {}
  @labels_index = {}
  @cp_to_block = {}

  # Grouped id lists; each key gets its own array on first access.
  @block_codepoint_ids = Hash.new { |hash, key| hash[key] = [] }
  @script_codepoint_ids = Hash.new { |hash, key| hash[key] = [] }
end
88
+
89
# Record one CodePoint in the block, script, name, label and
# codepoint-to-block accumulators. A codepoint without a block_id is
# ignored and not counted, because it has no home in the output tree.
# @param cp [Ucode::Models::CodePoint]
# @return [void]
def add(cp)
  block_id = cp.block_id
  return if block_id.nil?

  id = cp.id
  @block_codepoint_ids[block_id] << id

  script = cp.script_code
  @script_codepoint_ids[script] << id if script

  name = cp.name
  @names_index[id] = name if name && !name.empty?

  @labels_index[id] = build_label(cp)
  @cp_to_block[id] = block_id
  @codepoint_count += 1
end
107
+
108
# Emit every aggregate file and report how many were (re)written.
#
# The alias tables and named sequences are optional because they are not
# part of `Coordinator::Indices`; callers pass them through from the
# CLI/parsers when available.
#
# @param ucd_version [String]
# @param indices [Ucode::Coordinator::Indices]
# @param property_aliases [Array<Ucode::Models::PropertyAlias>]
# @param property_value_aliases [Array<Ucode::Models::PropertyValueAlias>]
# @param named_sequences [Array<Ucode::Models::NamedSequence>]
# @param glyph_count [Integer]
# @return [Integer] number of files written
def flush(ucd_version:, indices:, property_aliases: [],
          property_value_aliases: [], named_sequences: [], glyph_count: 0)
  blocks = indices.blocks
  # Array elements are evaluated left to right, so the write order is
  # planes, blocks, scripts, indexes, relationships, enums, named
  # sequences, manifest.
  [
    write_planes(blocks),
    write_blocks(blocks),
    write_scripts(indices.scripts),
    write_indexes,
    write_relationships(indices),
    write_enums(property_aliases, property_value_aliases),
    write_named_sequences(named_sequences),
    write_manifest(ucd_version: ucd_version, glyph_count: glyph_count),
  ].sum
end
133
+
134
+ private
135
+
136
+ # ---- Per-codepoint accumulator helpers ---------------------------
137
+
138
# Compact label for one codepoint: name, general category (gc) and
# script code (sc), with nil-valued entries dropped.
# @param cp [Ucode::Models::CodePoint]
# @return [Hash{String => Object}]
def build_label(cp)
  {
    "name" => cp.name,
    "gc" => cp.general_category,
    "sc" => cp.script_code,
  }.compact
end
142
+
143
+ # ---- Plane files -------------------------------------------------
144
+
145
# Write one metadata file for each of the 17 planes (0..16), including
# planes that contain no blocks.
# @param blocks [Enumerable] block records responding to `plane_number` and `id`
# @return [Integer] number of plane files actually written
def write_planes(blocks)
  ids_by_plane = group_block_ids_by_plane(blocks)
  0.upto(16).count do |plane_number|
    target = Paths.plane_metadata_path(@output_root, plane_number)
    write_atomic(target, plane_payload(plane_number, ids_by_plane[plane_number] || []))
  end
end
154
+
155
# Bucket block ids by the plane each block belongs to, preserving input order.
# @param blocks [Enumerable] records responding to `plane_number` and `id`
# @return [Hash{Integer => Array}] plane number => block ids; unknown
#   planes yield a fresh empty array on lookup
def group_block_ids_by_plane(blocks)
  grouped = Hash.new { |hash, plane| hash[plane] = [] }
  blocks.each { |block| grouped[block.plane_number] << block.id }
  grouped
end
160
+
161
# Build the JSON document for one plane.
# @param plane_number [Integer] must be a key of PLANE_TABLE
# @param block_ids [Array] ids of the blocks located in this plane
# @return [String] pretty-printed JSON
# @raise [KeyError] when the plane number is not in PLANE_TABLE
def plane_payload(plane_number, block_ids)
  name, abbrev = PLANE_TABLE.fetch(plane_number)
  # Each plane spans 0x10000 consecutive codepoints.
  first_cp = plane_number * 0x10000
  payload = {
    "number" => plane_number,
    "name" => name,
    "abbrev" => abbrev,
    "range_first" => first_cp,
    "range_last" => first_cp + 0xFFFF,
    "block_ids" => block_ids,
  }
  to_pretty_json(payload)
end
174
+
175
+ # ---- Block files -------------------------------------------------
176
+
177
# Write one metadata file per block, then the blocks index.
# @param blocks [Enumerable] block records
# @return [Integer] number of files written, including the index
def write_blocks(blocks)
  per_block = blocks.count do |block|
    target = Paths.block_metadata_path(@output_root, block.id)
    write_atomic(target, block_payload(block))
  end
  per_block + write_blocks_index(blocks)
end
184
+
185
# Write the blocks index: one summary entry per block, in input order.
# @param blocks [Enumerable] block records
# @return [Integer] 1 if the index file was written, 0 if it was skipped
def write_blocks_index(blocks)
  entries = blocks.map do |block|
    {
      "id" => block.id,
      "name" => block.name,
      "first_cp" => block.range_first,
      "last_cp" => block.range_last,
      "plane_number" => block.plane_number,
    }
  end
  index_file = Paths.blocks_index_path(@output_root)
  written = write_atomic(index_file, to_pretty_json(entries))
  written ? 1 : 0
end
198
+
199
# Build the JSON document for one block, including the ids of every
# codepoint that `#add` recorded for it.
#
# `@block_codepoint_ids` has a default proc that inserts a new array on
# lookup, so it is read with `fetch` to keep serialization from growing
# the accumulator with an empty entry for every empty block.
# @param block [Object] block record (id, name, range_first, range_last, plane_number)
# @return [String] pretty-printed JSON
def block_payload(block)
  to_pretty_json(
    "id" => block.id,
    "name" => block.name,
    "range_first" => block.range_first,
    "range_last" => block.range_last,
    "plane_number" => block.plane_number,
    "codepoint_ids" => @block_codepoint_ids.fetch(block.id, []),
  )
end
209
+
210
+ # ---- Script files ------------------------------------------------
211
+
212
# Write one metadata file per distinct script code. Entries whose code is
# nil or empty are skipped.
# @param scripts [Enumerable] script range records responding to `code`
# @return [Integer] number of script files written
def write_scripts(scripts)
  ranges_by_code = scripts.group_by(&:code)
  ranges_by_code.count do |code, ranges|
    next false if code.nil? || code.empty?

    target = Paths.script_metadata_path(@output_root, code)
    write_atomic(target, script_payload(code, ranges))
  end
end
222
+
223
# Build the JSON document for one script from all of its range records.
# The name comes from the first record; the overall range is the minimum
# `range_first` and maximum `range_last` across the records.
#
# `@script_codepoint_ids` has a default proc that inserts a new array on
# lookup, so it is read with `fetch` to avoid adding empty entries to
# the accumulator while serializing.
# @param code [String] script code
# @param ranges [Array] range records (name, range_first, range_last)
# @return [String] pretty-printed JSON
def script_payload(code, ranges)
  to_pretty_json(
    "code" => code,
    "name" => ranges.first&.name,
    "range_first" => ranges.map(&:range_first).min,
    "range_last" => ranges.map(&:range_last).max,
    "codepoint_ids" => @script_codepoint_ids.fetch(code, []),
  )
end
232
+
233
+ # ---- Lookup indexes ---------------------------------------------
234
+
235
# Write the three lookup indexes (names, labels, codepoint-to-block).
# @return [Integer] number of index files written
def write_indexes
  targets = [
    [Paths.names_index_path(@output_root), @names_index],
    [Paths.labels_index_path(@output_root), @labels_index],
    [codepoint_to_block_path, @cp_to_block],
  ]
  targets.count { |index_file, table| write_atomic(index_file, to_pretty_json(table)) }
end
242
+
243
# Location of the codepoint-to-block lookup index under the output root.
# @return [Pathname]
def codepoint_to_block_path
  index_dir = Pathname(@output_root).join("index")
  index_dir.join("codepoint_to_block.json")
end
246
+
247
+ # ---- Relationships ----------------------------------------------
248
+
249
# Write one relationship file per entry of RELATIONSHIP_SOURCES, reading
# each table from the matching accessor on `indices`.
# @param indices [Ucode::Coordinator::Indices]
# @return [Integer] number of relationship files written
def write_relationships(indices)
  RELATIONSHIP_SOURCES.sum do |accessor, slug|
    write_relationship_file(slug, indices.public_send(accessor))
  end
end
255
+
256
# Write `relationships/<slug>.json` for one table; nil or empty tables
# produce no file.
# @param slug [String] file name without extension
# @param records [Hash, nil]
# @return [Integer] 1 if the file was written, otherwise 0
def write_relationship_file(slug, records)
  nothing_to_write = records.nil? || records.empty?
  return 0 if nothing_to_write

  target = Pathname(@output_root).join("relationships", "#{slug}.json")
  written = write_atomic(target, relationship_payload(records))
  written ? 1 : 0
end
262
+
263
# Serialize one relationship table to pretty JSON.
#
# Keys may be Integer codepoints or String ids; values may be a single
# record or an Array of records. The output maps each normalized key to
# the record's `to_yaml_hash` (or an array of them).
# @param records [Hash]
# @return [String]
def relationship_payload(records)
  pairs = records.map { |key, value| [key_to_cp_id(key), serialize_value(value)] }
  to_pretty_json(pairs.to_h)
end
273
+
274
# Normalize an index key for JSON output. Integer codepoints become
# "U+XXXX" ids; any other key (already a string id, e.g. for
# cjk_radicals or standardized_variants) is returned unchanged.
# @param key [Integer, String]
# @return [String]
def key_to_cp_id(key)
  return key unless key.is_a?(Integer)

  Paths.cp_id(key)
end
281
+
282
# Serialize a table value that is either one record or an Array of records.
# @param value [Object, Array]
# @return [Object, Array] serialized record, or an array of them
def serialize_value(value)
  if value.is_a?(Array)
    value.map { |record| serialize_one(record) }
  else
    serialize_one(value)
  end
end
287
+
288
# Convert a single record to its plain-hash form via `to_yaml_hash`.
# @param record [#to_yaml_hash]
# @return [Object] whatever the record's `to_yaml_hash` returns
def serialize_one(record)
  plain = record.to_yaml_hash
  plain
end
291
+
292
+ # ---- Enums -------------------------------------------------------
293
+
294
# Write `enums.json`, holding the property alias and property value
# alias tables.
#
# Both arguments are coerced with `Array()`, so a nil table is written as
# an empty list instead of raising. This matches the nil tolerance of
# `write_named_sequences`, which receives its data from the same caller.
# @param property_aliases [Array<#to_yaml_hash>, nil]
# @param property_value_aliases [Array<#to_yaml_hash>, nil]
# @return [Integer] 1 if the file was written, 0 if it was skipped
def write_enums(property_aliases, property_value_aliases)
  payload = {
    "properties" => Array(property_aliases).map(&:to_yaml_hash),
    "property_values" => Array(property_value_aliases).map(&:to_yaml_hash),
  }
  path = Pathname(@output_root).join("enums.json")
  write_atomic(path, to_pretty_json(payload)) ? 1 : 0
end
302
+
303
+ # ---- Named sequences --------------------------------------------
304
+
305
# Write one `named_sequences/<slug>.json` file per named sequence.
# @param named_sequences [Array<Ucode::Models::NamedSequence>, nil]
# @return [Integer] number of files written; 0 for nil or empty input
def write_named_sequences(named_sequences)
  return 0 if named_sequences.nil? || named_sequences.empty?

  directory = Pathname(@output_root).join("named_sequences")
  named_sequences.count do |sequence|
    target = directory.join("#{slug_for(sequence)}.json")
    write_atomic(target, sequence.to_json(pretty: true))
  end
end
314
+
315
# File-name slug for a named sequence: the name is lowercased, every run
# of characters outside a-z / 0-9 collapses to one "_", and leading or
# trailing underscores are removed.
# @param named_sequence [#name]
# @return [String]
def slug_for(named_sequence)
  lowered = named_sequence.name.downcase
  underscored = lowered.gsub(/[^a-z0-9]+/, "_")
  underscored.gsub(/^_+|_+$/, "")
end
322
+
323
+ # ---- Manifest ---------------------------------------------------
324
+
325
+ # Fields that define the manifest's semantic content (`generated_at` is
326
+ # deliberately not one of them). When these match the existing manifest
327
+ # on disk, the old `generated_at` is reused so that re-runs are
328
+ # byte-idempotent (no rewrite unless something actually changed).
329
+ MANIFEST_CONTENT_KEYS = %w[
330
+ ucd_version codepoint_count glyph_count schema_version
331
+ ].freeze
332
+ private_constant :MANIFEST_CONTENT_KEYS
333
+
334
# Write `manifest.json`: the content fields plus a `generated_at`
# timestamp chosen by `preserved_or_new_timestamp`.
# @param ucd_version [String]
# @param glyph_count [Integer]
# @return [Integer] 1 if the manifest was written, 0 if it was skipped
def write_manifest(ucd_version:, glyph_count:)
  manifest_file = Paths.manifest_path(@output_root)
  fields = {
    "ucd_version" => ucd_version,
    "codepoint_count" => @codepoint_count,
    "glyph_count" => glyph_count,
    "schema_version" => "1",
  }
  stamped = fields.merge("generated_at" => preserved_or_new_timestamp(manifest_file, fields))
  written = write_atomic(manifest_file, to_pretty_json(stamped))
  written ? 1 : 0
end
346
+
347
# Choose the `generated_at` value for the manifest.
#
# The timestamp already on disk is reused only when the existing
# manifest is a JSON object, every MANIFEST_CONTENT_KEYS field equals the
# new content, and it carries a non-empty `generated_at` string. In
# every other case (no manifest, a manifest that is not an object,
# changed content, or a missing timestamp) a fresh UTC ISO-8601
# timestamp is returned, so the manifest never gets a null timestamp.
# @param path [Pathname] manifest location
# @param content [Hash{String => Object}] new manifest content fields
# @return [String]
def preserved_or_new_timestamp(path, content)
  existing = read_manifest(path)
  if existing.is_a?(Hash) && MANIFEST_CONTENT_KEYS.all? { |key| existing[key] == content[key] }
    previous = existing["generated_at"]
    return previous if previous.is_a?(String) && !previous.empty?
  end

  Time.now.utc.iso8601
end
354
+
355
# Load the existing manifest, if there is a usable one.
#
# Returns nil when the file is absent, disappears between the existence
# check and the read, is not valid JSON, or parses to something other
# than a JSON object. Callers index the result by String key, which is
# only meaningful for a Hash.
# @param path [Pathname]
# @return [Hash, nil]
def read_manifest(path)
  return nil unless path.exist?

  parsed = JSON.parse(path.read)
  parsed.is_a?(Hash) ? parsed : nil
rescue JSON::ParserError, Errno::ENOENT
  nil
end
362
+ end
363
+ end
364
+ end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+ require "json"
5
+
6
+ require "ucode/repo/paths"
7
+
8
+ module Ucode
9
+ module Repo
10
+ # Atomic, idempotent file-write helpers shared by CodepointWriter
11
+ # and AggregateWriter.
12
+ #
13
+ # - Atomic: write to a sibling `.tmp` file, then rename. A crash
14
+ # mid-write leaves either the old file or no file, never a
15
+ # truncated one.
16
+ # - Idempotent: byte-compare the existing file before writing;
17
+ # identical content is a no-op. Safe to re-run on the full
18
+ # dataset.
19
module AtomicWrites
  # Write `payload` to `path` via a sibling temp file and a rename,
  # skipping the write when the file already holds exactly these bytes.
  #
  # The payload is written in binary mode so the bytes on disk are
  # exactly the bytes given, independent of locale or platform newline
  # translation. If writing or renaming fails, the temp file is removed
  # and the error is re-raised.
  # @param path [Pathname]
  # @param payload [String] the exact bytes to write
  # @return [Boolean] true if the file was written, false if skipped
  def write_atomic(path, payload)
    return false if same_content?(path, payload)

    path.dirname.mkpath
    tmp = Paths.tmp_path(path)
    begin
      tmp.binwrite(payload)
      tmp.rename(path.to_s)
    rescue StandardError
      # Do not leave a stale temp file next to the target.
      tmp.delete if tmp.exist?
      raise
    end
    true
  end

  # Byte-for-byte comparison of the file at `path` with `payload`.
  #
  # Both sides are compared as binary strings. A text-mode read tags the
  # data with the default external encoding, and String#== is false for
  # non-ASCII strings whose encodings are incompatible, which would make
  # identical files look different under a non-UTF-8 locale.
  # @param path [Pathname]
  # @param payload [String]
  # @return [Boolean]
  def same_content?(path, payload)
    return false unless path.exist?
    # A size mismatch settles it without reading the file.
    return false unless path.size == payload.bytesize

    path.binread == payload.b
  end

  # Pretty JSON for any Hash/Array value.
  # @param value [Hash, Array]
  # @return [String]
  def to_pretty_json(value)
    JSON.pretty_generate(value)
  end
end
47
+ end
48
+ end
@@ -0,0 +1,96 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+ require "thread"
5
+
6
+ require "ucode/repo/atomic_writes"
7
+ require "ucode/repo/paths"
8
+
9
+ module Ucode
10
+ module Repo
11
+ # Writes one `index.json` per codepoint under `output/blocks/<id>/<cp>/`.
12
+ #
13
+ # Streaming + threaded + idempotent:
14
+ #
15
+ # - **Streaming**: callers pass an Enumerator; the writer pulls one
16
+ # codepoint at a time, never the full 160k set in memory.
17
+ # - **Threaded**: a fixed-size worker pool drains a shared queue.
18
+ # Each codepoint maps to a unique path → no per-file contention.
19
+ # - **Idempotent**: existing files are byte-compared to the new
20
+ # payload before writing; identical content is a no-op. Safe to
21
+ # re-run on the full dataset.
22
+ # - **Atomic**: writes go to `<path>.tmp`, then rename. A crash
23
+ # mid-write leaves either the old file or no file, never a
24
+ # truncated one.
25
+ class CodepointWriter
26
+ include AtomicWrites
27
+
28
# @param output_root [String, Pathname] root directory of the output tree
# @param parallel_workers [Integer] number of worker threads used by
#   `#write_each`; 1 or less means everything runs synchronously
#   (useful in tests)
def initialize(output_root, parallel_workers: 8)
  @parallel_workers = parallel_workers
  @output_root = Pathname.new(output_root)
end
35
+
36
# Write a single codepoint's `index.json` synchronously.
# @param codepoint [Ucode::Models::CodePoint]
# @return [Pathname, nil] the path written; nil when the codepoint has
#   no block_id or the file already holds identical content
def write(codepoint)
  block_id = codepoint.block_id
  return nil if block_id.nil?

  target = Paths.codepoint_json_path(@output_root, block_id, codepoint.id)
  write_atomic(target, serialize(codepoint)) ? target : nil
end
49
+
50
# Write every codepoint yielded by `enum`, using the worker pool when
# more than one worker is configured and running inline otherwise.
# @param enum [Enumerator<Ucode::Models::CodePoint>, Enumerable]
# @return [Integer] total codepoints seen, whether or not each one was
#   actually written
def write_each(enum)
  @parallel_workers <= 1 ? drain_inline(enum) : drain_threaded(enum)
end
59
+
60
+ private
61
+
62
# Write each codepoint on the calling thread, in order.
# @param enum [Enumerable]
# @return [Integer] number of codepoints seen
def drain_inline(enum)
  seen = 0
  enum.each do |codepoint|
    write(codepoint)
    seen += 1
  end
  seen
end
67
+
68
# Write every codepoint from `enum` using a pool of `@parallel_workers`
# threads fed through a bounded queue.
#
# - The queue is a SizedQueue, so the producer blocks once a few items
#   per worker are pending instead of buffering the whole stream.
# - Shutdown closes the queue; `pop` then returns nil once the queue is
#   drained, which ends each worker loop. nil items in `enum` are
#   skipped so they cannot be mistaken for that end-of-stream signal.
# - The queue is closed and workers are joined in an `ensure`, so an
#   exception raised while iterating `enum` does not leave worker
#   threads blocked forever.
# - The first StandardError raised by `write` is recorded, stops further
#   feeding, and is re-raised on the calling thread after the workers
#   finish.
# @param enum [Enumerable]
# @return [Integer] number of codepoints processed
# @raise [StandardError] the first error raised by `write` in a worker
def drain_threaded(enum)
  queue = SizedQueue.new(@parallel_workers * 4)
  mutex = Mutex.new
  count = 0
  failure = nil

  workers = Array.new(@parallel_workers) do
    Thread.new do
      while (cp = queue.pop)
        begin
          write(cp)
          mutex.synchronize { count += 1 }
        rescue StandardError => e
          # Keep draining so the producer is never stuck on a full queue.
          mutex.synchronize { failure ||= e }
        end
      end
    end
  end

  begin
    enum.each do |cp|
      break if failure

      queue << cp unless cp.nil?
    end
  ensure
    queue.close
    workers.each(&:join)
  end

  raise failure if failure

  count
end
90
+
91
# Pretty-printed JSON for one codepoint, produced by the model's own `to_json`.
# @param codepoint [#to_json]
# @return [String]
def serialize(codepoint)
  json = codepoint.to_json(pretty: true)
  json
end
94
+ end
95
+ end
96
+ end
@@ -0,0 +1,122 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+
5
+ module Ucode
6
+ module Repo
7
+ # Pure functions describing the on-disk layout of the output tree.
8
+ #
9
+ # The only code that knows the path conventions. Site generator,
10
+ # CLI, glyph writer, and fontisan adapter all go through here.
11
+ #
12
+ # All methods are pure: no I/O, no global state, no side effects.
13
+ # Returns Pathname instances so callers can compose further.
14
module Paths
  BLOCKS_DIR = "blocks"
  PLANES_DIR = "planes"
  SCRIPTS_DIR = "scripts"
  INDEX_DIR = "index"
  INDEX_FILENAME = "index.json"
  GLYPH_FILENAME = "glyph.svg"
  PLANE_FILENAME_PREFIX = "" # plane files are <n>.json
  private_constant :BLOCKS_DIR, :PLANES_DIR, :SCRIPTS_DIR, :INDEX_DIR,
                   :INDEX_FILENAME, :GLYPH_FILENAME,
                   :PLANE_FILENAME_PREFIX

  # Canonical "U+XXXX" id for an integer codepoint: uppercase hex,
  # zero-padded to at least four digits.
  # @param cp [Integer]
  # @return [String]
  def self.cp_id(cp)
    format("U+%04X", cp)
  end

  # Directory holding everything for one block.
  # @param output_root [String, Pathname]
  # @param block_id [String] block id, used verbatim as the directory name
  # @return [Pathname]
  def self.block_dir(output_root, block_id)
    root = Pathname(output_root)
    root.join(BLOCKS_DIR, block_id)
  end

  # Directory for one codepoint inside its block directory.
  # @param output_root [String, Pathname]
  # @param block_id [String]
  # @param cp_id [String] e.g. "U+0041"
  # @return [Pathname]
  def self.codepoint_dir(output_root, block_id, cp_id)
    parent = block_dir(output_root, block_id)
    parent.join(cp_id)
  end

  # JSON document of one codepoint.
  # @param output_root [String, Pathname]
  # @param block_id [String]
  # @param cp_id [String]
  # @return [Pathname]
  def self.codepoint_json_path(output_root, block_id, cp_id)
    directory = codepoint_dir(output_root, block_id, cp_id)
    directory.join(INDEX_FILENAME)
  end

  # SVG glyph of one codepoint.
  # @param output_root [String, Pathname]
  # @param block_id [String]
  # @param cp_id [String]
  # @return [Pathname]
  def self.codepoint_glyph_path(output_root, block_id, cp_id)
    directory = codepoint_dir(output_root, block_id, cp_id)
    directory.join(GLYPH_FILENAME)
  end

  # Metadata document of one block, stored inside the block directory.
  # @param output_root [String, Pathname]
  # @param block_id [String]
  # @return [Pathname]
  def self.block_metadata_path(output_root, block_id)
    directory = block_dir(output_root, block_id)
    directory.join(INDEX_FILENAME)
  end

  # Index listing all blocks.
  # @param output_root [String, Pathname]
  # @return [Pathname]
  def self.blocks_index_path(output_root)
    root = Pathname(output_root)
    root.join(BLOCKS_DIR, INDEX_FILENAME)
  end

  # Metadata document of one plane (`<n>.json`).
  # @param output_root [String, Pathname]
  # @param plane_number [Integer]
  # @return [Pathname]
  def self.plane_metadata_path(output_root, plane_number)
    file_name = "#{plane_number}.json"
    Pathname(output_root).join(PLANES_DIR, file_name)
  end

  # Metadata document of one script (`<code>.json`).
  # @param output_root [String, Pathname]
  # @param script_code [String]
  # @return [Pathname]
  def self.script_metadata_path(output_root, script_code)
    file_name = "#{script_code}.json"
    Pathname(output_root).join(SCRIPTS_DIR, file_name)
  end

  # Lookup index of codepoint names.
  # @param output_root [String, Pathname]
  # @return [Pathname]
  def self.names_index_path(output_root)
    root = Pathname(output_root)
    root.join(INDEX_DIR, "names.json")
  end

  # Lookup index of codepoint labels.
  # @param output_root [String, Pathname]
  # @return [Pathname]
  def self.labels_index_path(output_root)
    root = Pathname(output_root)
    root.join(INDEX_DIR, "labels.json")
  end

  # Manifest at the top of the output tree.
  # @param output_root [String, Pathname]
  # @return [Pathname]
  def self.manifest_path(output_root)
    root = Pathname(output_root)
    root.join("manifest.json")
  end

  # Temp-file location used for atomic writes: a sibling of `path` with
  # ".tmp" appended, so the rename stays within one directory.
  # @param path [Pathname]
  # @return [Pathname]
  def self.tmp_path(path)
    temp_name = "#{path.basename}.tmp"
    path.parent.join(temp_name)
  end
end
121
+ end
122
+ end
data/lib/ucode/repo.rb ADDED
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ # Repo — writes the output tree under output/.
5
+ #
6
+ # One folder per codepoint (CJK included), no exceptions:
7
+ #
8
+ # output/planes/<n>.json
9
+ # output/blocks/<ORIGINAL_NAME>/index.json
10
+ # output/blocks/<ORIGINAL_NAME>/<U+XXXX>/index.json
11
+ # output/blocks/<ORIGINAL_NAME>/<U+XXXX>/glyph.svg
12
+ # output/scripts/<ScriptCode>.json
13
+ # output/index/names.json
14
+ # output/index/labels.json
15
+ # output/manifest.json
16
+ module Repo
17
+ autoload :Paths, "ucode/repo/paths"
18
+ autoload :AtomicWrites, "ucode/repo/atomic_writes"
19
+ autoload :CodepointWriter, "ucode/repo/codepoint_writer"
20
+ autoload :AggregateWriter, "ucode/repo/aggregate_writer"
21
+ end
22
+ end