ucode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +211 -0
  3. data/Gemfile +22 -0
  4. data/Gemfile.lock +406 -0
  5. data/README.md +469 -0
  6. data/Rakefile +18 -0
  7. data/TODO.new/00-README.md +66 -0
  8. data/TODO.new/01-pillar-terminology-alignment.md +69 -0
  9. data/TODO.new/02-audit-schema-design.md +255 -0
  10. data/TODO.new/03-directory-output-spec.md +203 -0
  11. data/TODO.new/04-fontist-org-contract.md +173 -0
  12. data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
  13. data/TODO.new/06-audit-namespace-skeleton.md +105 -0
  14. data/TODO.new/07-audit-models-port.md +132 -0
  15. data/TODO.new/08-extractors-cheap-port.md +113 -0
  16. data/TODO.new/09-extractors-expensive-port.md +99 -0
  17. data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
  18. data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
  19. data/TODO.new/12-formatters-port.md +115 -0
  20. data/TODO.new/13-directory-emitter.md +147 -0
  21. data/TODO.new/14-html-face-browser.md +144 -0
  22. data/TODO.new/15-html-library-browser.md +102 -0
  23. data/TODO.new/16-cli-audit-subcommands.md +142 -0
  24. data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
  25. data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
  26. data/TODO.new/19-fontisan-docs-update.md +155 -0
  27. data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
  28. data/TODO.new/21-canonical-unicode17-build.md +148 -0
  29. data/TODO.new/22-implementation-order.md +176 -0
  30. data/UCODE_CHANGELOG.md +97 -0
  31. data/exe/ucode +8 -0
  32. data/lib/ucode/aggregator.rb +77 -0
  33. data/lib/ucode/audit/block_aggregator.rb +90 -0
  34. data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
  35. data/lib/ucode/audit/context.rb +137 -0
  36. data/lib/ucode/audit/discrepancy_detector.rb +213 -0
  37. data/lib/ucode/audit/extractors/aggregations.rb +70 -0
  38. data/lib/ucode/audit/extractors/base.rb +21 -0
  39. data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
  40. data/lib/ucode/audit/extractors/coverage.rb +55 -0
  41. data/lib/ucode/audit/extractors/hinting.rb +199 -0
  42. data/lib/ucode/audit/extractors/identity.rb +65 -0
  43. data/lib/ucode/audit/extractors/licensing.rb +75 -0
  44. data/lib/ucode/audit/extractors/metrics.rb +108 -0
  45. data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
  46. data/lib/ucode/audit/extractors/provenance.rb +34 -0
  47. data/lib/ucode/audit/extractors/style.rb +88 -0
  48. data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
  49. data/lib/ucode/audit/extractors.rb +31 -0
  50. data/lib/ucode/audit/plane_aggregator.rb +37 -0
  51. data/lib/ucode/audit/registry.rb +63 -0
  52. data/lib/ucode/audit/script_aggregator.rb +92 -0
  53. data/lib/ucode/audit.rb +27 -0
  54. data/lib/ucode/cache.rb +113 -0
  55. data/lib/ucode/cli.rb +272 -0
  56. data/lib/ucode/commands/build.rb +68 -0
  57. data/lib/ucode/commands/cache.rb +46 -0
  58. data/lib/ucode/commands/fetch.rb +62 -0
  59. data/lib/ucode/commands/font_coverage.rb +57 -0
  60. data/lib/ucode/commands/glyphs.rb +136 -0
  61. data/lib/ucode/commands/lookup.rb +65 -0
  62. data/lib/ucode/commands/parse.rb +62 -0
  63. data/lib/ucode/commands/site.rb +33 -0
  64. data/lib/ucode/commands.rb +19 -0
  65. data/lib/ucode/config.rb +110 -0
  66. data/lib/ucode/coordinator/indices.rb +34 -0
  67. data/lib/ucode/coordinator.rb +397 -0
  68. data/lib/ucode/database.rb +214 -0
  69. data/lib/ucode/db_builder.rb +107 -0
  70. data/lib/ucode/error.rb +96 -0
  71. data/lib/ucode/fetch/code_charts.rb +57 -0
  72. data/lib/ucode/fetch/http.rb +83 -0
  73. data/lib/ucode/fetch/ucd_zip.rb +57 -0
  74. data/lib/ucode/fetch/unihan_zip.rb +57 -0
  75. data/lib/ucode/fetch.rb +14 -0
  76. data/lib/ucode/glyphs/cell_extractor.rb +130 -0
  77. data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
  78. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
  79. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
  80. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
  81. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
  82. data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
  83. data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
  84. data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
  85. data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
  86. data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
  87. data/lib/ucode/glyphs/grid.rb +30 -0
  88. data/lib/ucode/glyphs/grid_detector.rb +165 -0
  89. data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
  90. data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
  91. data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
  92. data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
  93. data/lib/ucode/glyphs/last_resort/source.rb +125 -0
  94. data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
  95. data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
  96. data/lib/ucode/glyphs/last_resort.rb +36 -0
  97. data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
  98. data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
  99. data/lib/ucode/glyphs/page_renderer.rb +221 -0
  100. data/lib/ucode/glyphs/path_bbox.rb +62 -0
  101. data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
  102. data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
  103. data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
  104. data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
  105. data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
  106. data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
  107. data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
  108. data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
  109. data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
  110. data/lib/ucode/glyphs/real_fonts.rb +32 -0
  111. data/lib/ucode/glyphs/writer.rb +250 -0
  112. data/lib/ucode/glyphs.rb +27 -0
  113. data/lib/ucode/index.rb +106 -0
  114. data/lib/ucode/index_builder.rb +94 -0
  115. data/lib/ucode/models/audit/audit_axis.rb +30 -0
  116. data/lib/ucode/models/audit/audit_diff.rb +77 -0
  117. data/lib/ucode/models/audit/audit_report.rb +137 -0
  118. data/lib/ucode/models/audit/baseline.rb +32 -0
  119. data/lib/ucode/models/audit/block_summary.rb +72 -0
  120. data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
  121. data/lib/ucode/models/audit/codepoint_range.rb +39 -0
  122. data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
  123. data/lib/ucode/models/audit/color_capabilities.rb +91 -0
  124. data/lib/ucode/models/audit/discrepancy.rb +38 -0
  125. data/lib/ucode/models/audit/duplicate_group.rb +23 -0
  126. data/lib/ucode/models/audit/embedding_type.rb +81 -0
  127. data/lib/ucode/models/audit/field_change.rb +28 -0
  128. data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
  129. data/lib/ucode/models/audit/gasp_range.rb +63 -0
  130. data/lib/ucode/models/audit/hinting.rb +99 -0
  131. data/lib/ucode/models/audit/library_summary.rb +40 -0
  132. data/lib/ucode/models/audit/licensing.rb +48 -0
  133. data/lib/ucode/models/audit/metrics.rb +111 -0
  134. data/lib/ucode/models/audit/named_instance.rb +41 -0
  135. data/lib/ucode/models/audit/opentype_layout.rb +38 -0
  136. data/lib/ucode/models/audit/plane_summary.rb +31 -0
  137. data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
  138. data/lib/ucode/models/audit/script_features.rb +28 -0
  139. data/lib/ucode/models/audit/script_summary.rb +54 -0
  140. data/lib/ucode/models/audit/variation_detail.rb +42 -0
  141. data/lib/ucode/models/audit.rb +50 -0
  142. data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
  143. data/lib/ucode/models/bidi_mirroring.rb +19 -0
  144. data/lib/ucode/models/binary_property_assignment.rb +26 -0
  145. data/lib/ucode/models/block.rb +36 -0
  146. data/lib/ucode/models/case_folding_rule.rb +23 -0
  147. data/lib/ucode/models/cjk_radical.rb +23 -0
  148. data/lib/ucode/models/codepoint/bidi.rb +28 -0
  149. data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
  150. data/lib/ucode/models/codepoint/case_folding.rb +25 -0
  151. data/lib/ucode/models/codepoint/casing.rb +32 -0
  152. data/lib/ucode/models/codepoint/decomposition.rb +27 -0
  153. data/lib/ucode/models/codepoint/display.rb +24 -0
  154. data/lib/ucode/models/codepoint/emoji.rb +29 -0
  155. data/lib/ucode/models/codepoint/hangul.rb +20 -0
  156. data/lib/ucode/models/codepoint/identifier.rb +30 -0
  157. data/lib/ucode/models/codepoint/indic.rb +20 -0
  158. data/lib/ucode/models/codepoint/joining.rb +20 -0
  159. data/lib/ucode/models/codepoint/normalization.rb +35 -0
  160. data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
  161. data/lib/ucode/models/codepoint.rb +122 -0
  162. data/lib/ucode/models/name_alias.rb +21 -0
  163. data/lib/ucode/models/named_sequence.rb +19 -0
  164. data/lib/ucode/models/names_list_entry.rb +38 -0
  165. data/lib/ucode/models/plane.rb +36 -0
  166. data/lib/ucode/models/property_alias.rb +24 -0
  167. data/lib/ucode/models/property_value_alias.rb +26 -0
  168. data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
  169. data/lib/ucode/models/relationship/cross_reference.rb +17 -0
  170. data/lib/ucode/models/relationship/footnote.rb +24 -0
  171. data/lib/ucode/models/relationship/informal_alias.rb +18 -0
  172. data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
  173. data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
  174. data/lib/ucode/models/relationship.rb +57 -0
  175. data/lib/ucode/models/script.rb +41 -0
  176. data/lib/ucode/models/special_casing_rule.rb +28 -0
  177. data/lib/ucode/models/standardized_variant.rb +24 -0
  178. data/lib/ucode/models/unihan_entry.rb +23 -0
  179. data/lib/ucode/models.rb +47 -0
  180. data/lib/ucode/parsers/auxiliary.rb +26 -0
  181. data/lib/ucode/parsers/base.rb +137 -0
  182. data/lib/ucode/parsers/bidi_brackets.rb +41 -0
  183. data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
  184. data/lib/ucode/parsers/blocks.rb +63 -0
  185. data/lib/ucode/parsers/case_folding.rb +53 -0
  186. data/lib/ucode/parsers/cjk_radicals.rb +102 -0
  187. data/lib/ucode/parsers/derived_age.rb +59 -0
  188. data/lib/ucode/parsers/derived_core_properties.rb +60 -0
  189. data/lib/ucode/parsers/extracted_properties.rb +74 -0
  190. data/lib/ucode/parsers/name_aliases.rb +44 -0
  191. data/lib/ucode/parsers/named_sequences.rb +51 -0
  192. data/lib/ucode/parsers/names_list.rb +250 -0
  193. data/lib/ucode/parsers/property_aliases.rb +41 -0
  194. data/lib/ucode/parsers/property_value_aliases.rb +46 -0
  195. data/lib/ucode/parsers/script_extensions.rb +64 -0
  196. data/lib/ucode/parsers/scripts.rb +60 -0
  197. data/lib/ucode/parsers/special_casing.rb +62 -0
  198. data/lib/ucode/parsers/standardized_variants.rb +56 -0
  199. data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
  200. data/lib/ucode/parsers/unicode_data.rb +268 -0
  201. data/lib/ucode/parsers/unihan.rb +125 -0
  202. data/lib/ucode/parsers.rb +35 -0
  203. data/lib/ucode/range_entry.rb +58 -0
  204. data/lib/ucode/repo/aggregate_writer.rb +364 -0
  205. data/lib/ucode/repo/atomic_writes.rb +48 -0
  206. data/lib/ucode/repo/codepoint_writer.rb +96 -0
  207. data/lib/ucode/repo/paths.rb +122 -0
  208. data/lib/ucode/repo.rb +22 -0
  209. data/lib/ucode/site/config_emitter.rb +124 -0
  210. data/lib/ucode/site/generator.rb +178 -0
  211. data/lib/ucode/site/search_index.rb +68 -0
  212. data/lib/ucode/site/template/.gitignore +4 -0
  213. data/lib/ucode/site/template/.vitepress/config.ts +8 -0
  214. data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
  215. data/lib/ucode/site/template/char/[codepoint].md +13 -0
  216. data/lib/ucode/site/template/components/BlockView.vue +57 -0
  217. data/lib/ucode/site/template/components/CharView.vue +85 -0
  218. data/lib/ucode/site/template/components/PlaneView.vue +56 -0
  219. data/lib/ucode/site/template/components/SearchView.vue +66 -0
  220. data/lib/ucode/site/template/index.md +25 -0
  221. data/lib/ucode/site/template/package.json +18 -0
  222. data/lib/ucode/site/template/search.md +9 -0
  223. data/lib/ucode/site.rb +13 -0
  224. data/lib/ucode/version.rb +5 -0
  225. data/lib/ucode/version_resolver.rb +76 -0
  226. data/lib/ucode.rb +74 -0
  227. data/ucode.gemspec +56 -0
  228. metadata +404 -0
@@ -0,0 +1,397 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+ require "ucode/parsers"
5
+ require "ucode/models"
6
+
7
module Ucode
  # Orchestrates the UCD + Unihan parsers and produces per-codepoint
  # CodePoint records for a downstream sink (a writer, an aggregator,
  # a database builder).
  #
  # **Streaming architecture**:
  #
  # 1. Indices pass — load every range/point file into memory, keyed
  #    by codepoint (hash) or sorted by `range_first` (bsearch).
  #    Only the indices are held in memory, not the full CodePoint set.
  #
  # 2. Stream pass — `Parsers::UnicodeData.each_record` drives the main
  #    loop. For each yielded CodePoint, the Coordinator merges in data
  #    from the indices, then yields to the sink.
  #
  # Every *index* data file is optional — if a file is missing (partial
  # fetch, incremental run), the corresponding index stays empty and the
  # matching CodePoint fields stay at their defaults.
  #
  # NOTE(review): `UnicodeData.txt` itself is handed to the parser without
  # an existence check; behaviour when it is missing depends on
  # `Parsers::UnicodeData` — confirm.
  #
  # All hash indices are plain Hashes: looking up a codepoint that has no
  # entry returns nil and never adds a key, so the indices are not mutated
  # by the stream pass and can safely be reused afterwards.
  class Coordinator
    autoload :Indices, "ucode/coordinator/indices"

    # Property short name used in PropertyValueAliases.txt for Script.
    ISO_SCRIPT_PROPERTY = "sc".freeze
    private_constant :ISO_SCRIPT_PROPERTY

    attr_reader :config

    # @param config [Object] configuration object; defaults to
    #   `Ucode.configuration`. Only stored, not otherwise used here.
    def initialize(config = Ucode.configuration)
      @config = config
    end

    # Stream-driven build. Thin alias for #each_codepoint: calls `block`
    # once per CodePoint yielded by the UnicodeData parser.
    #
    # @param ucd_dir [String, Pathname] directory holding the UCD files
    # @param unihan_dir [String, Pathname, nil] directory holding Unihan files
    # @return [nil, Enumerator] Enumerator when no block is given
    def build(ucd_dir:, unihan_dir:, &block)
      each_codepoint(ucd_dir: ucd_dir, unihan_dir: unihan_dir, &block)
    end

    # Yields one enriched CodePoint per record of UnicodeData.txt.
    #
    # @param ucd_dir [String, Pathname]
    # @param unihan_dir [String, Pathname, nil]
    # @yieldparam cp [Object] the enriched CodePoint
    # @return [nil, Enumerator] an Enumerator when called without a block
    def each_codepoint(ucd_dir:, unihan_dir:)
      return enum_for(:each_codepoint, ucd_dir: ucd_dir, unihan_dir: unihan_dir) unless block_given?

      indices = build_indices(ucd_dir, unihan_dir)
      each_with_indices(ucd_dir: ucd_dir, unihan_dir: unihan_dir, indices: indices) do |cp|
        yield cp
      end

      nil
    end

    # Like #each_codepoint but yields `(indices, cp)` so callers that
    # need the indices for a post-stream flush can reuse them instead of
    # re-building.
    #
    # @param ucd_dir [String, Pathname]
    # @param unihan_dir [String, Pathname, nil]
    # @yieldparam indices [Coordinator::Indices]
    # @yieldparam cp [Object] the enriched CodePoint
    # @return [nil, Enumerator] an Enumerator when called without a block
    def each_codepoint_with_indices(ucd_dir:, unihan_dir:)
      unless block_given?
        return enum_for(:each_codepoint_with_indices, ucd_dir: ucd_dir, unihan_dir: unihan_dir)
      end

      indices = build_indices(ucd_dir, unihan_dir)
      each_with_indices(ucd_dir: ucd_dir, unihan_dir: unihan_dir, indices: indices) do |cp|
        yield indices, cp
      end

      nil
    end

    # Builds and returns the Coordinator::Indices for the given UCD +
    # Unihan dirs, without running the streaming pass.
    #
    # @param ucd_dir [String, Pathname]
    # @param unihan_dir [String, Pathname, nil]
    # @return [Coordinator::Indices]
    def indices_for(ucd_dir:, unihan_dir:)
      build_indices(ucd_dir, unihan_dir)
    end

    private

    # Stream pass: enriches every CodePoint coming out of UnicodeData.txt
    # with the pre-built indices and yields it. `unihan_dir` is accepted
    # for signature symmetry but is not read here (Unihan data is already
    # inside `indices`).
    def each_with_indices(ucd_dir:, unihan_dir:, indices:)
      unicode_data_path = Pathname.new(ucd_dir).join("UnicodeData.txt")

      Parsers::UnicodeData.each_record(unicode_data_path) do |cp|
        enrich(cp, indices)
        yield cp
      end
    end

    # Indices pass: parses every auxiliary file once and bundles the
    # resulting lookup structures into a Coordinator::Indices.
    def build_indices(ucd_dir, unihan_dir)
      # Resolved first because the scripts index needs it to attach codes.
      property_value_aliases = property_value_aliases_index(ucd_dir)

      Indices.new(
        blocks: range_index(ucd_dir, "Blocks.txt", Parsers::Blocks),
        scripts: scripts_index(ucd_dir, property_value_aliases),
        property_value_aliases: property_value_aliases,
        derived_age: cp_index(ucd_dir, "DerivedAge.txt", Parsers::DerivedAge, :cp),
        binary_properties: multi_cp_index(ucd_dir, "DerivedCoreProperties.txt",
                                          Parsers::DerivedCoreProperties),
        script_extensions: multi_cp_index(ucd_dir, "ScriptExtensions.txt",
                                          Parsers::ScriptExtensions, :cp),
        bidi_mirroring: cp_index(ucd_dir, "BidiMirroring.txt",
                                 Parsers::BidiMirroring, :codepoint),
        bidi_brackets: cp_index(ucd_dir, "BidiBrackets.txt",
                                Parsers::BidiBrackets, :codepoint),
        special_casing: multi_cp_index(ucd_dir, "SpecialCasing.txt",
                                       Parsers::SpecialCasing),
        # CaseFolding: one cp can carry C, F, S, and T statuses; the
        # Coordinator buckets each row into CodePoint::CaseFolding by
        # status, so the index holds an Array per cp.
        case_folding: multi_cp_index(ucd_dir, "CaseFolding.txt",
                                     Parsers::CaseFolding, :codepoint),
        name_aliases: multi_cp_index(ucd_dir, "NameAliases.txt",
                                     Parsers::NameAliases),
        # CJKRadicals: the lookup key is the ideograph_id ("U+XXXX"),
        # not the radical_number or the cjk_radical_id.
        cjk_radicals: multi_cp_index_by_id(ucd_dir, "CJKRadicals.txt",
                                           Parsers::CjkRadicals, :ideograph_id),
        standardized_variants: multi_cp_index_by_id(ucd_dir, "StandardizedVariants.txt",
                                                    Parsers::StandardizedVariants, :base_id),
        names_list: names_list_index(ucd_dir),
        unihan: unihan_index(unihan_dir)
      )
    end

    # ---- Index builders -------------------------------------------------

    # Joins `dir` and `filename`.
    # @return [Pathname, nil] the path if the file exists, nil otherwise
    def existing_file(dir, filename)
      path = Pathname.new(dir).join(filename)
      path if path.exist?
    end

    # Array of range records sorted by `range_first`, ready for
    # #find_in_range. Empty when the file is missing.
    def range_index(ucd_dir, filename, parser)
      path = existing_file(ucd_dir, filename)
      return [] unless path

      parser.each_record(path).to_a.sort_by(&:range_first)
    end

    # Builds the sorted Script array and sets each Script's `code` from
    # the pre-computed property_value_aliases map (long name → short
    # code), so the alias is resolved once per range rather than once per
    # codepoint. `code` is nil when the map has no entry for the name.
    def scripts_index(ucd_dir, property_value_aliases)
      path = existing_file(ucd_dir, "Scripts.txt")
      return [] unless path

      scripts = Parsers::Scripts.each_record(path).map do |script|
        script.code = property_value_aliases[script.name]
        script
      end
      scripts.sort_by(&:range_first)
    end

    # Single-valued index: `record.<key_method>` → record. A later record
    # with the same key replaces an earlier one. Empty when the file is
    # missing.
    def cp_index(ucd_dir, filename, parser, key_method)
      path = existing_file(ucd_dir, filename)
      return {} unless path

      parser.each_record(path).each_with_object({}) do |record, index|
        index[record.public_send(key_method)] = record
      end
    end

    # Multi-valued index: `record.<key_method>` → Array of records, in
    # file order (e.g. one cp can have several binary properties, several
    # script extensions, several SpecialCasing rules).
    #
    # The result is a plain Hash WITHOUT a default proc: a miss returns
    # nil and does not insert a key. This matters because the stream pass
    # probes these indices once per codepoint; an auto-vivifying default
    # would add an empty Array for every miss and leave phantom keys for
    # anyone reusing the indices afterwards.
    #
    # @return [Hash{Object => Array}] empty when the file is missing
    def multi_cp_index(ucd_dir, filename, parser, key_method = :codepoint)
      path = existing_file(ucd_dir, filename)
      return {} unless path

      parser.each_record(path).group_by { |record| record.public_send(key_method) }
    end

    # Multi-valued index keyed by a "U+XXXX" string id (e.g. standardized
    # variants are keyed by base_id). Same contract as #multi_cp_index;
    # only the key type differs, which the key_method decides.
    def multi_cp_index_by_id(ucd_dir, filename, parser, key_method)
      multi_cp_index(ucd_dir, filename, parser, key_method)
    end

    # Map of Script long name → short alias, restricted to rows whose
    # property is ISO_SCRIPT_PROPERTY.
    def property_value_aliases_index(ucd_dir)
      path = existing_file(ucd_dir, "PropertyValueAliases.txt")
      return {} unless path

      Parsers::PropertyValueAliases.each_record(path).each_with_object({}) do |pva, index|
        next unless pva.property == ISO_SCRIPT_PROPERTY

        index[pva.long] = pva.short
      end
    end

    # Map of codepoint → NamesList entry.
    def names_list_index(ucd_dir)
      path = existing_file(ucd_dir, "NamesList.txt")
      return {} unless path

      Parsers::NamesList.each_record(path).each_with_object({}) do |entry, index|
        index[entry.codepoint] = entry
      end
    end

    # Map of codepoint → Models::UnihanEntry, collecting every
    # field/value pair the Unihan parser reports for that codepoint.
    # Empty when `unihan_dir` is nil or does not exist.
    def unihan_index(unihan_dir)
      return {} if unihan_dir.nil?

      dir = Pathname.new(unihan_dir)
      return {} unless dir.exist?

      by_field = Hash.new { |h, k| h[k] = {} }
      Parsers::Unihan.each_in_dir(dir) do |record|
        by_field[record.cp][record.field] = record.field_values
      end

      by_field.transform_values { |fields| Models::UnihanEntry.new(fields: fields) }
    end

    # ---- Per-codepoint enrichment --------------------------------------

    # Merges every index into `cp` in place. The order of the assign_*
    # calls determines the order of entries appended to
    # `cp.relationships`.
    def enrich(cp, indices)
      cp.plane_number = cp.cp >> 16 # 0x10000 codepoints per plane
      cp.block_id = find_in_range(cp.cp, indices.blocks)&.id
      assign_script(cp, indices)
      assign_script_extensions(cp, indices)
      assign_age(cp, indices)
      assign_bidi(cp, indices)
      assign_casing(cp, indices)
      assign_case_folding(cp, indices)
      assign_binary_properties(cp, indices)
      assign_names_list(cp, indices)
      assign_name_aliases(cp, indices)
      assign_standardized_variants(cp, indices)
      assign_unihan(cp, indices)
      assign_cjk_radical(cp, indices)
    end

    # Sets `script_code` to the script's resolved code, falling back to
    # its name when no alias was found.
    def assign_script(cp, indices)
      script = find_in_range(cp.cp, indices.scripts)
      return unless script

      cp.script_code = script.code || script.name
    end

    def assign_script_extensions(cp, indices)
      tuples = indices.script_extensions[cp.cp]
      return if tuples.nil? || tuples.empty?

      tuples.each { |tuple| cp.script_extensions << tuple.script_code }
    end

    def assign_age(cp, indices)
      record = indices.derived_age[cp.cp]
      return unless record

      cp.age = record.age
    end

    # Fills mirroring / paired-bracket data, creating the Bidi sub-model
    # only when at least one of the two sources has an entry.
    def assign_bidi(cp, indices)
      mirroring = indices.bidi_mirroring[cp.cp]
      brackets = indices.bidi_brackets[cp.cp]
      return unless mirroring || brackets

      cp.bidi ||= Models::CodePoint::Bidi.new
      cp.bidi.mirroring_glyph_id = mirroring.mirrored_id if mirroring
      if brackets
        cp.bidi.paired_bracket_type = brackets.type
        cp.bidi.paired_bracket_id = brackets.paired_id
      end
    end

    def assign_casing(cp, indices)
      rules = indices.special_casing[cp.cp]
      return if rules.nil? || rules.empty?

      # NOTE: do not uniq the *_ids arrays — a mapping like U+00DF → "SS"
      # legitimately contains two U+0053 entries and they must be
      # preserved in order. Conditions, by contrast, are categorical
      # tags (Final_Sigma, tr, After_I) and deduping them is correct.
      #
      # NOTE(review): when a codepoint has several SpecialCasing rows,
      # their mappings are concatenated into one array; confirm that is
      # the intended representation for conditional rows.
      cp.casing ||= Models::CodePoint::Casing.new
      cp.casing.full_upper_ids = rules.flat_map(&:upper_ids)
      cp.casing.full_lower_ids = rules.flat_map(&:lower_ids)
      cp.casing.full_title_ids = rules.flat_map(&:title_ids)
      cp.casing.conditions = rules.flat_map(&:conditions).uniq
    end

    # Buckets each CaseFolding row by status: C/S/T keep only the first
    # mapping id, F keeps the whole sequence. Other statuses are ignored.
    def assign_case_folding(cp, indices)
      rules = indices.case_folding[cp.cp]
      return if rules.nil? || rules.empty?

      cp.case_folding ||= Models::CodePoint::CaseFolding.new
      rules.each do |rule|
        case rule.status
        when "C" then cp.case_folding.common_id = rule.mapping_ids.first
        when "S" then cp.case_folding.simple_id = rule.mapping_ids.first
        when "T" then cp.case_folding.turkic_id = rule.mapping_ids.first
        when "F" then cp.case_folding.full_ids = rule.mapping_ids
        end
      end
    end

    def assign_binary_properties(cp, indices)
      records = indices.binary_properties[cp.cp]
      return if records.nil? || records.empty?

      cp.binary_properties = records.map(&:property_short)
    end

    # Attaches the NamesList entry and copies its relationship lists onto
    # the codepoint, in a fixed order.
    def assign_names_list(cp, indices)
      entry = indices.names_list[cp.cp]
      return unless entry

      cp.names_list = entry
      cp.relationships.concat(entry.cross_references)
      cp.relationships.concat(entry.sample_sequences)
      cp.relationships.concat(entry.compatibility_equivalents)
      cp.relationships.concat(entry.informal_aliases)
      cp.relationships.concat(entry.footnotes)
    end

    # Records each NameAliases row as an InformalAlias relationship.
    def assign_name_aliases(cp, indices)
      aliases = indices.name_aliases[cp.cp]
      return if aliases.nil? || aliases.empty?

      aliases.each do |alias_record|
        cp.relationships << Models::Relationship::InformalAlias.new(
          description: alias_record.text,
          source: "name_aliases"
        )
      end
    end

    # Keyed by `cp.id` (string id), not the integer codepoint.
    def assign_standardized_variants(cp, indices)
      variants = indices.standardized_variants[cp.id]
      return if variants.nil? || variants.empty?

      cp.standardized_variants = variants
      variants.each do |variant|
        cp.relationships << Models::Relationship::VariationSequence.new(
          target_ids: [variant.base_id, variant.variation_selector_id],
          description: variant.description,
          contexts: variant.contexts,
          source: "standardized_variants"
        )
      end
    end

    def assign_unihan(cp, indices)
      entry = indices.unihan[cp.cp]
      return unless entry

      cp.unihan = entry
    end

    # Keyed by `cp.id` (string id); adds one CrossReference per radical.
    def assign_cjk_radical(cp, indices)
      radicals = indices.cjk_radicals[cp.id]
      return if radicals.nil? || radicals.empty?

      radicals.each do |radical|
        cp.relationships << Models::Relationship::CrossReference.new(
          target_ids: [radical.cjk_radical_id],
          description: "KangXi radical ##{radical.radical_number}",
          source: "cjk_radicals"
        )
      end
    end

    # ---- Range lookup (bsearch) ----------------------------------------

    # Finds the record whose [range_first, range_last] contains `cp` in an
    # array sorted by `range_first`.
    #
    # Uses Array#bsearch in find-any mode: the block returns a negative
    # number when the target lies in earlier (lower-indexed) records, a
    # positive number when it lies in later records, and 0 on a match.
    #
    # @param cp [Integer]
    # @param sorted_ranges [Array, nil] records responding to
    #   `range_first` and `range_last`
    # @return [Object, nil] the containing record, or nil if none
    def find_in_range(cp, sorted_ranges)
      return nil if sorted_ranges.nil? || sorted_ranges.empty?

      sorted_ranges.bsearch do |record|
        if cp < record.range_first
          -1
        elsif cp > record.range_last
          1
        else
          0
        end
      end
    end
  end
end
@@ -0,0 +1,214 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sqlite3"
4
+ require "ucode/cache"
5
+ require "ucode/error"
6
+ require "ucode/range_entry"
7
+
8
module Ucode
  # SQLite-backed UCD lookup index for one Unicode version.
  #
  # One Database instance = one `.sqlite3` file at
  # `Cache.sqlite_path(version)`. The queries in this class read two range
  # tables (`blocks` and `scripts`, each with `first_cp`, `last_cp`,
  # `name` columns) and a `schema_meta` key/value table.
  #
  # Why SQLite (alongside the YAML Index):
  #
  # - Persistent across processes — build once, reuse across runs.
  # - Indexed queries load only the requested rows.
  # - Small on disk after coalescing.
  #
  # Lifecycle:
  #
  #   Database.build(version)    # builds via DbBuilder, then opens
  #   Database.open(version)     # opens existing SQLite (read-only)
  #   Database.cached?(version)  # checks for .sqlite3 file
  #
  class Database
    SCHEMA_VERSION = "1"

    BLOCKS_TABLE = "blocks"
    SCRIPTS_TABLE = "scripts"
    private_constant :BLOCKS_TABLE, :SCRIPTS_TABLE

    class << self
      # Open an existing database and verify its schema version.
      #
      # If verification fails for any reason, the freshly opened
      # connection is closed before the error propagates, so a failed
      # open does not leak a SQLite handle.
      #
      # @param version [String]
      # @return [Database]
      # @raise [DatabaseMissingError] if the file is absent
      # @raise [DatabaseSchemaError] if the on-disk schema version does
      #   not match `SCHEMA_VERSION`
      def open(version)
        path = Cache.sqlite_path(version)
        unless path.exist?
          raise DatabaseMissingError.new(
            "No UCD SQLite cache for version #{version.inspect} at #{path}",
            context: { version: version, path: path.to_s },
          )
        end

        db = new(path.to_s)
        begin
          db.verify_schema_version!
        rescue StandardError
          db.close
          raise
        end
        db
      end

      # Build the SQLite cache for `version` via DbBuilder, then open it.
      # @param version [String]
      # @return [Database]
      def build(version)
        DbBuilder.build(version)
        open(version)
      end

      # True if a SQLite cache file exists for this version.
      # @param version [String]
      # @return [Boolean]
      def cached?(version)
        Cache.sqlite_path(version).exist?
      end
    end

    # Opens the file read-only with hash-style result rows and a 5000 ms
    # busy timeout.
    # @param path [String] path to the .sqlite3 file
    def initialize(path)
      @db = SQLite3::Database.new(path, readonly: true, results_as_hash: true)
      @db.busy_timeout = 5000
    end

    # Memoized `ucd_version` value from `schema_meta`.
    # @return [String] the UCD version this DB was built from.
    def ucd_version
      @ucd_version ||= meta("ucd_version")
    end

    # Memoized `schema_version` value from `schema_meta`.
    # @return [String] the schema version recorded at build time.
    def schema_version
      @schema_version ||= meta("schema_version")
    end

    # Look up the block name covering `codepoint`.
    # @param codepoint [Integer]
    # @return [String, nil] nil if no block range contains the codepoint
    def lookup_block(codepoint)
      lookup(BLOCKS_TABLE, codepoint)
    end

    # Look up the `name` value of the script range covering `codepoint`.
    # @param codepoint [Integer]
    # @return [String, nil] nil if no script range contains the codepoint
    def lookup_script(codepoint)
      lookup(SCRIPTS_TABLE, codepoint)
    end

    # Enumerate every range in the blocks table that overlaps the
    # inclusive query range, sorted by first_cp.
    # @param first [Integer]
    # @param last [Integer]
    # @yieldparam entry [RangeEntry]
    # @return [Enumerator<RangeEntry>] if no block given
    def each_block_overlapping(first, last, &block)
      each_overlapping(BLOCKS_TABLE, first, last, &block)
    end

    # Enumerate every range in the scripts table that overlaps the
    # inclusive query range, sorted by first_cp.
    # @param first [Integer]
    # @param last [Integer]
    # @yieldparam entry [RangeEntry]
    # @return [Enumerator<RangeEntry>] if no block given
    def each_script_overlapping(first, last, &block)
      each_overlapping(SCRIPTS_TABLE, first, last, &block)
    end

    # All block ranges, sorted by first_cp. Mostly useful in specs.
    # @return [Array<RangeEntry>]
    def block_entries
      entries(BLOCKS_TABLE)
    end

    # All script ranges, sorted by first_cp. Mostly useful in specs.
    # @return [Array<RangeEntry>]
    def script_entries
      entries(SCRIPTS_TABLE)
    end

    # Every block range whose `name` equals the given name. Empty for an
    # unknown name.
    # @param name [String] block name as stored (e.g. "Basic_Latin")
    # @return [Array<RangeEntry>] sorted by first_cp
    def block_ranges_by_name(name)
      ranges_by_name(BLOCKS_TABLE, name)
    end

    # Every script range whose `name` equals the given value. Empty for
    # an unknown value.
    # @param name [String] script identifier as stored (e.g. "Latn")
    # @return [Array<RangeEntry>] sorted by first_cp
    def script_ranges_by_name(name)
      ranges_by_name(SCRIPTS_TABLE, name)
    end

    # Close the underlying SQLite connection.
    # @return [void]
    def close
      @db.close
    end

    # Raises DatabaseSchemaError if the on-disk schema version does
    # not match `SCHEMA_VERSION`. Called by `.open`; exposed for
    # consumers that hold a long-lived connection.
    # @return [void]
    # @raise [DatabaseSchemaError]
    def verify_schema_version!
      return if schema_version == SCHEMA_VERSION

      raise DatabaseSchemaError.new(
        "SQLite schema mismatch: on-disk #{schema_version.inspect}, " \
        "expected #{SCHEMA_VERSION.inspect}",
        context: { on_disk: schema_version, expected: SCHEMA_VERSION },
      )
    end

    private

    # Reads one value from the `schema_meta` key/value table.
    def meta(key)
      @db.get_first_value(
        "SELECT value FROM schema_meta WHERE key = ?",
        [key.to_s],
      )
    end

    # Name of one range in `table` containing `codepoint`, or nil.
    # `table` is interpolated into the SQL, so it must only ever be one
    # of the private table-name constants.
    def lookup(table, codepoint)
      @db.get_first_value(
        "SELECT name FROM #{table} WHERE first_cp <= ? AND last_cp >= ? LIMIT 1",
        [codepoint, codepoint],
      )
    end

    # Yields a RangeEntry for every row overlapping [first, last].
    # Two inclusive ranges overlap when row.first_cp <= last and
    # row.last_cp >= first, hence the swapped bind order.
    def each_overlapping(table, first, last)
      return enum_for(:each_overlapping, table, first, last) unless block_given?

      @db.execute(
        "SELECT first_cp, last_cp, name FROM #{table} " \
        "WHERE first_cp <= ? AND last_cp >= ? ORDER BY first_cp",
        [last, first],
      ).each do |row|
        yield range_entry(row)
      end
    end

    # Every row of `table` as RangeEntry, ordered by first_cp.
    def entries(table)
      @db.execute(
        "SELECT first_cp, last_cp, name FROM #{table} ORDER BY first_cp",
      ).map { |row| range_entry(row) }
    end

    # Rows of `table` whose name equals `name`, ordered by first_cp.
    def ranges_by_name(table, name)
      @db.execute(
        "SELECT first_cp, last_cp, name FROM #{table} " \
        "WHERE name = ? ORDER BY first_cp",
        [name],
      ).map { |row| range_entry(row) }
    end

    # Converts one hash-style result row into a RangeEntry.
    def range_entry(row)
      RangeEntry.new(row["first_cp"], row["last_cp"], row["name"])
    end
  end
end