ucode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +211 -0
  3. data/Gemfile +22 -0
  4. data/Gemfile.lock +406 -0
  5. data/README.md +469 -0
  6. data/Rakefile +18 -0
  7. data/TODO.new/00-README.md +66 -0
  8. data/TODO.new/01-pillar-terminology-alignment.md +69 -0
  9. data/TODO.new/02-audit-schema-design.md +255 -0
  10. data/TODO.new/03-directory-output-spec.md +203 -0
  11. data/TODO.new/04-fontist-org-contract.md +173 -0
  12. data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
  13. data/TODO.new/06-audit-namespace-skeleton.md +105 -0
  14. data/TODO.new/07-audit-models-port.md +132 -0
  15. data/TODO.new/08-extractors-cheap-port.md +113 -0
  16. data/TODO.new/09-extractors-expensive-port.md +99 -0
  17. data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
  18. data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
  19. data/TODO.new/12-formatters-port.md +115 -0
  20. data/TODO.new/13-directory-emitter.md +147 -0
  21. data/TODO.new/14-html-face-browser.md +144 -0
  22. data/TODO.new/15-html-library-browser.md +102 -0
  23. data/TODO.new/16-cli-audit-subcommands.md +142 -0
  24. data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
  25. data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
  26. data/TODO.new/19-fontisan-docs-update.md +155 -0
  27. data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
  28. data/TODO.new/21-canonical-unicode17-build.md +148 -0
  29. data/TODO.new/22-implementation-order.md +176 -0
  30. data/UCODE_CHANGELOG.md +97 -0
  31. data/exe/ucode +8 -0
  32. data/lib/ucode/aggregator.rb +77 -0
  33. data/lib/ucode/audit/block_aggregator.rb +90 -0
  34. data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
  35. data/lib/ucode/audit/context.rb +137 -0
  36. data/lib/ucode/audit/discrepancy_detector.rb +213 -0
  37. data/lib/ucode/audit/extractors/aggregations.rb +70 -0
  38. data/lib/ucode/audit/extractors/base.rb +21 -0
  39. data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
  40. data/lib/ucode/audit/extractors/coverage.rb +55 -0
  41. data/lib/ucode/audit/extractors/hinting.rb +199 -0
  42. data/lib/ucode/audit/extractors/identity.rb +65 -0
  43. data/lib/ucode/audit/extractors/licensing.rb +75 -0
  44. data/lib/ucode/audit/extractors/metrics.rb +108 -0
  45. data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
  46. data/lib/ucode/audit/extractors/provenance.rb +34 -0
  47. data/lib/ucode/audit/extractors/style.rb +88 -0
  48. data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
  49. data/lib/ucode/audit/extractors.rb +31 -0
  50. data/lib/ucode/audit/plane_aggregator.rb +37 -0
  51. data/lib/ucode/audit/registry.rb +63 -0
  52. data/lib/ucode/audit/script_aggregator.rb +92 -0
  53. data/lib/ucode/audit.rb +27 -0
  54. data/lib/ucode/cache.rb +113 -0
  55. data/lib/ucode/cli.rb +272 -0
  56. data/lib/ucode/commands/build.rb +68 -0
  57. data/lib/ucode/commands/cache.rb +46 -0
  58. data/lib/ucode/commands/fetch.rb +62 -0
  59. data/lib/ucode/commands/font_coverage.rb +57 -0
  60. data/lib/ucode/commands/glyphs.rb +136 -0
  61. data/lib/ucode/commands/lookup.rb +65 -0
  62. data/lib/ucode/commands/parse.rb +62 -0
  63. data/lib/ucode/commands/site.rb +33 -0
  64. data/lib/ucode/commands.rb +19 -0
  65. data/lib/ucode/config.rb +110 -0
  66. data/lib/ucode/coordinator/indices.rb +34 -0
  67. data/lib/ucode/coordinator.rb +397 -0
  68. data/lib/ucode/database.rb +214 -0
  69. data/lib/ucode/db_builder.rb +107 -0
  70. data/lib/ucode/error.rb +96 -0
  71. data/lib/ucode/fetch/code_charts.rb +57 -0
  72. data/lib/ucode/fetch/http.rb +83 -0
  73. data/lib/ucode/fetch/ucd_zip.rb +57 -0
  74. data/lib/ucode/fetch/unihan_zip.rb +57 -0
  75. data/lib/ucode/fetch.rb +14 -0
  76. data/lib/ucode/glyphs/cell_extractor.rb +130 -0
  77. data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
  78. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
  79. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
  80. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
  81. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
  82. data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
  83. data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
  84. data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
  85. data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
  86. data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
  87. data/lib/ucode/glyphs/grid.rb +30 -0
  88. data/lib/ucode/glyphs/grid_detector.rb +165 -0
  89. data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
  90. data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
  91. data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
  92. data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
  93. data/lib/ucode/glyphs/last_resort/source.rb +125 -0
  94. data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
  95. data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
  96. data/lib/ucode/glyphs/last_resort.rb +36 -0
  97. data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
  98. data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
  99. data/lib/ucode/glyphs/page_renderer.rb +221 -0
  100. data/lib/ucode/glyphs/path_bbox.rb +62 -0
  101. data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
  102. data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
  103. data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
  104. data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
  105. data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
  106. data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
  107. data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
  108. data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
  109. data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
  110. data/lib/ucode/glyphs/real_fonts.rb +32 -0
  111. data/lib/ucode/glyphs/writer.rb +250 -0
  112. data/lib/ucode/glyphs.rb +27 -0
  113. data/lib/ucode/index.rb +106 -0
  114. data/lib/ucode/index_builder.rb +94 -0
  115. data/lib/ucode/models/audit/audit_axis.rb +30 -0
  116. data/lib/ucode/models/audit/audit_diff.rb +77 -0
  117. data/lib/ucode/models/audit/audit_report.rb +137 -0
  118. data/lib/ucode/models/audit/baseline.rb +32 -0
  119. data/lib/ucode/models/audit/block_summary.rb +72 -0
  120. data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
  121. data/lib/ucode/models/audit/codepoint_range.rb +39 -0
  122. data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
  123. data/lib/ucode/models/audit/color_capabilities.rb +91 -0
  124. data/lib/ucode/models/audit/discrepancy.rb +38 -0
  125. data/lib/ucode/models/audit/duplicate_group.rb +23 -0
  126. data/lib/ucode/models/audit/embedding_type.rb +81 -0
  127. data/lib/ucode/models/audit/field_change.rb +28 -0
  128. data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
  129. data/lib/ucode/models/audit/gasp_range.rb +63 -0
  130. data/lib/ucode/models/audit/hinting.rb +99 -0
  131. data/lib/ucode/models/audit/library_summary.rb +40 -0
  132. data/lib/ucode/models/audit/licensing.rb +48 -0
  133. data/lib/ucode/models/audit/metrics.rb +111 -0
  134. data/lib/ucode/models/audit/named_instance.rb +41 -0
  135. data/lib/ucode/models/audit/opentype_layout.rb +38 -0
  136. data/lib/ucode/models/audit/plane_summary.rb +31 -0
  137. data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
  138. data/lib/ucode/models/audit/script_features.rb +28 -0
  139. data/lib/ucode/models/audit/script_summary.rb +54 -0
  140. data/lib/ucode/models/audit/variation_detail.rb +42 -0
  141. data/lib/ucode/models/audit.rb +50 -0
  142. data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
  143. data/lib/ucode/models/bidi_mirroring.rb +19 -0
  144. data/lib/ucode/models/binary_property_assignment.rb +26 -0
  145. data/lib/ucode/models/block.rb +36 -0
  146. data/lib/ucode/models/case_folding_rule.rb +23 -0
  147. data/lib/ucode/models/cjk_radical.rb +23 -0
  148. data/lib/ucode/models/codepoint/bidi.rb +28 -0
  149. data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
  150. data/lib/ucode/models/codepoint/case_folding.rb +25 -0
  151. data/lib/ucode/models/codepoint/casing.rb +32 -0
  152. data/lib/ucode/models/codepoint/decomposition.rb +27 -0
  153. data/lib/ucode/models/codepoint/display.rb +24 -0
  154. data/lib/ucode/models/codepoint/emoji.rb +29 -0
  155. data/lib/ucode/models/codepoint/hangul.rb +20 -0
  156. data/lib/ucode/models/codepoint/identifier.rb +30 -0
  157. data/lib/ucode/models/codepoint/indic.rb +20 -0
  158. data/lib/ucode/models/codepoint/joining.rb +20 -0
  159. data/lib/ucode/models/codepoint/normalization.rb +35 -0
  160. data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
  161. data/lib/ucode/models/codepoint.rb +122 -0
  162. data/lib/ucode/models/name_alias.rb +21 -0
  163. data/lib/ucode/models/named_sequence.rb +19 -0
  164. data/lib/ucode/models/names_list_entry.rb +38 -0
  165. data/lib/ucode/models/plane.rb +36 -0
  166. data/lib/ucode/models/property_alias.rb +24 -0
  167. data/lib/ucode/models/property_value_alias.rb +26 -0
  168. data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
  169. data/lib/ucode/models/relationship/cross_reference.rb +17 -0
  170. data/lib/ucode/models/relationship/footnote.rb +24 -0
  171. data/lib/ucode/models/relationship/informal_alias.rb +18 -0
  172. data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
  173. data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
  174. data/lib/ucode/models/relationship.rb +57 -0
  175. data/lib/ucode/models/script.rb +41 -0
  176. data/lib/ucode/models/special_casing_rule.rb +28 -0
  177. data/lib/ucode/models/standardized_variant.rb +24 -0
  178. data/lib/ucode/models/unihan_entry.rb +23 -0
  179. data/lib/ucode/models.rb +47 -0
  180. data/lib/ucode/parsers/auxiliary.rb +26 -0
  181. data/lib/ucode/parsers/base.rb +137 -0
  182. data/lib/ucode/parsers/bidi_brackets.rb +41 -0
  183. data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
  184. data/lib/ucode/parsers/blocks.rb +63 -0
  185. data/lib/ucode/parsers/case_folding.rb +53 -0
  186. data/lib/ucode/parsers/cjk_radicals.rb +102 -0
  187. data/lib/ucode/parsers/derived_age.rb +59 -0
  188. data/lib/ucode/parsers/derived_core_properties.rb +60 -0
  189. data/lib/ucode/parsers/extracted_properties.rb +74 -0
  190. data/lib/ucode/parsers/name_aliases.rb +44 -0
  191. data/lib/ucode/parsers/named_sequences.rb +51 -0
  192. data/lib/ucode/parsers/names_list.rb +250 -0
  193. data/lib/ucode/parsers/property_aliases.rb +41 -0
  194. data/lib/ucode/parsers/property_value_aliases.rb +46 -0
  195. data/lib/ucode/parsers/script_extensions.rb +64 -0
  196. data/lib/ucode/parsers/scripts.rb +60 -0
  197. data/lib/ucode/parsers/special_casing.rb +62 -0
  198. data/lib/ucode/parsers/standardized_variants.rb +56 -0
  199. data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
  200. data/lib/ucode/parsers/unicode_data.rb +268 -0
  201. data/lib/ucode/parsers/unihan.rb +125 -0
  202. data/lib/ucode/parsers.rb +35 -0
  203. data/lib/ucode/range_entry.rb +58 -0
  204. data/lib/ucode/repo/aggregate_writer.rb +364 -0
  205. data/lib/ucode/repo/atomic_writes.rb +48 -0
  206. data/lib/ucode/repo/codepoint_writer.rb +96 -0
  207. data/lib/ucode/repo/paths.rb +122 -0
  208. data/lib/ucode/repo.rb +22 -0
  209. data/lib/ucode/site/config_emitter.rb +124 -0
  210. data/lib/ucode/site/generator.rb +178 -0
  211. data/lib/ucode/site/search_index.rb +68 -0
  212. data/lib/ucode/site/template/.gitignore +4 -0
  213. data/lib/ucode/site/template/.vitepress/config.ts +8 -0
  214. data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
  215. data/lib/ucode/site/template/char/[codepoint].md +13 -0
  216. data/lib/ucode/site/template/components/BlockView.vue +57 -0
  217. data/lib/ucode/site/template/components/CharView.vue +85 -0
  218. data/lib/ucode/site/template/components/PlaneView.vue +56 -0
  219. data/lib/ucode/site/template/components/SearchView.vue +66 -0
  220. data/lib/ucode/site/template/index.md +25 -0
  221. data/lib/ucode/site/template/package.json +18 -0
  222. data/lib/ucode/site/template/search.md +9 -0
  223. data/lib/ucode/site.rb +13 -0
  224. data/lib/ucode/version.rb +5 -0
  225. data/lib/ucode/version_resolver.rb +76 -0
  226. data/lib/ucode.rb +74 -0
  227. data/ucode.gemspec +56 -0
  228. metadata +404 -0
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "lutaml/model"
4
+
5
+ module Ucode
6
+ module Models
7
# Unihan dictionary record for a CJK codepoint, modelled as a single open
# hash rather than one attribute per field.
#
# `fields` is keyed by Unihan field name (`kFoo`). Values are arrays:
# Unihan values are space-separated lists, and a uniform array shape is
# simpler to consume than a mix of scalars and lists.
#
# Grouping into readings / radicals / variants / sources / etc. is a
# presentation concern, derived client-side by key prefix. Because Unihan
# gains fields across versions, the hash lets new fields flow through
# without any change to this model.
class UnihanEntry < Lutaml::Model::Serializable
  attribute(:fields, :hash, default: -> { {} })

  key_value { map("fields", to: :fields) }
end
22
+ end
23
+ end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  # Models — typed class representations of every UCD aggregate.
  #
  # Rules that hold for every class in this namespace:
  #
  # - Subclass, do not include:
  #
  #     class Foo < Lutaml::Model::Serializable
  #
  # - Declare the wire shape with `key_value do … end` (it covers both JSON
  #   and YAML). NEVER `mapping do`, NEVER `json do`.
  #
  # - Refer to codepoints as "U+XXXX" strings — never as nested CodePoint
  #   objects. This keeps the data normalized: a codepoint's full data
  #   lives only in its own folder.
  #
  # - Polymorphism: `polymorphic_class: true` + `polymorphic_map:` on the
  #   base discriminator; `polymorphic: [...]` on the consumer attribute
  #   + `polymorphic: { attribute:, class_map: }` on its mapping.
  #
  # - NEVER define `to_h` / `from_h` / `to_json` / `from_json`. All
  #   (de)serialization goes through lutaml-model.
  #
  module Models
    # Constant name => feature path, registered lazily via `autoload` so a
    # model file is only loaded the first time its constant is referenced.
    {
      PropertyAlias: "ucode/models/property_alias",
      PropertyValueAlias: "ucode/models/property_value_alias",
      Plane: "ucode/models/plane",
      Block: "ucode/models/block",
      Script: "ucode/models/script",
      CodePoint: "ucode/models/codepoint",
      UnihanEntry: "ucode/models/unihan_entry",
      NamesListEntry: "ucode/models/names_list_entry",
      NameAlias: "ucode/models/name_alias",
      NamedSequence: "ucode/models/named_sequence",
      SpecialCasingRule: "ucode/models/special_casing_rule",
      CaseFoldingRule: "ucode/models/case_folding_rule",
      BidiMirroring: "ucode/models/bidi_mirroring",
      BidiBracketPair: "ucode/models/bidi_bracket_pair",
      CjkRadical: "ucode/models/cjk_radical",
      StandardizedVariant: "ucode/models/standardized_variant",
      BinaryPropertyAssignment: "ucode/models/binary_property_assignment",
      Relationship: "ucode/models/relationship",
      Audit: "ucode/models/audit"
    }.each { |const_name, feature| autoload(const_name, feature) }
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/extracted_properties"
4
+
5
+ module Ucode
6
+ module Parsers
7
# Range/value parser for the auxiliary segmentation files under
# `auxiliary/` (GraphemeBreakProperty, WordBreakProperty,
# SentenceBreakProperty, VerticalOrientation, IndicPositionalCategory,
# IndicSyllabicCategory, IdentifierStatus, IdentifierType) and for the
# top-level `LineBreak.txt` and `EastAsianWidth.txt`.
#
# These files use the same UAX #44 range/value layout as the files
# handled by ExtractedProperties:
#
#   XXXX..YYYY; value
#   XXXX; value
#
# The Coordinator picks the target CodePoint attribute from the file
# name. All parsing is inherited; this subclass only provides a distinct
# name so call sites say "auxiliary" rather than "extracted", and so
# auxiliary-specific behavior can be added later without modifying
# ExtractedProperties (OCP).
class Auxiliary < ExtractedProperties; end
25
+ end
26
+ end
@@ -0,0 +1,137 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/error"
4
+
5
+ module Ucode
6
+ module Parsers
7
# Shared infrastructure for every UCD text-file parser. Subclasses
# implement `.each_record(path) { |record| ... }`, returning an
# Enumerator when called without a block.
#
# All methods are class methods — parsers are stateless.
#
# UCD text-file format (UAX #44):
#   - Fields separated by `;`
#   - Lines starting with `#` are comments
#   - Blank lines are ignored
#   - Some lines carry an inline `# trailing comment` after the data
class Base
  # One physical line from the source file, post-filter (blank and
  # comment-only lines are skipped before a Line is built).
  #
  # - `number`  : 1-based physical line number in the file
  # - `text`    : the line's data part
  # - `comment` : trailing comment text, or nil when there is none
  Line = Struct.new(:number, :text, :comment, keyword_init: true) do
    # Returns the data part of the line — everything before the first
    # `#`, rstripped. For lines with no `#` this is just `text`.
    def data
      idx = text.index("#")
      idx.nil? ? text : text[0...idx].rstrip
    end

    # Splits the data part on `;` into stripped fields.
    def fields
      data.split(";").map(&:strip)
    end

    # Returns the n-th field (0-based), or nil if out of range.
    def field(n)
      fields[n]
    end
  end

  HEX_PATTERN = /\A[0-9A-Fa-f]{1,6}\z/.freeze
  private_constant :HEX_PATTERN

  RANGE_SEPARATOR = ".."
  private_constant :RANGE_SEPARATOR

  # Largest value in the Unicode codespace. HEX_PATTERN alone admits up
  # to 0xFFFFFF, so the numeric bound is checked separately.
  MAX_CODEPOINT = 0x10FFFF
  private_constant :MAX_CODEPOINT

  class << self
    # Iterates non-blank, non-comment lines from `path`, yielding Line
    # records. Returns an Enumerator when no block is given so callers
    # can chain (`.first(n)`, `.lazy.map`, etc.).
    #
    # Lines that are entirely whitespace or start with `#` are skipped
    # silently — comment text is preserved on data lines that carry an
    # inline `# trailing comment`.
    #
    # The file is always decoded as UTF-8 (the encoding of the UCD text
    # files) instead of whatever the process locale happens to be, so
    # names and comments come back correctly tagged even under `LANG=C`.
    def each_line(path)
      return enum_for(:each_line, path) unless block_given?

      lineno = 0
      File.foreach(path.to_s, encoding: "UTF-8") do |raw|
        lineno += 1
        stripped = raw.strip
        next if stripped.empty? || stripped.start_with?("#")

        yield build_line(lineno, raw)
      end
    end

    # Returns the n-th `;`-separated field of `line`, which may be a raw
    # String or a Line struct. Surrounding whitespace is stripped.
    # Returns nil if the field is missing or out of range.
    def parse_field(line, n)
      line_fields(line)[n]
    end

    # Parses a codepoint-or-range field per UAX #44. Accepts:
    #   "0041"        → 0x0041 (Integer)
    #   "3400..4DBF"  → 0x3400..0x4DBF (Range)
    #
    # Returns nil for blank input (nil, empty, or whitespace-only).
    # Raises Ucode::MalformedLineError for invalid hex, for values above
    # U+10FFFF, and for a descending range — the latter would otherwise
    # become an empty Range and silently drop the row downstream.
    def parse_codepoint_or_range(field)
      text = field.to_s.strip
      return nil if text.empty?
      return parse_hex_cp(text) unless text.include?(RANGE_SEPARATOR)

      first_str, last_str = text.split(RANGE_SEPARATOR, 2)
      first = parse_hex_cp(first_str)
      last = parse_hex_cp(last_str)
      if first > last
        raise MalformedLineError.new(
          "descending codepoint range: #{field.inspect}",
          context: { input: field }
        )
      end

      first..last
    end

    # Parses a single hex codepoint string into an Integer. Raises
    # Ucode::MalformedLineError, with the offending input in `context`,
    # when the input is not 1-6 hex digits or exceeds U+10FFFF.
    def parse_hex_cp(input)
      s = input.to_s.strip
      value = s.match?(HEX_PATTERN) ? s.to_i(16) : nil
      if value.nil? || value > MAX_CODEPOINT
        raise MalformedLineError.new(
          "invalid codepoint: #{input.inspect}",
          context: { input: input }
        )
      end

      value
    end

    private

    # Builds a Line struct from a raw text line. Everything after the
    # first `#` becomes the Line's `comment`; the part before it
    # (rstripped) becomes `text`.
    def build_line(number, raw)
      text = raw.chomp
      hash_idx = text.index("#")
      return Line.new(number: number, text: text, comment: nil) if hash_idx.nil?

      Line.new(
        number: number,
        text: text[0...hash_idx].rstrip,
        comment: text[(hash_idx + 1)..].strip
      )
    end

    # Stripped `;`-separated fields of a Line struct or raw string.
    def line_fields(line)
      data = line.is_a?(Line) ? line.data : line.to_s
      data.split(";").map(&:strip)
    end
  end
end
136
+ end
137
+ end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+ require "ucode/models/bidi_bracket_pair"
5
+
6
+ module Ucode
7
+ module Parsers
8
# Reads `BidiBrackets.txt`, which pairs each bracket with its partner.
#
# Row layout (UAX #44):
#   cp; paired_cp; type
#
# `type` is `o` (open) or `c` (close). The Coordinator merges every row
# into `CodePoint#bidi.paired_bracket_id` and `.paired_bracket_type`.
class BidiBrackets < Base
  class << self
    # Yields one Models::BidiBracketPair per usable row of `path`; with no
    # block, returns an Enumerator instead. Rows with fewer than three
    # fields, or with an empty type, are skipped. Invalid hex in either
    # codepoint field raises from `parse_hex_cp`.
    def each_record(path)
      return to_enum(:each_record, path) unless block_given?

      each_line(path) do |line|
        cp_hex, partner_hex, bracket_type = line.fields
        # A missing third element means the row had fewer than 3 fields.
        next if bracket_type.nil?

        # Both codepoints are validated before the type check, so a bad
        # hex value raises even on a row that would otherwise be skipped.
        codepoint = parse_hex_cp(cp_hex)
        partner = parse_hex_cp(partner_hex)
        next if bracket_type.empty?

        yield Models::BidiBracketPair.new(
          codepoint: codepoint,
          paired_id: format("U+%04X", partner),
          type: bracket_type
        )
      end

      nil
    end
  end
end
40
+ end
41
+ end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+ require "ucode/models/bidi_mirroring"
5
+
6
+ module Ucode
7
+ module Parsers
8
# Reads `BidiMirroring.txt`, which gives each codepoint's bidi mirroring
# glyph partner.
#
# Row layout (UAX #44):
#   cp; mirrored_cp
#
# The Coordinator merges every row into `CodePoint#bidi.mirroring_glyph_id`.
class BidiMirroring < Base
  class << self
    # Yields one Models::BidiMirroring per row of `path` that has at least
    # two fields; with no block, returns an Enumerator instead. Invalid
    # hex in either field raises from `parse_hex_cp`.
    def each_record(path)
      return to_enum(:each_record, path) unless block_given?

      each_line(path) do |line|
        cp_hex, mirror_hex = line.fields
        # A missing second element means the row had fewer than 2 fields.
        next if mirror_hex.nil?

        codepoint = parse_hex_cp(cp_hex)
        mirror = parse_hex_cp(mirror_hex)

        yield Models::BidiMirroring.new(
          codepoint: codepoint,
          mirrored_id: format("U+%04X", mirror)
        )
      end

      nil
    end
  end
end
36
+ end
37
+ end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+ require "ucode/models/block"
5
+
6
+ module Ucode
7
+ module Parsers
8
# Parses `Blocks.txt` — one block range per line.
#
# Format (UAX #44):
#   XXXX..XXXX; Block Name
#
# The `id` is the block name with runs of whitespace collapsed to a
# single underscore. The `name` is preserved verbatim. Per the
# project rules (CLAUDE.md), block names are NOT otherwise slugified.
#
# `plane_number` is derived from the high bits of `range_first`.
class Blocks < Base
  class << self
    # Yields one Models::Block per usable line of `path`. Returns an
    # Enumerator when called without a block.
    #
    # A line is skipped when it has fewer than two fields, an empty
    # name, or an empty range field. Invalid hex in the range field
    # raises from `parse_codepoint_or_range`.
    def each_record(path)
      return enum_for(:each_record, path) unless block_given?

      each_line(path) do |line|
        range_field, name = line.fields
        next if name.nil? || name.empty?

        range = parse_codepoint_or_range(range_field)
        # A blank range field parses to nil; without this guard
        # build_block would fail with NoMethodError on `nil >> 16`.
        next if range.nil?

        yield build_block(range, name)
      end

      nil
    end

    private

    # Builds the Block model for a parsed range (or single codepoint)
    # and its verbatim name.
    def build_block(range, name)
      first, last = bounds_of(range)
      Models::Block.new(
        id: name.gsub(/\s+/, "_"),
        name: name,
        range_first: first,
        range_last: last,
        # Each plane spans 0x10000 codepoints, so the plane index is
        # whatever remains above the low 16 bits.
        plane_number: first >> 16
      )
    end

    # Returns [first, last] for a Range, or [cp, cp] for a lone Integer.
    def bounds_of(range)
      range.is_a?(Range) ? [range.begin, range.end] : [range, range]
    end
  end
end
62
+ end
63
+ end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+ require "ucode/models/case_folding_rule"
5
+
6
+ module Ucode
7
+ module Parsers
8
# Reads `CaseFolding.txt`, the case folding mappings used for comparison.
#
# Row layout (UAX #44):
#   cp; status; mapping; # name
#
# `status` is one of C (common), F (full), S (simple), T (turkic).
# `mapping` holds one or more space-separated hex codepoints.
class CaseFolding < Base
  class << self
    # Yields one Models::CaseFoldingRule per usable row of `path`; with no
    # block, returns an Enumerator instead. Rows with fewer than three
    # fields, or with an empty status, are skipped. The row's trailing
    # comment is carried over as `comment`.
    def each_record(path)
      return to_enum(:each_record, path) unless block_given?

      each_line(path) do |line|
        cp_hex, status, mapping = line.fields
        # A missing third element means the row had fewer than 3 fields.
        next if mapping.nil?

        # The codepoint is validated before the status check, so bad hex
        # raises even on a row that would otherwise be skipped.
        codepoint = parse_hex_cp(cp_hex)
        next if status.empty?

        yield Models::CaseFoldingRule.new(
          codepoint: codepoint,
          status: status,
          mapping_ids: parse_mapping(mapping),
          comment: line.comment
        )
      end

      nil
    end

    private

    # Converts a whitespace-separated list of hex codepoints into an array
    # of "U+XXXX" ids. Returns [] for a nil or empty field.
    def parse_mapping(field)
      return [] if field.nil? || field.empty?

      tokens = field.split(/\s+/).reject(&:empty?)
      tokens.map { |hex| format("U+%04X", parse_hex_cp(hex)) }
    end
  end
end
52
+ end
53
+ end
@@ -0,0 +1,102 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+ require "ucode/models/cjk_radical"
5
+
6
+ module Ucode
7
+ module Parsers
8
# Parses `CJKRadicals.txt` — KangXi radical → CJK radical ideograph
# → canonical ideograph mapping.
#
# Format (UAX #44):
#   radical_number; cjk_radical; ideograph
#
# `cjk_radical` and `ideograph` are either a single hex codepoint
# (`2F00`) or a range in the form `XXXX..YYYY`. Range rows are
# expanded to one CjkRadical per codepoint.
#
# Coordinator merges each row into the relevant CodePoint.
class CjkRadicals < Base
  class << self
    # Yields one Models::CjkRadical per (cjk_radical, ideograph) pairing
    # of every usable row in `path`. Returns an Enumerator when called
    # without a block. Rows with fewer than three fields, or whose first
    # field is not a plain decimal integer, are skipped.
    def each_record(path)
      return enum_for(:each_record, path) unless block_given?

      each_line(path) do |line|
        fields = line.fields
        next if fields.length < 3

        radical_number = parse_radical_number(fields[0])
        next if radical_number.nil?

        yield_models(radical_number, fields[1], fields[2]).each do |model|
          yield model
        end
      end

      nil
    end

    private

    # Parses the radical number as a base-10 integer; returns nil for
    # blank or non-numeric input.
    #
    # The radix is pinned to 10 because `Integer(str)` without a base
    # honours radix prefixes: "010" would become 8, "0x10" would become
    # 16, and "08" would be rejected as invalid octal.
    #
    # NOTE(review): a value with a trailing apostrophe (e.g. `90'`) is
    # non-numeric and therefore makes the whole row skip — confirm this
    # is the intended treatment for such rows.
    def parse_radical_number(field)
      return nil if field.nil? || field.empty?

      Integer(field, 10, exception: false)
    end

    # Returns the array of CjkRadical models for one row (despite the
    # name, nothing is yielded here; the caller does the yielding).
    #
    # Pairing rules:
    #   - one cjk radical, one or more ideographs → the radical is
    #     repeated for every ideograph
    #   - several cjk radicals, one ideograph → the ideograph is
    #     repeated for every radical
    #   - anything else → positional pairing via `zip`, which is driven
    #     by the cjk list: an empty cjk list yields no models, and a
    #     shorter ideograph list leaves `ideograph_id` nil
    def yield_models(radical_number, cjk_radical_field, ideograph_field)
      cjk_ids = expand_ids(cjk_radical_field)
      ideograph_ids = expand_ids(ideograph_field)

      pairs =
        if cjk_ids.size == 1 && !ideograph_ids.empty?
          ideograph_ids.map { |ideograph_id| [cjk_ids.first, ideograph_id] }
        elsif cjk_ids.size > 1 && ideograph_ids.size == 1
          cjk_ids.map { |cjk_id| [cjk_id, ideograph_ids.first] }
        else
          cjk_ids.zip(ideograph_ids)
        end

      pairs.map do |cjk_id, ideograph_id|
        Models::CjkRadical.new(
          radical_number: radical_number,
          cjk_radical_id: cjk_id,
          ideograph_id: ideograph_id
        )
      end
    end

    # Expands a codepoint-or-range field into an array of "U+XXXX" ids.
    # Returns [] for a nil or empty field.
    def expand_ids(field)
      return [] if field.nil? || field.empty?

      range = parse_codepoint_or_range(field)
      cps = range.is_a?(Range) ? range.to_a : [range]
      cps.map { |cp| format("U+%04X", cp) }
    end
  end
end
101
+ end
102
+ end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+
5
+ module Ucode
6
+ module Parsers
7
# Parses `DerivedAge.txt` — the Unicode version in which each
# codepoint was first assigned.
#
# Format (UAX #44):
#   XXXX..YYYY; M.N
#   XXXX; M.N
#
# The age is a Unicode version string like "1.1", "5.2", "15.0".
# Coordinator merges each row into `CodePoint#age`.
#
# Ranges are expanded per-codepoint (one Tuple per cp) because the
# Coordinator needs per-cp assignment for `CodePoint#age`.
class DerivedAge < Base
  # Lightweight record yielded by `.each_record`. Models are
  # heavyweight for stream-only data — the Coordinator consumes
  # these immediately.
  Tuple = Struct.new(:cp, :age, keyword_init: true) do
    # The codepoint as a "U+XXXX" id string.
    def cp_id
      format("U+%04X", cp)
    end
  end

  class << self
    # Yields one Tuple per codepoint covered by every usable row of
    # `path`. Returns an Enumerator when called without a block.
    #
    # A row is skipped when it has fewer than two fields, an empty age,
    # or an empty codepoint field. Invalid hex raises from
    # `parse_codepoint_or_range`.
    def each_record(path)
      return enum_for(:each_record, path) unless block_given?

      each_line(path) do |line|
        range_field, age = line.fields
        next if age.nil?

        range = parse_codepoint_or_range(range_field)
        # A blank codepoint field parses to nil; yielding it would
        # produce a Tuple whose `cp_id` fails inside `format`.
        next if range.nil? || age.empty?

        each_cp(range) { |cp| yield Tuple.new(cp: cp, age: age) }
      end

      nil
    end

    private

    # Yields every codepoint of a Range, or the lone Integer itself.
    def each_cp(range)
      if range.is_a?(Range)
        range.each { |cp| yield cp }
      else
        yield range
      end
    end
  end
end
58
+ end
59
+ end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+ require "ucode/models/binary_property_assignment"
5
+
6
+ module Ucode
7
+ module Parsers
8
# Parses `DerivedCoreProperties.txt` — derived binary properties
# (Alphabetic, Uppercase, White_Space, Bidi_Control, …).
#
# Format (UAX #44):
#   XXXX..YYYY; Property_Name
#   XXXX; Property_Name
#
# The file only lists positive assignments; absence means the
# property is false. Each yielded `BinaryPropertyAssignment` has
# `enabled: true`.
#
# Coordinator appends each `property_short` (resolved to the long
# form via PropertyAliases if needed) to `CodePoint#binary_properties`.
class DerivedCoreProperties < Base
  class << self
    # Yields one Models::BinaryPropertyAssignment per codepoint covered
    # by every usable row of `path`. Returns an Enumerator when called
    # without a block.
    #
    # A row is skipped when it has fewer than two fields, an empty
    # property name, or an empty codepoint field. Invalid hex raises
    # from `parse_codepoint_or_range`.
    def each_record(path)
      return enum_for(:each_record, path) unless block_given?

      each_line(path) do |line|
        range_field, property = line.fields
        next if property.nil?

        range = parse_codepoint_or_range(range_field)
        # A blank codepoint field parses to nil; yielding it would
        # emit an assignment whose codepoint is nil.
        next if range.nil? || property.empty?

        each_cp(range) { |cp| yield build_assignment(cp, property) }
      end

      nil
    end

    private

    # Yields every codepoint of a Range, or the lone Integer itself.
    def each_cp(range)
      if range.is_a?(Range)
        range.each { |cp| yield cp }
      else
        yield range
      end
    end

    # Builds the positive (enabled) assignment of `property` to `cp`.
    def build_assignment(cp, property)
      Models::BinaryPropertyAssignment.new(
        codepoint: cp,
        property_short: property,
        enabled: true
      )
    end
  end
end
59
+ end
60
+ end