ucode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +211 -0
  3. data/Gemfile +22 -0
  4. data/Gemfile.lock +406 -0
  5. data/README.md +469 -0
  6. data/Rakefile +18 -0
  7. data/TODO.new/00-README.md +66 -0
  8. data/TODO.new/01-pillar-terminology-alignment.md +69 -0
  9. data/TODO.new/02-audit-schema-design.md +255 -0
  10. data/TODO.new/03-directory-output-spec.md +203 -0
  11. data/TODO.new/04-fontist-org-contract.md +173 -0
  12. data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
  13. data/TODO.new/06-audit-namespace-skeleton.md +105 -0
  14. data/TODO.new/07-audit-models-port.md +132 -0
  15. data/TODO.new/08-extractors-cheap-port.md +113 -0
  16. data/TODO.new/09-extractors-expensive-port.md +99 -0
  17. data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
  18. data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
  19. data/TODO.new/12-formatters-port.md +115 -0
  20. data/TODO.new/13-directory-emitter.md +147 -0
  21. data/TODO.new/14-html-face-browser.md +144 -0
  22. data/TODO.new/15-html-library-browser.md +102 -0
  23. data/TODO.new/16-cli-audit-subcommands.md +142 -0
  24. data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
  25. data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
  26. data/TODO.new/19-fontisan-docs-update.md +155 -0
  27. data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
  28. data/TODO.new/21-canonical-unicode17-build.md +148 -0
  29. data/TODO.new/22-implementation-order.md +176 -0
  30. data/UCODE_CHANGELOG.md +97 -0
  31. data/exe/ucode +8 -0
  32. data/lib/ucode/aggregator.rb +77 -0
  33. data/lib/ucode/audit/block_aggregator.rb +90 -0
  34. data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
  35. data/lib/ucode/audit/context.rb +137 -0
  36. data/lib/ucode/audit/discrepancy_detector.rb +213 -0
  37. data/lib/ucode/audit/extractors/aggregations.rb +70 -0
  38. data/lib/ucode/audit/extractors/base.rb +21 -0
  39. data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
  40. data/lib/ucode/audit/extractors/coverage.rb +55 -0
  41. data/lib/ucode/audit/extractors/hinting.rb +199 -0
  42. data/lib/ucode/audit/extractors/identity.rb +65 -0
  43. data/lib/ucode/audit/extractors/licensing.rb +75 -0
  44. data/lib/ucode/audit/extractors/metrics.rb +108 -0
  45. data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
  46. data/lib/ucode/audit/extractors/provenance.rb +34 -0
  47. data/lib/ucode/audit/extractors/style.rb +88 -0
  48. data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
  49. data/lib/ucode/audit/extractors.rb +31 -0
  50. data/lib/ucode/audit/plane_aggregator.rb +37 -0
  51. data/lib/ucode/audit/registry.rb +63 -0
  52. data/lib/ucode/audit/script_aggregator.rb +92 -0
  53. data/lib/ucode/audit.rb +27 -0
  54. data/lib/ucode/cache.rb +113 -0
  55. data/lib/ucode/cli.rb +272 -0
  56. data/lib/ucode/commands/build.rb +68 -0
  57. data/lib/ucode/commands/cache.rb +46 -0
  58. data/lib/ucode/commands/fetch.rb +62 -0
  59. data/lib/ucode/commands/font_coverage.rb +57 -0
  60. data/lib/ucode/commands/glyphs.rb +136 -0
  61. data/lib/ucode/commands/lookup.rb +65 -0
  62. data/lib/ucode/commands/parse.rb +62 -0
  63. data/lib/ucode/commands/site.rb +33 -0
  64. data/lib/ucode/commands.rb +19 -0
  65. data/lib/ucode/config.rb +110 -0
  66. data/lib/ucode/coordinator/indices.rb +34 -0
  67. data/lib/ucode/coordinator.rb +397 -0
  68. data/lib/ucode/database.rb +214 -0
  69. data/lib/ucode/db_builder.rb +107 -0
  70. data/lib/ucode/error.rb +96 -0
  71. data/lib/ucode/fetch/code_charts.rb +57 -0
  72. data/lib/ucode/fetch/http.rb +83 -0
  73. data/lib/ucode/fetch/ucd_zip.rb +57 -0
  74. data/lib/ucode/fetch/unihan_zip.rb +57 -0
  75. data/lib/ucode/fetch.rb +14 -0
  76. data/lib/ucode/glyphs/cell_extractor.rb +130 -0
  77. data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
  78. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
  79. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
  80. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
  81. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
  82. data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
  83. data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
  84. data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
  85. data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
  86. data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
  87. data/lib/ucode/glyphs/grid.rb +30 -0
  88. data/lib/ucode/glyphs/grid_detector.rb +165 -0
  89. data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
  90. data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
  91. data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
  92. data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
  93. data/lib/ucode/glyphs/last_resort/source.rb +125 -0
  94. data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
  95. data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
  96. data/lib/ucode/glyphs/last_resort.rb +36 -0
  97. data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
  98. data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
  99. data/lib/ucode/glyphs/page_renderer.rb +221 -0
  100. data/lib/ucode/glyphs/path_bbox.rb +62 -0
  101. data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
  102. data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
  103. data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
  104. data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
  105. data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
  106. data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
  107. data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
  108. data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
  109. data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
  110. data/lib/ucode/glyphs/real_fonts.rb +32 -0
  111. data/lib/ucode/glyphs/writer.rb +250 -0
  112. data/lib/ucode/glyphs.rb +27 -0
  113. data/lib/ucode/index.rb +106 -0
  114. data/lib/ucode/index_builder.rb +94 -0
  115. data/lib/ucode/models/audit/audit_axis.rb +30 -0
  116. data/lib/ucode/models/audit/audit_diff.rb +77 -0
  117. data/lib/ucode/models/audit/audit_report.rb +137 -0
  118. data/lib/ucode/models/audit/baseline.rb +32 -0
  119. data/lib/ucode/models/audit/block_summary.rb +72 -0
  120. data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
  121. data/lib/ucode/models/audit/codepoint_range.rb +39 -0
  122. data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
  123. data/lib/ucode/models/audit/color_capabilities.rb +91 -0
  124. data/lib/ucode/models/audit/discrepancy.rb +38 -0
  125. data/lib/ucode/models/audit/duplicate_group.rb +23 -0
  126. data/lib/ucode/models/audit/embedding_type.rb +81 -0
  127. data/lib/ucode/models/audit/field_change.rb +28 -0
  128. data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
  129. data/lib/ucode/models/audit/gasp_range.rb +63 -0
  130. data/lib/ucode/models/audit/hinting.rb +99 -0
  131. data/lib/ucode/models/audit/library_summary.rb +40 -0
  132. data/lib/ucode/models/audit/licensing.rb +48 -0
  133. data/lib/ucode/models/audit/metrics.rb +111 -0
  134. data/lib/ucode/models/audit/named_instance.rb +41 -0
  135. data/lib/ucode/models/audit/opentype_layout.rb +38 -0
  136. data/lib/ucode/models/audit/plane_summary.rb +31 -0
  137. data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
  138. data/lib/ucode/models/audit/script_features.rb +28 -0
  139. data/lib/ucode/models/audit/script_summary.rb +54 -0
  140. data/lib/ucode/models/audit/variation_detail.rb +42 -0
  141. data/lib/ucode/models/audit.rb +50 -0
  142. data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
  143. data/lib/ucode/models/bidi_mirroring.rb +19 -0
  144. data/lib/ucode/models/binary_property_assignment.rb +26 -0
  145. data/lib/ucode/models/block.rb +36 -0
  146. data/lib/ucode/models/case_folding_rule.rb +23 -0
  147. data/lib/ucode/models/cjk_radical.rb +23 -0
  148. data/lib/ucode/models/codepoint/bidi.rb +28 -0
  149. data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
  150. data/lib/ucode/models/codepoint/case_folding.rb +25 -0
  151. data/lib/ucode/models/codepoint/casing.rb +32 -0
  152. data/lib/ucode/models/codepoint/decomposition.rb +27 -0
  153. data/lib/ucode/models/codepoint/display.rb +24 -0
  154. data/lib/ucode/models/codepoint/emoji.rb +29 -0
  155. data/lib/ucode/models/codepoint/hangul.rb +20 -0
  156. data/lib/ucode/models/codepoint/identifier.rb +30 -0
  157. data/lib/ucode/models/codepoint/indic.rb +20 -0
  158. data/lib/ucode/models/codepoint/joining.rb +20 -0
  159. data/lib/ucode/models/codepoint/normalization.rb +35 -0
  160. data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
  161. data/lib/ucode/models/codepoint.rb +122 -0
  162. data/lib/ucode/models/name_alias.rb +21 -0
  163. data/lib/ucode/models/named_sequence.rb +19 -0
  164. data/lib/ucode/models/names_list_entry.rb +38 -0
  165. data/lib/ucode/models/plane.rb +36 -0
  166. data/lib/ucode/models/property_alias.rb +24 -0
  167. data/lib/ucode/models/property_value_alias.rb +26 -0
  168. data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
  169. data/lib/ucode/models/relationship/cross_reference.rb +17 -0
  170. data/lib/ucode/models/relationship/footnote.rb +24 -0
  171. data/lib/ucode/models/relationship/informal_alias.rb +18 -0
  172. data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
  173. data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
  174. data/lib/ucode/models/relationship.rb +57 -0
  175. data/lib/ucode/models/script.rb +41 -0
  176. data/lib/ucode/models/special_casing_rule.rb +28 -0
  177. data/lib/ucode/models/standardized_variant.rb +24 -0
  178. data/lib/ucode/models/unihan_entry.rb +23 -0
  179. data/lib/ucode/models.rb +47 -0
  180. data/lib/ucode/parsers/auxiliary.rb +26 -0
  181. data/lib/ucode/parsers/base.rb +137 -0
  182. data/lib/ucode/parsers/bidi_brackets.rb +41 -0
  183. data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
  184. data/lib/ucode/parsers/blocks.rb +63 -0
  185. data/lib/ucode/parsers/case_folding.rb +53 -0
  186. data/lib/ucode/parsers/cjk_radicals.rb +102 -0
  187. data/lib/ucode/parsers/derived_age.rb +59 -0
  188. data/lib/ucode/parsers/derived_core_properties.rb +60 -0
  189. data/lib/ucode/parsers/extracted_properties.rb +74 -0
  190. data/lib/ucode/parsers/name_aliases.rb +44 -0
  191. data/lib/ucode/parsers/named_sequences.rb +51 -0
  192. data/lib/ucode/parsers/names_list.rb +250 -0
  193. data/lib/ucode/parsers/property_aliases.rb +41 -0
  194. data/lib/ucode/parsers/property_value_aliases.rb +46 -0
  195. data/lib/ucode/parsers/script_extensions.rb +64 -0
  196. data/lib/ucode/parsers/scripts.rb +60 -0
  197. data/lib/ucode/parsers/special_casing.rb +62 -0
  198. data/lib/ucode/parsers/standardized_variants.rb +56 -0
  199. data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
  200. data/lib/ucode/parsers/unicode_data.rb +268 -0
  201. data/lib/ucode/parsers/unihan.rb +125 -0
  202. data/lib/ucode/parsers.rb +35 -0
  203. data/lib/ucode/range_entry.rb +58 -0
  204. data/lib/ucode/repo/aggregate_writer.rb +364 -0
  205. data/lib/ucode/repo/atomic_writes.rb +48 -0
  206. data/lib/ucode/repo/codepoint_writer.rb +96 -0
  207. data/lib/ucode/repo/paths.rb +122 -0
  208. data/lib/ucode/repo.rb +22 -0
  209. data/lib/ucode/site/config_emitter.rb +124 -0
  210. data/lib/ucode/site/generator.rb +178 -0
  211. data/lib/ucode/site/search_index.rb +68 -0
  212. data/lib/ucode/site/template/.gitignore +4 -0
  213. data/lib/ucode/site/template/.vitepress/config.ts +8 -0
  214. data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
  215. data/lib/ucode/site/template/char/[codepoint].md +13 -0
  216. data/lib/ucode/site/template/components/BlockView.vue +57 -0
  217. data/lib/ucode/site/template/components/CharView.vue +85 -0
  218. data/lib/ucode/site/template/components/PlaneView.vue +56 -0
  219. data/lib/ucode/site/template/components/SearchView.vue +66 -0
  220. data/lib/ucode/site/template/index.md +25 -0
  221. data/lib/ucode/site/template/package.json +18 -0
  222. data/lib/ucode/site/template/search.md +9 -0
  223. data/lib/ucode/site.rb +13 -0
  224. data/lib/ucode/version.rb +5 -0
  225. data/lib/ucode/version_resolver.rb +76 -0
  226. data/lib/ucode.rb +74 -0
  227. data/ucode.gemspec +56 -0
  228. metadata +404 -0
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+ require "ucode/models/standardized_variant"
5
+
6
+ module Ucode
7
+ module Parsers
8
+ # Parses `StandardizedVariants.txt` — variation selector sequences.
9
+ #
10
+ # Format (UAX #44):
11
+ # base_cp VS_cp; description; [contexts]; # trailing comment
12
+ #
13
+ # `base_cp` + `variation_selector_id` is the key; `description` is
14
+ # the visual result; `contexts` (optional) is a space-separated
15
+ # list of shaping contexts (e.g. `isolate`, `medial`).
16
+ class StandardizedVariants < Base
17
+ class << self
18
# Streams the variation sequences listed in the file at `path`.
#
# Yields one Models::StandardizedVariant for every data line that has a
# non-empty description and at least two codepoints in its first field;
# any other line is skipped. Without a block an Enumerator over the
# same records is returned. Returns nil once iteration completes.
def each_record(path)
  return enum_for(:each_record, path) unless block_given?

  each_line(path) do |line|
    columns = line.fields
    description = columns[1]
    next if columns.length < 2 || description.nil? || description.empty?

    # First field is "<base> <selector>"; anything shorter is unusable.
    base_hex, selector_hex = columns[0].to_s.split(/\s+/).reject(&:empty?)
    next if selector_hex.nil?

    yield Models::StandardizedVariant.new(
      base_id: format("U+%04X", parse_hex_cp(base_hex)),
      variation_selector_id: format("U+%04X", parse_hex_cp(selector_hex)),
      description: description,
      contexts: parse_contexts(columns[2])
    )
  end

  nil
end
45
+
46
+ private
47
+
48
# Breaks the optional contexts field into individual tokens.
#
# The field is cut on `;` first and on whitespace second; empty tokens
# are dropped. A nil or empty field yields an empty array.
def parse_contexts(field)
  return [] if field.nil? || field.empty?

  tokens = []
  field.split(/\s*;\s*/).each do |segment|
    segment.split(/\s+/).each do |token|
      tokens << token unless token.empty?
    end
  end
  tokens
end
53
+ end
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ module Parsers
5
+ class UnicodeData < Base
6
# Computes the official Unicode name for a Hangul syllable codepoint
# using the Hangul Syllable Name Generation algorithm (Unicode
# Standard, Chapter 3, "Conjoining Jamo Behavior").
#
# The name is "HANGUL SYLLABLE " followed by the concatenation of the
# short names of the L, V, (optional T) Jamo that compose it.
#
# The short-name tables hold the Jamo_Short_Name values published in
# the UCD file Jamo.txt. Each table is indexed by the L/V/T index that
# the arithmetic decomposition produces, so the tables MUST contain
# exactly L_COUNT / V_COUNT / T_COUNT entries, including the entries
# whose short name is the empty string.
module HangulName
  S_BASE = 0xAC00
  L_BASE = 0x1100
  V_BASE = 0x1161
  T_BASE = 0x11A7

  L_COUNT = 19
  V_COUNT = 21
  T_COUNT = 28
  N_COUNT = V_COUNT * T_COUNT # 588
  S_COUNT = L_COUNT * N_COUNT # 11_172

  # 19 leading consonants, U+1100..U+1112.
  LEAD_SHORT_NAMES = [
    "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS",
    "", # U+110B CHOSEONG IEUNG: silent lead, empty short name
    "J", "JJ", "C", "K", "T", "P", "H"
  ].freeze

  # 21 vowels, U+1161..U+1175.
  VOWEL_SHORT_NAMES = %w[
    A AE YA YAE EO E YEO YE O WA WAE OE YO
    U WEO WE WI YU EU YI I
  ].freeze

  # 28 trailing slots: index 0 means "no trail", then U+11A8..U+11C2.
  TRAIL_SHORT_NAMES = [
    "", # 11A7 has no short name; used for LV (no trail)
    "G", "GG", "GS", "N", "NJ", "NH", "D",
    "L", "LG", "LM", "LB", "LS", "LT", "LP", "LH",
    "M", "B", "BS", "S", "SS", "NG", "J",
    "C", "K", "T", "P", "H"
  ].freeze

  class << self
    # Returns true if `cp` is an Integer inside the precomposed Hangul
    # syllable range S_BASE...(S_BASE + S_COUNT).
    def hangul_syllable?(cp)
      cp.is_a?(Integer) &&
        cp >= S_BASE &&
        cp < S_BASE + S_COUNT
    end

    # Returns the synthesized name for a Hangul syllable codepoint,
    # or nil if `cp` is not in the Hangul syllable block.
    def call(cp)
      return nil unless hangul_syllable?(cp)

      s_index = cp - S_BASE
      l_index = s_index / N_COUNT
      v_index = (s_index % N_COUNT) / T_COUNT
      t_index = s_index % T_COUNT

      parts = [LEAD_SHORT_NAMES[l_index], VOWEL_SHORT_NAMES[v_index]]
      # t_index 0 is the LV form: no trailing consonant to append.
      parts << TRAIL_SHORT_NAMES[t_index] if t_index.positive?

      "HANGUL SYLLABLE #{parts.join}"
    end
  end
end
70
+ private_constant :HangulName
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,268 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+ require "ucode/models/codepoint"
5
+
6
+ module Ucode
7
+ module Parsers
8
+ # Parses `UnicodeData.txt` — the primary per-codepoint record file.
9
+ #
10
+ # Field layout (UAX #44, 15 `;`-separated fields):
11
+ # 0. codepoint
12
+ # 1. name (`<control>` or `<Type, First>` / `<Type, Last>` for ranges)
13
+ # 2. general_category
14
+ # 3. canonical_combining_class
15
+ # 4. bidi_class
16
+ # 5. decomposition_type_and_mapping (combined: optional `<tag>` + cps)
17
+ # 6. numeric_value_decimal (populated only for Numeric_Type=Decimal; equals field 8)
18
+ # 7. numeric_value_digit (populated for Numeric_Type=Decimal or Digit; equals field 8)
19
+ # 8. numeric_value (canonical)
20
+ # 9. bidi_mirrored (Y/N)
21
+ # 10. Unicode_1_Name (deprecated, kept as `name1`)
22
+ # 11. ISO_10646_comment (deprecated, ignored)
23
+ # 12. simple_uppercase_mapping
24
+ # 13. simple_lowercase_mapping
25
+ # 14. simple_titlecase_mapping
26
+ #
27
+ # Hangul syllables and CJK ideographs appear as range markers
28
+ # (`<..., First>` / `<..., Last>`). The range is expanded to one
29
+ # CodePoint per codepoint with the appropriate synthesized name.
30
+ class UnicodeData < Base
31
+ autoload :HangulName, "ucode/parsers/unicode_data/hangul_name"
32
+
33
+ FIRST_MARKER = "First"
34
+ LAST_MARKER = "Last"
35
+ private_constant :FIRST_MARKER, :LAST_MARKER
36
+
37
+ class << self
38
# Yields one CodePoint per codepoint in `path`. Range markers
# (`<..., First>` to `<..., Last>`) are expanded to one CodePoint
# per codepoint, with names synthesized per Unicode rules.
#
# Returns an Enumerator when called without a block, nil otherwise.
#
# Raises MalformedLineError when the line following a `First` marker is
# not a `Last` marker, or when the file ends while a range is still
# open (otherwise the whole range would be dropped silently). Errors
# raised while handling a line get `:file` and `:line` added to their
# context if those keys are not already set.
def each_record(path)
  return enum_for(:each_record, path) unless block_given?

  # Attributes remembered from a `First` line until its `Last` line.
  pending_range = nil

  each_line(path) do |line|
    begin
      fields = line.fields

      if pending_range
        unless fields[1]&.end_with?("#{LAST_MARKER}>")
          raise MalformedLineError.new(
            "expected <#{pending_range[:template]}, #{LAST_MARKER}>, " \
            "got #{fields[1].inspect}",
            context: { file: path.to_s, line: line.number }
          )
        end

        last_cp = parse_hex_cp(fields[0])
        expand_range(pending_range, last_cp).each { |record| yield record }
        pending_range = nil
        next
      end

      cp = parse_hex_cp(fields[0])
      name = fields[1]

      if range_start?(name)
        # Only the attributes shared by every member of the range are
        # kept; per-codepoint fields are absent on range lines.
        pending_range = {
          first_cp: cp,
          template: extract_template(name),
          general_category: fields[2],
          combining_class: fields[3].to_i,
          bidi_class: fields[4],
          bidi_mirrored: fields[9]
        }
        next
      end

      yield build_codepoint(
        cp: cp,
        name: synthesize_name(cp, name),
        general_category: fields[2],
        combining_class: fields[3].to_i,
        bidi_class: fields[4],
        decomposition_field: fields[5],
        numeric_decimal: fields[6],
        numeric_digit: fields[7],
        numeric_value: fields[8],
        bidi_mirrored: fields[9],
        unicode_1_name: fields[10],
        simple_upper_id: fields[12],
        simple_lower_id: fields[13],
        simple_title_id: fields[14]
      )
    rescue MalformedLineError => e
      e.context[:file] ||= path.to_s
      e.context[:line] ||= line.number
      raise
    end
  end

  # A `First` marker that never met its `Last` means truncated input.
  if pending_range
    raise MalformedLineError.new(
      "unterminated range <#{pending_range[:template]}, #{FIRST_MARKER}>: " \
      "end of file reached before the matching #{LAST_MARKER} marker",
      context: { file: path.to_s }
    )
  end

  nil
end
107
+
108
+ private
109
+
110
# Tells whether `name` is the opening placeholder of a range, i.e. ends
# with "First>". Returns nil (not false) when `name` is nil.
def range_start?(name)
  return nil if name.nil?

  name.end_with?("#{FIRST_MARKER}>")
end
113
+
114
# Strips the leading "<" and the trailing ", First>" from a range
# placeholder, leaving only the range's label. Either part is left in
# place when it is not present.
def extract_template(name)
  opening = "<"
  closing = ", #{FIRST_MARKER}>"

  without_opening = name.delete_prefix(opening)
  without_opening.delete_suffix(closing)
end
117
+
118
# Synthesizes the official name for codepoints whose UnicodeData name
# is a placeholder.
#
# - `<control>` is returned verbatim.
# - Any placeholder containing "CJK" becomes
#   "CJK UNIFIED IDEOGRAPH-<hex>".
# - Tangut Ideograph placeholders become "TANGUT IDEOGRAPH-<hex>" and
#   Khitan Small Script placeholders become
#   "KHITAN SMALL SCRIPT CHARACTER-<hex>", following the name derivation
#   rules of the Unicode Standard (section 4.8).
# - Otherwise the Hangul syllable name is used when `cp` is a Hangul
#   syllable, and the raw `name` is returned for everything else.
def synthesize_name(cp, name)
  case name
  when "<control>" then "<control>"
  when /\A<.*CJK.*>\z/
    "CJK UNIFIED IDEOGRAPH-#{format("%04X", cp)}"
  when /\A<Tangut Ideograph.*>\z/
    "TANGUT IDEOGRAPH-#{format("%04X", cp)}"
  when /\A<Khitan Small Script.*>\z/
    "KHITAN SMALL SCRIPT CHARACTER-#{format("%04X", cp)}"
  else
    HangulName.call(cp) || name
  end
end
131
+
132
# Turns a pending range (the hash captured from a `First` line) plus the
# codepoint of its `Last` line into an Enumerator that produces one
# CodePoint for every codepoint from `range[:first_cp]` to `last_cp`
# inclusive. Each name comes from `synthesize_name`, fed the range's
# `First` placeholder.
def expand_range(range, last_cp)
  start_cp = range[:first_cp]
  shared_attributes = {
    general_category: range[:general_category],
    combining_class: range[:combining_class] || 0,
    bidi_class: range[:bidi_class],
    bidi_mirrored: range[:bidi_mirrored]
  }

  Enumerator.new do |yielder|
    start_cp.upto(last_cp) do |cp|
      placeholder = "<#{range[:template]}, #{FIRST_MARKER}>"
      yielder << build_codepoint(
        cp: cp,
        name: synthesize_name(cp, placeholder),
        **shared_attributes
      )
    end
  end
end
149
+
150
# Assembles a Models::CodePoint from already-split UnicodeData fields.
#
# The nested bidi, decomposition, numeric and casing values are produced
# by the corresponding `build_*` helpers; `id` is the "U+XXXX" form of
# `cp`, and a blank Unicode 1 name is stored as nil.
def build_codepoint(cp:, name:, general_category:, combining_class:,
                    bidi_class:, decomposition_field: nil,
                    numeric_decimal: nil, numeric_digit: nil, numeric_value: nil,
                    bidi_mirrored: nil, unicode_1_name: nil,
                    simple_upper_id: nil, simple_lower_id: nil, simple_title_id: nil)
  bidi = build_bidi(bidi_class, bidi_mirrored)
  decomposition = build_decomposition(decomposition_field)
  numeric = build_numeric(general_category, numeric_decimal, numeric_digit, numeric_value)
  casing = build_casing(simple_upper_id, simple_lower_id, simple_title_id)

  Models::CodePoint.new(
    cp: cp,
    id: format("U+%04X", cp),
    name: name,
    name1: cp_or_nil(unicode_1_name),
    general_category: general_category,
    combining_class: combining_class.to_i,
    bidi: bidi,
    decomposition: decomposition,
    numeric: numeric,
    casing: casing
  )
end
168
+
169
# Builds the Bidi value for a codepoint, or nil when both the bidi class
# and the mirrored flag are blank. A blank bidi class is stored as nil;
# the mirrored flag is true only for the exact string "Y".
def build_bidi(bidi_class, mirrored)
  return nil if [bidi_class, mirrored].all? { |value| value.nil? || value.empty? }

  mirrored_flag = (mirrored == "Y")
  Models::CodePoint::Bidi.new(
    bidi_class: cp_or_nil(bidi_class),
    is_mirrored: mirrored_flag
  )
end
178
+
179
# Field 5 is a single combined field: optional `<tag>` prefix
# followed by space-separated codepoint hexes. No prefix means
# canonical decomposition (`can`).
#
# Returns nil for a nil or empty field, otherwise a
# Models::CodePoint::Decomposition whose `codepoint_ids` are "U+XXXX"
# strings.
#
# Raises MalformedLineError when the field opens a `<tag` that is never
# closed, instead of failing with a NoMethodError on the missing index.
def build_decomposition(combined)
  return nil if combined.nil? || combined.empty?

  type = "can"
  mapping = combined

  if combined.start_with?("<")
    close = combined.index(">")
    if close.nil?
      raise MalformedLineError.new(
        "unterminated decomposition tag: #{combined.inspect}",
        context: { field: combined }
      )
    end

    type = combined[1...close]
    mapping = combined[(close + 1)..]
  end

  ids = mapping.split(/\s+/).reject(&:empty?).map do |hex|
    format("U+%04X", parse_hex_cp(hex))
  end

  Models::CodePoint::Decomposition.new(
    type: type,
    codepoint_ids: ids
  )
end
203
+
204
# Builds the numeric value for a codepoint from UnicodeData fields 6-8.
#
# Numeric_Type is determined by WHICH fields are populated, as defined
# for UnicodeData.txt in UAX #44:
#   - field 6 populated          -> Decimal ("de")
#   - field 7 populated, 6 blank -> Digit   ("di")
#   - only field 8 populated     -> Numeric ("nu")
# General_Category does not determine the type (for example a
# superscript digit is `No` but Digit, and a Roman numeral is `Nl` but
# Numeric), so `gc` is accepted only to keep the call signature stable.
#
# Field 8 is the canonical value; fields 7 and 6 are consulted only as
# a fallback when field 8 is unexpectedly blank. Returns nil when all
# three fields are blank.
def build_numeric(gc, decimal_field, digit_field, numeric_field)
  type = numeric_type_for_fields(decimal_field, digit_field, numeric_field)
  return nil unless type

  raw = [numeric_field, digit_field, decimal_field].find { |v| !v.nil? && !v.empty? }
  return nil if raw.nil?

  numerator, denominator = parse_numeric_value(raw)
  Models::CodePoint::NumericValue.new(
    type: type,
    numerator: numerator,
    denominator: denominator
  )
end

# Maps the populated numeric fields to a Numeric_Type code ("de", "di",
# "nu"), or nil when none of the three fields carries a value.
def numeric_type_for_fields(decimal_field, digit_field, numeric_field)
  return "de" unless decimal_field.nil? || decimal_field.empty?
  return "di" unless digit_field.nil? || digit_field.empty?
  return "nu" unless numeric_field.nil? || numeric_field.empty?

  nil
end

# Splits a numeric field into [numerator, denominator]. A value without
# "/" is an integer over 1; a leading minus sign stays on the numerator.
def parse_numeric_value(value)
  if value.include?("/")
    num, denom = value.split("/", 2)
    [num.to_i, denom.to_i]
  else
    [value.to_i, 1]
  end
end
239
+
240
# Builds the simple case-mapping value from the upper/lower/title
# fields, each converted to "U+XXXX" form (nil when blank). Returns nil
# when all three fields are blank.
def build_casing(upper_id, lower_id, title_id)
  raw_mappings = [upper_id, lower_id, title_id]
  return nil if raw_mappings.all? { |mapping| blank?(mapping) }

  upper, lower, title = raw_mappings.map { |mapping| cp_id(mapping) }
  Models::CodePoint::Casing.new(
    simple_upper_id: upper,
    simple_lower_id: lower,
    simple_title_id: title
  )
end
249
+
250
# Converts a hex codepoint field into its "U+XXXX" id; a blank field
# gives nil.
def cp_id(field)
  blank?(field) ? nil : format("U+%04X", parse_hex_cp(field))
end
255
+
256
# Passes `field` through unchanged unless it is blank, in which case nil
# is returned.
def cp_or_nil(field)
  blank?(field) ? nil : field
end
261
+
262
# True when `field` is nil or reports itself as empty.
def blank?(field)
  return true if field.nil?

  field.empty?
end
265
+ end
266
+ end
267
+ end
268
+ end
@@ -0,0 +1,125 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+ require "ucode/parsers/base"
5
+ require "ucode/error"
6
+
7
+ module Ucode
8
+ module Parsers
9
+ # Parses all eight Unihan files (`Unihan_IRGSources.txt`,
10
+ # `Unihan_NumericValues.txt`, `Unihan_RadicalStrokeCounts.txt`,
11
+ # `Unihan_Readings.txt`, `Unihan_DictionaryIndices.txt`,
12
+ # `Unihan_DictionaryLikeData.txt`, `Unihan_Variants.txt`,
13
+ # `Unihan_OtherMappings.txt`).
14
+ #
15
+ # File format is uniform across all eight (Unihan documentation):
16
+ #
17
+ # U+XXXX<TAB>kField<TAB>value
18
+ #
19
+ # The value may be a space-separated list (`kRSUnicode`, `kDefinition`
20
+ # for prose, `kCangjieInput` for multiple codes). `.split` (whitespace)
21
+ # produces the values array uniformly. Coordinator groups records by
22
+ # `cp` and writes into `CodePoint.unihan.fields[field]`.
23
+ #
24
+ # One parser, not eight: the format is uniform. The filename carries
25
+ # no parse-time information — every line is self-describing via its
26
+ # field name. Adding a new Unihan file is a one-line change to
27
+ # `FILES`; no parser modification (OCP).
28
+ class Unihan < Base
29
# Names of the Unihan data files, in the order they are read. Every
# name has the shape "Unihan_<Stem>.txt".
FILES = %w[
  DictionaryIndices
  DictionaryLikeData
  IRGSources
  NumericValues
  RadicalStrokeCounts
  Readings
  Variants
  OtherMappings
].map { |stem| "Unihan_#{stem}.txt" }.freeze
39
+
40
# One parsed Unihan line, used as a transient value inside the parsing
# pipeline. It is a plain keyword-initialised Struct with three members:
# the integer codepoint, the field name, and the list of values. The
# values member is called `field_values` so that it does not shadow
# `Struct#values`, which returns all member values as an array.
Record = Struct.new(:cp, :field, :field_values, keyword_init: true) do
  # The codepoint rendered as "U+XXXX" (at least four hex digits).
  def cp_id
    format("U+%<cp>04X", cp: cp)
  end
end
50
+
51
+ class << self
52
# Reads a single Unihan file and yields one Record for every line that
# is neither empty nor a `#` comment. Without a block an Enumerator is
# returned instead; with a block the method returns nil.
#
# A MalformedLineError raised while parsing a line is re-raised after
# `:file` and `:line` have been filled into its context, unless those
# keys already hold a value.
def each_record(path)
  return enum_for(:each_record, path) unless block_given?

  source = path.to_s

  File.foreach(source).with_index(1) do |raw, number|
    text = raw.chomp
    next if text.empty? || text.start_with?("#")

    begin
      yield parse_line(text)
    rescue MalformedLineError => e
      e.context[:file] ||= source
      e.context[:line] ||= number
      raise
    end
  end

  nil
end
76
+
77
# Walks the files named in FILES inside `dir`, in that order, and yields
# every Record that `each_record` produces for them. A file that does
# not exist is skipped without error. Without a block an Enumerator is
# returned instead; with a block the method returns nil.
def each_in_dir(dir)
  return enum_for(:each_in_dir, dir) unless block_given?

  root = Pathname.new(dir)

  FILES.each do |filename|
    candidate = root.join(filename)
    each_record(candidate) { |record| yield record } if candidate.exist?
  end

  nil
end
93
+
94
+ private
95
+
96
# Parses one TAB-separated Unihan data line into a Record. The
# `split("\t", 3)` limit preserves any tabs inside the value
# (defensive — real Unihan data does not contain them).
#
# The codepoint column must look like "U+<hex>"; the value column is
# split on whitespace into `field_values`.
#
# Raises MalformedLineError when a column is missing, the value is
# empty, or the codepoint lacks the "U+" prefix. The offending text is
# stored under `context[:text]`; `context[:line]` is deliberately left
# unset so the caller can record the line NUMBER there (a pre-filled
# `:line` would never be overwritten by `||=`).
def parse_line(line)
  cp_str, field, value = line.split("\t", 3)
  unless cp_str && field && value && !value.empty?
    raise MalformedLineError.new(
      "invalid Unihan line: #{line.inspect}",
      context: { text: line }
    )
  end

  cp_str = cp_str.strip
  unless cp_str.start_with?("U+") && cp_str.length > 2
    raise MalformedLineError.new(
      "invalid Unihan codepoint: #{cp_str.inspect}",
      context: { cp: cp_str }
    )
  end

  Record.new(
    cp: parse_hex_cp(cp_str[2..]),
    field: field.strip,
    field_values: value.strip.split
  )
end
122
+ end
123
+ end
124
+ end
125
+ end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+
5
module Ucode
  # Namespace for the UCD text-file parsers, one class per file.
  #
  # Parsers are designed to stream: a file is read line by line (via
  # `File.foreach`), never held in memory as a whole, and records are
  # yielded one at a time. Called without a block, a parser hands back
  # an Enumerator so the Coordinator can compose several of them.
  #
  # Every parser is registered for autoload below, so a file is only
  # loaded the first time its constant is referenced. `Base` comes
  # first; the rest are in alphabetical order.
  module Parsers
    autoload :Base,                  "ucode/parsers/base"

    autoload :Auxiliary,             "ucode/parsers/auxiliary"
    autoload :BidiBrackets,          "ucode/parsers/bidi_brackets"
    autoload :BidiMirroring,         "ucode/parsers/bidi_mirroring"
    autoload :Blocks,                "ucode/parsers/blocks"
    autoload :CaseFolding,           "ucode/parsers/case_folding"
    autoload :CjkRadicals,           "ucode/parsers/cjk_radicals"
    autoload :DerivedAge,            "ucode/parsers/derived_age"
    autoload :DerivedCoreProperties, "ucode/parsers/derived_core_properties"
    autoload :ExtractedProperties,   "ucode/parsers/extracted_properties"
    autoload :NameAliases,           "ucode/parsers/name_aliases"
    autoload :NamedSequences,        "ucode/parsers/named_sequences"
    autoload :NamesList,             "ucode/parsers/names_list"
    autoload :PropertyAliases,       "ucode/parsers/property_aliases"
    autoload :PropertyValueAliases,  "ucode/parsers/property_value_aliases"
    autoload :ScriptExtensions,      "ucode/parsers/script_extensions"
    autoload :Scripts,               "ucode/parsers/scripts"
    autoload :SpecialCasing,         "ucode/parsers/special_casing"
    autoload :StandardizedVariants,  "ucode/parsers/standardized_variants"
    autoload :UnicodeData,           "ucode/parsers/unicode_data"
    autoload :Unihan,                "ucode/parsers/unihan"
  end
end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  # Value object representing one row in a run-length-encoded UCD index:
  # an inclusive codepoint range (`first_cp..last_cp`) carrying a `name`.
  #
  # Sorted by `first_cp`. Entries within a single Index are disjoint (no
  # overlapping ranges). This is a leaf value object — not a
  # `Lutaml::Model::Serializable` model — because it has no wire shape,
  # no nested types, and is consumed only by the YAML-backed Index. The
  # `to_h` / `from_h` pair below is the deliberate serialization contract
  # for the YAML file format and is exempt from the no-to_h rule by
  # design (that rule covers model classes only).
  #
  # Ordering (`<=>`) looks at the range bounds only, while equality
  # (`==`, `eql?`, `hash`) also takes `name` into account, so two entries
  # can sort as equal and still be unequal.
  class RangeEntry
    include Comparable

    attr_reader :first_cp, :last_cp, :name

    def initialize(first_cp, last_cp, name)
      @first_cp = first_cp
      @last_cp = last_cp
      @name = name
    end

    # True when `codepoint` lies inside the range, bounds included.
    def covers?(codepoint)
      codepoint >= @first_cp && codepoint <= @last_cp
    end

    # Number of codepoints in the range.
    def size
      @last_cp - @first_cp + 1
    end

    # Orders entries by `[first_cp, last_cp]`. Returns nil for anything
    # that is not a RangeEntry, as the Comparable contract expects, so
    # that comparisons with foreign objects fail with the standard
    # "comparison failed" error rather than a NoMethodError.
    def <=>(other)
      return nil unless other.is_a?(RangeEntry)

      [@first_cp, @last_cp] <=> [other.first_cp, other.last_cp]
    end

    # Two entries are equal when bounds and name all match. Overrides the
    # `==` that Comparable would derive from `<=>`.
    def ==(other)
      other.is_a?(RangeEntry) &&
        @first_cp == other.first_cp &&
        @last_cp == other.last_cp &&
        @name == other.name
    end
    alias eql? ==

    # Consistent with `eql?`, so entries can serve as Hash keys.
    def hash
      [@first_cp, @last_cp, @name].hash
    end

    # Symbol-keyed hash form used for serialization.
    def to_h
      { first_cp: @first_cp, last_cp: @last_cp, name: @name }
    end

    # Rebuilds an entry from a hash with either symbol or string keys.
    def self.from_h(hash)
      new(hash[:first_cp] || hash["first_cp"],
          hash[:last_cp] || hash["last_cp"],
          hash[:name] || hash["name"])
    end
  end
end