ucode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +211 -0
  3. data/Gemfile +22 -0
  4. data/Gemfile.lock +406 -0
  5. data/README.md +469 -0
  6. data/Rakefile +18 -0
  7. data/TODO.new/00-README.md +66 -0
  8. data/TODO.new/01-pillar-terminology-alignment.md +69 -0
  9. data/TODO.new/02-audit-schema-design.md +255 -0
  10. data/TODO.new/03-directory-output-spec.md +203 -0
  11. data/TODO.new/04-fontist-org-contract.md +173 -0
  12. data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
  13. data/TODO.new/06-audit-namespace-skeleton.md +105 -0
  14. data/TODO.new/07-audit-models-port.md +132 -0
  15. data/TODO.new/08-extractors-cheap-port.md +113 -0
  16. data/TODO.new/09-extractors-expensive-port.md +99 -0
  17. data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
  18. data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
  19. data/TODO.new/12-formatters-port.md +115 -0
  20. data/TODO.new/13-directory-emitter.md +147 -0
  21. data/TODO.new/14-html-face-browser.md +144 -0
  22. data/TODO.new/15-html-library-browser.md +102 -0
  23. data/TODO.new/16-cli-audit-subcommands.md +142 -0
  24. data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
  25. data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
  26. data/TODO.new/19-fontisan-docs-update.md +155 -0
  27. data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
  28. data/TODO.new/21-canonical-unicode17-build.md +148 -0
  29. data/TODO.new/22-implementation-order.md +176 -0
  30. data/UCODE_CHANGELOG.md +97 -0
  31. data/exe/ucode +8 -0
  32. data/lib/ucode/aggregator.rb +77 -0
  33. data/lib/ucode/audit/block_aggregator.rb +90 -0
  34. data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
  35. data/lib/ucode/audit/context.rb +137 -0
  36. data/lib/ucode/audit/discrepancy_detector.rb +213 -0
  37. data/lib/ucode/audit/extractors/aggregations.rb +70 -0
  38. data/lib/ucode/audit/extractors/base.rb +21 -0
  39. data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
  40. data/lib/ucode/audit/extractors/coverage.rb +55 -0
  41. data/lib/ucode/audit/extractors/hinting.rb +199 -0
  42. data/lib/ucode/audit/extractors/identity.rb +65 -0
  43. data/lib/ucode/audit/extractors/licensing.rb +75 -0
  44. data/lib/ucode/audit/extractors/metrics.rb +108 -0
  45. data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
  46. data/lib/ucode/audit/extractors/provenance.rb +34 -0
  47. data/lib/ucode/audit/extractors/style.rb +88 -0
  48. data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
  49. data/lib/ucode/audit/extractors.rb +31 -0
  50. data/lib/ucode/audit/plane_aggregator.rb +37 -0
  51. data/lib/ucode/audit/registry.rb +63 -0
  52. data/lib/ucode/audit/script_aggregator.rb +92 -0
  53. data/lib/ucode/audit.rb +27 -0
  54. data/lib/ucode/cache.rb +113 -0
  55. data/lib/ucode/cli.rb +272 -0
  56. data/lib/ucode/commands/build.rb +68 -0
  57. data/lib/ucode/commands/cache.rb +46 -0
  58. data/lib/ucode/commands/fetch.rb +62 -0
  59. data/lib/ucode/commands/font_coverage.rb +57 -0
  60. data/lib/ucode/commands/glyphs.rb +136 -0
  61. data/lib/ucode/commands/lookup.rb +65 -0
  62. data/lib/ucode/commands/parse.rb +62 -0
  63. data/lib/ucode/commands/site.rb +33 -0
  64. data/lib/ucode/commands.rb +19 -0
  65. data/lib/ucode/config.rb +110 -0
  66. data/lib/ucode/coordinator/indices.rb +34 -0
  67. data/lib/ucode/coordinator.rb +397 -0
  68. data/lib/ucode/database.rb +214 -0
  69. data/lib/ucode/db_builder.rb +107 -0
  70. data/lib/ucode/error.rb +96 -0
  71. data/lib/ucode/fetch/code_charts.rb +57 -0
  72. data/lib/ucode/fetch/http.rb +83 -0
  73. data/lib/ucode/fetch/ucd_zip.rb +57 -0
  74. data/lib/ucode/fetch/unihan_zip.rb +57 -0
  75. data/lib/ucode/fetch.rb +14 -0
  76. data/lib/ucode/glyphs/cell_extractor.rb +130 -0
  77. data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
  78. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
  79. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
  80. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
  81. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
  82. data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
  83. data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
  84. data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
  85. data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
  86. data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
  87. data/lib/ucode/glyphs/grid.rb +30 -0
  88. data/lib/ucode/glyphs/grid_detector.rb +165 -0
  89. data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
  90. data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
  91. data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
  92. data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
  93. data/lib/ucode/glyphs/last_resort/source.rb +125 -0
  94. data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
  95. data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
  96. data/lib/ucode/glyphs/last_resort.rb +36 -0
  97. data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
  98. data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
  99. data/lib/ucode/glyphs/page_renderer.rb +221 -0
  100. data/lib/ucode/glyphs/path_bbox.rb +62 -0
  101. data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
  102. data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
  103. data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
  104. data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
  105. data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
  106. data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
  107. data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
  108. data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
  109. data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
  110. data/lib/ucode/glyphs/real_fonts.rb +32 -0
  111. data/lib/ucode/glyphs/writer.rb +250 -0
  112. data/lib/ucode/glyphs.rb +27 -0
  113. data/lib/ucode/index.rb +106 -0
  114. data/lib/ucode/index_builder.rb +94 -0
  115. data/lib/ucode/models/audit/audit_axis.rb +30 -0
  116. data/lib/ucode/models/audit/audit_diff.rb +77 -0
  117. data/lib/ucode/models/audit/audit_report.rb +137 -0
  118. data/lib/ucode/models/audit/baseline.rb +32 -0
  119. data/lib/ucode/models/audit/block_summary.rb +72 -0
  120. data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
  121. data/lib/ucode/models/audit/codepoint_range.rb +39 -0
  122. data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
  123. data/lib/ucode/models/audit/color_capabilities.rb +91 -0
  124. data/lib/ucode/models/audit/discrepancy.rb +38 -0
  125. data/lib/ucode/models/audit/duplicate_group.rb +23 -0
  126. data/lib/ucode/models/audit/embedding_type.rb +81 -0
  127. data/lib/ucode/models/audit/field_change.rb +28 -0
  128. data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
  129. data/lib/ucode/models/audit/gasp_range.rb +63 -0
  130. data/lib/ucode/models/audit/hinting.rb +99 -0
  131. data/lib/ucode/models/audit/library_summary.rb +40 -0
  132. data/lib/ucode/models/audit/licensing.rb +48 -0
  133. data/lib/ucode/models/audit/metrics.rb +111 -0
  134. data/lib/ucode/models/audit/named_instance.rb +41 -0
  135. data/lib/ucode/models/audit/opentype_layout.rb +38 -0
  136. data/lib/ucode/models/audit/plane_summary.rb +31 -0
  137. data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
  138. data/lib/ucode/models/audit/script_features.rb +28 -0
  139. data/lib/ucode/models/audit/script_summary.rb +54 -0
  140. data/lib/ucode/models/audit/variation_detail.rb +42 -0
  141. data/lib/ucode/models/audit.rb +50 -0
  142. data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
  143. data/lib/ucode/models/bidi_mirroring.rb +19 -0
  144. data/lib/ucode/models/binary_property_assignment.rb +26 -0
  145. data/lib/ucode/models/block.rb +36 -0
  146. data/lib/ucode/models/case_folding_rule.rb +23 -0
  147. data/lib/ucode/models/cjk_radical.rb +23 -0
  148. data/lib/ucode/models/codepoint/bidi.rb +28 -0
  149. data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
  150. data/lib/ucode/models/codepoint/case_folding.rb +25 -0
  151. data/lib/ucode/models/codepoint/casing.rb +32 -0
  152. data/lib/ucode/models/codepoint/decomposition.rb +27 -0
  153. data/lib/ucode/models/codepoint/display.rb +24 -0
  154. data/lib/ucode/models/codepoint/emoji.rb +29 -0
  155. data/lib/ucode/models/codepoint/hangul.rb +20 -0
  156. data/lib/ucode/models/codepoint/identifier.rb +30 -0
  157. data/lib/ucode/models/codepoint/indic.rb +20 -0
  158. data/lib/ucode/models/codepoint/joining.rb +20 -0
  159. data/lib/ucode/models/codepoint/normalization.rb +35 -0
  160. data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
  161. data/lib/ucode/models/codepoint.rb +122 -0
  162. data/lib/ucode/models/name_alias.rb +21 -0
  163. data/lib/ucode/models/named_sequence.rb +19 -0
  164. data/lib/ucode/models/names_list_entry.rb +38 -0
  165. data/lib/ucode/models/plane.rb +36 -0
  166. data/lib/ucode/models/property_alias.rb +24 -0
  167. data/lib/ucode/models/property_value_alias.rb +26 -0
  168. data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
  169. data/lib/ucode/models/relationship/cross_reference.rb +17 -0
  170. data/lib/ucode/models/relationship/footnote.rb +24 -0
  171. data/lib/ucode/models/relationship/informal_alias.rb +18 -0
  172. data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
  173. data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
  174. data/lib/ucode/models/relationship.rb +57 -0
  175. data/lib/ucode/models/script.rb +41 -0
  176. data/lib/ucode/models/special_casing_rule.rb +28 -0
  177. data/lib/ucode/models/standardized_variant.rb +24 -0
  178. data/lib/ucode/models/unihan_entry.rb +23 -0
  179. data/lib/ucode/models.rb +47 -0
  180. data/lib/ucode/parsers/auxiliary.rb +26 -0
  181. data/lib/ucode/parsers/base.rb +137 -0
  182. data/lib/ucode/parsers/bidi_brackets.rb +41 -0
  183. data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
  184. data/lib/ucode/parsers/blocks.rb +63 -0
  185. data/lib/ucode/parsers/case_folding.rb +53 -0
  186. data/lib/ucode/parsers/cjk_radicals.rb +102 -0
  187. data/lib/ucode/parsers/derived_age.rb +59 -0
  188. data/lib/ucode/parsers/derived_core_properties.rb +60 -0
  189. data/lib/ucode/parsers/extracted_properties.rb +74 -0
  190. data/lib/ucode/parsers/name_aliases.rb +44 -0
  191. data/lib/ucode/parsers/named_sequences.rb +51 -0
  192. data/lib/ucode/parsers/names_list.rb +250 -0
  193. data/lib/ucode/parsers/property_aliases.rb +41 -0
  194. data/lib/ucode/parsers/property_value_aliases.rb +46 -0
  195. data/lib/ucode/parsers/script_extensions.rb +64 -0
  196. data/lib/ucode/parsers/scripts.rb +60 -0
  197. data/lib/ucode/parsers/special_casing.rb +62 -0
  198. data/lib/ucode/parsers/standardized_variants.rb +56 -0
  199. data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
  200. data/lib/ucode/parsers/unicode_data.rb +268 -0
  201. data/lib/ucode/parsers/unihan.rb +125 -0
  202. data/lib/ucode/parsers.rb +35 -0
  203. data/lib/ucode/range_entry.rb +58 -0
  204. data/lib/ucode/repo/aggregate_writer.rb +364 -0
  205. data/lib/ucode/repo/atomic_writes.rb +48 -0
  206. data/lib/ucode/repo/codepoint_writer.rb +96 -0
  207. data/lib/ucode/repo/paths.rb +122 -0
  208. data/lib/ucode/repo.rb +22 -0
  209. data/lib/ucode/site/config_emitter.rb +124 -0
  210. data/lib/ucode/site/generator.rb +178 -0
  211. data/lib/ucode/site/search_index.rb +68 -0
  212. data/lib/ucode/site/template/.gitignore +4 -0
  213. data/lib/ucode/site/template/.vitepress/config.ts +8 -0
  214. data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
  215. data/lib/ucode/site/template/char/[codepoint].md +13 -0
  216. data/lib/ucode/site/template/components/BlockView.vue +57 -0
  217. data/lib/ucode/site/template/components/CharView.vue +85 -0
  218. data/lib/ucode/site/template/components/PlaneView.vue +56 -0
  219. data/lib/ucode/site/template/components/SearchView.vue +66 -0
  220. data/lib/ucode/site/template/index.md +25 -0
  221. data/lib/ucode/site/template/package.json +18 -0
  222. data/lib/ucode/site/template/search.md +9 -0
  223. data/lib/ucode/site.rb +13 -0
  224. data/lib/ucode/version.rb +5 -0
  225. data/lib/ucode/version_resolver.rb +76 -0
  226. data/lib/ucode.rb +74 -0
  227. data/ucode.gemspec +56 -0
  228. metadata +404 -0
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+
5
+ module Ucode
6
+ module Parsers
7
+ # Generic range/value parser for the files under `extracted/`
8
+ # (DerivedGeneralCategory, DerivedJoiningGroup, DerivedLineBreak,
9
+ # DerivedNumericType, …).
10
+ #
11
+ # Format is uniform across every file (UAX #44):
12
+ # XXXX..YYYY; value
13
+ # XXXX; value
14
+ #
15
+ # The parser is intentionally dumb: it yields `(first, last, value)`
16
+ # triples without knowing what the value means. The Coordinator
17
+ # dispatches by source file name (DerivedGeneralCategory.txt →
18
+ # CodePoint#general_category, etc.). This decoupling means a new
19
+ # extracted file adds one line to the Coordinator, not a new parser.
20
+ #
21
+ # Ranges are NOT expanded — yielding per-codepoint would explode the
22
+ # stream for CJK ranges. The Coordinator expands lazily if needed.
23
# Range/value reader for the `extracted/` data files. It reports each
# assignment as a Tuple and never interprets the value itself.
class ExtractedProperties < Base
  # Plain record handed to the block of `.each_record`. The members are
  # called `range_first` / `range_last` instead of `first` / `last` so that
  # the `Enumerable#first` available on Struct instances is not overridden.
  Tuple = Struct.new(:range_first, :range_last, :value, keyword_init: true) do
    # @return [Range] inclusive range from `range_first` to `range_last`
    def range
      range_first..range_last
    end

    # Builds the ids eagerly: one String is allocated per codepoint in the
    # range, so this is expensive for very wide ranges.
    #
    # @return [Array<String>] `U+XXXX` ids, uppercase hex padded to at
    #   least four digits
    def cp_ids
      range.map { |codepoint| format("U+%04X", codepoint) }
    end

    # @return [Boolean] true when both bounds are the same codepoint
    def single?
      range_first == range_last
    end
  end

  class << self
    # Yields one Tuple per data line that has at least two fields and a
    # non-empty second field. Ranges are passed through unexpanded.
    #
    # Line iteration, field splitting and codepoint parsing are done by
    # helpers defined outside this class (`each_line`, `line.fields`,
    # `parse_codepoint_or_range`).
    #
    # @param path [Object] location of the file, forwarded to `each_line`
    # @yieldparam tuple [Tuple]
    # @return [Enumerator, nil] an Enumerator when no block is given,
    #   otherwise nil
    def each_record(path)
      unless block_given?
        return enum_for(:each_record, path)
      end

      each_line(path) do |line|
        columns = line.fields
        next unless columns.length >= 2

        # The codepoint column is parsed before the value is inspected.
        span = parse_codepoint_or_range(columns[0])
        assigned = columns[1]
        next if assigned.nil? || assigned.empty?

        yield build_tuple(span, assigned)
      end

      nil
    end

    private

    # Wraps either a Range or a single codepoint in a Tuple; a single
    # codepoint becomes a one-element range.
    def build_tuple(range, value)
      low, high = range.is_a?(Range) ? [range.first, range.last] : [range, range]
      Tuple.new(range_first: low, range_last: high, value: value)
    end
  end
end
73
+ end
74
+ end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+ require "ucode/models/name_alias"
5
+
6
+ module Ucode
7
+ module Parsers
8
+ # Parses `NameAliases.txt` — alternate / correction / control names
9
+ # attached to a codepoint.
10
+ #
11
+ # Format (UAX #44):
12
+ # cp; alias_text; type
13
+ #
14
+ # `type` is one of: correction, control, alternate, figment,
15
+ # abbreviation.
16
# Reader for `NameAliases.txt`: one Models::NameAlias per usable line.
class NameAliases < Base
  # Yields a NameAlias for every line that has at least three fields and
  # non-empty alias text and type. The codepoint column is parsed with
  # `parse_hex_cp` before the other two columns are checked.
  #
  # @param path [Object] location of the file, forwarded to `each_line`
  # @yieldparam name_alias [Models::NameAlias]
  # @return [Enumerator, nil] an Enumerator when no block is given,
  #   otherwise nil
  def self.each_record(path)
    return to_enum(:each_record, path) unless block_given?

    each_line(path) do |line|
      columns = line.fields
      next unless columns.length >= 3

      codepoint = parse_hex_cp(columns[0])
      alias_text = columns[1]
      alias_type = columns[2]
      next if [alias_text, alias_type].any? { |column| column.nil? || column.empty? }

      yield Models::NameAlias.new(codepoint: codepoint, text: alias_text, type: alias_type)
    end

    nil
  end
end
43
+ end
44
+ end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+ require "ucode/models/named_sequence"
5
+
6
+ module Ucode
7
+ module Parsers
8
+ # Parses `NamedSequences.txt` — named multi-codepoint sequences.
9
+ #
10
+ # Format (UAX #44):
11
+ # cp1 cp2 cp3 ...; Name
12
+ #
13
+ # The first field is a space-separated list of hex codepoints; the
14
+ # second is the human-readable name.
15
# Reader for `NamedSequences.txt`: one Models::NamedSequence per usable
# line.
class NamedSequences < Base
  class << self
    # Yields a NamedSequence for every line that has at least two fields
    # and a non-empty name. The first field is converted to a list of
    # `U+XXXX` ids.
    #
    # @param path [Object] location of the file, forwarded to `each_line`
    # @yieldparam sequence [Models::NamedSequence]
    # @return [Enumerator, nil] an Enumerator when no block is given,
    #   otherwise nil
    def each_record(path)
      return to_enum(:each_record, path) unless block_given?

      each_line(path) do |line|
        columns = line.fields
        next unless columns.length >= 2

        raw_sequence = columns[0]
        label = columns[1]
        next if label.nil? || label.empty?

        yield Models::NamedSequence.new(
          name: label,
          codepoint_ids: parse_sequence(raw_sequence)
        )
      end

      nil
    end

    private

    # Turns a whitespace-separated list of hex codepoints into `U+XXXX`
    # ids. A nil or empty field gives an empty list.
    def parse_sequence(field)
      return [] if field.nil? || field.empty?

      field.split(/\s+/).each_with_object([]) do |hex, ids|
        next if hex.empty?

        ids << format("U+%04X", parse_hex_cp(hex))
      end
    end
  end
end
50
+ end
51
+ end
@@ -0,0 +1,250 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+ require "ucode/error"
5
+ require "ucode/models/names_list_entry"
6
+ require "ucode/models/relationship"
7
+
8
+ module Ucode
9
+ module Parsers
10
+ # Parses `NamesList.txt` — the human-curated annotated names file
11
+ # Unicode uses to render the Code Charts' name pages.
12
+ #
13
+ # Format (per the file's own header):
14
+ #
15
+ # cp; Name ← header line at column 0 → new NamesListEntry
16
+ # → U+XXXX note ← indented annotation lines
17
+ # × U+XXXX U+YYYY note
18
+ # ≡ U+XXXX note
19
+ # = alias text
20
+ # * footnote text
21
+ #
22
+ # Plus dropped lines:
23
+ #
24
+ # `# comment` ← file-level comment
25
+ # `% instruction` ← dropped (instructional)
26
+ # `~ heading` ← dropped (table-of-contents)
27
+ #
28
+ # Annotation scopes attach to the most recent header. Lines that do
29
+ # not start a new header are silently ignored.
30
+ #
31
+ # Implemented as a small state machine: one current NamesListEntry is
32
+ # held in a local; header lines flush the previous entry, annotation
33
+ # lines append to the current entry. Regex cannot express this
34
+ # scoping.
35
+ class NamesList < Base
36
# Header line: 4-6 hex digits, a `;` separator, then the name (captured
# without surrounding whitespace).
# NOTE(review): the NamesList.txt published by Unicode separates the code
# point and the name with a TAB rather than `;` -- confirm the input is
# normalised before it reaches this parser.
HEADER_PATTERN = /\A([0-9A-Fa-f]{4,6})\s*;\s*(.+?)\s*\z/.freeze

# A `U+XXXX` reference at the very start of a string.
CP_REF_PATTERN = /\AU\+([0-9A-Fa-f]{4,6})\b/.freeze

# Trailing `(rendered: X)` suffix; the capture is X.
RENDERED_PATTERN = /\(rendered:\s*(.+?)\)\z/.freeze

# Value stored in the `source` attribute of every relationship built here.
SOURCE_TAG = "names_list".freeze

private_constant :HEADER_PATTERN, :CP_REF_PATTERN, :RENDERED_PATTERN, :SOURCE_TAG

# First non-blank character of an annotation line. The first five select
# the relationship that is built; the last two mark lines that are dropped.
MARKER_CROSS_REFERENCE = "→".freeze
MARKER_SAMPLE_SEQUENCE = "×".freeze
MARKER_COMPAT_EQUIV = "≡".freeze
MARKER_ALIAS = "=".freeze
MARKER_FOOTNOTE = "*".freeze
MARKER_INSTRUCTIONAL = "%".freeze
MARKER_HEADING = "~".freeze
55
+
56
+ class << self
57
+ # Yields one NamesListEntry per codepoint header. Returns a lazy
58
+ # Enumerator when no block is given.
59
# Streams NamesList.txt and yields one NamesListEntry per header line,
# once every annotation line that follows the header has been attached.
#
# The file is always read as UTF-8. The annotation markers are non-ASCII
# characters, so relying on the process locale (for example `LANG=C`)
# would tag the lines as US-ASCII and the markers would never match.
#
# @param path [#to_s] location of NamesList.txt
# @yieldparam entry [Models::NamesListEntry]
# @return [Enumerator, nil] an Enumerator when no block is given,
#   otherwise nil
# @raise [MalformedLineError] re-raised after `:file` and `:line` have
#   been filled into its context when they were not already set
def each_record(path)
  return enum_for(:each_record, path) unless block_given?

  entry = nil
  lineno = 0
  path_str = path.to_s

  File.foreach(path_str, encoding: "UTF-8") do |raw|
    lineno += 1
    line = raw.chomp

    begin
      if header_line?(line)
        # A new header closes the entry collected so far.
        yield entry if entry
        entry = build_header(line)
      elsif indented_line?(line) && entry
        parsed = parse_annotation(line)
        attach_annotation(entry, parsed) if parsed
      end
      # Everything else (blank lines, comments, headings, text before
      # the first header) is skipped.
    rescue MalformedLineError => e
      e.context[:file] ||= path_str
      e.context[:line] ||= lineno
      raise
    end
  end

  # The last entry has no following header to flush it.
  yield entry if entry
  nil
end
89
+
90
+ private
91
+
92
+ # Column-0 line whose first non-blank char is a hex digit and
93
+ # which carries the `;` separator. Excludes `%`, `~`, `#`.
94
# Tells whether `line` opens a new entry: it must be non-empty, must not
# begin with `#`, `%`, `~` or `@`, and must match HEADER_PATTERN.
#
# @param line [String] one line with the line terminator removed
# @return [Boolean]
def header_line?(line)
  return false if line.empty? || line.start_with?("#", "%", "~", "@")

  HEADER_PATTERN.match?(line)
end
100
+
101
+ # Indented annotation: column 0 is whitespace and the line is
102
+ # non-empty.
103
# Tells whether `line` is an annotation candidate, i.e. its first
# character is a space or a tab. An empty line is never indented.
#
# @param line [String]
# @return [Boolean]
def indented_line?(line)
  line.start_with?(" ", "\t")
end
108
+
109
# Creates the entry for a header line.
#
# @param line [String] a line expected to match HEADER_PATTERN
# @return [Models::NamesListEntry] entry whose codepoint is the first
#   capture read as hexadecimal and whose name is the second capture
# @raise [MalformedLineError] when the line does not match; the offending
#   text is stored under `:line` in the error context
def build_header(line)
  header = HEADER_PATTERN.match(line)
  if header.nil?
    raise MalformedLineError.new(
      "invalid NamesList.txt header: #{line.inspect}",
      context: { line: line }
    )
  end

  hex_digits, entry_name = header.captures
  Models::NamesListEntry.new(codepoint: hex_digits.to_i(16), name: entry_name)
end
123
+
124
+ # Parses one indented annotation line. Returns a
125
+ # `[container_attribute, Relationship]` pair, or `nil` if the
126
+ # marker is dropped (`%`, `~`) or unknown.
127
# Interprets one indented annotation line. The first non-blank character
# is the marker; the text after it, with leading whitespace removed, is
# the payload.
#
# @param line [String]
# @return [Array(Symbol, Object), nil] the name of the entry collection
#   and the relationship to append to it, or nil when the marker is one
#   of the dropped ones (`%`, `~`), is unknown, or the line is blank
def parse_annotation(line)
  body = line.lstrip
  marker = body[0]
  payload = body[1..].to_s.lstrip

  if marker == MARKER_CROSS_REFERENCE
    ids, note = split_targets_and_note(payload)
    [:cross_references, build_cross_reference(ids, note)]
  elsif marker == MARKER_SAMPLE_SEQUENCE
    ids, note = split_targets_and_note(payload)
    [:sample_sequences, build_sample_sequence(ids, note)]
  elsif marker == MARKER_COMPAT_EQUIV
    ids, note = split_targets_and_note(payload)
    [:compatibility_equivalents, build_compat_equiv(ids, note)]
  elsif marker == MARKER_ALIAS
    [:informal_aliases, build_alias(payload)]
  elsif marker == MARKER_FOOTNOTE
    [:footnotes, build_footnote(payload)]
  end
  # No branch taken: the `if` expression evaluates to nil.
end
161
+
162
# Creates the cross-reference relationship for a `→` annotation.
#
# @param target_ids [Array<String>] referenced codepoint ids
# @param note [String] trailing prose; stored as nil when empty
# @return [Models::Relationship::CrossReference]
def build_cross_reference(target_ids, note)
  description = note unless note.empty?

  Models::Relationship::CrossReference.new(
    target_ids: target_ids,
    description: description,
    source: SOURCE_TAG
  )
end
169
+
170
# Creates the sample-sequence relationship for a `×` annotation. The
# rendered form is taken from a trailing `(rendered: X)` in the note when
# there is one; the note itself is stored unchanged as the description.
#
# @param target_ids [Array<String>] referenced codepoint ids
# @param note [String] trailing prose; stored as nil when empty
# @return [Models::Relationship::SampleSequence]
def build_sample_sequence(target_ids, note)
  rendered_form = extract_rendered(note)
  description = note unless note.empty?

  Models::Relationship::SampleSequence.new(
    target_ids: target_ids,
    description: description,
    rendered_form: rendered_form,
    source: SOURCE_TAG
  )
end
179
+
180
# Creates the compatibility-equivalent relationship for a `≡` annotation.
#
# @param target_ids [Array<String>] referenced codepoint ids
# @param note [String] trailing prose; stored as nil when empty
# @return [Models::Relationship::CompatEquiv]
def build_compat_equiv(target_ids, note)
  description = note unless note.empty?

  Models::Relationship::CompatEquiv.new(
    target_ids: target_ids,
    description: description,
    source: SOURCE_TAG
  )
end
187
+
188
# Creates the informal-alias relationship for a `=` annotation.
#
# @param text [String] alias text; stored as nil when empty
# @return [Models::Relationship::InformalAlias]
def build_alias(text)
  description = text unless text.empty?

  Models::Relationship::InformalAlias.new(description: description, source: SOURCE_TAG)
end
194
+
195
# Creates the footnote relationship for a `*` annotation. The category
# comes from `detect_footnote_category`.
#
# @param text [String] footnote text; stored as nil when empty
# @return [Models::Relationship::Footnote]
def build_footnote(text)
  description = text unless text.empty?

  Models::Relationship::Footnote.new(
    description: description,
    category: detect_footnote_category(text),
    source: SOURCE_TAG
  )
end
202
+
203
+ # Splits a `U+XXXX [U+YYYY ...] note` payload into leading target
204
+ # ids (zero-padded `U+XXXX` form) and the trailing prose note.
205
# Peels leading `U+XXXX` references off an annotation payload.
#
# @param rest [String] payload such as `U+0041 U+0061 some note`
# @return [Array(Array<String>, String)] the references reformatted as
#   `U+XXXX` (uppercase hex, at least four digits), followed by whatever
#   text remains after the last reference with leading whitespace removed
def split_targets_and_note(rest)
  ids = []
  tail = rest.dup

  loop do
    reference = CP_REF_PATTERN.match(tail)
    break unless reference

    ids << format("U+%04X", reference[1].hex)
    # The pattern is anchored at the start, so the text after the match
    # is everything that has not been consumed yet.
    tail = reference.post_match.lstrip
  end

  [ids, tail]
end
216
+
217
+ # Pulls `(rendered: X)` suffix from sample-sequence notes when
218
+ # present. Returns nil otherwise.
219
# Reads the value of a `(rendered: X)` suffix at the end of a note.
#
# @param note [String]
# @return [String, nil] X without surrounding whitespace, or nil when the
#   note does not end with such a suffix
def extract_rendered(note)
  note[RENDERED_PATTERN, 1]&.strip
end
225
+
226
+ # Heuristic footnote category. The Unicode names list does not
227
+ # tag these explicitly; the categories are useful for UI grouping.
228
# Guesses a category for a footnote from its first whitespace-delimited
# word, compared case-insensitively.
#
# @param text [String] footnote text
# @return [String] "letterform", "comparison", "history" or "general";
#   "general" is also the result when the text is empty or starts with
#   whitespace
def detect_footnote_category(text)
  lead = text.split(/\s+/, 2).first&.downcase

  if %w[cap capital small lowercase uppercase letter letterform glyph shape].include?(lead)
    "letterform"
  elsif %w[see compare vs versus distinguished].include?(lead)
    "comparison"
  elsif %w[history origin originally introduced].include?(lead)
    "history"
  else
    "general"
  end
end
242
+
243
# Appends a parsed annotation to the entry collection it belongs to.
#
# @param entry [Object] object exposing the collection through a reader
#   named by the first element of `parsed`
# @param parsed [Array(Symbol, Object)] pair returned by
#   `parse_annotation`
# @return [Object] whatever `<<` on the collection returns
def attach_annotation(entry, parsed)
  entry.public_send(parsed[0]) << parsed[1]
end
247
+ end
248
+ end
249
+ end
250
+ end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+ require "ucode/models/property_alias"
5
+
6
+ module Ucode
7
+ module Parsers
8
+ # Parses `PropertyAliases.txt` — Unicode property short ↔ long name.
9
+ #
10
+ # Format (UAX #44):
11
+ # short; long_name; other_alias; other_alias; ...
12
+ #
13
+ # Example: `ccc; Canonical_Combining_Class; ccc`
14
# Reader for `PropertyAliases.txt`: one Models::PropertyAlias per usable
# line.
class PropertyAliases < Base
  # Yields a PropertyAlias for every line with at least two fields. The
  # first field is the short name, the second the long name, and any
  # further non-empty fields are passed as `other_aliases`.
  #
  # @param path [Object] location of the file, forwarded to `each_line`
  # @yieldparam property_alias [Models::PropertyAlias]
  # @return [Enumerator, nil] an Enumerator when no block is given,
  #   otherwise nil
  def self.each_record(path)
    return to_enum(:each_record, path) unless block_given?

    each_line(path) do |line|
      columns = line.fields
      next unless columns.length >= 2

      short_name, long_name, *extras = columns
      extras = extras.select { |extra| extra && !extra.empty? }

      yield Models::PropertyAlias.new(short: short_name, long: long_name, other_aliases: extras)
    end

    nil
  end
end
40
+ end
41
+ end
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+ require "ucode/models/property_value_alias"
5
+
6
+ module Ucode
7
+ module Parsers
8
+ # Parses `PropertyValueAliases.txt` — per-property value aliases.
9
+ #
10
+ # Format (UAX #44):
11
+ # property; short_value; long_value; other_alias; ...
12
+ #
13
+ # Examples:
14
+ # gc; Lu; Uppercase_Letter
15
+ # sc; Latn; Latin
16
+ # ccc; 0; NR
17
# Reader for `PropertyValueAliases.txt`: one Models::PropertyValueAlias
# per usable line.
class PropertyValueAliases < Base
  # Yields a PropertyValueAlias for every line with at least three
  # fields. Fields are taken positionally: property, short value, long
  # value; any further non-empty fields are passed as `other_aliases`.
  #
  # NOTE(review): in the published file the `ccc` rows carry the numeric
  # class in the second field (`ccc; 0; NR; Not_Reordered`), so for those
  # rows `short` receives the number and `long` the abbreviation --
  # confirm that consumers expect this.
  #
  # @param path [Object] location of the file, forwarded to `each_line`
  # @yieldparam value_alias [Models::PropertyValueAlias]
  # @return [Enumerator, nil] an Enumerator when no block is given,
  #   otherwise nil
  def self.each_record(path)
    return to_enum(:each_record, path) unless block_given?

    each_line(path) do |line|
      columns = line.fields
      next unless columns.length >= 3

      property_name, short_value, long_value, *extras = columns
      extras = extras.select { |extra| extra && !extra.empty? }

      yield Models::PropertyValueAlias.new(
        property: property_name,
        short: short_value,
        long: long_value,
        other_aliases: extras
      )
    end

    nil
  end
end
45
+ end
46
+ end
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+
5
+ module Ucode
6
+ module Parsers
7
+ # Parses `ScriptExtensions.txt` — additional scripts per codepoint.
8
+ #
9
+ # Format (UAX #44):
10
+ # XXXX..XXXX ; Latn Grek Cyrl # trailing comment
11
+ #
12
+ # A codepoint can be associated with many scripts. The parser yields
13
+ # one Tuple per (codepoint, script_code) pair; the Coordinator merges
14
+ # these into CodePoint#script_extensions.
15
+ #
16
+ # `script_code` is the ISO 15924 4-letter code already present in the
17
+ # source file (e.g. `Latn`, `Grek`). No alias resolution is needed.
18
# Reader for `ScriptExtensions.txt`. Every (codepoint, script code) pair
# found in the file is reported as one Tuple; ranges are expanded to
# individual codepoints.
class ScriptExtensions < Base
  # One (codepoint, script code) pair yielded by `.each_record`.
  Tuple = Struct.new(:cp, :script_code, keyword_init: true) do
    # @return [String] the codepoint as `U+XXXX` (uppercase hex, at least
    #   four digits)
    def cp_id
      format("U+%04X", cp)
    end
  end

  class << self
    # Yields one Tuple per (codepoint, script code) pair. Lines with
    # fewer than two fields or with an empty script field are skipped.
    #
    # Empty tokens in the script list are discarded, so leading
    # whitespace in that field cannot produce a Tuple with an empty
    # `script_code`. This matches how the sibling parsers split their
    # whitespace-separated fields.
    #
    # @param path [Object] location of the file, forwarded to `each_line`
    # @yieldparam tuple [Tuple]
    # @return [Enumerator, nil] an Enumerator when no block is given,
    #   otherwise nil
    def each_record(path)
      return enum_for(:each_record, path) unless block_given?

      each_line(path) do |line|
        fields = line.fields
        next if fields.length < 2

        codes_field = fields[1]
        next if codes_field.nil? || codes_field.empty?

        range = parse_codepoint_or_range(fields[0])
        codes = codes_field.split(/\s+/).reject(&:empty?)

        each_cp(range) do |cp|
          codes.each do |code|
            yield Tuple.new(cp: cp, script_code: code)
          end
        end
      end

      nil
    end

    private

    # Yields every codepoint of a Range, or the value itself when it is
    # not a Range.
    def each_cp(range, &block)
      if range.is_a?(Range)
        range.each(&block)
      else
        yield range
      end
    end
  end
end
63
+ end
64
+ end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+ require "ucode/models/script"
5
+
6
+ module Ucode
7
+ module Parsers
8
+ # Parses `Scripts.txt` — the primary Script property assignment per
9
+ # codepoint range.
10
+ #
11
+ # Format (UAX #44):
12
+ # XXXX..XXXX ; Script_Name # trailing comment
13
+ # XXXX ; Script_Name # trailing comment
14
+ #
15
+ # Yields one Script per line, with `range_first` and `range_last`
16
+ # set. The Coordinator bsearches the resulting sorted array by cp.
17
+ # The ISO 15924 `code` is resolved later by the Coordinator via
18
+ # PropertyValueAliases (property=sc).
19
# Reader for `Scripts.txt`: one Models::Script per usable line, carrying
# the script name and the bounds of the codepoint range it applies to.
class Scripts < Base
  class << self
    # Yields a Script for every line that has at least two fields and a
    # script name that is neither empty nor the literal `@missing`.
    #
    # @param path [Object] location of the file, forwarded to `each_line`
    # @yieldparam script [Models::Script]
    # @return [Enumerator, nil] an Enumerator when no block is given,
    #   otherwise nil
    def each_record(path)
      return to_enum(:each_record, path) unless block_given?

      each_line(path) do |line|
        columns = line.fields
        next unless columns.length >= 2

        script_name = columns[1]
        next if script_name.nil? || script_name.empty? || script_name == "@missing"

        yield build_script(parse_codepoint_or_range(columns[0]), script_name)
      end

      nil
    end

    private

    # Creates the model from a parsed range (or single codepoint) and a
    # script name.
    def build_script(range, name)
      low, high = bounds_of(range)
      Models::Script.new(name: name, range_first: low, range_last: high)
    end

    # Returns `[first, last]`; a value that is not a Range is used for
    # both bounds.
    def bounds_of(range)
      case range
      when Range then [range.begin, range.end]
      else [range, range]
      end
    end
  end
end
59
+ end
60
+ end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/parsers/base"
4
+ require "ucode/models/special_casing_rule"
5
+
6
+ module Ucode
7
+ module Parsers
8
+ # Parses `SpecialCasing.txt` — context-sensitive case mappings.
9
+ #
10
+ # Format (UAX #44):
11
+ # cp; lower; title; upper; [conditions;] # name
12
+ #
13
+ # The `lower`/`title`/`upper` fields are either empty or a
14
+ # space-separated list of hex codepoints. `conditions` is a
15
+ # space-separated list of context identifiers (`Final_Sigma`,
16
+ # `After_I`) and/or locale codes (`tr`, `az`). Filtering by
17
+ # condition is the consumer's job.
18
# Reader for `SpecialCasing.txt`: one Models::SpecialCasingRule per
# usable line. Conditions are reported as found, never evaluated.
class SpecialCasing < Base
  class << self
    # Yields a SpecialCasingRule for every line with at least four
    # fields. Fields 2-4 become the lower/title/upper id lists, the
    # optional fifth field becomes the condition list, and the comment
    # of the line is passed through.
    #
    # @param path [Object] location of the file, forwarded to `each_line`
    # @yieldparam rule [Models::SpecialCasingRule]
    # @return [Enumerator, nil] an Enumerator when no block is given,
    #   otherwise nil
    def each_record(path)
      return to_enum(:each_record, path) unless block_given?

      each_line(path) do |line|
        columns = line.fields
        next unless columns.length >= 4

        source_cp = parse_hex_cp(columns[0])

        yield Models::SpecialCasingRule.new(
          codepoint: source_cp,
          lower_ids: parse_mapping(columns[1]),
          title_ids: parse_mapping(columns[2]),
          upper_ids: parse_mapping(columns[3]),
          conditions: parse_conditions(columns[4]),
          comment: line.comment
        )
      end

      nil
    end

    private

    # Turns a whitespace-separated list of hex codepoints into `U+XXXX`
    # ids; nil or empty input gives an empty list.
    def parse_mapping(field)
      parse_conditions(field).map { |hex| format("U+%04X", parse_hex_cp(hex)) }
    end

    # Splits a field on whitespace and drops empty tokens; nil or empty
    # input gives an empty list.
    def parse_conditions(field)
      return [] if field.nil? || field.empty?

      field.split(/\s+/).reject(&:empty?)
    end
  end
end
61
+ end
62
+ end