ucode 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CLAUDE.md +211 -0
- data/Gemfile +22 -0
- data/Gemfile.lock +406 -0
- data/README.md +469 -0
- data/Rakefile +18 -0
- data/TODO.new/00-README.md +66 -0
- data/TODO.new/01-pillar-terminology-alignment.md +69 -0
- data/TODO.new/02-audit-schema-design.md +255 -0
- data/TODO.new/03-directory-output-spec.md +203 -0
- data/TODO.new/04-fontist-org-contract.md +173 -0
- data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
- data/TODO.new/06-audit-namespace-skeleton.md +105 -0
- data/TODO.new/07-audit-models-port.md +132 -0
- data/TODO.new/08-extractors-cheap-port.md +113 -0
- data/TODO.new/09-extractors-expensive-port.md +99 -0
- data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
- data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
- data/TODO.new/12-formatters-port.md +115 -0
- data/TODO.new/13-directory-emitter.md +147 -0
- data/TODO.new/14-html-face-browser.md +144 -0
- data/TODO.new/15-html-library-browser.md +102 -0
- data/TODO.new/16-cli-audit-subcommands.md +142 -0
- data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
- data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
- data/TODO.new/19-fontisan-docs-update.md +155 -0
- data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
- data/TODO.new/21-canonical-unicode17-build.md +148 -0
- data/TODO.new/22-implementation-order.md +176 -0
- data/UCODE_CHANGELOG.md +97 -0
- data/exe/ucode +8 -0
- data/lib/ucode/aggregator.rb +77 -0
- data/lib/ucode/audit/block_aggregator.rb +90 -0
- data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
- data/lib/ucode/audit/context.rb +137 -0
- data/lib/ucode/audit/discrepancy_detector.rb +213 -0
- data/lib/ucode/audit/extractors/aggregations.rb +70 -0
- data/lib/ucode/audit/extractors/base.rb +21 -0
- data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
- data/lib/ucode/audit/extractors/coverage.rb +55 -0
- data/lib/ucode/audit/extractors/hinting.rb +199 -0
- data/lib/ucode/audit/extractors/identity.rb +65 -0
- data/lib/ucode/audit/extractors/licensing.rb +75 -0
- data/lib/ucode/audit/extractors/metrics.rb +108 -0
- data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
- data/lib/ucode/audit/extractors/provenance.rb +34 -0
- data/lib/ucode/audit/extractors/style.rb +88 -0
- data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
- data/lib/ucode/audit/extractors.rb +31 -0
- data/lib/ucode/audit/plane_aggregator.rb +37 -0
- data/lib/ucode/audit/registry.rb +63 -0
- data/lib/ucode/audit/script_aggregator.rb +92 -0
- data/lib/ucode/audit.rb +27 -0
- data/lib/ucode/cache.rb +113 -0
- data/lib/ucode/cli.rb +272 -0
- data/lib/ucode/commands/build.rb +68 -0
- data/lib/ucode/commands/cache.rb +46 -0
- data/lib/ucode/commands/fetch.rb +62 -0
- data/lib/ucode/commands/font_coverage.rb +57 -0
- data/lib/ucode/commands/glyphs.rb +136 -0
- data/lib/ucode/commands/lookup.rb +65 -0
- data/lib/ucode/commands/parse.rb +62 -0
- data/lib/ucode/commands/site.rb +33 -0
- data/lib/ucode/commands.rb +19 -0
- data/lib/ucode/config.rb +110 -0
- data/lib/ucode/coordinator/indices.rb +34 -0
- data/lib/ucode/coordinator.rb +397 -0
- data/lib/ucode/database.rb +214 -0
- data/lib/ucode/db_builder.rb +107 -0
- data/lib/ucode/error.rb +96 -0
- data/lib/ucode/fetch/code_charts.rb +57 -0
- data/lib/ucode/fetch/http.rb +83 -0
- data/lib/ucode/fetch/ucd_zip.rb +57 -0
- data/lib/ucode/fetch/unihan_zip.rb +57 -0
- data/lib/ucode/fetch.rb +14 -0
- data/lib/ucode/glyphs/cell_extractor.rb +130 -0
- data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
- data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
- data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
- data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
- data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
- data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
- data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
- data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
- data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
- data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
- data/lib/ucode/glyphs/grid.rb +30 -0
- data/lib/ucode/glyphs/grid_detector.rb +165 -0
- data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
- data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
- data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
- data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
- data/lib/ucode/glyphs/last_resort/source.rb +125 -0
- data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
- data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
- data/lib/ucode/glyphs/last_resort.rb +36 -0
- data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
- data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
- data/lib/ucode/glyphs/page_renderer.rb +221 -0
- data/lib/ucode/glyphs/path_bbox.rb +62 -0
- data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
- data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
- data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
- data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
- data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
- data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
- data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
- data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
- data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
- data/lib/ucode/glyphs/real_fonts.rb +32 -0
- data/lib/ucode/glyphs/writer.rb +250 -0
- data/lib/ucode/glyphs.rb +27 -0
- data/lib/ucode/index.rb +106 -0
- data/lib/ucode/index_builder.rb +94 -0
- data/lib/ucode/models/audit/audit_axis.rb +30 -0
- data/lib/ucode/models/audit/audit_diff.rb +77 -0
- data/lib/ucode/models/audit/audit_report.rb +137 -0
- data/lib/ucode/models/audit/baseline.rb +32 -0
- data/lib/ucode/models/audit/block_summary.rb +72 -0
- data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
- data/lib/ucode/models/audit/codepoint_range.rb +39 -0
- data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
- data/lib/ucode/models/audit/color_capabilities.rb +91 -0
- data/lib/ucode/models/audit/discrepancy.rb +38 -0
- data/lib/ucode/models/audit/duplicate_group.rb +23 -0
- data/lib/ucode/models/audit/embedding_type.rb +81 -0
- data/lib/ucode/models/audit/field_change.rb +28 -0
- data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
- data/lib/ucode/models/audit/gasp_range.rb +63 -0
- data/lib/ucode/models/audit/hinting.rb +99 -0
- data/lib/ucode/models/audit/library_summary.rb +40 -0
- data/lib/ucode/models/audit/licensing.rb +48 -0
- data/lib/ucode/models/audit/metrics.rb +111 -0
- data/lib/ucode/models/audit/named_instance.rb +41 -0
- data/lib/ucode/models/audit/opentype_layout.rb +38 -0
- data/lib/ucode/models/audit/plane_summary.rb +31 -0
- data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
- data/lib/ucode/models/audit/script_features.rb +28 -0
- data/lib/ucode/models/audit/script_summary.rb +54 -0
- data/lib/ucode/models/audit/variation_detail.rb +42 -0
- data/lib/ucode/models/audit.rb +50 -0
- data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
- data/lib/ucode/models/bidi_mirroring.rb +19 -0
- data/lib/ucode/models/binary_property_assignment.rb +26 -0
- data/lib/ucode/models/block.rb +36 -0
- data/lib/ucode/models/case_folding_rule.rb +23 -0
- data/lib/ucode/models/cjk_radical.rb +23 -0
- data/lib/ucode/models/codepoint/bidi.rb +28 -0
- data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
- data/lib/ucode/models/codepoint/case_folding.rb +25 -0
- data/lib/ucode/models/codepoint/casing.rb +32 -0
- data/lib/ucode/models/codepoint/decomposition.rb +27 -0
- data/lib/ucode/models/codepoint/display.rb +24 -0
- data/lib/ucode/models/codepoint/emoji.rb +29 -0
- data/lib/ucode/models/codepoint/hangul.rb +20 -0
- data/lib/ucode/models/codepoint/identifier.rb +30 -0
- data/lib/ucode/models/codepoint/indic.rb +20 -0
- data/lib/ucode/models/codepoint/joining.rb +20 -0
- data/lib/ucode/models/codepoint/normalization.rb +35 -0
- data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
- data/lib/ucode/models/codepoint.rb +122 -0
- data/lib/ucode/models/name_alias.rb +21 -0
- data/lib/ucode/models/named_sequence.rb +19 -0
- data/lib/ucode/models/names_list_entry.rb +38 -0
- data/lib/ucode/models/plane.rb +36 -0
- data/lib/ucode/models/property_alias.rb +24 -0
- data/lib/ucode/models/property_value_alias.rb +26 -0
- data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
- data/lib/ucode/models/relationship/cross_reference.rb +17 -0
- data/lib/ucode/models/relationship/footnote.rb +24 -0
- data/lib/ucode/models/relationship/informal_alias.rb +18 -0
- data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
- data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
- data/lib/ucode/models/relationship.rb +57 -0
- data/lib/ucode/models/script.rb +41 -0
- data/lib/ucode/models/special_casing_rule.rb +28 -0
- data/lib/ucode/models/standardized_variant.rb +24 -0
- data/lib/ucode/models/unihan_entry.rb +23 -0
- data/lib/ucode/models.rb +47 -0
- data/lib/ucode/parsers/auxiliary.rb +26 -0
- data/lib/ucode/parsers/base.rb +137 -0
- data/lib/ucode/parsers/bidi_brackets.rb +41 -0
- data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
- data/lib/ucode/parsers/blocks.rb +63 -0
- data/lib/ucode/parsers/case_folding.rb +53 -0
- data/lib/ucode/parsers/cjk_radicals.rb +102 -0
- data/lib/ucode/parsers/derived_age.rb +59 -0
- data/lib/ucode/parsers/derived_core_properties.rb +60 -0
- data/lib/ucode/parsers/extracted_properties.rb +74 -0
- data/lib/ucode/parsers/name_aliases.rb +44 -0
- data/lib/ucode/parsers/named_sequences.rb +51 -0
- data/lib/ucode/parsers/names_list.rb +250 -0
- data/lib/ucode/parsers/property_aliases.rb +41 -0
- data/lib/ucode/parsers/property_value_aliases.rb +46 -0
- data/lib/ucode/parsers/script_extensions.rb +64 -0
- data/lib/ucode/parsers/scripts.rb +60 -0
- data/lib/ucode/parsers/special_casing.rb +62 -0
- data/lib/ucode/parsers/standardized_variants.rb +56 -0
- data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
- data/lib/ucode/parsers/unicode_data.rb +268 -0
- data/lib/ucode/parsers/unihan.rb +125 -0
- data/lib/ucode/parsers.rb +35 -0
- data/lib/ucode/range_entry.rb +58 -0
- data/lib/ucode/repo/aggregate_writer.rb +364 -0
- data/lib/ucode/repo/atomic_writes.rb +48 -0
- data/lib/ucode/repo/codepoint_writer.rb +96 -0
- data/lib/ucode/repo/paths.rb +122 -0
- data/lib/ucode/repo.rb +22 -0
- data/lib/ucode/site/config_emitter.rb +124 -0
- data/lib/ucode/site/generator.rb +178 -0
- data/lib/ucode/site/search_index.rb +68 -0
- data/lib/ucode/site/template/.gitignore +4 -0
- data/lib/ucode/site/template/.vitepress/config.ts +8 -0
- data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
- data/lib/ucode/site/template/char/[codepoint].md +13 -0
- data/lib/ucode/site/template/components/BlockView.vue +57 -0
- data/lib/ucode/site/template/components/CharView.vue +85 -0
- data/lib/ucode/site/template/components/PlaneView.vue +56 -0
- data/lib/ucode/site/template/components/SearchView.vue +66 -0
- data/lib/ucode/site/template/index.md +25 -0
- data/lib/ucode/site/template/package.json +18 -0
- data/lib/ucode/site/template/search.md +9 -0
- data/lib/ucode/site.rb +13 -0
- data/lib/ucode/version.rb +5 -0
- data/lib/ucode/version_resolver.rb +76 -0
- data/lib/ucode.rb +74 -0
- data/ucode.gemspec +56 -0
- metadata +404 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
require "ucode/models/standardized_variant"
|
|
5
|
+
|
|
6
|
+
module Ucode
|
|
7
|
+
module Parsers
|
|
8
|
+
# Parses `StandardizedVariants.txt` — variation selector sequences.
|
|
9
|
+
#
|
|
10
|
+
# Format (UAX #44):
|
|
11
|
+
# base_cp VS_cp; description; [contexts]; # trailing comment
|
|
12
|
+
#
|
|
13
|
+
# `base_cp` + `variation_selector_id` is the key; `description` is
|
|
14
|
+
# the visual result; `contexts` (optional) is a space-separated
|
|
15
|
+
# list of shaping contexts (e.g. `no-break`).
|
|
16
|
+
class StandardizedVariants < Base
|
|
17
|
+
class << self
|
|
18
|
+
# Yields one Models::StandardizedVariant per usable data line of
# `path`; returns an Enumerator when no block is given, nil otherwise.
#
# Lines are skipped (not raised on) when they have fewer than two
# fields, an empty description, or a sequence with fewer than two
# codepoints.
def each_record(path)
  return enum_for(:each_record, path) unless block_given?

  each_line(path) do |line|
    fields = line.fields
    next if fields.length < 2

    summary = fields[1]
    next if summary.nil? || summary.empty?

    hexes = fields[0].to_s.split(/\s+/).reject(&:empty?)
    next if hexes.length < 2

    # Only the first two codepoints are used: base, then selector.
    base_cp = parse_hex_cp(hexes[0])
    selector_cp = parse_hex_cp(hexes[1])

    yield Models::StandardizedVariant.new(
      base_id: format("U+%04X", base_cp),
      variation_selector_id: format("U+%04X", selector_cp),
      description: summary,
      contexts: parse_contexts(fields[2])
    )
  end

  nil
end
|
|
45
|
+
|
|
46
|
+
private
|
|
47
|
+
|
|
48
|
+
# Splits the optional contexts field into a flat list of tokens.
# Tokens may be separated by `;` and/or whitespace; a nil or empty
# field yields an empty array.
def parse_contexts(field)
  return [] if field.nil? || field.empty?

  segments = field.split(/\s*;\s*/)
  tokens = segments.flat_map { |segment| segment.split(/\s+/) }
  tokens.reject(&:empty?)
end
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
|
|
4
|
+
module Parsers
|
|
5
|
+
class UnicodeData < Base
|
|
6
|
+
# Computes the official Unicode name of a precomposed Hangul syllable
# using the Hangul Syllable Name Generation algorithm from Chapter 3
# of the Unicode Standard (section 3.12, Conjoining Jamo Behavior).
#
# The name is "HANGUL SYLLABLE " followed by the concatenated Jamo
# short names of the leading consonant (L), the vowel (V) and the
# optional trailing consonant (T) the syllable decomposes into.
#
# The three tables hold the Jamo short names published in Jamo.txt,
# indexed by (jamo_cp - L_BASE / V_BASE / T_BASE). Each table must
# hold exactly L_COUNT / V_COUNT / T_COUNT entries — including the
# empty short names — otherwise every later index is shifted.
module HangulName
  S_BASE = 0xAC00
  L_BASE = 0x1100
  V_BASE = 0x1161
  T_BASE = 0x11A7

  L_COUNT = 19
  V_COUNT = 21
  T_COUNT = 28
  N_COUNT = V_COUNT * T_COUNT # 588
  S_COUNT = L_COUNT * N_COUNT # 11_172

  LEAD_SHORT_NAMES = [
    "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS",
    "", # 110B (choseong ieung) has an empty short name
    "J", "JJ", "C", "K", "T", "P", "H"
  ].freeze

  VOWEL_SHORT_NAMES = %w[
    A AE YA YAE EO E YEO YE O WA WAE OE YO
    U WEO WE WI YU EU YI I
  ].freeze

  TRAIL_SHORT_NAMES = [
    "", # 11A7 has no short name; used for LV (no trail)
    "G", "GG", "GS", "N", "NJ", "NH", "D",
    "L", "LG", "LM", "LB", "LS", "LT", "LP", "LH",
    "M", "B", "BS", "S", "SS", "NG", "J",
    "C", "K", "T", "P", "H"
  ].freeze

  class << self
    # Returns true if `cp` is an Integer inside the Hangul syllable
    # range S_BASE ... S_BASE + S_COUNT (U+AC00..U+D7A3).
    def hangul_syllable?(cp)
      cp.is_a?(Integer) &&
        cp >= S_BASE &&
        cp < S_BASE + S_COUNT
    end

    # Returns the synthesized name for a Hangul syllable codepoint,
    # or nil if `cp` is not in the Hangul syllable block.
    def call(cp)
      return nil unless hangul_syllable?(cp)

      # Arithmetic decomposition: the syllable index splits into an
      # L index and a remainder, which splits into V and T indices.
      l_index, lv_rest = (cp - S_BASE).divmod(N_COUNT)
      v_index, t_index = lv_rest.divmod(T_COUNT)

      # t_index 0 maps to the empty trail name, so LV syllables need
      # no special case.
      "HANGUL SYLLABLE " \
        "#{LEAD_SHORT_NAMES[l_index]}#{VOWEL_SHORT_NAMES[v_index]}#{TRAIL_SHORT_NAMES[t_index]}"
    end
  end
end
|
|
70
|
+
private_constant :HangulName
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
end
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
require "ucode/models/codepoint"
|
|
5
|
+
|
|
6
|
+
module Ucode
|
|
7
|
+
module Parsers
|
|
8
|
+
# Parses `UnicodeData.txt` — the primary per-codepoint record file.
|
|
9
|
+
#
|
|
10
|
+
# Field layout (UAX #44, 15 `;`-separated fields):
|
|
11
|
+
# 0. codepoint
|
|
12
|
+
# 1. name (`<control>` or `<Type, First>` / `<Type, Last>` for ranges)
|
|
13
|
+
# 2. general_category
|
|
14
|
+
# 3. canonical_combining_class
|
|
15
|
+
# 4. bidi_class
|
|
16
|
+
# 5. decomposition_type_and_mapping (combined: optional `<tag>` + cps)
|
|
17
|
+
# 6. numeric_value_decimal (populated only when Numeric_Type=Decimal; same value as 8)
|
|
18
|
+
# 7. numeric_value_digit (populated when Numeric_Type=Decimal or Digit; same value as 8)
|
|
19
|
+
# 8. numeric_value (canonical)
|
|
20
|
+
# 9. bidi_mirrored (Y/N)
|
|
21
|
+
# 10. Unicode_1_Name (deprecated, kept as `name1`)
|
|
22
|
+
# 11. ISO_10646_comment (deprecated, ignored)
|
|
23
|
+
# 12. simple_uppercase_mapping
|
|
24
|
+
# 13. simple_lowercase_mapping
|
|
25
|
+
# 14. simple_titlecase_mapping
|
|
26
|
+
#
|
|
27
|
+
# Hangul syllables and CJK ideographs appear as range markers
|
|
28
|
+
# (`<..., First>` / `<..., Last>`). The range is expanded to one
|
|
29
|
+
# CodePoint per codepoint with the appropriate synthesized name.
|
|
30
|
+
class UnicodeData < Base
|
|
31
|
+
autoload :HangulName, "ucode/parsers/unicode_data/hangul_name"
|
|
32
|
+
|
|
33
|
+
FIRST_MARKER = "First"
|
|
34
|
+
LAST_MARKER = "Last"
|
|
35
|
+
private_constant :FIRST_MARKER, :LAST_MARKER
|
|
36
|
+
|
|
37
|
+
class << self
|
|
38
|
+
# Yields one CodePoint per codepoint in `path`. Range markers
|
|
39
|
+
# (`<..., First>` to `<..., Last>`) are expanded to one CodePoint
|
|
40
|
+
# per codepoint, with names synthesized per Unicode rules.
|
|
41
|
+
#
|
|
42
|
+
# Returns a lazy Enumerator when called without a block.
|
|
43
|
+
def each_record(path)
  return enum_for(:each_record, path) unless block_given?

  # Attributes captured from a `<..., First>` line while waiting for
  # the matching `<..., Last>` line; nil when no range is open.
  open_range = nil

  each_line(path) do |line|
    fields = line.fields

    if open_range
      # The line right after a range start must close that range.
      closing_name = fields[1]
      unless closing_name&.end_with?("#{LAST_MARKER}>")
        raise MalformedLineError.new(
          "expected <#{open_range[:template]}, #{LAST_MARKER}>, " \
          "got #{closing_name.inspect}",
          context: { file: path.to_s, line: line.number }
        )
      end

      expand_range(open_range, parse_hex_cp(fields[0])).each { |record| yield record }
      open_range = nil
      next
    end

    cp = parse_hex_cp(fields[0])
    name = fields[1]

    # Properties needed both by single records and by range expansion.
    shared = {
      general_category: fields[2],
      combining_class: fields[3].to_i,
      bidi_class: fields[4],
      bidi_mirrored: fields[9]
    }

    if range_start?(name)
      open_range = shared.merge(first_cp: cp, template: extract_template(name))
      next
    end

    yield build_codepoint(
      cp: cp,
      name: synthesize_name(cp, name),
      decomposition_field: fields[5],
      numeric_decimal: fields[6],
      numeric_digit: fields[7],
      numeric_value: fields[8],
      unicode_1_name: fields[10],
      simple_upper_id: fields[12],
      simple_lower_id: fields[13],
      simple_title_id: fields[14],
      **shared
    )
  rescue MalformedLineError => e
    # Fill in the location only where the raiser did not supply it.
    e.context[:file] ||= path.to_s
    e.context[:line] ||= line.number
    raise
  end

  nil
end
|
|
107
|
+
|
|
108
|
+
private
|
|
109
|
+
|
|
110
|
+
# Truthy when `name` is a range-opening placeholder, i.e. ends with
# "First>". Returns nil for a nil name.
def range_start?(name)
  return nil if name.nil?

  name.end_with?("#{FIRST_MARKER}>")
end
|
|
113
|
+
|
|
114
|
+
# Strips the surrounding `<` and `, First>` from a range-opening
# name, leaving the range label (e.g. "CJK Ideograph").
def extract_template(name)
  without_bracket = name.delete_prefix("<")
  without_bracket.delete_suffix(", #{FIRST_MARKER}>")
end
|
|
117
|
+
|
|
118
|
+
# Synthesizes the official name for codepoints whose UnicodeData
# name is a placeholder.
#
# * `<control>` is returned verbatim.
# * Range placeholders for CJK unified ideographs, Tangut ideographs
#   and Khitan small script get the derived name "<PREFIX>-<hex cp>"
#   (Unicode name derivation rule NR2).
# * Anything else gets the algorithmic Hangul syllable name when `cp`
#   is a Hangul syllable, and otherwise `name` unchanged (regular
#   names, and placeholders such as private use or surrogates, which
#   have no derived name).
def synthesize_name(cp, name)
  case name
  when "<control>" then "<control>"
  when /\A<.*CJK.*>\z/
    "CJK UNIFIED IDEOGRAPH-#{format("%04X", cp)}"
  when /\A<Tangut Ideograph.*>\z/
    # Covers both "Tangut Ideograph" and "Tangut Ideograph Supplement".
    "TANGUT IDEOGRAPH-#{format("%04X", cp)}"
  when /\A<Khitan Small Script.*>\z/
    "KHITAN SMALL SCRIPT CHARACTER-#{format("%04X", cp)}"
  else
    HangulName.call(cp) || name
  end
end
|
|
131
|
+
|
|
132
|
+
# Expands a (first, last, template) range into one CodePoint per
|
|
133
|
+
# codepoint with the synthesized per-codepoint name.
|
|
134
|
+
# Returns an Enumerator that builds one CodePoint for every codepoint
# from `range[:first_cp]` through `last_cp` (inclusive), copying the
# properties captured from the range-opening line.
def expand_range(range, last_cp)
  start_cp = range[:first_cp]
  # Rebuild the range-opening placeholder so naming follows the same
  # path as for a literal UnicodeData line.
  placeholder = "<#{range[:template]}, #{FIRST_MARKER}>"

  Enumerator.new do |out|
    (start_cp..last_cp).each do |cp|
      out << build_codepoint(
        cp: cp,
        name: synthesize_name(cp, placeholder),
        general_category: range[:general_category],
        combining_class: range[:combining_class] || 0,
        bidi_class: range[:bidi_class],
        bidi_mirrored: range[:bidi_mirrored]
      )
    end
  end
end
|
|
149
|
+
|
|
150
|
+
# Assembles a Models::CodePoint from raw UnicodeData field values.
# Optional fields default to nil; the nested bidi, decomposition,
# numeric and casing objects come from the matching build_* helpers.
def build_codepoint(cp:, name:, general_category:, combining_class:,
                    bidi_class:, decomposition_field: nil,
                    numeric_decimal: nil, numeric_digit: nil, numeric_value: nil,
                    bidi_mirrored: nil, unicode_1_name: nil,
                    simple_upper_id: nil, simple_lower_id: nil, simple_title_id: nil)
  identifier = format("U+%04X", cp)
  legacy_name = cp_or_nil(unicode_1_name)
  ccc = combining_class.to_i

  bidi = build_bidi(bidi_class, bidi_mirrored)
  decomposition = build_decomposition(decomposition_field)
  numeric = build_numeric(general_category, numeric_decimal, numeric_digit, numeric_value)
  casing = build_casing(simple_upper_id, simple_lower_id, simple_title_id)

  Models::CodePoint.new(
    cp: cp,
    id: identifier,
    name: name,
    name1: legacy_name,
    general_category: general_category,
    combining_class: ccc,
    bidi: bidi,
    decomposition: decomposition,
    numeric: numeric,
    casing: casing
  )
end
|
|
168
|
+
|
|
169
|
+
# Builds the Bidi value object, or nil when both the bidi class and
# the mirrored flag are absent. `is_mirrored` is true only for "Y".
def build_bidi(bidi_class, mirrored)
  class_missing = bidi_class.nil? || bidi_class.empty?
  mirror_missing = mirrored.nil? || mirrored.empty?
  return nil if class_missing && mirror_missing

  Models::CodePoint::Bidi.new(
    bidi_class: class_missing ? nil : bidi_class,
    is_mirrored: mirrored == "Y"
  )
end
|
|
178
|
+
|
|
179
|
+
# Field 5 is a single combined field: optional `<tag>` prefix
|
|
180
|
+
# followed by space-separated codepoint hexes. No prefix means
|
|
181
|
+
# canonical decomposition (`can`).
|
|
182
|
+
# Builds the Decomposition value object from UnicodeData field 5, or
# returns nil when the field is nil or empty.
#
# A leading `<tag>` names the compatibility decomposition type; with
# no tag the type is "can" (canonical). The rest is a whitespace-
# separated list of hex codepoints, emitted as "U+XXXX" ids.
#
# Raises MalformedLineError when a tag is opened with `<` but never
# closed with `>`.
def build_decomposition(combined)
  return nil if combined.nil? || combined.empty?

  type = "can"
  mapping = combined

  if combined.start_with?("<")
    close = combined.index(">")
    # Without this guard a missing `>` surfaces as NoMethodError on
    # nil, which bypasses the parser's malformed-line handling.
    if close.nil?
      raise MalformedLineError.new(
        "unterminated decomposition tag: #{combined.inspect}",
        context: { decomposition: combined }
      )
    end

    type = combined[1...close]
    mapping = combined[(close + 1)..]
  end

  ids = mapping.split(/\s+/).reject(&:empty?).map do |hex|
    format("U+%04X", parse_hex_cp(hex))
  end

  Models::CodePoint::Decomposition.new(
    type: type,
    codepoint_ids: ids
  )
end
|
|
203
|
+
|
|
204
|
+
# Builds the NumericValue for one UnicodeData line, or returns nil
# when none of the three numeric fields is populated.
#
# Numeric_Type is read from which fields are populated, as UAX #44
# specifies for UnicodeData.txt:
#   field 6 set            -> Decimal ("de"); fields 7 and 8 repeat it
#   field 7 set, 6 blank   -> Digit   ("di"); field 8 repeats it
#   only field 8 set       -> Numeric ("nu")
# General category does not determine the type: letter numbers (Nl)
# such as Roman numerals are Numeric, other numbers (No) such as
# superscript digits are Digit, and some letters (Lo) carry a numeric
# value as well.
#
# Field 8 is the canonical value; fields 7 and 6 are consulted only
# as a fallback when field 8 is unexpectedly blank. The first
# parameter (general category) is kept for signature compatibility
# and is not consulted.
def build_numeric(_gc, decimal_field, digit_field, numeric_field)
  present = ->(value) { !value.nil? && !value.empty? }

  type =
    if present.call(decimal_field) then "de"
    elsif present.call(digit_field) then "di"
    elsif present.call(numeric_field) then "nu"
    end
  return nil if type.nil?

  raw = [numeric_field, digit_field, decimal_field].find(&present)

  numerator, denominator = parse_numeric_value(raw)
  Models::CodePoint::NumericValue.new(
    type: type,
    numerator: numerator,
    denominator: denominator
  )
end
|
|
222
|
+
|
|
223
|
+
# Maps a general category starting with Nd / Nl / No to the short
# alias "de" / "di" / "nu"; returns nil for anything else.
# NOTE(review): UAX #44 ties Numeric_Type to the populated numeric
# fields rather than to General_Category — confirm this mapping is
# what callers want.
def numeric_type_for_gc(gc)
  return nil if gc.nil?

  { "Nd" => "de", "Nl" => "di", "No" => "nu" }[gc.to_s[0, 2]]
end
|
|
230
|
+
|
|
231
|
+
# Converts a numeric field into a [numerator, denominator] pair.
# "n/d" is split at the first slash; a plain value gets denominator 1.
# Parts are converted with String#to_i, so unparsable text becomes 0.
def parse_numeric_value(value)
  return [value.to_i, 1] unless value.include?("/")

  value.split("/", 2).map(&:to_i)
end
|
|
239
|
+
|
|
240
|
+
# Builds the Casing value object from the three simple case-mapping
# fields, or returns nil when all three are blank. Each populated
# field is converted to a "U+XXXX" id; blank ones become nil.
def build_casing(upper_id, lower_id, title_id)
  raw_mappings = [upper_id, lower_id, title_id]
  return nil if raw_mappings.all? { |field| blank?(field) }

  upper, lower, title = raw_mappings.map { |field| cp_id(field) }

  Models::CodePoint::Casing.new(
    simple_upper_id: upper,
    simple_lower_id: lower,
    simple_title_id: title
  )
end
|
|
249
|
+
|
|
250
|
+
# Converts a hex codepoint field into a "U+XXXX" id, or nil when the
# field is blank.
def cp_id(field)
  blank?(field) ? nil : format("U+%04X", parse_hex_cp(field))
end
|
|
255
|
+
|
|
256
|
+
# Normalizes a blank (nil or empty) field to nil and returns any
# other value unchanged.
def cp_or_nil(field)
  field.nil? || field.empty? ? nil : field
end
|
|
261
|
+
|
|
262
|
+
# True when `field` is nil or empty.
def blank?(field)
  field.nil? ? true : field.empty?
end
|
|
265
|
+
end
|
|
266
|
+
end
|
|
267
|
+
end
|
|
268
|
+
end
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "pathname"
|
|
4
|
+
require "ucode/parsers/base"
|
|
5
|
+
require "ucode/error"
|
|
6
|
+
|
|
7
|
+
module Ucode
|
|
8
|
+
module Parsers
|
|
9
|
+
# Parses all eight Unihan files (`Unihan_IRGSources.txt`,
|
|
10
|
+
# `Unihan_NumericValues.txt`, `Unihan_RadicalStrokeCounts.txt`,
|
|
11
|
+
# `Unihan_Readings.txt`, `Unihan_DictionaryIndices.txt`,
|
|
12
|
+
# `Unihan_DictionaryLikeData.txt`, `Unihan_Variants.txt`,
|
|
13
|
+
# `Unihan_OtherMappings.txt`).
|
|
14
|
+
#
|
|
15
|
+
# File format is uniform across all eight (Unihan documentation):
|
|
16
|
+
#
|
|
17
|
+
# U+XXXX<TAB>kField<TAB>value
|
|
18
|
+
#
|
|
19
|
+
# The value may be a space-separated list (`kRSUnicode`, `kDefinition`
|
|
20
|
+
# for prose, `kCangjieInput` for multiple codes). `.split` (whitespace)
|
|
21
|
+
# produces the values array uniformly. Coordinator groups records by
|
|
22
|
+
# `cp` and writes into `CodePoint.unihan.fields[field]`.
|
|
23
|
+
#
|
|
24
|
+
# One parser, not eight: the format is uniform. The filename carries
|
|
25
|
+
# no parse-time information — every line is self-describing via its
|
|
26
|
+
# field name. Adding a new Unihan file is a one-line change to
|
|
27
|
+
# `FILES`; no parser modification (OCP).
|
|
28
|
+
class Unihan < Base
|
|
29
|
+
FILES = %w[
|
|
30
|
+
Unihan_DictionaryIndices.txt
|
|
31
|
+
Unihan_DictionaryLikeData.txt
|
|
32
|
+
Unihan_IRGSources.txt
|
|
33
|
+
Unihan_NumericValues.txt
|
|
34
|
+
Unihan_RadicalStrokeCounts.txt
|
|
35
|
+
Unihan_Readings.txt
|
|
36
|
+
Unihan_Variants.txt
|
|
37
|
+
Unihan_OtherMappings.txt
|
|
38
|
+
].freeze
|
|
39
|
+
|
|
40
|
+
# Stream record: one Unihan line. Internal pipeline data — a Struct
|
|
41
|
+
# avoids lutaml-model ceremony for transient values. The final
|
|
42
|
+
# `UnihanEntry` model carries the merged, persisted shape. The
|
|
43
|
+
# member is `field_values` (not `values`) to avoid overriding
|
|
44
|
+
# `Struct#values` (the array of all member values).
|
|
45
|
+
Record = Struct.new(:cp, :field, :field_values, keyword_init: true) do
  # The codepoint rendered as "U+XXXX": uppercase hex, zero-padded to
  # at least four digits.
  def cp_id
    format("U+%<cp>04X", cp: cp)
  end
end
|
|
50
|
+
|
|
51
|
+
class << self
|
|
52
|
+
# Yields one Record per non-comment line in a single Unihan file.
|
|
53
|
+
# Returns a lazy Enumerator when no block is given.
|
|
54
|
+
# Streams one Unihan file, yielding the parsed record for every line
# that is neither empty nor a `#` comment. Returns an Enumerator when
# no block is given, nil otherwise.
#
# A MalformedLineError raised while handling a line is re-raised with
# the file path and 1-based line number added to its context (unless
# the raiser already set those keys).
def each_record(path)
  return enum_for(:each_record, path) unless block_given?

  path_str = path.to_s
  lineno = 0

  # Unihan files are UTF-8. Read them as such instead of relying on
  # the process locale, so non-ASCII values are never tagged with a
  # mismatched encoding (e.g. US-ASCII under LANG=C).
  File.foreach(path_str, encoding: "UTF-8") do |raw|
    lineno += 1
    line = raw.chomp
    next if line.empty? || line.start_with?("#")

    begin
      yield parse_line(line)
    rescue MalformedLineError => e
      e.context[:file] ||= path_str
      e.context[:line] ||= lineno
      raise
    end
  end

  nil
end
|
|
76
|
+
|
|
77
|
+
# Iterates every known Unihan file in `dir`, yielding one Record
|
|
78
|
+
# per data line across all files. Missing files are silently
|
|
79
|
+
# skipped (incremental runs, partial downloads).
|
|
80
|
+
# Walks the FILES list in order and yields every record of each file
# that exists under `dir`; files that are absent are skipped. Returns
# an Enumerator when no block is given, nil otherwise.
def each_in_dir(dir)
  return enum_for(:each_in_dir, dir) unless block_given?

  root = Pathname.new(dir)

  # Lazy, so each file's existence is checked just before it is read.
  FILES.lazy
       .map { |filename| root.join(filename) }
       .select(&:exist?)
       .each { |path| each_record(path) { |record| yield record } }

  nil
end
|
|
93
|
+
|
|
94
|
+
private
|
|
95
|
+
|
|
96
|
+
# Parses one TAB-separated Unihan data line into a Record. The
|
|
97
|
+
# `split("\t", 3)` limit preserves any tabs inside the value
|
|
98
|
+
# (defensive — real Unihan data does not contain them).
|
|
99
|
+
# Turns one data line (`U+XXXX<TAB>kField<TAB>value`) into a Record.
#
# Raises MalformedLineError when any of the three columns is missing,
# the value is empty, or the first column is not "U+" followed by at
# least one character.
#
# The offending text goes into the error context under `:text`. The
# `:line` key is left unset on purpose: it is reserved for the line
# number, which callers fill in only when the key is still empty.
def parse_line(line)
  cp_str, field, value = line.split("\t", 3)
  if cp_str.nil? || field.nil? || value.nil? || value.empty?
    raise MalformedLineError.new(
      "invalid Unihan line: #{line.inspect}",
      context: { text: line }
    )
  end

  cp_str = cp_str.strip
  unless cp_str.start_with?("U+") && cp_str.length > 2
    raise MalformedLineError.new(
      "invalid Unihan codepoint: #{cp_str.inspect}",
      context: { cp: cp_str }
    )
  end

  Record.new(
    cp: parse_hex_cp(cp_str[2..]), # hex digits after the "U+" prefix
    field: field.strip,
    field_values: value.strip.split
  )
end
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
end
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "pathname"
|
|
4
|
+
|
|
5
|
+
module Ucode
  # Parsers — one class per UCD text file.
  #
  # Every parser is registered for autoload, so its file is required
  # only when the constant is first referenced. Per the project's
  # parser contract, parsers stream their input line by line, yield
  # one record at a time, and return a lazy Enumerator when called
  # without a block so the Coordinator can compose them.
  module Parsers
    # Constant name => file name under "ucode/parsers/".
    {
      Base: "base",
      UnicodeData: "unicode_data",
      Blocks: "blocks",
      Scripts: "scripts",
      ScriptExtensions: "script_extensions",
      PropertyAliases: "property_aliases",
      PropertyValueAliases: "property_value_aliases",
      NameAliases: "name_aliases",
      NamedSequences: "named_sequences",
      SpecialCasing: "special_casing",
      CaseFolding: "case_folding",
      BidiMirroring: "bidi_mirroring",
      BidiBrackets: "bidi_brackets",
      CjkRadicals: "cjk_radicals",
      StandardizedVariants: "standardized_variants",
      NamesList: "names_list",
      DerivedAge: "derived_age",
      DerivedCoreProperties: "derived_core_properties",
      ExtractedProperties: "extracted_properties",
      Auxiliary: "auxiliary",
      Unihan: "unihan"
    }.each do |const_name, file_name|
      autoload const_name, "ucode/parsers/#{file_name}"
    end
  end
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
|
|
4
|
+
# Value object representing one row in a run-length-encoded UCD index.
|
|
5
|
+
#
|
|
6
|
+
# Sorted by `first_cp`. Entries within a single Index are disjoint (no
|
|
7
|
+
# overlapping ranges). This is a leaf value object — not a
|
|
8
|
+
# `Lutaml::Model::Serializable` model — because it has no wire shape,
|
|
9
|
+
# no nested types, and is consumed only by the YAML-backed Index. The
|
|
10
|
+
# `to_h` / `from_h` pair below is the deliberate serialization contract
|
|
11
|
+
# for the YAML file format and is exempt from the no-to_h rule by
|
|
12
|
+
# design (that rule covers model classes only).
|
|
13
|
+
# One named, inclusive codepoint range (`first_cp..last_cp`).
#
# Ordering (`<=>` and the Comparable operators) considers only the
# range bounds; equality (`==`, `eql?`, `hash`) also includes `name`.
class RangeEntry
  include Comparable

  attr_reader :first_cp, :last_cp, :name

  def initialize(first_cp, last_cp, name)
    @first_cp = first_cp
    @last_cp = last_cp
    @name = name
  end

  # True when `codepoint` lies within the range, bounds included.
  def covers?(codepoint)
    codepoint >= @first_cp && codepoint <= @last_cp
  end

  # Number of codepoints in the range.
  def size
    @last_cp - @first_cp + 1
  end

  # Orders by first_cp, then last_cp. Returns nil — as the Comparable
  # protocol expects — when `other` does not expose both bounds,
  # instead of raising NoMethodError.
  def <=>(other)
    return nil unless other.respond_to?(:first_cp) && other.respond_to?(:last_cp)

    [@first_cp, @last_cp] <=> [other.first_cp, other.last_cp]
  end

  # Stricter than ordering: bounds and name must all match.
  def ==(other)
    other.is_a?(RangeEntry) &&
      @first_cp == other.first_cp &&
      @last_cp == other.last_cp &&
      @name == other.name
  end
  alias eql? ==

  # Consistent with `==`, so entries work as Hash keys and Set members.
  def hash
    [@first_cp, @last_cp, @name].hash
  end

  # Symbol-keyed hash form of the entry.
  def to_h
    { first_cp: @first_cp, last_cp: @last_cp, name: @name }
  end

  # Inverse of `to_h`; accepts symbol or string keys.
  def self.from_h(hash)
    new(hash[:first_cp] || hash["first_cp"],
        hash[:last_cp] || hash["last_cp"],
        hash[:name] || hash["name"])
  end
end
|
|
58
|
+
end
|