ucode 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CLAUDE.md +211 -0
- data/Gemfile +22 -0
- data/Gemfile.lock +406 -0
- data/README.md +469 -0
- data/Rakefile +18 -0
- data/TODO.new/00-README.md +66 -0
- data/TODO.new/01-pillar-terminology-alignment.md +69 -0
- data/TODO.new/02-audit-schema-design.md +255 -0
- data/TODO.new/03-directory-output-spec.md +203 -0
- data/TODO.new/04-fontist-org-contract.md +173 -0
- data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
- data/TODO.new/06-audit-namespace-skeleton.md +105 -0
- data/TODO.new/07-audit-models-port.md +132 -0
- data/TODO.new/08-extractors-cheap-port.md +113 -0
- data/TODO.new/09-extractors-expensive-port.md +99 -0
- data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
- data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
- data/TODO.new/12-formatters-port.md +115 -0
- data/TODO.new/13-directory-emitter.md +147 -0
- data/TODO.new/14-html-face-browser.md +144 -0
- data/TODO.new/15-html-library-browser.md +102 -0
- data/TODO.new/16-cli-audit-subcommands.md +142 -0
- data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
- data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
- data/TODO.new/19-fontisan-docs-update.md +155 -0
- data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
- data/TODO.new/21-canonical-unicode17-build.md +148 -0
- data/TODO.new/22-implementation-order.md +176 -0
- data/UCODE_CHANGELOG.md +97 -0
- data/exe/ucode +8 -0
- data/lib/ucode/aggregator.rb +77 -0
- data/lib/ucode/audit/block_aggregator.rb +90 -0
- data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
- data/lib/ucode/audit/context.rb +137 -0
- data/lib/ucode/audit/discrepancy_detector.rb +213 -0
- data/lib/ucode/audit/extractors/aggregations.rb +70 -0
- data/lib/ucode/audit/extractors/base.rb +21 -0
- data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
- data/lib/ucode/audit/extractors/coverage.rb +55 -0
- data/lib/ucode/audit/extractors/hinting.rb +199 -0
- data/lib/ucode/audit/extractors/identity.rb +65 -0
- data/lib/ucode/audit/extractors/licensing.rb +75 -0
- data/lib/ucode/audit/extractors/metrics.rb +108 -0
- data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
- data/lib/ucode/audit/extractors/provenance.rb +34 -0
- data/lib/ucode/audit/extractors/style.rb +88 -0
- data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
- data/lib/ucode/audit/extractors.rb +31 -0
- data/lib/ucode/audit/plane_aggregator.rb +37 -0
- data/lib/ucode/audit/registry.rb +63 -0
- data/lib/ucode/audit/script_aggregator.rb +92 -0
- data/lib/ucode/audit.rb +27 -0
- data/lib/ucode/cache.rb +113 -0
- data/lib/ucode/cli.rb +272 -0
- data/lib/ucode/commands/build.rb +68 -0
- data/lib/ucode/commands/cache.rb +46 -0
- data/lib/ucode/commands/fetch.rb +62 -0
- data/lib/ucode/commands/font_coverage.rb +57 -0
- data/lib/ucode/commands/glyphs.rb +136 -0
- data/lib/ucode/commands/lookup.rb +65 -0
- data/lib/ucode/commands/parse.rb +62 -0
- data/lib/ucode/commands/site.rb +33 -0
- data/lib/ucode/commands.rb +19 -0
- data/lib/ucode/config.rb +110 -0
- data/lib/ucode/coordinator/indices.rb +34 -0
- data/lib/ucode/coordinator.rb +397 -0
- data/lib/ucode/database.rb +214 -0
- data/lib/ucode/db_builder.rb +107 -0
- data/lib/ucode/error.rb +96 -0
- data/lib/ucode/fetch/code_charts.rb +57 -0
- data/lib/ucode/fetch/http.rb +83 -0
- data/lib/ucode/fetch/ucd_zip.rb +57 -0
- data/lib/ucode/fetch/unihan_zip.rb +57 -0
- data/lib/ucode/fetch.rb +14 -0
- data/lib/ucode/glyphs/cell_extractor.rb +130 -0
- data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
- data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
- data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
- data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
- data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
- data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
- data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
- data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
- data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
- data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
- data/lib/ucode/glyphs/grid.rb +30 -0
- data/lib/ucode/glyphs/grid_detector.rb +165 -0
- data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
- data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
- data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
- data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
- data/lib/ucode/glyphs/last_resort/source.rb +125 -0
- data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
- data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
- data/lib/ucode/glyphs/last_resort.rb +36 -0
- data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
- data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
- data/lib/ucode/glyphs/page_renderer.rb +221 -0
- data/lib/ucode/glyphs/path_bbox.rb +62 -0
- data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
- data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
- data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
- data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
- data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
- data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
- data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
- data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
- data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
- data/lib/ucode/glyphs/real_fonts.rb +32 -0
- data/lib/ucode/glyphs/writer.rb +250 -0
- data/lib/ucode/glyphs.rb +27 -0
- data/lib/ucode/index.rb +106 -0
- data/lib/ucode/index_builder.rb +94 -0
- data/lib/ucode/models/audit/audit_axis.rb +30 -0
- data/lib/ucode/models/audit/audit_diff.rb +77 -0
- data/lib/ucode/models/audit/audit_report.rb +137 -0
- data/lib/ucode/models/audit/baseline.rb +32 -0
- data/lib/ucode/models/audit/block_summary.rb +72 -0
- data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
- data/lib/ucode/models/audit/codepoint_range.rb +39 -0
- data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
- data/lib/ucode/models/audit/color_capabilities.rb +91 -0
- data/lib/ucode/models/audit/discrepancy.rb +38 -0
- data/lib/ucode/models/audit/duplicate_group.rb +23 -0
- data/lib/ucode/models/audit/embedding_type.rb +81 -0
- data/lib/ucode/models/audit/field_change.rb +28 -0
- data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
- data/lib/ucode/models/audit/gasp_range.rb +63 -0
- data/lib/ucode/models/audit/hinting.rb +99 -0
- data/lib/ucode/models/audit/library_summary.rb +40 -0
- data/lib/ucode/models/audit/licensing.rb +48 -0
- data/lib/ucode/models/audit/metrics.rb +111 -0
- data/lib/ucode/models/audit/named_instance.rb +41 -0
- data/lib/ucode/models/audit/opentype_layout.rb +38 -0
- data/lib/ucode/models/audit/plane_summary.rb +31 -0
- data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
- data/lib/ucode/models/audit/script_features.rb +28 -0
- data/lib/ucode/models/audit/script_summary.rb +54 -0
- data/lib/ucode/models/audit/variation_detail.rb +42 -0
- data/lib/ucode/models/audit.rb +50 -0
- data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
- data/lib/ucode/models/bidi_mirroring.rb +19 -0
- data/lib/ucode/models/binary_property_assignment.rb +26 -0
- data/lib/ucode/models/block.rb +36 -0
- data/lib/ucode/models/case_folding_rule.rb +23 -0
- data/lib/ucode/models/cjk_radical.rb +23 -0
- data/lib/ucode/models/codepoint/bidi.rb +28 -0
- data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
- data/lib/ucode/models/codepoint/case_folding.rb +25 -0
- data/lib/ucode/models/codepoint/casing.rb +32 -0
- data/lib/ucode/models/codepoint/decomposition.rb +27 -0
- data/lib/ucode/models/codepoint/display.rb +24 -0
- data/lib/ucode/models/codepoint/emoji.rb +29 -0
- data/lib/ucode/models/codepoint/hangul.rb +20 -0
- data/lib/ucode/models/codepoint/identifier.rb +30 -0
- data/lib/ucode/models/codepoint/indic.rb +20 -0
- data/lib/ucode/models/codepoint/joining.rb +20 -0
- data/lib/ucode/models/codepoint/normalization.rb +35 -0
- data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
- data/lib/ucode/models/codepoint.rb +122 -0
- data/lib/ucode/models/name_alias.rb +21 -0
- data/lib/ucode/models/named_sequence.rb +19 -0
- data/lib/ucode/models/names_list_entry.rb +38 -0
- data/lib/ucode/models/plane.rb +36 -0
- data/lib/ucode/models/property_alias.rb +24 -0
- data/lib/ucode/models/property_value_alias.rb +26 -0
- data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
- data/lib/ucode/models/relationship/cross_reference.rb +17 -0
- data/lib/ucode/models/relationship/footnote.rb +24 -0
- data/lib/ucode/models/relationship/informal_alias.rb +18 -0
- data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
- data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
- data/lib/ucode/models/relationship.rb +57 -0
- data/lib/ucode/models/script.rb +41 -0
- data/lib/ucode/models/special_casing_rule.rb +28 -0
- data/lib/ucode/models/standardized_variant.rb +24 -0
- data/lib/ucode/models/unihan_entry.rb +23 -0
- data/lib/ucode/models.rb +47 -0
- data/lib/ucode/parsers/auxiliary.rb +26 -0
- data/lib/ucode/parsers/base.rb +137 -0
- data/lib/ucode/parsers/bidi_brackets.rb +41 -0
- data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
- data/lib/ucode/parsers/blocks.rb +63 -0
- data/lib/ucode/parsers/case_folding.rb +53 -0
- data/lib/ucode/parsers/cjk_radicals.rb +102 -0
- data/lib/ucode/parsers/derived_age.rb +59 -0
- data/lib/ucode/parsers/derived_core_properties.rb +60 -0
- data/lib/ucode/parsers/extracted_properties.rb +74 -0
- data/lib/ucode/parsers/name_aliases.rb +44 -0
- data/lib/ucode/parsers/named_sequences.rb +51 -0
- data/lib/ucode/parsers/names_list.rb +250 -0
- data/lib/ucode/parsers/property_aliases.rb +41 -0
- data/lib/ucode/parsers/property_value_aliases.rb +46 -0
- data/lib/ucode/parsers/script_extensions.rb +64 -0
- data/lib/ucode/parsers/scripts.rb +60 -0
- data/lib/ucode/parsers/special_casing.rb +62 -0
- data/lib/ucode/parsers/standardized_variants.rb +56 -0
- data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
- data/lib/ucode/parsers/unicode_data.rb +268 -0
- data/lib/ucode/parsers/unihan.rb +125 -0
- data/lib/ucode/parsers.rb +35 -0
- data/lib/ucode/range_entry.rb +58 -0
- data/lib/ucode/repo/aggregate_writer.rb +364 -0
- data/lib/ucode/repo/atomic_writes.rb +48 -0
- data/lib/ucode/repo/codepoint_writer.rb +96 -0
- data/lib/ucode/repo/paths.rb +122 -0
- data/lib/ucode/repo.rb +22 -0
- data/lib/ucode/site/config_emitter.rb +124 -0
- data/lib/ucode/site/generator.rb +178 -0
- data/lib/ucode/site/search_index.rb +68 -0
- data/lib/ucode/site/template/.gitignore +4 -0
- data/lib/ucode/site/template/.vitepress/config.ts +8 -0
- data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
- data/lib/ucode/site/template/char/[codepoint].md +13 -0
- data/lib/ucode/site/template/components/BlockView.vue +57 -0
- data/lib/ucode/site/template/components/CharView.vue +85 -0
- data/lib/ucode/site/template/components/PlaneView.vue +56 -0
- data/lib/ucode/site/template/components/SearchView.vue +66 -0
- data/lib/ucode/site/template/index.md +25 -0
- data/lib/ucode/site/template/package.json +18 -0
- data/lib/ucode/site/template/search.md +9 -0
- data/lib/ucode/site.rb +13 -0
- data/lib/ucode/version.rb +5 -0
- data/lib/ucode/version_resolver.rb +76 -0
- data/lib/ucode.rb +74 -0
- data/ucode.gemspec +56 -0
- metadata +404 -0
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "lutaml/model"
|
|
4
|
+
|
|
5
|
+
module Ucode
|
|
6
|
+
module Models
|
|
7
|
+
# Unihan dictionary data for CJK codepoints. Flat-hash design: every
|
|
8
|
+
# `kFoo` field is a key in `fields`, with array values (Unihan fields
|
|
9
|
+
# are space-separated lists; uniform arrays simplify the shape).
|
|
10
|
+
#
|
|
11
|
+
# The semantic grouping (readings / radicals / variants / sources / etc.)
|
|
12
|
+
# is a presentation concern, derived client-side by prefix. The data
|
|
13
|
+
# model stays open — Unihan adds fields across versions, and the hash
|
|
14
|
+
# absorbs additions without model changes.
|
|
15
|
+
# Lutaml model carrying the Unihan fields of one codepoint in a single
# open-ended hash attribute.
class UnihanEntry < Lutaml::Model::Serializable
  # `fields` is a hash attribute; its default comes from a lambda that
  # returns an empty hash.
  attribute(:fields, :hash, default: -> { Hash.new })

  # Key/value wire shape: the "fields" key maps onto the `fields` attribute.
  key_value { map("fields", to: :fields) }
end
|
|
22
|
+
end
|
|
23
|
+
end
|
data/lib/ucode/models.rb
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
|
|
4
|
+
# Models — typed class representations of every UCD aggregate.
|
|
5
|
+
#
|
|
6
|
+
# Conventions (apply to every class in this namespace):
|
|
7
|
+
#
|
|
8
|
+
# - Inheritance, not include:
|
|
9
|
+
#
|
|
10
|
+
# class Foo < Lutaml::Model::Serializable
|
|
11
|
+
#
|
|
12
|
+
# - Wire shape declared via `key_value do … end` (covers JSON + YAML).
|
|
13
|
+
# NEVER `mapping do`, NEVER `json do`.
|
|
14
|
+
#
|
|
15
|
+
# - Codepoint references are "U+XXXX" strings — never nested CodePoint
|
|
16
|
+
# objects. Keeps the data normalized: each codepoint's full data lives
|
|
17
|
+
# only in its own folder.
|
|
18
|
+
#
|
|
19
|
+
# - Polymorphism: `polymorphic_class: true` + `polymorphic_map:` on the
|
|
20
|
+
# base discriminator; `polymorphic: [...]` on the consumer attribute
|
|
21
|
+
# + `polymorphic: { attribute:, class_map: }` on its mapping.
|
|
22
|
+
#
|
|
23
|
+
# - NEVER define `to_h` / `from_h` / `to_json` / `from_json`. All
|
|
24
|
+
# (de)serialization goes through lutaml-model.
|
|
25
|
+
#
|
|
26
|
+
# Registers one autoload per model constant. The table below pairs each
# constant name with the feature path that defines it; entries are
# registered in the order listed.
module Models
  {
    PropertyAlias: "ucode/models/property_alias",
    PropertyValueAlias: "ucode/models/property_value_alias",
    Plane: "ucode/models/plane",
    Block: "ucode/models/block",
    Script: "ucode/models/script",
    CodePoint: "ucode/models/codepoint",
    UnihanEntry: "ucode/models/unihan_entry",
    NamesListEntry: "ucode/models/names_list_entry",
    NameAlias: "ucode/models/name_alias",
    NamedSequence: "ucode/models/named_sequence",
    SpecialCasingRule: "ucode/models/special_casing_rule",
    CaseFoldingRule: "ucode/models/case_folding_rule",
    BidiMirroring: "ucode/models/bidi_mirroring",
    BidiBracketPair: "ucode/models/bidi_bracket_pair",
    CjkRadical: "ucode/models/cjk_radical",
    StandardizedVariant: "ucode/models/standardized_variant",
    BinaryPropertyAssignment: "ucode/models/binary_property_assignment",
    Relationship: "ucode/models/relationship",
    Audit: "ucode/models/audit"
  }.each do |const_name, feature|
    # `self` is still the module inside the block, so this is Module#autoload.
    autoload const_name, feature
  end
end
|
|
47
|
+
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/extracted_properties"
|
|
4
|
+
|
|
5
|
+
module Ucode
|
|
6
|
+
module Parsers
|
|
7
|
+
# Generic range/value parser for the auxiliary segmentation files
|
|
8
|
+
# under `auxiliary/` (GraphemeBreakProperty, WordBreakProperty,
|
|
9
|
+
# SentenceBreakProperty, VerticalOrientation, IndicPositionalCategory,
|
|
10
|
+
# IndicSyllabicCategory, IdentifierStatus, IdentifierType) plus the
|
|
11
|
+
# top-level `LineBreak.txt` and `EastAsianWidth.txt`.
|
|
12
|
+
#
|
|
13
|
+
# File format is identical to ExtractedProperties (UAX #44 range/value):
|
|
14
|
+
#
|
|
15
|
+
# XXXX..YYYY; value
|
|
16
|
+
# XXXX; value
|
|
17
|
+
#
|
|
18
|
+
# Coordinator dispatches by file name to the right CodePoint
|
|
19
|
+
# attribute. This class exists as a distinct name so call sites read
|
|
20
|
+
# "auxiliary" instead of "extracted" — the parsing logic is shared
|
|
21
|
+
# via inheritance. Adding auxiliary-specific behavior later does not
|
|
22
|
+
# require touching ExtractedProperties (OCP).
|
|
23
|
+
# Distinctly named subclass of ExtractedProperties; it defines nothing of
# its own, so all behavior is inherited.
class Auxiliary < ExtractedProperties; end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/error"
|
|
4
|
+
|
|
5
|
+
module Ucode
|
|
6
|
+
module Parsers
|
|
7
|
+
# Shared infrastructure for every UCD text-file parser. Subclasses
|
|
8
|
+
# implement `.each_record(path) { |record| ... }` returning an
|
|
9
|
+
# Enumerator when called without a block.
|
|
10
|
+
#
|
|
11
|
+
# All methods are class methods — parsers are stateless.
|
|
12
|
+
#
|
|
13
|
+
# UCD text-file format (UAX #44):
|
|
14
|
+
# - Fields separated by `;`
|
|
15
|
+
# - Lines starting with `#` are comments
|
|
16
|
+
# - Blank lines are ignored
|
|
17
|
+
# - Some lines carry an inline `# trailing comment` after the data
|
|
18
|
+
# Shared infrastructure for the UCD text-file parsers. All helpers are
# singleton methods (`Base.each_line`, `Base.parse_hex_cp`, ...); no
# instance state is kept.
class Base
  # One data line of a source file.
  #
  # number  - 1-based physical line number in the file
  # text    - the data part of the line
  # comment - the inline trailing comment, or nil when there is none
  Line = Struct.new(:number, :text, :comment, keyword_init: true) do
    # Returns the data part of the line: everything before the first
    # `#`, rstripped. When `text` has no `#` it is returned unchanged.
    def data
      cut = text.index("#")
      cut ? text[0...cut].rstrip : text
    end

    # Splits the data part on `;` into stripped fields. Trailing empty
    # fields are dropped (String#split semantics).
    def fields
      data.split(";").map(&:strip)
    end

    # Returns the n-th field (0-based), or nil if out of range.
    def field(n)
      fields[n]
    end
  end

  HEX_PATTERN = /\A[0-9A-Fa-f]{1,6}\z/.freeze
  private_constant :HEX_PATTERN

  RANGE_SEPARATOR = ".."
  private_constant :RANGE_SEPARATOR

  # Highest valid Unicode codepoint. Six hex digits can encode values up
  # to 0xFFFFFF, so the pattern alone is not a sufficient check.
  MAX_CODEPOINT = 0x10FFFF
  private_constant :MAX_CODEPOINT

  class << self
    # Iterates the data lines of `path`, yielding one Line per line.
    # Lines that are blank or start with `#` (after stripping) are
    # skipped; line numbers still count them. Returns an Enumerator when
    # no block is given.
    #
    # The file is always decoded as UTF-8 so behavior does not depend on
    # the process locale.
    def each_line(path)
      return enum_for(:each_line, path) unless block_given?

      lineno = 0
      File.foreach(path.to_s, encoding: "UTF-8") do |raw|
        lineno += 1
        stripped = raw.strip
        next if stripped.empty? || stripped.start_with?("#")

        yield build_line(lineno, raw)
      end
    end

    # Returns the n-th `;`-separated field of `line`, stripped, or nil
    # when the line has no such field. `line` may be a Line struct or a
    # raw String; in both cases an inline `# comment` is ignored.
    def parse_field(line, n)
      fields = line_fields(line)
      return nil if fields.length <= n

      fields[n]
    end

    # Parses a codepoint-or-range field:
    #   "0041"       → 0x0041 (Integer)
    #   "3400..4DBF" → 0x3400..0x4DBF (Range)
    #
    # Returns nil for nil or empty input. Raises MalformedLineError for
    # invalid hex, or for a range whose end precedes its start (such a
    # range would otherwise be empty and the row silently lost).
    def parse_codepoint_or_range(field)
      return nil if field.nil? || field.empty?
      return parse_hex_cp(field) unless field.include?(RANGE_SEPARATOR)

      first_str, last_str = field.split(RANGE_SEPARATOR, 2)
      first = parse_hex_cp(first_str)
      last = parse_hex_cp(last_str)
      if last < first
        raise MalformedLineError.new(
          "invalid codepoint range: #{field.inspect}",
          context: { input: field }
        )
      end

      first..last
    end

    # Parses a single hex codepoint string into an Integer. Surrounding
    # whitespace is ignored. Raises MalformedLineError (with the input in
    # `context`) when the text is not 1-6 hex digits or the value exceeds
    # U+10FFFF.
    def parse_hex_cp(input)
      s = input.to_s.strip
      value = s.to_i(16) if s.match?(HEX_PATTERN)
      unless value && value <= MAX_CODEPOINT
        raise MalformedLineError.new(
          "invalid codepoint: #{input.inspect}",
          context: { input: input }
        )
      end

      value
    end

    private

    # Builds a Line from a raw physical line, moving any trailing
    # `# comment` into the Line's `comment` member.
    def build_line(number, raw)
      text = raw.chomp
      hash_idx = text.index("#")
      return Line.new(number: number, text: text, comment: nil) if hash_idx.nil?

      Line.new(
        number: number,
        text: text[0...hash_idx].rstrip,
        comment: text[(hash_idx + 1)..].strip
      )
    end

    # Field list for a Line or a raw String. A String is wrapped in a
    # Line so both inputs share the same comment-stripping and splitting.
    def line_fields(line)
      record = line.is_a?(Line) ? line : Line.new(number: nil, text: line.to_s, comment: nil)
      record.fields
    end
  end
end
|
|
136
|
+
end
|
|
137
|
+
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
require "ucode/models/bidi_bracket_pair"
|
|
5
|
+
|
|
6
|
+
module Ucode
|
|
7
|
+
module Parsers
|
|
8
|
+
# Parses `BidiBrackets.txt` — paired bracket partners.
|
|
9
|
+
#
|
|
10
|
+
# Format (UAX #44):
|
|
11
|
+
# cp; paired_cp; type
|
|
12
|
+
#
|
|
13
|
+
# `type` is `o` (open) or `c` (close). Coordinator merges each row
|
|
14
|
+
# into `CodePoint#bidi.paired_bracket_id` and `.paired_bracket_type`.
|
|
15
|
+
# Record stream over `BidiBrackets.txt` rows of the form `cp; paired_cp; type`.
class BidiBrackets < Base
  # Yields one Models::BidiBracketPair per usable row of `path`; returns
  # an Enumerator when called without a block, nil otherwise.
  #
  # Rows with fewer than three fields, or with an empty type, are
  # skipped. Both codepoint fields are parsed before the type is
  # checked, so invalid hex raises even on a row that would be skipped.
  def self.each_record(path)
    return enum_for(:each_record, path) unless block_given?

    each_line(path) do |line|
      parts = line.fields
      next unless parts.length >= 3

      bracket_cp = parse_hex_cp(parts[0])
      partner_cp = parse_hex_cp(parts[1])
      bracket_type = parts[2]
      next if bracket_type.to_s.empty?

      yield Models::BidiBracketPair.new(
        codepoint: bracket_cp,
        paired_id: format("U+%04X", partner_cp),
        type: bracket_type
      )
    end

    nil
  end
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
require "ucode/models/bidi_mirroring"
|
|
5
|
+
|
|
6
|
+
module Ucode
|
|
7
|
+
module Parsers
|
|
8
|
+
# Parses `BidiMirroring.txt` — the bidi mirroring glyph partner.
|
|
9
|
+
#
|
|
10
|
+
# Format (UAX #44):
|
|
11
|
+
# cp; mirrored_cp
|
|
12
|
+
#
|
|
13
|
+
# Coordinator merges each row into `CodePoint#bidi.mirroring_glyph_id`.
|
|
14
|
+
# Record stream over `BidiMirroring.txt` rows of the form `cp; mirrored_cp`.
class BidiMirroring < Base
  # Yields one Models::BidiMirroring per row of `path` that has at least
  # two fields; returns an Enumerator when called without a block, nil
  # otherwise. The partner is emitted as a "U+XXXX" string.
  def self.each_record(path)
    return enum_for(:each_record, path) unless block_given?

    each_line(path) do |line|
      parts = line.fields
      next unless parts.length >= 2

      source_cp = parse_hex_cp(parts[0])
      partner_cp = parse_hex_cp(parts[1])

      yield Models::BidiMirroring.new(
        codepoint: source_cp,
        mirrored_id: format("U+%04X", partner_cp)
      )
    end

    nil
  end
end
|
|
36
|
+
end
|
|
37
|
+
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
require "ucode/models/block"
|
|
5
|
+
|
|
6
|
+
module Ucode
|
|
7
|
+
module Parsers
|
|
8
|
+
# Parses `Blocks.txt` — one block range per line.
|
|
9
|
+
#
|
|
10
|
+
# Format (UAX #44):
|
|
11
|
+
# XXXX..XXXX; Block Name
|
|
12
|
+
#
|
|
13
|
+
# The `id` is the block name with runs of whitespace collapsed to a
|
|
14
|
+
# single underscore. The `name` is preserved verbatim. Per the
|
|
15
|
+
# project rules (CLAUDE.md), block names are NOT otherwise slugified.
|
|
16
|
+
#
|
|
17
|
+
# `plane_number` is derived from the high bits of `range_first`.
|
|
18
|
+
# Record stream over `Blocks.txt` rows of the form `XXXX..XXXX; Block Name`.
class Blocks < Base
  class << self
    # Yields one Models::Block per usable row of `path`. Returns an
    # Enumerator (via `enum_for`, not a lazy one) when called without a
    # block, nil otherwise.
    #
    # Rows are skipped when they have fewer than two fields, an empty
    # name, or an empty range field. The last case used to reach
    # `nil >> 16` and raise NoMethodError.
    def each_record(path)
      return enum_for(:each_record, path) unless block_given?

      each_line(path) do |line|
        range_field, name = line.fields
        next if name.nil? || name.empty?

        range = parse_codepoint_or_range(range_field)
        next if range.nil?

        yield build_block(range, name)
      end

      nil
    end

    private

    # Builds the Block model. `id` is the name with each whitespace run
    # replaced by one underscore; `plane_number` is the first codepoint
    # shifted right by 16 bits.
    def build_block(range, name)
      first, last = bounds_of(range)
      Models::Block.new(
        id: name.gsub(/\s+/, "_"),
        name: name,
        range_first: first,
        range_last: last,
        plane_number: first >> 16
      )
    end

    # Returns [first, last] for a Range, or the same value twice for a
    # single codepoint.
    def bounds_of(range)
      case range
      when Range then [range.begin, range.end]
      else [range, range]
      end
    end
  end
end
|
|
62
|
+
end
|
|
63
|
+
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
require "ucode/models/case_folding_rule"
|
|
5
|
+
|
|
6
|
+
module Ucode
|
|
7
|
+
module Parsers
|
|
8
|
+
# Parses `CaseFolding.txt` — case folding mappings for comparison.
|
|
9
|
+
#
|
|
10
|
+
# Format (UAX #44):
|
|
11
|
+
# cp; status; mapping; # name
|
|
12
|
+
#
|
|
13
|
+
# `status` is one of: C (common), F (full), S (simple), T (turkic).
|
|
14
|
+
# `mapping` is one or more space-separated hex codepoints.
|
|
15
|
+
# Record stream over `CaseFolding.txt` rows of the form
# `cp; status; mapping; # name`.
class CaseFolding < Base
  # Yields one Models::CaseFoldingRule per usable row of `path`; returns
  # an Enumerator when called without a block, nil otherwise. Rows with
  # fewer than three fields or an empty status are skipped. The line's
  # inline comment is passed through as `comment`.
  def self.each_record(path)
    return enum_for(:each_record, path) unless block_given?

    each_line(path) do |line|
      parts = line.fields
      next unless parts.length >= 3

      source_cp = parse_hex_cp(parts[0])
      fold_status = parts[1]
      next if fold_status.to_s.empty?

      yield Models::CaseFoldingRule.new(
        codepoint: source_cp,
        status: fold_status,
        mapping_ids: parse_mapping(parts[2]),
        comment: line.comment
      )
    end

    nil
  end

  # Converts a whitespace-separated list of hex codepoints into an array
  # of "U+XXXX" strings. Returns [] for nil or empty input.
  def self.parse_mapping(field)
    return [] if field.nil? || field.empty?

    field.split(/\s+/).each_with_object([]) do |token, ids|
      ids << format("U+%04X", parse_hex_cp(token)) unless token.empty?
    end
  end
  private_class_method :parse_mapping
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
require "ucode/models/cjk_radical"
|
|
5
|
+
|
|
6
|
+
module Ucode
|
|
7
|
+
module Parsers
|
|
8
|
+
# Parses `CJKRadicals.txt` — KangXi radical → CJK radical ideograph
|
|
9
|
+
# → canonical ideograph mapping.
|
|
10
|
+
#
|
|
11
|
+
# Format (UAX #44):
|
|
12
|
+
# radical_number; cjk_radical; ideograph
|
|
13
|
+
#
|
|
14
|
+
# `cjk_radical` and `ideograph` are either a single hex codepoint
|
|
15
|
+
# (`2F00`) or a range in the form `XXXX..YYYY`. Range rows are
|
|
16
|
+
# expanded to one CjkRadical per codepoint.
|
|
17
|
+
#
|
|
18
|
+
# Coordinator merges each row into the relevant CodePoint.
|
|
19
|
+
# Record stream over `CJKRadicals.txt` rows of the form
# `radical_number; cjk_radical; ideograph`.
class CjkRadicals < Base
  # Radical number grammar: 1-3 decimal digits followed by up to two
  # apostrophes. CJKRadicals.txt uses the apostrophes to mark variant
  # forms of a radical (e.g. `90'`).
  RADICAL_NUMBER_PATTERN = /\A(\d{1,3})'{0,2}\z/.freeze
  private_constant :RADICAL_NUMBER_PATTERN

  class << self
    # Yields one Models::CjkRadical per (radical, ideograph) pairing of
    # each usable row of `path`; returns an Enumerator when called
    # without a block, nil otherwise. Rows with fewer than three fields
    # or an unparseable radical number are skipped.
    def each_record(path)
      return enum_for(:each_record, path) unless block_given?

      each_line(path) do |line|
        fields = line.fields
        next if fields.length < 3

        radical_number = parse_radical_number(fields[0])
        next if radical_number.nil?

        yield_models(radical_number, fields[1], fields[2]).each do |model|
          yield model
        end
      end

      nil
    end

    private

    # Returns the numeric part of the radical number as an Integer, or
    # nil when the field does not match the grammar.
    #
    # Primed numbers (`90'`, `182''`) used to be rejected outright,
    # which dropped those rows. The apostrophes are discarded here
    # because the model receives a plain `radical_number`.
    # NOTE(review): if consumers must tell `90` from `90'`, the model
    # needs an extra attribute. Digits are always read as decimal, so a
    # leading zero is not treated as octal.
    def parse_radical_number(field)
      return nil if field.nil?

      match = RADICAL_NUMBER_PATTERN.match(field)
      match && match[1].to_i
    end

    # Builds the models for one row. Each field expands to a list of
    # "U+XXXX" ids (one for a single codepoint, several for a range):
    #
    #   - one id on either side is paired with every id on the other;
    #   - several ids on both sides are paired positionally;
    #   - an empty side is paired as nil, so the other side's ids are
    #     still emitted (newer CJKRadicals.txt versions may leave the
    #     radical field empty — TODO confirm against the target UCD);
    #   - two empty sides produce no models.
    def yield_models(radical_number, cjk_radical_field, ideograph_field)
      cjk_ids = expand_ids(cjk_radical_field)
      ideograph_ids = expand_ids(ideograph_field)
      return [] if cjk_ids.empty? && ideograph_ids.empty?

      cjk_ids = [nil] if cjk_ids.empty?
      ideograph_ids = [nil] if ideograph_ids.empty?

      pairs =
        if cjk_ids.size == 1
          ideograph_ids.map { |ideograph_id| [cjk_ids.first, ideograph_id] }
        elsif ideograph_ids.size == 1
          cjk_ids.map { |cjk_id| [cjk_id, ideograph_ids.first] }
        else
          cjk_ids.zip(ideograph_ids)
        end

      pairs.map do |cjk_id, ideograph_id|
        Models::CjkRadical.new(
          radical_number: radical_number,
          cjk_radical_id: cjk_id,
          ideograph_id: ideograph_id
        )
      end
    end

    # Expands a codepoint-or-range field into "U+XXXX" strings. Returns
    # [] for nil or empty input.
    def expand_ids(field)
      return [] if field.nil? || field.empty?

      range = parse_codepoint_or_range(field)
      cps = range.is_a?(Range) ? range.to_a : [range]
      cps.map { |cp| format("U+%04X", cp) }
    end
  end
end
|
|
101
|
+
end
|
|
102
|
+
end
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
|
|
5
|
+
module Ucode
|
|
6
|
+
module Parsers
|
|
7
|
+
# Parses `DerivedAge.txt` — the Unicode version in which each
|
|
8
|
+
# codepoint was first assigned.
|
|
9
|
+
#
|
|
10
|
+
# Format (UAX #44):
|
|
11
|
+
# XXXX..YYYY; M.N
|
|
12
|
+
# XXXX; M.N
|
|
13
|
+
#
|
|
14
|
+
# The age is a Unicode version string like "1.1", "5.2", "15.0".
|
|
15
|
+
# Coordinator merges each row into `CodePoint#age`.
|
|
16
|
+
#
|
|
17
|
+
# Ranges are expanded per-codepoint (one Tuple per cp) because the
|
|
18
|
+
# Coordinator needs per-cp assignment for `CodePoint#age`.
|
|
19
|
+
# Record stream over `DerivedAge.txt` rows of the form `XXXX..YYYY; M.N`
# or `XXXX; M.N`. Ranges are expanded to one record per codepoint.
class DerivedAge < Base
  # Lightweight record yielded by `.each_record`: an Integer codepoint
  # and its age string.
  Tuple = Struct.new(:cp, :age, keyword_init: true) do
    # The codepoint formatted as a "U+XXXX" string.
    def cp_id
      format("U+%04X", cp)
    end
  end

  class << self
    # Yields one Tuple per codepoint covered by each usable row of
    # `path`; returns an Enumerator when called without a block, nil
    # otherwise.
    #
    # Rows are skipped when they have fewer than two fields, an empty
    # age, or an empty range field. The last case used to yield a Tuple
    # whose `cp` was nil, which breaks `cp_id`.
    def each_record(path)
      return enum_for(:each_record, path) unless block_given?

      each_line(path) do |line|
        fields = line.fields
        next if fields.length < 2

        range = parse_codepoint_or_range(fields[0])
        age = fields[1]
        next if range.nil? || age.nil? || age.empty?

        each_cp(range) { |cp| yield Tuple.new(cp: cp, age: age) }
      end

      nil
    end

    private

    # Yields every codepoint of a Range, or the single Integer itself.
    # Ranges are streamed rather than materialized into an array.
    def each_cp(range, &block)
      return range.each(&block) if range.is_a?(Range)

      yield range
    end
  end
end
|
|
58
|
+
end
|
|
59
|
+
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
require "ucode/models/binary_property_assignment"
|
|
5
|
+
|
|
6
|
+
module Ucode
|
|
7
|
+
module Parsers
|
|
8
|
+
# Parses `DerivedCoreProperties.txt` — derived binary properties
|
|
9
|
+
# (Alphabetic, Uppercase, White_Space, Bidi_Control, …).
|
|
10
|
+
#
|
|
11
|
+
# Format (UAX #44):
|
|
12
|
+
# XXXX..YYYY; Property_Name
|
|
13
|
+
# XXXX; Property_Name
|
|
14
|
+
#
|
|
15
|
+
# The file only lists positive assignments; absence means the
|
|
16
|
+
# property is false. Each yielded `BinaryPropertyAssignment` has
|
|
17
|
+
# `enabled: true`.
|
|
18
|
+
#
|
|
19
|
+
# Coordinator appends each `property_short` (resolved to the long
|
|
20
|
+
# form via PropertyAliases if needed) to `CodePoint#binary_properties`.
|
|
21
|
+
# Record stream over `DerivedCoreProperties.txt` rows of the form
# `XXXX..YYYY; Property_Name` or `XXXX; Property_Name`. Ranges are
# expanded to one record per codepoint.
class DerivedCoreProperties < Base
  class << self
    # Yields one Models::BinaryPropertyAssignment (`enabled: true`) per
    # codepoint covered by each usable row of `path`; returns an
    # Enumerator when called without a block, nil otherwise.
    #
    # Skipped rows:
    #   - fewer than two fields, an empty property name, or an empty
    #     range field;
    #   - rows with a non-empty third field. Since Unicode 15.1 the file
    #     also lists the enumerated property InCB as
    #     `cp; InCB; Linker|Consonant|Extend`. Emitting those as a binary
    #     `InCB` assignment would lose the value and mislabel the
    #     property, so they are left out.
    #     NOTE(review): InCB needs a dedicated, value-carrying record if
    #     the data set should include it.
    def each_record(path)
      return enum_for(:each_record, path) unless block_given?

      each_line(path) do |line|
        fields = line.fields
        next if fields.length < 2

        range = parse_codepoint_or_range(fields[0])
        property = fields[1]
        value = fields[2]
        next if range.nil? || property.nil? || property.empty?
        next unless value.nil? || value.empty?

        each_cp(range) { |cp| yield build_assignment(cp, property) }
      end

      nil
    end

    private

    # Yields every codepoint of a Range, or the single Integer itself.
    # Ranges are streamed rather than materialized into an array.
    def each_cp(range, &block)
      return range.each(&block) if range.is_a?(Range)

      yield range
    end

    # Builds the positive assignment record for one codepoint.
    def build_assignment(cp, property)
      Models::BinaryPropertyAssignment.new(
        codepoint: cp,
        property_short: property,
        enabled: true
      )
    end
  end
end
|
|
59
|
+
end
|
|
60
|
+
end
|