ucode 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CLAUDE.md +211 -0
- data/Gemfile +22 -0
- data/Gemfile.lock +406 -0
- data/README.md +469 -0
- data/Rakefile +18 -0
- data/TODO.new/00-README.md +66 -0
- data/TODO.new/01-pillar-terminology-alignment.md +69 -0
- data/TODO.new/02-audit-schema-design.md +255 -0
- data/TODO.new/03-directory-output-spec.md +203 -0
- data/TODO.new/04-fontist-org-contract.md +173 -0
- data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
- data/TODO.new/06-audit-namespace-skeleton.md +105 -0
- data/TODO.new/07-audit-models-port.md +132 -0
- data/TODO.new/08-extractors-cheap-port.md +113 -0
- data/TODO.new/09-extractors-expensive-port.md +99 -0
- data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
- data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
- data/TODO.new/12-formatters-port.md +115 -0
- data/TODO.new/13-directory-emitter.md +147 -0
- data/TODO.new/14-html-face-browser.md +144 -0
- data/TODO.new/15-html-library-browser.md +102 -0
- data/TODO.new/16-cli-audit-subcommands.md +142 -0
- data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
- data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
- data/TODO.new/19-fontisan-docs-update.md +155 -0
- data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
- data/TODO.new/21-canonical-unicode17-build.md +148 -0
- data/TODO.new/22-implementation-order.md +176 -0
- data/UCODE_CHANGELOG.md +97 -0
- data/exe/ucode +8 -0
- data/lib/ucode/aggregator.rb +77 -0
- data/lib/ucode/audit/block_aggregator.rb +90 -0
- data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
- data/lib/ucode/audit/context.rb +137 -0
- data/lib/ucode/audit/discrepancy_detector.rb +213 -0
- data/lib/ucode/audit/extractors/aggregations.rb +70 -0
- data/lib/ucode/audit/extractors/base.rb +21 -0
- data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
- data/lib/ucode/audit/extractors/coverage.rb +55 -0
- data/lib/ucode/audit/extractors/hinting.rb +199 -0
- data/lib/ucode/audit/extractors/identity.rb +65 -0
- data/lib/ucode/audit/extractors/licensing.rb +75 -0
- data/lib/ucode/audit/extractors/metrics.rb +108 -0
- data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
- data/lib/ucode/audit/extractors/provenance.rb +34 -0
- data/lib/ucode/audit/extractors/style.rb +88 -0
- data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
- data/lib/ucode/audit/extractors.rb +31 -0
- data/lib/ucode/audit/plane_aggregator.rb +37 -0
- data/lib/ucode/audit/registry.rb +63 -0
- data/lib/ucode/audit/script_aggregator.rb +92 -0
- data/lib/ucode/audit.rb +27 -0
- data/lib/ucode/cache.rb +113 -0
- data/lib/ucode/cli.rb +272 -0
- data/lib/ucode/commands/build.rb +68 -0
- data/lib/ucode/commands/cache.rb +46 -0
- data/lib/ucode/commands/fetch.rb +62 -0
- data/lib/ucode/commands/font_coverage.rb +57 -0
- data/lib/ucode/commands/glyphs.rb +136 -0
- data/lib/ucode/commands/lookup.rb +65 -0
- data/lib/ucode/commands/parse.rb +62 -0
- data/lib/ucode/commands/site.rb +33 -0
- data/lib/ucode/commands.rb +19 -0
- data/lib/ucode/config.rb +110 -0
- data/lib/ucode/coordinator/indices.rb +34 -0
- data/lib/ucode/coordinator.rb +397 -0
- data/lib/ucode/database.rb +214 -0
- data/lib/ucode/db_builder.rb +107 -0
- data/lib/ucode/error.rb +96 -0
- data/lib/ucode/fetch/code_charts.rb +57 -0
- data/lib/ucode/fetch/http.rb +83 -0
- data/lib/ucode/fetch/ucd_zip.rb +57 -0
- data/lib/ucode/fetch/unihan_zip.rb +57 -0
- data/lib/ucode/fetch.rb +14 -0
- data/lib/ucode/glyphs/cell_extractor.rb +130 -0
- data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
- data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
- data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
- data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
- data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
- data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
- data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
- data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
- data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
- data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
- data/lib/ucode/glyphs/grid.rb +30 -0
- data/lib/ucode/glyphs/grid_detector.rb +165 -0
- data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
- data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
- data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
- data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
- data/lib/ucode/glyphs/last_resort/source.rb +125 -0
- data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
- data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
- data/lib/ucode/glyphs/last_resort.rb +36 -0
- data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
- data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
- data/lib/ucode/glyphs/page_renderer.rb +221 -0
- data/lib/ucode/glyphs/path_bbox.rb +62 -0
- data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
- data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
- data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
- data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
- data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
- data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
- data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
- data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
- data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
- data/lib/ucode/glyphs/real_fonts.rb +32 -0
- data/lib/ucode/glyphs/writer.rb +250 -0
- data/lib/ucode/glyphs.rb +27 -0
- data/lib/ucode/index.rb +106 -0
- data/lib/ucode/index_builder.rb +94 -0
- data/lib/ucode/models/audit/audit_axis.rb +30 -0
- data/lib/ucode/models/audit/audit_diff.rb +77 -0
- data/lib/ucode/models/audit/audit_report.rb +137 -0
- data/lib/ucode/models/audit/baseline.rb +32 -0
- data/lib/ucode/models/audit/block_summary.rb +72 -0
- data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
- data/lib/ucode/models/audit/codepoint_range.rb +39 -0
- data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
- data/lib/ucode/models/audit/color_capabilities.rb +91 -0
- data/lib/ucode/models/audit/discrepancy.rb +38 -0
- data/lib/ucode/models/audit/duplicate_group.rb +23 -0
- data/lib/ucode/models/audit/embedding_type.rb +81 -0
- data/lib/ucode/models/audit/field_change.rb +28 -0
- data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
- data/lib/ucode/models/audit/gasp_range.rb +63 -0
- data/lib/ucode/models/audit/hinting.rb +99 -0
- data/lib/ucode/models/audit/library_summary.rb +40 -0
- data/lib/ucode/models/audit/licensing.rb +48 -0
- data/lib/ucode/models/audit/metrics.rb +111 -0
- data/lib/ucode/models/audit/named_instance.rb +41 -0
- data/lib/ucode/models/audit/opentype_layout.rb +38 -0
- data/lib/ucode/models/audit/plane_summary.rb +31 -0
- data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
- data/lib/ucode/models/audit/script_features.rb +28 -0
- data/lib/ucode/models/audit/script_summary.rb +54 -0
- data/lib/ucode/models/audit/variation_detail.rb +42 -0
- data/lib/ucode/models/audit.rb +50 -0
- data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
- data/lib/ucode/models/bidi_mirroring.rb +19 -0
- data/lib/ucode/models/binary_property_assignment.rb +26 -0
- data/lib/ucode/models/block.rb +36 -0
- data/lib/ucode/models/case_folding_rule.rb +23 -0
- data/lib/ucode/models/cjk_radical.rb +23 -0
- data/lib/ucode/models/codepoint/bidi.rb +28 -0
- data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
- data/lib/ucode/models/codepoint/case_folding.rb +25 -0
- data/lib/ucode/models/codepoint/casing.rb +32 -0
- data/lib/ucode/models/codepoint/decomposition.rb +27 -0
- data/lib/ucode/models/codepoint/display.rb +24 -0
- data/lib/ucode/models/codepoint/emoji.rb +29 -0
- data/lib/ucode/models/codepoint/hangul.rb +20 -0
- data/lib/ucode/models/codepoint/identifier.rb +30 -0
- data/lib/ucode/models/codepoint/indic.rb +20 -0
- data/lib/ucode/models/codepoint/joining.rb +20 -0
- data/lib/ucode/models/codepoint/normalization.rb +35 -0
- data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
- data/lib/ucode/models/codepoint.rb +122 -0
- data/lib/ucode/models/name_alias.rb +21 -0
- data/lib/ucode/models/named_sequence.rb +19 -0
- data/lib/ucode/models/names_list_entry.rb +38 -0
- data/lib/ucode/models/plane.rb +36 -0
- data/lib/ucode/models/property_alias.rb +24 -0
- data/lib/ucode/models/property_value_alias.rb +26 -0
- data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
- data/lib/ucode/models/relationship/cross_reference.rb +17 -0
- data/lib/ucode/models/relationship/footnote.rb +24 -0
- data/lib/ucode/models/relationship/informal_alias.rb +18 -0
- data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
- data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
- data/lib/ucode/models/relationship.rb +57 -0
- data/lib/ucode/models/script.rb +41 -0
- data/lib/ucode/models/special_casing_rule.rb +28 -0
- data/lib/ucode/models/standardized_variant.rb +24 -0
- data/lib/ucode/models/unihan_entry.rb +23 -0
- data/lib/ucode/models.rb +47 -0
- data/lib/ucode/parsers/auxiliary.rb +26 -0
- data/lib/ucode/parsers/base.rb +137 -0
- data/lib/ucode/parsers/bidi_brackets.rb +41 -0
- data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
- data/lib/ucode/parsers/blocks.rb +63 -0
- data/lib/ucode/parsers/case_folding.rb +53 -0
- data/lib/ucode/parsers/cjk_radicals.rb +102 -0
- data/lib/ucode/parsers/derived_age.rb +59 -0
- data/lib/ucode/parsers/derived_core_properties.rb +60 -0
- data/lib/ucode/parsers/extracted_properties.rb +74 -0
- data/lib/ucode/parsers/name_aliases.rb +44 -0
- data/lib/ucode/parsers/named_sequences.rb +51 -0
- data/lib/ucode/parsers/names_list.rb +250 -0
- data/lib/ucode/parsers/property_aliases.rb +41 -0
- data/lib/ucode/parsers/property_value_aliases.rb +46 -0
- data/lib/ucode/parsers/script_extensions.rb +64 -0
- data/lib/ucode/parsers/scripts.rb +60 -0
- data/lib/ucode/parsers/special_casing.rb +62 -0
- data/lib/ucode/parsers/standardized_variants.rb +56 -0
- data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
- data/lib/ucode/parsers/unicode_data.rb +268 -0
- data/lib/ucode/parsers/unihan.rb +125 -0
- data/lib/ucode/parsers.rb +35 -0
- data/lib/ucode/range_entry.rb +58 -0
- data/lib/ucode/repo/aggregate_writer.rb +364 -0
- data/lib/ucode/repo/atomic_writes.rb +48 -0
- data/lib/ucode/repo/codepoint_writer.rb +96 -0
- data/lib/ucode/repo/paths.rb +122 -0
- data/lib/ucode/repo.rb +22 -0
- data/lib/ucode/site/config_emitter.rb +124 -0
- data/lib/ucode/site/generator.rb +178 -0
- data/lib/ucode/site/search_index.rb +68 -0
- data/lib/ucode/site/template/.gitignore +4 -0
- data/lib/ucode/site/template/.vitepress/config.ts +8 -0
- data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
- data/lib/ucode/site/template/char/[codepoint].md +13 -0
- data/lib/ucode/site/template/components/BlockView.vue +57 -0
- data/lib/ucode/site/template/components/CharView.vue +85 -0
- data/lib/ucode/site/template/components/PlaneView.vue +56 -0
- data/lib/ucode/site/template/components/SearchView.vue +66 -0
- data/lib/ucode/site/template/index.md +25 -0
- data/lib/ucode/site/template/package.json +18 -0
- data/lib/ucode/site/template/search.md +9 -0
- data/lib/ucode/site.rb +13 -0
- data/lib/ucode/version.rb +5 -0
- data/lib/ucode/version_resolver.rb +76 -0
- data/lib/ucode.rb +74 -0
- data/ucode.gemspec +56 -0
- metadata +404 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
|
|
5
|
+
module Ucode
|
|
6
|
+
module Parsers
|
|
7
|
+
# Generic range/value parser for the files under `extracted/`
|
|
8
|
+
# (DerivedGeneralCategory, DerivedJoiningGroup, DerivedLineBreak,
|
|
9
|
+
# DerivedNumericType, …).
|
|
10
|
+
#
|
|
11
|
+
# Format is uniform across every file (UAX #44):
|
|
12
|
+
# XXXX..YYYY; value
|
|
13
|
+
# XXXX; value
|
|
14
|
+
#
|
|
15
|
+
# The parser is intentionally dumb: it yields `(first, last, value)`
|
|
16
|
+
# triples without knowing what the value means. The Coordinator
|
|
17
|
+
# dispatches by source file name (DerivedGeneralCategory.txt →
|
|
18
|
+
# CodePoint#general_category, etc.). This decoupling means a new
|
|
19
|
+
# extracted file adds one line to the Coordinator, not a new parser.
|
|
20
|
+
#
|
|
21
|
+
# Ranges are NOT expanded — yielding per-codepoint would explode the
|
|
22
|
+
# stream for CJK ranges. The Coordinator expands lazily if needed.
|
|
23
|
+
class ExtractedProperties < Base
  # Plain record handed to the block given to `.each_record`; a Struct
  # keeps the per-line cost low. The members are called `range_first` /
  # `range_last` (rather than `first` / `last`) so that the Struct's
  # own `Enumerable#first` is not overridden.
  Tuple = Struct.new(:range_first, :range_last, :value, keyword_init: true) do
    # @return [Range] inclusive span from `range_first` to `range_last`.
    def range
      range_first..range_last
    end

    # Formats every codepoint in the span as a `U+XXXX` id. The result
    # is built eagerly as an Array, so it is costly on very wide ranges.
    #
    # @return [Array<String>]
    def cp_ids
      range.map { |codepoint| format("U+%04X", codepoint) }
    end

    # @return [Boolean] true when both ends of the span are equal.
    def single?
      range_first == range_last
    end
  end

  class << self
    # Hands the block one Tuple per line that has at least two fields
    # and a non-empty second field. Ranges are passed on as their two
    # end points and are not expanded here.
    #
    # `each_line` and `parse_codepoint_or_range` are not defined in this
    # class (presumably inherited from Base).
    #
    # @param path the file to read (passed through to `each_line`)
    # @return [Enumerator] when no block is given
    # @return [nil] after all lines have been processed
    def each_record(path)
      return to_enum(:each_record, path) unless block_given?

      each_line(path) do |line|
        columns = line.fields
        next unless columns.length >= 2

        # The codepoint field is parsed before the value is checked,
        # so a bad first field is still reported on value-less lines.
        span = parse_codepoint_or_range(columns[0])
        assigned = columns[1]
        next if assigned.nil? || assigned.empty?

        yield build_tuple(span, assigned)
      end

      nil
    end

    private

    # Wraps either a Range or a single codepoint in a Tuple; a single
    # codepoint becomes a span whose two ends are the same value.
    def build_tuple(range, value)
      lower, upper = range.is_a?(Range) ? [range.first, range.last] : [range, range]
      Tuple.new(range_first: lower, range_last: upper, value: value)
    end
  end
end
|
|
73
|
+
end
|
|
74
|
+
end
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
require "ucode/models/name_alias"
|
|
5
|
+
|
|
6
|
+
module Ucode
|
|
7
|
+
module Parsers
|
|
8
|
+
# Parses `NameAliases.txt` — alternate / correction / control names
|
|
9
|
+
# attached to a codepoint.
|
|
10
|
+
#
|
|
11
|
+
# Format (UAX #44):
|
|
12
|
+
# cp; alias_text; type
|
|
13
|
+
#
|
|
14
|
+
# `type` is one of: correction, control, alternate, figment,
|
|
15
|
+
# abbreviation.
|
|
16
|
+
class NameAliases < Base
  class << self
    # Hands the block one Models::NameAlias for every line that carries
    # a codepoint, a non-empty alias text and a non-empty alias type.
    # Lines with fewer than three fields are skipped.
    #
    # `each_line` and `parse_hex_cp` are not defined in this class
    # (presumably inherited from Base).
    #
    # @param path the file to read (passed through to `each_line`)
    # @return [Enumerator] when no block is given
    # @return [nil] after all lines have been processed
    def each_record(path)
      return to_enum(:each_record, path) unless block_given?

      each_line(path) do |line|
        columns = line.fields
        next unless columns.length >= 3

        # The codepoint is parsed before the text/type checks, matching
        # the order in which a malformed line would be reported.
        codepoint = parse_hex_cp(columns[0])
        alias_text = columns[1]
        alias_type = columns[2]
        next if [alias_text, alias_type].any? { |part| part.nil? || part.empty? }

        yield Models::NameAlias.new(codepoint: codepoint, text: alias_text, type: alias_type)
      end

      nil
    end
  end
end
|
|
43
|
+
end
|
|
44
|
+
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
require "ucode/models/named_sequence"
|
|
5
|
+
|
|
6
|
+
module Ucode
|
|
7
|
+
module Parsers
|
|
8
|
+
# Parses `NamedSequences.txt` — named multi-codepoint sequences.
|
|
9
|
+
#
|
|
10
|
+
# Format (UAX #44):
|
|
11
|
+
# cp1 cp2 cp3 ...; Name
|
|
12
|
+
#
|
|
13
|
+
# The first field is a space-separated list of hex codepoints; the
|
|
14
|
+
# second is the human-readable name.
|
|
15
|
+
class NamedSequences < Base
  class << self
    # Hands the block one Models::NamedSequence for every line that has
    # at least two fields and a non-empty name in the second one.
    #
    # `each_line` and `parse_hex_cp` are not defined in this class
    # (presumably inherited from Base).
    #
    # @param path the file to read (passed through to `each_line`)
    # @return [Enumerator] when no block is given
    # @return [nil] after all lines have been processed
    def each_record(path)
      return to_enum(:each_record, path) unless block_given?

      each_line(path) do |line|
        columns = line.fields
        next unless columns.length >= 2

        hex_list = columns[0]
        sequence_name = columns[1]
        next if sequence_name.nil? || sequence_name.empty?

        yield Models::NamedSequence.new(
          name: sequence_name,
          codepoint_ids: parse_sequence(hex_list)
        )
      end

      nil
    end

    private

    # Turns a whitespace-separated list of hex codepoints into an Array
    # of `U+XXXX` ids. A nil or empty field gives an empty Array.
    def parse_sequence(field)
      field.to_s.scan(/\S+/).map { |token| format("U+%04X", parse_hex_cp(token)) }
    end
  end
end
|
|
50
|
+
end
|
|
51
|
+
end
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
require "ucode/error"
|
|
5
|
+
require "ucode/models/names_list_entry"
|
|
6
|
+
require "ucode/models/relationship"
|
|
7
|
+
|
|
8
|
+
module Ucode
|
|
9
|
+
module Parsers
|
|
10
|
+
# Parses `NamesList.txt` — the human-curated annotated names file
|
|
11
|
+
# Unicode uses to render the Code Charts' name pages.
|
|
12
|
+
#
|
|
13
|
+
# Format (per the file's own header):
|
|
14
|
+
#
|
|
15
|
+
# cp; Name ← header line at column 0 → new NamesListEntry
|
|
16
|
+
# → U+XXXX note ← indented annotation lines
|
|
17
|
+
# × U+XXXX U+YYYY note
|
|
18
|
+
# ≡ U+XXXX note
|
|
19
|
+
# = alias text
|
|
20
|
+
# * footnote text
|
|
21
|
+
#
|
|
22
|
+
# Plus dropped lines:
|
|
23
|
+
#
|
|
24
|
+
# `# comment` ← file-level comment
|
|
25
|
+
# `% instruction` ← dropped (instructional)
|
|
26
|
+
# `~ heading` ← dropped (table-of-contents)
|
|
27
|
+
#
|
|
28
|
+
# Annotation scopes attach to the most recent header. Lines that do
|
|
29
|
+
# not start a new header, and are not indented annotations, are silently ignored.
|
|
30
|
+
#
|
|
31
|
+
# Implemented as a small state machine: one current NamesListEntry is
|
|
32
|
+
# held in a local; header lines flush the previous entry, annotation
|
|
33
|
+
# lines append to the current entry. Regex cannot express this
|
|
34
|
+
# scoping.
|
|
35
|
+
class NamesList < Base
  # NOTE(review): the NamesList.txt published in the UCD is documented
  # as tab-delimited (`XXXX<TAB>NAME`) with ASCII annotation markers
  # (`x`, `:`, `#`, `=`, `*`, `%`, `~`). The patterns and markers below
  # expect a `cp; Name` header and `→` / `×` / `≡` markers instead —
  # confirm the input fed to this parser really uses that form.

  # Header line: 4-6 hex digits at column 0, a `;`, then the name.
  HEADER_PATTERN = /\A([0-9A-Fa-f]{4,6})\s*;\s*(.+?)\s*\z/.freeze
  private_constant :HEADER_PATTERN

  # A leading `U+XXXX` reference inside an annotation payload.
  CP_REF_PATTERN = /\AU\+([0-9A-Fa-f]{4,6})\b/.freeze
  private_constant :CP_REF_PATTERN

  # Trailing `(rendered: X)` suffix of a sample-sequence note.
  RENDERED_PATTERN = /\(rendered:\s*(.+?)\)\z/.freeze
  private_constant :RENDERED_PATTERN

  MARKER_CROSS_REFERENCE = "→".freeze
  MARKER_SAMPLE_SEQUENCE = "×".freeze
  MARKER_COMPAT_EQUIV = "≡".freeze
  MARKER_ALIAS = "=".freeze
  MARKER_FOOTNOTE = "*".freeze
  MARKER_INSTRUCTIONAL = "%".freeze
  MARKER_HEADING = "~".freeze

  # Value stored in the `source` attribute of every relationship built here.
  SOURCE_TAG = "names_list".freeze
  private_constant :SOURCE_TAG

  class << self
    # Yields one NamesListEntry per header line, after all indented
    # annotation lines that follow it have been attached. Annotation
    # lines seen before the first header are skipped.
    #
    # The file is read as UTF-8 explicitly: three of the markers are
    # multibyte characters, and under a non-UTF-8 default external
    # encoding they would never compare equal to the first character of
    # a line, silently dropping those annotations.
    #
    # @param path the file to read; converted with `to_s`
    # @return [Enumerator] when no block is given
    # @return [nil] after the last entry has been yielded
    # @raise [MalformedLineError] re-raised after `:file` and `:line`
    #   have been filled into its context where they were unset
    def each_record(path)
      return enum_for(:each_record, path) unless block_given?

      entry = nil
      lineno = 0
      path_str = path.to_s

      File.foreach(path_str, encoding: "UTF-8") do |raw|
        lineno += 1
        line = raw.chomp

        begin
          if header_line?(line)
            # A new header closes the previous entry.
            yield entry if entry
            entry = build_header(line)
          elsif entry && indented_line?(line)
            parsed = parse_annotation(line)
            attach_annotation(entry, parsed) if parsed
          end
          # Anything else (blank, comment, heading, pre-header) is skipped.
        rescue MalformedLineError => e
          e.context[:file] ||= path_str
          e.context[:line] ||= lineno
          raise
        end
      end

      # Flush the entry that was still open at end of file.
      yield entry if entry
      nil
    end

    private

    # True for a line that starts at column 0 with a hex codepoint and
    # carries the `;` separator. Lines starting with `#`, `%`, `~` or
    # `@` are rejected before the pattern is tried.
    def header_line?(line)
      return false if line.empty?
      return false if line.start_with?("#", "%", "~", "@")

      HEADER_PATTERN.match?(line)
    end

    # True for a non-empty line whose first character is a space or tab.
    def indented_line?(line)
      return false if line.empty?

      [" ", "\t"].include?(line[0])
    end

    # Builds the NamesListEntry for a header line.
    #
    # NOTE(review): the raw text is stored under `:line`, so the
    # `||= lineno` in `each_record` leaves it in place and the line
    # number is not recorded for this error — confirm which meaning
    # `context[:line]` is supposed to have.
    #
    # @raise [MalformedLineError] when the line is not a header
    def build_header(line)
      m = line.match(HEADER_PATTERN)
      unless m
        raise MalformedLineError.new(
          "invalid NamesList.txt header: #{line.inspect}",
          context: { line: line }
        )
      end

      Models::NamesListEntry.new(codepoint: m[1].to_i(16), name: m[2])
    end

    # Parses one indented annotation line into a
    # `[container_attribute, relationship]` pair. Returns nil for the
    # dropped markers (`%`, `~`), for unknown markers and for lines
    # that are blank after the indent.
    def parse_annotation(line)
      stripped = line.lstrip
      marker = stripped[0]
      rest = stripped[1..].to_s.lstrip

      case marker
      when MARKER_CROSS_REFERENCE
        [:cross_references, build_cross_reference(*split_targets_and_note(rest))]
      when MARKER_SAMPLE_SEQUENCE
        [:sample_sequences, build_sample_sequence(*split_targets_and_note(rest))]
      when MARKER_COMPAT_EQUIV
        [:compatibility_equivalents, build_compat_equiv(*split_targets_and_note(rest))]
      when MARKER_ALIAS
        [:informal_aliases, build_alias(rest)]
      when MARKER_FOOTNOTE
        [:footnotes, build_footnote(rest)]
      end
    end

    def build_cross_reference(target_ids, note)
      Models::Relationship::CrossReference.new(
        target_ids: target_ids,
        description: note.empty? ? nil : note,
        source: SOURCE_TAG
      )
    end

    # The `(rendered: X)` suffix, when present, is copied into
    # `rendered_form`; the description keeps the full note text.
    def build_sample_sequence(target_ids, note)
      Models::Relationship::SampleSequence.new(
        target_ids: target_ids,
        description: note.empty? ? nil : note,
        rendered_form: extract_rendered(note),
        source: SOURCE_TAG
      )
    end

    def build_compat_equiv(target_ids, note)
      Models::Relationship::CompatEquiv.new(
        target_ids: target_ids,
        description: note.empty? ? nil : note,
        source: SOURCE_TAG
      )
    end

    def build_alias(text)
      Models::Relationship::InformalAlias.new(
        description: text.empty? ? nil : text,
        source: SOURCE_TAG
      )
    end

    def build_footnote(text)
      Models::Relationship::Footnote.new(
        description: text.empty? ? nil : text,
        category: detect_footnote_category(text),
        source: SOURCE_TAG
      )
    end

    # Splits a `U+XXXX [U+YYYY ...] note` payload into its leading
    # target ids (re-formatted as zero-padded `U+XXXX`) and the prose
    # that follows them.
    #
    # @return [Array(Array<String>, String)]
    def split_targets_and_note(rest)
      targets = []
      remaining = rest

      while (m = remaining.match(CP_REF_PATTERN))
        targets << format("U+%04X", m[1].to_i(16))
        remaining = remaining[m[0].length..].to_s.lstrip
      end

      [targets, remaining]
    end

    # Returns the stripped `X` of a trailing `(rendered: X)` suffix, or
    # nil when the note has no such suffix.
    def extract_rendered(note)
      m = note.match(RENDERED_PATTERN)
      m && m[1].strip
    end

    # Heuristic category keyed on the lower-cased first word of the
    # footnote; anything unrecognised (or empty) is "general".
    def detect_footnote_category(text)
      first = text.split(/\s+/, 2).first&.downcase
      case first
      when "cap", "capital", "small", "lowercase", "uppercase",
           "letter", "letterform", "glyph", "shape"
        "letterform"
      when "see", "compare", "vs", "versus", "distinguished"
        "comparison"
      when "history", "origin", "originally", "introduced"
        "history"
      else
        "general"
      end
    end

    # Appends the relationship to the entry collection named by the
    # first element of `parsed`.
    def attach_annotation(entry, parsed)
      attr_name, instance = parsed
      entry.public_send(attr_name) << instance
    end
  end
end
|
|
249
|
+
end
|
|
250
|
+
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
require "ucode/models/property_alias"
|
|
5
|
+
|
|
6
|
+
module Ucode
|
|
7
|
+
module Parsers
|
|
8
|
+
# Parses `PropertyAliases.txt` — Unicode property short ↔ long name.
|
|
9
|
+
#
|
|
10
|
+
# Format (UAX #44):
|
|
11
|
+
# short; long_name; other_alias; other_alias; ...
|
|
12
|
+
#
|
|
13
|
+
# Example: `ccc; Canonical_Combining_Class; ccc`
|
|
14
|
+
class PropertyAliases < Base
  class << self
    # Hands the block one Models::PropertyAlias for every line that has
    # at least a short and a long name. Any further fields that are
    # neither nil nor empty are collected into `other_aliases`.
    #
    # `each_line` is not defined in this class (presumably inherited
    # from Base).
    #
    # @param path the file to read (passed through to `each_line`)
    # @return [Enumerator] when no block is given
    # @return [nil] after all lines have been processed
    def each_record(path)
      return to_enum(:each_record, path) unless block_given?

      each_line(path) do |line|
        columns = line.fields
        next unless columns.length >= 2

        short_name, long_name, *extras = columns

        yield Models::PropertyAlias.new(
          short: short_name,
          long: long_name,
          other_aliases: extras.compact.reject(&:empty?)
        )
      end

      nil
    end
  end
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
require "ucode/models/property_value_alias"
|
|
5
|
+
|
|
6
|
+
module Ucode
|
|
7
|
+
module Parsers
|
|
8
|
+
# Parses `PropertyValueAliases.txt` — per-property value aliases.
|
|
9
|
+
#
|
|
10
|
+
# Format (UAX #44):
|
|
11
|
+
# property; short_value; long_value; other_alias; ...
|
|
12
|
+
#
|
|
13
|
+
# Examples:
|
|
14
|
+
# gc; Lu; Uppercase_Letter
|
|
15
|
+
# sc; Latn; Latin
|
|
16
|
+
# ccc; 0; NR
|
|
17
|
+
class PropertyValueAliases < Base
  class << self
    # Hands the block one Models::PropertyValueAlias for every line
    # that has at least a property, a short value and a long value.
    # Any further fields that are neither nil nor empty are collected
    # into `other_aliases`.
    #
    # `each_line` is not defined in this class (presumably inherited
    # from Base).
    #
    # @param path the file to read (passed through to `each_line`)
    # @return [Enumerator] when no block is given
    # @return [nil] after all lines have been processed
    def each_record(path)
      return to_enum(:each_record, path) unless block_given?

      each_line(path) do |line|
        columns = line.fields
        next unless columns.length >= 3

        property_name, short_value, long_value, *extras = columns

        yield Models::PropertyValueAlias.new(
          property: property_name,
          short: short_value,
          long: long_value,
          other_aliases: extras.compact.reject(&:empty?)
        )
      end

      nil
    end
  end
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
|
|
5
|
+
module Ucode
|
|
6
|
+
module Parsers
|
|
7
|
+
# Parses `ScriptExtensions.txt` — additional scripts per codepoint.
|
|
8
|
+
#
|
|
9
|
+
# Format (UAX #44):
|
|
10
|
+
# XXXX..XXXX ; Latn Grek Cyrl # trailing comment
|
|
11
|
+
#
|
|
12
|
+
# A codepoint can be associated with many scripts. The parser yields
|
|
13
|
+
# one Tuple per (codepoint, script_code) pair; the Coordinator merges
|
|
14
|
+
# these into CodePoint#script_extensions.
|
|
15
|
+
#
|
|
16
|
+
# `script_code` is the ISO 15924 4-letter code already present in the
|
|
17
|
+
# source file (e.g. `Latn`, `Grek`). No alias resolution is needed.
|
|
18
|
+
class ScriptExtensions < Base
  # One (codepoint, script code) pair yielded by `.each_record`.
  Tuple = Struct.new(:cp, :script_code, keyword_init: true) do
    # @return [String] the codepoint formatted as zero-padded `U+XXXX`.
    def cp_id
      format("U+%04X", cp)
    end
  end

  class << self
    # Yields one Tuple per (codepoint, script_code) pair: every
    # codepoint of the line's range is combined with every code listed
    # in the second field. Lines with fewer than two fields, or with a
    # nil/empty second field, are skipped.
    #
    # Script codes are taken as the whitespace-separated tokens of the
    # second field; empty tokens are never produced, so leading or
    # trailing whitespace in the field cannot yield a Tuple whose
    # `script_code` is "".
    #
    # `each_line` and `parse_codepoint_or_range` are not defined in this
    # class (presumably inherited from Base).
    #
    # @param path the file to read (passed through to `each_line`)
    # @return [Enumerator] when no block is given
    # @return [nil] after all lines have been processed
    def each_record(path)
      return enum_for(:each_record, path) unless block_given?

      each_line(path) do |line|
        fields = line.fields
        next if fields.length < 2

        codes_field = fields[1]
        next if codes_field.nil? || codes_field.empty?

        range = parse_codepoint_or_range(fields[0])
        codes = codes_field.scan(/\S+/)

        each_cp(range) do |cp|
          codes.each do |code|
            yield Tuple.new(cp: cp, script_code: code)
          end
        end
      end

      nil
    end

    private

    # Yields each codepoint of a Range, or the single value itself when
    # `range` is not a Range.
    def each_cp(range, &block)
      if range.is_a?(Range)
        range.each(&block)
      else
        yield range
      end
    end
  end
end
|
|
63
|
+
end
|
|
64
|
+
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
require "ucode/models/script"
|
|
5
|
+
|
|
6
|
+
module Ucode
|
|
7
|
+
module Parsers
|
|
8
|
+
# Parses `Scripts.txt` — the primary Script property assignment per
|
|
9
|
+
# codepoint range.
|
|
10
|
+
#
|
|
11
|
+
# Format (UAX #44):
|
|
12
|
+
# XXXX..XXXX ; Script_Name # trailing comment
|
|
13
|
+
# XXXX ; Script_Name # trailing comment
|
|
14
|
+
#
|
|
15
|
+
# Yields one Script per line, with `range_first` and `range_last`
|
|
16
|
+
# set. The Coordinator bsearches the resulting sorted array by cp.
|
|
17
|
+
# The ISO 15924 `code` is resolved later by the Coordinator via
|
|
18
|
+
# PropertyValueAliases (property=sc).
|
|
19
|
+
class Scripts < Base
  class << self
    # Hands the block one Models::Script per line, carrying the script
    # name and the first/last codepoint of the line's range. Lines are
    # skipped when they have fewer than two fields, or when the name is
    # nil, empty, or the literal "@missing".
    #
    # `each_line` and `parse_codepoint_or_range` are not defined in this
    # class (presumably inherited from Base).
    #
    # @param path the file to read (passed through to `each_line`)
    # @return [Enumerator] when no block is given
    # @return [nil] after all lines have been processed
    def each_record(path)
      return to_enum(:each_record, path) unless block_given?

      each_line(path) do |line|
        columns = line.fields
        next unless columns.length >= 2

        script_name = columns[1]
        next if script_name.nil? || script_name.empty? || script_name == "@missing"

        yield build_script(parse_codepoint_or_range(columns[0]), script_name)
      end

      nil
    end

    private

    # Builds the Script model for one parsed range/name pair.
    def build_script(range, name)
      lower, upper = bounds_of(range)
      Models::Script.new(name: name, range_first: lower, range_last: upper)
    end

    # Returns `[first, last]` for a Range; a single codepoint is
    # returned as a pair of itself.
    def bounds_of(range)
      return [range, range] unless range.is_a?(Range)

      [range.begin, range.end]
    end
  end
end
|
|
59
|
+
end
|
|
60
|
+
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ucode/parsers/base"
|
|
4
|
+
require "ucode/models/special_casing_rule"
|
|
5
|
+
|
|
6
|
+
module Ucode
|
|
7
|
+
module Parsers
|
|
8
|
+
# Parses `SpecialCasing.txt` — context-sensitive case mappings.
|
|
9
|
+
#
|
|
10
|
+
# Format (UAX #44):
|
|
11
|
+
# cp; lower; title; upper; [conditions;] # name
|
|
12
|
+
#
|
|
13
|
+
# The `lower`/`title`/`upper` fields are either empty or a
|
|
14
|
+
# space-separated list of hex codepoints. `conditions` is a
|
|
15
|
+
# space-separated list of context identifiers (`Final_Sigma`,
|
|
16
|
+
# `After_I`) and/or locale codes (`tr`, `az`). Filtering by
|
|
17
|
+
# condition is the consumer's job.
|
|
18
|
+
class SpecialCasing < Base
  class << self
    # Hands the block one Models::SpecialCasingRule for every line that
    # has at least four fields (codepoint, lower, title, upper). The
    # optional fifth field supplies the conditions, and the line's
    # comment is passed through unchanged.
    #
    # `each_line` and `parse_hex_cp` are not defined in this class
    # (presumably inherited from Base).
    #
    # @param path the file to read (passed through to `each_line`)
    # @return [Enumerator] when no block is given
    # @return [nil] after all lines have been processed
    def each_record(path)
      return to_enum(:each_record, path) unless block_given?

      each_line(path) do |line|
        columns = line.fields
        next unless columns.length >= 4

        yield Models::SpecialCasingRule.new(
          codepoint: parse_hex_cp(columns[0]),
          lower_ids: parse_mapping(columns[1]),
          title_ids: parse_mapping(columns[2]),
          upper_ids: parse_mapping(columns[3]),
          conditions: parse_conditions(columns[4]),
          comment: line.comment
        )
      end

      nil
    end

    private

    # Turns a whitespace-separated list of hex codepoints into an Array
    # of `U+XXXX` ids. A nil or empty field gives an empty Array.
    def parse_mapping(field)
      field.to_s.scan(/\S+/).map { |token| format("U+%04X", parse_hex_cp(token)) }
    end

    # Splits the conditions field into its whitespace-separated tokens.
    # A nil or empty field gives an empty Array.
    def parse_conditions(field)
      field.to_s.scan(/\S+/)
    end
  end
end
|
|
61
|
+
end
|
|
62
|
+
end
|