ucode 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CLAUDE.md +211 -0
- data/Gemfile +22 -0
- data/Gemfile.lock +406 -0
- data/README.md +469 -0
- data/Rakefile +18 -0
- data/TODO.new/00-README.md +66 -0
- data/TODO.new/01-pillar-terminology-alignment.md +69 -0
- data/TODO.new/02-audit-schema-design.md +255 -0
- data/TODO.new/03-directory-output-spec.md +203 -0
- data/TODO.new/04-fontist-org-contract.md +173 -0
- data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
- data/TODO.new/06-audit-namespace-skeleton.md +105 -0
- data/TODO.new/07-audit-models-port.md +132 -0
- data/TODO.new/08-extractors-cheap-port.md +113 -0
- data/TODO.new/09-extractors-expensive-port.md +99 -0
- data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
- data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
- data/TODO.new/12-formatters-port.md +115 -0
- data/TODO.new/13-directory-emitter.md +147 -0
- data/TODO.new/14-html-face-browser.md +144 -0
- data/TODO.new/15-html-library-browser.md +102 -0
- data/TODO.new/16-cli-audit-subcommands.md +142 -0
- data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
- data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
- data/TODO.new/19-fontisan-docs-update.md +155 -0
- data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
- data/TODO.new/21-canonical-unicode17-build.md +148 -0
- data/TODO.new/22-implementation-order.md +176 -0
- data/UCODE_CHANGELOG.md +97 -0
- data/exe/ucode +8 -0
- data/lib/ucode/aggregator.rb +77 -0
- data/lib/ucode/audit/block_aggregator.rb +90 -0
- data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
- data/lib/ucode/audit/context.rb +137 -0
- data/lib/ucode/audit/discrepancy_detector.rb +213 -0
- data/lib/ucode/audit/extractors/aggregations.rb +70 -0
- data/lib/ucode/audit/extractors/base.rb +21 -0
- data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
- data/lib/ucode/audit/extractors/coverage.rb +55 -0
- data/lib/ucode/audit/extractors/hinting.rb +199 -0
- data/lib/ucode/audit/extractors/identity.rb +65 -0
- data/lib/ucode/audit/extractors/licensing.rb +75 -0
- data/lib/ucode/audit/extractors/metrics.rb +108 -0
- data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
- data/lib/ucode/audit/extractors/provenance.rb +34 -0
- data/lib/ucode/audit/extractors/style.rb +88 -0
- data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
- data/lib/ucode/audit/extractors.rb +31 -0
- data/lib/ucode/audit/plane_aggregator.rb +37 -0
- data/lib/ucode/audit/registry.rb +63 -0
- data/lib/ucode/audit/script_aggregator.rb +92 -0
- data/lib/ucode/audit.rb +27 -0
- data/lib/ucode/cache.rb +113 -0
- data/lib/ucode/cli.rb +272 -0
- data/lib/ucode/commands/build.rb +68 -0
- data/lib/ucode/commands/cache.rb +46 -0
- data/lib/ucode/commands/fetch.rb +62 -0
- data/lib/ucode/commands/font_coverage.rb +57 -0
- data/lib/ucode/commands/glyphs.rb +136 -0
- data/lib/ucode/commands/lookup.rb +65 -0
- data/lib/ucode/commands/parse.rb +62 -0
- data/lib/ucode/commands/site.rb +33 -0
- data/lib/ucode/commands.rb +19 -0
- data/lib/ucode/config.rb +110 -0
- data/lib/ucode/coordinator/indices.rb +34 -0
- data/lib/ucode/coordinator.rb +397 -0
- data/lib/ucode/database.rb +214 -0
- data/lib/ucode/db_builder.rb +107 -0
- data/lib/ucode/error.rb +96 -0
- data/lib/ucode/fetch/code_charts.rb +57 -0
- data/lib/ucode/fetch/http.rb +83 -0
- data/lib/ucode/fetch/ucd_zip.rb +57 -0
- data/lib/ucode/fetch/unihan_zip.rb +57 -0
- data/lib/ucode/fetch.rb +14 -0
- data/lib/ucode/glyphs/cell_extractor.rb +130 -0
- data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
- data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
- data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
- data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
- data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
- data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
- data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
- data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
- data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
- data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
- data/lib/ucode/glyphs/grid.rb +30 -0
- data/lib/ucode/glyphs/grid_detector.rb +165 -0
- data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
- data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
- data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
- data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
- data/lib/ucode/glyphs/last_resort/source.rb +125 -0
- data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
- data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
- data/lib/ucode/glyphs/last_resort.rb +36 -0
- data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
- data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
- data/lib/ucode/glyphs/page_renderer.rb +221 -0
- data/lib/ucode/glyphs/path_bbox.rb +62 -0
- data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
- data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
- data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
- data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
- data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
- data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
- data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
- data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
- data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
- data/lib/ucode/glyphs/real_fonts.rb +32 -0
- data/lib/ucode/glyphs/writer.rb +250 -0
- data/lib/ucode/glyphs.rb +27 -0
- data/lib/ucode/index.rb +106 -0
- data/lib/ucode/index_builder.rb +94 -0
- data/lib/ucode/models/audit/audit_axis.rb +30 -0
- data/lib/ucode/models/audit/audit_diff.rb +77 -0
- data/lib/ucode/models/audit/audit_report.rb +137 -0
- data/lib/ucode/models/audit/baseline.rb +32 -0
- data/lib/ucode/models/audit/block_summary.rb +72 -0
- data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
- data/lib/ucode/models/audit/codepoint_range.rb +39 -0
- data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
- data/lib/ucode/models/audit/color_capabilities.rb +91 -0
- data/lib/ucode/models/audit/discrepancy.rb +38 -0
- data/lib/ucode/models/audit/duplicate_group.rb +23 -0
- data/lib/ucode/models/audit/embedding_type.rb +81 -0
- data/lib/ucode/models/audit/field_change.rb +28 -0
- data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
- data/lib/ucode/models/audit/gasp_range.rb +63 -0
- data/lib/ucode/models/audit/hinting.rb +99 -0
- data/lib/ucode/models/audit/library_summary.rb +40 -0
- data/lib/ucode/models/audit/licensing.rb +48 -0
- data/lib/ucode/models/audit/metrics.rb +111 -0
- data/lib/ucode/models/audit/named_instance.rb +41 -0
- data/lib/ucode/models/audit/opentype_layout.rb +38 -0
- data/lib/ucode/models/audit/plane_summary.rb +31 -0
- data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
- data/lib/ucode/models/audit/script_features.rb +28 -0
- data/lib/ucode/models/audit/script_summary.rb +54 -0
- data/lib/ucode/models/audit/variation_detail.rb +42 -0
- data/lib/ucode/models/audit.rb +50 -0
- data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
- data/lib/ucode/models/bidi_mirroring.rb +19 -0
- data/lib/ucode/models/binary_property_assignment.rb +26 -0
- data/lib/ucode/models/block.rb +36 -0
- data/lib/ucode/models/case_folding_rule.rb +23 -0
- data/lib/ucode/models/cjk_radical.rb +23 -0
- data/lib/ucode/models/codepoint/bidi.rb +28 -0
- data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
- data/lib/ucode/models/codepoint/case_folding.rb +25 -0
- data/lib/ucode/models/codepoint/casing.rb +32 -0
- data/lib/ucode/models/codepoint/decomposition.rb +27 -0
- data/lib/ucode/models/codepoint/display.rb +24 -0
- data/lib/ucode/models/codepoint/emoji.rb +29 -0
- data/lib/ucode/models/codepoint/hangul.rb +20 -0
- data/lib/ucode/models/codepoint/identifier.rb +30 -0
- data/lib/ucode/models/codepoint/indic.rb +20 -0
- data/lib/ucode/models/codepoint/joining.rb +20 -0
- data/lib/ucode/models/codepoint/normalization.rb +35 -0
- data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
- data/lib/ucode/models/codepoint.rb +122 -0
- data/lib/ucode/models/name_alias.rb +21 -0
- data/lib/ucode/models/named_sequence.rb +19 -0
- data/lib/ucode/models/names_list_entry.rb +38 -0
- data/lib/ucode/models/plane.rb +36 -0
- data/lib/ucode/models/property_alias.rb +24 -0
- data/lib/ucode/models/property_value_alias.rb +26 -0
- data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
- data/lib/ucode/models/relationship/cross_reference.rb +17 -0
- data/lib/ucode/models/relationship/footnote.rb +24 -0
- data/lib/ucode/models/relationship/informal_alias.rb +18 -0
- data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
- data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
- data/lib/ucode/models/relationship.rb +57 -0
- data/lib/ucode/models/script.rb +41 -0
- data/lib/ucode/models/special_casing_rule.rb +28 -0
- data/lib/ucode/models/standardized_variant.rb +24 -0
- data/lib/ucode/models/unihan_entry.rb +23 -0
- data/lib/ucode/models.rb +47 -0
- data/lib/ucode/parsers/auxiliary.rb +26 -0
- data/lib/ucode/parsers/base.rb +137 -0
- data/lib/ucode/parsers/bidi_brackets.rb +41 -0
- data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
- data/lib/ucode/parsers/blocks.rb +63 -0
- data/lib/ucode/parsers/case_folding.rb +53 -0
- data/lib/ucode/parsers/cjk_radicals.rb +102 -0
- data/lib/ucode/parsers/derived_age.rb +59 -0
- data/lib/ucode/parsers/derived_core_properties.rb +60 -0
- data/lib/ucode/parsers/extracted_properties.rb +74 -0
- data/lib/ucode/parsers/name_aliases.rb +44 -0
- data/lib/ucode/parsers/named_sequences.rb +51 -0
- data/lib/ucode/parsers/names_list.rb +250 -0
- data/lib/ucode/parsers/property_aliases.rb +41 -0
- data/lib/ucode/parsers/property_value_aliases.rb +46 -0
- data/lib/ucode/parsers/script_extensions.rb +64 -0
- data/lib/ucode/parsers/scripts.rb +60 -0
- data/lib/ucode/parsers/special_casing.rb +62 -0
- data/lib/ucode/parsers/standardized_variants.rb +56 -0
- data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
- data/lib/ucode/parsers/unicode_data.rb +268 -0
- data/lib/ucode/parsers/unihan.rb +125 -0
- data/lib/ucode/parsers.rb +35 -0
- data/lib/ucode/range_entry.rb +58 -0
- data/lib/ucode/repo/aggregate_writer.rb +364 -0
- data/lib/ucode/repo/atomic_writes.rb +48 -0
- data/lib/ucode/repo/codepoint_writer.rb +96 -0
- data/lib/ucode/repo/paths.rb +122 -0
- data/lib/ucode/repo.rb +22 -0
- data/lib/ucode/site/config_emitter.rb +124 -0
- data/lib/ucode/site/generator.rb +178 -0
- data/lib/ucode/site/search_index.rb +68 -0
- data/lib/ucode/site/template/.gitignore +4 -0
- data/lib/ucode/site/template/.vitepress/config.ts +8 -0
- data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
- data/lib/ucode/site/template/char/[codepoint].md +13 -0
- data/lib/ucode/site/template/components/BlockView.vue +57 -0
- data/lib/ucode/site/template/components/CharView.vue +85 -0
- data/lib/ucode/site/template/components/PlaneView.vue +56 -0
- data/lib/ucode/site/template/components/SearchView.vue +66 -0
- data/lib/ucode/site/template/index.md +25 -0
- data/lib/ucode/site/template/package.json +18 -0
- data/lib/ucode/site/template/search.md +9 -0
- data/lib/ucode/site.rb +13 -0
- data/lib/ucode/version.rb +5 -0
- data/lib/ucode/version_resolver.rb +76 -0
- data/lib/ucode.rb +74 -0
- data/ucode.gemspec +56 -0
- metadata +404 -0
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "pathname"
|
|
4
|
+
|
|
5
|
+
require "fontist"
|
|
6
|
+
|
|
7
|
+
module Ucode
|
|
8
|
+
module Glyphs
|
|
9
|
+
module RealFonts
|
|
10
|
+
# Resolves a user-provided font specifier to a concrete file
|
|
11
|
+
# path on disk. Resolution order:
|
|
12
|
+
#
|
|
13
|
+
# 1. Direct file path — returns it if it exists. Useful for
|
|
14
|
+
# local checkouts (e.g. a developer's clone of Lentariso).
|
|
15
|
+
# 2. `Fontist::Font.find(name)` — returns the already-installed
|
|
16
|
+
# font path if fontist has it on disk.
|
|
17
|
+
# 3. `Fontist::Font.install(name)` — downloads + installs the
|
|
18
|
+
# font via the fontist formula index.
|
|
19
|
+
#
|
|
20
|
+
# Fontist is the canonical discovery layer for the fontist
|
|
21
|
+
# ecosystem. We never reach into other package managers or
|
|
22
|
+
# hardcode URLs here — formulas live in fontist/formulas.
|
|
23
|
+
class FontLocator
  # Outcome of a successful lookup: the font's display name, the
  # resolved path, and the step that produced it (`:direct`,
  # `:fontist_find` or `:fontist_install`).
  LocateResult = Struct.new(:name, :path, :via, keyword_init: true)

  # Resolves `spec` to a font file: direct path first, then fonts
  # fontist already has, then (optionally) a fontist install.
  #
  # @param spec [String] either a file path or a fontist font name.
  #   A `name=path` form is also accepted so a CLI can name the font
  #   whatever the user wants without depending on the formula's
  #   family name.
  # @param install [Boolean] if true and the font is not on
  #   disk, attempt `Fontist::Font.install`. Default: true.
  # @return [LocateResult]
  # @raise [Errno::ENOENT] if the path does not exist and fontist
  #   cannot resolve the name.
  def locate(spec, install: true)
    name, path = split_spec(spec)
    return result(name, path, :direct) if path && File.exist?(path)

    via_fontist = find_via_fontist(name, install: install)
    return via_fontist if via_fontist

    raise Errno::ENOENT, "Font not found: #{spec}"
  end

  private

  # Splits a `name=path` spec on the first "="; a spec without "="
  # serves as both the name and the candidate path.
  #
  # @return [Array(String, String)] `[name, path]`
  def split_spec(spec)
    return [spec, spec] unless spec.include?("=")

    name, path = spec.split("=", 2)
    [name.strip, path]
  end

  # Looks the name up through fontist, installing it when allowed.
  #
  # fontist hands back the matching font paths as a list, so both the
  # `find` and the `install` result are normalised with
  # `Array(...).first`. That also tolerates a bare String or nil, and
  # treats an empty list as "not found" instead of passing a truthy
  # `[]` on to `Pathname()`.
  #
  # @return [LocateResult, nil] nil when fontist cannot provide the font
  def find_via_fontist(name, install:)
    found = Array(safe_fontist_lookup { Fontist::Font.find(name) }).first
    return result(name, found, :fontist_find) if found
    return nil unless install

    installed = Array(install_via_fontist(name)).first
    installed && result(name, installed, :fontist_install)
  end

  # Installs the font non-interactively (licence prompt pre-confirmed
  # and hidden). Returns whatever `Fontist::Font.install` returns, or
  # nil when fontist reports the font as unsupported / not found.
  def install_via_fontist(name)
    Fontist::Font.install(
      name,
      confirmation: "yes",
      hide_licenses: true,
    )
  rescue Fontist::Errors::UnsupportedFontError,
         Fontist::Errors::FontNotFoundError
    nil
  end

  # `Fontist::Font.find` raises `UnsupportedFontError` when the
  # name isn't in the formula index — that's a "not found"
  # outcome for our purposes, not an exceptional control-flow
  # event. Translate to nil so the caller can fall through to
  # the install-or-fail branch.
  def safe_fontist_lookup
    yield
  rescue Fontist::Errors::UnsupportedFontError, Fontist::Errors::FontNotFoundError
    nil
  end

  # Wraps a resolved path in a LocateResult, coercing it to Pathname.
  def result(name, path, via)
    LocateResult.new(name: name, path: Pathname(path), via: via)
  end
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
95
|
+
end
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
|
|
4
|
+
module Glyphs
|
|
5
|
+
module RealFonts
|
|
6
|
+
# The Unicode 17.0 blocks this audit cares about: blocks that are new
# in 17.0 plus existing blocks that gained characters in 17.0. Each
# block carries its explicit assigned-codepoint ranges; for a block
# that already existed, only the 17.0 additions are listed.
#
# Sources (in priority order):
#   1. Unicode 17.0 `Blocks.txt` — block name + first/last cp.
#   2. Per-block code-chart legend on unicode.org — published
#      assigned-codepoint count.
#   3. Direct inspection of a known-good font (fontisan audit)
#      — confirms at least the assigned count when a font has
#      100% coverage.
#
# `first_cp`/`last_cp` are always the full block span from
# `Blocks.txt`; `assigned_ranges` must lie inside that span.
#
# NOTE(review): the assigned ranges below should be re-checked
# against UCD 17.0 `UnicodeData.txt` once the UCD text files are
# integrated into the ucode dataset; a wrong range makes the
# `missing_cps` list over- or under-report.
#
# Block names match the verbatim UCD block name (`Blocks.txt`
# field 2) — never slugified.
Block = Struct.new(:name, :first_cp, :last_cp, :assigned_ranges,
                   keyword_init: true) do
  # @param codepoint [Integer]
  # @return [Boolean] true when `codepoint` lies inside the block
  #   span, whether or not it is assigned.
  def covers?(codepoint)
    codepoint.between?(first_cp, last_cp)
  end
end

module Unicode17Blocks
  ALL = [
    # Sidetic — U+10940..U+1095F, 26 assigned (verified via
    # Lentariso: covers U+10940..U+10959 exactly).
    Block.new(name: "Sidetic",
              first_cp: 0x10940, last_cp: 0x1095F,
              assigned_ranges: [0x10940..0x10959]),
    # Sharada Supplement — U+11B60..U+11B7F, 8 assigned.
    Block.new(name: "Sharada Supplement",
              first_cp: 0x11B60, last_cp: 0x11B7F,
              assigned_ranges: [0x11B60..0x11B67]),
    # Tolong Siki — U+11DB0..U+11DEF, 54 assigned: 44 letters and
    # signs, a reserved gap, then the 10 digits.
    Block.new(name: "Tolong Siki",
              first_cp: 0x11DB0, last_cp: 0x11DEF,
              assigned_ranges: [0x11DB0..0x11DDB, 0x11DE0..0x11DE9]),
    # Beria Erfe — U+16EA0..U+16EDF, 50 assigned across two runs
    # (U+16EB9-U+16EBA reserved — verified via Kedebideri).
    Block.new(name: "Beria Erfe",
              first_cp: 0x16EA0, last_cp: 0x16EDF,
              assigned_ranges: [0x16EA0..0x16EB8, 0x16EBB..0x16ED3]),
    # Tai Yo — U+1E6C0..U+1E6FF (block spans end on an xF boundary),
    # 55 assigned in three runs.
    Block.new(name: "Tai Yo",
              first_cp: 0x1E6C0, last_cp: 0x1E6FF,
              assigned_ranges: [0x1E6C0..0x1E6DE, 0x1E6E0..0x1E6F5,
                                0x1E6FE..0x1E6FF]),
    # Symbols for Legacy Computing Supplement — U+1CC00..U+1CEBF,
    # 9 added in Unicode 17 (3 + 6).
    Block.new(name: "Symbols for Legacy Computing Supplement",
              first_cp: 0x1CC00, last_cp: 0x1CEBF,
              assigned_ranges: [0x1CCFA..0x1CCFC, 0x1CEBA..0x1CEBF]),
    # Supplemental Arrows-C — U+1F800..U+1F8FF, 9 added in
    # Unicode 17. (U+1CF00..U+1CFCF is Znamenny Musical Notation.)
    Block.new(name: "Supplemental Arrows-C",
              first_cp: 0x1F800, last_cp: 0x1F8FF,
              assigned_ranges: [0x1F8D0..0x1F8D8]),
    # Alchemical Symbols — U+1F700..U+1F77F, 4 new in Unicode 17.
    Block.new(name: "Alchemical Symbols",
              first_cp: 0x1F700, last_cp: 0x1F77F,
              assigned_ranges: [0x1F777..0x1F77A]),
    # Miscellaneous Symbols Supplement — U+1CEC0..U+1CEFF, published
    # as 34 assigned in Unicode 17 (two runs of 17).
    # (U+1FA70..U+1FAFF is Symbols and Pictographs Extended-A.)
    Block.new(name: "Miscellaneous Symbols Supplement",
              first_cp: 0x1CEC0, last_cp: 0x1CEFF,
              assigned_ranges: [0x1CEC0..0x1CED0, 0x1CEE0..0x1CEF0]),
    # Musical Symbols Supplement — values kept as they were.
    # NOTE(review): U+1D200..U+1D24F is the Ancient Greek Musical
    # Notation block in `Blocks.txt`, and this name could not be
    # matched to a Unicode 17.0 block — confirm the intended block
    # or drop the entry.
    Block.new(name: "Musical Symbols Supplement",
              first_cp: 0x1D200, last_cp: 0x1D24F,
              assigned_ranges: [0x1D200..0x1D245]),
    # CJK Unified Ideographs Extension J — U+323B0..U+3347F,
    # assigned U+323B0..U+33479. (U+31350..U+323AF is Extension H.)
    Block.new(name: "CJK Unified Ideographs Extension J",
              first_cp: 0x323B0, last_cp: 0x3347F,
              assigned_ranges: [0x323B0..0x33479]),
  ].freeze

  # Iterates over every block in declaration order.
  #
  # @yieldparam block [Block]
  # @return [Array<Block>, Enumerator] an Enumerator when no block
  #   is given
  def self.each(&block)
    ALL.each(&block)
  end

  # @param codepoint [Integer]
  # @return [Block, nil] the first block whose span contains
  #   `codepoint`, or nil when none does
  def self.for_codepoint(codepoint)
    ALL.find { |block| block.covers?(codepoint) }
  end
end
|
|
102
|
+
end
|
|
103
|
+
end
|
|
104
|
+
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
|
|
4
|
+
require "json"
require "pathname"
|
|
5
|
+
|
|
6
|
+
require_relative "font_coverage_report"
|
|
7
|
+
|
|
8
|
+
module Ucode
|
|
9
|
+
module Glyphs
|
|
10
|
+
module RealFonts
|
|
11
|
+
# Persists a {FontCoverageReport} as a JSON file under
|
|
12
|
+
# `output/font_coverage/`. One file per audited face; the
|
|
13
|
+
# filename is derived from the report's `source_file` so the
|
|
14
|
+
# source and the report are trivially correlated.
|
|
15
|
+
class Writer
  DEFAULT_OUTPUT_DIR = "font_coverage"

  # Basename used when the report offers no usable label.
  FALLBACK_LABEL = "font"
  private_constant :FALLBACK_LABEL

  # @param output_root [Pathname, String] parent directory; the
  #   `font_coverage/` subdirectory is created inside it.
  def initialize(output_root)
    @output_root = Pathname(output_root)
  end

  # Serialises the report as pretty-printed JSON (with a trailing
  # newline), creating the target directory when needed.
  #
  # @param report [FontCoverageReport] must respond to `to_hash`,
  #   `source_file` and `postscript_name`
  # @return [Pathname] path of the written file, built from
  #   `output_root` (absolute only when `output_root` was)
  def write(report)
    path = target_path(report)
    path.dirname.mkpath
    path.write("#{JSON.pretty_generate(report.to_hash)}\n")
    path
  end

  private

  # `<output_root>/font_coverage/<sanitised label>.json`
  def target_path(report)
    base = safe_basename(source_label(report))
    @output_root.join(DEFAULT_OUTPUT_DIR, "#{base}.json")
  end

  # First non-empty of `source_file`, `postscript_name`; otherwise the
  # fallback. Empty strings count as missing so they cannot produce
  # an extension-only filename.
  def source_label(report)
    [report.source_file, report.postscript_name]
      .map(&:to_s)
      .find { |label| !label.empty? } || FALLBACK_LABEL
  end

  # Drops the directory part and the extension, then replaces every
  # character outside `[A-Za-z0-9._-]` with "_". Falls back to the
  # default label if nothing is left.
  def safe_basename(name)
    cleaned = File.basename(name, ".*").gsub(/[^A-Za-z0-9._-]/, "_")
    cleaned.empty? ? FALLBACK_LABEL : cleaned
  end
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
|
|
4
|
+
module Glyphs
|
|
5
|
+
# Tier-1 glyph sourcing — real font cmaps.
|
|
6
|
+
#
|
|
7
|
+
# When a real OpenType/TrueType font covers a Unicode 17 block,
|
|
8
|
+
# walking its cmap and lifting glyph outlines directly from the
|
|
9
|
+
# font's `glyf`/`CFF ` table produces higher-fidelity SVGs than
|
|
10
|
+
# vector-extracting from the Code Charts PDF (which composites
|
|
11
|
+
# chart-grid chrome into the same glyph). Tier 1 is the preferred
|
|
12
|
+
# source; Code Charts PDF (pillar 1 ToUnicode, pillar 2 positional
|
|
13
|
+
# correlation, pillar 3 Last Resort) are fallbacks for codepoints
|
|
14
|
+
# no real font covers.
|
|
15
|
+
#
|
|
16
|
+
# Font discovery goes through **fontist** (`Fontist::Font.find` /
|
|
17
|
+
# `install`); font parsing/audit/outline extraction goes through
|
|
18
|
+
# **fontisan** (`Fontisan::Commands::AuditCommand`,
|
|
19
|
+
# `Fontisan::OutlineExtractor`). Both gems live in the fontist
|
|
20
|
+
# org; fontist already depends on fontisan. No other Ruby
|
|
21
|
+
# font-parsing library is permitted.
|
|
22
|
+
module RealFonts
  # Constant name => file stem under `ucode/glyphs/real_fonts/`.
  # Each constant is registered for lazy loading on first reference.
  {
    Unicode17Blocks: "unicode_17_blocks",
    BlockCoverage: "block_coverage",
    FontCoverageReport: "font_coverage_report",
    FontLocator: "font_locator",
    CoverageAuditor: "coverage_auditor",
    Writer: "writer",
  }.each do |const_name, stem|
    autoload const_name, "ucode/glyphs/real_fonts/#{stem}"
  end
end
|
|
31
|
+
end
|
|
32
|
+
end
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "pathname"
|
|
4
|
+
require "thread"
|
|
5
|
+
require "tmpdir"
|
|
6
|
+
require "nokogiri"
|
|
7
|
+
|
|
8
|
+
require "ucode/error"
|
|
9
|
+
require "ucode/glyphs/page_renderer"
|
|
10
|
+
require "ucode/glyphs/grid_detector"
|
|
11
|
+
require "ucode/glyphs/cell_extractor"
|
|
12
|
+
require "ucode/repo/atomic_writes"
|
|
13
|
+
require "ucode/repo/paths"
|
|
14
|
+
|
|
15
|
+
module Ucode
|
|
16
|
+
module Glyphs
|
|
17
|
+
# Writes `glyph.svg` for every codepoint in a block by orchestrating
|
|
18
|
+
# the per-block pipeline: render PDF page → detect grid → extract
|
|
19
|
+
# each cell → write atomic file.
|
|
20
|
+
#
|
|
21
|
+
# The Writer is **page-driven**: the caller hands it a `page_map`
|
|
22
|
+
# (`{ page_num => first_cp_on_that_page }`) so the writer knows what
|
|
23
|
+
# codepoint each detected cell anchor corresponds to. This is the
|
|
24
|
+
# one piece of state the Writer can't derive on its own — pdftocairo
|
|
25
|
+
# converts the row's codepoint labels to outlined glyphs, so they
|
|
26
|
+
# aren't readable as text.
|
|
27
|
+
#
|
|
28
|
+
# **Idempotent**: re-runs are no-ops via `Repo::AtomicWrites` (byte
|
|
29
|
+
# comparison; same content is skipped). Safe to re-run on the whole
|
|
30
|
+
# output tree.
|
|
31
|
+
#
|
|
32
|
+
# **Atomic**: writes go through `<path>.tmp` + rename. A crash mid-
|
|
33
|
+
# write leaves either the old file or no file, never a truncated one.
|
|
34
|
+
#
|
|
35
|
+
# **Placeholder for assigned codepoints with no glyph**: when a
|
|
36
|
+
# codepoint is listed in `block.codepoint_ids` but no cell is found
|
|
37
|
+
# on any rendered page, a small placeholder SVG is written so the
|
|
38
|
+
# site can render a "no official glyph" badge. Counted in the tally
|
|
39
|
+
# as `placeholder`.
|
|
40
|
+
#
|
|
41
|
+
# **Pure-ish**: takes a renderer instance (defaults to the first
|
|
42
|
+
# available system renderer) and a fetcher; both are injectable for
|
|
43
|
+
# tests. The only I/O is the renderer, the writer's output_root, and
|
|
44
|
+
# any optional cache.
|
|
45
|
+
class Writer
|
|
46
|
+
include Repo::AtomicWrites
|
|
47
|
+
|
|
48
|
+
PlaceholderViewBoxSize = 100
|
|
49
|
+
private_constant :PlaceholderViewBoxSize
|
|
50
|
+
|
|
51
|
+
# @param output_root [String, Pathname]
|
|
52
|
+
# @param renderer [Ucode::Glyphs::PageRenderer] concrete renderer class
|
|
53
|
+
# @param parallel_workers [Integer] worker pool size for #write_all
|
|
54
|
+
def initialize(output_root:, renderer: PageRenderer.default, parallel_workers: 4)
  # Stored as given; `output_root` is always wrapped in a Pathname.
  @parallel_workers = parallel_workers
  @renderer = renderer
  @output_root = Pathname.new(output_root)
end
|
|
59
|
+
|
|
60
|
+
# Process every page in `page_map`, writing glyph.svg for each
|
|
61
|
+
# codepoint that (a) falls inside the block's range and (b) has a
|
|
62
|
+
# detectable glyph on the page.
|
|
63
|
+
#
|
|
64
|
+
# @param block [Ucode::Models::Block]
|
|
65
|
+
# @param pdf_path [String, Pathname]
|
|
66
|
+
# @param page_map [Hash{Integer => Integer}] page_num => first cp on that page
|
|
67
|
+
# @param strict [Boolean] raise GlyphError when the PDF is missing
|
|
68
|
+
# or no grid is detected on any page; when false, returns a tally
|
|
69
|
+
# with `no_grid` set and writes placeholders for assigned cps.
|
|
70
|
+
# @return [Hash] tally { written: N, skipped: N, empty: N,
|
|
71
|
+
# placeholder: N, no_grid: N }
|
|
72
|
+
def write_block(block:, pdf_path:, page_map:, strict: false)
  pdf_available = pdf_path && Pathname.new(pdf_path).exist?
  unless pdf_available
    # NOTE(review): `strict` is only consulted here, for a missing
    # PDF; pages that yield no grid are tallied under :no_grid and
    # never raise.
    raise_missing_pdf!(block, pdf_path) if strict
    return placeholder_pass(block, no_grid_tally)
  end

  # Fold the per-page tallies into one, then backfill placeholders
  # for assigned codepoints that still have no glyph on disk.
  totals = page_map.each_with_object(zero_tally) do |(page_num, first_cp), acc|
    page_tally = write_page(block: block, pdf_path: pdf_path,
                            page_num: page_num, first_cp: first_cp)
    merge_tally!(acc, page_tally)
  end
  placeholder_pass(block, totals)
end
|
|
85
|
+
|
|
86
|
+
# Render one page, detect its grid, write every cell whose codepoint
|
|
87
|
+
# falls inside `block`'s range.
|
|
88
|
+
#
|
|
89
|
+
# @param block [Ucode::Models::Block]
|
|
90
|
+
# @param pdf_path [String, Pathname]
|
|
91
|
+
# @param page_num [Integer] 1-based PDF page number
|
|
92
|
+
# @param first_cp [Integer] codepoint of the grid's top-left cell
|
|
93
|
+
# @return [Hash] tally
|
|
94
|
+
def write_page(block:, pdf_path:, page_num:, first_cp:)
  svg_doc = render_page(pdf_path, page_num)
  # No rendered page, or no grid found on it: both count as one no_grid.
  grid = svg_doc && GridDetector.detect(svg_doc, block_first_cp: first_cp)
  return no_grid_tally unless grid

  extractor = CellExtractor.new(svg_doc)
  tally = zero_tally
  (0...grid.rows).each do |row|
    (0...grid.columns).each do |col|
      cp = grid.codepoint_at(row, col)
      next unless cp && block.covers?(cp)

      cell_svg = extractor.extract(grid, cp)
      # :empty    — the cell holds no glyph
      # :written  — write_glyph reported a write
      # :skipped  — write_glyph reported nothing was written
      outcome =
        if cell_svg.nil?
          :empty
        elsif write_glyph(block, cp, cell_svg)
          :written
        else
          :skipped
        end
      tally[outcome] += 1
    end
  end
  tally
end
|
|
120
|
+
|
|
121
|
+
# Drain a list of block-spec hashes through the worker pool.
|
|
122
|
+
# Each spec has the same shape as #write_block's kwargs:
|
|
123
|
+
#
|
|
124
|
+
# { block:, pdf_path:, page_map: }
|
|
125
|
+
#
|
|
126
|
+
# @param specs [Array<Hash>]
|
|
127
|
+
# @return [Hash] aggregated tally across all blocks
|
|
128
|
+
def write_all(specs)
  # A pool of one (or fewer) workers gains nothing from threads.
  if @parallel_workers > 1
    drain_threaded(specs)
  else
    drain_inline(specs)
  end
end
|
|
133
|
+
|
|
134
|
+
private
|
|
135
|
+
|
|
136
|
+
# A fresh tally hash with every counter at zero.
#
# @return [Hash{Symbol => Integer}]
def zero_tally
  %i[written skipped empty placeholder no_grid].to_h { |counter| [counter, 0] }
end
|
|
139
|
+
|
|
140
|
+
# A fresh tally that records exactly one no_grid event.
#
# @return [Hash{Symbol => Integer}]
def no_grid_tally
  zero_tally.merge(no_grid: 1)
end
|
|
143
|
+
|
|
144
|
+
# Adds every counter of `other` onto `acc` in place; counters missing
# from `acc` start at zero.
#
# @param acc [Hash{Symbol => Integer}] mutated
# @param other [Hash{Symbol => Integer}]
# @return [Hash] `other`
def merge_tally!(acc, other)
  other.each_pair do |counter, amount|
    current = acc[counter] || 0
    acc[counter] = current + amount
  end
end
|
|
147
|
+
|
|
148
|
+
# Processes the specs one after another on the calling thread.
#
# @param specs [Array<Hash>] keyword arguments for #write_block
# @return [Hash] tally summed over all specs
def drain_inline(specs)
  totals = zero_tally
  specs.each { |spec| merge_tally!(totals, write_block(**spec)) }
  totals
end
|
|
153
|
+
|
|
154
|
+
# Processes the specs on `@parallel_workers` threads fed from a
# shared queue.
#
# @param specs [Array<Hash>] keyword arguments for #write_block
# @return [Hash] tally summed over all specs
def drain_threaded(specs)
  jobs = Queue.new
  specs.each { |spec| jobs << spec }
  # One nil sentinel per worker tells each thread to stop.
  @parallel_workers.times { jobs << nil }

  totals = zero_tally
  lock = Mutex.new
  pool = Array.new(@parallel_workers) do
    Thread.new do
      until (spec = jobs.pop).nil?
        outcome = write_block(**spec)
        # The shared tally is only touched while holding the lock.
        lock.synchronize { merge_tally!(totals, outcome) }
      end
    end
  end

  pool.each(&:join)
  totals
end
|
|
176
|
+
|
|
177
|
+
# Renders one PDF page to SVG in a throwaway directory and parses it.
#
# @param pdf_path [String, Pathname]
# @param page_num [Integer]
# @return [Nokogiri::XML::Document, nil] nil when the renderer raises
#   `Ucode::PdfRenderError`, does not return `:ok`, or leaves no
#   output file behind
def render_page(pdf_path, page_num)
  Dir.mktmpdir do |tmp|
    svg_file = File.join(tmp, "p#{page_num}.svg")
    status =
      begin
        @renderer.render(Pathname.new(pdf_path), page_num, svg_file)
      rescue Ucode::PdfRenderError
        # Graceful degradation: a broken renderer (e.g. mutool on a
        # host without LCMS) yields no_grid → placeholders downstream.
        nil
      end
    next nil unless status == :ok && File.exist?(svg_file)

    Nokogiri::XML(File.read(svg_file))
  end
end
|
|
192
|
+
|
|
193
|
+
# Writes one extracted cell to the codepoint's glyph path.
#
# @return whatever `write_atomic` returns (the caller treats a truthy
#   value as "written" and a falsy one as "skipped")
def write_glyph(block, codepoint, cell_svg)
  target = Repo::Paths.codepoint_glyph_path(
    @output_root, block.id, Repo::Paths.cp_id(codepoint)
  )
  write_atomic(target, serialize_svg(cell_svg))
end
|
|
198
|
+
|
|
199
|
+
# For every assigned codepoint in the block that doesn't already
# have a glyph.svg on disk, write a placeholder.
#
# An id is skipped when +cp_id_to_int+ cannot parse it, when the block does
# not cover the resulting codepoint, or when its glyph path already exists.
#
# @param block [#codepoint_ids, #covers?, #id] block to fill in
# @param tally [Hash] running counters; :placeholder is incremented for each
#   write that +write_atomic+ reports as truthy
# @return [Hash] the same +tally+ object
def placeholder_pass(block, tally)
  ids = block.codepoint_ids
  return tally if ids.nil? || ids.empty?

  ids.each do |cp_id|
    codepoint = cp_id_to_int(cp_id)
    next unless codepoint && block.covers?(codepoint)

    target = Repo::Paths.codepoint_glyph_path(@output_root, block.id, cp_id)
    # Checked per id, so a path written earlier in this pass is seen here.
    next if target.exist?
    next unless write_atomic(target, placeholder_svg_payload)

    tally[:placeholder] = (tally[:placeholder] || 0) + 1
  end
  tally
end
|
|
218
|
+
|
|
219
|
+
# Parses a "U+XXXX" codepoint id into its integer value.
#
# Only a String made of "U+" followed exclusively by hex digits is accepted.
# Anything else yields nil instead of the 0 that String#to_i(16) silently
# returns for unparseable text, so a malformed id can never alias U+0000.
#
# @param cp_id [Object] candidate id, e.g. "U+0041"
# @return [Integer, nil] the codepoint, or nil when +cp_id+ is not a
#   well-formed id
def cp_id_to_int(cp_id)
  return nil unless cp_id.is_a?(String)

  # \A/\z anchor the whole string; ^/$ would match per line.
  hex = cp_id[/\AU\+(\h+)\z/, 1]
  hex&.to_i(16)
end
|
|
224
|
+
|
|
225
|
+
# Builds the SVG written for a codepoint that has no extracted glyph.
#
# A simple dashed square + text marker so the site can render
# an obvious "no official glyph" badge without needing extra state.
# All dimensions derive from PlaceholderViewBoxSize.
#
# @return [String] the SVG document source
def placeholder_svg_payload
  side = PlaceholderViewBoxSize
  inner = side - 2
  centre = side / 2
  <<~SVG
    <?xml version="1.0" encoding="UTF-8"?>
    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 #{side} #{side}" width="#{side}" height="#{side}">
    <rect x="1" y="1" width="#{inner}" height="#{inner}" fill="none" stroke="#999" stroke-width="1" stroke-dasharray="4 4"/>
    <text x="#{centre}" y="#{centre}" font-family="sans-serif" font-size="14" text-anchor="middle" dominant-baseline="middle" fill="#999">no glyph</text>
    </svg>
  SVG
end
|
|
237
|
+
|
|
238
|
+
# Turns an SVG document into the string written to disk.
#
# @param doc [#to_xml] document to serialize
# @return [String] the XML with leading and trailing whitespace removed
def serialize_svg(doc)
  markup = doc.to_xml
  markup.strip
end
|
|
241
|
+
|
|
242
|
+
# Raises the error that reports a block with no usable PDF.
#
# @param block [#id] block whose PDF is missing
# @param pdf_path [#to_s, nil] path that was looked up; recorded as a string
#   (or nil) in the error context
# @raise [Ucode::GlyphError] always
def raise_missing_pdf!(block, pdf_path)
  details = { block_id: block.id, pdf_path: pdf_path&.to_s }
  # .new is required here: the error takes a keyword argument.
  raise Ucode::GlyphError.new("no PDF available for block '#{block.id}'", context: details)
end
|
|
248
|
+
end
|
|
249
|
+
end
|
|
250
|
+
end
|
data/lib/ucode/glyphs.rb
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
  # Glyphs — converts Code Charts PDF pages into per-codepoint SVGs.
  #
  # Pipeline: fetch per-block PDF → render to SVG → detect grid → extract
  # cell → normalize viewBox → write glyph.svg.
  #
  # Vector extraction only. NEVER run OCR.
  module Glyphs
    # Constant name => file name under "ucode/glyphs". Each constant is
    # registered with autoload, so its file is only required on first use.
    {
      PdfFetcher: "pdf_fetcher",
      PageRenderer: "page_renderer",
      MutoolRenderer: "mutool_renderer",
      Pdf2svgRenderer: "pdf2svg_renderer",
      DvisvgmRenderer: "dvisvgm_renderer",
      PdftocairoRenderer: "pdftocairo_renderer",
      Grid: "grid",
      PathBbox: "path_bbox",
      GridDetector: "grid_detector",
      CellExtractor: "cell_extractor",
      MonolithPageMap: "monolith_page_map",
      Writer: "writer",
      LastResort: "last_resort",
      EmbeddedFonts: "embedded_fonts",
      RealFonts: "real_fonts",
    }.each do |const_name, file_name|
      autoload const_name, "ucode/glyphs/#{file_name}"
    end
  end
end
|
data/lib/ucode/index.rb
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "pathname"
|
|
4
|
+
require "yaml"
|
|
5
|
+
|
|
6
|
+
module Ucode
  # Sorted, run-length-encoded lookup table over Unicode codepoints.
  #
  # One Index answers "what <thing> does codepoint N belong to?" for one
  # property (block, or script). Lookup is O(log N) via `bsearch_index`,
  # which relies on the entries being sorted and not overlapping.
  #
  # Two ways to construct:
  #   - `Index.from_triples([[first, last, name], ...])`
  #   - `Index.load(path)` from a YAML file previously written by `#save`.
  #
  # The YAML form is the dependency-free alternative to SQLite — same
  # query API, simpler ops. Pick whichever fits the deployment.
  class Index
    include Enumerable

    # @return [Array<RangeEntry>] the entries, in sorted order
    attr_reader :entries

    # Build an Index from raw [first_cp, last_cp, name] triples.
    # @param triples [Array<Array(Integer, Integer, String)>]
    # @return [Index]
    def self.from_triples(triples)
      new(triples.map { |first, last, name| RangeEntry.new(first, last, name) })
    end

    # Load from a YAML file previously written by #save.
    #
    # Parsing goes through `YAML.safe_load` rather than `YAML.load_file`,
    # whose behaviour depends on the installed Psych: version 3 will
    # instantiate arbitrary objects, version 4 rejects aliases. Here only
    # plain data (plus Symbol) is accepted and aliases are allowed, on every
    # Psych version. An empty file loads as an empty index.
    #
    # @param path [String, Pathname]
    # @return [Index]
    # @raise [Psych::DisallowedClass] if the file holds anything but plain data
    def self.load(path)
      raw = File.open(path, "r:bom|utf-8") do |file|
        YAML.safe_load(file, permitted_classes: [Symbol], aliases: true, filename: path.to_s)
      end
      # An empty document parses to nil (false on older Psych).
      new((raw || []).map { |hash| RangeEntry.from_h(hash) })
    end

    # @param entries [Array<RangeEntry>] ranges in any order; a sorted copy
    #   is stored, so the entries must be mutually comparable
    def initialize(entries)
      @entries = entries.sort
    end

    # Iterate the entries in sorted order.
    # @yieldparam entry [RangeEntry]
    def each(&block)
      @entries.each(&block)
    end

    # @return [Integer] number of ranges in the index
    def size
      @entries.size
    end

    # @param codepoint [Integer]
    # @return [String, nil] the name of the range covering `codepoint`, or nil
    def lookup(codepoint)
      idx = bsearch_index(codepoint)
      idx && @entries[idx].name
    end

    # Enumerate every range whose [first_cp, last_cp] overlaps the
    # inclusive query range. Returns an Enumerator when called
    # without a block.
    # @param first [Integer]
    # @param last [Integer]
    # @yieldparam entry [RangeEntry]
    # @return [Enumerator<RangeEntry>, nil] nil whenever a block is given
    def each_overlapping(first, last, &block)
      return enum_for(:each_overlapping, first, last) unless block

      start_idx = bsearch_first_overlap(first)
      return if start_idx.nil?

      # Walk by index so no slice of the entries array is allocated.
      start_idx.upto(@entries.size - 1) do |idx|
        entry = @entries[idx]
        break if entry.first_cp > last

        yield entry if entry.last_cp >= first
      end
      nil
    end

    # Serialize to a YAML file.
    # @param path [String, Pathname]
    # @return [void]
    def save(path)
      File.open(path, "w") do |file|
        YAML.dump(@entries.map(&:to_h), file)
      end
    end

    private

    # bsearch_index integer-mode convention: -1 = search LEFT, +1 = RIGHT,
    # 0 = match. See Coordinator#find_in_range for the same convention.
    # @return [Integer, nil] index of the entry covering `codepoint`, or nil
    def bsearch_index(codepoint)
      @entries.bsearch_index do |entry|
        if codepoint < entry.first_cp
          -1
        elsif codepoint > entry.last_cp
          1
        else
          0
        end
      end
    end

    # Boolean-mode bsearch: first entry whose `last_cp >= first`.
    # @return [Integer, nil] index of that entry, or nil when none qualifies
    def bsearch_first_overlap(first)
      @entries.bsearch_index { |entry| entry.last_cp >= first }
    end
  end
end
|