ucode 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CLAUDE.md +211 -0
- data/Gemfile +22 -0
- data/Gemfile.lock +406 -0
- data/README.md +469 -0
- data/Rakefile +18 -0
- data/TODO.new/00-README.md +66 -0
- data/TODO.new/01-pillar-terminology-alignment.md +69 -0
- data/TODO.new/02-audit-schema-design.md +255 -0
- data/TODO.new/03-directory-output-spec.md +203 -0
- data/TODO.new/04-fontist-org-contract.md +173 -0
- data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
- data/TODO.new/06-audit-namespace-skeleton.md +105 -0
- data/TODO.new/07-audit-models-port.md +132 -0
- data/TODO.new/08-extractors-cheap-port.md +113 -0
- data/TODO.new/09-extractors-expensive-port.md +99 -0
- data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
- data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
- data/TODO.new/12-formatters-port.md +115 -0
- data/TODO.new/13-directory-emitter.md +147 -0
- data/TODO.new/14-html-face-browser.md +144 -0
- data/TODO.new/15-html-library-browser.md +102 -0
- data/TODO.new/16-cli-audit-subcommands.md +142 -0
- data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
- data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
- data/TODO.new/19-fontisan-docs-update.md +155 -0
- data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
- data/TODO.new/21-canonical-unicode17-build.md +148 -0
- data/TODO.new/22-implementation-order.md +176 -0
- data/UCODE_CHANGELOG.md +97 -0
- data/exe/ucode +8 -0
- data/lib/ucode/aggregator.rb +77 -0
- data/lib/ucode/audit/block_aggregator.rb +90 -0
- data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
- data/lib/ucode/audit/context.rb +137 -0
- data/lib/ucode/audit/discrepancy_detector.rb +213 -0
- data/lib/ucode/audit/extractors/aggregations.rb +70 -0
- data/lib/ucode/audit/extractors/base.rb +21 -0
- data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
- data/lib/ucode/audit/extractors/coverage.rb +55 -0
- data/lib/ucode/audit/extractors/hinting.rb +199 -0
- data/lib/ucode/audit/extractors/identity.rb +65 -0
- data/lib/ucode/audit/extractors/licensing.rb +75 -0
- data/lib/ucode/audit/extractors/metrics.rb +108 -0
- data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
- data/lib/ucode/audit/extractors/provenance.rb +34 -0
- data/lib/ucode/audit/extractors/style.rb +88 -0
- data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
- data/lib/ucode/audit/extractors.rb +31 -0
- data/lib/ucode/audit/plane_aggregator.rb +37 -0
- data/lib/ucode/audit/registry.rb +63 -0
- data/lib/ucode/audit/script_aggregator.rb +92 -0
- data/lib/ucode/audit.rb +27 -0
- data/lib/ucode/cache.rb +113 -0
- data/lib/ucode/cli.rb +272 -0
- data/lib/ucode/commands/build.rb +68 -0
- data/lib/ucode/commands/cache.rb +46 -0
- data/lib/ucode/commands/fetch.rb +62 -0
- data/lib/ucode/commands/font_coverage.rb +57 -0
- data/lib/ucode/commands/glyphs.rb +136 -0
- data/lib/ucode/commands/lookup.rb +65 -0
- data/lib/ucode/commands/parse.rb +62 -0
- data/lib/ucode/commands/site.rb +33 -0
- data/lib/ucode/commands.rb +19 -0
- data/lib/ucode/config.rb +110 -0
- data/lib/ucode/coordinator/indices.rb +34 -0
- data/lib/ucode/coordinator.rb +397 -0
- data/lib/ucode/database.rb +214 -0
- data/lib/ucode/db_builder.rb +107 -0
- data/lib/ucode/error.rb +96 -0
- data/lib/ucode/fetch/code_charts.rb +57 -0
- data/lib/ucode/fetch/http.rb +83 -0
- data/lib/ucode/fetch/ucd_zip.rb +57 -0
- data/lib/ucode/fetch/unihan_zip.rb +57 -0
- data/lib/ucode/fetch.rb +14 -0
- data/lib/ucode/glyphs/cell_extractor.rb +130 -0
- data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
- data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
- data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
- data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
- data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
- data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
- data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
- data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
- data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
- data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
- data/lib/ucode/glyphs/grid.rb +30 -0
- data/lib/ucode/glyphs/grid_detector.rb +165 -0
- data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
- data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
- data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
- data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
- data/lib/ucode/glyphs/last_resort/source.rb +125 -0
- data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
- data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
- data/lib/ucode/glyphs/last_resort.rb +36 -0
- data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
- data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
- data/lib/ucode/glyphs/page_renderer.rb +221 -0
- data/lib/ucode/glyphs/path_bbox.rb +62 -0
- data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
- data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
- data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
- data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
- data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
- data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
- data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
- data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
- data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
- data/lib/ucode/glyphs/real_fonts.rb +32 -0
- data/lib/ucode/glyphs/writer.rb +250 -0
- data/lib/ucode/glyphs.rb +27 -0
- data/lib/ucode/index.rb +106 -0
- data/lib/ucode/index_builder.rb +94 -0
- data/lib/ucode/models/audit/audit_axis.rb +30 -0
- data/lib/ucode/models/audit/audit_diff.rb +77 -0
- data/lib/ucode/models/audit/audit_report.rb +137 -0
- data/lib/ucode/models/audit/baseline.rb +32 -0
- data/lib/ucode/models/audit/block_summary.rb +72 -0
- data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
- data/lib/ucode/models/audit/codepoint_range.rb +39 -0
- data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
- data/lib/ucode/models/audit/color_capabilities.rb +91 -0
- data/lib/ucode/models/audit/discrepancy.rb +38 -0
- data/lib/ucode/models/audit/duplicate_group.rb +23 -0
- data/lib/ucode/models/audit/embedding_type.rb +81 -0
- data/lib/ucode/models/audit/field_change.rb +28 -0
- data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
- data/lib/ucode/models/audit/gasp_range.rb +63 -0
- data/lib/ucode/models/audit/hinting.rb +99 -0
- data/lib/ucode/models/audit/library_summary.rb +40 -0
- data/lib/ucode/models/audit/licensing.rb +48 -0
- data/lib/ucode/models/audit/metrics.rb +111 -0
- data/lib/ucode/models/audit/named_instance.rb +41 -0
- data/lib/ucode/models/audit/opentype_layout.rb +38 -0
- data/lib/ucode/models/audit/plane_summary.rb +31 -0
- data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
- data/lib/ucode/models/audit/script_features.rb +28 -0
- data/lib/ucode/models/audit/script_summary.rb +54 -0
- data/lib/ucode/models/audit/variation_detail.rb +42 -0
- data/lib/ucode/models/audit.rb +50 -0
- data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
- data/lib/ucode/models/bidi_mirroring.rb +19 -0
- data/lib/ucode/models/binary_property_assignment.rb +26 -0
- data/lib/ucode/models/block.rb +36 -0
- data/lib/ucode/models/case_folding_rule.rb +23 -0
- data/lib/ucode/models/cjk_radical.rb +23 -0
- data/lib/ucode/models/codepoint/bidi.rb +28 -0
- data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
- data/lib/ucode/models/codepoint/case_folding.rb +25 -0
- data/lib/ucode/models/codepoint/casing.rb +32 -0
- data/lib/ucode/models/codepoint/decomposition.rb +27 -0
- data/lib/ucode/models/codepoint/display.rb +24 -0
- data/lib/ucode/models/codepoint/emoji.rb +29 -0
- data/lib/ucode/models/codepoint/hangul.rb +20 -0
- data/lib/ucode/models/codepoint/identifier.rb +30 -0
- data/lib/ucode/models/codepoint/indic.rb +20 -0
- data/lib/ucode/models/codepoint/joining.rb +20 -0
- data/lib/ucode/models/codepoint/normalization.rb +35 -0
- data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
- data/lib/ucode/models/codepoint.rb +122 -0
- data/lib/ucode/models/name_alias.rb +21 -0
- data/lib/ucode/models/named_sequence.rb +19 -0
- data/lib/ucode/models/names_list_entry.rb +38 -0
- data/lib/ucode/models/plane.rb +36 -0
- data/lib/ucode/models/property_alias.rb +24 -0
- data/lib/ucode/models/property_value_alias.rb +26 -0
- data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
- data/lib/ucode/models/relationship/cross_reference.rb +17 -0
- data/lib/ucode/models/relationship/footnote.rb +24 -0
- data/lib/ucode/models/relationship/informal_alias.rb +18 -0
- data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
- data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
- data/lib/ucode/models/relationship.rb +57 -0
- data/lib/ucode/models/script.rb +41 -0
- data/lib/ucode/models/special_casing_rule.rb +28 -0
- data/lib/ucode/models/standardized_variant.rb +24 -0
- data/lib/ucode/models/unihan_entry.rb +23 -0
- data/lib/ucode/models.rb +47 -0
- data/lib/ucode/parsers/auxiliary.rb +26 -0
- data/lib/ucode/parsers/base.rb +137 -0
- data/lib/ucode/parsers/bidi_brackets.rb +41 -0
- data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
- data/lib/ucode/parsers/blocks.rb +63 -0
- data/lib/ucode/parsers/case_folding.rb +53 -0
- data/lib/ucode/parsers/cjk_radicals.rb +102 -0
- data/lib/ucode/parsers/derived_age.rb +59 -0
- data/lib/ucode/parsers/derived_core_properties.rb +60 -0
- data/lib/ucode/parsers/extracted_properties.rb +74 -0
- data/lib/ucode/parsers/name_aliases.rb +44 -0
- data/lib/ucode/parsers/named_sequences.rb +51 -0
- data/lib/ucode/parsers/names_list.rb +250 -0
- data/lib/ucode/parsers/property_aliases.rb +41 -0
- data/lib/ucode/parsers/property_value_aliases.rb +46 -0
- data/lib/ucode/parsers/script_extensions.rb +64 -0
- data/lib/ucode/parsers/scripts.rb +60 -0
- data/lib/ucode/parsers/special_casing.rb +62 -0
- data/lib/ucode/parsers/standardized_variants.rb +56 -0
- data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
- data/lib/ucode/parsers/unicode_data.rb +268 -0
- data/lib/ucode/parsers/unihan.rb +125 -0
- data/lib/ucode/parsers.rb +35 -0
- data/lib/ucode/range_entry.rb +58 -0
- data/lib/ucode/repo/aggregate_writer.rb +364 -0
- data/lib/ucode/repo/atomic_writes.rb +48 -0
- data/lib/ucode/repo/codepoint_writer.rb +96 -0
- data/lib/ucode/repo/paths.rb +122 -0
- data/lib/ucode/repo.rb +22 -0
- data/lib/ucode/site/config_emitter.rb +124 -0
- data/lib/ucode/site/generator.rb +178 -0
- data/lib/ucode/site/search_index.rb +68 -0
- data/lib/ucode/site/template/.gitignore +4 -0
- data/lib/ucode/site/template/.vitepress/config.ts +8 -0
- data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
- data/lib/ucode/site/template/char/[codepoint].md +13 -0
- data/lib/ucode/site/template/components/BlockView.vue +57 -0
- data/lib/ucode/site/template/components/CharView.vue +85 -0
- data/lib/ucode/site/template/components/PlaneView.vue +56 -0
- data/lib/ucode/site/template/components/SearchView.vue +66 -0
- data/lib/ucode/site/template/index.md +25 -0
- data/lib/ucode/site/template/package.json +18 -0
- data/lib/ucode/site/template/search.md +9 -0
- data/lib/ucode/site.rb +13 -0
- data/lib/ucode/version.rb +5 -0
- data/lib/ucode/version_resolver.rb +76 -0
- data/lib/ucode.rb +74 -0
- data/ucode.gemspec +56 -0
- metadata +404 -0
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "open3"
require "pathname"
require "tempfile"
|
|
5
|
+
|
|
6
|
+
require_relative "../../error"
|
|
7
|
+
require_relative "font_entry"
|
|
8
|
+
require_relative "tounicode"
|
|
9
|
+
|
|
10
|
+
module Ucode
|
|
11
|
+
module Glyphs
|
|
12
|
+
module EmbeddedFonts
|
|
13
|
+
# Walks the Code Charts PDF once and builds a global
|
|
14
|
+
# `{codepoint => FontEntry}` index.
|
|
15
|
+
#
|
|
16
|
+
# Discovery uses `mutool info` for the font list (one line per
|
|
17
|
+
# page-font), then `mutool show -g` to fetch the Type0 font dicts,
|
|
18
|
+
# their descendant CIDFont dicts, and the FontDescriptors — all in
|
|
19
|
+
# a handful of batched subprocess calls rather than one per font.
|
|
20
|
+
#
|
|
21
|
+
# For each Type0 font we then fetch its ToUnicode CMap stream
|
|
22
|
+
# (one `mutool show -b -o <tmpfile>` per font — these can't be
|
|
23
|
+
# batched because each is a separate stream) and parse it into a
|
|
24
|
+
# `{cid => codepoint}` map. With `/CIDToGIDMap /Identity` (the
|
|
25
|
+
# only form we currently support), `gid == cid`, so the per-font
|
|
26
|
+
# map is directly `{codepoint => gid}`.
|
|
27
|
+
#
|
|
28
|
+
# When multiple fonts cover the same codepoint (which happens for
|
|
29
|
+
# a handful of codepoints that appear in multiple blocks), the
|
|
30
|
+
# first font discovered wins. The discovery order follows the
|
|
31
|
+
# `mutool info` listing, which is page-major, so the earlier
|
|
32
|
+
# block's font wins — the expected behavior for the Code Charts.
|
|
33
|
+
class Catalog
  # One Type0 row of the `mutool info` font listing:
  #   `\t<page>\t(<page_obj> 0 R):\tType0 '<name>' <enc> (<font_obj> 0 R)`
  # Capture 1 is the base font name, capture 2 the font object id.
  TYPE0_LINE = /Type0\s+'([^']+)'\s+\S+\s+\((\d+)\s+0\s+R\)/

  # Per-key patterns applied to a one-line dictionary body printed by
  # `mutool show -g`. Each pattern captures either a bare name (without
  # the leading slash) or the object number of an indirect reference.
  # Whitespace between a key and a name value is optional in PDF syntax,
  # so both `/BaseFont/Foo` and `/BaseFont /Foo` are accepted.
  DICT_FIELDS = {
    "BaseFont" => %r{/BaseFont\s*/([^\s/<>]+)},
    "DescendantFonts" => %r{/DescendantFonts\s*\[\s*(\d+)\s+0\s+R\s*\]},
    "ToUnicode" => %r{/ToUnicode\s+(\d+)\s+0\s+R},
    "FontDescriptor" => %r{/FontDescriptor\s+(\d+)\s+0\s+R},
    "FontFile2" => %r{/FontFile2\s+(\d+)\s+0\s+R},
    "FontFile3" => %r{/FontFile3\s+(\d+)\s+0\s+R},
    "CIDToGIDMap" => %r{/CIDToGIDMap(?:\s*/([^\s/<>]+)|\s+(\d+)\s+0\s+R)},
  }.freeze

  private_constant :TYPE0_LINE, :DICT_FIELDS

  # @param source [Source]
  # @param correlator_configs [Hash{Integer=>ContentStreamCorrelator::Config}]
  #   maps a Type0 font's PDF object ID to the pillar-2 config to
  #   use when the font has no /ToUnicode CMap. Empty by default
  #   — fonts without ToUnicode and without a config are skipped
  #   (the v0.1 behavior).
  def initialize(source, correlator_configs: {})
    @source = source
    @correlator_configs = correlator_configs
    @index = nil
  end

  # @return [Hash{Integer=>FontEntry}] frozen codepoint → entry map
  def index
    @index ||= build_index.freeze
  end

  # @param codepoint [Integer]
  # @return [FontEntry, nil]
  def lookup(codepoint)
    index[codepoint]
  end

  # @return [Array<Integer>] every codepoint this PDF covers
  def codepoints
    index.keys
  end

  # @return [Integer] number of codepoints covered
  def size
    index.size
  end

  # @return [Integer] number of font entries that were built
  def font_count
    font_entries.size
  end

  # @return [Array<FontEntry>] every font entry (one per usable Type0 font)
  def font_entries
    @font_entries ||= build_font_entries
  end

  private

  # Merge the per-font codepoint lists into one map. Entries are visited
  # in discovery order and `||=` keeps the first entry seen for a
  # codepoint.
  def build_index
    font_entries.each_with_object({}) do |entry, idx|
      entry.codepoints.each { |cp| idx[cp] ||= entry }
    end
  end

  # Step 1: parse `mutool info` for the Type0 font list.
  # Step 2: batch `mutool show -g` to get the Type0 dicts.
  # Step 3: batch `mutool show -g` for the descendant CIDFont dicts.
  # Step 4: batch `mutool show -g` for the FontDescriptors.
  # Step 5: for each font, build an entry (fetching + parsing the
  #         ToUnicode CMap or running the correlator fallback).
  #
  # @return [Array<FontEntry>]
  def build_font_entries
    type0_refs = discover_type0_fonts
    return [] if type0_refs.empty?

    type0_dicts = fetch_objects(type0_refs.keys)
    descendant_refs = type0_refs.keys.filter_map do |font_obj_id|
      first_ref(type0_dicts.fetch(font_obj_id, {})["DescendantFonts"])
    end

    descendant_dicts = fetch_objects(descendant_refs)
    fontdesc_refs = descendant_dicts.each_value.filter_map do |dict|
      first_ref(dict["FontDescriptor"])
    end

    fontdesc_dicts = fetch_objects(fontdesc_refs)

    # Walk again, now with all dicts in hand, and build entries.
    type0_refs.filter_map do |font_obj_id, base_font|
      build_entry(
        font_obj_id: font_obj_id,
        base_font: base_font,
        type0_dict: type0_dicts[font_obj_id],
        descendant_dicts: descendant_dicts,
        fontdesc_dicts: fontdesc_dicts,
      )
    end
  end

  # Run `mutool info` and parse its Type0 font rows.
  #
  # @return [Hash{Integer=>String}] `{font_obj_id => base_font}` in
  #   first-seen order
  # @raise [Ucode::EmbeddedFontsMissingError] when mutool exits non-zero
  def discover_type0_fonts
    # `mutool info` writes its report to STDERR, not STDOUT, so both
    # streams are scanned.
    out, err, status = Open3.capture3("mutool", "info", @source.pdf_to_s)
    report = out + err
    unless status.success?
      raise Ucode::EmbeddedFontsMissingError,
            "mutool info failed: #{report.strip}"
    end

    parse_type0_listing(report)
  end

  # Pure parsing half of {#discover_type0_fonts}.
  #
  # @param text [String] combined `mutool info` output
  # @return [Hash{Integer=>String}] `{font_obj_id => base_font}`; a font
  #   object listed on several pages keeps its first-seen position/name
  def parse_type0_listing(text)
    text.each_line.with_object({}) do |line, fonts|
      next unless line.include?("Type0")

      m = TYPE0_LINE.match(line)
      next unless m

      fonts[m[2].to_i] ||= m[1]
    end
  end

  # Batch `mutool show -g` for many object numbers at once.
  #
  # @param obj_ids [Array<Integer>]
  # @return [Hash{Integer=>Hash{String=>String}}] `{obj_id => parsed_dict}`
  # @raise [Ucode::EmbeddedFontsMissingError] when mutool exits non-zero
  def fetch_objects(obj_ids)
    return {} if obj_ids.empty?

    # Duplicate ids would only make mutool print the same object twice.
    args = ["mutool", "show", "-g", @source.pdf_to_s, *obj_ids.uniq.map(&:to_s)]
    out, err, status = Open3.capture3(*args)
    unless status.success?
      raise Ucode::EmbeddedFontsMissingError,
            "mutool show failed: #{err.strip}"
    end

    parse_grep_output(out)
  end

  # Parse the `mutool show -g` output: one `<id> 0 obj <<...>>` per line.
  # Lines that do not start with an object header are ignored.
  #
  # @param text [String]
  # @return [Hash{Integer=>Hash{String=>String}}]
  def parse_grep_output(text)
    text.each_line.with_object({}) do |line, result|
      m = line.match(/^(\d+)\s+0\s+obj\s+(.*)$/)
      result[m[1].to_i] = parse_dict(m[2]) if m
    end
  end

  # We don't try to fully parse the PDF dict grammar. Instead we
  # regex each field we need directly out of the dict body. This
  # is robust to `<<...>>`/`[...]` nesting and to `/Key/Value`
  # pairs (no whitespace) that break naive whitespace-split parsers.
  #
  # @param body [String, nil] dictionary text
  # @return [Hash{String=>String}] only the keys that were found; values
  #   are the captured name or digit run as strings
  def parse_dict(body)
    body = body.to_s
    DICT_FIELDS.transform_values { |pattern| field_match(body, pattern) }.compact
  end

  # @return [String, nil] the first non-nil capture of `regex` in `body`
  def field_match(body, regex)
    m = body.match(regex)
    return nil unless m

    m.captures.compact.first
  end

  # Cast a captured integer string into an Integer, tolerant of nil.
  # {parse_dict}'s regexes already extract just the digit run. The
  # explicit base keeps a leading-zero run such as "010" decimal rather
  # than letting Kernel#Integer read it as octal.
  #
  # @param value [String, nil]
  # @return [Integer, nil]
  def first_ref(value)
    return nil if value.nil? || value.empty?

    Integer(value, 10)
  end

  # Assemble one FontEntry from the already-fetched dictionaries.
  #
  # @param type0_dict [Hash, nil] parsed Type0 dict; nil when mutool
  #   printed nothing parseable for that object
  # @return [FontEntry, nil] nil when any link of the chain
  #   (descendant font, font descriptor, font file, identity
  #   CIDToGIDMap, non-empty codepoint map) is missing
  def build_entry(font_obj_id:, base_font:, type0_dict:,
                  descendant_dicts:, fontdesc_dicts:)
    # A font whose dict could not be fetched is skipped, not a crash.
    type0 = type0_dict || {}
    desc_ref = first_ref(type0["DescendantFonts"])
    tu_ref = first_ref(type0["ToUnicode"])
    return nil unless desc_ref

    desc_dict = descendant_dicts[desc_ref] || {}
    fd_dict = fontdesc_for(desc_dict, fontdesc_dicts)
    return nil unless fd_dict

    fontfile_obj_id, fontfile_kind = resolve_fontfile(fd_dict)
    return nil unless fontfile_obj_id

    cid_map_kind = resolve_cid_to_gid(desc_dict)
    return nil unless cid_map_kind

    cp_to_gid = build_codepoint_to_gid(
      font_obj_id: font_obj_id,
      tu_ref: tu_ref,
      cid_map_kind: cid_map_kind,
    )
    return nil if cp_to_gid.empty?

    FontEntry.new(
      base_font: base_font,
      font_obj_id: font_obj_id,
      fontfile_obj_id: fontfile_obj_id,
      fontfile_kind: fontfile_kind,
      tounicode_obj_id: tu_ref,
      cid_to_gid_map: cid_map_kind,
      codepoint_to_gid: cp_to_gid.freeze,
      source: @source,
    )
  end

  # @return [Hash, nil] the FontDescriptor dict referenced by `desc_dict`
  def fontdesc_for(desc_dict, fontdesc_dicts)
    fd_ref = first_ref(desc_dict["FontDescriptor"])
    return nil unless fd_ref

    fontdesc_dicts[fd_ref]
  end

  # Tier-1 path: parse the /ToUnicode CMap. Pillar-2 fallback:
  # when no /ToUnicode is present, consult the correlator_configs
  # registry — if the user supplied a config for this font, render
  # the relevant page(s) to SVG and run positional correlation.
  # Returns an empty hash when neither path produces a map (the
  # caller treats that as "skip this font").
  def build_codepoint_to_gid(font_obj_id:, tu_ref:, cid_map_kind:)
    return {} if cid_map_kind != :identity

    return codepoint_map_from_tounicode(tu_ref) if tu_ref

    codepoint_map_from_correlator(font_obj_id)
  end

  def codepoint_map_from_tounicode(tu_ref)
    cmap_text = fetch_tounicode(tu_ref)
    build_codepoint_map(ToUnicode.parse(cmap_text), :identity)
  end

  def codepoint_map_from_correlator(font_obj_id)
    config = @correlator_configs[font_obj_id]
    return {} unless config

    svg = render_pages(config.page_numbers)
    ContentStreamCorrelator.new(config).correlate(svg)
  end

  # @return [Array(Integer, Symbol), nil] font-file object id plus
  #   `:ttf` (FontFile2) or `:cff` (FontFile3); nil when neither key
  #   is present
  def resolve_fontfile(fd_dict)
    if fd_dict.key?("FontFile2")
      [first_ref(fd_dict["FontFile2"]), :ttf]
    elsif fd_dict.key?("FontFile3")
      [first_ref(fd_dict["FontFile3"]), :cff]
    end
  end

  # parse_dict captures the name without the leading slash, so
  # "/Identity" comes through as "Identity". A stream-form map
  # is captured as the integer obj id — not supported yet.
  #
  # @return [Symbol, nil] `:identity`, or nil for a missing or
  #   unsupported map
  def resolve_cid_to_gid(desc_dict)
    :identity if desc_dict["CIDToGIDMap"].to_s == "Identity"
  end

  # Dump one decoded stream object to a temp file and read it back.
  #
  # @param obj_id [Integer] ToUnicode stream object id
  # @return [String] stream bytes tagged as UTF-8
  # @raise [Ucode::EmbeddedFontsMissingError] when mutool fails
  def fetch_tounicode(obj_id)
    Tempfile.create("ucode-tounicode") do |tmp|
      # Close our handle first; mutool writes to the path itself.
      tmp.close
      ok = system("mutool", "show", "-o", tmp.path, "-b",
                  @source.pdf_to_s, obj_id.to_s,
                  out: File::NULL, err: File::NULL)
      unless ok
        raise Ucode::EmbeddedFontsMissingError,
              "mutool show failed for ToUnicode obj=#{obj_id}"
      end

      File.binread(tmp.path).force_encoding("UTF-8")
    end
  end

  # Render the given 1-based PDF pages to a single SVG string
  # suitable for {ContentStreamCorrelator#correlate}. Output is
  # captured from mutool's stdout.
  #
  # @raise [Ucode::EmbeddedFontsMissingError] when mutool exits non-zero
  def render_pages(page_numbers)
    return "" if page_numbers.nil? || page_numbers.empty?

    out, err, status = run_mutool_draw(page_numbers)
    unless status.success?
      raise Ucode::EmbeddedFontsMissingError,
            "mutool draw failed: #{err.strip}"
    end

    out
  end

  def run_mutool_draw(page_numbers)
    Open3.capture3(
      "mutool", "draw", "-F", "svg",
      @source.pdf_to_s,
      *page_numbers.map(&:to_s)
    )
  end

  # Invert a `{cid => codepoint}` map into `{codepoint => gid}`.
  # With /CIDToGIDMap /Identity, gid == cid. When several CIDs share a
  # codepoint, the one iterated last is kept.
  def build_codepoint_map(cid_to_cp, cid_map_kind)
    return {} if cid_to_cp.empty? || cid_map_kind != :identity

    cid_to_cp.each_with_object({}) do |(cid, cp), h|
      h[cp] = cid
    end
  end
end
|
|
370
|
+
end
|
|
371
|
+
end
|
|
372
|
+
end
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
|
|
4
|
+
module Glyphs
|
|
5
|
+
module EmbeddedFonts
|
|
6
|
+
# Pillar 2 fallback: build a `{codepoint => gid}` map for a Type0
|
|
7
|
+
# font whose PDF object graph has no `/ToUnicode` CMap stream.
|
|
8
|
+
#
|
|
9
|
+
# The Code Charts draw every chart cell as a `<use>` element that
|
|
10
|
+
# references the font's GID via an `href` of the form
|
|
11
|
+
# `#font_<font_obj_id>_<gid>`. The chart also prints the row +
|
|
12
|
+
# column codepoint labels using one or more "label" fonts (small
|
|
13
|
+
# Latin glyphs) that show the hex codepoint as text. By clustering
|
|
14
|
+
# the labels positionally (Y-bucket for the row, X-bucket for the
|
|
15
|
+
# column) we recover the codepoint each cluster represents, then
|
|
16
|
+
# match each cluster positionally to the specimen glyph at the
|
|
17
|
+
# same Y/X position.
|
|
18
|
+
#
|
|
19
|
+
# The algorithm generalizes the Tai Yo correlator that was tested
|
|
20
|
+
# against `data/pdfs/U1E6C0.pdf` (50/52 specimen codepoints
|
|
21
|
+
# matched, with the two missing being layout edge cases). The
|
|
22
|
+
# bucket sizes are configurable because some blocks use a tighter
|
|
23
|
+
# grid than others.
|
|
24
|
+
#
|
|
25
|
+
# Inputs are deliberately pure: a string of SVG markup plus a
|
|
26
|
+
# {Config}. The catalog is responsible for sourcing the SVG (by
|
|
27
|
+
# rendering the relevant PDF page(s) via `mutool draw -F svg`) and
|
|
28
|
+
# for knowing which font_obj_ids are labels vs specimen on that
|
|
29
|
+
# page. That keeps this class trivially testable with synthetic
|
|
30
|
+
# SVG fixtures.
|
|
31
|
+
class ContentStreamCorrelator
  # Per-font / per-block configuration.
  #
  # @!attribute label_font_ids [Array<Integer>] Type0 font object
  #   IDs whose glyphs print the hex codepoint labels on the page.
  # @!attribute specimen_font_id [Integer] Type0 font object ID
  #   whose glyphs are the specimens we want to attribute.
  # @!attribute page_numbers [Array<Integer>] 1-based PDF page
  #   numbers whose content streams reference the specimen font.
  # @!attribute y_bucket [Float] vertical clustering granularity.
  #   Falls back to {DEFAULT_Y_BUCKET} when nil.
  # @!attribute x_bucket [Float] horizontal clustering granularity.
  #   Falls back to {DEFAULT_X_BUCKET} when nil.
  Config = Struct.new(
    :label_font_ids,
    :specimen_font_id,
    :page_numbers,
    :y_bucket,
    :x_bucket,
    keyword_init: true,
  )

  DEFAULT_Y_BUCKET = 1.5
  DEFAULT_X_BUCKET = 50.0

  # Internal value object for a parsed `<use>` element. Public so
  # the spec can construct realistic fixtures without re-implementing
  # the parser shape.
  Use = Struct.new(:font_id, :gid, :text, :x, :y, keyword_init: true)

  # Largest valid Unicode scalar value; decoded labels above it are noise.
  MAX_CODEPOINT = 0x10FFFF

  # Self-closing `<use .../>` element. Quoted attribute values are
  # consumed as a unit so a `/` or `>` inside one (e.g. `data-text="/"`)
  # does not end the match early.
  USE_ELEMENT = %r{<use\s((?:[^>"]|"[^"]*")*?)/>}

  # Last two terms (e, f) of a six-term `matrix(...)` transform.
  # Accepts exponent notation and optional whitespace around the commas.
  MATRIX_ORIGIN =
    /matrix\([^,]+,[^,]+,[^,]+,[^,]+,\s*([-+\d.eE]+)\s*,\s*([-+\d.eE]+)\s*\)/

  # href fragment of the form `#font_<font_id>_<gid>`.
  FONT_HREF = /#font_(\d+)_(\d+)\z/

  # A label cluster must read as 4–6 hex digits to count as a codepoint.
  HEX_LABEL = /\A[0-9A-Fa-f]{4,6}\z/

  # Hexadecimal (`&#x41;`) or decimal (`&#65;`) character reference.
  NUMERIC_ENTITY = /&#(?:[xX]([0-9a-fA-F]+)|(\d+));/

  private_constant :MAX_CODEPOINT, :USE_ELEMENT, :MATRIX_ORIGIN,
                   :FONT_HREF, :HEX_LABEL, :NUMERIC_ENTITY

  # @param config [Config]
  def initialize(config)
    @config = config
    @y_bucket = config.y_bucket || DEFAULT_Y_BUCKET
    @x_bucket = config.x_bucket || DEFAULT_X_BUCKET
  end

  # @param svg [String] rendered PDF page(s) as SVG markup. May
  #   contain multiple `<svg>` documents concatenated (one per
  #   page); the regex scan handles either case.
  # @return [Hash{Integer=>Integer}] codepoint => gid. Empty if
  #   no clusters could be matched.
  def correlate(svg)
    uses = parse_uses(svg)
    return {} if uses.empty?

    partition_and_map(uses)
  end

  private

  def partition_and_map(uses)
    labels, specimens = partition_uses(uses)
    return {} if labels.empty? || specimens.empty?

    cp_per_cluster = decode_label_clusters(labels)
    return {} if cp_per_cluster.empty?

    build_mapping(cp_per_cluster, group_rows(specimens))
  end

  # Split the uses into label glyphs and specimen glyphs by font id.
  # A nil `label_font_ids` is treated as "no label fonts".
  #
  # @return [Array(Array<Use>, Array<Use>)] labels, specimens
  def partition_uses(uses)
    label_ids = Array(@config.label_font_ids)
    labels = uses.select { |u| label_ids.include?(u.font_id) }
    specimens = uses.select { |u| u.font_id == @config.specimen_font_id }
    [labels, specimens]
  end

  # Match `<use .../>` elements and pull out the font id and gid from
  # the href, plus the transform matrix's e and f terms (which give
  # the X/Y origin). Elements lacking either are dropped.
  #
  # @return [Array<Use>]
  def parse_uses(svg)
    svg.scan(USE_ELEMENT).filter_map { |(attrs)| use_from_attrs(attrs) }
  end

  def use_from_attrs(attrs)
    font_match = match_font_ref(attrs)
    return nil unless font_match

    origin = attrs.match(MATRIX_ORIGIN)
    return nil unless origin

    build_use(attrs, font_match, origin)
  end

  def match_font_ref(attrs)
    href = extract_href(attrs)
    return nil unless href

    href.match(FONT_HREF)
  end

  def build_use(attrs, font_match, transform)
    Use.new(
      font_id: font_match[1].to_i,
      gid: font_match[2].to_i,
      # Raw attribute text; entities are decoded later, per cluster.
      text: attrs[/data-text="([^"]*)"/, 1] || "",
      x: transform[1].to_f,
      y: transform[2].to_f,
    )
  end

  # Prefer the namespaced `xlink:href`, fall back to a plain `href`.
  def extract_href(attrs)
    attrs[/xlink:href="([^"]+)"/, 1] || attrs[/href="([^"]+)"/, 1]
  end

  # Cluster label uses by quantized (Y, X) position. Within each
  # cluster, members are sorted by X so that joined text reads
  # left-to-right (hex codepoint string).
  #
  # @return [Hash{Array(Numeric, Numeric)=>Integer}] `[y, x]` bucket
  #   key => decoded codepoint
  def decode_label_clusters(labels)
    decode_each_cluster(bucket_labels_by_position(labels))
  end

  def bucket_labels_by_position(labels)
    labels.each_with_object(Hash.new { |h, k| h[k] = [] }) do |label, clusters|
      key = [bucket(label.y, @y_bucket), bucket(label.x, @x_bucket)]
      clusters[key] << label
    end
  end

  # Keep only clusters whose joined text is a 4–6 digit hex string
  # naming a value inside the Unicode range.
  def decode_each_cluster(clusters)
    clusters.each_with_object({}) do |(key, members), decoded|
      text = members.sort_by(&:x).map { |m| decode_entities(m.text) }.join
      next unless text.match?(HEX_LABEL)

      cp = text.to_i(16)
      decoded[key] = cp if cp <= MAX_CODEPOINT
    end
  end

  # Group any set of uses (labels or specimens) by Y-bucket; sort
  # each row by X so positional matching is straightforward.
  def group_rows(uses)
    rows = Hash.new { |h, k| h[k] = [] }
    uses.each { |u| rows[bucket(u.y, @y_bucket)] << u }
    rows.each_value { |row| row.sort_by!(&:x) }
    rows
  end

  # Within each Y-row, the rightmost label cluster is the
  # specimen codepoint; the rightmost specimen glyph is the
  # specimen GID. The preceding label clusters (if any) are
  # cross-reference codepoints, matched by index (left to right) to
  # the preceding specimen glyphs in the same row. Rows are processed
  # in ascending Y-bucket order, so a later row overwrites an earlier
  # assignment for the same codepoint.
  def build_mapping(cp_per_cluster, specimen_rows)
    cp_rows = group_cps_by_row(cp_per_cluster)
    cp_rows.keys.sort.each_with_object({}) do |yb, mapping|
      assign_row(mapping, cp_rows[yb], specimen_rows[yb] || [])
    end
  end

  def assign_row(mapping, cps, glyphs)
    return if cps.empty? || glyphs.empty?

    mapping[cps.last] = glyphs.last.gid
    assign_xrefs(mapping, cps[0...-1], glyphs[0...-1])
  end

  def assign_xrefs(mapping, xref_cps, xref_glyphs)
    xref_cps.each_with_index do |cp, i|
      glyph = xref_glyphs[i]
      mapping[cp] = glyph.gid if glyph
    end
  end

  # @return [Hash{Numeric=>Array<Integer>}] Y-bucket => codepoints
  #   ordered by their cluster's X-bucket
  def group_cps_by_row(cp_per_cluster)
    rows = Hash.new { |h, k| h[k] = [] }
    cp_per_cluster.each { |(yb, xb), cp| rows[yb] << [cp, xb] }
    rows.transform_values { |pairs| pairs.sort_by { |_, xb| xb }.map(&:first) }
  end

  # Quantize `value` to the nearest multiple of `size`.
  def bucket(value, size)
    (value / size).round * size
  end

  # Replace numeric character references with the character they
  # name. A reference beyond the Unicode range is left as-is, which
  # makes the surrounding cluster fail the hex-label check.
  def decode_entities(text)
    text.gsub(NUMERIC_ENTITY) do |entity|
      m = Regexp.last_match
      value = m[1] ? m[1].to_i(16) : m[2].to_i
      value <= MAX_CODEPOINT ? [value].pack("U") : entity
    end
  end
end
|
|
226
|
+
end
|
|
227
|
+
end
|
|
228
|
+
end
|