ucode 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ucode/code_chart/extractor.rb +1 -9
- data/lib/ucode/code_chart/writer.rb +1 -1
- data/lib/ucode/commands/canonical_build.rb +4 -4
- data/lib/ucode/commands/universal_set.rb +5 -3
- data/lib/ucode/coordinator/enrichment/bidi.rb +35 -0
- data/lib/ucode/coordinator/enrichment/binary.rb +38 -0
- data/lib/ucode/coordinator/enrichment/casing.rb +55 -0
- data/lib/ucode/coordinator/enrichment/cjk.rb +49 -0
- data/lib/ucode/coordinator/enrichment/display.rb +36 -0
- data/lib/ucode/coordinator/enrichment/emoji.rb +36 -0
- data/lib/ucode/coordinator/enrichment/identity.rb +42 -0
- data/lib/ucode/coordinator/enrichment/indic.rb +32 -0
- data/lib/ucode/coordinator/enrichment/names.rb +63 -0
- data/lib/ucode/coordinator/enrichment/segmentation.rb +34 -0
- data/lib/ucode/coordinator/enrichment.rb +51 -0
- data/lib/ucode/coordinator/range_lookup.rb +65 -0
- data/lib/ucode/coordinator.rb +4 -276
- data/lib/ucode/glyphs/embedded_fonts/catalog.rb +32 -376
- data/lib/ucode/glyphs/embedded_fonts/codepoint_mapper.rb +130 -0
- data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +25 -124
- data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +0 -1
- data/lib/ucode/glyphs/embedded_fonts/pdf_indexer.rb +236 -0
- data/lib/ucode/glyphs/embedded_fonts/{source.rb → pdf_location.rb} +5 -5
- data/lib/ucode/glyphs/embedded_fonts/positional_matcher.rb +162 -0
- data/lib/ucode/glyphs/embedded_fonts/raw_font_descriptor.rb +24 -0
- data/lib/ucode/glyphs/embedded_fonts/renderer.rb +0 -2
- data/lib/ucode/glyphs/embedded_fonts/trace_correlator.rb +54 -168
- data/lib/ucode/glyphs/embedded_fonts/writer.rb +0 -4
- data/lib/ucode/glyphs/embedded_fonts.rb +5 -1
- data/lib/ucode/glyphs/resolver_factory.rb +45 -0
- data/lib/ucode/glyphs/sources/pillar1_embedded_tounicode.rb +1 -1
- data/lib/ucode/glyphs.rb +1 -0
- data/lib/ucode/version.rb +1 -1
- metadata +20 -3
|
@@ -6,28 +6,15 @@ module Ucode
|
|
|
6
6
|
# Pillar 2 fallback: build a `{codepoint => gid}` map for a Type0
|
|
7
7
|
# font whose PDF object graph has no `/ToUnicode` CMap stream.
|
|
8
8
|
#
|
|
9
|
-
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
# Latin glyphs) that show the hex codepoint as text. By clustering
|
|
14
|
-
# the labels positionally (Y-bucket for the row, X-bucket for the
|
|
15
|
-
# column) we recover the codepoint each cluster represents, then
|
|
16
|
-
# match each cluster positionally to the specimen glyph at the
|
|
17
|
-
# same Y/X position.
|
|
9
|
+
# Adapter for the `mutool draw -F svg` output format: parses
|
|
10
|
+
# `<use>` elements from the rendered PDF page SVG, partitions into
|
|
11
|
+
# labels and specimens by PDF font object ID (supplied via {Config}),
|
|
12
|
+
# then delegates matching to {PositionalMatcher}.
|
|
18
13
|
#
|
|
19
|
-
# The
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
#
|
|
23
|
-
# grid than others.
|
|
24
|
-
#
|
|
25
|
-
# Inputs are deliberately pure: a string of SVG markup plus a
|
|
26
|
-
# {Config}. The catalog is responsible for sourcing the SVG (by
|
|
27
|
-
# rendering the relevant PDF page(s) via `mutool draw -F svg`) and
|
|
28
|
-
# for knowing which font_obj_ids are labels vs specimen on that
|
|
29
|
-
# page. That keeps this class trivially testable with synthetic
|
|
30
|
-
# SVG fixtures.
|
|
14
|
+
# The SVG parsing (regex-based `<use>` extraction, HTML entity
|
|
15
|
+
# decoding) is the only piece of format-specific work here. The
|
|
16
|
+
# matching algorithm lives in {PositionalMatcher} and is shared
|
|
17
|
+
# with {TraceCorrelator}.
|
|
31
18
|
class ContentStreamCorrelator
|
|
32
19
|
# Per-font / per-block configuration.
|
|
33
20
|
#
|
|
@@ -37,25 +24,13 @@ module Ucode
|
|
|
37
24
|
# whose glyphs are the specimens we want to attribute.
|
|
38
25
|
# @!attribute page_numbers [Array<Integer>] 1-based PDF page
|
|
39
26
|
# numbers whose content streams reference the specimen font.
|
|
40
|
-
# @!attribute y_bucket [Float] vertical clustering granularity
|
|
41
|
-
# in PDF points. Default 1.5 — matches mutool's text matrix
|
|
42
|
-
# granularity for the row labels.
|
|
43
|
-
# @!attribute x_bucket [Float] horizontal clustering granularity
|
|
44
|
-
# in PDF points. Default 50.0 — separates label clusters
|
|
45
|
-
# within a row (labels are ~16pt wide, clusters ~60-160pt
|
|
46
|
-
# apart).
|
|
47
27
|
Config = Struct.new(
|
|
48
28
|
:label_font_ids,
|
|
49
29
|
:specimen_font_id,
|
|
50
30
|
:page_numbers,
|
|
51
|
-
:y_bucket,
|
|
52
|
-
:x_bucket,
|
|
53
31
|
keyword_init: true,
|
|
54
32
|
)
|
|
55
33
|
|
|
56
|
-
DEFAULT_Y_BUCKET = 1.5
|
|
57
|
-
DEFAULT_X_BUCKET = 50.0
|
|
58
|
-
|
|
59
34
|
# Internal value object for a parsed `<use>` element. Public so
|
|
60
35
|
# the spec can construct realistic fixtures without re-implementing
|
|
61
36
|
# the parser shape.
|
|
@@ -64,8 +39,6 @@ module Ucode
|
|
|
64
39
|
# @param config [Config]
|
|
65
40
|
def initialize(config)
|
|
66
41
|
@config = config
|
|
67
|
-
@y_bucket = config.y_bucket || DEFAULT_Y_BUCKET
|
|
68
|
-
@x_bucket = config.x_bucket || DEFAULT_X_BUCKET
|
|
69
42
|
end
|
|
70
43
|
|
|
71
44
|
# @param svg [String] rendered PDF page(s) as SVG markup. May
|
|
@@ -77,29 +50,33 @@ module Ucode
|
|
|
77
50
|
uses = parse_uses(svg)
|
|
78
51
|
return {} if uses.empty?
|
|
79
52
|
|
|
80
|
-
partition_and_map(uses)
|
|
81
|
-
end
|
|
82
|
-
|
|
83
|
-
private
|
|
84
|
-
|
|
85
|
-
def partition_and_map(uses)
|
|
86
53
|
labels, specimens = partition_uses(uses)
|
|
87
54
|
return {} if labels.empty? || specimens.empty?
|
|
88
55
|
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
56
|
+
PositionalMatcher.match(
|
|
57
|
+
specimens.map { |u| to_position(u) },
|
|
58
|
+
labels.map { |u| to_position(u) },
|
|
59
|
+
)
|
|
93
60
|
end
|
|
94
61
|
|
|
62
|
+
private
|
|
63
|
+
|
|
95
64
|
def partition_uses(uses)
|
|
96
|
-
labels = uses.select
|
|
97
|
-
@config.label_font_ids.include?(u.font_id)
|
|
98
|
-
end
|
|
65
|
+
labels = uses.select { |u| @config.label_font_ids.include?(u.font_id) }
|
|
99
66
|
specimens = uses.select { |u| u.font_id == @config.specimen_font_id }
|
|
100
67
|
[labels, specimens]
|
|
101
68
|
end
|
|
102
69
|
|
|
70
|
+
def to_position(use)
|
|
71
|
+
PositionalMatcher::Position.new(
|
|
72
|
+
x: use.x,
|
|
73
|
+
y: use.y,
|
|
74
|
+
font_ref: use.font_id,
|
|
75
|
+
glyph_id: use.gid,
|
|
76
|
+
text: decode_entities(use.text),
|
|
77
|
+
)
|
|
78
|
+
end
|
|
79
|
+
|
|
103
80
|
# Match `<use .../>` elements and pull out the font_obj_id and
|
|
104
81
|
# gid from the href, plus the text matrix's e and f terms (which
|
|
105
82
|
# give the X/Y origin). The data-text attribute carries the
|
|
@@ -143,82 +120,6 @@ module Ucode
|
|
|
143
120
|
attrs[/xlink:href="([^"]+)"/, 1] || attrs[/href="([^"]+)"/, 1]
|
|
144
121
|
end
|
|
145
122
|
|
|
146
|
-
# Cluster label uses by quantized (Y, X) position. Within each
|
|
147
|
-
# cluster, members are sorted by X so that joined text reads
|
|
148
|
-
# left-to-right (hex codepoint string).
|
|
149
|
-
def decode_label_clusters(labels)
|
|
150
|
-
cluster_members = bucket_labels_by_position(labels)
|
|
151
|
-
decode_each_cluster(cluster_members)
|
|
152
|
-
end
|
|
153
|
-
|
|
154
|
-
def bucket_labels_by_position(labels)
|
|
155
|
-
clusters = Hash.new { |h, k| h[k] = [] }
|
|
156
|
-
labels.each do |label|
|
|
157
|
-
key = [bucket(label.y, @y_bucket), bucket(label.x, @x_bucket)]
|
|
158
|
-
clusters[key] << label
|
|
159
|
-
end
|
|
160
|
-
clusters
|
|
161
|
-
end
|
|
162
|
-
|
|
163
|
-
def decode_each_cluster(clusters)
|
|
164
|
-
clusters.each_with_object({}) do |(key, members), decoded|
|
|
165
|
-
text = members.sort_by(&:x).map { |m| decode_entities(m.text) }.join
|
|
166
|
-
next unless text.match?(/\A[0-9A-Fa-f]{4,6}\z/)
|
|
167
|
-
|
|
168
|
-
decoded[key] = text.to_i(16)
|
|
169
|
-
end
|
|
170
|
-
end
|
|
171
|
-
|
|
172
|
-
# Group any set of uses (labels or specimens) by Y-bucket; sort
|
|
173
|
-
# each row by X so positional matching is straightforward.
|
|
174
|
-
def group_rows(uses)
|
|
175
|
-
rows = Hash.new { |h, k| h[k] = [] }
|
|
176
|
-
uses.each do |u|
|
|
177
|
-
rows[bucket(u.y, @y_bucket)] << u
|
|
178
|
-
end
|
|
179
|
-
rows.each_value { |v| v.sort_by!(&:x) }
|
|
180
|
-
rows
|
|
181
|
-
end
|
|
182
|
-
|
|
183
|
-
# Within each Y-row, the rightmost label cluster is the
|
|
184
|
-
# specimen codepoint; the rightmost specimen glyph is the
|
|
185
|
-
# specimen GID. The preceding label clusters (if any) are
|
|
186
|
-
# cross-reference codepoints, matched positionally to the
|
|
187
|
-
# preceding specimen glyphs in the same row.
|
|
188
|
-
def build_mapping(cp_per_cluster, specimen_rows)
|
|
189
|
-
cp_rows = group_cps_by_row(cp_per_cluster)
|
|
190
|
-
cp_rows.keys.sort.each_with_object({}) do |yb, mapping|
|
|
191
|
-
assign_row(mapping, cp_rows[yb], specimen_rows[yb] || [])
|
|
192
|
-
end
|
|
193
|
-
end
|
|
194
|
-
|
|
195
|
-
def assign_row(mapping, cps, glyphs)
|
|
196
|
-
return if cps.empty? || glyphs.empty?
|
|
197
|
-
|
|
198
|
-
mapping[cps.last] = glyphs.last.gid
|
|
199
|
-
assign_xrefs(mapping, cps[0...-1], glyphs[0...-1])
|
|
200
|
-
end
|
|
201
|
-
|
|
202
|
-
def assign_xrefs(mapping, xref_cps, xref_glyphs)
|
|
203
|
-
xref_cps.each_with_index do |cp, i|
|
|
204
|
-
g = xref_glyphs[i]
|
|
205
|
-
mapping[cp] = g.gid if g
|
|
206
|
-
end
|
|
207
|
-
end
|
|
208
|
-
|
|
209
|
-
def group_cps_by_row(cp_per_cluster)
|
|
210
|
-
rows = Hash.new { |h, k| h[k] = [] }
|
|
211
|
-
cp_per_cluster.each do |(yb, xb), cp|
|
|
212
|
-
rows[yb] << [cp, xb]
|
|
213
|
-
end
|
|
214
|
-
rows.each_value { |v| v.sort_by! { |_, xb| xb } }
|
|
215
|
-
rows.transform_values { |v| v.map(&:first) }
|
|
216
|
-
end
|
|
217
|
-
|
|
218
|
-
def bucket(value, size)
|
|
219
|
-
(value / size).round * size
|
|
220
|
-
end
|
|
221
|
-
|
|
222
123
|
def decode_entities(text)
|
|
223
124
|
text.gsub(/&#x([0-9a-fA-F]+);/) { [$1.to_i(16)].pack("U") }
|
|
224
125
|
end
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "open3"
|
|
4
|
+
require "pathname"
|
|
5
|
+
|
|
6
|
+
module Ucode
|
|
7
|
+
module Glyphs
|
|
8
|
+
module EmbeddedFonts
|
|
9
|
+
# Walks the Code Charts PDF once via `mutool info` + `mutool show`
|
|
10
|
+
# and builds an Array of {RawFontDescriptor} — one per Type0 font
|
|
11
|
+
# that has the required descendant CIDFont, FontDescriptor, and
|
|
12
|
+
# FontFile2/3 + Identity CIDToGIDMap.
|
|
13
|
+
#
|
|
14
|
+
# Pure subprocess + parsing concern. Does NOT resolve codepoint →
|
|
15
|
+
# GID (that's {CodepointMapper}'s job). The descriptor carries
|
|
16
|
+
# every ref the mapper needs to do its work.
|
|
17
|
+
class PdfIndexer
  # Field name => pattern used to pull that field out of a one-line PDF
  # dictionary body. Each pattern has one capture per alternative; the
  # first non-nil capture is the field's value (always a String).
  FIELD_PATTERNS = {
    "BaseFont" => %r{/BaseFont/([^\s/<>]+)},
    "DescendantFonts" => %r{/DescendantFonts\s*\[\s*(\d+)\s+0\s+R\s*\]},
    "ToUnicode" => %r{/ToUnicode\s+(\d+)\s+0\s+R},
    "FontDescriptor" => %r{/FontDescriptor\s+(\d+)\s+0\s+R},
    "FontFile2" => %r{/FontFile2\s+(\d+)\s+0\s+R},
    "FontFile3" => %r{/FontFile3\s+(\d+)\s+0\s+R},
    "CIDToGIDMap" => %r{/CIDToGIDMap(?:/([^\s/<>]+)|\s+(\d+)\s+0\s+R)},
  }.freeze
  private_constant :FIELD_PATTERNS

  # @param source [PdfLocation] only `#pdf_to_s` is called on it; the
  #   result is handed to mutool as the PDF file argument.
  def initialize(source:)
    @source = source
  end

  # Walks Type0 font -> descendant CIDFont -> FontDescriptor and returns
  # one descriptor per font for which every link could be resolved.
  #
  # @return [Array<RawFontDescriptor>] empty when `mutool info` lists no
  #   Type0 fonts
  # @raise [Ucode::EmbeddedFontsMissingError] if a `mutool show` call fails
  def raw_descriptors
    type0_refs = discover_type0_fonts
    return [] if type0_refs.empty?

    type0_dicts = fetch_objects(type0_refs.keys)
    # Only the descendant refs are needed here; the ToUnicode refs that
    # collect_refs also gathers are deliberately discarded.
    descendant_refs, = collect_refs(type0_refs, type0_dicts)
    descendant_dicts = fetch_objects(descendant_refs)
    fontdesc_dicts = fetch_fontdescs(descendant_dicts)

    build_descriptors(type0_refs, type0_dicts, descendant_dicts, fontdesc_dicts)
  end

  # Gathers the DescendantFonts and ToUnicode object ids referenced by
  # the given Type0 font dictionaries. Defined above `private` in the
  # original, so it stays public here.
  #
  # @param type0_refs [Hash{Integer=>String}] font obj id => base font
  # @param type0_dicts [Hash{Integer=>Hash}] font obj id => parsed dict
  # @return [Array(Array<Integer>, Array<Integer>)] descendant refs, then
  #   ToUnicode refs
  def collect_refs(type0_refs, type0_dicts)
    descendant_refs = []
    tounicode_refs = []
    type0_refs.each_key do |font_obj_id|
      dict = type0_dicts[font_obj_id] || {}
      collect_ref(dict["DescendantFonts"], descendant_refs)
      collect_ref(dict["ToUnicode"], tounicode_refs)
    end
    [descendant_refs, tounicode_refs]
  end

  # Appends the object id held in `dict_value` to `acc`, if there is one.
  #
  # @param dict_value [String, nil] raw field value from a parsed dict
  # @param acc [Array<Integer>] accumulator, mutated in place
  def collect_ref(dict_value, acc)
    ref = first_ref(dict_value)
    acc << ref if ref
  end

  # Fetches every FontDescriptor object referenced by the given
  # descendant-font dictionaries.
  #
  # @param descendant_dicts [Hash{Integer=>Hash}]
  # @return [Hash{Integer=>Hash}] FontDescriptor obj id => parsed dict
  def fetch_fontdescs(descendant_dicts)
    fontdesc_refs = []
    descendant_dicts.each_value do |dict|
      collect_ref(dict["FontDescriptor"], fontdesc_refs)
    end
    fetch_objects(fontdesc_refs)
  end

  # @return [Integer] page total parsed from the `Pages:` line of
  #   `mutool info`; falls back to 1 when that line is absent
  def page_count
    @page_count ||= begin
      m = mutool_info_text.match(/^Pages:\s+(\d+)/)
      m ? m[1].to_i : 1
    end
  end

  # @param base_font [String] e.g. "GPJAHB+WolofGaraySansSerif"
  # @return [Boolean] true if `mutool info` lists a Type0 font with
  #   exactly this name
  def font_appears?(base_font)
    font_entries_cache.key?(base_font)
  end

  private

  # Builds descriptors for all fonts, dropping those that cannot be
  # resolved (build_descriptor returns nil for them).
  def build_descriptors(type0_refs, type0_dicts, descendant_dicts, fontdesc_dicts)
    type0_refs.filter_map do |font_obj_id, base_font|
      build_descriptor(
        font_obj_id, base_font, type0_dicts[font_obj_id] || {},
        descendant_dicts, fontdesc_dicts,
      )
    end
  end

  # @return [RawFontDescriptor, nil] nil when the font has no descendant
  #   ref, no fetched FontDescriptor, no FontFile2/FontFile3 ref, or a
  #   CIDToGIDMap other than the name `Identity`
  def build_descriptor(font_obj_id, base_font, type0_dict,
                       descendant_dicts, fontdesc_dicts)
    desc_ref = first_ref(type0_dict["DescendantFonts"])
    return nil unless desc_ref

    tu_ref = first_ref(type0_dict["ToUnicode"])
    desc_dict = descendant_dicts[desc_ref] || {}
    fd_dict = fontdesc_for(desc_dict, fontdesc_dicts)
    return nil unless fd_dict

    fontfile_obj_id, fontfile_kind = resolve_fontfile(fd_dict)
    return nil unless fontfile_obj_id

    cid_map_kind = resolve_cid_to_gid(desc_dict)
    return nil unless cid_map_kind

    RawFontDescriptor.new(
      base_font: base_font,
      font_obj_id: font_obj_id,
      fontfile_obj_id: fontfile_obj_id,
      fontfile_kind: fontfile_kind,
      tounicode_ref: tu_ref,
      cid_map_kind: cid_map_kind,
    )
  end

  # Looks up the already-fetched FontDescriptor dict for a descendant font.
  def fontdesc_for(desc_dict, fontdesc_dicts)
    fd_ref = first_ref(desc_dict["FontDescriptor"])
    return nil unless fd_ref

    fontdesc_dicts[fd_ref]
  end

  # @return [Array(Integer, Symbol), nil] `[obj_id, :ttf]` for FontFile2,
  #   `[obj_id, :cff]` for FontFile3, nil when neither key is present.
  #   FontFile2 wins when both are present.
  def resolve_fontfile(fd_dict)
    if fd_dict.key?("FontFile2")
      [first_ref(fd_dict["FontFile2"]), :ttf]
    elsif fd_dict.key?("FontFile3")
      [first_ref(fd_dict["FontFile3"]), :cff]
    end
  end

  # @return [Symbol, nil] :identity when the field is the name `Identity`;
  #   nil when it is absent or anything else (e.g. a stream reference)
  def resolve_cid_to_gid(desc_dict)
    raw = desc_dict["CIDToGIDMap"]
    return nil if raw.nil?

    raw.to_s == "Identity" ? :identity : nil
  end

  # ---- mutool subprocess + dict parsing ----------------------------

  # Scans `mutool info` output for Type0 font lines.
  #
  # The first base font seen for an object id wins; later lines that
  # repeat the same id are ignored. The result hash's own keys serve as
  # the "already seen" check, so no Set is needed.
  #
  # @return [Hash{Integer=>String}] font obj id => base font name
  def discover_type0_fonts
    mutool_info_text.each_line.each_with_object({}) do |line, fonts|
      next unless line.include?("Type0")

      m = line.match(/Type0\s+'([^']+)'\s+\S+\s+\((\d+)\s+0\s+R\)/)
      next unless m

      font_obj_id = m[2].to_i
      fonts[font_obj_id] = m[1] unless fonts.key?(font_obj_id)
    end
  end

  # Runs `mutool show -g` for the given objects and parses the result.
  #
  # @param obj_ids [Array<Integer>]
  # @return [Hash{Integer=>Hash}] obj id => parsed dict
  # @raise [Ucode::EmbeddedFontsMissingError] on non-zero mutool exit
  def fetch_objects(obj_ids)
    return {} if obj_ids.empty?

    # Results are keyed by object id, so asking for the same object
    # twice only lengthens argv; de-duplicate first.
    args = ["mutool", "show", "-g", @source.pdf_to_s]
    args.concat(obj_ids.uniq.map(&:to_s))
    out, err, status = Open3.capture3(*args)
    unless status.success?
      raise Ucode::EmbeddedFontsMissingError,
            "mutool show failed: #{err.strip}"
    end

    parse_grep_output(out)
  end

  # Parses lines of the form `<num> 0 obj <dict body>`; any other line
  # is ignored.
  def parse_grep_output(text)
    text.each_line.each_with_object({}) do |line, objects|
      m = line.match(/^(\d+)\s+0\s+obj\s+(.*)$/)
      objects[m[1].to_i] = parse_dict(m[2]) if m
    end
  end

  # We don't try to fully parse the PDF dict grammar. Instead we
  # regex each field we need directly out of the dict body.
  #
  # @return [Hash{String=>String}] only the fields that matched
  def parse_dict(body)
    body = body.to_s
    FIELD_PATTERNS.each_with_object({}) do |(field, pattern), dict|
      value = field_match(body, pattern)
      dict[field] = value if value
    end
  end

  # @return [String, nil] first non-nil capture of `regex` in `body`
  def field_match(body, regex)
    m = body.match(regex)
    return nil unless m

    m.captures.compact.first
  end

  # Converts a captured object-number string to an Integer.
  #
  # Base 10 is passed explicitly: a bare `Integer("012")` would read the
  # leading zero as an octal prefix and `Integer("08")` would raise.
  #
  # @param value [String, nil]
  # @return [Integer, nil] nil for nil or empty input
  # @raise [ArgumentError] if `value` is not a decimal integer
  def first_ref(value)
    return nil if value.nil? || value.empty?

    Integer(value, 10)
  end

  # Memoized `mutool info` output (see run_mutool_info).
  def mutool_info_text
    @mutool_info_text ||= run_mutool_info
  end

  # Best-effort: returns stdout followed by stderr on success, and an
  # empty string when mutool exits non-zero (callers then see no fonts
  # and a page count of 1).
  def run_mutool_info
    out, err, status = Open3.capture3("mutool", "info", @source.pdf_to_s)
    status.success? ? out + err : ""
  end

  # Memoized lookup table of every Type0 base font name that appears in
  # the `mutool info` output.
  #
  # @return [Hash{String=>true}]
  def font_entries_cache
    @font_entries_cache ||=
      mutool_info_text.each_line.each_with_object({}) do |line, names|
        next unless line.include?("Type0")

        font_match = line.match(/Type0\s+'([^']+)'/)
        names[font_match[1]] = true if font_match
      end
  end
end
|
|
234
|
+
end
|
|
235
|
+
end
|
|
236
|
+
end
|
|
@@ -2,8 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
require "pathname"
|
|
4
4
|
|
|
5
|
-
require_relative "../../error"
|
|
6
|
-
|
|
7
5
|
module Ucode
|
|
8
6
|
module Glyphs
|
|
9
7
|
module EmbeddedFonts
|
|
@@ -29,7 +27,7 @@ module Ucode
|
|
|
29
27
|
# the BaseFont (e.g. `CIAIIP+Uni2000Generalpunctuation.ttf`).
|
|
30
28
|
# Re-runs skip extraction when the cached file is newer than the
|
|
31
29
|
# PDF.
|
|
32
|
-
class
|
|
30
|
+
class PdfLocation
|
|
33
31
|
attr_reader :pdf_path, :cache_dir
|
|
34
32
|
|
|
35
33
|
# @param pdf [String, Pathname, nil] path to a Code Charts PDF
|
|
@@ -41,8 +39,10 @@ module Ucode
|
|
|
41
39
|
# @raise [Ucode::EmbeddedFontsMissingError] if the PDF is missing
|
|
42
40
|
def initialize(pdf: nil, cache_dir: nil, env: ENV, gem_root: nil)
|
|
43
41
|
@pdf_path = resolve_pdf(pdf, env, gem_root)
|
|
44
|
-
|
|
45
|
-
|
|
42
|
+
unless @pdf_path&.exist?
|
|
43
|
+
raise Ucode::EmbeddedFontsMissingError,
|
|
44
|
+
"Code Charts PDF not found at #{@pdf_path}"
|
|
45
|
+
end
|
|
46
46
|
|
|
47
47
|
@cache_dir = resolve_cache(cache_dir, env, gem_root)
|
|
48
48
|
@cache_dir.mkpath unless @cache_dir.exist?
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
|
|
4
|
+
module Glyphs
|
|
5
|
+
module EmbeddedFonts
|
|
6
|
+
# Shared positional matching for Code Charts specimen attribution.
|
|
7
|
+
#
|
|
8
|
+
# Both {ContentStreamCorrelator} and {TraceCorrelator} need the
|
|
9
|
+
# same algorithm: given a set of specimen glyphs and a set of hex
|
|
10
|
+
# codepoint labels with positions, match each specimen to its
|
|
11
|
+
# nearest valid label cluster by Euclidean distance.
|
|
12
|
+
#
|
|
13
|
+
# This module owns that algorithm. The input is format-agnostic —
|
|
14
|
+
# callers produce {Position} structs from their source format
|
|
15
|
+
# (SVG `<use>` elements or `mutool trace` XML) and delegate here.
|
|
16
|
+
#
|
|
17
|
+
# Handles both Code Charts layouts:
|
|
18
|
+
#
|
|
19
|
+
# 1. **List layout** — label to the LEFT of specimen at the same Y.
|
|
20
|
+
# 2. **Grid layout** — label ABOVE specimen (~12pt higher, same X).
|
|
21
|
+
#
|
|
22
|
+
# Greedy one-to-one matching: each GID and each codepoint is
|
|
23
|
+
# assigned at most once, so a specimen between two labels only
|
|
24
|
+
# claims the closer one.
|
|
25
|
+
module PositionalMatcher
  # Value object: one positioned glyph with text content.
  # font_ref is the font identifier (Integer obj-id for SVG,
  # String font-name for trace); used only for partitioning by
  # the caller, not by the matcher.
  Position = Struct.new(
    :x, :y, :font_ref, :glyph_id, :text, keyword_init: true,
  )

  # Labels whose Y values round to the same multiple of this bucket
  # size are treated as one row.
  DEFAULT_Y_BUCKET = 1.0
  private_constant :DEFAULT_Y_BUCKET

  # Adjacent label chars within one codepoint label are ~4-6 pt
  # apart on X. Different columns are ~30+ pt apart. 10 pt
  # cleanly separates within-label from between-column gaps.
  X_GAP_THRESHOLD = 10.0
  private_constant :X_GAP_THRESHOLD

  # Maximum valid Unicode codepoint.
  UNICODE_MAX = 0x10FFFF
  private_constant :UNICODE_MAX

  # Maximum Euclidean distance from a specimen to its matching
  # label cluster. List-layout labels are ~21 pt to the left;
  # grid-layout labels are ~12 pt above. Header/footer text is
  # always > 30 pt away from any specimen.
  MAX_MATCH_DISTANCE = 30.0
  private_constant :MAX_MATCH_DISTANCE

  # A label cluster is accepted only if its joined text is 4-6 hex digits.
  HEX_LABEL = /\A[0-9A-Fa-f]{4,6}\z/
  private_constant :HEX_LABEL

  module_function

  # Greedy one-to-one assignment of specimens to label clusters, closest
  # pair first. Positions lacking an X or Y coordinate are skipped
  # instead of raising when a distance is computed for them.
  #
  # @param specimens [Array<Position>] positioned specimen glyphs
  # @param labels [Array<Position>] positioned label chars
  # @return [Hash{Integer=>Integer}] codepoint => gid
  def match(specimens, labels)
    specimens = with_coordinates(specimens)
    labels = with_coordinates(labels)
    return {} if specimens.empty? || labels.empty?

    clusters = build_label_clusters(labels)
    return {} if clusters.empty?

    build_mapping(specimens, clusters)
  end

  # @param positions [Array<Position>]
  # @return [Array<Position>] the positions that have both x and y set
  def with_coordinates(positions)
    positions.select { |pos| pos.x && pos.y }
  end

  # ---- Clustering --------------------------------------------------

  # Groups label chars into rows by quantized Y, splits each row into
  # runs by X gap, and keeps the runs that decode to a codepoint.
  #
  # @param labels [Array<Position>]
  # @return [Array<Hash>] clusters shaped `{ x:, y:, codepoint: }`
  def build_label_clusters(labels)
    rows = labels.group_by { |glyph| quantize(glyph.y, DEFAULT_Y_BUCKET) }

    rows.each_value.flat_map do |row|
      cluster_by_x_gap(row.sort_by(&:x)).filter_map { |run| build_cluster(run) }
    end
  end

  # Splits an X-sorted row wherever two neighbours are at least
  # X_GAP_THRESHOLD apart. Runs of a single glyph are discarded.
  #
  # @param sorted_glyphs [Array<Position>] sorted ascending by x
  # @return [Array<Array<Position>>]
  def cluster_by_x_gap(sorted_glyphs)
    sorted_glyphs
      .slice_when { |left, right| (right.x - left.x).abs >= X_GAP_THRESHOLD }
      .select { |run| run.size > 1 }
  end

  # @param cluster [Array<Position>] one run of label chars, in reading
  #   order
  # @return [Hash, nil] `{ x:, y:, codepoint: }` where x is the mean X of
  #   the run and y is the first glyph's Y; nil when the joined text is
  #   not 4-6 hex digits or exceeds UNICODE_MAX
  def build_cluster(cluster)
    hex = cluster.map(&:text).join
    return nil unless hex.match?(HEX_LABEL)

    cp = hex.to_i(16)
    return nil unless cp <= UNICODE_MAX

    {
      x: cluster.sum(&:x) / cluster.size,
      y: cluster.first.y,
      codepoint: cp,
    }
  end

  # ---- Matching ----------------------------------------------------

  # Walks candidate pairs from nearest to farthest and accepts a pair
  # when neither its glyph id nor its codepoint has been used yet.
  #
  # @param specimens [Array<Position>]
  # @param clusters [Array<Hash>] as returned by build_label_clusters
  # @return [Hash{Integer=>Integer}] codepoint => gid
  def build_mapping(specimens, clusters)
    taken_gids = {}

    pairs_by_distance(specimens, clusters)
      .each_with_object({}) do |(spec_idx, cluster_idx, dist), mapping|
        # Pairs arrive in ascending distance, so nothing after the first
        # out-of-range pair can qualify.
        break mapping if dist > MAX_MATCH_DISTANCE

        gid = specimens[spec_idx].glyph_id
        cp = clusters[cluster_idx][:codepoint]
        # The mapping's own keys record which codepoints are taken.
        next if taken_gids.key?(gid) || mapping.key?(cp)

        taken_gids[gid] = true
        mapping[cp] = gid
      end
  end

  # Every (specimen, cluster) pair, nearest first. `sort_by` is not
  # stable, so equal distances are ordered explicitly by cluster index
  # and then specimen index to keep the greedy result reproducible.
  #
  # @return [Array<Array(Integer, Integer, Float)>] triples of
  #   `[spec_idx, cluster_idx, distance]`
  def pairs_by_distance(specimens, clusters)
    candidates = Array.new(clusters.size) do |ci|
      specimen_distances(specimens, clusters, ci)
    end

    candidates.flatten(1).sort_by do |spec_idx, cluster_idx, dist|
      [dist, cluster_idx, spec_idx]
    end
  end

  # @return [Array<Array(Integer, Integer, Float)>] one triple per
  #   specimen for the cluster at `cluster_idx`
  def specimen_distances(specimens, clusters, cluster_idx)
    cluster = clusters[cluster_idx]
    specimens.each_with_index.map do |spec, spec_idx|
      [spec_idx, cluster_idx, distance(spec, cluster)]
    end
  end

  # Euclidean distance between a specimen and a cluster centre.
  def distance(spec, cluster)
    dx = spec.x - cluster[:x]
    dy = spec.y - cluster[:y]
    Math.sqrt(dx * dx + dy * dy)
  end

  # Rounds `value` to the nearest multiple of `bucket_size`.
  #
  # @return [Numeric, nil] nil when `value` is nil
  def quantize(value, bucket_size)
    return nil if value.nil?

    (value / bucket_size).round * bucket_size
  end
end
|
|
160
|
+
end
|
|
161
|
+
end
|
|
162
|
+
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ostruct"
|
|
4
|
+
|
|
5
|
+
module Ucode
|
|
6
|
+
module Glyphs
|
|
7
|
+
module EmbeddedFonts
|
|
8
|
+
# Value object: one Type0 font discovered by {PdfIndexer}, carrying
|
|
9
|
+
# every ref the {CodepointMapper} needs to resolve codepoint → GID.
|
|
10
|
+
#
|
|
11
|
+
# Public so tests can construct realistic fixtures without going
|
|
12
|
+
# through the PDF subprocess layer.
|
|
13
|
+
# Keyword-initialised record describing one Type0 font. The member notes
# below describe the values PdfIndexer#build_descriptor assigns; other
# producers (e.g. test fixtures) may use different values.
RawFontDescriptor = Struct.new(
  # Font name as quoted on the font's `mutool info` line.
  :base_font,
  # Object number of the Type0 font dictionary.
  :font_obj_id,
  # Object number referenced by FontFile2 or FontFile3.
  :fontfile_obj_id,
  # :ttf when the ref came from FontFile2, :cff when from FontFile3.
  :fontfile_kind,
  # Object number of the /ToUnicode entry, or nil when the font has none.
  :tounicode_ref,
  # :identity — the only CIDToGIDMap kind PdfIndexer lets through.
  :cid_map_kind,
  keyword_init: true,
)
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|