ucode 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/lib/ucode/code_chart/extractor.rb +1 -9
  3. data/lib/ucode/code_chart/writer.rb +1 -1
  4. data/lib/ucode/commands/canonical_build.rb +4 -4
  5. data/lib/ucode/commands/universal_set.rb +5 -3
  6. data/lib/ucode/coordinator/enrichment/bidi.rb +35 -0
  7. data/lib/ucode/coordinator/enrichment/binary.rb +38 -0
  8. data/lib/ucode/coordinator/enrichment/casing.rb +55 -0
  9. data/lib/ucode/coordinator/enrichment/cjk.rb +49 -0
  10. data/lib/ucode/coordinator/enrichment/display.rb +36 -0
  11. data/lib/ucode/coordinator/enrichment/emoji.rb +36 -0
  12. data/lib/ucode/coordinator/enrichment/identity.rb +42 -0
  13. data/lib/ucode/coordinator/enrichment/indic.rb +32 -0
  14. data/lib/ucode/coordinator/enrichment/names.rb +63 -0
  15. data/lib/ucode/coordinator/enrichment/segmentation.rb +34 -0
  16. data/lib/ucode/coordinator/enrichment.rb +51 -0
  17. data/lib/ucode/coordinator/range_lookup.rb +65 -0
  18. data/lib/ucode/coordinator.rb +4 -276
  19. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +32 -376
  20. data/lib/ucode/glyphs/embedded_fonts/codepoint_mapper.rb +130 -0
  21. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +25 -124
  22. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +0 -1
  23. data/lib/ucode/glyphs/embedded_fonts/pdf_indexer.rb +236 -0
  24. data/lib/ucode/glyphs/embedded_fonts/{source.rb → pdf_location.rb} +5 -5
  25. data/lib/ucode/glyphs/embedded_fonts/positional_matcher.rb +162 -0
  26. data/lib/ucode/glyphs/embedded_fonts/raw_font_descriptor.rb +24 -0
  27. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +0 -2
  28. data/lib/ucode/glyphs/embedded_fonts/trace_correlator.rb +54 -168
  29. data/lib/ucode/glyphs/embedded_fonts/writer.rb +0 -4
  30. data/lib/ucode/glyphs/embedded_fonts.rb +5 -1
  31. data/lib/ucode/glyphs/resolver_factory.rb +45 -0
  32. data/lib/ucode/glyphs/sources/pillar1_embedded_tounicode.rb +1 -1
  33. data/lib/ucode/glyphs.rb +1 -0
  34. data/lib/ucode/version.rb +1 -1
  35. metadata +20 -3
@@ -6,28 +6,15 @@ module Ucode
6
6
  # Pillar 2 fallback: build a `{codepoint => gid}` map for a Type0
7
7
  # font whose PDF object graph has no `/ToUnicode` CMap stream.
8
8
  #
9
- # The Code Charts draw every chart cell as a `<use>` element that
10
- # references the font's GID via an `href` of the form
11
- # `#font_<font_obj_id>_<gid>`. The chart also prints the row +
12
- # column codepoint labels using one or more "label" fonts (small
13
- # Latin glyphs) that show the hex codepoint as text. By clustering
14
- # the labels positionally (Y-bucket for the row, X-bucket for the
15
- # column) we recover the codepoint each cluster represents, then
16
- # match each cluster positionally to the specimen glyph at the
17
- # same Y/X position.
9
+ # Adapter for the `mutool draw -F svg` output format: parses
10
+ # `<use>` elements from the rendered PDF page SVG, partitions into
11
+ # labels and specimens by PDF font object ID (supplied via {Config}),
12
+ # then delegates matching to {PositionalMatcher}.
18
13
  #
19
- # The algorithm generalizes the Tai Yo correlator that was tested
20
- # against `data/pdfs/U1E6C0.pdf` (50/52 specimen codepoints
21
- # matched, with the two missing being layout edge cases). The
22
- # bucket sizes are configurable because some blocks use a tighter
23
- # grid than others.
24
- #
25
- # Inputs are deliberately pure: a string of SVG markup plus a
26
- # {Config}. The catalog is responsible for sourcing the SVG (by
27
- # rendering the relevant PDF page(s) via `mutool draw -F svg`) and
28
- # for knowing which font_obj_ids are labels vs specimen on that
29
- # page. That keeps this class trivially testable with synthetic
30
- # SVG fixtures.
14
+ # The SVG parsing (regex-based `<use>` extraction, HTML entity
15
+ # decoding) is the only piece of format-specific work here. The
16
+ # matching algorithm lives in {PositionalMatcher} and is shared
17
+ # with {TraceCorrelator}.
31
18
  class ContentStreamCorrelator
32
19
  # Per-font / per-block configuration.
33
20
  #
@@ -37,25 +24,13 @@ module Ucode
37
24
  # whose glyphs are the specimens we want to attribute.
38
25
  # @!attribute page_numbers [Array<Integer>] 1-based PDF page
39
26
  # numbers whose content streams reference the specimen font.
40
- # @!attribute y_bucket [Float] vertical clustering granularity
41
- # in PDF points. Default 1.5 — matches mutool's text matrix
42
- # granularity for the row labels.
43
- # @!attribute x_bucket [Float] horizontal clustering granularity
44
- # in PDF points. Default 50.0 — separates label clusters
45
- # within a row (labels are ~16pt wide, clusters ~60-160pt
46
- # apart).
47
27
  Config = Struct.new(
48
28
  :label_font_ids,
49
29
  :specimen_font_id,
50
30
  :page_numbers,
51
- :y_bucket,
52
- :x_bucket,
53
31
  keyword_init: true,
54
32
  )
55
33
 
56
- DEFAULT_Y_BUCKET = 1.5
57
- DEFAULT_X_BUCKET = 50.0
58
-
59
34
  # Internal value object for a parsed `<use>` element. Public so
60
35
  # the spec can construct realistic fixtures without re-implementing
61
36
  # the parser shape.
@@ -64,8 +39,6 @@ module Ucode
64
39
  # @param config [Config]
65
40
  def initialize(config)
66
41
  @config = config
67
- @y_bucket = config.y_bucket || DEFAULT_Y_BUCKET
68
- @x_bucket = config.x_bucket || DEFAULT_X_BUCKET
69
42
  end
70
43
 
71
44
  # @param svg [String] rendered PDF page(s) as SVG markup. May
@@ -77,29 +50,33 @@ module Ucode
77
50
  uses = parse_uses(svg)
78
51
  return {} if uses.empty?
79
52
 
80
- partition_and_map(uses)
81
- end
82
-
83
- private
84
-
85
- def partition_and_map(uses)
86
53
  labels, specimens = partition_uses(uses)
87
54
  return {} if labels.empty? || specimens.empty?
88
55
 
89
- cp_per_cluster = decode_label_clusters(labels)
90
- return {} if cp_per_cluster.empty?
91
-
92
- build_mapping(cp_per_cluster, group_rows(specimens))
56
+ PositionalMatcher.match(
57
+ specimens.map { |u| to_position(u) },
58
+ labels.map { |u| to_position(u) },
59
+ )
93
60
  end
94
61
 
62
+ private
63
+
95
64
  def partition_uses(uses)
96
- labels = uses.select do |u|
97
- @config.label_font_ids.include?(u.font_id)
98
- end
65
+ labels = uses.select { |u| @config.label_font_ids.include?(u.font_id) }
99
66
  specimens = uses.select { |u| u.font_id == @config.specimen_font_id }
100
67
  [labels, specimens]
101
68
  end
102
69
 
70
+ def to_position(use)
71
+ PositionalMatcher::Position.new(
72
+ x: use.x,
73
+ y: use.y,
74
+ font_ref: use.font_id,
75
+ glyph_id: use.gid,
76
+ text: decode_entities(use.text),
77
+ )
78
+ end
79
+
103
80
  # Match `<use .../>` elements and pull out the font_obj_id and
104
81
  # gid from the href, plus the text matrix's e and f terms (which
105
82
  # give the X/Y origin). The data-text attribute carries the
@@ -143,82 +120,6 @@ module Ucode
143
120
  attrs[/xlink:href="([^"]+)"/, 1] || attrs[/href="([^"]+)"/, 1]
144
121
  end
145
122
 
146
- # Cluster label uses by quantized (Y, X) position. Within each
147
- # cluster, members are sorted by X so that joined text reads
148
- # left-to-right (hex codepoint string).
149
- def decode_label_clusters(labels)
150
- cluster_members = bucket_labels_by_position(labels)
151
- decode_each_cluster(cluster_members)
152
- end
153
-
154
- def bucket_labels_by_position(labels)
155
- clusters = Hash.new { |h, k| h[k] = [] }
156
- labels.each do |label|
157
- key = [bucket(label.y, @y_bucket), bucket(label.x, @x_bucket)]
158
- clusters[key] << label
159
- end
160
- clusters
161
- end
162
-
163
- def decode_each_cluster(clusters)
164
- clusters.each_with_object({}) do |(key, members), decoded|
165
- text = members.sort_by(&:x).map { |m| decode_entities(m.text) }.join
166
- next unless text.match?(/\A[0-9A-Fa-f]{4,6}\z/)
167
-
168
- decoded[key] = text.to_i(16)
169
- end
170
- end
171
-
172
- # Group any set of uses (labels or specimens) by Y-bucket; sort
173
- # each row by X so positional matching is straightforward.
174
- def group_rows(uses)
175
- rows = Hash.new { |h, k| h[k] = [] }
176
- uses.each do |u|
177
- rows[bucket(u.y, @y_bucket)] << u
178
- end
179
- rows.each_value { |v| v.sort_by!(&:x) }
180
- rows
181
- end
182
-
183
- # Within each Y-row, the rightmost label cluster is the
184
- # specimen codepoint; the rightmost specimen glyph is the
185
- # specimen GID. The preceding label clusters (if any) are
186
- # cross-reference codepoints, matched positionally to the
187
- # preceding specimen glyphs in the same row.
188
- def build_mapping(cp_per_cluster, specimen_rows)
189
- cp_rows = group_cps_by_row(cp_per_cluster)
190
- cp_rows.keys.sort.each_with_object({}) do |yb, mapping|
191
- assign_row(mapping, cp_rows[yb], specimen_rows[yb] || [])
192
- end
193
- end
194
-
195
- def assign_row(mapping, cps, glyphs)
196
- return if cps.empty? || glyphs.empty?
197
-
198
- mapping[cps.last] = glyphs.last.gid
199
- assign_xrefs(mapping, cps[0...-1], glyphs[0...-1])
200
- end
201
-
202
- def assign_xrefs(mapping, xref_cps, xref_glyphs)
203
- xref_cps.each_with_index do |cp, i|
204
- g = xref_glyphs[i]
205
- mapping[cp] = g.gid if g
206
- end
207
- end
208
-
209
- def group_cps_by_row(cp_per_cluster)
210
- rows = Hash.new { |h, k| h[k] = [] }
211
- cp_per_cluster.each do |(yb, xb), cp|
212
- rows[yb] << [cp, xb]
213
- end
214
- rows.each_value { |v| v.sort_by! { |_, xb| xb } }
215
- rows.transform_values { |v| v.map(&:first) }
216
- end
217
-
218
- def bucket(value, size)
219
- (value / size).round * size
220
- end
221
-
222
123
  def decode_entities(text)
223
124
  text.gsub(/&#x([0-9a-fA-F]+);/) { [$1.to_i(16)].pack("U") }
224
125
  end
@@ -5,7 +5,6 @@ require "pathname"
5
5
  require "tempfile"
6
6
 
7
7
  require "fontisan"
8
- require_relative "../../error"
9
8
 
10
9
  module Ucode
11
10
  module Glyphs
@@ -0,0 +1,236 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "open3"
4
+ require "pathname"
5
+
6
+ module Ucode
7
+ module Glyphs
8
+ module EmbeddedFonts
9
+ # Walks the Code Charts PDF once via `mutool info` + `mutool show`
10
+ # and builds an Array of {RawFontDescriptor} — one per Type0 font
11
+ # that has the required descendant CIDFont, FontDescriptor, and
12
+ # FontFile2/3 + Identity CIDToGIDMap.
13
+ #
14
+ # Pure subprocess + parsing concern. Does NOT resolve codepoint →
15
+ # GID (that's {CodepointMapper}'s job). The descriptor carries
16
+ # every ref the mapper needs to do its work.
17
+ class PdfIndexer
18
+ # @param source [PdfLocation]
19
+ def initialize(source:)
20
+ @source = source
21
+ end
22
+
23
+ # @return [Array<RawFontDescriptor>]
24
+ def raw_descriptors
25
+ type0_refs = discover_type0_fonts
26
+ return [] if type0_refs.empty?
27
+
28
+ type0_dicts = fetch_objects(type0_refs.keys)
29
+ descendant_refs, = collect_refs(type0_refs, type0_dicts)
30
+ descendant_dicts = fetch_objects(descendant_refs)
31
+ fontdesc_dicts = fetch_fontdescs(descendant_dicts)
32
+
33
+ build_descriptors(type0_refs, type0_dicts, descendant_dicts, fontdesc_dicts)
34
+ end
35
+
36
+ def collect_refs(type0_refs, type0_dicts)
37
+ descendant_refs = []
38
+ tounicode_refs = []
39
+ type0_refs.each_key do |font_obj_id|
40
+ d = type0_dicts[font_obj_id] || {}
41
+ collect_ref(d["DescendantFonts"], descendant_refs)
42
+ collect_ref(d["ToUnicode"], tounicode_refs)
43
+ end
44
+ [descendant_refs, tounicode_refs]
45
+ end
46
+
47
+ def collect_ref(dict_value, acc)
48
+ ref = first_ref(dict_value)
49
+ acc << ref if ref
50
+ end
51
+
52
+ def fetch_fontdescs(descendant_dicts)
53
+ fontdesc_refs = []
54
+ descendant_dicts.each_value do |d|
55
+ collect_ref(d["FontDescriptor"], fontdesc_refs)
56
+ end
57
+ fetch_objects(fontdesc_refs)
58
+ end
59
+
60
+ # @return [Integer] total pages in the PDF
61
+ def page_count
62
+ @page_count ||= begin
63
+ m = mutool_info_text.match(/^Pages:\s+(\d+)/)
64
+ m ? m[1].to_i : 1
65
+ end
66
+ end
67
+
68
+ # @param base_font [String] e.g. "GPJAHB+WolofGaraySansSerif"
69
+ # @return [Boolean] true if this font appears on any page
70
+ def font_appears?(base_font)
71
+ font_entries_cache.key?(base_font)
72
+ end
73
+
74
+ private
75
+
76
+ def build_descriptors(type0_refs, type0_dicts, descendant_dicts, fontdesc_dicts)
77
+ type0_refs.filter_map do |font_obj_id, base_font|
78
+ build_descriptor(
79
+ font_obj_id, base_font, type0_dicts[font_obj_id] || {},
80
+ descendant_dicts, fontdesc_dicts,
81
+ )
82
+ end
83
+ end
84
+
85
+ def build_descriptor(font_obj_id, base_font, type0_dict,
86
+ descendant_dicts, fontdesc_dicts)
87
+ desc_ref = first_ref(type0_dict["DescendantFonts"])
88
+ return nil unless desc_ref
89
+
90
+ tu_ref = first_ref(type0_dict["ToUnicode"])
91
+ desc_dict = descendant_dicts[desc_ref] || {}
92
+ fd_dict = fontdesc_for(desc_dict, fontdesc_dicts)
93
+ return nil unless fd_dict
94
+
95
+ fontfile_obj_id, fontfile_kind = resolve_fontfile(fd_dict)
96
+ return nil unless fontfile_obj_id
97
+
98
+ cid_map_kind = resolve_cid_to_gid(desc_dict)
99
+ return nil unless cid_map_kind
100
+
101
+ RawFontDescriptor.new(
102
+ base_font: base_font,
103
+ font_obj_id: font_obj_id,
104
+ fontfile_obj_id: fontfile_obj_id,
105
+ fontfile_kind: fontfile_kind,
106
+ tounicode_ref: tu_ref,
107
+ cid_map_kind: cid_map_kind,
108
+ )
109
+ end
110
+
111
+ def fontdesc_for(desc_dict, fontdesc_dicts)
112
+ fd_ref = first_ref(desc_dict["FontDescriptor"])
113
+ return nil unless fd_ref
114
+
115
+ fontdesc_dicts[fd_ref]
116
+ end
117
+
118
+ def resolve_fontfile(fd_dict)
119
+ if fd_dict.key?("FontFile2")
120
+ [first_ref(fd_dict["FontFile2"]), :ttf]
121
+ elsif fd_dict.key?("FontFile3")
122
+ [first_ref(fd_dict["FontFile3"]), :cff]
123
+ end
124
+ end
125
+
126
+ def resolve_cid_to_gid(desc_dict)
127
+ raw = desc_dict["CIDToGIDMap"]
128
+ return nil if raw.nil?
129
+
130
+ raw.to_s == "Identity" ? :identity : nil
131
+ end
132
+
133
+ # ---- mutool subprocess + dict parsing ----------------------------
134
+
135
+ def discover_type0_fonts
136
+ text = mutool_info_text
137
+ result = {}
138
+ seen = Set.new
139
+ text.each_line do |line|
140
+ next unless line.include?("Type0")
141
+
142
+ m = line.match(/Type0\s+'([^']+)'\s+\S+\s+\((\d+)\s+0\s+R\)/)
143
+ next unless m
144
+
145
+ font_obj_id = m[2].to_i
146
+ next if seen.include?(font_obj_id)
147
+
148
+ seen << font_obj_id
149
+ result[font_obj_id] = m[1]
150
+ end
151
+ result
152
+ end
153
+
154
+ def fetch_objects(obj_ids)
155
+ return {} if obj_ids.empty?
156
+
157
+ args = ["mutool", "show", "-g",
158
+ @source.pdf_to_s].concat(obj_ids.map(&:to_s))
159
+ out, err, status = Open3.capture3(*args)
160
+ unless status.success?
161
+ raise Ucode::EmbeddedFontsMissingError,
162
+ "mutool show failed: #{err.strip}"
163
+ end
164
+
165
+ parse_grep_output(out)
166
+ end
167
+
168
+ def parse_grep_output(text)
169
+ result = {}
170
+ text.each_line do |line|
171
+ m = line.match(/^(\d+)\s+0\s+obj\s+(.*)$/)
172
+ next unless m
173
+
174
+ result[m[1].to_i] = parse_dict(m[2])
175
+ end
176
+ result
177
+ end
178
+
179
+ # We don't try to fully parse the PDF dict grammar. Instead we
180
+ # regex each field we need directly out of the dict body.
181
+ def parse_dict(body)
182
+ body = body.to_s
183
+ {
184
+ "BaseFont" => field_match(body, %r{/BaseFont/([^\s/<>]+)}),
185
+ "DescendantFonts" => field_match(body,
186
+ %r{/DescendantFonts\s*\[\s*(\d+)\s+0\s+R\s*\]}),
187
+ "ToUnicode" => field_match(body, %r{/ToUnicode\s+(\d+)\s+0\s+R}),
188
+ "FontDescriptor" => field_match(body,
189
+ %r{/FontDescriptor\s+(\d+)\s+0\s+R}),
190
+ "FontFile2" => field_match(body, %r{/FontFile2\s+(\d+)\s+0\s+R}),
191
+ "FontFile3" => field_match(body, %r{/FontFile3\s+(\d+)\s+0\s+R}),
192
+ "CIDToGIDMap" => field_match(body,
193
+ %r{/CIDToGIDMap(?:/([^\s/<>]+)|\s+(\d+)\s+0\s+R)}),
194
+ }.compact
195
+ end
196
+
197
+ def field_match(body, regex)
198
+ m = body.match(regex)
199
+ return nil unless m
200
+
201
+ m.captures.compact.first
202
+ end
203
+
204
+ def first_ref(value)
205
+ return nil if value.nil? || value.empty?
206
+
207
+ Integer(value)
208
+ end
209
+
210
+ def mutool_info_text
211
+ @mutool_info_text ||= run_mutool_info
212
+ end
213
+
214
+ def run_mutool_info
215
+ out, err, status = Open3.capture3("mutool", "info", @source.pdf_to_s)
216
+ status.success? ? out + err : ""
217
+ end
218
+
219
+ def font_entries_cache
220
+ @font_entries_cache ||= begin
221
+ result = {}
222
+ mutool_info_text.each_line do |line|
223
+ next unless line.include?("Type0")
224
+
225
+ font_match = line.match(/Type0\s+'([^']+)'/)
226
+ next unless font_match
227
+
228
+ result[font_match[1]] = true
229
+ end
230
+ result
231
+ end
232
+ end
233
+ end
234
+ end
235
+ end
236
+ end
@@ -2,8 +2,6 @@
2
2
 
3
3
  require "pathname"
4
4
 
5
- require_relative "../../error"
6
-
7
5
  module Ucode
8
6
  module Glyphs
9
7
  module EmbeddedFonts
@@ -29,7 +27,7 @@ module Ucode
29
27
  # the BaseFont (e.g. `CIAIIP+Uni2000Generalpunctuation.ttf`).
30
28
  # Re-runs skip extraction when the cached file is newer than the
31
29
  # PDF.
32
- class Source
30
+ class PdfLocation
33
31
  attr_reader :pdf_path, :cache_dir
34
32
 
35
33
  # @param pdf [String, Pathname, nil] path to a Code Charts PDF
@@ -41,8 +39,10 @@ module Ucode
41
39
  # @raise [Ucode::EmbeddedFontsMissingError] if the PDF is missing
42
40
  def initialize(pdf: nil, cache_dir: nil, env: ENV, gem_root: nil)
43
41
  @pdf_path = resolve_pdf(pdf, env, gem_root)
44
- raise Ucode::EmbeddedFontsMissingError,
45
- "Code Charts PDF not found at #{@pdf_path}" unless @pdf_path&.exist?
42
+ unless @pdf_path&.exist?
43
+ raise Ucode::EmbeddedFontsMissingError,
44
+ "Code Charts PDF not found at #{@pdf_path}"
45
+ end
46
46
 
47
47
  @cache_dir = resolve_cache(cache_dir, env, gem_root)
48
48
  @cache_dir.mkpath unless @cache_dir.exist?
@@ -0,0 +1,162 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ module Glyphs
5
+ module EmbeddedFonts
6
+ # Shared positional matching for Code Charts specimen attribution.
7
+ #
8
+ # Both {ContentStreamCorrelator} and {TraceCorrelator} need the
9
+ # same algorithm: given a set of specimen glyphs and a set of hex
10
+ # codepoint labels with positions, match each specimen to its
11
+ # nearest valid label cluster by Euclidean distance.
12
+ #
13
+ # This module owns that algorithm. The input is format-agnostic —
14
+ # callers produce {Position} structs from their source format
15
+ # (SVG `<use>` elements or `mutool trace` XML) and delegate here.
16
+ #
17
+ # Handles both Code Charts layouts:
18
+ #
19
+ # 1. **List layout** — label to the LEFT of specimen at the same Y.
20
+ # 2. **Grid layout** — label ABOVE specimen (~12pt higher, same X).
21
+ #
22
+ # Greedy one-to-one matching: each GID and each codepoint is
23
+ # assigned at most once, so a specimen between two labels only
24
+ # claims the closer one.
25
+ module PositionalMatcher
26
+ # Value object: one positioned glyph with text content.
27
+ # font_ref is the font identifier (Integer obj-id for SVG,
28
+ # String font-name for trace); used only for partitioning by
29
+ # the caller, not by the matcher.
30
+ Position = Struct.new(
31
+ :x, :y, :font_ref, :glyph_id, :text, keyword_init: true,
32
+ )
33
+
34
+ DEFAULT_Y_BUCKET = 1.0
35
+ private_constant :DEFAULT_Y_BUCKET
36
+
37
+ # Adjacent label chars within one codepoint label are ~4-6 pt
38
+ # apart on X. Different columns are ~30+ pt apart. 10 pt
39
+ # cleanly separates within-label from between-column gaps.
40
+ X_GAP_THRESHOLD = 10.0
41
+ private_constant :X_GAP_THRESHOLD
42
+
43
+ # Maximum valid Unicode codepoint.
44
+ UNICODE_MAX = 0x10FFFF
45
+ private_constant :UNICODE_MAX
46
+
47
+ # Maximum Euclidean distance from a specimen to its matching
48
+ # label cluster. List-layout labels are ~21 pt to the left;
49
+ # grid-layout labels are ~12 pt above. Header/footer text is
50
+ # always > 30 pt away from any specimen.
51
+ MAX_MATCH_DISTANCE = 30.0
52
+ private_constant :MAX_MATCH_DISTANCE
53
+
54
+ module_function
55
+
56
+ # @param specimens [Array<Position>] positioned specimen glyphs
57
+ # @param labels [Array<Position>] positioned label chars
58
+ # @return [Hash{Integer=>Integer}] codepoint => gid
59
+ def match(specimens, labels)
60
+ return {} if specimens.empty? || labels.empty?
61
+
62
+ clusters = build_label_clusters(labels)
63
+ return {} if clusters.empty?
64
+
65
+ build_mapping(specimens, clusters)
66
+ end
67
+
68
+ # ---- Clustering --------------------------------------------------
69
+
70
+ def build_label_clusters(labels)
71
+ by_y = labels.group_by { |g| quantize(g.y, DEFAULT_Y_BUCKET) }
72
+
73
+ by_y.flat_map do |(_, glyphs)|
74
+ cluster_by_x_gap(glyphs.sort_by(&:x)).filter_map do |cluster|
75
+ build_cluster(cluster)
76
+ end
77
+ end
78
+ end
79
+
80
+ def cluster_by_x_gap(sorted_glyphs)
81
+ clusters = []
82
+ current = []
83
+
84
+ sorted_glyphs.each do |g|
85
+ if current.empty? || (g.x - current.last.x).abs < X_GAP_THRESHOLD
86
+ current << g
87
+ else
88
+ clusters << current if current.size > 1
89
+ current = [g]
90
+ end
91
+ end
92
+ clusters << current if current.size > 1
93
+ clusters
94
+ end
95
+
96
+ def build_cluster(cluster)
97
+ hex = cluster.map(&:text).join
98
+ return nil unless hex.match?(/\A[0-9A-Fa-f]{4,6}\z/)
99
+
100
+ cp = hex.to_i(16)
101
+ return nil unless cp <= UNICODE_MAX
102
+
103
+ {
104
+ x: cluster.sum(&:x) / cluster.size,
105
+ y: cluster.first.y,
106
+ codepoint: cp,
107
+ }
108
+ end
109
+
110
+ # ---- Matching ----------------------------------------------------
111
+
112
+ def build_mapping(specimens, clusters)
113
+ assigned_gids = Set.new
114
+ assigned_cps = Set.new
115
+ mapping = {}
116
+
117
+ pairs_by_distance(specimens, clusters).each do |spec_idx, cluster_idx, dist|
118
+ next if dist > MAX_MATCH_DISTANCE
119
+
120
+ spec = specimens[spec_idx]
121
+ cluster = clusters[cluster_idx]
122
+ next if assigned_gids.include?(spec.glyph_id)
123
+ next if assigned_cps.include?(cluster[:codepoint])
124
+
125
+ assigned_gids << spec.glyph_id
126
+ assigned_cps << cluster[:codepoint]
127
+ mapping[cluster[:codepoint]] = spec.glyph_id
128
+ end
129
+
130
+ mapping
131
+ end
132
+
133
+ def pairs_by_distance(specimens, clusters)
134
+ candidates = Array.new(clusters.size) do |ci|
135
+ specimen_distances(specimens, clusters, ci)
136
+ end
137
+
138
+ candidates.flatten(1).sort_by { |_, _, dist| dist }
139
+ end
140
+
141
+ def specimen_distances(specimens, clusters, cluster_idx)
142
+ cluster = clusters[cluster_idx]
143
+ specimens.each_with_index.map do |spec, spec_idx|
144
+ [spec_idx, cluster_idx, distance(spec, cluster)]
145
+ end
146
+ end
147
+
148
+ def distance(spec, cluster)
149
+ dx = spec.x - cluster[:x]
150
+ dy = spec.y - cluster[:y]
151
+ Math.sqrt(dx * dx + dy * dy)
152
+ end
153
+
154
+ def quantize(value, bucket_size)
155
+ return nil if value.nil?
156
+
157
+ (value / bucket_size).round * bucket_size
158
+ end
159
+ end
160
+ end
161
+ end
162
+ end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ostruct"
4
+
5
+ module Ucode
6
+ module Glyphs
7
+ module EmbeddedFonts
8
+ # Value object: one Type0 font discovered by {PdfIndexer}, carrying
9
+ # every ref the {CodepointMapper} needs to resolve codepoint → GID.
10
+ #
11
+ # Public so tests can construct realistic fixtures without going
12
+ # through the PDF subprocess layer.
13
+ RawFontDescriptor = Struct.new(
14
+ :base_font,
15
+ :font_obj_id,
16
+ :fontfile_obj_id,
17
+ :fontfile_kind,
18
+ :tounicode_ref,
19
+ :cid_map_kind,
20
+ keyword_init: true,
21
+ )
22
+ end
23
+ end
24
+ end
@@ -1,7 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "svg"
4
-
5
3
  module Ucode
6
4
  module Glyphs
7
5
  module EmbeddedFonts