ucode 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/lib/ucode/code_chart/extractor.rb +1 -9
  3. data/lib/ucode/code_chart/writer.rb +1 -1
  4. data/lib/ucode/commands/canonical_build.rb +4 -4
  5. data/lib/ucode/commands/universal_set.rb +5 -3
  6. data/lib/ucode/coordinator/enrichment/bidi.rb +35 -0
  7. data/lib/ucode/coordinator/enrichment/binary.rb +38 -0
  8. data/lib/ucode/coordinator/enrichment/casing.rb +55 -0
  9. data/lib/ucode/coordinator/enrichment/cjk.rb +49 -0
  10. data/lib/ucode/coordinator/enrichment/display.rb +36 -0
  11. data/lib/ucode/coordinator/enrichment/emoji.rb +36 -0
  12. data/lib/ucode/coordinator/enrichment/identity.rb +42 -0
  13. data/lib/ucode/coordinator/enrichment/indic.rb +32 -0
  14. data/lib/ucode/coordinator/enrichment/names.rb +63 -0
  15. data/lib/ucode/coordinator/enrichment/segmentation.rb +34 -0
  16. data/lib/ucode/coordinator/enrichment.rb +51 -0
  17. data/lib/ucode/coordinator/range_lookup.rb +65 -0
  18. data/lib/ucode/coordinator.rb +4 -276
  19. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +32 -376
  20. data/lib/ucode/glyphs/embedded_fonts/codepoint_mapper.rb +130 -0
  21. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +25 -124
  22. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +0 -1
  23. data/lib/ucode/glyphs/embedded_fonts/pdf_indexer.rb +236 -0
  24. data/lib/ucode/glyphs/embedded_fonts/{source.rb → pdf_location.rb} +5 -5
  25. data/lib/ucode/glyphs/embedded_fonts/positional_matcher.rb +162 -0
  26. data/lib/ucode/glyphs/embedded_fonts/raw_font_descriptor.rb +24 -0
  27. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +0 -2
  28. data/lib/ucode/glyphs/embedded_fonts/trace_correlator.rb +54 -168
  29. data/lib/ucode/glyphs/embedded_fonts/writer.rb +0 -4
  30. data/lib/ucode/glyphs/embedded_fonts.rb +5 -1
  31. data/lib/ucode/glyphs/resolver_factory.rb +45 -0
  32. data/lib/ucode/glyphs/sources/pillar1_embedded_tounicode.rb +1 -1
  33. data/lib/ucode/glyphs.rb +1 -0
  34. data/lib/ucode/version.rb +1 -1
  35. metadata +20 -3
@@ -7,106 +7,91 @@ module Ucode
7
7
  # their Unicode codepoints via positional matching against hex
8
8
  # codepoint labels on the same chart page.
9
9
  #
10
- # The Unicode Code Charts use two layouts:
10
+ # Adapter for the `mutool trace` XML format: parses {TraceGlyph}
11
+ # arrays, partitions into specimens and labels, auto-detects the
12
+ # label font by proximity, then delegates matching to
13
+ # {PositionalMatcher}.
11
14
  #
12
- # 1. **List layout** (chart pages): the hex codepoint label (e.g.
13
- # "10D75") is printed to the LEFT of the specimen glyph at the
14
- # same Y baseline.
15
- #
16
- # 2. **Grid layout** (summary pages): the hex codepoint label is
17
- # printed directly ABOVE the specimen glyph (~12 pt higher on
18
- # Y, same X).
19
- #
20
- # Both layouts are handled by matching each specimen to the
21
- # nearest valid label cluster by Euclidean distance, with a
22
- # maximum match radius that excludes far-away header/footer text.
23
- #
24
- # The codepoint labels in every Unicode Code Charts PDF are set
25
- # in a single dedicated label font (typically ArialNarrow).
26
- # Character names, headers, and footers use other fonts. To avoid
27
- # false matches from hex chars in those texts, the correlator
28
- # auto-detects the label font as the non-specimen font that
29
- # contributes the most hex-char glyphs.
30
- #
31
- # Matching is greedy one-to-one: each GID and each codepoint is
32
- # assigned at most once, so a specimen that sits between two
33
- # labels only claims the closer one.
34
- #
35
- # Pure logic — no I/O. The caller passes pre-parsed TraceGlyph
36
- # arrays (typically from {TraceRunner} + {TraceParser}).
15
+ # The label font auto-detection is the only piece of "intelligence"
16
+ # in this adapter — everything else is format translation. The
17
+ # matching algorithm lives in {PositionalMatcher} and is shared
18
+ # with {ContentStreamCorrelator}.
37
19
  class TraceCorrelator
38
- DEFAULT_Y_BUCKET = 1.0
39
- private_constant :DEFAULT_Y_BUCKET
40
-
41
- # Adjacent label chars within one codepoint label are ~4-6 pt
42
- # apart on X. Different columns are ~30+ pt apart. 10 pt
43
- # cleanly separates within-label from between-column gaps.
44
- X_GAP_THRESHOLD = 10.0
45
- private_constant :X_GAP_THRESHOLD
46
-
47
- # Maximum valid Unicode codepoint. Filters out false labels
48
- # that form hex strings from character-name fragments.
49
- UNICODE_MAX = 0x10FFFF
50
- private_constant :UNICODE_MAX
51
-
52
- # Maximum Euclidean distance from a specimen to its matching
53
- # label cluster. List-layout labels are ~21 pt to the left;
54
- # grid-layout labels are ~12 pt above. Header/footer text is
55
- # always > 30 pt away from any specimen.
56
- MAX_MATCH_DISTANCE = 30.0
57
- private_constant :MAX_MATCH_DISTANCE
20
+ # Proximity radius (in PDF points) for counting how often each
21
+ # non-specimen font's hex-char glyphs appear near a specimen.
22
+ # Code Charts dedicate one small font to the codepoint labels;
23
+ # body text and headers are farther away.
24
+ LABEL_PROXIMITY_RADIUS = 50.0
25
+ private_constant :LABEL_PROXIMITY_RADIUS
58
26
 
59
27
  # @param specimen_font_name [String] the BaseFont name of the
60
28
  # CID font whose glyphs need correlation
61
29
  def initialize(specimen_font_name:)
62
30
  @specimen_font_name = specimen_font_name
63
- @y_bucket = DEFAULT_Y_BUCKET
64
31
  end
65
32
 
66
33
  # @param trace_glyphs [Array<TraceGlyph>]
67
34
  # @return [Hash{Integer=>Integer}] codepoint => gid
68
35
  def correlate(trace_glyphs)
69
- specimens = trace_glyphs.select { |g| g.font_name == @specimen_font_name }
36
+ specimens = select_specimens(trace_glyphs)
70
37
  return {} if specimens.empty?
71
38
 
72
- label_font = detect_label_font(trace_glyphs)
73
- return {} unless label_font
74
-
75
- labels = trace_glyphs.select { |g| label_glyph?(g, label_font) }
39
+ labels = select_labels(trace_glyphs)
76
40
  return {} if labels.empty?
77
41
 
78
- clusters = build_label_clusters(labels)
79
- return {} if clusters.empty?
80
-
81
- build_mapping(specimens, clusters)
42
+ PositionalMatcher.match(
43
+ specimens.map { |g| to_position(g) },
44
+ labels.map { |g| to_position(g) },
45
+ )
82
46
  end
83
47
 
84
48
  private
85
49
 
50
+ def select_specimens(trace_glyphs)
51
+ trace_glyphs.select { |g| g.font_name == @specimen_font_name }
52
+ end
53
+
54
+ def select_labels(trace_glyphs)
55
+ label_font = detect_label_font(trace_glyphs)
56
+ return [] unless label_font
57
+
58
+ trace_glyphs.select { |g| hex_char_from?(g, label_font) }
59
+ end
60
+
61
+ def hex_char_from?(glyph, font_name)
62
+ glyph.font_name == font_name && glyph.unicode&.match?(/\A[0-9A-Fa-f]\z/)
63
+ end
64
+
65
+ def to_position(glyph)
66
+ PositionalMatcher::Position.new(
67
+ x: glyph.x,
68
+ y: glyph.y,
69
+ font_ref: glyph.font_name,
70
+ glyph_id: glyph.gid,
71
+ text: glyph.unicode,
72
+ )
73
+ end
74
+
86
75
  # The label font is the non-specimen font whose hex-char glyphs
87
76
  # appear most often in close proximity to specimen glyphs.
88
77
  # Code Charts dedicate one small font to the codepoint labels;
89
78
  # body text, headers, and character names use other fonts that
90
79
  # may also contain hex chars but are not co-located with
91
- # specimens (e.g. the index page has thousands of hex chars in
92
- # MyriadPro-Light but zero specimens).
93
- LABEL_PROXIMITY_RADIUS = 50.0
94
- private_constant :LABEL_PROXIMITY_RADIUS
95
-
80
+ # specimens.
96
81
  def detect_label_font(trace_glyphs)
97
- specimens = trace_glyphs.select { |g| g.font_name == @specimen_font_name }
82
+ specimens = select_specimens(trace_glyphs)
98
83
  return nil if specimens.empty?
99
84
 
100
- non_specimen_hex = non_specimen_hex_glyphs(trace_glyphs)
101
- return nil if non_specimen_hex.empty?
85
+ candidates = select_hex_candidates(trace_glyphs)
86
+ return nil if candidates.empty?
102
87
 
103
- counts = proximity_counts(specimens, non_specimen_hex)
88
+ counts = proximity_counts(specimens, candidates)
104
89
  return nil if counts.empty?
105
90
 
106
91
  counts.max_by { |_, n| n }.first
107
92
  end
108
93
 
109
- def non_specimen_hex_glyphs(trace_glyphs)
94
+ def select_hex_candidates(trace_glyphs)
110
95
  trace_glyphs.select do |g|
111
96
  g.font_name != @specimen_font_name &&
112
97
  g.unicode&.match?(/\A[0-9A-Fa-f]\z/)
@@ -118,112 +103,13 @@ module Ucode
118
103
  radius_sq = LABEL_PROXIMITY_RADIUS * LABEL_PROXIMITY_RADIUS
119
104
  specimens.each do |spec|
120
105
  candidates.each do |g|
121
- counts[g.font_name] += 1 if within_radius?(spec, g, radius_sq)
106
+ dx = spec.x - g.x
107
+ dy = spec.y - g.y
108
+ counts[g.font_name] += 1 if dx * dx + dy * dy < radius_sq
122
109
  end
123
110
  end
124
111
  counts
125
112
  end
126
-
127
- def within_radius?(spec, glyph, radius_sq)
128
- dx = spec.x - glyph.x
129
- dy = spec.y - glyph.y
130
- dx * dx + dy * dy < radius_sq
131
- end
132
-
133
- def label_glyph?(glyph, label_font)
134
- glyph.font_name == label_font &&
135
- glyph.unicode&.match?(/\A[0-9A-Fa-f]\z/)
136
- end
137
-
138
- # Cluster labels by Y (row), then by X gap (column within row).
139
- # Returns a flat array of {x:, y:, codepoint:} clusters.
140
- def build_label_clusters(labels)
141
- by_y = labels.group_by { |g| quantize(g.y, @y_bucket) }
142
- by_y.flat_map { |(_, glyphs)| clusters_from_row(glyphs) }
143
- end
144
-
145
- def clusters_from_row(glyphs)
146
- cluster_by_x_gap(glyphs.sort_by(&:x)).filter_map { |cluster| build_cluster(cluster) }
147
- end
148
-
149
- def build_cluster(cluster)
150
- hex = cluster.map(&:unicode).join
151
- return nil unless hex.match?(/\A[0-9A-Fa-f]{4,6}\z/)
152
-
153
- cp = hex.to_i(16)
154
- return nil unless cp <= UNICODE_MAX
155
-
156
- {
157
- x: cluster.sum(&:x) / cluster.size,
158
- y: cluster.first.y,
159
- codepoint: cp,
160
- }
161
- end
162
-
163
- def cluster_by_x_gap(sorted_glyphs)
164
- clusters = []
165
- current = []
166
-
167
- sorted_glyphs.each do |g|
168
- if current.empty? || (g.x - current.last.x).abs < X_GAP_THRESHOLD
169
- current << g
170
- else
171
- clusters << current if current.size > 1
172
- current = [g]
173
- end
174
- end
175
- clusters << current if current.size > 1
176
- clusters
177
- end
178
-
179
- # Greedy one-to-one matching: each GID and each codepoint is
180
- # assigned at most once. Candidate pairs are sorted by distance
181
- # so the closest specimen-label pair always wins.
182
- def build_mapping(specimens, clusters)
183
- candidates = Array.new(clusters.size) { |ci| specimen_distances(specimens, clusters, ci) }
184
-
185
- assigned_gids = Set.new
186
- assigned_cps = Set.new
187
- mapping = {}
188
-
189
- pairs_by_distance(candidates).each do |spec_idx, cluster_idx, dist|
190
- next if dist > MAX_MATCH_DISTANCE
191
-
192
- spec = specimens[spec_idx]
193
- cluster = clusters[cluster_idx]
194
- next if assigned_gids.include?(spec.gid)
195
- next if assigned_cps.include?(cluster[:codepoint])
196
-
197
- assigned_gids << spec.gid
198
- assigned_cps << cluster[:codepoint]
199
- mapping[cluster[:codepoint]] = spec.gid
200
- end
201
-
202
- mapping
203
- end
204
-
205
- def specimen_distances(specimens, clusters, cluster_idx)
206
- cluster = clusters[cluster_idx]
207
- specimens.each_with_index.map do |spec, spec_idx|
208
- [spec_idx, cluster_idx, distance(spec, cluster)]
209
- end
210
- end
211
-
212
- def pairs_by_distance(candidates)
213
- candidates.flatten(1).sort_by { |_, _, dist| dist }
214
- end
215
-
216
- def distance(spec, cluster)
217
- dx = spec.x - cluster[:x]
218
- dy = spec.y - cluster[:y]
219
- Math.sqrt(dx * dx + dy * dy)
220
- end
221
-
222
- def quantize(value, bucket_size)
223
- return nil if value.nil?
224
-
225
- (value / bucket_size).round * bucket_size
226
- end
227
113
  end
228
114
  end
229
115
  end
@@ -2,10 +2,6 @@
2
2
 
3
3
  require "pathname"
4
4
 
5
- require_relative "renderer"
6
- require_relative "../../repo/atomic_writes"
7
- require_relative "../../repo/paths"
8
-
9
5
  module Ucode
10
6
  module Glyphs
11
7
  module EmbeddedFonts
@@ -36,12 +36,16 @@ module Ucode
36
36
  # `mutool info` (font enumeration) and `mutool show -b -o` (raw
37
37
  # stream extraction).
38
38
  module EmbeddedFonts
39
- autoload :Source, "ucode/glyphs/embedded_fonts/source"
39
+ autoload :PdfLocation, "ucode/glyphs/embedded_fonts/pdf_location"
40
40
  autoload :ToUnicode, "ucode/glyphs/embedded_fonts/tounicode"
41
41
  autoload :FontEntry, "ucode/glyphs/embedded_fonts/font_entry"
42
+ autoload :RawFontDescriptor, "ucode/glyphs/embedded_fonts/raw_font_descriptor"
43
+ autoload :PdfIndexer, "ucode/glyphs/embedded_fonts/pdf_indexer"
44
+ autoload :CodepointMapper, "ucode/glyphs/embedded_fonts/codepoint_mapper"
42
45
  autoload :Catalog, "ucode/glyphs/embedded_fonts/catalog"
43
46
  autoload :ContentStreamCorrelator,
44
47
  "ucode/glyphs/embedded_fonts/content_stream_correlator"
48
+ autoload :PositionalMatcher, "ucode/glyphs/embedded_fonts/positional_matcher"
45
49
  autoload :TraceGlyph, "ucode/glyphs/embedded_fonts/trace_glyph"
46
50
  autoload :TraceParser, "ucode/glyphs/embedded_fonts/trace_parser"
47
51
  autoload :TraceCorrelator, "ucode/glyphs/embedded_fonts/trace_correlator"
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+
5
+ module Ucode
6
+ module Glyphs
7
+ # Single injection point for the 4-tier {Resolver}.
8
+ #
9
+ # Both CanonicalBuildCommand and UniversalSet::BuildCommand need the
10
+ # same shape: open a Database, load the SourceConfig, run a
11
+ # SourceBuilder, wrap the resulting tier-1 sources in a Resolver.
12
+ # Extracting it here gives tests one seam to mock (or bypass) and
13
+ # prevents drift between the two call sites.
14
+ module ResolverFactory
15
+ DEFAULT_INSTALL = false
16
+ private_constant :DEFAULT_INSTALL
17
+
18
+ # @param version [String] UCD version, used to open the Database
19
+ # when one is not supplied.
20
+ # @param source_config_path [String, Pathname, nil] override path
21
+ # to the Tier 1 font config YAML; nil uses the default.
22
+ # @param install [Boolean] pass through to SourceBuilder#tier1_sources
23
+ # — whether to fontist-install missing fonts eagerly.
24
+ # @param database [Ucode::Database, nil] an already-open Database,
25
+ # to skip re-opening when the caller already has one.
26
+ # @return [Ucode::Glyphs::Resolver]
27
+ def self.build(version:, source_config_path: nil,
28
+ install: DEFAULT_INSTALL, database: nil)
29
+ db = database || Ucode::Database.open(version)
30
+ config = SourceConfig.new(path: resolve_config_path(source_config_path))
31
+ builder = SourceBuilder.new(config: config, database: db)
32
+ Resolver.new(sources: builder.tier1_sources(install: install))
33
+ end
34
+
35
+ # @api private
36
+ def self.resolve_config_path(path)
37
+ return SourceConfig::DEFAULT_PATH if path.nil?
38
+ return path if path.is_a?(Pathname)
39
+
40
+ Pathname.new(path)
41
+ end
42
+ private_class_method :resolve_config_path
43
+ end
44
+ end
45
+ end
@@ -31,7 +31,7 @@ module Ucode
31
31
  # @param renderer [EmbeddedFonts::Renderer] the renderer to
32
32
  # delegate to. Callers typically construct it with the
33
33
  # {EmbeddedFonts::Catalog} built from the resolved Code
34
- # Charts {EmbeddedFonts::Source}. To enable pillar-2
34
+ # Charts {EmbeddedFonts::PdfLocation}. To enable pillar-2
35
35
  # fallback, that Catalog must be constructed with
36
36
  # +correlator_configs:+.
37
37
  def initialize(renderer:)
data/lib/ucode/glyphs.rb CHANGED
@@ -16,6 +16,7 @@ module Ucode
16
16
  autoload :RealFonts, "ucode/glyphs/real_fonts"
17
17
  autoload :Source, "ucode/glyphs/source"
18
18
  autoload :Resolver, "ucode/glyphs/resolver"
19
+ autoload :ResolverFactory, "ucode/glyphs/resolver_factory"
19
20
  autoload :SourceConfig, "ucode/glyphs/source_config"
20
21
  autoload :SourceBuilder, "ucode/glyphs/source_builder"
21
22
  autoload :Sources, "ucode/glyphs/sources"
data/lib/ucode/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ucode
4
- VERSION = "0.2.3"
4
+ VERSION = "0.3.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ucode
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-07-02 00:00:00.000000000 Z
11
+ date: 2026-07-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: base64
@@ -328,7 +328,19 @@ files:
328
328
  - lib/ucode/commands/universal_set.rb
329
329
  - lib/ucode/config.rb
330
330
  - lib/ucode/coordinator.rb
331
+ - lib/ucode/coordinator/enrichment.rb
332
+ - lib/ucode/coordinator/enrichment/bidi.rb
333
+ - lib/ucode/coordinator/enrichment/binary.rb
334
+ - lib/ucode/coordinator/enrichment/casing.rb
335
+ - lib/ucode/coordinator/enrichment/cjk.rb
336
+ - lib/ucode/coordinator/enrichment/display.rb
337
+ - lib/ucode/coordinator/enrichment/emoji.rb
338
+ - lib/ucode/coordinator/enrichment/identity.rb
339
+ - lib/ucode/coordinator/enrichment/indic.rb
340
+ - lib/ucode/coordinator/enrichment/names.rb
341
+ - lib/ucode/coordinator/enrichment/segmentation.rb
331
342
  - lib/ucode/coordinator/indices.rb
343
+ - lib/ucode/coordinator/range_lookup.rb
332
344
  - lib/ucode/database.rb
333
345
  - lib/ucode/db_builder.rb
334
346
  - lib/ucode/error.rb
@@ -343,10 +355,14 @@ files:
343
355
  - lib/ucode/glyphs.rb
344
356
  - lib/ucode/glyphs/embedded_fonts.rb
345
357
  - lib/ucode/glyphs/embedded_fonts/catalog.rb
358
+ - lib/ucode/glyphs/embedded_fonts/codepoint_mapper.rb
346
359
  - lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb
347
360
  - lib/ucode/glyphs/embedded_fonts/font_entry.rb
361
+ - lib/ucode/glyphs/embedded_fonts/pdf_indexer.rb
362
+ - lib/ucode/glyphs/embedded_fonts/pdf_location.rb
363
+ - lib/ucode/glyphs/embedded_fonts/positional_matcher.rb
364
+ - lib/ucode/glyphs/embedded_fonts/raw_font_descriptor.rb
348
365
  - lib/ucode/glyphs/embedded_fonts/renderer.rb
349
- - lib/ucode/glyphs/embedded_fonts/source.rb
350
366
  - lib/ucode/glyphs/embedded_fonts/svg.rb
351
367
  - lib/ucode/glyphs/embedded_fonts/tounicode.rb
352
368
  - lib/ucode/glyphs/embedded_fonts/trace_correlator.rb
@@ -372,6 +388,7 @@ files:
372
388
  - lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb
373
389
  - lib/ucode/glyphs/real_fonts/writer.rb
374
390
  - lib/ucode/glyphs/resolver.rb
391
+ - lib/ucode/glyphs/resolver_factory.rb
375
392
  - lib/ucode/glyphs/source.rb
376
393
  - lib/ucode/glyphs/source_builder.rb
377
394
  - lib/ucode/glyphs/source_config.rb