ucode 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,42 +1,26 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "pathname"
4
- require "open3"
5
4
 
6
5
  require "ucode/cache"
7
6
  require "ucode/fetch/code_charts"
8
- require "ucode/glyphs/monolith_page_map"
9
7
 
10
8
  module Ucode
11
9
  module Glyphs
12
10
  # Resolves a Unicode block to its source PDF on disk.
13
11
  #
14
- # Primary source: the per-block PDF cached at
12
+ # Source: the per-block PDF cached at
15
13
  # `<cache>/<version>/pdfs/U<XXXX>.pdf` (downloaded from
16
14
  # `unicode.org/charts/PDF/` by `Ucode::Fetch::CodeCharts`).
17
- #
18
- # Fallback: slice the page range from the monolith `CodeCharts.pdf`.
19
- # The page range is resolved by `MonolithPageMap` from the PDF's
20
- # bookmark outline, cached under `data/codecharts_page_map.json`.
21
15
  class PdfFetcher
22
16
  # @param version [String] UCD version, used as the cache namespace.
23
- # @param monolith_path [String, Pathname, nil] path to the full
24
- # `CodeCharts.pdf`. Pass nil to disable monolith fallback.
25
- # @param blocks [Array<Ucode::Models::Block>] required for monolith
26
- # fallback — used to match bookmark titles to block first-cps.
27
- # @param page_map_cache [String, Pathname, nil] where to read/write
28
- # the monolith page-map JSON cache.
29
- def initialize(version, monolith_path: nil, blocks: [], page_map_cache: nil)
17
+ def initialize(version)
30
18
  @version = version
31
- @monolith_path = monolith_path && Pathname.new(monolith_path)
32
- @blocks = blocks
33
- @page_map_cache = page_map_cache
34
19
  end
35
20
 
36
21
  # Resolve the per-block PDF for `block_first_cp`, fetching from the
37
22
  # network if missing. Returns the local PDF path, or nil if the
38
- # block's PDF is unavailable (network failure + no monolith, or
39
- # monolith lacks the requested block).
23
+ # block's PDF is unavailable (network failure).
40
24
  #
41
25
  # @param block_first_cp [Integer] first codepoint of the block;
42
26
  # also the PDF's URL slug per unicode.org's naming convention.
@@ -47,9 +31,7 @@ module Ucode
47
31
  return path if path.exist? && !force
48
32
 
49
33
  download(block_first_cp)
50
- return path if path.exist?
51
-
52
- slice_from_monolith(block_first_cp)
34
+ path if path.exist?
53
35
  end
54
36
 
55
37
  private
@@ -65,38 +47,13 @@ module Ucode
65
47
  def download(block_first_cp)
66
48
  Fetch::CodeCharts.call(@version, block_first_cps: [block_first_cp])
67
49
  rescue StandardError => e
68
- # Network failures fall through to monolith fallback. We do not
69
- # swallow programming errors (NoMethodError etc.) — only fetch
70
- # failures (network, checksum, HTTP).
50
+ # Network failures return nil so callers can fall back to other
51
+ # tiers. We do not swallow programming errors (NoMethodError
52
+ # etc.) — only fetch failures (network, checksum, HTTP).
71
53
  return if e.is_a?(Ucode::FetchError)
72
54
 
73
55
  raise
74
56
  end
75
-
76
- def slice_from_monolith(block_first_cp)
77
- return unless @monolith_path&.exist?
78
-
79
- entry = page_map[block_first_cp]
80
- return unless entry && entry.start_page && entry.end_page
81
-
82
- slice_pages(entry.start_page, entry.end_page, per_block_path(block_first_cp))
83
- end
84
-
85
- def page_map
86
- @page_map ||= MonolithPageMap.load(
87
- monolith_path: @monolith_path,
88
- blocks: @blocks,
89
- cache_path: @page_map_cache,
90
- )
91
- end
92
-
93
- def slice_pages(start_page, end_page, out_path)
94
- out_path.dirname.mkpath
95
- cmd = ["pdftk", @monolith_path.to_s, "cat",
96
- "#{start_page}-#{end_page}", "output", out_path.to_s]
97
- _out, status = Open3.capture2e(*cmd)
98
- status.success? ? out_path : nil
99
- end
100
57
  end
101
58
  end
102
59
  end
data/lib/ucode/glyphs.rb CHANGED
@@ -3,24 +3,14 @@
3
3
  module Ucode
4
4
  # Glyphs — converts Code Charts PDF pages into per-codepoint SVGs.
5
5
  #
6
- # Pipeline: fetch per-block PDF → render to SVG → detect grid → extract
7
- # cell → normalize viewBox → write glyph.svg.
6
+ # The current pipeline is the 4-tier sourcing strategy:
7
+ # Tier 1 (real fonts) → Pillar 1 (embedded CIDFont + ToUnicode) →
8
+ # Pillar 2 (positional correlation) → Pillar 3 (Last Resort UFO).
9
+ # See {EmbeddedFonts} for Pillar 1 + 2 and {LastResort} for Pillar 3.
8
10
  #
9
11
  # Vector extraction only. NEVER run OCR.
10
12
  module Glyphs
11
13
  autoload :PdfFetcher, "ucode/glyphs/pdf_fetcher"
12
- autoload :PageRenderer, "ucode/glyphs/page_renderer"
13
- autoload :MutoolRenderer, "ucode/glyphs/mutool_renderer"
14
- autoload :Pdf2svgRenderer, "ucode/glyphs/pdf2svg_renderer"
15
- autoload :DvisvgmRenderer, "ucode/glyphs/dvisvgm_renderer"
16
- autoload :PdftocairoRenderer, "ucode/glyphs/pdftocairo_renderer"
17
- autoload :Grid, "ucode/glyphs/grid"
18
- autoload :PathBbox, "ucode/glyphs/path_bbox"
19
- autoload :GridDetector, "ucode/glyphs/grid_detector"
20
- autoload :CellExtractor, "ucode/glyphs/cell_extractor"
21
- autoload :MonolithPageMap, "ucode/glyphs/monolith_page_map"
22
- autoload :Writer, "ucode/glyphs/writer"
23
- autoload :Pipeline, "ucode/glyphs/pipeline"
24
14
  autoload :LastResort, "ucode/glyphs/last_resort"
25
15
  autoload :EmbeddedFonts, "ucode/glyphs/embedded_fonts"
26
16
  autoload :RealFonts, "ucode/glyphs/real_fonts"
data/lib/ucode/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ucode
4
- VERSION = "0.2.2"
4
+ VERSION = "0.2.3"
5
5
  end
data/lib/ucode.rb CHANGED
@@ -32,8 +32,6 @@ module Ucode
32
32
  autoload :UnknownVersionError, "ucode/error"
33
33
  autoload :UnknownBlockError, "ucode/error"
34
34
  autoload :GlyphError, "ucode/error"
35
- autoload :PdfRenderError, "ucode/error"
36
- autoload :GridDetectionError, "ucode/error"
37
35
  autoload :LastResortMissingError, "ucode/error"
38
36
  autoload :EmbeddedFontsMissingError, "ucode/error"
39
37
  autoload :CodeChartNotFoundError, "ucode/error"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ucode
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-07-01 00:00:00.000000000 Z
11
+ date: 2026-07-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: base64
@@ -321,7 +321,6 @@ files:
321
321
  - lib/ucode/commands/canonical_build.rb
322
322
  - lib/ucode/commands/fetch.rb
323
323
  - lib/ucode/commands/font_coverage.rb
324
- - lib/ucode/commands/glyphs.rb
325
324
  - lib/ucode/commands/lookup.rb
326
325
  - lib/ucode/commands/parse.rb
327
326
  - lib/ucode/commands/release.rb
@@ -342,8 +341,6 @@ files:
342
341
  - lib/ucode/fetch/ucd_zip.rb
343
342
  - lib/ucode/fetch/unihan_zip.rb
344
343
  - lib/ucode/glyphs.rb
345
- - lib/ucode/glyphs/cell_extractor.rb
346
- - lib/ucode/glyphs/dvisvgm_renderer.rb
347
344
  - lib/ucode/glyphs/embedded_fonts.rb
348
345
  - lib/ucode/glyphs/embedded_fonts/catalog.rb
349
346
  - lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb
@@ -352,9 +349,11 @@ files:
352
349
  - lib/ucode/glyphs/embedded_fonts/source.rb
353
350
  - lib/ucode/glyphs/embedded_fonts/svg.rb
354
351
  - lib/ucode/glyphs/embedded_fonts/tounicode.rb
352
+ - lib/ucode/glyphs/embedded_fonts/trace_correlator.rb
353
+ - lib/ucode/glyphs/embedded_fonts/trace_glyph.rb
354
+ - lib/ucode/glyphs/embedded_fonts/trace_parser.rb
355
+ - lib/ucode/glyphs/embedded_fonts/trace_runner.rb
355
356
  - lib/ucode/glyphs/embedded_fonts/writer.rb
356
- - lib/ucode/glyphs/grid.rb
357
- - lib/ucode/glyphs/grid_detector.rb
358
357
  - lib/ucode/glyphs/last_resort.rb
359
358
  - lib/ucode/glyphs/last_resort/cmap_index.rb
360
359
  - lib/ucode/glyphs/last_resort/contents.rb
@@ -363,14 +362,7 @@ files:
363
362
  - lib/ucode/glyphs/last_resort/source.rb
364
363
  - lib/ucode/glyphs/last_resort/svg.rb
365
364
  - lib/ucode/glyphs/last_resort/writer.rb
366
- - lib/ucode/glyphs/monolith_page_map.rb
367
- - lib/ucode/glyphs/mutool_renderer.rb
368
- - lib/ucode/glyphs/page_renderer.rb
369
- - lib/ucode/glyphs/path_bbox.rb
370
- - lib/ucode/glyphs/pdf2svg_renderer.rb
371
365
  - lib/ucode/glyphs/pdf_fetcher.rb
372
- - lib/ucode/glyphs/pdftocairo_renderer.rb
373
- - lib/ucode/glyphs/pipeline.rb
374
366
  - lib/ucode/glyphs/real_fonts.rb
375
367
  - lib/ucode/glyphs/real_fonts/block_coverage.rb
376
368
  - lib/ucode/glyphs/real_fonts/cmap_cache.rb
@@ -397,7 +389,6 @@ files:
397
389
  - lib/ucode/glyphs/universal_set/manifest_writer.rb
398
390
  - lib/ucode/glyphs/universal_set/pre_build_check.rb
399
391
  - lib/ucode/glyphs/universal_set/validator.rb
400
- - lib/ucode/glyphs/writer.rb
401
392
  - lib/ucode/index.rb
402
393
  - lib/ucode/index_builder.rb
403
394
  - lib/ucode/models.rb
@@ -1,94 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "pathname"
4
-
5
- require "ucode/glyphs"
6
-
7
- module Ucode
8
- module Commands
9
- # `ucode glyphs` — extract per-codepoint SVGs from Code Charts PDFs.
10
- # Thin Thor-facing wrapper around {Ucode::Glyphs::Pipeline}:
11
- # opt-in gate + experimental warning live here; the pipeline
12
- # assembly (block loading, fetcher, per-block specs) lives in
13
- # {Ucode::Glyphs::Pipeline}.
14
- #
15
- # **Status (v0.1): EXPERIMENTAL.** The cell-extraction pipeline
16
- # currently includes cell-border decorations alongside the actual
17
- # character outline because the Code Charts PDFs composite the two
18
- # into a single glyph definition. The output is therefore not yet
19
- # suitable for end-user display. The command is retained so the
20
- # pipeline can be iterated on without churning the CLI surface, but
21
- # callers MUST opt in via `include_glyphs: true` (CLI: `--include-glyphs`)
22
- # and will receive a printed warning. Tracked for v0.2.
23
- #
24
- # Takes a resolved version string; CLI callers resolve via
25
- # {VersionResolver.resolve} once and thread it through. See
26
- # Candidate 4 of the 2026-06-29 architecture review.
27
- class GlyphsCommand
28
- ExperimentalWarning = "ucode glyphs is experimental in v0.1: " \
29
- "extracted SVGs include cell-border decorations " \
30
- "alongside the character outline."
31
- private_constant :ExperimentalWarning
32
-
33
- class << self
34
- # @return [String] the experimental-status banner. Exposed so the
35
- # CLI and BuildCommand surface the same message verbatim.
36
- def experimental_warning
37
- ExperimentalWarning
38
- end
39
- end
40
-
41
- # @param version [String] resolved UCD version
42
- # @param output_root [String, Pathname]
43
- # @param block_filter [Array<String>, nil] block ids to limit to;
44
- # nil = every block
45
- # @param force [Boolean] re-fetch PDFs even when cached
46
- # @param monolith_path [String, Pathname, nil] path to CodeCharts.pdf
47
- # for fallback slicing; defaults to ./CodeCharts.pdf
48
- # @param include_glyphs [Boolean] opt-in for the experimental v0.1
49
- # pipeline. When false (default), the command returns a `skipped`
50
- # payload without touching disk.
51
- # @param warn [IO, nil] when provided, the experimental warning is
52
- # written here exactly once before work begins.
53
- # @return [Hash] aggregated Writer tally + version, or a `skipped`
54
- # payload when opt-in is false.
55
- def call(version, output_root:,
56
- block_filter: nil, force: false,
57
- monolith_path: Glyphs::Pipeline::DEFAULT_MONOLITH_PATH,
58
- include_glyphs: false, warn: nil)
59
- return skipped(version) unless include_glyphs
60
-
61
- warn&.puts(ExperimentalWarning)
62
-
63
- pipeline = Glyphs::Pipeline.new(
64
- version: version,
65
- block_filter: block_filter,
66
- monolith_path: monolith_path,
67
- )
68
- specs = pipeline.build_specs(force: force)
69
-
70
- writer = Glyphs::Writer.new(
71
- output_root: Pathname.new(output_root),
72
- parallel_workers: workers,
73
- )
74
- tally = writer.write_all(specs)
75
- tally.merge(version: version, block_count: specs.size)
76
- end
77
-
78
- private
79
-
80
- def workers
81
- Ucode.configuration.parallel_workers
82
- end
83
-
84
- def skipped(version)
85
- {
86
- version: version,
87
- skipped: true,
88
- reason: :experimental_v0_1,
89
- warning: ExperimentalWarning,
90
- }
91
- end
92
- end
93
- end
94
- end
@@ -1,130 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "nokogiri"
4
-
5
- require "ucode/glyphs/path_bbox"
6
-
7
- module Ucode
8
- module Glyphs
9
- # Extracts a single character cell from a Code Charts SVG page and
10
- # returns a normalized standalone SVG containing only that cell's
11
- # vector paths.
12
- #
13
- # The cell is identified by codepoint. The extractor asks the Grid
14
- # for the cell's anchor position, finds the `<use>` element placed
15
- # at that position, resolves its glyph definition from `<defs>`,
16
- # and emits a fresh `<svg>` whose viewBox is `0 0 1000 1000` and
17
- # whose body is the glyph's `<path>` data translated and scaled to
18
- # fit that viewBox with a small margin.
19
- #
20
- # Vector-only. Never rasterizes, never OCRs. If the cell is empty
21
- # (no character glyph placed there, e.g. unassigned codepoint or
22
- # control character), the extractor returns nil.
23
- class CellExtractor
24
- ViewBoxSize = 1000.0
25
- MarginRatio = 0.1
26
- private_constant :ViewBoxSize, :MarginRatio
27
-
28
- # @param doc [Nokogiri::XML::Document] the rendered Code Charts page
29
- def initialize(doc)
30
- @doc = doc
31
- @glyph_cache = {}
32
- end
33
-
34
- # @param grid [Ucode::Glyphs::Grid]
35
- # @param codepoint [Integer]
36
- # @return [Nokogiri::XML::Document, nil] a standalone `<svg>` doc
37
- # with viewBox `0 0 1000 1000`, or nil if the cell is empty.
38
- def extract(grid, codepoint)
39
- anchor = grid.cell_position(codepoint)
40
- return nil unless anchor
41
-
42
- use_node = find_use_at(anchor, grid)
43
- return nil unless use_node
44
-
45
- path_data = collect_paths(use_node["xlink:href"] || use_node["href"])
46
- return nil if path_data.empty?
47
-
48
- bbox = PathBbox.estimate(path_data.join(" "))
49
- return nil if bbox.empty?
50
-
51
- build_svg(path_data, bbox, use_node["x"].to_f, use_node["y"].to_f)
52
- end
53
-
54
- private
55
-
56
- def find_use_at(anchor, grid)
57
- tolerance_x = grid.column_pitch / 2
58
- tolerance_y = grid.row_pitch / 2
59
-
60
- candidates = @doc.css("use").select do |node|
61
- href = node["xlink:href"] || node["href"] || ""
62
- href.start_with?("#glyph-") &&
63
- (node["x"].to_f - anchor[0]).abs <= tolerance_x &&
64
- (node["y"].to_f - anchor[1]).abs <= tolerance_y
65
- end
66
-
67
- candidates.min_by do |node|
68
- dx = node["x"].to_f - anchor[0]
69
- dy = node["y"].to_f - anchor[1]
70
- (dx * dx) + (dy * dy)
71
- end
72
- end
73
-
74
- def collect_paths(href)
75
- return [] unless href
76
-
77
- glyph_id = href.sub(/\A#/, "")
78
- node = glyph_definition(glyph_id)
79
- return [] unless node
80
-
81
- node.css("path").map { |p| p["d"] }.compact
82
- end
83
-
84
- def glyph_definition(glyph_id)
85
- return @glyph_cache[glyph_id] if @glyph_cache.key?(glyph_id)
86
-
87
- @glyph_cache[glyph_id] = @doc.at_css("defs ##{glyph_id}")
88
- end
89
-
90
- def build_svg(path_data, glyph_bbox, place_x, place_y)
91
- placed = PathBbox::Result.new(
92
- min_x: place_x + glyph_bbox.min_x,
93
- min_y: place_y + glyph_bbox.min_y,
94
- max_x: place_x + glyph_bbox.max_x,
95
- max_y: place_y + glyph_bbox.max_y,
96
- )
97
-
98
- width = placed.width
99
- height = placed.height
100
- return nil if width <= 0 || height <= 0
101
-
102
- content_size = ViewBoxSize * (1.0 - (2.0 * MarginRatio))
103
- scale = [content_size / width, content_size / height].min
104
- offset_x = (ViewBoxSize - (width * scale)) / 2.0
105
- offset_y = (ViewBoxSize - (height * scale)) / 2.0
106
- translate_x = offset_x - (placed.min_x * scale)
107
- translate_y = offset_y - (placed.min_y * scale)
108
-
109
- builder = Nokogiri::XML::Document.new
110
- root = builder.create_element(
111
- "svg",
112
- xmlns: "http://www.w3.org/2000/svg",
113
- viewBox: "0 0 #{ViewBoxSize.to_i} #{ViewBoxSize.to_i}",
114
- width: ViewBoxSize.to_i,
115
- height: ViewBoxSize.to_i,
116
- )
117
- group = builder.create_element(
118
- "g",
119
- transform: "scale(#{format('%.6f', scale)}) translate(#{format('%.6f', translate_x)}, #{format('%.6f', translate_y)})",
120
- )
121
- path_data.each do |d|
122
- group.add_child(builder.create_element("path", d: d, fill: "black"))
123
- end
124
- root.add_child(group)
125
- builder.add_child(root)
126
- builder
127
- end
128
- end
129
- end
130
- end
@@ -1,29 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "ucode/glyphs/page_renderer"
4
-
5
- module Ucode
6
- module Glyphs
7
- # `dvisvgm` — originally a DVI-to-SVG converter, also handles PDF.
8
- # The `--no-fonts` flag forces outline-only output (no font subsetting
9
- # artifacts), which is what we want for vector glyph extraction.
10
- #
11
- # Command: `dvisvgm --pdf --no-fonts --page=<n> <in.pdf> -o <out.svg>`
12
- class DvisvgmRenderer < PageRenderer
13
- class << self
14
- def renderer_name
15
- :dvisvgm
16
- end
17
-
18
- def binary_name
19
- :dvisvgm
20
- end
21
-
22
- def build_command(pdf_path, page_num, out_path)
23
- ["dvisvgm", "--pdf", "--no-fonts", "--page=#{page_num}",
24
- pdf_path.to_s, "-o", out_path.to_s]
25
- end
26
- end
27
- end
28
- end
29
- end
@@ -1,30 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Ucode
4
- module Glyphs
5
- Grid = Struct.new(
6
- :origin_x, :origin_y,
7
- :column_pitch, :row_pitch,
8
- :columns, :rows,
9
- :block_first_cp,
10
- keyword_init: true,
11
- ) do
12
- def cell_position(codepoint)
13
- offset = codepoint - block_first_cp
14
- return nil if offset.negative?
15
-
16
- row, col = offset.divmod(columns)
17
- return nil if row >= rows
18
-
19
- [origin_x + (col * column_pitch), origin_y + (row * row_pitch)]
20
- end
21
-
22
- def codepoint_at(row, col)
23
- return nil if row.negative? || row >= rows
24
- return nil if col.negative? || col >= columns
25
-
26
- block_first_cp + (row * columns) + col
27
- end
28
- end
29
- end
30
- end
@@ -1,165 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "nokogiri"
4
-
5
- require "ucode/glyphs/grid"
6
- require "ucode/glyphs/path_bbox"
7
-
8
- module Ucode
9
- module Glyphs
10
- # Detects the chart grid in a Code Charts PDF page rendered to SVG.
11
- #
12
- # The PDF page produced by pdftocairo / pdf2svg / dvisvgm contains
13
- # every visible element (title, block name, row labels, codepoint
14
- # digits, and the actual character glyphs) as positioned `<use>`
15
- # references into a `<defs>` block of named glyph outlines. The
16
- # character cells we want to extract correspond to glyphs whose
17
- # bounding box is larger than every label or digit font on the
18
- # page — the chart's character samples are drawn at a larger size
19
- # than any of the surrounding text.
20
- #
21
- # Algorithm:
22
- # 1. Walk `<defs>`, estimate each glyph's bbox via `PathBbox`.
23
- # 2. Classify a glyph as "character-sized" when its width and
24
- # height both exceed `CharSizeThreshold` (default 8 pt).
25
- # This excludes title, row-label, and digit glyphs while
26
- # keeping every actual character sample — including pages
27
- # where the chart mixes multiple character fonts (e.g. the
28
- # Basic Latin page uses one font for punctuation/digits and
29
- # another for letters).
30
- # 3. Collect every `<use>` that references a character-sized
31
- # glyph; these are the cell origins.
32
- # 4. Cluster the Y values of those uses into rows, and within
33
- # each row cluster the X values into columns.
34
- # 5. Drop rows whose column count diverges from the modal value
35
- # (these are footer/header artifacts, not chart rows).
36
- # 6. Return a `Grid` value object anchored at the top-left cell
37
- # with uniform column/row pitches derived from the median
38
- # spacing between adjacent clusters.
39
- #
40
- # This is pure (no I/O). The detector takes a parsed Nokogiri
41
- # document and returns a `Grid`.
42
- class GridDetector
43
- CharSizeThreshold = 8.0
44
- ClusterEpsilon = 15.0
45
- private_constant :CharSizeThreshold, :ClusterEpsilon
46
-
47
- class << self
48
- # @param doc [Nokogiri::XML::Document]
49
- # @param block_first_cp [Integer] first codepoint of the block;
50
- # stored on the Grid so callers can map codepoint ↔ cell.
51
- # @return [Ucode::Glyphs::Grid, nil] nil if no character grid
52
- # could be detected.
53
- def detect(doc, block_first_cp:)
54
- uses = collect_uses(doc)
55
- return nil if uses.empty?
56
-
57
- char_glyph_ids = char_sized_glyph_ids(doc)
58
- return nil if char_glyph_ids.empty?
59
-
60
- cell_uses = uses.select { |u| char_glyph_ids.include?(u.glyph_id) }
61
- return nil if cell_uses.empty?
62
-
63
- build_grid(cell_uses, block_first_cp)
64
- end
65
-
66
- private
67
-
68
- UsePosition = Struct.new(:x, :y, :glyph_id, :set_id, keyword_init: true)
69
-
70
- def collect_uses(doc)
71
- doc.css("use").map do |node|
72
- href = node["xlink:href"] || node["href"] || ""
73
- glyph_id = href.sub(/\A#/, "")
74
- match = glyph_id.match(/\Aglyph-(\d+)-(\d+)\z/)
75
- next nil unless match
76
-
77
- UsePosition.new(
78
- x: node["x"].to_f,
79
- y: node["y"].to_f,
80
- glyph_id: glyph_id,
81
- set_id: match[1].to_i,
82
- )
83
- end.compact
84
- end
85
-
86
- def char_sized_glyph_ids(doc)
87
- doc.css("defs g[id^='glyph-']").each_with_object({}) do |g, acc|
88
- id = g["id"]
89
- next unless id =~ /\Aglyph-\d+-\d+\z/
90
-
91
- paths = g.css("path")
92
- next if paths.empty?
93
-
94
- bbox = paths.map { |p| PathBbox.estimate(p["d"]) }.reject(&:empty?).reduce do |a, b|
95
- PathBbox::Result.new(
96
- min_x: [a.min_x, b.min_x].min,
97
- min_y: [a.min_y, b.min_y].min,
98
- max_x: [a.max_x, b.max_x].max,
99
- max_y: [a.max_y, b.max_y].max,
100
- )
101
- end
102
- next unless bbox
103
-
104
- acc[id] = true if char_sized?(bbox)
105
- end
106
- end
107
-
108
- def char_sized?(bbox)
109
- bbox.width >= CharSizeThreshold && bbox.height >= CharSizeThreshold
110
- end
111
-
112
- def median(values)
113
- return 0.0 if values.empty?
114
-
115
- sorted = values.sort
116
- mid = sorted.size / 2
117
- sorted.size.even? ? (sorted[mid - 1] + sorted[mid]) / 2.0 : sorted[mid]
118
- end
119
-
120
- def build_grid(cell_uses, block_first_cp)
121
- row_clusters = cluster_by_value(cell_uses, :y)
122
- return nil if row_clusters.empty?
123
-
124
- column_clusters = cluster_by_value(cell_uses, :x)
125
- return nil if column_clusters.empty?
126
-
127
- column_starts = column_clusters.map { |c| c.map(&:x).min }.sort
128
- row_starts = row_clusters.map { |c| c.map(&:y).min }.sort
129
-
130
- Grid.new(
131
- origin_x: column_starts.first,
132
- origin_y: row_starts.first,
133
- column_pitch: median_pitch(column_starts),
134
- row_pitch: median_pitch(row_starts),
135
- columns: column_starts.size,
136
- rows: row_starts.size,
137
- block_first_cp: block_first_cp,
138
- )
139
- end
140
-
141
- def cluster_by_value(items, attr)
142
- sorted = items.sort_by { |i| i.public_send(attr) }
143
- clusters = []
144
- sorted.each do |item|
145
- value = item.public_send(attr)
146
- if clusters.empty? || (value - clusters.last[:max]).abs > ClusterEpsilon
147
- clusters << { max: value, items: [item] }
148
- else
149
- clusters.last[:max] = value
150
- clusters.last[:items] << item
151
- end
152
- end
153
- clusters.map { |c| c[:items] }
154
- end
155
-
156
- def median_pitch(sorted_values)
157
- return 0.0 if sorted_values.size < 2
158
-
159
- pitches = sorted_values.each_cons(2).map { |a, b| b - a }
160
- median(pitches)
161
- end
162
- end
163
- end
164
- end
165
- end