ucode 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +1 -1
- data/config/unicode17_universal_glyph_set.yml +1 -1
- data/lib/ucode/cli.rb +1 -35
- data/lib/ucode/commands/build.rb +3 -26
- data/lib/ucode/commands/canonical_build.rb +1 -4
- data/lib/ucode/commands.rb +0 -1
- data/lib/ucode/error.rb +0 -8
- data/lib/ucode/glyphs/embedded_fonts/catalog.rb +81 -4
- data/lib/ucode/glyphs/embedded_fonts/trace_correlator.rb +230 -0
- data/lib/ucode/glyphs/embedded_fonts/trace_glyph.rb +27 -0
- data/lib/ucode/glyphs/embedded_fonts/trace_parser.rb +50 -0
- data/lib/ucode/glyphs/embedded_fonts/trace_runner.rb +53 -0
- data/lib/ucode/glyphs/embedded_fonts.rb +4 -0
- data/lib/ucode/glyphs/pdf_fetcher.rb +7 -50
- data/lib/ucode/glyphs.rb +4 -14
- data/lib/ucode/version.rb +1 -1
- data/lib/ucode.rb +0 -2
- metadata +6 -15
- data/lib/ucode/commands/glyphs.rb +0 -94
- data/lib/ucode/glyphs/cell_extractor.rb +0 -130
- data/lib/ucode/glyphs/dvisvgm_renderer.rb +0 -29
- data/lib/ucode/glyphs/grid.rb +0 -30
- data/lib/ucode/glyphs/grid_detector.rb +0 -165
- data/lib/ucode/glyphs/monolith_page_map.rb +0 -181
- data/lib/ucode/glyphs/mutool_renderer.rb +0 -28
- data/lib/ucode/glyphs/page_renderer.rb +0 -234
- data/lib/ucode/glyphs/path_bbox.rb +0 -62
- data/lib/ucode/glyphs/pdf2svg_renderer.rb +0 -26
- data/lib/ucode/glyphs/pdftocairo_renderer.rb +0 -32
- data/lib/ucode/glyphs/pipeline.rb +0 -105
- data/lib/ucode/glyphs/writer.rb +0 -250
|
@@ -1,42 +1,26 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "pathname"
|
|
4
|
-
require "open3"
|
|
5
4
|
|
|
6
5
|
require "ucode/cache"
|
|
7
6
|
require "ucode/fetch/code_charts"
|
|
8
|
-
require "ucode/glyphs/monolith_page_map"
|
|
9
7
|
|
|
10
8
|
module Ucode
|
|
11
9
|
module Glyphs
|
|
12
10
|
# Resolves a Unicode block to its source PDF on disk.
|
|
13
11
|
#
|
|
14
|
-
#
|
|
12
|
+
# Source: the per-block PDF cached at
|
|
15
13
|
# `<cache>/<version>/pdfs/U<XXXX>.pdf` (downloaded from
|
|
16
14
|
# `unicode.org/charts/PDF/` by `Ucode::Fetch::CodeCharts`).
|
|
17
|
-
#
|
|
18
|
-
# Fallback: slice the page range from the monolith `CodeCharts.pdf`.
|
|
19
|
-
# The page range is resolved by `MonolithPageMap` from the PDF's
|
|
20
|
-
# bookmark outline, cached under `data/codecharts_page_map.json`.
|
|
21
15
|
class PdfFetcher
|
|
22
16
|
# @param version [String] UCD version, used as the cache namespace.
|
|
23
|
-
|
|
24
|
-
# `CodeCharts.pdf`. Pass nil to disable monolith fallback.
|
|
25
|
-
# @param blocks [Array<Ucode::Models::Block>] required for monolith
|
|
26
|
-
# fallback — used to match bookmark titles to block first-cps.
|
|
27
|
-
# @param page_map_cache [String, Pathname, nil] where to read/write
|
|
28
|
-
# the monolith page-map JSON cache.
|
|
29
|
-
def initialize(version, monolith_path: nil, blocks: [], page_map_cache: nil)
|
|
17
|
+
def initialize(version)
|
|
30
18
|
@version = version
|
|
31
|
-
@monolith_path = monolith_path && Pathname.new(monolith_path)
|
|
32
|
-
@blocks = blocks
|
|
33
|
-
@page_map_cache = page_map_cache
|
|
34
19
|
end
|
|
35
20
|
|
|
36
21
|
# Resolve the per-block PDF for `block_first_cp`, fetching from the
|
|
37
22
|
# network if missing. Returns the local PDF path, or nil if the
|
|
38
|
-
# block's PDF is unavailable (network failure
|
|
39
|
-
# monolith lacks the requested block).
|
|
23
|
+
# block's PDF is unavailable (network failure).
|
|
40
24
|
#
|
|
41
25
|
# @param block_first_cp [Integer] first codepoint of the block;
|
|
42
26
|
# also the PDF's URL slug per unicode.org's naming convention.
|
|
@@ -47,9 +31,7 @@ module Ucode
|
|
|
47
31
|
return path if path.exist? && !force
|
|
48
32
|
|
|
49
33
|
download(block_first_cp)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
slice_from_monolith(block_first_cp)
|
|
34
|
+
path if path.exist?
|
|
53
35
|
end
|
|
54
36
|
|
|
55
37
|
private
|
|
@@ -65,38 +47,13 @@ module Ucode
|
|
|
65
47
|
def download(block_first_cp)
|
|
66
48
|
Fetch::CodeCharts.call(@version, block_first_cps: [block_first_cp])
|
|
67
49
|
rescue StandardError => e
|
|
68
|
-
# Network failures
|
|
69
|
-
# swallow programming errors (NoMethodError
|
|
70
|
-
# failures (network, checksum, HTTP).
|
|
50
|
+
# Network failures return nil so callers can fall back to other
|
|
51
|
+
# tiers. We do not swallow programming errors (NoMethodError
|
|
52
|
+
# etc.) — only fetch failures (network, checksum, HTTP).
|
|
71
53
|
return if e.is_a?(Ucode::FetchError)
|
|
72
54
|
|
|
73
55
|
raise
|
|
74
56
|
end
|
|
75
|
-
|
|
76
|
-
def slice_from_monolith(block_first_cp)
|
|
77
|
-
return unless @monolith_path&.exist?
|
|
78
|
-
|
|
79
|
-
entry = page_map[block_first_cp]
|
|
80
|
-
return unless entry && entry.start_page && entry.end_page
|
|
81
|
-
|
|
82
|
-
slice_pages(entry.start_page, entry.end_page, per_block_path(block_first_cp))
|
|
83
|
-
end
|
|
84
|
-
|
|
85
|
-
def page_map
|
|
86
|
-
@page_map ||= MonolithPageMap.load(
|
|
87
|
-
monolith_path: @monolith_path,
|
|
88
|
-
blocks: @blocks,
|
|
89
|
-
cache_path: @page_map_cache,
|
|
90
|
-
)
|
|
91
|
-
end
|
|
92
|
-
|
|
93
|
-
def slice_pages(start_page, end_page, out_path)
|
|
94
|
-
out_path.dirname.mkpath
|
|
95
|
-
cmd = ["pdftk", @monolith_path.to_s, "cat",
|
|
96
|
-
"#{start_page}-#{end_page}", "output", out_path.to_s]
|
|
97
|
-
_out, status = Open3.capture2e(*cmd)
|
|
98
|
-
status.success? ? out_path : nil
|
|
99
|
-
end
|
|
100
57
|
end
|
|
101
58
|
end
|
|
102
59
|
end
|
data/lib/ucode/glyphs.rb
CHANGED
|
@@ -3,24 +3,14 @@
|
|
|
3
3
|
module Ucode
|
|
4
4
|
# Glyphs — converts Code Charts PDF pages into per-codepoint SVGs.
|
|
5
5
|
#
|
|
6
|
-
#
|
|
7
|
-
#
|
|
6
|
+
# The current pipeline is the 4-tier sourcing strategy:
|
|
7
|
+
# Tier 1 (real fonts) → Pillar 1 (embedded CIDFont + ToUnicode) →
|
|
8
|
+
# Pillar 2 (positional correlation) → Pillar 3 (Last Resort UFO).
|
|
9
|
+
# See {EmbeddedFonts} for Pillar 1 + 2 and {LastResort} for Pillar 3.
|
|
8
10
|
#
|
|
9
11
|
# Vector extraction only. NEVER run OCR.
|
|
10
12
|
module Glyphs
|
|
11
13
|
autoload :PdfFetcher, "ucode/glyphs/pdf_fetcher"
|
|
12
|
-
autoload :PageRenderer, "ucode/glyphs/page_renderer"
|
|
13
|
-
autoload :MutoolRenderer, "ucode/glyphs/mutool_renderer"
|
|
14
|
-
autoload :Pdf2svgRenderer, "ucode/glyphs/pdf2svg_renderer"
|
|
15
|
-
autoload :DvisvgmRenderer, "ucode/glyphs/dvisvgm_renderer"
|
|
16
|
-
autoload :PdftocairoRenderer, "ucode/glyphs/pdftocairo_renderer"
|
|
17
|
-
autoload :Grid, "ucode/glyphs/grid"
|
|
18
|
-
autoload :PathBbox, "ucode/glyphs/path_bbox"
|
|
19
|
-
autoload :GridDetector, "ucode/glyphs/grid_detector"
|
|
20
|
-
autoload :CellExtractor, "ucode/glyphs/cell_extractor"
|
|
21
|
-
autoload :MonolithPageMap, "ucode/glyphs/monolith_page_map"
|
|
22
|
-
autoload :Writer, "ucode/glyphs/writer"
|
|
23
|
-
autoload :Pipeline, "ucode/glyphs/pipeline"
|
|
24
14
|
autoload :LastResort, "ucode/glyphs/last_resort"
|
|
25
15
|
autoload :EmbeddedFonts, "ucode/glyphs/embedded_fonts"
|
|
26
16
|
autoload :RealFonts, "ucode/glyphs/real_fonts"
|
data/lib/ucode/version.rb
CHANGED
data/lib/ucode.rb
CHANGED
|
@@ -32,8 +32,6 @@ module Ucode
|
|
|
32
32
|
autoload :UnknownVersionError, "ucode/error"
|
|
33
33
|
autoload :UnknownBlockError, "ucode/error"
|
|
34
34
|
autoload :GlyphError, "ucode/error"
|
|
35
|
-
autoload :PdfRenderError, "ucode/error"
|
|
36
|
-
autoload :GridDetectionError, "ucode/error"
|
|
37
35
|
autoload :LastResortMissingError, "ucode/error"
|
|
38
36
|
autoload :EmbeddedFontsMissingError, "ucode/error"
|
|
39
37
|
autoload :CodeChartNotFoundError, "ucode/error"
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ucode
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.2
|
|
4
|
+
version: 0.2.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-07-
|
|
11
|
+
date: 2026-07-02 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: base64
|
|
@@ -321,7 +321,6 @@ files:
|
|
|
321
321
|
- lib/ucode/commands/canonical_build.rb
|
|
322
322
|
- lib/ucode/commands/fetch.rb
|
|
323
323
|
- lib/ucode/commands/font_coverage.rb
|
|
324
|
-
- lib/ucode/commands/glyphs.rb
|
|
325
324
|
- lib/ucode/commands/lookup.rb
|
|
326
325
|
- lib/ucode/commands/parse.rb
|
|
327
326
|
- lib/ucode/commands/release.rb
|
|
@@ -342,8 +341,6 @@ files:
|
|
|
342
341
|
- lib/ucode/fetch/ucd_zip.rb
|
|
343
342
|
- lib/ucode/fetch/unihan_zip.rb
|
|
344
343
|
- lib/ucode/glyphs.rb
|
|
345
|
-
- lib/ucode/glyphs/cell_extractor.rb
|
|
346
|
-
- lib/ucode/glyphs/dvisvgm_renderer.rb
|
|
347
344
|
- lib/ucode/glyphs/embedded_fonts.rb
|
|
348
345
|
- lib/ucode/glyphs/embedded_fonts/catalog.rb
|
|
349
346
|
- lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb
|
|
@@ -352,9 +349,11 @@ files:
|
|
|
352
349
|
- lib/ucode/glyphs/embedded_fonts/source.rb
|
|
353
350
|
- lib/ucode/glyphs/embedded_fonts/svg.rb
|
|
354
351
|
- lib/ucode/glyphs/embedded_fonts/tounicode.rb
|
|
352
|
+
- lib/ucode/glyphs/embedded_fonts/trace_correlator.rb
|
|
353
|
+
- lib/ucode/glyphs/embedded_fonts/trace_glyph.rb
|
|
354
|
+
- lib/ucode/glyphs/embedded_fonts/trace_parser.rb
|
|
355
|
+
- lib/ucode/glyphs/embedded_fonts/trace_runner.rb
|
|
355
356
|
- lib/ucode/glyphs/embedded_fonts/writer.rb
|
|
356
|
-
- lib/ucode/glyphs/grid.rb
|
|
357
|
-
- lib/ucode/glyphs/grid_detector.rb
|
|
358
357
|
- lib/ucode/glyphs/last_resort.rb
|
|
359
358
|
- lib/ucode/glyphs/last_resort/cmap_index.rb
|
|
360
359
|
- lib/ucode/glyphs/last_resort/contents.rb
|
|
@@ -363,14 +362,7 @@ files:
|
|
|
363
362
|
- lib/ucode/glyphs/last_resort/source.rb
|
|
364
363
|
- lib/ucode/glyphs/last_resort/svg.rb
|
|
365
364
|
- lib/ucode/glyphs/last_resort/writer.rb
|
|
366
|
-
- lib/ucode/glyphs/monolith_page_map.rb
|
|
367
|
-
- lib/ucode/glyphs/mutool_renderer.rb
|
|
368
|
-
- lib/ucode/glyphs/page_renderer.rb
|
|
369
|
-
- lib/ucode/glyphs/path_bbox.rb
|
|
370
|
-
- lib/ucode/glyphs/pdf2svg_renderer.rb
|
|
371
365
|
- lib/ucode/glyphs/pdf_fetcher.rb
|
|
372
|
-
- lib/ucode/glyphs/pdftocairo_renderer.rb
|
|
373
|
-
- lib/ucode/glyphs/pipeline.rb
|
|
374
366
|
- lib/ucode/glyphs/real_fonts.rb
|
|
375
367
|
- lib/ucode/glyphs/real_fonts/block_coverage.rb
|
|
376
368
|
- lib/ucode/glyphs/real_fonts/cmap_cache.rb
|
|
@@ -397,7 +389,6 @@ files:
|
|
|
397
389
|
- lib/ucode/glyphs/universal_set/manifest_writer.rb
|
|
398
390
|
- lib/ucode/glyphs/universal_set/pre_build_check.rb
|
|
399
391
|
- lib/ucode/glyphs/universal_set/validator.rb
|
|
400
|
-
- lib/ucode/glyphs/writer.rb
|
|
401
392
|
- lib/ucode/index.rb
|
|
402
393
|
- lib/ucode/index_builder.rb
|
|
403
394
|
- lib/ucode/models.rb
|
|
@@ -1,94 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "pathname"
|
|
4
|
-
|
|
5
|
-
require "ucode/glyphs"
|
|
6
|
-
|
|
7
|
-
module Ucode
|
|
8
|
-
module Commands
|
|
9
|
-
# `ucode glyphs` — extract per-codepoint SVGs from Code Charts PDFs.
|
|
10
|
-
# Thin Thor-facing wrapper around {Ucode::Glyphs::Pipeline}:
|
|
11
|
-
# opt-in gate + experimental warning live here; the pipeline
|
|
12
|
-
# assembly (block loading, fetcher, per-block specs) lives in
|
|
13
|
-
# {Ucode::Glyphs::Pipeline}.
|
|
14
|
-
#
|
|
15
|
-
# **Status (v0.1): EXPERIMENTAL.** The cell-extraction pipeline
|
|
16
|
-
# currently includes cell-border decorations alongside the actual
|
|
17
|
-
# character outline because the Code Charts PDFs composite the two
|
|
18
|
-
# into a single glyph definition. The output is therefore not yet
|
|
19
|
-
# suitable for end-user display. The command is retained so the
|
|
20
|
-
# pipeline can be iterated on without churning the CLI surface, but
|
|
21
|
-
# callers MUST opt in via `include_glyphs: true` (CLI: `--include-glyphs`)
|
|
22
|
-
# and will receive a printed warning. Tracked for v0.2.
|
|
23
|
-
#
|
|
24
|
-
# Takes a resolved version string; CLI callers resolve via
|
|
25
|
-
# {VersionResolver.resolve} once and thread it through. See
|
|
26
|
-
# Candidate 4 of the 2026-06-29 architecture review.
|
|
27
|
-
class GlyphsCommand
|
|
28
|
-
ExperimentalWarning = "ucode glyphs is experimental in v0.1: " \
|
|
29
|
-
"extracted SVGs include cell-border decorations " \
|
|
30
|
-
"alongside the character outline."
|
|
31
|
-
private_constant :ExperimentalWarning
|
|
32
|
-
|
|
33
|
-
class << self
|
|
34
|
-
# @return [String] the experimental-status banner. Exposed so the
|
|
35
|
-
# CLI and BuildCommand surface the same message verbatim.
|
|
36
|
-
def experimental_warning
|
|
37
|
-
ExperimentalWarning
|
|
38
|
-
end
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
# @param version [String] resolved UCD version
|
|
42
|
-
# @param output_root [String, Pathname]
|
|
43
|
-
# @param block_filter [Array<String>, nil] block ids to limit to;
|
|
44
|
-
# nil = every block
|
|
45
|
-
# @param force [Boolean] re-fetch PDFs even when cached
|
|
46
|
-
# @param monolith_path [String, Pathname, nil] path to CodeCharts.pdf
|
|
47
|
-
# for fallback slicing; defaults to ./CodeCharts.pdf
|
|
48
|
-
# @param include_glyphs [Boolean] opt-in for the experimental v0.1
|
|
49
|
-
# pipeline. When false (default), the command returns a `skipped`
|
|
50
|
-
# payload without touching disk.
|
|
51
|
-
# @param warn [IO, nil] when provided, the experimental warning is
|
|
52
|
-
# written here exactly once before work begins.
|
|
53
|
-
# @return [Hash] aggregated Writer tally + version, or a `skipped`
|
|
54
|
-
# payload when opt-in is false.
|
|
55
|
-
def call(version, output_root:,
|
|
56
|
-
block_filter: nil, force: false,
|
|
57
|
-
monolith_path: Glyphs::Pipeline::DEFAULT_MONOLITH_PATH,
|
|
58
|
-
include_glyphs: false, warn: nil)
|
|
59
|
-
return skipped(version) unless include_glyphs
|
|
60
|
-
|
|
61
|
-
warn&.puts(ExperimentalWarning)
|
|
62
|
-
|
|
63
|
-
pipeline = Glyphs::Pipeline.new(
|
|
64
|
-
version: version,
|
|
65
|
-
block_filter: block_filter,
|
|
66
|
-
monolith_path: monolith_path,
|
|
67
|
-
)
|
|
68
|
-
specs = pipeline.build_specs(force: force)
|
|
69
|
-
|
|
70
|
-
writer = Glyphs::Writer.new(
|
|
71
|
-
output_root: Pathname.new(output_root),
|
|
72
|
-
parallel_workers: workers,
|
|
73
|
-
)
|
|
74
|
-
tally = writer.write_all(specs)
|
|
75
|
-
tally.merge(version: version, block_count: specs.size)
|
|
76
|
-
end
|
|
77
|
-
|
|
78
|
-
private
|
|
79
|
-
|
|
80
|
-
def workers
|
|
81
|
-
Ucode.configuration.parallel_workers
|
|
82
|
-
end
|
|
83
|
-
|
|
84
|
-
def skipped(version)
|
|
85
|
-
{
|
|
86
|
-
version: version,
|
|
87
|
-
skipped: true,
|
|
88
|
-
reason: :experimental_v0_1,
|
|
89
|
-
warning: ExperimentalWarning,
|
|
90
|
-
}
|
|
91
|
-
end
|
|
92
|
-
end
|
|
93
|
-
end
|
|
94
|
-
end
|
|
@@ -1,130 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "nokogiri"
|
|
4
|
-
|
|
5
|
-
require "ucode/glyphs/path_bbox"
|
|
6
|
-
|
|
7
|
-
module Ucode
|
|
8
|
-
module Glyphs
|
|
9
|
-
# Extracts a single character cell from a Code Charts SVG page and
|
|
10
|
-
# returns a normalized standalone SVG containing only that cell's
|
|
11
|
-
# vector paths.
|
|
12
|
-
#
|
|
13
|
-
# The cell is identified by codepoint. The extractor asks the Grid
|
|
14
|
-
# for the cell's anchor position, finds the `<use>` element placed
|
|
15
|
-
# at that position, resolves its glyph definition from `<defs>`,
|
|
16
|
-
# and emits a fresh `<svg>` whose viewBox is `0 0 1000 1000` and
|
|
17
|
-
# whose body is the glyph's `<path>` data translated and scaled to
|
|
18
|
-
# fit that viewBox with a small margin.
|
|
19
|
-
#
|
|
20
|
-
# Vector-only. Never rasterizes, never OCRs. If the cell is empty
|
|
21
|
-
# (no character glyph placed there, e.g. unassigned codepoint or
|
|
22
|
-
# control character), the extractor returns nil.
|
|
23
|
-
class CellExtractor
|
|
24
|
-
ViewBoxSize = 1000.0
|
|
25
|
-
MarginRatio = 0.1
|
|
26
|
-
private_constant :ViewBoxSize, :MarginRatio
|
|
27
|
-
|
|
28
|
-
# @param doc [Nokogiri::XML::Document] the rendered Code Charts page
|
|
29
|
-
def initialize(doc)
|
|
30
|
-
@doc = doc
|
|
31
|
-
@glyph_cache = {}
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
# @param grid [Ucode::Glyphs::Grid]
|
|
35
|
-
# @param codepoint [Integer]
|
|
36
|
-
# @return [Nokogiri::XML::Document, nil] a standalone `<svg>` doc
|
|
37
|
-
# with viewBox `0 0 1000 1000`, or nil if the cell is empty.
|
|
38
|
-
def extract(grid, codepoint)
|
|
39
|
-
anchor = grid.cell_position(codepoint)
|
|
40
|
-
return nil unless anchor
|
|
41
|
-
|
|
42
|
-
use_node = find_use_at(anchor, grid)
|
|
43
|
-
return nil unless use_node
|
|
44
|
-
|
|
45
|
-
path_data = collect_paths(use_node["xlink:href"] || use_node["href"])
|
|
46
|
-
return nil if path_data.empty?
|
|
47
|
-
|
|
48
|
-
bbox = PathBbox.estimate(path_data.join(" "))
|
|
49
|
-
return nil if bbox.empty?
|
|
50
|
-
|
|
51
|
-
build_svg(path_data, bbox, use_node["x"].to_f, use_node["y"].to_f)
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
private
|
|
55
|
-
|
|
56
|
-
def find_use_at(anchor, grid)
|
|
57
|
-
tolerance_x = grid.column_pitch / 2
|
|
58
|
-
tolerance_y = grid.row_pitch / 2
|
|
59
|
-
|
|
60
|
-
candidates = @doc.css("use").select do |node|
|
|
61
|
-
href = node["xlink:href"] || node["href"] || ""
|
|
62
|
-
href.start_with?("#glyph-") &&
|
|
63
|
-
(node["x"].to_f - anchor[0]).abs <= tolerance_x &&
|
|
64
|
-
(node["y"].to_f - anchor[1]).abs <= tolerance_y
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
candidates.min_by do |node|
|
|
68
|
-
dx = node["x"].to_f - anchor[0]
|
|
69
|
-
dy = node["y"].to_f - anchor[1]
|
|
70
|
-
(dx * dx) + (dy * dy)
|
|
71
|
-
end
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
def collect_paths(href)
|
|
75
|
-
return [] unless href
|
|
76
|
-
|
|
77
|
-
glyph_id = href.sub(/\A#/, "")
|
|
78
|
-
node = glyph_definition(glyph_id)
|
|
79
|
-
return [] unless node
|
|
80
|
-
|
|
81
|
-
node.css("path").map { |p| p["d"] }.compact
|
|
82
|
-
end
|
|
83
|
-
|
|
84
|
-
def glyph_definition(glyph_id)
|
|
85
|
-
return @glyph_cache[glyph_id] if @glyph_cache.key?(glyph_id)
|
|
86
|
-
|
|
87
|
-
@glyph_cache[glyph_id] = @doc.at_css("defs ##{glyph_id}")
|
|
88
|
-
end
|
|
89
|
-
|
|
90
|
-
def build_svg(path_data, glyph_bbox, place_x, place_y)
|
|
91
|
-
placed = PathBbox::Result.new(
|
|
92
|
-
min_x: place_x + glyph_bbox.min_x,
|
|
93
|
-
min_y: place_y + glyph_bbox.min_y,
|
|
94
|
-
max_x: place_x + glyph_bbox.max_x,
|
|
95
|
-
max_y: place_y + glyph_bbox.max_y,
|
|
96
|
-
)
|
|
97
|
-
|
|
98
|
-
width = placed.width
|
|
99
|
-
height = placed.height
|
|
100
|
-
return nil if width <= 0 || height <= 0
|
|
101
|
-
|
|
102
|
-
content_size = ViewBoxSize * (1.0 - (2.0 * MarginRatio))
|
|
103
|
-
scale = [content_size / width, content_size / height].min
|
|
104
|
-
offset_x = (ViewBoxSize - (width * scale)) / 2.0
|
|
105
|
-
offset_y = (ViewBoxSize - (height * scale)) / 2.0
|
|
106
|
-
translate_x = offset_x - (placed.min_x * scale)
|
|
107
|
-
translate_y = offset_y - (placed.min_y * scale)
|
|
108
|
-
|
|
109
|
-
builder = Nokogiri::XML::Document.new
|
|
110
|
-
root = builder.create_element(
|
|
111
|
-
"svg",
|
|
112
|
-
xmlns: "http://www.w3.org/2000/svg",
|
|
113
|
-
viewBox: "0 0 #{ViewBoxSize.to_i} #{ViewBoxSize.to_i}",
|
|
114
|
-
width: ViewBoxSize.to_i,
|
|
115
|
-
height: ViewBoxSize.to_i,
|
|
116
|
-
)
|
|
117
|
-
group = builder.create_element(
|
|
118
|
-
"g",
|
|
119
|
-
transform: "scale(#{format('%.6f', scale)}) translate(#{format('%.6f', translate_x)}, #{format('%.6f', translate_y)})",
|
|
120
|
-
)
|
|
121
|
-
path_data.each do |d|
|
|
122
|
-
group.add_child(builder.create_element("path", d: d, fill: "black"))
|
|
123
|
-
end
|
|
124
|
-
root.add_child(group)
|
|
125
|
-
builder.add_child(root)
|
|
126
|
-
builder
|
|
127
|
-
end
|
|
128
|
-
end
|
|
129
|
-
end
|
|
130
|
-
end
|
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "ucode/glyphs/page_renderer"
|
|
4
|
-
|
|
5
|
-
module Ucode
|
|
6
|
-
module Glyphs
|
|
7
|
-
# `dvisvgm` — originally a DVI-to-SVG converter, also handles PDF.
|
|
8
|
-
# The `--no-fonts` flag forces outline-only output (no font subsetting
|
|
9
|
-
# artifacts), which is what we want for vector glyph extraction.
|
|
10
|
-
#
|
|
11
|
-
# Command: `dvisvgm --pdf --no-fonts --page=<n> <in.pdf> -o <out.svg>`
|
|
12
|
-
class DvisvgmRenderer < PageRenderer
|
|
13
|
-
class << self
|
|
14
|
-
def renderer_name
|
|
15
|
-
:dvisvgm
|
|
16
|
-
end
|
|
17
|
-
|
|
18
|
-
def binary_name
|
|
19
|
-
:dvisvgm
|
|
20
|
-
end
|
|
21
|
-
|
|
22
|
-
def build_command(pdf_path, page_num, out_path)
|
|
23
|
-
["dvisvgm", "--pdf", "--no-fonts", "--page=#{page_num}",
|
|
24
|
-
pdf_path.to_s, "-o", out_path.to_s]
|
|
25
|
-
end
|
|
26
|
-
end
|
|
27
|
-
end
|
|
28
|
-
end
|
|
29
|
-
end
|
data/lib/ucode/glyphs/grid.rb
DELETED
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Ucode
|
|
4
|
-
module Glyphs
|
|
5
|
-
Grid = Struct.new(
|
|
6
|
-
:origin_x, :origin_y,
|
|
7
|
-
:column_pitch, :row_pitch,
|
|
8
|
-
:columns, :rows,
|
|
9
|
-
:block_first_cp,
|
|
10
|
-
keyword_init: true,
|
|
11
|
-
) do
|
|
12
|
-
def cell_position(codepoint)
|
|
13
|
-
offset = codepoint - block_first_cp
|
|
14
|
-
return nil if offset.negative?
|
|
15
|
-
|
|
16
|
-
row, col = offset.divmod(columns)
|
|
17
|
-
return nil if row >= rows
|
|
18
|
-
|
|
19
|
-
[origin_x + (col * column_pitch), origin_y + (row * row_pitch)]
|
|
20
|
-
end
|
|
21
|
-
|
|
22
|
-
def codepoint_at(row, col)
|
|
23
|
-
return nil if row.negative? || row >= rows
|
|
24
|
-
return nil if col.negative? || col >= columns
|
|
25
|
-
|
|
26
|
-
block_first_cp + (row * columns) + col
|
|
27
|
-
end
|
|
28
|
-
end
|
|
29
|
-
end
|
|
30
|
-
end
|
|
@@ -1,165 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "nokogiri"
|
|
4
|
-
|
|
5
|
-
require "ucode/glyphs/grid"
|
|
6
|
-
require "ucode/glyphs/path_bbox"
|
|
7
|
-
|
|
8
|
-
module Ucode
|
|
9
|
-
module Glyphs
|
|
10
|
-
# Detects the chart grid in a Code Charts PDF page rendered to SVG.
|
|
11
|
-
#
|
|
12
|
-
# The PDF page produced by pdftocairo / pdf2svg / dvisvgm contains
|
|
13
|
-
# every visible element (title, block name, row labels, codepoint
|
|
14
|
-
# digits, and the actual character glyphs) as positioned `<use>`
|
|
15
|
-
# references into a `<defs>` block of named glyph outlines. The
|
|
16
|
-
# character cells we want to extract correspond to glyphs whose
|
|
17
|
-
# bounding box is larger than every label or digit font on the
|
|
18
|
-
# page — the chart's character samples are drawn at a larger size
|
|
19
|
-
# than any of the surrounding text.
|
|
20
|
-
#
|
|
21
|
-
# Algorithm:
|
|
22
|
-
# 1. Walk `<defs>`, estimate each glyph's bbox via `PathBbox`.
|
|
23
|
-
# 2. Classify a glyph as "character-sized" when its width and
|
|
24
|
-
# height both exceed `CharSizeThreshold` (default 8 pt).
|
|
25
|
-
# This excludes title, row-label, and digit glyphs while
|
|
26
|
-
# keeping every actual character sample — including pages
|
|
27
|
-
# where the chart mixes multiple character fonts (e.g. the
|
|
28
|
-
# Basic Latin page uses one font for punctuation/digits and
|
|
29
|
-
# another for letters).
|
|
30
|
-
# 3. Collect every `<use>` that references a character-sized
|
|
31
|
-
# glyph; these are the cell origins.
|
|
32
|
-
# 4. Cluster the Y values of those uses into rows, and within
|
|
33
|
-
# each row cluster the X values into columns.
|
|
34
|
-
# 5. Drop rows whose column count diverges from the modal value
|
|
35
|
-
# (these are footer/header artifacts, not chart rows).
|
|
36
|
-
# 6. Return a `Grid` value object anchored at the top-left cell
|
|
37
|
-
# with uniform column/row pitches derived from the median
|
|
38
|
-
# spacing between adjacent clusters.
|
|
39
|
-
#
|
|
40
|
-
# This is pure (no I/O). The detector takes a parsed Nokogiri
|
|
41
|
-
# document and returns a `Grid`.
|
|
42
|
-
class GridDetector
|
|
43
|
-
CharSizeThreshold = 8.0
|
|
44
|
-
ClusterEpsilon = 15.0
|
|
45
|
-
private_constant :CharSizeThreshold, :ClusterEpsilon
|
|
46
|
-
|
|
47
|
-
class << self
|
|
48
|
-
# @param doc [Nokogiri::XML::Document]
|
|
49
|
-
# @param block_first_cp [Integer] first codepoint of the block;
|
|
50
|
-
# stored on the Grid so callers can map codepoint ↔ cell.
|
|
51
|
-
# @return [Ucode::Glyphs::Grid, nil] nil if no character grid
|
|
52
|
-
# could be detected.
|
|
53
|
-
def detect(doc, block_first_cp:)
|
|
54
|
-
uses = collect_uses(doc)
|
|
55
|
-
return nil if uses.empty?
|
|
56
|
-
|
|
57
|
-
char_glyph_ids = char_sized_glyph_ids(doc)
|
|
58
|
-
return nil if char_glyph_ids.empty?
|
|
59
|
-
|
|
60
|
-
cell_uses = uses.select { |u| char_glyph_ids.include?(u.glyph_id) }
|
|
61
|
-
return nil if cell_uses.empty?
|
|
62
|
-
|
|
63
|
-
build_grid(cell_uses, block_first_cp)
|
|
64
|
-
end
|
|
65
|
-
|
|
66
|
-
private
|
|
67
|
-
|
|
68
|
-
UsePosition = Struct.new(:x, :y, :glyph_id, :set_id, keyword_init: true)
|
|
69
|
-
|
|
70
|
-
def collect_uses(doc)
|
|
71
|
-
doc.css("use").map do |node|
|
|
72
|
-
href = node["xlink:href"] || node["href"] || ""
|
|
73
|
-
glyph_id = href.sub(/\A#/, "")
|
|
74
|
-
match = glyph_id.match(/\Aglyph-(\d+)-(\d+)\z/)
|
|
75
|
-
next nil unless match
|
|
76
|
-
|
|
77
|
-
UsePosition.new(
|
|
78
|
-
x: node["x"].to_f,
|
|
79
|
-
y: node["y"].to_f,
|
|
80
|
-
glyph_id: glyph_id,
|
|
81
|
-
set_id: match[1].to_i,
|
|
82
|
-
)
|
|
83
|
-
end.compact
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
def char_sized_glyph_ids(doc)
|
|
87
|
-
doc.css("defs g[id^='glyph-']").each_with_object({}) do |g, acc|
|
|
88
|
-
id = g["id"]
|
|
89
|
-
next unless id =~ /\Aglyph-\d+-\d+\z/
|
|
90
|
-
|
|
91
|
-
paths = g.css("path")
|
|
92
|
-
next if paths.empty?
|
|
93
|
-
|
|
94
|
-
bbox = paths.map { |p| PathBbox.estimate(p["d"]) }.reject(&:empty?).reduce do |a, b|
|
|
95
|
-
PathBbox::Result.new(
|
|
96
|
-
min_x: [a.min_x, b.min_x].min,
|
|
97
|
-
min_y: [a.min_y, b.min_y].min,
|
|
98
|
-
max_x: [a.max_x, b.max_x].max,
|
|
99
|
-
max_y: [a.max_y, b.max_y].max,
|
|
100
|
-
)
|
|
101
|
-
end
|
|
102
|
-
next unless bbox
|
|
103
|
-
|
|
104
|
-
acc[id] = true if char_sized?(bbox)
|
|
105
|
-
end
|
|
106
|
-
end
|
|
107
|
-
|
|
108
|
-
def char_sized?(bbox)
|
|
109
|
-
bbox.width >= CharSizeThreshold && bbox.height >= CharSizeThreshold
|
|
110
|
-
end
|
|
111
|
-
|
|
112
|
-
def median(values)
|
|
113
|
-
return 0.0 if values.empty?
|
|
114
|
-
|
|
115
|
-
sorted = values.sort
|
|
116
|
-
mid = sorted.size / 2
|
|
117
|
-
sorted.size.even? ? (sorted[mid - 1] + sorted[mid]) / 2.0 : sorted[mid]
|
|
118
|
-
end
|
|
119
|
-
|
|
120
|
-
def build_grid(cell_uses, block_first_cp)
|
|
121
|
-
row_clusters = cluster_by_value(cell_uses, :y)
|
|
122
|
-
return nil if row_clusters.empty?
|
|
123
|
-
|
|
124
|
-
column_clusters = cluster_by_value(cell_uses, :x)
|
|
125
|
-
return nil if column_clusters.empty?
|
|
126
|
-
|
|
127
|
-
column_starts = column_clusters.map { |c| c.map(&:x).min }.sort
|
|
128
|
-
row_starts = row_clusters.map { |c| c.map(&:y).min }.sort
|
|
129
|
-
|
|
130
|
-
Grid.new(
|
|
131
|
-
origin_x: column_starts.first,
|
|
132
|
-
origin_y: row_starts.first,
|
|
133
|
-
column_pitch: median_pitch(column_starts),
|
|
134
|
-
row_pitch: median_pitch(row_starts),
|
|
135
|
-
columns: column_starts.size,
|
|
136
|
-
rows: row_starts.size,
|
|
137
|
-
block_first_cp: block_first_cp,
|
|
138
|
-
)
|
|
139
|
-
end
|
|
140
|
-
|
|
141
|
-
def cluster_by_value(items, attr)
|
|
142
|
-
sorted = items.sort_by { |i| i.public_send(attr) }
|
|
143
|
-
clusters = []
|
|
144
|
-
sorted.each do |item|
|
|
145
|
-
value = item.public_send(attr)
|
|
146
|
-
if clusters.empty? || (value - clusters.last[:max]).abs > ClusterEpsilon
|
|
147
|
-
clusters << { max: value, items: [item] }
|
|
148
|
-
else
|
|
149
|
-
clusters.last[:max] = value
|
|
150
|
-
clusters.last[:items] << item
|
|
151
|
-
end
|
|
152
|
-
end
|
|
153
|
-
clusters.map { |c| c[:items] }
|
|
154
|
-
end
|
|
155
|
-
|
|
156
|
-
def median_pitch(sorted_values)
|
|
157
|
-
return 0.0 if sorted_values.size < 2
|
|
158
|
-
|
|
159
|
-
pitches = sorted_values.each_cons(2).map { |a, b| b - a }
|
|
160
|
-
median(pitches)
|
|
161
|
-
end
|
|
162
|
-
end
|
|
163
|
-
end
|
|
164
|
-
end
|
|
165
|
-
end
|