ucode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +211 -0
  3. data/Gemfile +22 -0
  4. data/Gemfile.lock +406 -0
  5. data/README.md +469 -0
  6. data/Rakefile +18 -0
  7. data/TODO.new/00-README.md +66 -0
  8. data/TODO.new/01-pillar-terminology-alignment.md +69 -0
  9. data/TODO.new/02-audit-schema-design.md +255 -0
  10. data/TODO.new/03-directory-output-spec.md +203 -0
  11. data/TODO.new/04-fontist-org-contract.md +173 -0
  12. data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
  13. data/TODO.new/06-audit-namespace-skeleton.md +105 -0
  14. data/TODO.new/07-audit-models-port.md +132 -0
  15. data/TODO.new/08-extractors-cheap-port.md +113 -0
  16. data/TODO.new/09-extractors-expensive-port.md +99 -0
  17. data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
  18. data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
  19. data/TODO.new/12-formatters-port.md +115 -0
  20. data/TODO.new/13-directory-emitter.md +147 -0
  21. data/TODO.new/14-html-face-browser.md +144 -0
  22. data/TODO.new/15-html-library-browser.md +102 -0
  23. data/TODO.new/16-cli-audit-subcommands.md +142 -0
  24. data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
  25. data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
  26. data/TODO.new/19-fontisan-docs-update.md +155 -0
  27. data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
  28. data/TODO.new/21-canonical-unicode17-build.md +148 -0
  29. data/TODO.new/22-implementation-order.md +176 -0
  30. data/UCODE_CHANGELOG.md +97 -0
  31. data/exe/ucode +8 -0
  32. data/lib/ucode/aggregator.rb +77 -0
  33. data/lib/ucode/audit/block_aggregator.rb +90 -0
  34. data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
  35. data/lib/ucode/audit/context.rb +137 -0
  36. data/lib/ucode/audit/discrepancy_detector.rb +213 -0
  37. data/lib/ucode/audit/extractors/aggregations.rb +70 -0
  38. data/lib/ucode/audit/extractors/base.rb +21 -0
  39. data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
  40. data/lib/ucode/audit/extractors/coverage.rb +55 -0
  41. data/lib/ucode/audit/extractors/hinting.rb +199 -0
  42. data/lib/ucode/audit/extractors/identity.rb +65 -0
  43. data/lib/ucode/audit/extractors/licensing.rb +75 -0
  44. data/lib/ucode/audit/extractors/metrics.rb +108 -0
  45. data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
  46. data/lib/ucode/audit/extractors/provenance.rb +34 -0
  47. data/lib/ucode/audit/extractors/style.rb +88 -0
  48. data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
  49. data/lib/ucode/audit/extractors.rb +31 -0
  50. data/lib/ucode/audit/plane_aggregator.rb +37 -0
  51. data/lib/ucode/audit/registry.rb +63 -0
  52. data/lib/ucode/audit/script_aggregator.rb +92 -0
  53. data/lib/ucode/audit.rb +27 -0
  54. data/lib/ucode/cache.rb +113 -0
  55. data/lib/ucode/cli.rb +272 -0
  56. data/lib/ucode/commands/build.rb +68 -0
  57. data/lib/ucode/commands/cache.rb +46 -0
  58. data/lib/ucode/commands/fetch.rb +62 -0
  59. data/lib/ucode/commands/font_coverage.rb +57 -0
  60. data/lib/ucode/commands/glyphs.rb +136 -0
  61. data/lib/ucode/commands/lookup.rb +65 -0
  62. data/lib/ucode/commands/parse.rb +62 -0
  63. data/lib/ucode/commands/site.rb +33 -0
  64. data/lib/ucode/commands.rb +19 -0
  65. data/lib/ucode/config.rb +110 -0
  66. data/lib/ucode/coordinator/indices.rb +34 -0
  67. data/lib/ucode/coordinator.rb +397 -0
  68. data/lib/ucode/database.rb +214 -0
  69. data/lib/ucode/db_builder.rb +107 -0
  70. data/lib/ucode/error.rb +96 -0
  71. data/lib/ucode/fetch/code_charts.rb +57 -0
  72. data/lib/ucode/fetch/http.rb +83 -0
  73. data/lib/ucode/fetch/ucd_zip.rb +57 -0
  74. data/lib/ucode/fetch/unihan_zip.rb +57 -0
  75. data/lib/ucode/fetch.rb +14 -0
  76. data/lib/ucode/glyphs/cell_extractor.rb +130 -0
  77. data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
  78. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
  79. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
  80. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
  81. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
  82. data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
  83. data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
  84. data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
  85. data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
  86. data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
  87. data/lib/ucode/glyphs/grid.rb +30 -0
  88. data/lib/ucode/glyphs/grid_detector.rb +165 -0
  89. data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
  90. data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
  91. data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
  92. data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
  93. data/lib/ucode/glyphs/last_resort/source.rb +125 -0
  94. data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
  95. data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
  96. data/lib/ucode/glyphs/last_resort.rb +36 -0
  97. data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
  98. data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
  99. data/lib/ucode/glyphs/page_renderer.rb +221 -0
  100. data/lib/ucode/glyphs/path_bbox.rb +62 -0
  101. data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
  102. data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
  103. data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
  104. data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
  105. data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
  106. data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
  107. data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
  108. data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
  109. data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
  110. data/lib/ucode/glyphs/real_fonts.rb +32 -0
  111. data/lib/ucode/glyphs/writer.rb +250 -0
  112. data/lib/ucode/glyphs.rb +27 -0
  113. data/lib/ucode/index.rb +106 -0
  114. data/lib/ucode/index_builder.rb +94 -0
  115. data/lib/ucode/models/audit/audit_axis.rb +30 -0
  116. data/lib/ucode/models/audit/audit_diff.rb +77 -0
  117. data/lib/ucode/models/audit/audit_report.rb +137 -0
  118. data/lib/ucode/models/audit/baseline.rb +32 -0
  119. data/lib/ucode/models/audit/block_summary.rb +72 -0
  120. data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
  121. data/lib/ucode/models/audit/codepoint_range.rb +39 -0
  122. data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
  123. data/lib/ucode/models/audit/color_capabilities.rb +91 -0
  124. data/lib/ucode/models/audit/discrepancy.rb +38 -0
  125. data/lib/ucode/models/audit/duplicate_group.rb +23 -0
  126. data/lib/ucode/models/audit/embedding_type.rb +81 -0
  127. data/lib/ucode/models/audit/field_change.rb +28 -0
  128. data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
  129. data/lib/ucode/models/audit/gasp_range.rb +63 -0
  130. data/lib/ucode/models/audit/hinting.rb +99 -0
  131. data/lib/ucode/models/audit/library_summary.rb +40 -0
  132. data/lib/ucode/models/audit/licensing.rb +48 -0
  133. data/lib/ucode/models/audit/metrics.rb +111 -0
  134. data/lib/ucode/models/audit/named_instance.rb +41 -0
  135. data/lib/ucode/models/audit/opentype_layout.rb +38 -0
  136. data/lib/ucode/models/audit/plane_summary.rb +31 -0
  137. data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
  138. data/lib/ucode/models/audit/script_features.rb +28 -0
  139. data/lib/ucode/models/audit/script_summary.rb +54 -0
  140. data/lib/ucode/models/audit/variation_detail.rb +42 -0
  141. data/lib/ucode/models/audit.rb +50 -0
  142. data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
  143. data/lib/ucode/models/bidi_mirroring.rb +19 -0
  144. data/lib/ucode/models/binary_property_assignment.rb +26 -0
  145. data/lib/ucode/models/block.rb +36 -0
  146. data/lib/ucode/models/case_folding_rule.rb +23 -0
  147. data/lib/ucode/models/cjk_radical.rb +23 -0
  148. data/lib/ucode/models/codepoint/bidi.rb +28 -0
  149. data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
  150. data/lib/ucode/models/codepoint/case_folding.rb +25 -0
  151. data/lib/ucode/models/codepoint/casing.rb +32 -0
  152. data/lib/ucode/models/codepoint/decomposition.rb +27 -0
  153. data/lib/ucode/models/codepoint/display.rb +24 -0
  154. data/lib/ucode/models/codepoint/emoji.rb +29 -0
  155. data/lib/ucode/models/codepoint/hangul.rb +20 -0
  156. data/lib/ucode/models/codepoint/identifier.rb +30 -0
  157. data/lib/ucode/models/codepoint/indic.rb +20 -0
  158. data/lib/ucode/models/codepoint/joining.rb +20 -0
  159. data/lib/ucode/models/codepoint/normalization.rb +35 -0
  160. data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
  161. data/lib/ucode/models/codepoint.rb +122 -0
  162. data/lib/ucode/models/name_alias.rb +21 -0
  163. data/lib/ucode/models/named_sequence.rb +19 -0
  164. data/lib/ucode/models/names_list_entry.rb +38 -0
  165. data/lib/ucode/models/plane.rb +36 -0
  166. data/lib/ucode/models/property_alias.rb +24 -0
  167. data/lib/ucode/models/property_value_alias.rb +26 -0
  168. data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
  169. data/lib/ucode/models/relationship/cross_reference.rb +17 -0
  170. data/lib/ucode/models/relationship/footnote.rb +24 -0
  171. data/lib/ucode/models/relationship/informal_alias.rb +18 -0
  172. data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
  173. data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
  174. data/lib/ucode/models/relationship.rb +57 -0
  175. data/lib/ucode/models/script.rb +41 -0
  176. data/lib/ucode/models/special_casing_rule.rb +28 -0
  177. data/lib/ucode/models/standardized_variant.rb +24 -0
  178. data/lib/ucode/models/unihan_entry.rb +23 -0
  179. data/lib/ucode/models.rb +47 -0
  180. data/lib/ucode/parsers/auxiliary.rb +26 -0
  181. data/lib/ucode/parsers/base.rb +137 -0
  182. data/lib/ucode/parsers/bidi_brackets.rb +41 -0
  183. data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
  184. data/lib/ucode/parsers/blocks.rb +63 -0
  185. data/lib/ucode/parsers/case_folding.rb +53 -0
  186. data/lib/ucode/parsers/cjk_radicals.rb +102 -0
  187. data/lib/ucode/parsers/derived_age.rb +59 -0
  188. data/lib/ucode/parsers/derived_core_properties.rb +60 -0
  189. data/lib/ucode/parsers/extracted_properties.rb +74 -0
  190. data/lib/ucode/parsers/name_aliases.rb +44 -0
  191. data/lib/ucode/parsers/named_sequences.rb +51 -0
  192. data/lib/ucode/parsers/names_list.rb +250 -0
  193. data/lib/ucode/parsers/property_aliases.rb +41 -0
  194. data/lib/ucode/parsers/property_value_aliases.rb +46 -0
  195. data/lib/ucode/parsers/script_extensions.rb +64 -0
  196. data/lib/ucode/parsers/scripts.rb +60 -0
  197. data/lib/ucode/parsers/special_casing.rb +62 -0
  198. data/lib/ucode/parsers/standardized_variants.rb +56 -0
  199. data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
  200. data/lib/ucode/parsers/unicode_data.rb +268 -0
  201. data/lib/ucode/parsers/unihan.rb +125 -0
  202. data/lib/ucode/parsers.rb +35 -0
  203. data/lib/ucode/range_entry.rb +58 -0
  204. data/lib/ucode/repo/aggregate_writer.rb +364 -0
  205. data/lib/ucode/repo/atomic_writes.rb +48 -0
  206. data/lib/ucode/repo/codepoint_writer.rb +96 -0
  207. data/lib/ucode/repo/paths.rb +122 -0
  208. data/lib/ucode/repo.rb +22 -0
  209. data/lib/ucode/site/config_emitter.rb +124 -0
  210. data/lib/ucode/site/generator.rb +178 -0
  211. data/lib/ucode/site/search_index.rb +68 -0
  212. data/lib/ucode/site/template/.gitignore +4 -0
  213. data/lib/ucode/site/template/.vitepress/config.ts +8 -0
  214. data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
  215. data/lib/ucode/site/template/char/[codepoint].md +13 -0
  216. data/lib/ucode/site/template/components/BlockView.vue +57 -0
  217. data/lib/ucode/site/template/components/CharView.vue +85 -0
  218. data/lib/ucode/site/template/components/PlaneView.vue +56 -0
  219. data/lib/ucode/site/template/components/SearchView.vue +66 -0
  220. data/lib/ucode/site/template/index.md +25 -0
  221. data/lib/ucode/site/template/package.json +18 -0
  222. data/lib/ucode/site/template/search.md +9 -0
  223. data/lib/ucode/site.rb +13 -0
  224. data/lib/ucode/version.rb +5 -0
  225. data/lib/ucode/version_resolver.rb +76 -0
  226. data/lib/ucode.rb +74 -0
  227. data/ucode.gemspec +56 -0
  228. metadata +404 -0
@@ -0,0 +1,126 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fileutils"
4
+ require "pathname"
5
+ require "tempfile"
6
+
7
+ require "fontisan"
8
+ require_relative "../../error"
9
+
10
+ module Ucode
11
+ module Glyphs
12
+ module EmbeddedFonts
13
+ # Value object describing one Type0 font discovered in the Code
14
+ # Charts PDF, plus lazy accessors for its outline data.
15
+ #
16
+ # A FontEntry is constructed by {Catalog} during the PDF walk and
17
+ # is the unit of work for the renderer. Each entry owns:
18
+ #
19
+ # * identity — `base_font` name, font dict object number
20
+ # * stream refs — object numbers of the FontDescriptor's
21
+ # FontFile2 (TrueType) / FontFile3 (CFF) and the ToUnicode CMap
22
+ # * `cid_to_gid_map` — `:identity` (gid == cid) or `:stream`
23
+ # (we'd need to parse a separate map; not currently supported)
24
+ # * `codepoint_to_gid` — the per-font map built from the parsed
25
+ # ToUnicode CMap. Frozen.
26
+ #
27
+ # The fontisan accessor is built lazily on first {#accessor} call,
28
+ # and the font program is extracted to the {Source} cache directory
29
+ # at the same point. Subsequent calls reuse the cached file unless
30
+ # the PDF is newer than the cache.
31
class FontEntry
  attr_reader :base_font, :font_obj_id, :fontfile_obj_id,
              :fontfile_kind, :tounicode_obj_id, :cid_to_gid_map,
              :codepoint_to_gid, :source

  # Stores the identity, stream references and codepoint map of one
  # embedded font. No I/O happens here; the accessor starts unset.
  #
  # @param base_font [String] e.g. "CIAIIP+Uni2000Generalpunctuation"
  # @param font_obj_id [Integer] Type0 font dict object number
  # @param fontfile_obj_id [Integer] FontFile2/3 stream object number
  # @param fontfile_kind [Symbol] :ttf (FontFile2) or :cff (FontFile3)
  # @param tounicode_obj_id [Integer] ToUnicode CMap stream object number
  # @param cid_to_gid_map [Symbol] :identity (we only support this)
  # @param codepoint_to_gid [Hash{Integer=>Integer}] cp → gid map
  # @param source [Source] for cache path + pdf path
  def initialize(base_font:, font_obj_id:, fontfile_obj_id:,
                 fontfile_kind:, tounicode_obj_id:, cid_to_gid_map:,
                 codepoint_to_gid:, source:)
    @base_font = base_font
    @font_obj_id = font_obj_id
    @fontfile_obj_id = fontfile_obj_id
    @fontfile_kind = fontfile_kind
    @tounicode_obj_id = tounicode_obj_id
    @cid_to_gid_map = cid_to_gid_map
    @codepoint_to_gid = codepoint_to_gid
    @source = source
    @accessor = nil
  end

  # Looks the codepoint up in this font's cp → gid map.
  #
  # @param codepoint [Integer]
  # @return [Integer, nil] the GID, or nil when the map has no entry
  def gid_for(codepoint)
    codepoint_to_gid[codepoint]
  end

  # @return [Array<Integer>] every codepoint present in the cp → gid map
  def codepoints
    codepoint_to_gid.keys
  end

  # @return [String] ".ttf" for a :ttf font, ".cff" for anything else
  def fontfile_extension
    return ".ttf" if fontfile_kind == :ttf

    ".cff"
  end

  # @return [Pathname] location the source assigns to this font's
  #   extracted program
  def cache_path
    source.font_cache_path(base_font, fontfile_extension)
  end

  # Returns the glyph accessor, building it on first use (which may
  # extract the font program to the cache) and reusing it afterwards.
  #
  # @return [Fontisan::GlyphAccessor]
  def accessor
    @accessor ||= build_accessor
  end

  # Drops the memoized accessor so the next {#accessor} call rebuilds
  # it. The cached font file on disk is left alone.
  #
  # @return [void]
  def reset_accessor!
    @accessor = nil
  end

  private

  # Makes sure the font file is on disk, then loads it through fontisan.
  def build_accessor
    ensure_font_cached!
    loaded_font = Fontisan::FontLoader.load(cache_path.to_s)
    Fontisan::GlyphAccessor.new(loaded_font)
  end

  # Re-extracts only when the cache file is absent or older than the PDF.
  def ensure_font_cached!
    cached = cache_path
    return if cached.exist? && cached.mtime >= source.pdf_path.mtime

    parent = cached.dirname
    parent.mkpath unless parent.exist?
    extract_font_stream!
  end

  # Dumps the raw font stream with `mutool show -b -o` into a temp file
  # beside the cache target, then moves it into place so a failed
  # extraction never leaves a partial cache file behind.
  #
  # @raise [Ucode::EmbeddedFontsMissingError] when mutool exits non-zero
  #   or cannot be executed
  def extract_font_stream!
    destination = cache_path
    Tempfile.create([base_font, fontfile_extension], destination.dirname.to_s, binmode: true) do |scratch|
      # mutool writes to the path itself; release our handle first.
      scratch.close
      command = ["mutool", "show", "-o", scratch.path, "-b",
                 source.pdf_to_s, fontfile_obj_id.to_s]
      extracted = system(*command, out: File::NULL, err: File::NULL)
      unless extracted
        raise Ucode::EmbeddedFontsMissingError,
              "mutool failed to extract font stream (obj=#{fontfile_obj_id})"
      end

      FileUtils.mv(scratch.path, destination.to_s, force: true)
    end
  end
end
124
+ end
125
+ end
126
+ end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "svg"
4
+
5
+ module Ucode
6
+ module Glyphs
7
+ module EmbeddedFonts
8
+ # Renders one codepoint's glyph by chaining the {Catalog} index
9
+ # lookup → {FontEntry} accessor → {Svg} wrapper.
10
+ #
11
+ # Mirrors {LastResort::Renderer}: a Result struct is returned on
12
+ # success, nil on miss. The caller (Writer or CLI) decides how to
13
+ # handle misses — typically by falling back to the LastResort
14
+ # renderer.
15
class Renderer
  # Outcome of rendering a single codepoint: which font and GID were
  # used, plus the finished SVG text.
  Result = Struct.new(:codepoint, :base_font, :gid, :svg, keyword_init: true) do
    # @return [Boolean] true when an SVG string is present
    def ok?
      !svg.nil?
    end
  end

  # @param catalog [Catalog] index used to find the font entry for a
  #   codepoint
  def initialize(catalog)
    @catalog = catalog
  end

  # Resolves codepoint → font entry → GID → outline → SVG.
  #
  # @param codepoint [Integer]
  # @return [Result, nil] nil when the catalog has no entry for the
  #   codepoint, the entry has no GID for it, or the outline is
  #   nil/empty
  def render(codepoint)
    font = @catalog.lookup(codepoint)
    return nil unless font

    glyph_id = font.gid_for(codepoint)
    return nil unless glyph_id

    contours = font.accessor.outline_for_id(glyph_id)
    return nil if contours.nil? || contours.empty?

    document = Svg.new(contours, codepoint: codepoint, base_font: font.base_font).to_s
    Result.new(
      codepoint: codepoint,
      base_font: font.base_font,
      gid: glyph_id,
      svg: document,
    )
  end
end
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+
5
+ require_relative "../../error"
6
+
7
+ module Ucode
8
+ module Glyphs
9
+ module EmbeddedFonts
10
+ # Locates the Code Charts PDF on disk and the directory where
11
+ # extracted font streams are cached.
12
+ #
13
+ # PDF resolution order (first match wins):
14
+ #
15
+ # 1. Explicit `pdf:` argument.
16
+ # 2. `UCODE_CODE_CHARTS_PDF` environment variable.
17
+ # 3. Conventional `<gem_root>/CodeCharts.pdf`.
18
+ #
19
+ # Per-block PDFs (preferred for incremental runs) can be supplied
20
+ # via the `pdf:` argument by the caller — typically the CLI.
21
+ #
22
+ # Cache resolution order:
23
+ #
24
+ # 1. Explicit `cache_dir:` argument.
25
+ # 2. `UCODE_PDF_FONT_CACHE` environment variable.
26
+ # 3. Conventional `<gem_root>/data/pdf-fonts/`.
27
+ #
28
+ # The cache holds one file per embedded font program, named after
29
+ # the BaseFont (e.g. `CIAIIP+Uni2000Generalpunctuation.ttf`).
30
+ # Re-runs skip extraction when the cached file is newer than the
31
+ # PDF.
32
class Source
  # Characters that must never reach the filesystem as part of a cache
  # filename: path separators (which would let a BaseFont taken from
  # the PDF point outside the cache directory) and NUL.
  UNSAFE_FILENAME_CHARS = %r{[/\\\0]}.freeze
  private_constant :UNSAFE_FILENAME_CHARS

  attr_reader :pdf_path, :cache_dir

  # Resolves the PDF and cache locations and creates the cache
  # directory when it does not exist yet.
  #
  # @param pdf [String, Pathname, nil] path to a Code Charts PDF
  # @param cache_dir [String, Pathname, nil] directory for cached
  #   font files; created on demand
  # @param env [Hash{String=>String}] env var source (defaults to ENV)
  # @param gem_root [String, Pathname, nil] gem root for the
  #   conventional fallback; injectable for tests
  # @raise [Ucode::EmbeddedFontsMissingError] if the PDF is missing
  def initialize(pdf: nil, cache_dir: nil, env: ENV, gem_root: nil)
    @pdf_path = resolve_pdf(pdf, env, gem_root)
    unless @pdf_path&.exist?
      raise Ucode::EmbeddedFontsMissingError,
            "Code Charts PDF not found at #{@pdf_path}"
    end

    @cache_dir = resolve_cache(cache_dir, env, gem_root)
    @cache_dir.mkpath unless @cache_dir.exist?
  end

  # @return [String] absolute path to the PDF, suitable for shelling
  #   out to `mutool`
  def pdf_to_s
    @pdf_path.to_s
  end

  # Builds the cache location for one embedded font. Path separators
  # and NUL bytes in `base_font` are replaced with `_` so the result
  # always stays directly inside {#cache_dir}; ordinary BaseFont names
  # (e.g. "CIAIIP+Uni2000Generalpunctuation") are unaffected.
  #
  # @param base_font [String] e.g. "CIAIIP+Uni2000Generalpunctuation"
  # @param extension [String] e.g. ".ttf" or ".cff"
  # @return [Pathname] cache path for the named font
  def font_cache_path(base_font, extension)
    safe_name = base_font.to_s.gsub(UNSAFE_FILENAME_CHARS, "_")
    @cache_dir.join("#{safe_name}#{extension}")
  end

  private

  # PDF lookup order: explicit argument, then the
  # UCODE_CODE_CHARTS_PDF variable, then <gem_root>/CodeCharts.pdf.
  def resolve_pdf(explicit, env, gem_root)
    return Pathname.new(explicit).expand_path if explicit

    env_val = env["UCODE_CODE_CHARTS_PDF"]
    return Pathname.new(env_val).expand_path if env_val && !env_val.empty?

    root_for(gem_root).expand_path.join("CodeCharts.pdf")
  end

  # Cache lookup order: explicit argument, then the
  # UCODE_PDF_FONT_CACHE variable, then <gem_root>/data/pdf-fonts.
  def resolve_cache(explicit, env, gem_root)
    return Pathname.new(explicit).expand_path if explicit

    env_val = env["UCODE_PDF_FONT_CACHE"]
    return Pathname.new(env_val).expand_path if env_val && !env_val.empty?

    root_for(gem_root).expand_path.join("data", "pdf-fonts")
  end

  # Injected gem root when given, otherwise the one derived from this
  # file's location.
  def root_for(gem_root)
    gem_root ? Pathname.new(gem_root) : default_gem_root
  end

  # __dir__ = <root>/lib/ucode/glyphs/embedded_fonts. That is four
  # directories below the project root (the directory containing
  # `lib/`), so four `..` are needed; a fifth would land in the
  # root's parent.
  def default_gem_root
    Pathname.new(__dir__).join("..", "..", "..", "..")
  end
end
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,123 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ module Glyphs
5
+ module EmbeddedFonts
6
+ # Converts a fontisan `GlyphOutline` into a standalone SVG document
7
+ # shaped to match the {LastResort::Svg} output (y-flipped, viewBox
8
+ # padded around the bbox, single `<path>` child).
9
+ #
10
+ # The fontisan outline is in font units, with y growing upward
11
+ # (PostScript convention). SVG y grows downward. We:
12
+ #
13
+ # 1. Walk `outline.to_commands` and re-emit each command with
14
+ # the y coordinate negated. The commands we get are
15
+ # `:move_to`, `:line_to`, `:curve_to` (quadratic; one
16
+ # control + one end point), and `:close_path`.
17
+ # 2. Build a viewBox from the outline's bbox with a small pad,
18
+ # y-flipped so min_y is the SVG-space top.
19
+ #
20
+ # The y-negation happens at emit time, not at parse time, so we
21
+ # never have to read back a serialized path string.
22
class Svg
  PaddingRatio = 0.08
  private_constant :PaddingRatio

  # Characters that are markup-significant inside an XML text node.
  XML_ESCAPES = { "&" => "&amp;", "<" => "&lt;", ">" => "&gt;" }.freeze
  private_constant :XML_ESCAPES

  # @param outline [Fontisan::Models::GlyphOutline] must respond to
  #   `to_commands` and `bbox`
  # @param codepoint [Integer, nil] optional, for the `<title>`
  # @param base_font [String, nil] optional source-font name, also
  #   for the `<title>` (debugging which PDF font a glyph came from)
  def initialize(outline, codepoint: nil, base_font: nil)
    @outline = outline
    @codepoint = codepoint
    @base_font = base_font
  end

  # Renders the XML prolog, the `<svg>` root sized to the padded bbox,
  # an optional `<title>` and a single `<path>`.
  #
  # @return [String] complete `<svg>...</svg>` document
  def to_s
    box = view_box
    title = title_text # computed once; nil when no codepoint was given
    lines = []
    lines << %(<svg xmlns="http://www.w3.org/2000/svg" viewBox="#{format_dims(box)}" width="#{format_num(box[:width])}" height="#{format_num(box[:height])}" preserveAspectRatio="xMidYMid meet">)
    lines << %( <title>#{title}</title>) if title
    lines << %( <path d="#{path_data}" fill="currentColor" fill-rule="evenodd"/>)
    lines << %(</svg>)
    %(<?xml version="1.0" encoding="UTF-8"?>\n#{lines.join("\n")}\n)
  end

  # SVG path data with y already negated. Exposed for tests and
  # for callers that want to embed the path in their own wrapper.
  # Commands other than move/line/curve/close are skipped.
  #
  # @return [String]
  def path_data
    @outline.to_commands.map { |cmd| path_segment(cmd) }.compact.join(" ")
  end

  private

  # Translates one outline command into its SVG segment, or nil for a
  # command this class does not know.
  def path_segment(cmd)
    case cmd.first
    when :move_to then format_cmd("M", cmd[1], cmd[2])
    when :line_to then format_cmd("L", cmd[1], cmd[2])
    when :curve_to then curve_segment(cmd)
    when :close_path then "Z"
    end
  end

  # A `:curve_to` with one control point + end point is quadratic.
  # NOTE(review): a `:curve_to` carrying six coordinates is treated as
  # a cubic Bézier (two control points + end point) instead of being
  # truncated to a wrong quadratic. Whether fontisan ever emits that
  # shape for CFF outlines is an assumption — confirm against its API.
  def curve_segment(cmd)
    return format_cmd_q(cmd[1], cmd[2], cmd[3], cmd[4]) if cmd[5].nil? || cmd[6].nil?

    format_cmd_c(cmd[1], cmd[2], cmd[3], cmd[4], cmd[5], cmd[6])
  end

  # Builds the `<title>` text, XML-escaping the font name because it
  # originates from the PDF and may contain `&` or `<`.
  def title_text
    return nil unless @codepoint

    origin = @base_font ? ": #{escape_xml(@base_font)}" : ""
    "U+#{format("%04X", @codepoint)} (Code Charts#{origin})"
  end

  def escape_xml(text)
    text.to_s.gsub(/[&<>]/, XML_ESCAPES)
  end

  # Padded, y-flipped viewBox; falls back to a 1x1 box when the
  # outline reports no bbox or an all-zero one.
  def view_box
    bb = @outline.bbox
    return { min_x: 0, min_y: 0, width: 1, height: 1 } if bb.nil? || empty_bbox?(bb)

    min_x = bb[:x_min].to_f
    max_x = bb[:x_max].to_f
    min_y = bb[:y_min].to_f
    max_y = bb[:y_max].to_f
    # A degenerate (zero-extent) axis still needs a positive size.
    width = (max_x - min_x).nonzero? || 1.0
    height = (max_y - min_y).nonzero? || 1.0
    pad_x = width * PaddingRatio
    pad_y = height * PaddingRatio
    {
      min_x: min_x - pad_x,
      # Font-space top (max_y) becomes the SVG-space top after negation.
      min_y: -(max_y + pad_y),
      width: width + (2 * pad_x),
      height: height + (2 * pad_y),
    }
  end

  def empty_bbox?(bb)
    bb[:x_min] == 0 && bb[:y_min] == 0 && bb[:x_max] == 0 && bb[:y_max] == 0
  end

  def format_dims(box)
    format("%<min_x>.2f %<min_y>.2f %<width>.2f %<height>.2f", box)
  end

  def format_cmd(letter, x, y)
    "#{letter} #{format_num(x)} #{format_num(-y)}"
  end

  def format_cmd_q(cx, cy, ex, ey)
    "Q #{format_num(cx)} #{format_num(-cy)} #{format_num(ex)} #{format_num(-ey)}"
  end

  def format_cmd_c(c1x, c1y, c2x, c2y, ex, ey)
    "C #{format_num(c1x)} #{format_num(-c1y)} #{format_num(c2x)} #{format_num(-c2y)} " \
      "#{format_num(ex)} #{format_num(-ey)}"
  end

  # Whole numbers print without a fraction; everything else with two
  # decimals.
  def format_num(n)
    if n.is_a?(Integer) || n.to_f == n.to_i
      n.to_i.to_s
    else
      format("%.2f", n)
    end
  end
end
121
+ end
122
+ end
123
+ end
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ module Glyphs
5
+ module EmbeddedFonts
6
+ # Parses a PDF ToUnicode CMap stream into a `{cid => codepoint}` Hash.
7
+ #
8
+ # PDF ToUnicode CMaps (Adobe Technical Note #5014) use a small
9
+ # PostScript-like syntax with three constructs that matter to us:
10
+ #
11
+ # * `N begincodespacerange ... endcodespacerange` — declares the
12
+ # valid code space. We ignore this; we just take whatever the
13
+ # bfchar/bfrange entries hand us.
14
+ # * `N beginbfchar ... endbfchar` — one-to-one cid → unicode
15
+ # mappings, one pair per line: `<cid_hex> <uni_hex>`.
16
+ # * `N beginbfrange ... endbfrange` — range mappings. Two forms:
17
+ # * `<lo> <hi> <start>` — cids lo..hi map to consecutive
18
+ # codepoints starting at `start`.
19
+ # * `<lo> <hi> [<u1> <u2> ... <un>]` — explicit per-cid
20
+ # mapping within the range.
21
+ #
22
+ # The unicode target string may encode one codepoint (4 hex digits
23
+ # for BMP, 8 for an astral codepoint via UTF-16 surrogate pair) or
24
+ # a sequence (multiple codepoints, used for ligatures). For our
25
+ # purposes — attributing one Code Charts glyph to one codepoint —
26
+ # we take the first codepoint of the target string and ignore the
27
+ # rest.
28
module ToUnicode
  BFCHAR_SECTION = /beginbfchar\s*(.*?)\s*endbfchar/m.freeze
  BFCHAR_PAIR = /<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>/.freeze
  BFRANGE_SECTION = /beginbfrange\s*(.*?)\s*endbfrange/m.freeze
  # Either `<lo> <hi> <start>` or `<lo> <hi> [<u1> ... <un>]`.
  BFRANGE_ENTRY = /<([0-9A-Fa-f]+)>\s*<([0-9A-Fa-f]+)>\s*(?:<([0-9A-Fa-f]+)>|\[([^\]]*)\])/.freeze
  HEX_TOKEN = /<([0-9A-Fa-f]+)>/.freeze
  HIGH_SURROGATES = (0xD800..0xDBFF).freeze
  LOW_SURROGATES = (0xDC00..0xDFFF).freeze
  private_constant :BFCHAR_SECTION, :BFCHAR_PAIR, :BFRANGE_SECTION,
                   :BFRANGE_ENTRY, :HEX_TOKEN, :HIGH_SURROGATES, :LOW_SURROGATES

  # Collects every bfchar mapping, then every bfrange mapping (so a
  # range entry overrides a bfchar entry for the same cid). Entries
  # whose target cannot be decoded to a codepoint are skipped.
  #
  # @param cmap_text [String] raw decoded CMap stream text
  # @return [Hash{Integer=>Integer}] frozen cid → codepoint map
  def self.parse(cmap_text)
    result = {}
    scan_bfchar(cmap_text, result)
    scan_bfrange(cmap_text, result)
    result.freeze
  end

  class << self
    private

    # Handles `beginbfchar ... endbfchar` sections: one
    # `<cid_hex> <uni_hex>` pair per mapping.
    def scan_bfchar(text, result)
      text.scan(BFCHAR_SECTION) do |(body)|
        body.scan(BFCHAR_PAIR).each do |cid_h, uni_h|
          cp = decode_target(uni_h)
          result[cid_h.to_i(16)] = cp if cp
        end
      end
    end

    # Handles `beginbfrange ... endbfrange` sections in both the
    # consecutive (`<start>`) and the explicit-array form.
    def scan_bfrange(text, result)
      text.scan(BFRANGE_SECTION) do |(body)|
        body.scan(BFRANGE_ENTRY).each do |lo_h, hi_h, start_h, arr|
          lo = lo_h.to_i(16)
          hi = hi_h.to_i(16)
          if start_h
            map_consecutive(lo, hi, start_h, result)
          elsif arr
            map_explicit(lo, hi, arr, result)
          end
        end
      end
    end

    # cids lo..hi map to consecutive codepoints beginning at the
    # decoded start target.
    def map_consecutive(lo, hi, start_h, result)
      start = decode_target(start_h)
      return unless start

      (lo..hi).each_with_index { |cid, i| result[cid] = start + i }
    end

    # The i-th array element maps cid lo + i; elements beyond `hi`
    # are ignored.
    def map_explicit(lo, hi, arr, result)
      arr.scan(HEX_TOKEN).flatten.each_with_index do |uni_h, i|
        cid = lo + i
        break if cid > hi

        cp = decode_target(uni_h)
        result[cid] = cp if cp
      end
    end

    # Decode a CMap target hex string into a single codepoint.
    # The target is read as UTF-16BE code units of four hex digits:
    # one unit for a BMP codepoint, a high + low surrogate pair for
    # an astral one, or a longer sequence, of which only the first
    # codepoint is used.
    #
    # A surrogate that does not form a valid pair (a lone surrogate,
    # or a high surrogate followed by a non-low unit) yields nil
    # rather than being combined into an unrelated codepoint.
    #
    # @param hex [String] hexadecimal digits
    # @return [Integer, nil] the first codepoint, or nil if hex is
    #   empty or starts with an unpaired surrogate
    def decode_target(hex)
      return nil if hex.nil? || hex.empty?

      first = hex[0, 4].to_i(16)
      return first unless HIGH_SURROGATES.cover?(first) || LOW_SURROGATES.cover?(first)
      return nil unless HIGH_SURROGATES.cover?(first) && hex.length >= 8

      second = hex[4, 4].to_i(16)
      return nil unless LOW_SURROGATES.cover?(second)

      0x10000 + ((first - 0xD800) << 10) + (second - 0xDC00)
    end
  end
end
101
+ end
102
+ end
103
+ end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+
5
+ require_relative "renderer"
6
+ require_relative "../../repo/atomic_writes"
7
+ require_relative "../../repo/paths"
8
+
9
+ module Ucode
10
+ module Glyphs
11
+ module EmbeddedFonts
12
+ # Writes one `glyph.svg` per codepoint in `codepoints`, sourcing
13
+ # the outline from the Code Charts PDF's embedded font program.
14
+ #
15
+ # The Catalog and Renderer are shared across the loop so the
16
+ # expensive PDF walk + ToUnicode parse + fontisan load happen
17
+ # once per process. Each FontEntry memoizes its own fontisan
18
+ # accessor; in long CJK runs you may want to call
19
+ # `entry.reset_accessor!` periodically (the Writer doesn't).
20
+ #
21
+ # Idempotent and atomic via `Repo::AtomicWrites` — same protocol
22
+ # as the LastResort and v0.1 cell-extractor writers.
23
class Writer
  include Repo::AtomicWrites

  # @param output_root [String, Pathname] root directory the glyph
  #   paths are built under
  # @param catalog [Catalog] shared with the internal {Renderer}
  def initialize(output_root:, catalog:)
    @output_root = Pathname.new(output_root)
    @catalog = catalog
    @renderer = Renderer.new(catalog)
  end

  # Write `glyph.svg` for each requested codepoint and count outcomes.
  #
  # A codepoint is counted as `:missing` when `block_lookup` returns
  # nil for it or when the renderer produces no usable result;
  # otherwise it is `:written` or `:skipped` according to the truthy
  # or falsy return value of `write_atomic`.
  #
  # @param codepoints [Array<Integer>, Enumerable<Integer>] which
  #   codepoints to render. Defaults to `@catalog.codepoints`.
  # @param block_lookup [Proc, #call] codepoint → block id string
  #   (e.g. `"Basic_Latin"`), or nil for codepoints without a block
  # @return [Hash] tally `{ written:, skipped:, missing:, total: }`
  def write_many(codepoints = nil, block_lookup:)
    counts = { written: 0, skipped: 0, missing: 0, total: 0 }
    (codepoints || @catalog.codepoints).each do |cp|
      counts[:total] += 1
      counts[outcome_for(cp, block_lookup)] += 1
    end
    counts
  end

  private

  # Processes one codepoint and names the tally bucket it belongs to.
  #
  # @return [Symbol] :written, :skipped or :missing
  def outcome_for(codepoint, block_lookup)
    block_id = block_lookup.call(codepoint)
    return :missing if block_id.nil?

    rendered = @renderer.render(codepoint)
    return :missing unless rendered&.ok?

    write_glyph(block_id, codepoint, rendered.svg) ? :written : :skipped
  end

  # Resolves the glyph path for the codepoint and hands the SVG to
  # `write_atomic`, returning whatever that returns.
  def write_glyph(block_id, codepoint, svg)
    target = Repo::Paths.codepoint_glyph_path(
      @output_root, block_id, Repo::Paths.cp_id(codepoint)
    )
    write_atomic(target, svg)
  end
end
74
+ end
75
+ end
76
+ end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ module Glyphs
5
+ # Code Charts PDF font-stream extraction — pillar 1 of the v0.2 glyph
6
+ # strategy.
7
+ #
8
+ # The Unicode Code Charts PDFs (per-block or the `CodeCharts.pdf`
9
+ # monolith) embed one subsetted CID-keyed font per "script group"
10
+ # shown in the charts. Each font is a Type0 font whose descendant
11
+ # CIDFont uses `/CIDToGIDMap /Identity` — so the 2-byte character
12
+ # code used in the page's text-show operators IS the GID into the
13
+ # embedded font program. The codepoint mapping lives in the Type0
14
+ # font's `/ToUnicode` CMap stream.
15
+ #
16
+ # The pipeline is therefore:
17
+ #
18
+ # 1. {Catalog} walks the PDF's fonts (via `mutool info`) and builds
19
+ # a global `{codepoint => [font_entry, gid]}` index by parsing
20
+ # every Type0 font's ToUnicode CMap.
21
+ # 2. {Renderer} looks up a codepoint, lazily extracts the font's
22
+ # stream to a cache file, loads it via `fontisan`, and asks for
23
+ # the outline at the resolved GID.
24
+ # 3. {Svg} wraps the fontisan outline as a standalone SVG document
25
+ # (y-flipped, viewBox-padded) — same shape as the LastResort
26
+ # SVGs so downstream consumers don't care which pillar produced
27
+ # the glyph.
28
+ #
29
+ # The v0.1 cell extractor operated on rendered PDF pages and was
30
+ # defeated by the chart cell border being composited into the same
31
+ # glyph as the outline. Going straight to the embedded font program
32
+ # sidesteps that bug entirely: the font's outlines are clean
33
+ # vector geometry with no page chrome.
34
+ #
35
+ # System dependency: `mutool` (mupdf-tools) is on the PATH. Used for
36
+ # `mutool info` (font enumeration) and `mutool show -b -o` (raw
37
+ # stream extraction).
38
module EmbeddedFonts
  # Constant name → feature path. Each entry is registered with
  # `autoload`, so a file is required only when its constant is first
  # referenced.
  {
    Source: "ucode/glyphs/embedded_fonts/source",
    ToUnicode: "ucode/glyphs/embedded_fonts/tounicode",
    FontEntry: "ucode/glyphs/embedded_fonts/font_entry",
    Catalog: "ucode/glyphs/embedded_fonts/catalog",
    ContentStreamCorrelator: "ucode/glyphs/embedded_fonts/content_stream_correlator",
    Svg: "ucode/glyphs/embedded_fonts/svg",
    Renderer: "ucode/glyphs/embedded_fonts/renderer",
    Writer: "ucode/glyphs/embedded_fonts/writer",
  }.each do |const_name, feature|
    autoload const_name, feature
  end
end
49
+ end
50
+ end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ module Glyphs
5
# Geometry of a regular chart grid: an origin, per-column and per-row
# pitch, the grid dimensions, and the codepoint shown in the first
# cell. Cells are numbered row by row, `columns` cells per row.
Grid = Struct.new(
  :origin_x, :origin_y, :column_pitch, :row_pitch,
  :columns, :rows, :block_first_cp,
  keyword_init: true,
) do
  # Position of the cell holding `codepoint`.
  #
  # @param codepoint [Integer]
  # @return [Array(Numeric, Numeric), nil] `[x, y]`, or nil when the
  #   codepoint lies before `block_first_cp` or past the last row
  def cell_position(codepoint)
    index = codepoint - block_first_cp
    return nil if index.negative?

    row_index, col_index = index.divmod(columns)
    return nil if row_index >= rows

    x = origin_x + (col_index * column_pitch)
    y = origin_y + (row_index * row_pitch)
    [x, y]
  end

  # Codepoint shown in the cell at (`row`, `col`).
  #
  # @param row [Integer] zero-based row index
  # @param col [Integer] zero-based column index
  # @return [Integer, nil] nil when either index is outside the grid
  def codepoint_at(row, col)
    inside = !row.negative? && row < rows && !col.negative? && col < columns
    return nil unless inside

    block_first_cp + (row * columns) + col
  end
end
29
+ end
30
+ end