ucode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +211 -0
  3. data/Gemfile +22 -0
  4. data/Gemfile.lock +406 -0
  5. data/README.md +469 -0
  6. data/Rakefile +18 -0
  7. data/TODO.new/00-README.md +66 -0
  8. data/TODO.new/01-pillar-terminology-alignment.md +69 -0
  9. data/TODO.new/02-audit-schema-design.md +255 -0
  10. data/TODO.new/03-directory-output-spec.md +203 -0
  11. data/TODO.new/04-fontist-org-contract.md +173 -0
  12. data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
  13. data/TODO.new/06-audit-namespace-skeleton.md +105 -0
  14. data/TODO.new/07-audit-models-port.md +132 -0
  15. data/TODO.new/08-extractors-cheap-port.md +113 -0
  16. data/TODO.new/09-extractors-expensive-port.md +99 -0
  17. data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
  18. data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
  19. data/TODO.new/12-formatters-port.md +115 -0
  20. data/TODO.new/13-directory-emitter.md +147 -0
  21. data/TODO.new/14-html-face-browser.md +144 -0
  22. data/TODO.new/15-html-library-browser.md +102 -0
  23. data/TODO.new/16-cli-audit-subcommands.md +142 -0
  24. data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
  25. data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
  26. data/TODO.new/19-fontisan-docs-update.md +155 -0
  27. data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
  28. data/TODO.new/21-canonical-unicode17-build.md +148 -0
  29. data/TODO.new/22-implementation-order.md +176 -0
  30. data/UCODE_CHANGELOG.md +97 -0
  31. data/exe/ucode +8 -0
  32. data/lib/ucode/aggregator.rb +77 -0
  33. data/lib/ucode/audit/block_aggregator.rb +90 -0
  34. data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
  35. data/lib/ucode/audit/context.rb +137 -0
  36. data/lib/ucode/audit/discrepancy_detector.rb +213 -0
  37. data/lib/ucode/audit/extractors/aggregations.rb +70 -0
  38. data/lib/ucode/audit/extractors/base.rb +21 -0
  39. data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
  40. data/lib/ucode/audit/extractors/coverage.rb +55 -0
  41. data/lib/ucode/audit/extractors/hinting.rb +199 -0
  42. data/lib/ucode/audit/extractors/identity.rb +65 -0
  43. data/lib/ucode/audit/extractors/licensing.rb +75 -0
  44. data/lib/ucode/audit/extractors/metrics.rb +108 -0
  45. data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
  46. data/lib/ucode/audit/extractors/provenance.rb +34 -0
  47. data/lib/ucode/audit/extractors/style.rb +88 -0
  48. data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
  49. data/lib/ucode/audit/extractors.rb +31 -0
  50. data/lib/ucode/audit/plane_aggregator.rb +37 -0
  51. data/lib/ucode/audit/registry.rb +63 -0
  52. data/lib/ucode/audit/script_aggregator.rb +92 -0
  53. data/lib/ucode/audit.rb +27 -0
  54. data/lib/ucode/cache.rb +113 -0
  55. data/lib/ucode/cli.rb +272 -0
  56. data/lib/ucode/commands/build.rb +68 -0
  57. data/lib/ucode/commands/cache.rb +46 -0
  58. data/lib/ucode/commands/fetch.rb +62 -0
  59. data/lib/ucode/commands/font_coverage.rb +57 -0
  60. data/lib/ucode/commands/glyphs.rb +136 -0
  61. data/lib/ucode/commands/lookup.rb +65 -0
  62. data/lib/ucode/commands/parse.rb +62 -0
  63. data/lib/ucode/commands/site.rb +33 -0
  64. data/lib/ucode/commands.rb +19 -0
  65. data/lib/ucode/config.rb +110 -0
  66. data/lib/ucode/coordinator/indices.rb +34 -0
  67. data/lib/ucode/coordinator.rb +397 -0
  68. data/lib/ucode/database.rb +214 -0
  69. data/lib/ucode/db_builder.rb +107 -0
  70. data/lib/ucode/error.rb +96 -0
  71. data/lib/ucode/fetch/code_charts.rb +57 -0
  72. data/lib/ucode/fetch/http.rb +83 -0
  73. data/lib/ucode/fetch/ucd_zip.rb +57 -0
  74. data/lib/ucode/fetch/unihan_zip.rb +57 -0
  75. data/lib/ucode/fetch.rb +14 -0
  76. data/lib/ucode/glyphs/cell_extractor.rb +130 -0
  77. data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
  78. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
  79. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
  80. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
  81. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
  82. data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
  83. data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
  84. data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
  85. data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
  86. data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
  87. data/lib/ucode/glyphs/grid.rb +30 -0
  88. data/lib/ucode/glyphs/grid_detector.rb +165 -0
  89. data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
  90. data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
  91. data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
  92. data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
  93. data/lib/ucode/glyphs/last_resort/source.rb +125 -0
  94. data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
  95. data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
  96. data/lib/ucode/glyphs/last_resort.rb +36 -0
  97. data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
  98. data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
  99. data/lib/ucode/glyphs/page_renderer.rb +221 -0
  100. data/lib/ucode/glyphs/path_bbox.rb +62 -0
  101. data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
  102. data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
  103. data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
  104. data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
  105. data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
  106. data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
  107. data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
  108. data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
  109. data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
  110. data/lib/ucode/glyphs/real_fonts.rb +32 -0
  111. data/lib/ucode/glyphs/writer.rb +250 -0
  112. data/lib/ucode/glyphs.rb +27 -0
  113. data/lib/ucode/index.rb +106 -0
  114. data/lib/ucode/index_builder.rb +94 -0
  115. data/lib/ucode/models/audit/audit_axis.rb +30 -0
  116. data/lib/ucode/models/audit/audit_diff.rb +77 -0
  117. data/lib/ucode/models/audit/audit_report.rb +137 -0
  118. data/lib/ucode/models/audit/baseline.rb +32 -0
  119. data/lib/ucode/models/audit/block_summary.rb +72 -0
  120. data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
  121. data/lib/ucode/models/audit/codepoint_range.rb +39 -0
  122. data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
  123. data/lib/ucode/models/audit/color_capabilities.rb +91 -0
  124. data/lib/ucode/models/audit/discrepancy.rb +38 -0
  125. data/lib/ucode/models/audit/duplicate_group.rb +23 -0
  126. data/lib/ucode/models/audit/embedding_type.rb +81 -0
  127. data/lib/ucode/models/audit/field_change.rb +28 -0
  128. data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
  129. data/lib/ucode/models/audit/gasp_range.rb +63 -0
  130. data/lib/ucode/models/audit/hinting.rb +99 -0
  131. data/lib/ucode/models/audit/library_summary.rb +40 -0
  132. data/lib/ucode/models/audit/licensing.rb +48 -0
  133. data/lib/ucode/models/audit/metrics.rb +111 -0
  134. data/lib/ucode/models/audit/named_instance.rb +41 -0
  135. data/lib/ucode/models/audit/opentype_layout.rb +38 -0
  136. data/lib/ucode/models/audit/plane_summary.rb +31 -0
  137. data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
  138. data/lib/ucode/models/audit/script_features.rb +28 -0
  139. data/lib/ucode/models/audit/script_summary.rb +54 -0
  140. data/lib/ucode/models/audit/variation_detail.rb +42 -0
  141. data/lib/ucode/models/audit.rb +50 -0
  142. data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
  143. data/lib/ucode/models/bidi_mirroring.rb +19 -0
  144. data/lib/ucode/models/binary_property_assignment.rb +26 -0
  145. data/lib/ucode/models/block.rb +36 -0
  146. data/lib/ucode/models/case_folding_rule.rb +23 -0
  147. data/lib/ucode/models/cjk_radical.rb +23 -0
  148. data/lib/ucode/models/codepoint/bidi.rb +28 -0
  149. data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
  150. data/lib/ucode/models/codepoint/case_folding.rb +25 -0
  151. data/lib/ucode/models/codepoint/casing.rb +32 -0
  152. data/lib/ucode/models/codepoint/decomposition.rb +27 -0
  153. data/lib/ucode/models/codepoint/display.rb +24 -0
  154. data/lib/ucode/models/codepoint/emoji.rb +29 -0
  155. data/lib/ucode/models/codepoint/hangul.rb +20 -0
  156. data/lib/ucode/models/codepoint/identifier.rb +30 -0
  157. data/lib/ucode/models/codepoint/indic.rb +20 -0
  158. data/lib/ucode/models/codepoint/joining.rb +20 -0
  159. data/lib/ucode/models/codepoint/normalization.rb +35 -0
  160. data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
  161. data/lib/ucode/models/codepoint.rb +122 -0
  162. data/lib/ucode/models/name_alias.rb +21 -0
  163. data/lib/ucode/models/named_sequence.rb +19 -0
  164. data/lib/ucode/models/names_list_entry.rb +38 -0
  165. data/lib/ucode/models/plane.rb +36 -0
  166. data/lib/ucode/models/property_alias.rb +24 -0
  167. data/lib/ucode/models/property_value_alias.rb +26 -0
  168. data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
  169. data/lib/ucode/models/relationship/cross_reference.rb +17 -0
  170. data/lib/ucode/models/relationship/footnote.rb +24 -0
  171. data/lib/ucode/models/relationship/informal_alias.rb +18 -0
  172. data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
  173. data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
  174. data/lib/ucode/models/relationship.rb +57 -0
  175. data/lib/ucode/models/script.rb +41 -0
  176. data/lib/ucode/models/special_casing_rule.rb +28 -0
  177. data/lib/ucode/models/standardized_variant.rb +24 -0
  178. data/lib/ucode/models/unihan_entry.rb +23 -0
  179. data/lib/ucode/models.rb +47 -0
  180. data/lib/ucode/parsers/auxiliary.rb +26 -0
  181. data/lib/ucode/parsers/base.rb +137 -0
  182. data/lib/ucode/parsers/bidi_brackets.rb +41 -0
  183. data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
  184. data/lib/ucode/parsers/blocks.rb +63 -0
  185. data/lib/ucode/parsers/case_folding.rb +53 -0
  186. data/lib/ucode/parsers/cjk_radicals.rb +102 -0
  187. data/lib/ucode/parsers/derived_age.rb +59 -0
  188. data/lib/ucode/parsers/derived_core_properties.rb +60 -0
  189. data/lib/ucode/parsers/extracted_properties.rb +74 -0
  190. data/lib/ucode/parsers/name_aliases.rb +44 -0
  191. data/lib/ucode/parsers/named_sequences.rb +51 -0
  192. data/lib/ucode/parsers/names_list.rb +250 -0
  193. data/lib/ucode/parsers/property_aliases.rb +41 -0
  194. data/lib/ucode/parsers/property_value_aliases.rb +46 -0
  195. data/lib/ucode/parsers/script_extensions.rb +64 -0
  196. data/lib/ucode/parsers/scripts.rb +60 -0
  197. data/lib/ucode/parsers/special_casing.rb +62 -0
  198. data/lib/ucode/parsers/standardized_variants.rb +56 -0
  199. data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
  200. data/lib/ucode/parsers/unicode_data.rb +268 -0
  201. data/lib/ucode/parsers/unihan.rb +125 -0
  202. data/lib/ucode/parsers.rb +35 -0
  203. data/lib/ucode/range_entry.rb +58 -0
  204. data/lib/ucode/repo/aggregate_writer.rb +364 -0
  205. data/lib/ucode/repo/atomic_writes.rb +48 -0
  206. data/lib/ucode/repo/codepoint_writer.rb +96 -0
  207. data/lib/ucode/repo/paths.rb +122 -0
  208. data/lib/ucode/repo.rb +22 -0
  209. data/lib/ucode/site/config_emitter.rb +124 -0
  210. data/lib/ucode/site/generator.rb +178 -0
  211. data/lib/ucode/site/search_index.rb +68 -0
  212. data/lib/ucode/site/template/.gitignore +4 -0
  213. data/lib/ucode/site/template/.vitepress/config.ts +8 -0
  214. data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
  215. data/lib/ucode/site/template/char/[codepoint].md +13 -0
  216. data/lib/ucode/site/template/components/BlockView.vue +57 -0
  217. data/lib/ucode/site/template/components/CharView.vue +85 -0
  218. data/lib/ucode/site/template/components/PlaneView.vue +56 -0
  219. data/lib/ucode/site/template/components/SearchView.vue +66 -0
  220. data/lib/ucode/site/template/index.md +25 -0
  221. data/lib/ucode/site/template/package.json +18 -0
  222. data/lib/ucode/site/template/search.md +9 -0
  223. data/lib/ucode/site.rb +13 -0
  224. data/lib/ucode/version.rb +5 -0
  225. data/lib/ucode/version_resolver.rb +76 -0
  226. data/lib/ucode.rb +74 -0
  227. data/ucode.gemspec +56 -0
  228. metadata +404 -0
@@ -0,0 +1,107 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fileutils"
4
+ require "sqlite3"
5
+ require "time"
6
+
7
+ require "ucode/cache"
8
+ require "ucode/coordinator"
9
+ require "ucode/database"
10
+ require "ucode/error"
11
+ require "ucode/index_builder"
12
+
13
+ module Ucode
14
+ # Builds the SQLite cache for one UCD version.
15
+ #
16
+ # Single entry point: `DbBuilder.build(version)`. Streams the
17
+ # Coordinator output through an IndexBuilder, then persists the
18
+ # coalesced block + script ranges into a SQLite DB at
19
+ # `Cache.sqlite_path(version)`.
20
+ #
21
+ # **Streaming**: the Coordinator yields one CodePoint at a time; the
22
+ # IndexBuilder folds it into per-property accumulators. Peak memory
23
+ # is the in-progress accumulators (~10 MB for the full UCD) plus one
24
+ # CodePoint — never all 160k CodePoints at once.
25
+ module DbBuilder
26
+ SCHEMA_SQL = <<~SQL
27
+ PRAGMA journal_mode = DELETE;
28
+ PRAGMA synchronous = NORMAL;
29
+
30
+ CREATE TABLE schema_meta (
31
+ key TEXT PRIMARY KEY,
32
+ value TEXT NOT NULL
33
+ );
34
+
35
+ CREATE TABLE blocks (
36
+ first_cp INTEGER NOT NULL,
37
+ last_cp INTEGER NOT NULL,
38
+ name TEXT NOT NULL
39
+ );
40
+ CREATE INDEX idx_blocks_first_cp ON blocks(first_cp);
41
+ CREATE INDEX idx_blocks_name ON blocks(name);
42
+
43
+ CREATE TABLE scripts (
44
+ first_cp INTEGER NOT NULL,
45
+ last_cp INTEGER NOT NULL,
46
+ name TEXT NOT NULL
47
+ );
48
+ CREATE INDEX idx_scripts_first_cp ON scripts(first_cp);
49
+ CREATE INDEX idx_scripts_name ON scripts(name);
50
+ SQL
51
+ private_constant :SCHEMA_SQL
52
+
53
+ class << self
54
# Build the SQLite lookup cache for a single UCD version.
#
# Code points yielded by the Coordinator are folded into an IndexBuilder
# one at a time; the resulting block and script indices are then written
# to `Cache.sqlite_path(version)`.
#
# @param version [String] UCD version, checked with VersionResolver.validate!
# @return [Pathname] path to the built SQLite file
def build(version)
  Ucode::VersionResolver.validate!(version)

  sources = { ucd_dir: Cache.ucd_dir(version), unihan_dir: Cache.unihan_dir(version) }
  target = Cache.sqlite_path(version)

  Cache.ensure_version_dir!(version)

  index = IndexBuilder.new
  Coordinator.new.each_codepoint(**sources) { |codepoint| index.add(codepoint) }

  write_db(target, version, index.blocks_index, index.scripts_index)
  target
end
73
+
74
+ private
75
+
76
# Create the SQLite file for `db_path` and fill it from the two indices.
#
# The database is assembled in a sibling "<db_path>.tmp" file and moved
# over `db_path` only after every row has been committed. Building
# directly at the final path had two problems: a rebuild over an existing
# cache failed on `CREATE TABLE` (the schema has no IF NOT EXISTS), and
# an interrupted build left a half-populated file where readers look.
#
# @param db_path [Pathname, String] final location of the database
# @param version [String] UCD version recorded in `schema_meta`
# @param blocks_index [#entries] block ranges to persist
# @param scripts_index [#entries] script ranges to persist
# @return [void]
def write_db(db_path, version, blocks_index, scripts_index)
  final_path = db_path.to_s
  staging_path = "#{final_path}.tmp"
  staging_files = [staging_path, "#{staging_path}-journal"]
  FileUtils.rm_f(staging_files) # leftovers from an earlier aborted build

  SQLite3::Database.new(staging_path) do |db|
    # The schema batch starts with PRAGMA journal_mode, which SQLite does
    # not allow inside an open transaction, so it runs on its own first.
    db.execute_batch(SCHEMA_SQL)

    # Metadata and range rows are committed together: the file never
    # claims a schema/UCD version without also containing its rows.
    db.transaction do
      insert_meta(db, "schema_version", Database::SCHEMA_VERSION)
      insert_meta(db, "ucd_version", version)
      insert_meta(db, "built_at", Time.now.utc.iso8601)
      insert_rows(db, "blocks", blocks_index.entries)
      insert_rows(db, "scripts", scripts_index.entries)
    end
  end

  File.rename(staging_path, final_path)
ensure
  # No-op after a successful rename; removes debris after a failure.
  FileUtils.rm_f(staging_files) if staging_files
end
89
+
90
# Store one key/value pair in the `schema_meta` table. Both sides are
# converted with #to_s before being bound to the statement.
#
# @param db [#execute] open database handle
# @param key [#to_s]
# @param value [#to_s]
def insert_meta(db, key, value)
  bindings = [key, value].map(&:to_s)
  db.execute("INSERT INTO schema_meta (key, value) VALUES (?, ?)", bindings)
end
96
+
97
# Insert every entry into `table` through a single prepared statement.
# The statement is closed even when one of the inserts raises.
#
# @param db [#prepare] open database handle
# @param table [String] table name, interpolated into the SQL text
# @param entries [Enumerable] objects exposing first_cp, last_cp and name
def insert_rows(db, table, entries)
  statement = db.prepare("INSERT INTO #{table} (first_cp, last_cp, name) VALUES (?, ?, ?)")
  begin
    entries.each { |row| statement.execute(row.first_cp, row.last_cp, row.name) }
  ensure
    statement.close
  end
end
105
+ end
106
+ end
107
+ end
@@ -0,0 +1,96 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ # Base error class for all ucode failures.
5
+ #
6
+ # Every error raised anywhere in the codebase is_a?(Ucode::Error). Errors
7
+ # carry structured context (file:, line:, codepoint:, version:, etc.) so
8
+ # CLI formatters can render useful diagnostics without re-parsing strings.
9
+ #
10
+ # Hierarchy:
11
+ #
12
+ # Ucode::Error
13
+ # ├── Ucode::FetchError
14
+ # │ ├── Ucode::NetworkError
15
+ # │ └── Ucode::ChecksumError
16
+ # ├── Ucode::ParseError
17
+ # │ ├── Ucode::MalformedLineError
18
+ # │ └── Ucode::UnknownPropertyError
19
+ # ├── Ucode::LookupError
20
+ # │ ├── Ucode::DatabaseMissingError
21
+ # │ ├── Ucode::DatabaseSchemaError
22
+ # │ └── Ucode::UnknownVersionError
23
+ # └── Ucode::GlyphError
24
+ # ├── Ucode::PdfRenderError
25
+ # ├── Ucode::GridDetectionError
26
+ # ├── Ucode::LastResortMissingError
27
+ # └── Ucode::EmbeddedFontsMissingError
28
class Error < StandardError
  # @return [Hash{Symbol=>Object}] structured diagnostic context
  attr_reader :context

  # @param message [String, nil] human-readable description
  # @param context [Hash{Symbol=>Object}] structured diagnostic context
  def initialize(message = nil, context: {})
    @context = context
    super(build_message(message))
  end

  private

  # Compose the exception message: the given text, then the context
  # rendered as `key=value.inspect` pairs, joined by " | ". With neither
  # a message nor any context the class name is used.
  def build_message(message)
    has_context = !context.empty?
    return self.class.to_s unless has_context || !message.nil?

    segments = []
    segments.push(message) if message
    segments.push(context.map { |key, value| "#{key}=#{value.inspect}" }.join(" ")) if has_context
    segments.join(" | ")
  end
end
49
+
50
# Raised for failures while fetching upstream data.
class FetchError < Error
end

# Raised for network failures during a fetch.
class NetworkError < FetchError
end

# Raised for a checksum or integrity failure.
class ChecksumError < FetchError
end

# Raised for failures while parsing.
class ParseError < Error
end

# Raised when a UCD text file line does not match the expected column
# layout.
class MalformedLineError < ParseError
end

# Raised for a property short code that is not present in
# PropertyAliases/PropertyValueAliases.
class UnknownPropertyError < ParseError
end

# Raised for failures during lookup.
class LookupError < Error
end

# Raised when the cache for a requested version is missing.
class DatabaseMissingError < LookupError
end

# Raised when the on-disk schema version does not match.
class DatabaseSchemaError < LookupError
end

# Raised for a version string that is not in Config.known_versions.
class UnknownVersionError < LookupError
end

# Raised for failures in the glyph pipeline.
class GlyphError < Error
end

# Raised when PDF → SVG rendering fails.
class PdfRenderError < GlyphError
end

# Raised when grid detection cannot anchor on codepoint labels.
class GridDetectionError < GlyphError
end

# Raised when the Last Resort Font UFO source cannot be located or lacks
# a required artifact (cmap-f13.ttx, font.ufo/glyphs/, contents.plist).
class LastResortMissingError < GlyphError
end

# Raised when the Code Charts PDF (per-block or monolith) cannot be
# located, or when `mutool` is not installed on the PATH.
class EmbeddedFontsMissingError < GlyphError
end
96
+ end
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ module Fetch
5
+ # Downloads per-block Code Charts PDFs from unicode.org/charts/PDF/.
6
+ #
7
+ # URL pattern: `https://www.unicode.org/charts/PDF/U<XXXX>.pdf`
8
+ # where `XXXX` is the block's first codepoint zero-padded to 4 digits
9
+ # (5–6 digits for planes > 0).
10
+ module CodeCharts
11
+ class << self
12
# Download the Code Charts PDF of every requested block into
# `Cache.pdfs_dir(version)`. Each file is named `U<hex>.pdf`; a PDF
# that already exists on disk is skipped unless `force` is set.
#
# @param version [String] used as the on-disk path namespace; PDFs are
#   not versioned on unicode.org so the argument is mostly a convention.
# @param block_first_cps [Array<Integer>] first codepoint of each block
#   to download. Required: the list is iterated as given, so nil is not
#   accepted (see `first_cps_from` for deriving it from parsed blocks).
# @param force [Boolean] re-download even if cached.
# @return [Integer] number of PDFs downloaded.
def call(version, block_first_cps:, force: false)
  Cache.ensure_version_dir!(version)
  target_dir = Cache.pdfs_dir(version)
  target_dir.mkpath

  # `count` tallies the iterations whose block returns true, i.e. the
  # ones that actually hit the network.
  block_first_cps.count do |first_cp|
    pdf_name = "U#{hex_pad(first_cp)}.pdf"
    pdf_path = target_dir.join(pdf_name)
    next false if !force && pdf_path.exist?

    Http.get("#{Ucode.configuration.charts_base_url}/#{pdf_name}", dest: pdf_path)
    true
  end
end
37
+
38
# Reduce a list of parsed blocks to the first codepoint of each one, in
# the order given. The result is the shape `call` expects for
# `block_first_cps:`.
#
# @param blocks [Array<Ucode::Models::Block>] sorted by first_cp
# @return [Array<Integer>] first-cp values
def first_cps_from(blocks)
  blocks.map { |block| block.range_first }
end
47
+
48
+ private
49
+
50
# Upper-case hex form of `codepoint` as used in Code Charts file names:
# zero-padded to a minimum of four digits and never padded beyond that,
# so supplementary-plane blocks keep their natural 5–6 digits
# (0x10000 => "10000", 0x100000 => "100000").
#
# The previous implementation padded everything above 0xFFFF to six
# digits, producing names such as "U010000.pdf" that do not match the
# `U<XXXX>.pdf` pattern published on unicode.org.
#
# NOTE(review): the same string names the cached file on disk — confirm
# no other component expects the old six-digit names.
#
# @param codepoint [Integer]
# @return [String]
def hex_pad(codepoint)
  format("%04X", codepoint)
end
54
+ end
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "net/http"
4
+ require "uri"
5
+ require "fileutils"
6
+ require "pathname"
7
+
8
+ module Ucode
9
+ module Fetch
10
+ # Shared HTTP wrapper. Single network boundary for the whole project.
11
+ #
12
+ # Streaming download with retries and exponential backoff. Raises
13
+ # Ucode::NetworkError on final failure (after `http_retries` attempts).
14
+ module Http
15
+ DEFAULT_BACKOFF = [1, 2, 4, 8, 16].freeze
16
+ private_constant :DEFAULT_BACKOFF
17
+
18
+ class << self
19
# Stream `url` to `dest` (a Pathname or String path).
#
# A failed attempt is retried after a pause taken from DEFAULT_BACKOFF
# (its last entry is reused once the table is exhausted). After the
# final attempt there is nothing left to wait for, so no pause is taken
# and the log line says "giving up" instead of announcing a retry.
#
# @param url [String, URI] full URL.
# @param dest [Pathname, String] destination file path. Parent
#   directory is created if absent.
# @param retries [Integer, nil] override Config.http_retries.
# @param timeout [Integer, nil] override Config.http_timeout.
# @return [Pathname] destination path on success.
# @raise [Ucode::NetworkError] if all retries fail.
def get(url, dest:, retries: nil, timeout: nil)
  uri = url.is_a?(URI) ? url : URI(url)
  destination = Pathname.new(dest)
  destination.dirname.mkpath

  attempts = retries || Ucode.configuration.http_retries
  read_timeout = timeout || Ucode.configuration.http_timeout
  total_attempts = attempts + 1 # the initial try plus `attempts` retries

  last_error = nil
  total_attempts.times do |attempt|
    return stream_to(uri, destination, read_timeout)
  rescue StandardError => e
    last_error = e
    final_attempt = attempt + 1 >= total_attempts
    sleep_for = DEFAULT_BACKOFF[attempt] || DEFAULT_BACKOFF.last
    Ucode.configuration.logger&.warn do
      outcome = final_attempt ? "giving up" : "retrying in #{sleep_for}s"
      "Http GET #{uri} failed (attempt #{attempt + 1}/#{total_attempts}): " \
        "#{e.class}: #{e.message}; #{outcome}"
    end
    sleep(sleep_for) unless final_attempt
  end

  raise Ucode::NetworkError.new(
    "GET #{uri} failed after #{total_attempts} attempts",
    context: { url: uri.to_s, last_error: last_error&.message },
  )
end
55
+
56
+ private
57
+
58
# Perform one GET request and hand the response body to `write_body`.
# TLS is enabled when the URI scheme is "https". Any response that is
# not a Net::HTTPSuccess raises a RuntimeError carrying the status line.
#
# @param uri [URI]
# @param destination [Pathname]
# @param read_timeout [Integer] passed to Net::HTTP as `read_timeout`
# @return [Pathname] `destination`
def stream_to(uri, destination, read_timeout)
  connection_options = { use_ssl: uri.scheme == "https", read_timeout: read_timeout }

  Net::HTTP.start(uri.host, uri.port, connection_options) do |http|
    http.request(Net::HTTP::Get.new(uri)) do |response|
      raise "HTTP #{response.code} #{response.message}" unless response.is_a?(Net::HTTPSuccess)

      write_body(response, destination)
    end
  end

  destination
end
72
+
73
# Write the response body to `destination` through a sibling
# "<name>.part" file that is renamed into place once the body has been
# written in full. If the transfer or the rename fails, the partial
# file is removed so aborted downloads do not accumulate on disk.
#
# @param response [#read_body] yields the body in chunks
# @param destination [Pathname] final file path
# @return [void]
def write_body(response, destination)
  partial = destination.sub_ext("#{destination.extname}.part")
  completed = false

  File.open(partial, "wb") do |file|
    response.read_body { |chunk| file.write(chunk) }
  end
  File.rename(partial.to_s, destination.to_s)
  completed = true
ensure
  # `ensure` rather than `rescue` so that interrupts are covered too.
  FileUtils.rm_f(partial) if partial && !completed
end
80
+ end
81
+ end
82
+ end
83
+ end
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "zip"
4
+
5
+ module Ucode
6
+ module Fetch
7
# Downloads UCD.zip from unicode.org and unpacks it into
# `Cache.ucd_dir(version)`.
module UcdZip
  URL_SUFFIX = "/ucd/UCD.zip"
  private_constant :URL_SUFFIX

  class << self
    # Fetch and unpack the archive unless it is already cached (detected
    # by the presence of UnicodeData.txt). With `force`, the archive is
    # downloaded again and previously extracted files are overwritten.
    # The downloaded zip is removed afterwards, also when extraction
    # fails.
    #
    # @param version [String] e.g. "17.0.0"
    # @param force [Boolean] re-download even if cached.
    # @return [Pathname] the ucd_dir after extraction.
    # @raise [Ucode::FetchError] if an archive entry would be written
    #   outside the extraction directory.
    def call(version, force: false)
      Cache.ensure_version_dir!(version)
      target_dir = Cache.ucd_dir(version)

      marker = target_dir.join("UnicodeData.txt")
      return target_dir if marker.exist? && !force

      url = "#{Ucode.configuration.ucd_base_url}/#{version}#{URL_SUFFIX}"
      zip_path = Cache.version_dir(version).join("ucd.zip")
      begin
        Http.get(url, dest: zip_path)
        extract(zip_path, target_dir, overwrite: force)
      ensure
        zip_path.delete if zip_path.exist?
      end
      target_dir
    end

    private

    # Unpack every regular entry of the archive below `target_dir`,
    # skipping directories and macOS resource-fork entries.
    #
    # @param overwrite [Boolean] replace files that already exist; when
    #   false, existing files are left untouched.
    def extract(zip_path, target_dir, overwrite: false)
      target_dir.mkpath
      root = target_dir.expand_path
      Zip::File.open(zip_path.to_s) do |zip|
        zip.each do |entry|
          next if entry.directory?
          next if entry.name.start_with?("__MACOSX/") || entry.name.include?("/._")

          dest = destination_for(root, entry.name)
          dest.dirname.mkpath
          next if dest.exist? && !overwrite

          entry.get_input_stream do |input|
            File.open(dest, "wb") { |output| IO.copy_stream(input, output) }
          end
        end
      end
    end

    # Resolve an entry name to a path under `root`, refusing names that
    # climb out of it through ".." segments (zip-slip).
    def destination_for(root, entry_name)
      relative = entry_name.sub(%r{\A/+}, "")
      dest = root.join(relative).cleanpath
      return dest unless dest.relative_path_from(root).each_filename.first == ".."

      raise Ucode::FetchError.new(
        "zip entry escapes the extraction directory",
        context: { entry: entry_name, target_dir: root.to_s },
      )
    end
  end
end
56
+ end
57
+ end
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "zip"
4
+
5
+ module Ucode
6
+ module Fetch
7
# Downloads Unihan.zip from unicode.org and unpacks it into
# `Cache.unihan_dir(version)`.
module UnihanZip
  URL_SUFFIX = "/ucd/Unihan.zip"
  private_constant :URL_SUFFIX

  class << self
    # Fetch and unpack the archive unless it is already cached (detected
    # by the presence of Unihan_Readings.txt). With `force`, the archive
    # is downloaded again and previously extracted files are
    # overwritten. The downloaded zip is removed afterwards, also when
    # extraction fails.
    #
    # @param version [String] e.g. "17.0.0"
    # @param force [Boolean] re-download even if cached.
    # @return [Pathname] the unihan_dir after extraction.
    # @raise [Ucode::FetchError] if an archive entry would be written
    #   outside the extraction directory.
    def call(version, force: false)
      Cache.ensure_version_dir!(version)
      target_dir = Cache.unihan_dir(version)

      marker = target_dir.join("Unihan_Readings.txt")
      return target_dir if marker.exist? && !force

      url = "#{Ucode.configuration.unihan_base_url}/#{version}#{URL_SUFFIX}"
      zip_path = Cache.version_dir(version).join("unihan.zip")
      begin
        Http.get(url, dest: zip_path)
        extract(zip_path, target_dir, overwrite: force)
      ensure
        zip_path.delete if zip_path.exist?
      end
      target_dir
    end

    private

    # Unpack every regular entry of the archive below `target_dir`,
    # skipping directories and macOS resource-fork entries.
    #
    # @param overwrite [Boolean] replace files that already exist; when
    #   false, existing files are left untouched.
    def extract(zip_path, target_dir, overwrite: false)
      target_dir.mkpath
      root = target_dir.expand_path
      Zip::File.open(zip_path.to_s) do |zip|
        zip.each do |entry|
          next if entry.directory?
          next if entry.name.start_with?("__MACOSX/") || entry.name.include?("/._")

          dest = destination_for(root, entry.name)
          dest.dirname.mkpath
          next if dest.exist? && !overwrite

          entry.get_input_stream do |input|
            File.open(dest, "wb") { |output| IO.copy_stream(input, output) }
          end
        end
      end
    end

    # Resolve an entry name to a path under `root`, refusing names that
    # climb out of it through ".." segments (zip-slip).
    def destination_for(root, entry_name)
      relative = entry_name.sub(%r{\A/+}, "")
      dest = root.join(relative).cleanpath
      return dest unless dest.relative_path_from(root).each_filename.first == ".."

      raise Ucode::FetchError.new(
        "zip entry escapes the extraction directory",
        context: { entry: entry_name, target_dir: root.to_s },
      )
    end
  end
end
56
+ end
57
+ end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ # Fetchers — download UCD.zip, Unihan.zip, and per-block Code Charts PDFs.
5
+ #
6
+ # OCP: Http is the single network boundary. New source types add a new
7
+ # Fetcher class that calls Http.get; no new HTTP stack.
8
module Fetch
  # Each constant is registered for lazy loading: its file is required
  # the first time the constant is referenced.
  autoload(:CodeCharts, "ucode/fetch/code_charts")
  autoload(:Http, "ucode/fetch/http")
  autoload(:UcdZip, "ucode/fetch/ucd_zip")
  autoload(:UnihanZip, "ucode/fetch/unihan_zip")
end
14
+ end
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+
5
+ require "ucode/glyphs/path_bbox"
6
+
7
+ module Ucode
8
+ module Glyphs
9
+ # Extracts a single character cell from a Code Charts SVG page and
10
+ # returns a normalized standalone SVG containing only that cell's
11
+ # vector paths.
12
+ #
13
+ # The cell is identified by codepoint. The extractor asks the Grid
14
+ # for the cell's anchor position, finds the `<use>` element placed
15
+ # at that position, resolves its glyph definition from `<defs>`,
16
+ # and emits a fresh `<svg>` whose viewBox is `0 0 1000 1000` and
17
+ # whose body is the glyph's `<path>` data translated and scaled to
18
+ # fit that viewBox with a small margin.
19
+ #
20
+ # Vector-only. Never rasterizes, never OCRs. If the cell is empty
21
+ # (no character glyph placed there, e.g. unassigned codepoint or
22
+ # control character), the extractor returns nil.
23
class CellExtractor
  ViewBoxSize = 1000.0
  MarginRatio = 0.1
  private_constant :ViewBoxSize, :MarginRatio

  # @param doc [Nokogiri::XML::Document] the rendered Code Charts page
  def initialize(doc)
    @doc = doc
    @glyph_cache = {}
  end

  # @param grid [Ucode::Glyphs::Grid]
  # @param codepoint [Integer]
  # @return [Nokogiri::XML::Document, nil] a standalone `<svg>` doc
  #   with viewBox `0 0 1000 1000`, or nil if the cell is empty.
  def extract(grid, codepoint)
    anchor = grid.cell_position(codepoint)
    return nil unless anchor

    use_node = find_use_at(anchor, grid)
    return nil unless use_node

    path_data = collect_paths(use_node["xlink:href"] || use_node["href"])
    return nil if path_data.empty?

    bbox = PathBbox.estimate(path_data.join(" "))
    return nil if bbox.empty?

    build_svg(path_data, bbox)
  end

  private

  # Find the `<use href="#glyph-…">` element nearest to `anchor`,
  # considering only elements within half a column pitch horizontally
  # and half a row pitch vertically.
  #
  # @return [Nokogiri::XML::Node, nil]
  def find_use_at(anchor, grid)
    tolerance_x = grid.column_pitch / 2
    tolerance_y = grid.row_pitch / 2

    candidates = @doc.css("use").select do |node|
      href = node["xlink:href"] || node["href"] || ""
      href.start_with?("#glyph-") &&
        (node["x"].to_f - anchor[0]).abs <= tolerance_x &&
        (node["y"].to_f - anchor[1]).abs <= tolerance_y
    end

    candidates.min_by do |node|
      dx = node["x"].to_f - anchor[0]
      dy = node["y"].to_f - anchor[1]
      (dx * dx) + (dy * dy)
    end
  end

  # Collect the `d` attribute of every `<path>` inside the glyph
  # definition that `href` points at.
  #
  # @return [Array<String>] empty when the reference cannot be resolved
  def collect_paths(href)
    return [] unless href

    glyph_id = href.sub(/\A#/, "")
    node = glyph_definition(glyph_id)
    return [] unless node

    node.css("path").map { |p| p["d"] }.compact
  end

  # Look up a glyph definition under `<defs>`, memoising hits and
  # misses per glyph id.
  def glyph_definition(glyph_id)
    return @glyph_cache[glyph_id] if @glyph_cache.key?(glyph_id)

    @glyph_cache[glyph_id] = @doc.at_css("defs ##{glyph_id}")
  end

  # Compute the uniform scale and the translation that centre the glyph's
  # bounding box in the view box, leaving MarginRatio free on each side
  # of the limiting dimension.
  #
  # The values are meant for `translate(tx, ty) scale(s)`, which maps a
  # glyph-local point p to (p * s) + t. The bounding box is taken in the
  # glyph's own coordinate space because the emitted `<path>` data is
  # copied unchanged from the glyph definition; the position of the
  # `<use>` element on the page plays no part in the standalone SVG.
  #
  # @param glyph_bbox [#min_x, #min_y, #max_x, #max_y]
  # @return [Array(Float, Float, Float), nil] scale, translate_x,
  #   translate_y; nil for a box without positive width and height
  def fit_transform(glyph_bbox)
    width = glyph_bbox.max_x - glyph_bbox.min_x
    height = glyph_bbox.max_y - glyph_bbox.min_y
    return nil if width <= 0 || height <= 0

    content_size = ViewBoxSize * (1.0 - (2.0 * MarginRatio))
    scale = [content_size / width, content_size / height].min
    offset_x = (ViewBoxSize - (width * scale)) / 2.0
    offset_y = (ViewBoxSize - (height * scale)) / 2.0

    [scale, offset_x - (glyph_bbox.min_x * scale), offset_y - (glyph_bbox.min_y * scale)]
  end

  # Assemble the standalone SVG document for one glyph.
  #
  # @return [Nokogiri::XML::Document, nil] nil for a degenerate bbox
  def build_svg(path_data, glyph_bbox)
    scale, translate_x, translate_y = fit_transform(glyph_bbox)
    return nil unless scale

    builder = Nokogiri::XML::Document.new
    root = builder.create_element(
      "svg",
      xmlns: "http://www.w3.org/2000/svg",
      viewBox: "0 0 #{ViewBoxSize.to_i} #{ViewBoxSize.to_i}",
      width: ViewBoxSize.to_i,
      height: ViewBoxSize.to_i,
    )
    # Order matters: in an SVG transform list the rightmost entry is
    # applied to the coordinates first, so the glyph is scaled and then
    # moved. With the reverse order the translation would itself be
    # multiplied by the scale factor.
    group = builder.create_element(
      "g",
      transform: "translate(#{format('%.6f', translate_x)}, #{format('%.6f', translate_y)}) " \
                 "scale(#{format('%.6f', scale)})",
    )
    path_data.each do |d|
      group.add_child(builder.create_element("path", d: d, fill: "black"))
    end
    root.add_child(group)
    builder.add_child(root)
    builder
  end
end
129
+ end
130
+ end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ucode/glyphs/page_renderer"
4
+
5
+ module Ucode
6
+ module Glyphs
7
+ # `dvisvgm` — originally a DVI-to-SVG converter, also handles PDF.
8
+ # The `--no-fonts` flag forces outline-only output (no font subsetting
9
+ # artifacts), which is what we want for vector glyph extraction.
10
+ #
11
+ # Command: `dvisvgm --pdf --no-fonts --page=<n> <in.pdf> -o <out.svg>`
12
class DvisvgmRenderer < PageRenderer
  # @return [Symbol] name of this renderer
  def self.renderer_name
    :dvisvgm
  end

  # @return [Symbol] name of the executable this renderer relies on
  def self.binary_name
    :dvisvgm
  end

  # Assemble the argv for rendering one PDF page to SVG. `--no-fonts`
  # requests outline-only output.
  #
  # @param pdf_path [#to_s] input PDF
  # @param page_num [Integer] page to render
  # @param out_path [#to_s] SVG file to write
  # @return [Array<String>]
  def self.build_command(pdf_path, page_num, out_path)
    page_flag = "--page=#{page_num}"
    input, output = [pdf_path, out_path].map(&:to_s)
    ["dvisvgm", "--pdf", "--no-fonts", page_flag, input, "-o", output]
  end
end
28
+ end
29
+ end