ucode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. checksums.yaml +7 -0
  2. data/CLAUDE.md +211 -0
  3. data/Gemfile +22 -0
  4. data/Gemfile.lock +406 -0
  5. data/README.md +469 -0
  6. data/Rakefile +18 -0
  7. data/TODO.new/00-README.md +66 -0
  8. data/TODO.new/01-pillar-terminology-alignment.md +69 -0
  9. data/TODO.new/02-audit-schema-design.md +255 -0
  10. data/TODO.new/03-directory-output-spec.md +203 -0
  11. data/TODO.new/04-fontist-org-contract.md +173 -0
  12. data/TODO.new/05-baseline-unicode17-coverage-audit.md +144 -0
  13. data/TODO.new/06-audit-namespace-skeleton.md +105 -0
  14. data/TODO.new/07-audit-models-port.md +132 -0
  15. data/TODO.new/08-extractors-cheap-port.md +113 -0
  16. data/TODO.new/09-extractors-expensive-port.md +99 -0
  17. data/TODO.new/10-aggregations-ucd-rewrite.md +168 -0
  18. data/TODO.new/11-differ-and-library-auditor-port.md +102 -0
  19. data/TODO.new/12-formatters-port.md +115 -0
  20. data/TODO.new/13-directory-emitter.md +147 -0
  21. data/TODO.new/14-html-face-browser.md +144 -0
  22. data/TODO.new/15-html-library-browser.md +102 -0
  23. data/TODO.new/16-cli-audit-subcommands.md +142 -0
  24. data/TODO.new/17-fontisan-cleanup-audit.md +147 -0
  25. data/TODO.new/18-fontisan-cleanup-ucd.md +156 -0
  26. data/TODO.new/19-fontisan-docs-update.md +155 -0
  27. data/TODO.new/20-canonical-resolver-4-tier.md +182 -0
  28. data/TODO.new/21-canonical-unicode17-build.md +148 -0
  29. data/TODO.new/22-implementation-order.md +176 -0
  30. data/UCODE_CHANGELOG.md +97 -0
  31. data/exe/ucode +8 -0
  32. data/lib/ucode/aggregator.rb +77 -0
  33. data/lib/ucode/audit/block_aggregator.rb +90 -0
  34. data/lib/ucode/audit/codepoint_range_coalescer.rb +42 -0
  35. data/lib/ucode/audit/context.rb +137 -0
  36. data/lib/ucode/audit/discrepancy_detector.rb +213 -0
  37. data/lib/ucode/audit/extractors/aggregations.rb +70 -0
  38. data/lib/ucode/audit/extractors/base.rb +21 -0
  39. data/lib/ucode/audit/extractors/color_capabilities.rb +143 -0
  40. data/lib/ucode/audit/extractors/coverage.rb +55 -0
  41. data/lib/ucode/audit/extractors/hinting.rb +199 -0
  42. data/lib/ucode/audit/extractors/identity.rb +65 -0
  43. data/lib/ucode/audit/extractors/licensing.rb +75 -0
  44. data/lib/ucode/audit/extractors/metrics.rb +108 -0
  45. data/lib/ucode/audit/extractors/opentype_layout.rb +71 -0
  46. data/lib/ucode/audit/extractors/provenance.rb +34 -0
  47. data/lib/ucode/audit/extractors/style.rb +88 -0
  48. data/lib/ucode/audit/extractors/variation_detail.rb +101 -0
  49. data/lib/ucode/audit/extractors.rb +31 -0
  50. data/lib/ucode/audit/plane_aggregator.rb +37 -0
  51. data/lib/ucode/audit/registry.rb +63 -0
  52. data/lib/ucode/audit/script_aggregator.rb +92 -0
  53. data/lib/ucode/audit.rb +27 -0
  54. data/lib/ucode/cache.rb +113 -0
  55. data/lib/ucode/cli.rb +272 -0
  56. data/lib/ucode/commands/build.rb +68 -0
  57. data/lib/ucode/commands/cache.rb +46 -0
  58. data/lib/ucode/commands/fetch.rb +62 -0
  59. data/lib/ucode/commands/font_coverage.rb +57 -0
  60. data/lib/ucode/commands/glyphs.rb +136 -0
  61. data/lib/ucode/commands/lookup.rb +65 -0
  62. data/lib/ucode/commands/parse.rb +62 -0
  63. data/lib/ucode/commands/site.rb +33 -0
  64. data/lib/ucode/commands.rb +19 -0
  65. data/lib/ucode/config.rb +110 -0
  66. data/lib/ucode/coordinator/indices.rb +34 -0
  67. data/lib/ucode/coordinator.rb +397 -0
  68. data/lib/ucode/database.rb +214 -0
  69. data/lib/ucode/db_builder.rb +107 -0
  70. data/lib/ucode/error.rb +96 -0
  71. data/lib/ucode/fetch/code_charts.rb +57 -0
  72. data/lib/ucode/fetch/http.rb +83 -0
  73. data/lib/ucode/fetch/ucd_zip.rb +57 -0
  74. data/lib/ucode/fetch/unihan_zip.rb +57 -0
  75. data/lib/ucode/fetch.rb +14 -0
  76. data/lib/ucode/glyphs/cell_extractor.rb +130 -0
  77. data/lib/ucode/glyphs/dvisvgm_renderer.rb +29 -0
  78. data/lib/ucode/glyphs/embedded_fonts/catalog.rb +372 -0
  79. data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +228 -0
  80. data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +126 -0
  81. data/lib/ucode/glyphs/embedded_fonts/renderer.rb +47 -0
  82. data/lib/ucode/glyphs/embedded_fonts/source.rb +94 -0
  83. data/lib/ucode/glyphs/embedded_fonts/svg.rb +123 -0
  84. data/lib/ucode/glyphs/embedded_fonts/tounicode.rb +103 -0
  85. data/lib/ucode/glyphs/embedded_fonts/writer.rb +76 -0
  86. data/lib/ucode/glyphs/embedded_fonts.rb +50 -0
  87. data/lib/ucode/glyphs/grid.rb +30 -0
  88. data/lib/ucode/glyphs/grid_detector.rb +165 -0
  89. data/lib/ucode/glyphs/last_resort/cmap_index.rb +96 -0
  90. data/lib/ucode/glyphs/last_resort/contents.rb +74 -0
  91. data/lib/ucode/glyphs/last_resort/glif.rb +124 -0
  92. data/lib/ucode/glyphs/last_resort/renderer.rb +67 -0
  93. data/lib/ucode/glyphs/last_resort/source.rb +125 -0
  94. data/lib/ucode/glyphs/last_resort/svg.rb +247 -0
  95. data/lib/ucode/glyphs/last_resort/writer.rb +83 -0
  96. data/lib/ucode/glyphs/last_resort.rb +36 -0
  97. data/lib/ucode/glyphs/monolith_page_map.rb +181 -0
  98. data/lib/ucode/glyphs/mutool_renderer.rb +28 -0
  99. data/lib/ucode/glyphs/page_renderer.rb +221 -0
  100. data/lib/ucode/glyphs/path_bbox.rb +62 -0
  101. data/lib/ucode/glyphs/pdf2svg_renderer.rb +26 -0
  102. data/lib/ucode/glyphs/pdf_fetcher.rb +102 -0
  103. data/lib/ucode/glyphs/pdftocairo_renderer.rb +32 -0
  104. data/lib/ucode/glyphs/real_fonts/block_coverage.rb +45 -0
  105. data/lib/ucode/glyphs/real_fonts/coverage_auditor.rb +117 -0
  106. data/lib/ucode/glyphs/real_fonts/font_coverage_report.rb +45 -0
  107. data/lib/ucode/glyphs/real_fonts/font_locator.rb +95 -0
  108. data/lib/ucode/glyphs/real_fonts/unicode_17_blocks.rb +104 -0
  109. data/lib/ucode/glyphs/real_fonts/writer.rb +50 -0
  110. data/lib/ucode/glyphs/real_fonts.rb +32 -0
  111. data/lib/ucode/glyphs/writer.rb +250 -0
  112. data/lib/ucode/glyphs.rb +27 -0
  113. data/lib/ucode/index.rb +106 -0
  114. data/lib/ucode/index_builder.rb +94 -0
  115. data/lib/ucode/models/audit/audit_axis.rb +30 -0
  116. data/lib/ucode/models/audit/audit_diff.rb +77 -0
  117. data/lib/ucode/models/audit/audit_report.rb +137 -0
  118. data/lib/ucode/models/audit/baseline.rb +32 -0
  119. data/lib/ucode/models/audit/block_summary.rb +72 -0
  120. data/lib/ucode/models/audit/codepoint_detail.rb +45 -0
  121. data/lib/ucode/models/audit/codepoint_range.rb +39 -0
  122. data/lib/ucode/models/audit/codepoint_set_diff.rb +34 -0
  123. data/lib/ucode/models/audit/color_capabilities.rb +91 -0
  124. data/lib/ucode/models/audit/discrepancy.rb +38 -0
  125. data/lib/ucode/models/audit/duplicate_group.rb +23 -0
  126. data/lib/ucode/models/audit/embedding_type.rb +81 -0
  127. data/lib/ucode/models/audit/field_change.rb +28 -0
  128. data/lib/ucode/models/audit/fs_selection_flags.rb +65 -0
  129. data/lib/ucode/models/audit/gasp_range.rb +63 -0
  130. data/lib/ucode/models/audit/hinting.rb +99 -0
  131. data/lib/ucode/models/audit/library_summary.rb +40 -0
  132. data/lib/ucode/models/audit/licensing.rb +48 -0
  133. data/lib/ucode/models/audit/metrics.rb +111 -0
  134. data/lib/ucode/models/audit/named_instance.rb +41 -0
  135. data/lib/ucode/models/audit/opentype_layout.rb +38 -0
  136. data/lib/ucode/models/audit/plane_summary.rb +31 -0
  137. data/lib/ucode/models/audit/script_coverage_row.rb +26 -0
  138. data/lib/ucode/models/audit/script_features.rb +28 -0
  139. data/lib/ucode/models/audit/script_summary.rb +54 -0
  140. data/lib/ucode/models/audit/variation_detail.rb +42 -0
  141. data/lib/ucode/models/audit.rb +50 -0
  142. data/lib/ucode/models/bidi_bracket_pair.rb +20 -0
  143. data/lib/ucode/models/bidi_mirroring.rb +19 -0
  144. data/lib/ucode/models/binary_property_assignment.rb +26 -0
  145. data/lib/ucode/models/block.rb +36 -0
  146. data/lib/ucode/models/case_folding_rule.rb +23 -0
  147. data/lib/ucode/models/cjk_radical.rb +23 -0
  148. data/lib/ucode/models/codepoint/bidi.rb +28 -0
  149. data/lib/ucode/models/codepoint/break_segmentation.rb +22 -0
  150. data/lib/ucode/models/codepoint/case_folding.rb +25 -0
  151. data/lib/ucode/models/codepoint/casing.rb +32 -0
  152. data/lib/ucode/models/codepoint/decomposition.rb +27 -0
  153. data/lib/ucode/models/codepoint/display.rb +24 -0
  154. data/lib/ucode/models/codepoint/emoji.rb +29 -0
  155. data/lib/ucode/models/codepoint/hangul.rb +20 -0
  156. data/lib/ucode/models/codepoint/identifier.rb +30 -0
  157. data/lib/ucode/models/codepoint/indic.rb +20 -0
  158. data/lib/ucode/models/codepoint/joining.rb +20 -0
  159. data/lib/ucode/models/codepoint/normalization.rb +35 -0
  160. data/lib/ucode/models/codepoint/numeric_value.rb +35 -0
  161. data/lib/ucode/models/codepoint.rb +122 -0
  162. data/lib/ucode/models/name_alias.rb +21 -0
  163. data/lib/ucode/models/named_sequence.rb +19 -0
  164. data/lib/ucode/models/names_list_entry.rb +38 -0
  165. data/lib/ucode/models/plane.rb +36 -0
  166. data/lib/ucode/models/property_alias.rb +24 -0
  167. data/lib/ucode/models/property_value_alias.rb +26 -0
  168. data/lib/ucode/models/relationship/compat_equiv.rb +18 -0
  169. data/lib/ucode/models/relationship/cross_reference.rb +17 -0
  170. data/lib/ucode/models/relationship/footnote.rb +24 -0
  171. data/lib/ucode/models/relationship/informal_alias.rb +18 -0
  172. data/lib/ucode/models/relationship/sample_sequence.rb +24 -0
  173. data/lib/ucode/models/relationship/variation_sequence.rb +19 -0
  174. data/lib/ucode/models/relationship.rb +57 -0
  175. data/lib/ucode/models/script.rb +41 -0
  176. data/lib/ucode/models/special_casing_rule.rb +28 -0
  177. data/lib/ucode/models/standardized_variant.rb +24 -0
  178. data/lib/ucode/models/unihan_entry.rb +23 -0
  179. data/lib/ucode/models.rb +47 -0
  180. data/lib/ucode/parsers/auxiliary.rb +26 -0
  181. data/lib/ucode/parsers/base.rb +137 -0
  182. data/lib/ucode/parsers/bidi_brackets.rb +41 -0
  183. data/lib/ucode/parsers/bidi_mirroring.rb +37 -0
  184. data/lib/ucode/parsers/blocks.rb +63 -0
  185. data/lib/ucode/parsers/case_folding.rb +53 -0
  186. data/lib/ucode/parsers/cjk_radicals.rb +102 -0
  187. data/lib/ucode/parsers/derived_age.rb +59 -0
  188. data/lib/ucode/parsers/derived_core_properties.rb +60 -0
  189. data/lib/ucode/parsers/extracted_properties.rb +74 -0
  190. data/lib/ucode/parsers/name_aliases.rb +44 -0
  191. data/lib/ucode/parsers/named_sequences.rb +51 -0
  192. data/lib/ucode/parsers/names_list.rb +250 -0
  193. data/lib/ucode/parsers/property_aliases.rb +41 -0
  194. data/lib/ucode/parsers/property_value_aliases.rb +46 -0
  195. data/lib/ucode/parsers/script_extensions.rb +64 -0
  196. data/lib/ucode/parsers/scripts.rb +60 -0
  197. data/lib/ucode/parsers/special_casing.rb +62 -0
  198. data/lib/ucode/parsers/standardized_variants.rb +56 -0
  199. data/lib/ucode/parsers/unicode_data/hangul_name.rb +73 -0
  200. data/lib/ucode/parsers/unicode_data.rb +268 -0
  201. data/lib/ucode/parsers/unihan.rb +125 -0
  202. data/lib/ucode/parsers.rb +35 -0
  203. data/lib/ucode/range_entry.rb +58 -0
  204. data/lib/ucode/repo/aggregate_writer.rb +364 -0
  205. data/lib/ucode/repo/atomic_writes.rb +48 -0
  206. data/lib/ucode/repo/codepoint_writer.rb +96 -0
  207. data/lib/ucode/repo/paths.rb +122 -0
  208. data/lib/ucode/repo.rb +22 -0
  209. data/lib/ucode/site/config_emitter.rb +124 -0
  210. data/lib/ucode/site/generator.rb +178 -0
  211. data/lib/ucode/site/search_index.rb +68 -0
  212. data/lib/ucode/site/template/.gitignore +4 -0
  213. data/lib/ucode/site/template/.vitepress/config.ts +8 -0
  214. data/lib/ucode/site/template/.vitepress/theme/index.js +20 -0
  215. data/lib/ucode/site/template/char/[codepoint].md +13 -0
  216. data/lib/ucode/site/template/components/BlockView.vue +57 -0
  217. data/lib/ucode/site/template/components/CharView.vue +85 -0
  218. data/lib/ucode/site/template/components/PlaneView.vue +56 -0
  219. data/lib/ucode/site/template/components/SearchView.vue +66 -0
  220. data/lib/ucode/site/template/index.md +25 -0
  221. data/lib/ucode/site/template/package.json +18 -0
  222. data/lib/ucode/site/template/search.md +9 -0
  223. data/lib/ucode/site.rb +13 -0
  224. data/lib/ucode/version.rb +5 -0
  225. data/lib/ucode/version_resolver.rb +76 -0
  226. data/lib/ucode.rb +74 -0
  227. data/ucode.gemspec +56 -0
  228. metadata +404 -0
@@ -0,0 +1,90 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ module Audit
5
+ # Produces one {Models::Audit::BlockSummary} per touched Unicode block
6
+ # for a font's cmap codepoint set, compared against a ucode UCD
7
+ # baseline.
8
+ #
9
+ # Pure transformation: takes the resolved baseline Database + the
10
+ # font's codepoint list, returns BlockSummary[]. No I/O beyond the
11
+ # database lookups, no mutation of inputs.
12
+ #
13
+ # The "assigned" set for a block is derived from the Database's
14
+ # ranges-with-that-name. The Database stores coalesced runs of
15
+ # consecutive assigned codepoints grouped by block name, so the
16
+ # union of those ranges IS the assigned set for that block.
17
class BlockAggregator
  # @param database [Ucode::Database, nil] resolved baseline. When
  #   nil, #call returns an empty array — caller should treat that
  #   as "no UCD baseline available" and surface a warning.
  def initialize(database)
    @database = database
  end

  # Builds one summary per block touched by the given codepoints.
  #
  # Accepts any Enumerable (Array, Set, Range, Enumerator, ...): the
  # input is only iterated, never asked for `empty?`, which is not
  # part of the Enumerable contract. An empty input naturally yields
  # an empty result.
  #
  # @param codepoints [Enumerable<Integer>]
  # @return [Array<Models::Audit::BlockSummary>] sorted by first_cp
  def call(codepoints)
    return [] if @database.nil?

    group_by_block(codepoints)
      .map { |name, covered| build_summary(name, covered) }
      .sort_by(&:first_cp)
  end

  private

  # Buckets codepoints by the block name the database reports for
  # them. Codepoints the database cannot place (lookup_block returns
  # nil) are dropped.
  #
  # @return [Hash{String=>Array<Integer>}]
  def group_by_block(codepoints)
    codepoints.each_with_object(Hash.new { |h, k| h[k] = [] }) do |cp, acc|
      name = @database.lookup_block(cp)
      acc[name] << cp if name
    end
  end

  # Compares the covered codepoints of one block against the block's
  # assigned set and packages the result.
  def build_summary(name, covered_cps)
    ranges = @database.block_ranges_by_name(name)
    # ranges is non-empty here: the name came from lookup_block,
    # which only returns names present in the blocks table.
    first_cp = ranges.map(&:first_cp).min
    last_cp = ranges.map(&:last_cp).max
    assigned_set = expand_assigned(ranges)
    # Intersect so codepoints inside the block's span but outside
    # every assigned run never count as covered.
    covered_set = covered_cps.to_set & assigned_set
    missing_set = assigned_set - covered_set

    Models::Audit::BlockSummary.new(
      name: name,
      first_cp: first_cp,
      last_cp: last_cp,
      range: format_range(first_cp, last_cp),
      plane: first_cp >> 16,
      total_assigned: assigned_set.size,
      covered_count: covered_set.size,
      missing_count: missing_set.size,
      coverage_percent: percent(covered_set.size, assigned_set.size),
      status: Models::Audit::BlockSummary.derive_status(
        covered_count: covered_set.size,
        total_assigned: assigned_set.size,
      ),
      missing_codepoints: missing_set.sort,
      covered_codepoints: covered_set.sort,
    )
  end

  # Union of every codepoint in the given ranges (inclusive bounds).
  #
  # @return [Set<Integer>]
  def expand_assigned(ranges)
    ranges.each_with_object(Set.new) do |r, acc|
      acc.merge(r.first_cp..r.last_cp)
    end
  end

  # Coverage as a percentage rounded to two decimals; 0.0 when the
  # block has no assigned codepoints (avoids dividing by zero).
  def percent(covered, total)
    return 0.0 if total.zero?

    (covered.to_f / total * 100).round(2)
  end

  # Renders "U+XXXX–U+YYYY" with at least four hex digits per side.
  def format_range(first, last)
    format("U+%<first>04X–U+%<last>04X", first: first, last: last)
  end
end
89
+ end
90
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ module Audit
5
+ # Coalesces a flat codepoint list into contiguous
6
+ # {Models::Audit::CodepointRange} instances.
7
+ #
8
+ # Pure function: input is any Enumerable<Integer>, output is a sorted
9
+ # array of contiguous ranges. Used by {Extractors::Coverage} to produce
10
+ # the compact range view that is the default AuditReport shape.
11
module CodepointRangeCoalescer
  module_function

  # Collapses codepoints into maximal runs of consecutive values.
  #
  # Duplicates and input order are irrelevant: the input is
  # de-duplicated and sorted first. Any Enumerable works (Array, Set,
  # Range, Enumerator); nil and empty inputs produce an empty array.
  #
  # @param codepoints [Enumerable<Integer>, nil]
  # @return [Array<Models::Audit::CodepointRange>] contiguous, sorted
  def call(codepoints)
    return [] if codepoints.nil?

    codepoints.to_a.uniq.sort
      # Neighbours stay in the same chunk while each value is exactly
      # one past the previous; a gap starts a new range.
      .chunk_while { |left, right| right == left + 1 }
      .map do |run|
        Models::Audit::CodepointRange.new(first_cp: run.first,
                                          last_cp: run.last)
      end
  end
end
41
+ end
42
+ end
@@ -0,0 +1,137 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "time"
4
+
5
+ require "fontisan"
6
+
7
+ module Ucode
8
+ module Audit
9
+ # Value object carrying everything an extractor needs to do its job.
10
+ #
11
+ # Extractors never reach back into AuditCommand state — they read
12
+ # exclusively from the Context. Shared derived data (codepoints,
13
+ # UCD baseline, source format) is memoized here so multiple
14
+ # extractors don't recompute it.
15
+ #
16
+ # ucode deltas vs fontisan's Context:
17
+ #
18
+ # - Drops `cldr` and the entire CLDR resolution path (out of scope).
19
+ # - Replaces fontisan's `ucd` memoizer with `baseline`, a struct
20
+ # carrying version + database + metadata.
21
+ # - Adds optional `renderer` for `--with-glyphs` mode (nil otherwise).
22
class Context
  # Result of baseline resolution: the resolved UCD version, the opened
  # database, the report metadata, and an optional warning describing
  # why resolution failed.
  Baseline = Struct.new(:version, :database, :metadata, :warning, keyword_init: true) do
    # True when the baseline is usable (database present and no warning).
    def available?
      !database.nil? && warning.nil?
    end
  end

  private_constant :Baseline

  attr_reader :font, :font_path, :font_index, :num_fonts_in_source,
              :options, :renderer

  # @param font [Fontisan::Font] parsed font handle (has_table?, table).
  # @param font_path [Pathname, String] source path for format detection.
  # @param font_index [Integer] 0-based face index within a collection.
  # @param num_fonts_in_source [Integer] total faces in the source file.
  # @param options [Hash{Symbol=>Object}] audit options (ucd_version,
  #   all_codepoints, with_glyphs, etc.).
  # @param renderer [Object, nil] glyph renderer for --with-glyphs mode.
  def initialize(font:, font_path:, font_index:, num_fonts_in_source:,
                 options:, renderer: nil)
    @font = font
    @font_path = font_path
    @font_index = font_index
    @num_fonts_in_source = num_fonts_in_source
    @options = options
    @renderer = renderer
  end

  # Codepoints the font's cmap actually maps. Memoized.
  # @return [Array<Integer>]
  def codepoints
    @codepoints ||= extract_codepoints
  end

  # Pre-resolved baseline (UCD version + database + metadata).
  # Memoized. When resolution fails, returns a Baseline with a
  # `warning` and nil database so extractors can degrade gracefully.
  # @return [Baseline]
  def baseline
    @baseline ||= resolve_baseline
  end

  # Detected source format string ("ttf", "otf", "ttc", ...). Memoized.
  #
  # Uses `defined?` rather than `||=` so that a nil result (format not
  # detected) is remembered too, instead of re-running detection on
  # every call.
  # @return [String, nil]
  def source_format
    return @source_format if defined?(@source_format)

    @source_format = Fontisan::FontLoader.detect_format(@font_path)&.to_s
  end

  # True when the user asked for every codepoint (including unassigned)
  # in the report's `codepoints` field.
  # @return [Boolean]
  def all_codepoints?
    @options[:all_codepoints] == true
  end

  # True when glyph rendering is requested (--with-glyphs) and a
  # renderer was actually supplied.
  # @return [Boolean]
  def with_glyphs?
    @options[:with_glyphs] == true && !@renderer.nil?
  end

  private

  # Keys of the cmap table's unicode mappings, or an empty array when
  # the font has no cmap table.
  def extract_codepoints
    return [] unless @font.has_table?("cmap")

    @font.table("cmap").unicode_mappings.keys
  end

  # Resolves the requested UCD version and opens (or builds) its
  # database. Never raises StandardError: every failure is converted
  # into a Baseline carrying a warning and a nil database.
  def resolve_baseline
    version = Ucode::VersionResolver.resolve(@options[:ucd_version])
    database = open_or_build_database(version)
    Baseline.new(
      version: version,
      database: database,
      metadata: build_metadata(version),
      warning: nil,
    )
  rescue Ucode::UnknownVersionError => e
    Baseline.new(version: nil, database: nil, metadata: nil,
                 warning: "UCD version rejected: #{e.message}")
  rescue Ucode::DatabaseMissingError => e
    Baseline.new(version: version, database: nil, metadata: nil,
                 warning: "UCD unavailable for version #{version}: #{e.message}")
  rescue StandardError => e
    Baseline.new(version: nil, database: nil, metadata: nil,
                 warning: "UCD resolution failed: #{e.message}")
  end

  # Opens the cached database for the version, or fetches the UCD zip
  # (if not cached) and builds the database.
  def open_or_build_database(version)
    return Ucode::Database.open(version) if Ucode::Database.cached?(version)

    ensure_ucdzip(version)
    Ucode::Database.build(version)
  end

  # Fetches the UCD zip for the version unless the cache already has it.
  def ensure_ucdzip(version)
    return if Ucode::Cache.cached?(version)

    Ucode::Fetch::UcdZip.call(version)
  end

  # Report metadata describing which baseline and tool versions
  # produced the audit, stamped with the current UTC time.
  def build_metadata(version)
    Models::Audit::Baseline.new(
      unicode_version: version,
      ucode_version: Ucode::VERSION,
      fontisan_version: Fontisan::VERSION,
      source: "ucode SQLite index (blocks + scripts tables)",
      generated_at: Time.now.utc.iso8601,
    )
  end
end
136
+ end
137
+ end
@@ -0,0 +1,213 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ module Audit
5
+ # Detects cheap audit signals — currently OS/2 ulUnicodeRange bit
6
+ # claims that disagree with the font's cmap coverage.
7
+ #
8
+ # Pure transformation: takes the four OS/2 ulUnicodeRange 32-bit
9
+ # words + the font's codepoint set, returns Discrepancy[]. No I/O,
10
+ # no font handle.
11
+ #
12
+ # OCP: a new discrepancy kind = one constant on
13
+ # {Models::Audit::Discrepancy} + one method here. The detector
14
+ # never enumerates kinds directly.
15
class DiscrepancyDetector
  # Map of OS/2 ulUnicodeRange bit position => list of [first_cp, last_cp]
  # ranges the bit stands for, per the OpenType spec. A single bit
  # often covers several Unicode blocks (e.g. bit 9 is Cyrillic plus
  # its supplement and extensions), so every range is listed; the
  # first entry is the bit's primary block and is the one quoted in
  # discrepancy messages. Reserved bits (123-127) are omitted and
  # therefore never cross-checked.
  #
  # Bit 57 ("Non-Plane 0") is modelled as the whole supplementary
  # space: the bit means the font supports at least one codepoint
  # beyond the BMP.
  #
  # Spec reference:
  #   https://learn.microsoft.com/en-us/typography/opentype/spec/os2#ur
  BIT_RANGES = {
    0 => [[0x0000, 0x007F]], # Basic Latin
    1 => [[0x0080, 0x00FF]], # Latin-1 Supplement
    2 => [[0x0100, 0x017F]], # Latin Extended-A
    3 => [[0x0180, 0x024F]], # Latin Extended-B
    4 => [[0x0250, 0x02AF], [0x1D00, 0x1D7F], [0x1D80, 0x1DBF]], # IPA Extensions, Phonetic Extensions (+ Supplement)
    5 => [[0x02B0, 0x02FF], [0xA700, 0xA71F]], # Spacing Modifier Letters, Modifier Tone Letters
    6 => [[0x0300, 0x036F], [0x1DC0, 0x1DFF]], # Combining Diacritical Marks (+ Supplement)
    7 => [[0x0370, 0x03FF]], # Greek and Coptic
    8 => [[0x2C80, 0x2CFF]], # Coptic
    9 => [[0x0400, 0x04FF], [0x0500, 0x052F], [0x2DE0, 0x2DFF], [0xA640, 0xA69F]], # Cyrillic (+ Supplement, Extended-A/B)
    10 => [[0x0530, 0x058F]], # Armenian
    11 => [[0x0590, 0x05FF]], # Hebrew
    12 => [[0xA500, 0xA63F]], # Vai
    13 => [[0x0600, 0x06FF], [0x0750, 0x077F]], # Arabic, Arabic Supplement
    14 => [[0x07C0, 0x07FF]], # NKo
    15 => [[0x0900, 0x097F]], # Devanagari
    16 => [[0x0980, 0x09FF]], # Bengali
    17 => [[0x0A00, 0x0A7F]], # Gurmukhi
    18 => [[0x0A80, 0x0AFF]], # Gujarati
    19 => [[0x0B00, 0x0B7F]], # Oriya
    20 => [[0x0B80, 0x0BFF]], # Tamil
    21 => [[0x0C00, 0x0C7F]], # Telugu
    22 => [[0x0C80, 0x0CFF]], # Kannada
    23 => [[0x0D00, 0x0D7F]], # Malayalam
    24 => [[0x0E00, 0x0E7F]], # Thai
    25 => [[0x0E80, 0x0EFF]], # Lao
    26 => [[0x10A0, 0x10FF], [0x2D00, 0x2D2F]], # Georgian, Georgian Supplement
    27 => [[0x1B00, 0x1B7F]], # Balinese
    28 => [[0x1100, 0x11FF]], # Hangul Jamo
    29 => [[0x1E00, 0x1EFF], [0x2C60, 0x2C7F], [0xA720, 0xA7FF]], # Latin Extended Additional, Extended-C, Extended-D
    30 => [[0x1F00, 0x1FFF]], # Greek Extended
    31 => [[0x2000, 0x206F], [0x2E00, 0x2E7F]], # General Punctuation, Supplemental Punctuation
    32 => [[0x2070, 0x209F]], # Superscripts And Subscripts
    33 => [[0x20A0, 0x20CF]], # Currency Symbols
    34 => [[0x20D0, 0x20FF]], # Combining Diacritical Marks For Symbols
    35 => [[0x2100, 0x214F]], # Letterlike Symbols
    36 => [[0x2150, 0x218F]], # Number Forms
    37 => [[0x2190, 0x21FF], [0x27F0, 0x27FF], [0x2900, 0x297F], [0x2B00, 0x2BFF]], # Arrows, Supplemental Arrows-A/B, Misc Symbols and Arrows
    38 => [[0x2200, 0x22FF], [0x2A00, 0x2AFF], [0x27C0, 0x27EF], [0x2980, 0x29FF]], # Mathematical Operators (+ Supplemental, Misc Math Symbols-A/B)
    39 => [[0x2300, 0x23FF]], # Miscellaneous Technical
    40 => [[0x2400, 0x243F]], # Control Pictures
    41 => [[0x2440, 0x245F]], # Optical Character Recognition
    42 => [[0x2460, 0x24FF]], # Enclosed Alphanumerics
    43 => [[0x2500, 0x257F]], # Box Drawing
    44 => [[0x2580, 0x259F]], # Block Elements
    45 => [[0x25A0, 0x25FF]], # Geometric Shapes
    46 => [[0x2600, 0x26FF]], # Miscellaneous Symbols
    47 => [[0x2700, 0x27BF]], # Dingbats
    48 => [[0x3000, 0x303F]], # CJK Symbols and Punctuation
    49 => [[0x3040, 0x309F]], # Hiragana
    50 => [[0x30A0, 0x30FF], [0x31F0, 0x31FF]], # Katakana, Katakana Phonetic Extensions
    51 => [[0x3100, 0x312F], [0x31A0, 0x31BF]], # Bopomofo, Bopomofo Extended
    52 => [[0x3130, 0x318F]], # Hangul Compatibility Jamo
    53 => [[0xA840, 0xA87F]], # Phags-pa
    54 => [[0x3200, 0x32FF]], # Enclosed CJK Letters and Months
    55 => [[0x3300, 0x33FF]], # CJK Compatibility
    56 => [[0xAC00, 0xD7AF]], # Hangul Syllables
    57 => [[0x10000, 0x10FFFF]], # Non-Plane 0 (any supplementary codepoint)
    58 => [[0x10900, 0x1091F]], # Phoenician
    59 => [[0x4E00, 0x9FFF], [0x2E80, 0x2EFF], [0x2F00, 0x2FDF], [0x2FF0, 0x2FFF],
           [0x3400, 0x4DBF], [0x20000, 0x2A6DF], [0x3190, 0x319F]], # CJK Unified Ideographs, Radicals, IDC, Ext A, Ext B, Kanbun
    60 => [[0xE000, 0xF8FF]], # Private Use Area (plane 0)
    61 => [[0xF900, 0xFAFF], [0x31C0, 0x31EF], [0x2F800, 0x2FA1F]], # CJK Compatibility Ideographs (+ Supplement), CJK Strokes
    62 => [[0xFB00, 0xFB4F]], # Alphabetic Presentation Forms
    63 => [[0xFB50, 0xFDFF]], # Arabic Presentation Forms-A
    64 => [[0xFE20, 0xFE2F]], # Combining Half Marks
    65 => [[0xFE10, 0xFE1F], [0xFE30, 0xFE4F]], # Vertical Forms, CJK Compatibility Forms
    66 => [[0xFE50, 0xFE6F]], # Small Form Variants
    67 => [[0xFE70, 0xFEFF]], # Arabic Presentation Forms-B
    68 => [[0xFF00, 0xFFEF]], # Halfwidth And Fullwidth Forms
    69 => [[0xFFF0, 0xFFFF]], # Specials
    70 => [[0x0F00, 0x0FFF]], # Tibetan
    71 => [[0x0700, 0x074F]], # Syriac
    72 => [[0x0780, 0x07BF]], # Thaana
    73 => [[0x0D80, 0x0DFF]], # Sinhala
    74 => [[0x1000, 0x109F]], # Myanmar
    75 => [[0x1200, 0x137F], [0x1380, 0x139F], [0x2D80, 0x2DDF]], # Ethiopic (+ Supplement, Extended)
    76 => [[0x13A0, 0x13FF]], # Cherokee
    77 => [[0x1400, 0x167F]], # Unified Canadian Aboriginal Syllabics
    78 => [[0x1680, 0x169F]], # Ogham
    79 => [[0x16A0, 0x16FF]], # Runic
    80 => [[0x1780, 0x17FF], [0x19E0, 0x19FF]], # Khmer, Khmer Symbols
    81 => [[0x1800, 0x18AF]], # Mongolian
    82 => [[0x2800, 0x28FF]], # Braille Patterns
    83 => [[0xA000, 0xA48F], [0xA490, 0xA4CF]], # Yi Syllables, Yi Radicals
    84 => [[0x1700, 0x171F], [0x1720, 0x173F], [0x1740, 0x175F], [0x1760, 0x177F]], # Tagalog, Hanunoo, Buhid, Tagbanwa
    85 => [[0x10300, 0x1032F]], # Old Italic
    86 => [[0x10330, 0x1034F]], # Gothic
    87 => [[0x10400, 0x1044F]], # Deseret
    88 => [[0x1D000, 0x1D0FF], [0x1D100, 0x1D1FF], [0x1D200, 0x1D24F]], # Byzantine Musical Symbols, Musical Symbols, Ancient Greek Musical Notation
    89 => [[0x1D400, 0x1D7FF]], # Mathematical Alphanumeric Symbols
    90 => [[0xF0000, 0xFFFFD], [0x100000, 0x10FFFD]], # Private Use (planes 15 and 16)
    91 => [[0xFE00, 0xFE0F], [0xE0100, 0xE01EF]], # Variation Selectors (+ Supplement)
    92 => [[0xE0000, 0xE007F]], # Tags
    93 => [[0x1900, 0x194F]], # Limbu
    94 => [[0x1950, 0x197F]], # Tai Le
    95 => [[0x1980, 0x19DF]], # New Tai Lue
    96 => [[0x1A00, 0x1A1F]], # Buginese
    97 => [[0x2C00, 0x2C5F]], # Glagolitic
    98 => [[0x2D30, 0x2D7F]], # Tifinagh
    99 => [[0x4DC0, 0x4DFF]], # Yijing Hexagram Symbols
    100 => [[0xA800, 0xA82F]], # Syloti Nagri
    101 => [[0x10000, 0x1007F], [0x10080, 0x100FF], [0x10100, 0x1013F]], # Linear B Syllabary, Linear B Ideograms, Aegean Numbers
    102 => [[0x10140, 0x1018F]], # Ancient Greek Numbers
    103 => [[0x10380, 0x1039F]], # Ugaritic
    104 => [[0x103A0, 0x103DF]], # Old Persian
    105 => [[0x10450, 0x1047F]], # Shavian
    106 => [[0x10480, 0x104AF]], # Osmanya
    107 => [[0x10800, 0x1083F]], # Cypriot Syllabary
    108 => [[0x10A00, 0x10A5F]], # Kharoshthi
    109 => [[0x1D300, 0x1D35F]], # Tai Xuan Jing Symbols
    110 => [[0x12000, 0x123FF], [0x12400, 0x1247F]], # Cuneiform, Cuneiform Numbers and Punctuation
    111 => [[0x1D360, 0x1D37F]], # Counting Rod Numerals
    112 => [[0x1B80, 0x1BBF]], # Sundanese
    113 => [[0x1C00, 0x1C4F]], # Lepcha
    114 => [[0x1C50, 0x1C7F]], # Ol Chiki
    115 => [[0xA880, 0xA8DF]], # Saurashtra
    116 => [[0xA900, 0xA92F]], # Kayah Li
    117 => [[0xA930, 0xA95F]], # Rejang
    118 => [[0xAA00, 0xAA5F]], # Cham
    119 => [[0x10190, 0x101CF]], # Ancient Symbols
    120 => [[0x101D0, 0x101FF]], # Phaistos Disc
    121 => [[0x102A0, 0x102DF], [0x10280, 0x1029F], [0x10920, 0x1093F]], # Carian, Lycian, Lydian
    122 => [[0x1F030, 0x1F09F], [0x1F000, 0x1F02F]], # Domino Tiles, Mahjong Tiles
  }.freeze
  private_constant :BIT_RANGES

  # Nil words are treated as 0 (no bits set).
  #
  # @param ul_unicode_range1 [Integer, nil] bits 0-31
  # @param ul_unicode_range2 [Integer, nil] bits 32-63
  # @param ul_unicode_range3 [Integer, nil] bits 64-95
  # @param ul_unicode_range4 [Integer, nil] bits 96-127
  # @param codepoints [Enumerable<Integer>] font cmap codepoint set
  def initialize(ul_unicode_range1:, ul_unicode_range2:,
                 ul_unicode_range3:, ul_unicode_range4:,
                 codepoints:)
    @bits = bits_from_words([
      ul_unicode_range1 || 0,
      ul_unicode_range2 || 0,
      ul_unicode_range3 || 0,
      ul_unicode_range4 || 0,
    ])
    # Sorted once so each per-range membership test is a binary search.
    @sorted_codepoints = codepoints.sort
  end

  # Reports every set bit whose ranges contain no cmap codepoint at
  # all. Bits with no table entry (reserved) are skipped. Results are
  # in ascending bit order.
  #
  # @return [Array<Models::Audit::Discrepancy>]
  def call
    @bits.each_with_object([]) do |bit, found|
      ranges = BIT_RANGES[bit]
      next if ranges.nil? # bit set but range unknown — skip
      next if ranges.any? { |first, last| range_has_codepoints?(first, last) }

      found << build_discrepancy(bit, *ranges.first)
    end
  end

  private

  # Builds the discrepancy record for a bit, quoting its primary range.
  def build_discrepancy(bit, first, last)
    Models::Audit::Discrepancy.new(
      kind: Models::Audit::Discrepancy::KIND_OS2_UNICODE_RANGE_BIT_WITHOUT_CMAP_CODEPOINTS,
      detail: format(
        "OS/2 ulUnicodeRange bit %<bit>d claims %<first>s–%<last>s " \
        "but cmap has 0 codepoints in that range",
        bit: bit,
        first: format("U+%04X", first),
        last: format("U+%04X", last),
      ),
      bit_position: bit,
    )
  end

  # Flattens the four 32-bit words into absolute bit positions
  # (0-127), ascending: word N contributes positions N*32..N*32+31.
  def bits_from_words(words)
    words.each_with_index.flat_map do |word, word_index|
      bits_in_word(word).map { |bit| word_index * 32 + bit }
    end
  end

  # Returns the bit positions (0-31) that are set in a 32-bit word.
  def bits_in_word(word)
    (0..31).reject { |i| (word & (1 << i)).zero? }
  end

  # True when at least one cmap codepoint lies in first..last. Finds
  # the smallest codepoint >= first by binary search and checks that
  # it does not overshoot last.
  def range_has_codepoints?(first, last)
    hit = @sorted_codepoints.bsearch { |cp| cp >= first }
    !hit.nil? && hit <= last
  end
end
212
+ end
213
+ end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ module Audit
5
+ module Extractors
6
+ # Aggregations: UCD block/script coverage driven by ucode's own
7
+ # parsed baseline (not ucd.all.flat.zip), plus OS/2 ulUnicodeRange
8
+ # discrepancies.
9
+ #
10
+ # Returned fields:
11
+ # baseline, blocks, scripts, plane_summaries, discrepancies
12
+ #
13
+ # MECE: this extractor owns UCD-driven aggregations + the OS/2
14
+ # bit-vs-cmap cross-check. SFNT-driven GSUB/GPOS script/feature
15
+ # coverage lives in {OpenTypeLayout}.
16
+ #
17
+ # ucode delta vs fontisan: replaces UCDXML flat-zip lookup with
18
+ # ucode's own SQLite-backed Database. The Database exposes
19
+ # `lookup_block`, `lookup_script`, `block_ranges_by_name`, and
20
+ # `script_ranges_by_name` — those power every aggregation here.
21
class Aggregations < Base
  # Produces the UCD-driven aggregation fields for one font face.
  #
  # When the context's baseline is not available, every collection
  # field is returned empty and `baseline` carries whatever metadata
  # the (unavailable) baseline holds.
  #
  # NOTE(review): the degraded path also leaves `discrepancies` empty
  # even though the OS/2 cross-check below is built only from the
  # font's OS/2 words and cmap codepoints — confirm this is intended.
  #
  # @param context [Ucode::Audit::Context]
  # @return [Hash{Symbol=>Object}] baseline, blocks, scripts,
  #   plane_summaries, discrepancies
  def extract(context)
    resolved = context.baseline
    return empty_with_warning(resolved) unless resolved.available?

    cps = context.codepoints
    block_rows = BlockAggregator.new(resolved.database).call(cps)
    script_rows = ScriptAggregator.new(resolved.database).call(cps)
    # Plane summaries are derived from the block summaries, not from
    # the raw codepoints.
    plane_rows = PlaneAggregator.new.call(block_rows)
    detector = DiscrepancyDetector.new(**os2_args(context))
    mismatches = detector.call

    {
      baseline: resolved.metadata,
      blocks: block_rows,
      scripts: script_rows,
      plane_summaries: plane_rows,
      discrepancies: mismatches,
    }
  end

  private

  # Field set returned when no usable baseline exists: same keys as
  # the normal result, with fresh empty arrays for every collection.
  def empty_with_warning(baseline)
    result = { baseline: baseline.metadata }
    result[:blocks] = []
    result[:scripts] = []
    result[:plane_summaries] = []
    result[:discrepancies] = []
    result
  end

  # Keyword arguments for DiscrepancyDetector. All four range words
  # are nil when the font has no OS/2 table.
  def os2_args(context)
    face = context.font
    table = face.table("OS/2") if face.has_table?("OS/2")

    {
      ul_unicode_range1: table&.ul_unicode_range1,
      ul_unicode_range2: table&.ul_unicode_range2,
      ul_unicode_range3: table&.ul_unicode_range3,
      ul_unicode_range4: table&.ul_unicode_range4,
      codepoints: context.codepoints,
    }
  end
end
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ucode
4
+ module Audit
5
+ module Extractors
6
+ # Abstract extractor interface. Subclasses implement `#extract`.
7
+ #
8
+ # An extractor reads from a {Context} and returns a hash of fields
9
+ # suitable for `Models::Audit::AuditReport.new(**fields)`.
10
+ # Returning an empty hash is valid (no-op).
11
class Base
  # Template method every concrete extractor overrides. The default
  # implementation always raises, naming the offending class.
  #
  # @param context [Ucode::Audit::Context]
  # @return [Hash{Symbol=>Object}] fields merged into the AuditReport
  # @raise [NotImplementedError] when a subclass has not overridden it
  def extract(context)
    message = "#{self.class} must implement #extract"
    raise NotImplementedError, message
  end
end
19
+ end
20
+ end
21
+ end