ucode 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +72 -0
  3. data/Gemfile.lock +2 -2
  4. data/TODO.full/00-README.md +116 -0
  5. data/TODO.full/01-panglyph-vision.md +112 -0
  6. data/TODO.full/02-panglyph-repo-bootstrap.md +184 -0
  7. data/TODO.full/03-panglyph-font-builder.md +201 -0
  8. data/TODO.full/04-panglyph-publish-pipeline.md +126 -0
  9. data/TODO.full/05-ucode-0-1-1-release.md +139 -0
  10. data/TODO.full/06-fontisan-remove-audit.md +142 -0
  11. data/TODO.full/07-fontisan-remove-ucd.md +125 -0
  12. data/TODO.full/08-archive-private-bin-build.md +143 -0
  13. data/TODO.full/09-archive-public-structure.md +164 -0
  14. data/TODO.full/10-fontist-org-woff-glyphs.md +131 -0
  15. data/TODO.full/11-fontist-org-audit-coverage.md +140 -0
  16. data/TODO.full/12-implementation-order.md +216 -0
  17. data/TODO.full/13-fontisan-font-writer-api.md +189 -0
  18. data/TODO.full/14-fontisan-table-writers.md +66 -0
  19. data/TODO.full/15-panglyph-builder-real.md +82 -0
  20. data/TODO.full/16-archive-public-sync-workflows.md +167 -0
  21. data/TODO.full/17-fontist-org-font-picker.md +73 -0
  22. data/TODO.full/18-comprehensive-spec-coverage.md +64 -0
  23. data/TODO.full/19-ucode-0-1-2-patch.md +32 -0
  24. data/TODO.full/20-fontisan-0-2-23-release.md +52 -0
  25. data/TODO.new/00-README.md +30 -0
  26. data/TODO.new/23-universal-glyph-set-source-map.md +312 -0
  27. data/TODO.new/24-universal-glyph-set-build.md +189 -0
  28. data/TODO.new/25-font-audit-against-universal-set.md +195 -0
  29. data/TODO.new/26-missing-glyph-reporter.md +189 -0
  30. data/TODO.new/27-fontist-org-consumer-integration.md +200 -0
  31. data/TODO.new/28-implementation-order-update.md +187 -0
  32. data/TODO.new/29-universal-set-curation-uc17.md +312 -0
  33. data/TODO.new/30-tier1-font-acquisition.md +241 -0
  34. data/TODO.new/31-universal-set-production-build.md +205 -0
  35. data/TODO.new/32-uc17-coverage-matrix.md +165 -0
  36. data/TODO.new/33-specialist-font-acquisition-refresh.md +138 -0
  37. data/TODO.new/34-pillar2-content-stream-correlator.md +147 -0
  38. data/TODO.new/35-universal-set-production-run.md +160 -0
  39. data/TODO.new/36-per-font-coverage-audit.md +145 -0
  40. data/TODO.new/37-coverage-highlight-reporter.md +125 -0
  41. data/TODO.new/38-fontist-org-glyph-consumer.md +141 -0
  42. data/TODO.new/39-implementation-order-update-32-38.md +258 -0
  43. data/TODO.new/40-archive-private-uses-ucode-audit.md +124 -0
  44. data/TODO.new/41-ucode-unicode-archive-bridge.md +160 -0
  45. data/config/specialist_fonts.yml +102 -0
  46. data/config/unicode17_tier1_fonts.yml +42 -0
  47. data/config/unicode17_universal_glyph_set.yml +293 -0
  48. data/lib/ucode/audit/block_aggregator.rb +57 -29
  49. data/lib/ucode/audit/browser/face_page.rb +128 -0
  50. data/lib/ucode/audit/browser/glyph_panel.rb +124 -0
  51. data/lib/ucode/audit/browser/library_page.rb +74 -0
  52. data/lib/ucode/audit/browser/missing_glyph_page.rb +87 -0
  53. data/lib/ucode/audit/browser/template.rb +47 -0
  54. data/lib/ucode/audit/browser/templates/face.css +200 -0
  55. data/lib/ucode/audit/browser/templates/face.html.erb +41 -0
  56. data/lib/ucode/audit/browser/templates/face.js +298 -0
  57. data/lib/ucode/audit/browser/templates/library.css +119 -0
  58. data/lib/ucode/audit/browser/templates/library.html.erb +42 -0
  59. data/lib/ucode/audit/browser/templates/library.js +99 -0
  60. data/lib/ucode/audit/browser/templates/missing_glyph_page.css +119 -0
  61. data/lib/ucode/audit/browser/templates/missing_glyph_page.html.erb +58 -0
  62. data/lib/ucode/audit/browser/templates/missing_glyph_page.js +2 -0
  63. data/lib/ucode/audit/browser.rb +32 -0
  64. data/lib/ucode/audit/context.rb +27 -1
  65. data/lib/ucode/audit/coverage_reference.rb +103 -0
  66. data/lib/ucode/audit/differ.rb +121 -0
  67. data/lib/ucode/audit/emitter/block_emitter.rb +52 -0
  68. data/lib/ucode/audit/emitter/codepoint_emitter.rb +87 -0
  69. data/lib/ucode/audit/emitter/collection_emitter.rb +80 -0
  70. data/lib/ucode/audit/emitter/face_directory.rb +212 -0
  71. data/lib/ucode/audit/emitter/glyph_emitter.rb +48 -0
  72. data/lib/ucode/audit/emitter/index_emitter.rb +149 -0
  73. data/lib/ucode/audit/emitter/library_emitter.rb +96 -0
  74. data/lib/ucode/audit/emitter/paths.rb +312 -0
  75. data/lib/ucode/audit/emitter/plane_emitter.rb +29 -0
  76. data/lib/ucode/audit/emitter/script_emitter.rb +29 -0
  77. data/lib/ucode/audit/emitter.rb +29 -0
  78. data/lib/ucode/audit/extractors/aggregations.rb +31 -2
  79. data/lib/ucode/audit/face_auditor.rb +86 -0
  80. data/lib/ucode/audit/formatters/audit_diff_text.rb +112 -0
  81. data/lib/ucode/audit/formatters/audit_text.rb +411 -0
  82. data/lib/ucode/audit/formatters/color.rb +48 -0
  83. data/lib/ucode/audit/formatters/library_summary_text.rb +98 -0
  84. data/lib/ucode/audit/formatters/text_formatter.rb +83 -0
  85. data/lib/ucode/audit/formatters.rb +23 -0
  86. data/lib/ucode/audit/library_aggregator.rb +86 -0
  87. data/lib/ucode/audit/library_auditor.rb +105 -0
  88. data/lib/ucode/audit/release/emitter.rb +152 -0
  89. data/lib/ucode/audit/release/face_card.rb +93 -0
  90. data/lib/ucode/audit/release/formula_audits.rb +50 -0
  91. data/lib/ucode/audit/release/library_index_builder.rb +78 -0
  92. data/lib/ucode/audit/release/manifest_builder.rb +127 -0
  93. data/lib/ucode/audit/release.rb +42 -0
  94. data/lib/ucode/audit/ucd_only_reference.rb +81 -0
  95. data/lib/ucode/audit/universal_set_reference.rb +136 -0
  96. data/lib/ucode/audit.rb +31 -0
  97. data/lib/ucode/cli.rb +339 -33
  98. data/lib/ucode/commands/audit/browser_command.rb +82 -0
  99. data/lib/ucode/commands/audit/collection_command.rb +103 -0
  100. data/lib/ucode/commands/audit/compare_command.rb +188 -0
  101. data/lib/ucode/commands/audit/font_command.rb +140 -0
  102. data/lib/ucode/commands/audit/library_command.rb +87 -0
  103. data/lib/ucode/commands/audit/reference_builder.rb +64 -0
  104. data/lib/ucode/commands/audit.rb +20 -0
  105. data/lib/ucode/commands/block_feed.rb +73 -0
  106. data/lib/ucode/commands/canonical_build.rb +138 -0
  107. data/lib/ucode/commands/fetch.rb +37 -1
  108. data/lib/ucode/commands/release.rb +115 -0
  109. data/lib/ucode/commands/universal_set.rb +211 -0
  110. data/lib/ucode/commands.rb +5 -0
  111. data/lib/ucode/coordinator/indices.rb +11 -0
  112. data/lib/ucode/coordinator.rb +138 -5
  113. data/lib/ucode/error.rb +30 -2
  114. data/lib/ucode/fetch/font_fetcher/result.rb +39 -0
  115. data/lib/ucode/fetch/font_fetcher.rb +16 -0
  116. data/lib/ucode/fetch/specialist_font_fetcher.rb +280 -0
  117. data/lib/ucode/fetch.rb +7 -3
  118. data/lib/ucode/glyphs/real_fonts/cmap_cache.rb +74 -0
  119. data/lib/ucode/glyphs/real_fonts.rb +1 -0
  120. data/lib/ucode/glyphs/resolver.rb +62 -0
  121. data/lib/ucode/glyphs/source.rb +48 -0
  122. data/lib/ucode/glyphs/source_builder.rb +61 -0
  123. data/lib/ucode/glyphs/source_config/coverage_assertion.rb +79 -0
  124. data/lib/ucode/glyphs/source_config/gap_report.rb +54 -0
  125. data/lib/ucode/glyphs/source_config.rb +104 -0
  126. data/lib/ucode/glyphs/sources/pillar1_embedded_tounicode.rb +63 -0
  127. data/lib/ucode/glyphs/sources/pillar3_last_resort.rb +51 -0
  128. data/lib/ucode/glyphs/sources/tier1_real_font.rb +104 -0
  129. data/lib/ucode/glyphs/sources.rb +20 -0
  130. data/lib/ucode/glyphs/universal_set/builder.rb +161 -0
  131. data/lib/ucode/glyphs/universal_set/coverage_report.rb +139 -0
  132. data/lib/ucode/glyphs/universal_set/idempotency.rb +86 -0
  133. data/lib/ucode/glyphs/universal_set/manifest_accumulator.rb +195 -0
  134. data/lib/ucode/glyphs/universal_set/manifest_writer.rb +61 -0
  135. data/lib/ucode/glyphs/universal_set/pre_build_check.rb +197 -0
  136. data/lib/ucode/glyphs/universal_set/validator.rb +204 -0
  137. data/lib/ucode/glyphs/universal_set.rb +45 -0
  138. data/lib/ucode/glyphs.rb +6 -0
  139. data/lib/ucode/models/audit/baseline.rb +6 -0
  140. data/lib/ucode/models/audit/block_summary.rb +7 -0
  141. data/lib/ucode/models/audit/codepoint_provenance.rb +39 -0
  142. data/lib/ucode/models/audit/release_face.rb +42 -0
  143. data/lib/ucode/models/audit/release_formula.rb +33 -0
  144. data/lib/ucode/models/audit/release_manifest.rb +43 -0
  145. data/lib/ucode/models/audit/release_universal_set.rb +37 -0
  146. data/lib/ucode/models/audit.rb +9 -0
  147. data/lib/ucode/models/block.rb +2 -0
  148. data/lib/ucode/models/build_report.rb +109 -0
  149. data/lib/ucode/models/codepoint/glyph.rb +42 -0
  150. data/lib/ucode/models/codepoint.rb +3 -0
  151. data/lib/ucode/models/glyph_source.rb +86 -0
  152. data/lib/ucode/models/glyph_source_map.rb +138 -0
  153. data/lib/ucode/models/specialist_font.rb +70 -0
  154. data/lib/ucode/models/specialist_font_manifest.rb +48 -0
  155. data/lib/ucode/models/unihan_entry.rb +81 -9
  156. data/lib/ucode/models/unihan_field.rb +21 -0
  157. data/lib/ucode/models/universal_set_entry.rb +47 -0
  158. data/lib/ucode/models/universal_set_manifest.rb +78 -0
  159. data/lib/ucode/models/validation_report.rb +99 -0
  160. data/lib/ucode/models.rb +9 -0
  161. data/lib/ucode/parsers/named_sequences.rb +5 -5
  162. data/lib/ucode/parsers/unihan.rb +50 -19
  163. data/lib/ucode/repo/aggregate_writer.rb +34 -2
  164. data/lib/ucode/repo/block_feed_emitter.rb +153 -0
  165. data/lib/ucode/repo/build_report_accumulator.rb +138 -0
  166. data/lib/ucode/repo/build_report_writer.rb +46 -0
  167. data/lib/ucode/repo/build_validator.rb +229 -0
  168. data/lib/ucode/repo/codepoint_writer.rb +50 -1
  169. data/lib/ucode/repo/paths.rb +8 -0
  170. data/lib/ucode/repo.rb +4 -0
  171. data/lib/ucode/version.rb +1 -1
  172. data/schema/block-feed.output.schema.yml +134 -0
  173. metadata +143 -2
  174. data/ucode.gemspec +0 -56
@@ -123,7 +123,18 @@ module Ucode
123
123
  standardized_variants: multi_cp_index_by_id(ucd_dir, "StandardizedVariants.txt",
124
124
  Parsers::StandardizedVariants, :base_id),
125
125
  names_list: names_list_index(ucd_dir),
126
- unihan: unihan_index(unihan_dir)
126
+ unihan: unihan_index(unihan_dir),
127
+ line_break: range_value_index(ucd_dir, "LineBreak.txt"),
128
+ east_asian_width: range_value_index(ucd_dir, "EastAsianWidth.txt"),
129
+ vertical_orientation: range_value_index(ucd_dir, "VerticalOrientation.txt"),
130
+ grapheme_break: range_value_index(ucd_dir, "auxiliary/GraphemeBreakProperty.txt"),
131
+ word_break: range_value_index(ucd_dir, "auxiliary/WordBreakProperty.txt"),
132
+ sentence_break: range_value_index(ucd_dir, "auxiliary/SentenceBreakProperty.txt"),
133
+ indic_positional: range_value_index(ucd_dir, "IndicPositionalCategory.txt"),
134
+ indic_syllabic: range_value_index(ucd_dir, "IndicSyllabicCategory.txt"),
135
+ hangul_syllable_type: range_value_index(ucd_dir, "HangulSyllableType.txt"),
136
+ emoji_properties: range_value_index(ucd_dir, "emoji/emoji-data.txt"),
137
+ extra_binary_properties: range_value_index(ucd_dir, "PropList.txt"),
127
138
  )
128
139
  end
129
140
 
@@ -136,6 +147,23 @@ module Ucode
136
147
  parser.each_record(path).to_a.sort_by(&:range_first)
137
148
  end
138
149
 
150
+ # Builds a sorted array of (range_first, range_last, value) tuples for
151
+ # any UCD file using the standard `XXXX[..YYYY]; value` format. Used
152
+ # for the many extracted/auxiliary/root properties that share this
153
+ # shape: LineBreak, EastAsianWidth, VerticalOrientation, the three
154
+ # break-segmentation files, the two Indic category files,
155
+ # HangulSyllableType, emoji-data, PropList, etc.
156
+ #
157
+ # Tuple is `Parsers::ExtractedProperties::Tuple` — a Struct with
158
+ # `range_first`, `range_last`, `value` accessors, suitable for the
159
+ # coordinator's `find_in_range` bsearch.
160
def range_value_index(ucd_dir, filename)
  source_file = Pathname.new(ucd_dir).join(filename)
  if source_file.exist?
    # Materialize every parsed record, then order by the start of each
    # range so range lookups can rely on ascending `range_first`.
    records = Parsers::ExtractedProperties.each_record(source_file).to_a
    records.sort_by { |record| record.range_first }
  else
    # A missing data file yields an empty index instead of an error.
    []
  end
end
166
+
139
167
  # Builds the sorted Script array and resolves each Script's ISO 15924
140
168
  # code in one pass, using the pre-computed property_value_aliases map.
141
169
  # This avoids re-resolving the alias on every per-cp lookup (160k ×
@@ -210,12 +238,11 @@ module Ucode
210
238
  dir = Pathname.new(unihan_dir)
211
239
  return {} unless dir.exist?
212
240
 
213
- by_field = Hash.new { |h, k| h[k] = {} }
241
+ entries = Hash.new { |h, k| h[k] = Models::UnihanEntry.new }
214
242
  Parsers::Unihan.each_in_dir(dir) do |record|
215
- by_field[record.cp][record.field] = record.field_values
243
+ entries[record.cp].add(record.category, record.field, record.field_values)
216
244
  end
217
-
218
- by_field.transform_values { |fields| Models::UnihanEntry.new(fields: fields) }
245
+ entries
219
246
  end
220
247
 
221
248
  # ---- Per-codepoint enrichment --------------------------------------
@@ -235,6 +262,12 @@ module Ucode
235
262
  assign_standardized_variants(cp, indices)
236
263
  assign_unihan(cp, indices)
237
264
  assign_cjk_radical(cp, indices)
265
+ assign_display(cp, indices)
266
+ assign_break_segmentation(cp, indices)
267
+ assign_indic(cp, indices)
268
+ assign_hangul(cp, indices)
269
+ assign_emoji(cp, indices)
270
+ assign_extra_binary_properties(cp, indices)
238
271
  end
239
272
 
240
273
  def assign_script(cp, indices)
@@ -369,6 +402,106 @@ module Ucode
369
402
  end
370
403
  end
371
404
 
405
+ # Display: East Asian Width, Line Break Class, Vertical Orientation.
406
+ # All three are range+value files, looked up via bsearch on sorted
407
+ # arrays of ExtractedProperties::Tuple.
408
def assign_display(cp, indices)
  code = cp.cp
  line_break = find_in_range(code, indices.line_break)&.value
  width = find_in_range(code, indices.east_asian_width)&.value
  orientation = find_in_range(code, indices.vertical_orientation)&.value
  # Leave `cp.display` untouched when none of the three files list this
  # codepoint.
  return if [line_break, width, orientation].all?(&:nil?)

  display = (cp.display ||= Models::CodePoint::Display.new)
  display.line_break_class = line_break if line_break
  display.east_asian_width = width if width
  display.vertical_orientation = orientation if orientation
end
422
+
423
+ # UAX #29 segmentation: Grapheme / Word / Sentence break class.
424
def assign_break_segmentation(cp, indices)
  code = cp.cp
  grapheme_class = find_in_range(code, indices.grapheme_break)&.value
  word_class = find_in_range(code, indices.word_break)&.value
  sentence_class = find_in_range(code, indices.sentence_break)&.value
  # Nothing to record when the codepoint is absent from all three files.
  return if [grapheme_class, word_class, sentence_class].all?(&:nil?)

  segmentation = (cp.break_segmentation ||= Models::CodePoint::BreakSegmentation.new)
  segmentation.grapheme = grapheme_class if grapheme_class
  segmentation.word = word_class if word_class
  segmentation.sentence = sentence_class if sentence_class
end
435
+
436
# Copies the Indic positional and syllabic categories onto the codepoint's
# Indic sub-model; the sub-model is only created when at least one of the
# two lookups produced a value.
def assign_indic(cp, indices)
  code = cp.cp
  positional_category = find_in_range(code, indices.indic_positional)&.value
  syllabic_category = find_in_range(code, indices.indic_syllabic)&.value
  return if [positional_category, syllabic_category].all?(&:nil?)

  indic = (cp.indic ||= Models::CodePoint::Indic.new)
  indic.positional_category = positional_category if positional_category
  indic.syllabic_category = syllabic_category if syllabic_category
end
445
+
446
# Records the Hangul syllable type for codepoints that fall inside a range
# of the hangul_syllable_type index; other codepoints are left untouched.
def assign_hangul(cp, indices)
  match = find_in_range(cp.cp, indices.hangul_syllable_type)
  if match
    cp.hangul ||= Models::CodePoint::HangulSyllable.new
    cp.hangul.type = match.value
  end
end
453
+
454
+ # Emoji property bundle. Each Emoji_* property from emoji-data.txt
455
+ # flips the matching boolean on the Emoji sub-model.
456
# Sets the emoji flags for one codepoint from the emoji_properties index.
#
# Every property name collected for the codepoint flips the matching
# boolean on the Emoji sub-model; names without a matching flag are
# ignored. The sub-model is created only when at least one range matches.
#
# The lookup goes straight to `all_range_values`, the same way the
# PropList lookup does: a codepoint can carry several properties at once,
# so the index holds overlapping ranges, and a single-hit range search is
# not a reliable pre-check for "does any range contain this codepoint".
#
# @param cp [Object] codepoint model exposing `cp` and an `emoji` accessor
# @param indices [Object] index bundle exposing `emoji_properties`
def assign_emoji(cp, indices)
  props = all_range_values(cp.cp, indices.emoji_properties)
  return if props.empty?

  cp.emoji ||= Models::CodePoint::Emoji.new
  props.each do |prop|
    case prop
    when "Emoji" then cp.emoji.is_emoji = true
    when "Emoji_Presentation" then cp.emoji.is_presentation_default = true
    when "Emoji_Modifier" then cp.emoji.is_modifier = true
    when "Emoji_Modifier_Base" then cp.emoji.is_base = true
    when "Emoji_Component" then cp.emoji.is_component = true
    when "Extended_Pictographic" then cp.emoji.is_extended_pictographic = true
    end
  end
end
474
+
475
+ # PropList.txt carries binary properties beyond what's in
476
+ # DerivedCoreProperties (White_Space, Hyphen, Variation_Selector,
477
+ # etc.). Merge into the same binary_properties list.
478
def assign_extra_binary_properties(cp, indices)
  extra_props = all_range_values(cp.cp, indices.extra_binary_properties)
  unless extra_props.empty?
    # Append in place, then drop names that were already present.
    cp.binary_properties.concat(extra_props)
    cp.binary_properties.uniq!
  end
end
485
+
486
+ # Returns every value whose range contains `cp` in a sorted tuple
487
+ # array. Most codepoint+property pairs match at most one range, but
488
+ # a codepoint can carry multiple binary properties from PropList or
489
+ # emoji-data, so we collect them all.
490
# Collects the value of every record whose range contains `cp`.
#
# Ranges may overlap (several properties can cover the same codepoint),
# so every record that starts at or before `cp` has to be inspected.
# Because the input is ordered by ascending `range_first`, the scan stops
# at the first record that starts after `cp`: no later record can
# contain it.
#
# @param cp [Integer] codepoint to look up
# @param sorted_ranges [Array, nil] records responding to `range_first`,
#   `range_last` and `value`, ordered by ascending `range_first`
# @return [Array] matching values in index order; empty when nothing
#   matches or the index is nil/empty
def all_range_values(cp, sorted_ranges)
  return [] if sorted_ranges.nil? || sorted_ranges.empty?

  values = []
  sorted_ranges.each do |record|
    break if record.range_first > cp

    values << record.value if cp <= record.range_last
  end
  values
end
504
+
372
505
  # ---- Range lookup (bsearch) ----------------------------------------
373
506
 
374
507
  # Finds the range-containing record in a sorted array via bsearch.
data/lib/ucode/error.rb CHANGED
@@ -12,7 +12,10 @@ module Ucode
12
12
  # Ucode::Error
13
13
  # ├── Ucode::FetchError
14
14
  # │ ├── Ucode::NetworkError
15
- # │ └── Ucode::ChecksumError
15
+ # │ ├── Ucode::ChecksumError
16
+ # │ │ └── Ucode::FontChecksumError
17
+ # │ ├── Ucode::FontLicenseError
18
+ # │ └── Ucode::FontExtractMemberMissingError
16
19
  # ├── Ucode::ParseError
17
20
  # │ ├── Ucode::MalformedLineError
18
21
  # │ └── Ucode::UnknownPropertyError
@@ -24,7 +27,8 @@ module Ucode
24
27
  # ├── Ucode::PdfRenderError
25
28
  # ├── Ucode::GridDetectionError
26
29
  # ├── Ucode::LastResortMissingError
27
- # └── Ucode::EmbeddedFontsMissingError
30
+ # ├── Ucode::EmbeddedFontsMissingError
31
+ # └── Ucode::UniversalSetPreBuildError
28
32
  class Error < StandardError
29
33
  attr_reader :context
30
34
 
@@ -56,6 +60,22 @@ module Ucode
56
60
  # Checksum or integrity failure.
57
61
  class ChecksumError < FetchError; end
58
62
 
63
+ # SHA256 of a downloaded specialist font does not match the value
64
+ # declared in `config/specialist_fonts.yml`. Distinct from
65
+ # {ChecksumError} so callers can rescue the font-pipeline failure
66
+ # without catching every generic checksum mismatch.
67
+ class FontChecksumError < ChecksumError; end
68
+
69
+ # A specialist font has a non-OFL license and the caller did not
70
+ # pass `--allow-proprietary`. Hard guard against pulling
71
+ # non-redistributable fonts into `data/fonts/`.
72
+ class FontLicenseError < FetchError; end
73
+
74
+ # An `extract: true` manifest entry's `extract_member` is missing
75
+ # from the downloaded zip. The zip was fetched correctly but does
76
+ # not contain what we expected.
77
+ class FontExtractMemberMissingError < FetchError; end
78
+
59
79
  # Parse-time failures.
60
80
  class ParseError < Error; end
61
81
 
@@ -93,4 +113,12 @@ module Ucode
93
113
  # The Code Charts PDF (per-block or monolith) cannot be located, or
94
114
  # `mutool` is not installed on the PATH.
95
115
  class EmbeddedFontsMissingError < GlyphError; end
116
+
117
+ # Pre-build validation failed for a universal-set build. The
118
+ # context carries the failing checks so the CLI can render a
119
+ # useful diagnostic without re-running them. Distinct from
120
+ # {EmbeddedFontsMissingError} because pre-build covers more than
121
+ # just PDFs: source config schema, font file presence, coverage
122
+ # assertion.
123
+ class UniversalSetPreBuildError < GlyphError; end
96
124
  end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  module Fetch
    module FontFetcher
      STATUSES = %i[downloaded skipped failed local planned].freeze
      private_constant :STATUSES

      # Value object describing what happened to a single font.
      #
      # `status` must be one of the symbols below; anything else raises
      # ArgumentError at construction time:
      #
      # - `:downloaded`
      # - `:skipped`
      # - `:failed`
      # - `:local`
      # - `:planned`
      #
      # The remaining members (`label`, `path`, `size_bytes`, `license`,
      # `provenance`, `error`, `note`) are optional keyword arguments.
      # Each status has a matching predicate (`downloaded?`, `failed?`, …).
      Result = Struct.new(:status, :label, :path, :size_bytes, :license,
                          :provenance, :error, :note, keyword_init: true) do
        def initialize(status:, **opts)
          known = STATUSES.include?(status)
          raise ArgumentError, "unknown FontFetcher::Result status: #{status.inspect}" unless known

          super
        end

        # One predicate per known status.
        STATUSES.each do |name|
          define_method(:"#{name}?") { status == name }
        end
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
module Ucode
  module Fetch
    # Namespace for the font-fetching pipeline.
    #
    # It owns the shared {Result} value object. Concrete fetchers are
    # sibling classes under {Ucode::Fetch} (for example
    # {SpecialistFontFetcher}) and report their outcome as {Result}
    # instances, so supporting a new font source means adding a new fetcher
    # class rather than extending a base class.
    module FontFetcher
      # Loaded lazily on first reference to FontFetcher::Result.
      autoload :Result, "ucode/fetch/font_fetcher/result"
    end
  end
end
@@ -0,0 +1,280 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "digest"
4
+ require "fileutils"
5
+ require "pathname"
6
+ require "tmpdir"
7
+ require "zip"
8
+
9
+ require "ucode/error"
10
+ require "ucode/fetch/font_fetcher"
11
+ require "ucode/fetch/http"
12
+ require "ucode/models/specialist_font_manifest"
13
+
14
module Ucode
  module Fetch
    # Concrete font fetcher: walks a {Models::SpecialistFontManifest} and
    # materializes each font's `path` on disk.
    #
    # Behavior:
    #
    # - **Idempotent.** A font whose `path` already exists with the
    #   manifest's SHA256 is `:skipped`. A file with a mismatched hash is
    #   re-downloaded.
    # - **Hashed.** On download, SHA256 is computed. If the manifest has a
    #   hash, a mismatch produces a `:failed` result carrying
    #   {Ucode::FontChecksumError}. If the manifest hash is null, the
    #   computed hash is written back to the YAML at the end of the run.
    # - **Staged.** Bytes are downloaded (or extracted) to a `.part` file
    #   next to the destination and renamed into place only after the hash
    #   check, so a failed or interrupted fetch never leaves unverified
    #   bytes at `path`.
    # - **License-checked.** Non-OFL entries require
    #   `allow_proprietary: true`; otherwise the result is `:failed` with
    #   {Ucode::FontLicenseError}. Dry runs apply the same check so they
    #   report what a real run would do.
    # - **Extracted.** `extract: true` entries unzip to a temp dir and only
    #   `extract_member` is moved into place.
    # - **Local-only.** `url: null` entries are never fetched over the
    #   network; the result is `:local` whether or not the file is present
    #   yet (with a `note` instructing placement when missing).
    #
    # A single font failure does not abort the run. The fetcher returns an
    # array of {FontFetcher::Result}; the caller decides how to report
    # failures.
    class SpecialistFontFetcher
      # @param manifest_path [String, Pathname] path to the YAML manifest.
      #   The file is rewritten in place when SHA256 hashes are populated.
      # @param fonts_root [String, Pathname] root for relative `path:`
      #   values (download destinations and local-only fonts alike).
      #   Defaults to the current working directory. Absolute and
      #   `~`-prefixed manifest paths bypass it.
      # @param allow_proprietary [Boolean] when true, non-OFL entries are
      #   fetched; when false, they produce a `:failed` result.
      # @param dry_run [Boolean] when true, no network or disk writes;
      #   each font that would have been fetched yields a `:planned`
      #   result.
      # @param http [Module, nil] injectable HTTP module responding to
      #   `.get(url, dest:)`. Defaults to {Fetch::Http}.
      def initialize(manifest_path:, fonts_root: ".", allow_proprietary: false,
                     dry_run: false, http: Fetch::Http)
        @manifest_path = Pathname.new(manifest_path)
        @fonts_root = Pathname.new(fonts_root)
        @allow_proprietary = allow_proprietary
        @dry_run = dry_run
        @http = http
        @computed_hashes = {}
      end

      # @param only_label [String, nil] restrict the run to a single
      #   manifest entry by label. nil (default) = run all entries.
      # @return [Array<FontFetcher::Result>] one per manifest entry
      #   actually visited, in declared order. An unknown `only_label`
      #   yields a single `:failed` result.
      def call(only_label: nil)
        manifest = load_manifest
        entries = entries_for(manifest, only_label)
        return [unknown_label_result(only_label)] if entries.nil?

        results = entries.map { |font| fetch_one(font) }
        persist_computed_hashes(manifest) unless @dry_run
        results
      end

      private

      def load_manifest
        Ucode::Models::SpecialistFontManifest.from_yaml(@manifest_path.read)
      end

      # Returns the fonts to visit, or nil when `only_label` names no entry.
      def entries_for(manifest, only_label)
        return manifest.fonts unless only_label

        font = manifest.find_by_label(only_label)
        font ? [font] : nil
      end

      def unknown_label_result(label)
        FontFetcher::Result.new(
          status: :failed,
          label: label,
          error: Ucode::LookupError.new(
            "label #{label.inspect} is not in #{@manifest_path}",
            context: { manifest: @manifest_path.to_s, requested_label: label },
          ),
        )
      end

      # Dispatches one manifest entry. The license gate runs before the
      # dry-run branch so a dry run never reports `:planned` for a font the
      # real run would refuse.
      def fetch_one(font)
        return local_result(font) if font.local_only?

        refusal = license_refusal(font)
        return refusal if refusal
        return dry_run_result(font) if @dry_run

        download_result(font)
      end

      # Returns a `:failed` result for a non-OFL font when proprietary
      # fonts are not allowed, nil otherwise.
      def license_refusal(font)
        return nil if font.ofl? || @allow_proprietary

        failed_result(
          font,
          Ucode::FontLicenseError.new(
            "#{font.label} license=#{font.license.inspect} requires --allow-proprietary",
            context: { label: font.label, license: font.license },
          ),
        )
      end

      def local_result(font)
        existing = expand_local_path(font.path).find(&:exist?)
        if existing
          FontFetcher::Result.new(
            status: :local,
            label: font.label,
            path: existing,
            size_bytes: existing.size,
            license: font.license,
            provenance: font.provenance,
          )
        else
          FontFetcher::Result.new(
            status: :local,
            label: font.label,
            path: font.path,
            license: font.license,
            provenance: font.provenance,
            note: "place at #{font.path}",
          )
        end
      end

      def dry_run_result(font)
        existing = destination_path(font)
        if existing&.exist? && hash_matches?(existing, font)
          FontFetcher::Result.new(status: :skipped, label: font.label,
                                  path: existing, license: font.license,
                                  provenance: font.provenance)
        else
          FontFetcher::Result.new(status: :planned, label: font.label,
                                  path: destination_for_display(font),
                                  license: font.license,
                                  provenance: font.provenance)
        end
      end

      # Downloads one font, converting every failure into a `:failed`
      # result so the rest of the run can continue.
      def download_result(font)
        dest = destination_path(font)
        if dest.nil?
          raise Ucode::FetchError.new(
            "#{font.label} has no destination path in #{@manifest_path}",
            context: { label: font.label, manifest: @manifest_path.to_s },
          )
        end
        return skipped_result(font, dest) if dest.exist? && hash_matches?(dest, font)

        download_and_install(font, dest)
      rescue Ucode::Error => e
        failed_result(font, e)
      rescue StandardError => e
        failed_result(
          font,
          Ucode::FetchError.new(
            "#{font.label} fetch failed: #{e.class}: #{e.message}",
            context: { label: font.label, original: e.class.name },
          ),
        )
      end

      def failed_result(font, error)
        FontFetcher::Result.new(status: :failed, label: font.label,
                                license: font.license, error: error)
      end

      def skipped_result(font, dest)
        FontFetcher::Result.new(status: :skipped, label: font.label,
                                path: dest, size_bytes: dest.size,
                                license: font.license, provenance: font.provenance)
      end

      # Fetches into a `.part` file beside `dest`, verifies it, then renames
      # it into place. The staging file is removed on every exit path, so
      # neither a checksum mismatch nor an interrupted transfer can leave
      # unverified bytes where a later run would accept them.
      def download_and_install(font, dest)
        dest.dirname.mkpath
        staging = dest.dirname.join("#{dest.basename}.part")
        # Clear leftovers from a run that was killed mid-transfer.
        staging.delete if staging.exist?
        begin
          if font.extract?
            download_and_extract(font, staging)
          else
            @http.get(font.url, dest: staging.to_s)
          end
          verify_or_record_hash(font, staging)
          File.rename(staging.to_s, dest.to_s)
        ensure
          staging.delete if staging.exist?
        end
        FontFetcher::Result.new(status: :downloaded, label: font.label,
                                path: dest, size_bytes: dest.size,
                                license: font.license, provenance: font.provenance)
      end

      def download_and_extract(font, dest)
        Dir.mktmpdir("ucode-font-") do |tmp|
          zip_path = File.join(tmp, "download.zip")
          @http.get(font.url, dest: zip_path)
          extract_member(zip_path, font.extract_member, dest)
        end
      end

      # Copies one archive member to `dest`. The member is matched by its
      # exact name first, then by exact file name inside any directory of
      # the archive. A bare suffix match is deliberately not used: it would
      # accept "OtherFont.ttf" when "Font.ttf" was requested.
      def extract_member(zip_path, member_name, dest)
        Zip::File.open(zip_path) do |zip|
          entry = zip.find_entry(member_name) ||
                  zip.find { |e| !e.directory? && e.name.end_with?("/#{member_name}") }
          unless entry
            raise Ucode::FontExtractMemberMissingError.new(
              "zip #{File.basename(zip_path)} does not contain #{member_name.inspect}",
              context: { zip: zip_path, expected_member: member_name },
            )
          end

          entry.get_input_stream do |input|
            File.open(dest, "wb") { |out| IO.copy_stream(input, out) }
          end
        end
      end

      # Raises {Ucode::FontChecksumError} when the manifest declares a hash
      # that differs from the file's; remembers the computed hash for the
      # manifest write-back when the manifest declares none.
      def verify_or_record_hash(font, dest)
        actual = sha256_of(dest)
        if font.hash_known?
          unless actual.casecmp?(font.sha256)
            raise Ucode::FontChecksumError.new(
              "#{font.label} SHA256 mismatch: expected #{font.sha256}, got #{actual}",
              context: { label: font.label, expected: font.sha256, actual: actual },
            )
          end
        else
          @computed_hashes[font.label] = actual
        end
      end

      # An entry without a declared hash cannot be checked, so any existing
      # file counts as a match.
      def hash_matches?(path, font)
        return true unless font.hash_known?

        sha256_of(path).casecmp?(font.sha256)
      end

      def sha256_of(path)
        Digest::SHA256.file(path.to_s).hexdigest
      end

      def destination_path(font)
        return nil if font.path.nil? || font.path.empty?

        path = expand_path_for_display(font.path)
        return path if path.absolute?

        @fonts_root.join(path)
      end

      def destination_for_display(font)
        destination_path(font) || font.path
      end

      def expand_path_for_display(raw)
        return Pathname.new(raw) unless raw.start_with?("~")

        Pathname.new(File.expand_path(raw))
      end

      # Expands a local-only path (which may be a glob) to the matching
      # files. Relative paths resolve against `fonts_root`, like download
      # destinations; absolute and `~` paths ignore it.
      def expand_local_path(raw)
        expanded = File.expand_path(raw, @fonts_root.to_s)
        Dir.glob(expanded).map { |p| Pathname.new(p) }
      end

      def persist_computed_hashes(manifest)
        return if @computed_hashes.empty?

        manifest.fonts.each do |font|
          next unless @computed_hashes.key?(font.label)

          font.sha256 = @computed_hashes[font.label]
        end
        atomic_write(@manifest_path, manifest.to_yaml)
      end

      # Writes to a sibling temp file and renames it over `path`. The temp
      # file is removed if the write or rename fails.
      def atomic_write(path, content)
        tmp = path.dirname.join("#{path.basename}.tmp")
        tmp.write(content)
        File.rename(tmp.to_s, path.to_s)
      ensure
        tmp.delete if tmp&.exist?
      end
    end
  end
end
data/lib/ucode/fetch.rb CHANGED
@@ -1,14 +1,18 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ucode
4
- # Fetchers — download UCD.zip, Unihan.zip, and per-block Code Charts PDFs.
4
+ # Fetchers — download UCD.zip, Unihan.zip, per-block Code Charts
5
+ # PDFs, and the specialist Tier 1 fonts that fontist's formula
6
+ # index does not carry.
5
7
  #
6
- # OCP: Http is the single network boundary. New source types add a new
7
- # Fetcher class that calls Http.get; no new HTTP stack.
8
+ # OCP: Http is the single network boundary. New source types add a
9
+ # new Fetcher class that calls Http.get; no new HTTP stack.
8
10
  module Fetch
9
11
  autoload :Http, "ucode/fetch/http"
10
12
  autoload :UcdZip, "ucode/fetch/ucd_zip"
11
13
  autoload :UnihanZip, "ucode/fetch/unihan_zip"
12
14
  autoload :CodeCharts, "ucode/fetch/code_charts"
15
+ autoload :FontFetcher, "ucode/fetch/font_fetcher"
16
+ autoload :SpecialistFontFetcher, "ucode/fetch/specialist_font_fetcher"
13
17
  end
14
18
  end
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+
5
+ require "fontisan"
6
+
7
+ require "ucode/glyphs/real_fonts/font_locator"
8
+ require "ucode/models/glyph_source"
9
+
10
module Ucode
  module Glyphs
    module RealFonts
      # Per-label cache of the codepoints each font's cmap maps.
      #
      # The first coverage query for a source locates and parses its font;
      # later queries for the same `label` reuse the cached set, so a font
      # shared by several blocks is parsed once.
      #
      # A font that cannot be located, fails to load, or has no cmap table
      # is cached as an empty set: every query against it answers false
      # instead of raising.
      class CmapCache
        # @param font_locator [FontLocator] injectable for testing. It is
        #   always called with `install: false`.
        def initialize(font_locator: FontLocator.new)
          @font_locator = font_locator
          @cmaps = {}
        end

        # @param source [Ucode::Models::GlyphSource]
        # @param codepoint [Integer]
        # @return [Boolean] true when the source's cmap maps the codepoint;
        #   false when it does not or the font could not be read.
        def covers?(source, codepoint)
          cmap_for(source).include?(codepoint)
        end

        private

        def cmap_for(source)
          @cmaps[source.label] ||= load_cmap(source)
        end

        # Any failure while locating or parsing the font degrades to an
        # empty set.
        def load_cmap(source)
          font_path = resolve_path(source)
          cmap_table = font_path &&
                       Fontisan::FontLoader.load(font_path.to_s).table(Fontisan::Constants::CMAP_TAG)
          cmap_table ? cmap_table.unicode_mappings.keys.to_set : Set.new
        rescue StandardError
          Set.new
        end

        def resolve_path(source)
          located = @font_locator.locate(source.to_font_spec, install: false)
          located.nil? ? nil : located.path
        rescue StandardError
          nil
        end
      end
    end
  end
end
@@ -26,6 +26,7 @@ module Ucode
26
26
  "ucode/glyphs/real_fonts/font_coverage_report"
27
27
  autoload :FontLocator, "ucode/glyphs/real_fonts/font_locator"
28
28
  autoload :CoverageAuditor, "ucode/glyphs/real_fonts/coverage_auditor"
29
+ autoload :CmapCache, "ucode/glyphs/real_fonts/cmap_cache"
29
30
  autoload :Writer, "ucode/glyphs/real_fonts/writer"
30
31
  end
31
32
  end