@deftai/directive-content 0.55.1 → 0.56.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. package/.githooks/pre-commit +143 -0
  2. package/.githooks/pre-push +121 -0
  3. package/QUICK-START.md +13 -3
  4. package/Taskfile.yml +934 -0
  5. package/UPGRADING.md +82 -11
  6. package/events/README.md +3 -3
  7. package/package.json +5 -4
  8. package/packs/skills/skills-pack-0.1.json +22 -22
  9. package/scripts/_agents_md.py +494 -0
  10. package/scripts/_cache_fetch.py +635 -0
  11. package/scripts/_cache_quota.py +529 -0
  12. package/scripts/_cache_refresh.py +163 -0
  13. package/scripts/_cache_validate.py +209 -0
  14. package/scripts/_content_root.py +42 -0
  15. package/scripts/_doctor_state.py +277 -0
  16. package/scripts/_event_detect.py +305 -0
  17. package/scripts/_events.py +514 -0
  18. package/scripts/_lifecycle_hygiene.py +568 -0
  19. package/scripts/_pathspec.py +91 -0
  20. package/scripts/_policy_show_cli.py +266 -0
  21. package/scripts/_precutover.py +92 -0
  22. package/scripts/_project_context.py +224 -0
  23. package/scripts/_project_definition_io.py +164 -0
  24. package/scripts/_relocate_snapshot.py +209 -0
  25. package/scripts/_relocate_states.py +343 -0
  26. package/scripts/_resolve_preflight_path.py +152 -0
  27. package/scripts/_safe_subprocess.py +167 -0
  28. package/scripts/_session_start_hook.py +205 -0
  29. package/scripts/_sor_gate_diff.py +365 -0
  30. package/scripts/_stdio_utf8.py +59 -0
  31. package/scripts/_triage_bootstrap_gitignore.py +904 -0
  32. package/scripts/_triage_classify_cli.py +122 -0
  33. package/scripts/_triage_queue_cli.py +625 -0
  34. package/scripts/_triage_scope_cli.py +343 -0
  35. package/scripts/_triage_scope_drift_cli.py +121 -0
  36. package/scripts/_triage_scope_ignores.py +286 -0
  37. package/scripts/_triage_scope_milestone.py +432 -0
  38. package/scripts/_triage_scope_mutations.py +337 -0
  39. package/scripts/_triage_scope_renderers.py +207 -0
  40. package/scripts/_triage_smoketest_stages.py +674 -0
  41. package/scripts/_triage_subscribe_cli.py +140 -0
  42. package/scripts/_triage_welcome_cli.py +421 -0
  43. package/scripts/_vbrief_build.py +239 -0
  44. package/scripts/_vbrief_fidelity.py +479 -0
  45. package/scripts/_vbrief_legacy.py +589 -0
  46. package/scripts/_vbrief_reconciliation.py +883 -0
  47. package/scripts/_vbrief_routing.py +277 -0
  48. package/scripts/_vbrief_safety.py +778 -0
  49. package/scripts/_vbrief_sources.py +312 -0
  50. package/scripts/_vbrief_speckit.py +262 -0
  51. package/scripts/_vbrief_story_quality.py +353 -0
  52. package/scripts/_vbrief_validation.py +299 -0
  53. package/scripts/build_dist.py +412 -0
  54. package/scripts/cache.py +1078 -0
  55. package/scripts/cache_scanner.py +745 -0
  56. package/scripts/candidates_log.py +432 -0
  57. package/scripts/capacity_backfill.py +680 -0
  58. package/scripts/capacity_show.py +653 -0
  59. package/scripts/ci_local.py +689 -0
  60. package/scripts/code_structure_validate.py +765 -0
  61. package/scripts/codebase_default_extractor.py +495 -0
  62. package/scripts/codebase_map.py +304 -0
  63. package/scripts/codebase_map_fresh.py +104 -0
  64. package/scripts/codebase_projection_registry.py +94 -0
  65. package/scripts/codebase_provider.py +582 -0
  66. package/scripts/doctor.py +2257 -0
  67. package/scripts/framework_commands.py +505 -0
  68. package/scripts/gh_rest.py +882 -0
  69. package/scripts/github_auth_modes.py +437 -0
  70. package/scripts/github_body.py +292 -0
  71. package/scripts/ip_risk.py +531 -0
  72. package/scripts/issue_emit.py +670 -0
  73. package/scripts/issue_ingest.py +1064 -0
  74. package/scripts/migrate_preflight.py +418 -0
  75. package/scripts/migrate_vbrief.py +2677 -0
  76. package/scripts/monitor_pr.py +401 -0
  77. package/scripts/pack_migrate_lessons.py +336 -0
  78. package/scripts/pack_migrate_patterns.py +254 -0
  79. package/scripts/pack_migrate_rules.py +350 -0
  80. package/scripts/pack_migrate_skills.py +423 -0
  81. package/scripts/pack_migrate_strategies.py +311 -0
  82. package/scripts/pack_migrate_swarm_spec.py +250 -0
  83. package/scripts/pack_render.py +434 -0
  84. package/scripts/packs_slice.py +712 -0
  85. package/scripts/platform_capabilities.py +336 -0
  86. package/scripts/policy.py +2826 -0
  87. package/scripts/policy_set.py +324 -0
  88. package/scripts/pr_check_closing_keywords.py +524 -0
  89. package/scripts/pr_check_protected_issues.py +267 -0
  90. package/scripts/pr_merge_readiness.py +1004 -0
  91. package/scripts/pr_wait_mergeable.py +669 -0
  92. package/scripts/prd_render.py +159 -0
  93. package/scripts/preflight_architecture_sor.py +974 -0
  94. package/scripts/preflight_branch.py +289 -0
  95. package/scripts/preflight_cache.py +974 -0
  96. package/scripts/preflight_gh.py +721 -0
  97. package/scripts/preflight_implementation.py +272 -0
  98. package/scripts/preflight_story_start.py +838 -0
  99. package/scripts/preflight_wip_cap.py +149 -0
  100. package/scripts/probe_session.py +545 -0
  101. package/scripts/project_render.py +293 -0
  102. package/scripts/quarantine_ext.py +237 -0
  103. package/scripts/reconcile_issues.py +1442 -0
  104. package/scripts/refresh-path.ps1 +107 -0
  105. package/scripts/release.py +2030 -0
  106. package/scripts/release_e2e.py +1011 -0
  107. package/scripts/release_publish.py +486 -0
  108. package/scripts/release_rollback.py +980 -0
  109. package/scripts/relocate.py +1034 -0
  110. package/scripts/resolve_changelog_unreleased.py +667 -0
  111. package/scripts/resolve_version.py +490 -0
  112. package/scripts/resume_conditions.py +706 -0
  113. package/scripts/ritual_sentinel.py +609 -0
  114. package/scripts/roadmap_render.py +635 -0
  115. package/scripts/rule_ownership_lint.py +325 -0
  116. package/scripts/scm.py +591 -0
  117. package/scripts/scope_audit_log.py +387 -0
  118. package/scripts/scope_decompose.py +654 -0
  119. package/scripts/scope_demote.py +509 -0
  120. package/scripts/scope_lifecycle.py +1126 -0
  121. package/scripts/scope_undo.py +772 -0
  122. package/scripts/session_start.py +406 -0
  123. package/scripts/setup_ghx.py +339 -0
  124. package/scripts/setup_windows.ps1 +220 -0
  125. package/scripts/slice_audit.py +585 -0
  126. package/scripts/slice_record.py +530 -0
  127. package/scripts/slice_record_existing.py +692 -0
  128. package/scripts/slug_normalize.py +178 -0
  129. package/scripts/spec_render.py +477 -0
  130. package/scripts/spec_validate.py +238 -0
  131. package/scripts/subagent_monitor.py +658 -0
  132. package/scripts/swarm_complete_cohort.py +644 -0
  133. package/scripts/swarm_launch.py +1206 -0
  134. package/scripts/swarm_readiness.py +554 -0
  135. package/scripts/swarm_verify_review_clean.py +438 -0
  136. package/scripts/swarm_worktrees.py +497 -0
  137. package/scripts/toolchain-check.py +52 -0
  138. package/scripts/triage_actions.py +871 -0
  139. package/scripts/triage_bootstrap.py +1153 -0
  140. package/scripts/triage_bulk.py +630 -0
  141. package/scripts/triage_classify.py +932 -0
  142. package/scripts/triage_help.py +1685 -0
  143. package/scripts/triage_queue.py +1944 -0
  144. package/scripts/triage_reconcile.py +581 -0
  145. package/scripts/triage_refresh.py +643 -0
  146. package/scripts/triage_scope.py +999 -0
  147. package/scripts/triage_scope_drift.py +575 -0
  148. package/scripts/triage_smoketest.py +396 -0
  149. package/scripts/triage_subscribe.py +399 -0
  150. package/scripts/triage_summary.py +1011 -0
  151. package/scripts/triage_welcome.py +1178 -0
  152. package/scripts/ts_check_lane.py +86 -0
  153. package/scripts/validate-links.py +64 -0
  154. package/scripts/validate_strategy_output.py +212 -0
  155. package/scripts/vbrief_activate.py +228 -0
  156. package/scripts/vbrief_migrate_conformance.py +368 -0
  157. package/scripts/vbrief_reconcile_graph.py +306 -0
  158. package/scripts/vbrief_reconcile_labels.py +460 -0
  159. package/scripts/vbrief_reconcile_umbrellas.py +741 -0
  160. package/scripts/vbrief_validate.py +1195 -0
  161. package/scripts/verify-stubs.py +61 -0
  162. package/scripts/verify_capacity.py +160 -0
  163. package/scripts/verify_encoding.py +699 -0
  164. package/scripts/verify_hooks_installed.py +206 -0
  165. package/scripts/verify_investigation.py +360 -0
  166. package/scripts/verify_judgment_gates.py +827 -0
  167. package/scripts/verify_no_task_runtime.py +171 -0
  168. package/scripts/verify_scm_boundary.py +509 -0
  169. package/scripts/verify_session_ritual.py +389 -0
  170. package/scripts/verify_tools.py +426 -0
  171. package/scripts/verify_vbrief_conformance.py +478 -0
  172. package/skills/deft-directive-swarm/SKILL.md +7 -26
  173. package/skills/deft-directive-sync/SKILL.md +1 -1
  174. package/tasks/architecture.yml +13 -0
  175. package/tasks/cache.yml +69 -0
  176. package/tasks/capacity.yml +38 -0
  177. package/tasks/change.yml +46 -0
  178. package/tasks/changelog.yml +24 -0
  179. package/tasks/ci.yml +49 -0
  180. package/tasks/codebase.yml +47 -0
  181. package/tasks/commit.yml +30 -0
  182. package/tasks/core.yml +126 -0
  183. package/tasks/deployments.yml +54 -0
  184. package/tasks/framework.yml +74 -0
  185. package/tasks/install.yml +60 -0
  186. package/tasks/issue.yml +50 -0
  187. package/tasks/migrate.yml +73 -0
  188. package/tasks/packs.yml +92 -0
  189. package/tasks/policy.yml +75 -0
  190. package/tasks/pr.yml +89 -0
  191. package/tasks/prd.yml +39 -0
  192. package/tasks/project.yml +27 -0
  193. package/tasks/reconcile.yml +32 -0
  194. package/tasks/relocate.yml +56 -0
  195. package/tasks/roadmap.yml +28 -0
  196. package/tasks/scm.yml +126 -0
  197. package/tasks/scope-undo.yml +36 -0
  198. package/tasks/scope.yml +141 -0
  199. package/tasks/session.yml +19 -0
  200. package/tasks/setup.yml +37 -0
  201. package/tasks/slice.yml +69 -0
  202. package/tasks/spec.yml +41 -0
  203. package/tasks/swarm.yml +85 -0
  204. package/tasks/toolchain.yml +13 -0
  205. package/tasks/triage-actions.yml +94 -0
  206. package/tasks/triage-bootstrap.yml +43 -0
  207. package/tasks/triage-bulk.yml +75 -0
  208. package/tasks/triage-classify.yml +30 -0
  209. package/tasks/triage-queue.yml +50 -0
  210. package/tasks/triage-reconcile.yml +29 -0
  211. package/tasks/triage-scope-drift.yml +29 -0
  212. package/tasks/triage-scope.yml +31 -0
  213. package/tasks/triage-smoketest.yml +33 -0
  214. package/tasks/triage-subscribe.yml +36 -0
  215. package/tasks/triage-summary.yml +29 -0
  216. package/tasks/triage-welcome.yml +32 -0
  217. package/tasks/ts.yml +328 -0
  218. package/tasks/vbrief.yml +206 -0
  219. package/tasks/verify.yml +292 -0
  220. package/templates/agents-entry.md +2 -2
@@ -0,0 +1,699 @@
1
+ #!/usr/bin/env python3
2
+ """verify_encoding.py -- deterministic gate against PS 5.1 non-ASCII round-trip corruption (#798).
3
+
4
+ Pure stdlib, cross-platform. Invoked from:
5
+
6
+ - ``.githooks/pre-commit`` via ``--staged`` after ``preflight_branch.py`` (#747)
7
+ - ``task verify:encoding`` (aggregated into ``task check``) via ``--all``
8
+ - ``uv run python scripts/verify_encoding.py [--staged|--all] [--allow-list <path>]``
9
+
10
+ Recurrence chain (this gate elevates the rule from prose tier to deterministic
11
+ tier per main.md Rule Authority [AXIOM]):
12
+
13
+ - #236 t1.11.1 -- ``Get-Content -Raw`` + BOM-safe write rules in scm/github.md
14
+ - #240 t1.11.2 -- Warp multi-line PS here-string rule in scm/github.md
15
+ - #283 t1.20.1 -- ``New-Object System.Text.UTF8Encoding $false`` rule in AGENTS.md
16
+ - PR #795 (2026-05-01) -- 132-line CHANGELOG mojibake on the same maintainer
17
+ with all three rules loaded, because the corruption happened on the READ side
18
+ (``Get-Content -Raw`` decodes via the active codepage, typically CP1252 or
19
+ CP437 on Windows) BEFORE any safe write could preserve the bytes.
20
+
21
+ Detection scope (UTF-8 codepoint sequences that appear after a Windows
22
+ codepage round-trip):
23
+
24
+ - U+FFFD replacement chars (universal corruption marker).
25
+ - CP1252-as-UTF-8 mojibake bigrams (``Â§``, ``Â°``, ``â€™``, ``â€¦``, ``â†’`` ...).
26
+ - CP437-as-UTF-8 mojibake bigrams (``Γèù``, ``Γ£ô``, ``ΓÇª``, ``ΓÇö`` ...).
27
+ - Unexpected UTF-8 BOM (``EF BB BF``) on text formats where BOM is non-canonical
28
+ (.md, .json, .yml, .yaml, .txt).
29
+
30
+ False-positive guards:
31
+
32
+ - Markdown inline code spans (single backticks) and fenced code blocks (triple
33
+ backticks) are stripped before scanning .md files -- recurrence-record prose
34
+ legitimately quotes mojibake bytes inside backticks.
35
+ - A built-in allow-list skips the #798 brief itself (which documents the
36
+ bigram catalog as part of its acceptance criteria).
37
+ - ``--allow-list <path>`` accepts a newline-separated list of glob patterns
38
+ for project-specific documented exceptions (e.g. regression fixtures).
39
+
40
+ Exit codes (three-state, mirrors ``scripts/preflight_branch.py``):
41
+
42
+ - ``0`` -- clean: no mojibake / U+FFFD / unexpected BOM detected.
43
+ - ``1`` -- corruption found: prints per-hit ``path:line:[label] context``.
44
+ - ``2`` -- config error: ``--allow-list`` path unreadable, ``--staged``
45
+ outside a git repo, or unrecognised CLI shape.
46
+ """
47
+
48
+ from __future__ import annotations
49
+
50
+ import argparse
51
+ import fnmatch
52
+ import json
53
+ import re
54
+ import subprocess
55
+ import sys
56
+ from collections.abc import Iterable
57
+ from pathlib import Path
58
+
59
#: Codepoint sequences that signal a Windows codepage round-trip corruption.
#: Each entry maps a mojibake sequence to a short label naming the canonical
#: codepoint that was corrupted. The set is intentionally CONSERVATIVE --
#: only the sequences observed in the four-recurrence record (#236, #240, #283,
#: PR #795 / #844) plus the most common Windows-codepage analogues are listed.
#: Adding a pattern here MUST be paired with a parametrized regression test
#: in ``tests/cli/test_verify_encoding.py``.
#:
#: Every key is exactly ``canonical.encode("utf-8").decode(<codepage>)``. Keys
#: are spelled with ``\u`` / ``\x`` escapes on purpose: a literal mojibake
#: sequence in this file can be silently "repaired" by editors or text-fixing
#: tooling into the canonical character, which would turn the gate into a
#: detector for every *legitimate* use of that character. The trailing comment
#: on each entry gives the UTF-8 bytes of the canonical codepoint.
MOJIBAKE_PATTERNS: dict[str, str] = {
    # CP437-as-UTF-8 (Windows DOS codepage; recurrence record PR #844 / fix #846).
    # Pattern: original UTF-8 bytes E2 XX YY decoded by cp437 yield U+0393
    # (Greek capital gamma) followed by two cp437 glyphs.
    "\u0393\u00e8\u00f9": "U+2297 (⊗) corrupted via cp437 read",  # E2 8A 97
    "\u0393\u00a3\u00f4": "U+2713 (✓) corrupted via cp437 read",  # E2 9C 93
    "\u0393\u00c7\u00aa": "U+2026 (…) corrupted via cp437 read",  # E2 80 A6
    "\u0393\u00c7\u00f6": "U+2014 (—) corrupted via cp437 read",  # E2 80 94
    "\u0393\u00c7\u00f4": "U+2013 (–) corrupted via cp437 read",  # E2 80 93
    "\u0393\u00c7\u00f3": "U+2022 (•) corrupted via cp437 read",  # E2 80 A2
    "\u0393\u00c7\u00d6": "U+2019 (’) corrupted via cp437 read",  # E2 80 99
    "\u0393\u00c7\u00ff": "U+2018 (‘) corrupted via cp437 read",  # E2 80 98
    "\u0393\u00c7\u00a3": "U+201C (“) corrupted via cp437 read",  # E2 80 9C
    "\u0393\u00c7\u00a5": "U+201D (”) corrupted via cp437 read",  # E2 80 9D
    "\u0393\u00e5\u00c6": "U+2192 (→) corrupted via cp437 read",  # E2 86 92
    # CP1252-as-UTF-8 (Windows ANSI codepage; recurrence record #236, #240, #283, PR #795).
    # Pattern: original UTF-8 bytes (typically prefixed C2/C3/E2) decoded by cp1252.
    "\u00e2\u20ac\u2122": "U+2019 (’) corrupted via cp1252 read",  # E2 80 99
    "\u00e2\u20ac\u02dc": "U+2018 (‘) corrupted via cp1252 read",  # E2 80 98
    "\u00e2\u20ac\u0153": "U+201C (“) corrupted via cp1252 read",  # E2 80 9C
    # Byte 9D is unassigned in cp1252, so it survives as the C1 control U+009D.
    "\u00e2\u20ac\x9d": "U+201D (”) corrupted via cp1252 read",  # E2 80 9D
    "\u00e2\u20ac\u201c": "U+2013 (–) corrupted via cp1252 read",  # E2 80 93
    "\u00e2\u20ac\u201d": "U+2014 (—) corrupted via cp1252 read",  # E2 80 94
    "\u00e2\u20ac\u00a6": "U+2026 (…) corrupted via cp1252 read",  # E2 80 A6
    "\u00e2\u20ac\u00a2": "U+2022 (•) corrupted via cp1252 read",  # E2 80 A2
    "\u00e2\u2020\u2019": "U+2192 (→) corrupted via cp1252 read",  # E2 86 92
    "\u00c2\u00a7": "U+00A7 (§) corrupted via cp1252 read",  # C2 A7
    "\u00c2\u00b0": "U+00B0 (°) corrupted via cp1252 read",  # C2 B0
    "\u00c2\u00b4": "U+00B4 (´) corrupted via cp1252 read",  # C2 B4
    "\u00c2\u00ad": "U+00AD (soft hyphen) corrupted via cp1252 read",  # C2 AD
    "\u00c2\u00a9": "U+00A9 (©) corrupted via cp1252 read",  # C2 A9
    "\u00c2\u00ae": "U+00AE (®) corrupted via cp1252 read",  # C2 AE
    "\u00c2\u00b1": "U+00B1 (±) corrupted via cp1252 read",  # C2 B1
}
99
+
100
#: U+FFFD REPLACEMENT CHARACTER. Decoding with ``errors='replace'`` emits it
#: for every undecodable byte run, which makes it a corruption marker that
#: does not depend on which codepage did the damage -- hence it is kept
#: separate from MOJIBAKE_PATTERNS.
REPLACEMENT_CHAR = "\ufffd"

#: The three-byte UTF-8 byte-order mark (``EF BB BF``). Tolerated by some
#: formats (.ps1, .csv on Windows), but in markdown / JSON / YAML / plain
#: text it corrupts downstream parsers and is the signature of a PS 5.1
#: ``Set-Content -Encoding UTF8`` write.
UTF8_BOM = b"\xef\xbb\xbf"

#: Extensions for which a leading UTF-8 BOM is non-canonical and gets flagged.
#: Anything else (.csv, .ps1, .bat, ...) tolerates or expects a BOM.
NO_BOM_EXTENSIONS = frozenset((".md", ".json", ".yml", ".yaml", ".txt"))

#: Control characters that must not hide inside decoded vBRIEF narratives,
#: keyed by the decoded character and mapped to the finding label. On disk
#: JSON stores these as escapes (for example ``\u000b``), so the raw text
#: scanner cannot see them; they only surface once the vBRIEF is parsed.
VBRIEF_CONTROL_CHAR_LABELS: dict[str, str] = {
    "\b": "U+0008 backspace in vBRIEF narrative",
    "\t": "U+0009 tab in vBRIEF narrative",
    "\v": "U+000B vertical tab in vBRIEF narrative",
    "\f": "U+000C form feed in vBRIEF narrative",
}

#: Extensions scanned by default. Deliberately conservative: binary formats
#: and source types where mojibake detection has a poorer cost/benefit ratio
#: are left out.
SCANNABLE_EXTENSIONS = frozenset(
    (
        ".md",
        ".json",
        ".yml",
        ".yaml",
        ".txt",
        ".py",
        ".sh",
        ".ps1",
        ".toml",
        ".cfg",
    )
)
141
+
142
#: Checkout layouts the built-in allow-list must cover: the repository root
#: itself plus the two prefixes under which the same tree may be vendored.
_ALLOW_LIST_ROOTS: tuple[str, ...] = ("", ".deft/core/", "deft/")

#: Path-glob patterns auto-skipped because the file legitimately contains
#: mojibake byte sequences as part of its purpose. Each entry is matched
#: against the path's POSIX form (forward slashes) via ``fnmatch.fnmatchcase``.
#: When a future recurrence-record vBRIEF documents a new bigram, add its
#: rel-path here -- the rule body lives in this gate, NOT in prose.
#:
#: Every group below is expanded once per entry of ``_ALLOW_LIST_ROOTS``.
BUILTIN_ALLOW_LIST: tuple[str, ...] = (
    # The #798 brief catalogues the bigram set being detected; quoting
    # the bigrams in its narrative is the brief's own acceptance criterion.
    *(
        f"{root}vbrief/{state}/*-798-*.vbrief.json"
        for root in _ALLOW_LIST_ROOTS
        for state in ("active", "completed", "cancelled", "pending", "proposed")
    ),
    # history/archive/ preserves historical task / vbrief state byte-for-byte.
    # Pre-existing mojibake in archived artifacts (e.g. v0.20 migration residue)
    # is intentionally retained as a forensic record and MUST NOT be rewritten.
    *(
        f"{root}history/archive/{tail}"
        for root in _ALLOW_LIST_ROOTS
        for tail in ("**", "**/*")
    ),
    # Self-skip: this script and its test file are the canonical catalog of
    # the bigrams being detected. Scanning them would flag every entry in
    # MOJIBAKE_PATTERNS as a hit against the file that defines it. The
    # forward-coverage contract is upheld by tests/cli/test_verify_encoding.py
    # (parametrized over MOJIBAKE_PATTERNS), not by the gate scanning itself.
    *(
        f"{root}{rel}"
        for root in _ALLOW_LIST_ROOTS
        for rel in ("scripts/verify_encoding.py", "tests/cli/test_verify_encoding.py")
    ),
)
186
+
187
#: Markdown inline-code span: single backtick to single backtick on one line,
#: not crossing line boundaries (handles both LF and CRLF). Conservative: the
#: negated character class ``[^`\r\n]`` cannot consume a backtick, so each
#: match ends at the NEXT backtick and a line like `` `foo` and `bar` ``
#: produces two separate matches, not one.
_MD_INLINE_CODE = re.compile(r"`[^`\r\n]*`")

#: Markdown fenced code block: ``` (or ~~~) ... ``` (or ~~~) across multiple
#: lines; the backreference ``\1`` requires the closing fence to use the same
#: marker as the opening one. CRLF-robust: trailing-whitespace classes include
#: ``\r`` so the ``$`` anchor still matches when the file is CRLF (Python regex
#: MULTILINE ``$`` matches *before* ``\n``, which on CRLF lines leaves the prior
#: ``\r`` needing to be absorbed by the trailing whitespace class). The opening
#: fence allows a language tag (e.g. ``` ```python ```) before the newline.
_MD_FENCED_BLOCK = re.compile(r"(?ms)^[ \t]*(```|~~~)[^\n]*\n.*?^[ \t]*\1[ \t\r]*$")
200
+
201
+
202
class Finding:
    """A single detection (mojibake, U+FFFD, or BOM) at one file location.

    Attributes are plain data: ``path`` is the file the hit belongs to,
    ``line`` its 1-based line number, ``label`` a short description of what
    was detected and ``context`` the text shown next to the hit.
    """

    __slots__ = ("path", "line", "label", "context")

    def __init__(self, path: str, line: int, label: str, context: str) -> None:
        self.path, self.line = path, line
        self.label, self.context = label, context

    def render(self) -> str:
        """Return the one-line report form, clipping the context to 120 chars."""
        shown = self.context
        if len(shown) > 120:
            # Keep 117 characters plus a three-dot ellipsis -> exactly 120.
            shown = shown[:117] + "..."
        return f" {self.path}:{self.line} [{self.label}] {shown}"
216
+
217
+
218
+ def _blank_block(match: re.Match[str]) -> str:
219
+ """Replace a fenced code block with the same number of newlines.
220
+
221
+ Greptile P1 (PR #862): the prior implementation used
222
+ ``_MD_FENCED_BLOCK.sub("", text)`` which removed the newlines that lived
223
+ INSIDE the matched fence. After substitution every line that followed in
224
+ ``scan_text`` shifted upward by the number of consumed newlines, so a
225
+ mojibake hit AFTER a fenced block was reported at the wrong line number
226
+ with the wrong context (and the true line was not reported at all). The
227
+ gate still exited 1 -- corruption did not silently pass -- but the
228
+ diagnostic was misleading.
229
+
230
+ Replacing with ``\n`` * count preserves line-count alignment between
231
+ ``original_lines`` and ``stripped_lines`` so the zip in :func:`scan_file`
232
+ pairs each original line with its stripped counterpart at the same index.
233
+ """
234
+ return "\n" * match.group(0).count("\n")
235
+
236
+
237
def _strip_markdown_quotes(text: str) -> str:
    """Remove fenced code blocks and inline-code spans from markdown text.

    Recurrence-record documentation legitimately quotes mojibake bytes inside
    backticks (e.g. CHANGELOG entries describing the corruption being fixed),
    so quoted material is removed before scanning to keep the gate from
    flagging its own documentation. Other formats (JSON, YAML, source code)
    are scanned as-is because their false-positive rate is much lower.

    The two passes run in a fixed order: fenced blocks first (they may contain
    backticks themselves), inline spans second. Fenced blocks are swapped for
    newline-only filler via :func:`_blank_block`, so line numbers after a
    fence stay aligned with the original file.
    """
    without_fences = _MD_FENCED_BLOCK.sub(_blank_block, text)
    without_spans = _MD_INLINE_CODE.sub("", without_fences)
    return without_spans
254
+
255
+
256
+ def _load_allow_list(path: Path | None) -> list[str]:
257
+ """Read newline-separated glob patterns from ``path``; ignore comments.
258
+
259
+ Lines starting with ``#`` and blank lines are skipped. Returns an empty
260
+ list when ``path`` is ``None``. Raises :class:`FileNotFoundError` when
261
+ a non-``None`` path does not exist (caller maps to exit 2).
262
+ """
263
+ if path is None:
264
+ return []
265
+ raw = path.read_text(encoding="utf-8", errors="replace")
266
+ out: list[str] = []
267
+ for line in raw.splitlines():
268
+ stripped = line.strip()
269
+ if not stripped or stripped.startswith("#"):
270
+ continue
271
+ out.append(stripped)
272
+ return out
273
+
274
+
275
+ def _is_allow_listed(rel_path: str, patterns: Iterable[str]) -> bool:
276
+ """Return True when ``rel_path`` (POSIX form) matches any glob in patterns."""
277
+ return any(fnmatch.fnmatchcase(rel_path, pat) for pat in patterns)
278
+
279
+
280
def _git_tracked_files(project_root: Path) -> list[str]:
    """Return the paths reported by ``git ls-files`` as POSIX-form rel paths.

    ``-z`` is passed so git emits NUL-terminated, *unquoted* paths. Without
    it git's default ``core.quotePath`` setting C-quotes any path containing
    non-ASCII bytes (``"caf\\303\\251.md"``); such an entry does not name a
    real file, so it would be dropped later and never scanned. The output is
    decoded explicitly as UTF-8 rather than with the locale codepage for the
    same reason -- this gate exists precisely because codepage decoding on
    Windows corrupts non-ASCII text.

    Args:
        project_root: Directory in which ``git`` is run.

    Returns:
        One entry per tracked path, in git's output order.

    Raises:
        RuntimeError: when ``git ls-files`` exits non-zero.
    """
    proc = subprocess.run(
        ["git", "ls-files", "-z"],
        cwd=str(project_root),
        capture_output=True,
        text=True,
        encoding="utf-8",
        # Never crash on an undecodable path; it simply will not resolve.
        errors="replace",
        check=False,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"git ls-files failed (rc={proc.returncode}): {proc.stderr.strip()}")
    return [entry for entry in proc.stdout.split("\0") if entry.strip()]
292
+
293
+
294
def _git_staged_files(project_root: Path) -> list[str]:
    """Return staged paths (``git diff --cached --name-only``) as POSIX rel paths.

    Only added, copied, modified and renamed entries are listed
    (``--diff-filter=ACMR``), so staged deletions are not returned.

    ``-z`` is passed so git emits NUL-terminated, *unquoted* paths. Without
    it git's default ``core.quotePath`` setting C-quotes any path containing
    non-ASCII bytes; such an entry does not name a real file, so a staged
    file with a non-ASCII name would be dropped later and never scanned. The
    output is decoded explicitly as UTF-8 rather than with the locale
    codepage for the same reason.

    Args:
        project_root: Directory in which ``git`` is run.

    Returns:
        One entry per staged path, in git's output order.

    Raises:
        RuntimeError: when ``git diff --cached`` exits non-zero.
    """
    proc = subprocess.run(
        ["git", "diff", "--cached", "--name-only", "--diff-filter=ACMR", "-z"],
        cwd=str(project_root),
        capture_output=True,
        text=True,
        encoding="utf-8",
        # Never crash on an undecodable path; it simply will not resolve.
        errors="replace",
        check=False,
    )
    if proc.returncode != 0:
        raise RuntimeError(
            f"git diff --cached failed (rc={proc.returncode}): {proc.stderr.strip()}"
        )
    return [entry for entry in proc.stdout.split("\0") if entry.strip()]
308
+
309
+
310
def scan_file(rel_path: str, full_path: Path) -> list[Finding]:
    """Scan one file for U+FFFD / mojibake / unexpected BOM.

    Args:
        rel_path: Path used in the emitted findings and for the vBRIEF scope
            check.
        full_path: Location the bytes are actually read from; its suffix
            selects the BOM check and the markdown treatment.

    Returns:
        One :class:`Finding` per hit. An unreadable file yields an empty list
        rather than raising -- the gate is intentionally permissive on read
        failures so a single unreadable file does not block a whole
        pre-commit. A file with a NUL in its first 1024 decoded characters is
        treated as binary: only a BOM finding (if any) is returned for it.
    """
    findings: list[Finding] = []
    suffix = full_path.suffix.lower()

    try:
        raw = full_path.read_bytes()
    except OSError:
        return findings

    if suffix in NO_BOM_EXTENSIONS and raw.startswith(UTF8_BOM):
        findings.append(
            Finding(
                rel_path,
                1,
                "unexpected UTF-8 BOM",
                "leading bytes EF BB BF on a format where BOM is non-canonical",
            )
        )

    # errors="replace" substitutes U+FFFD for undecodable bytes instead of
    # raising, so no UnicodeDecodeError handling is needed here; those U+FFFD
    # characters are exactly what the line scan below reports.
    text = raw.decode("utf-8", errors="replace")

    if "\x00" in text[:1024]:
        # Likely binary file -- skip mojibake scan.
        return findings

    # Each line is scanned in its (possibly stripped) form but reported with
    # the ORIGINAL line as context, at the original line number.
    original_lines = text.splitlines()
    if suffix == ".md":
        # Quoted material is removed from markdown before scanning. Fenced
        # blocks are replaced by the same number of newlines and inline spans
        # never cross a line, so the stripped text keeps the original line
        # layout and can be paired index-by-index with the original lines.
        stripped_lines = _strip_markdown_quotes(text).splitlines()
        # Pad stripped to original length defensively so a regex edge-case
        # (e.g. trailing newline mismatch) doesn't drop late findings.
        if len(stripped_lines) < len(original_lines):
            stripped_lines = stripped_lines + [""] * (len(original_lines) - len(stripped_lines))
    else:
        stripped_lines = original_lines

    for lineno, (orig, stripped) in enumerate(
        zip(original_lines, stripped_lines, strict=False), 1
    ):
        findings.extend(_scan_line(rel_path, lineno, stripped, context=orig))

    if _is_vbrief_narrative_control_scope(rel_path):
        findings.extend(_scan_vbrief_narrative_controls(rel_path, text))

    return findings
378
+
379
+
380
def _scan_line(
    rel_path: str,
    lineno: int,
    line: str,
    *,
    context: str | None = None,
) -> list[Finding]:
    """Check a single line for U+FFFD and for every catalogued mojibake pattern.

    ``line`` is the text that is searched; ``context`` (defaulting to
    ``line``) is the text attached to each finding. A U+FFFD finding, if any,
    comes first, followed by one finding per matching pattern in
    ``MOJIBAKE_PATTERNS`` order.
    """
    shown = line if context is None else context
    hits: list[Finding] = []
    if REPLACEMENT_CHAR in line:
        hits.append(Finding(rel_path, lineno, "U+FFFD replacement char", shown))
    hits.extend(
        Finding(rel_path, lineno, label, shown)
        for pattern, label in MOJIBAKE_PATTERNS.items()
        if pattern in line
    )
    return hits
403
+
404
+
405
+ def _is_vbrief_narrative_control_scope(rel_path: str) -> bool:
406
+ """Return True for in-flight vBRIEF files that may receive issue ingest."""
407
+ if not rel_path.endswith(".vbrief.json"):
408
+ return False
409
+ normalized_path = rel_path.replace("\\", "/")
410
+ normalized = f"/{normalized_path}"
411
+ return "/vbrief/proposed/" in normalized or "/vbrief/active/" in normalized
412
+
413
+
414
def _scan_vbrief_narrative_controls(rel_path: str, text: str) -> list[Finding]:
    """Flag hidden control characters in decoded ``plan.narratives`` strings.

    The line scanner only sees raw file text. It catches mojibake and BOMs
    but not JSON-escaped controls such as ``"\u000b"``, whose on-disk form is
    printable. vBRIEF narratives are user-facing Markdown, so this function
    parses the JSON and inspects just that one structured surface.

    Returns an empty list when ``text`` is not valid JSON or when
    ``plan.narratives`` is missing or not an object. Entries whose value is
    not a string are ignored. Each finding is reported at the best-effort
    line of the narrative's key.
    """
    try:
        document = json.loads(text)
    except json.JSONDecodeError:
        return []

    plan = document.get("plan") if isinstance(document, dict) else None
    narratives = plan.get("narratives") if isinstance(plan, dict) else None
    if not isinstance(narratives, dict):
        return []

    findings: list[Finding] = []
    for key, value in narratives.items():
        if not (isinstance(key, str) and isinstance(value, str)):
            continue
        key_line = _json_key_line(text, key)
        findings.extend(
            Finding(
                rel_path,
                key_line,
                label,
                f"plan.narratives.{key} contains {label}",
            )
            for label in _decoded_control_labels(value)
        )
    return findings
450
+
451
+
452
def _decoded_control_labels(value: str) -> list[str]:
    """Collect the distinct labels of disallowed control characters in ``value``.

    Labels are returned in order of first appearance, each at most once.
    Tabs used as leading indentation are allowed, as are ``\\n`` and ``\\r``;
    any other character below U+0020 is reported, using the specific label
    from ``VBRIEF_CONTROL_CHAR_LABELS`` when one exists and a generic
    ``U+XXXX`` label otherwise.
    """
    # An insertion-ordered dict doubles as the "seen" set and the result list.
    ordered: dict[str, None] = {}
    for position, char in enumerate(value):
        if char == "\t" and not _tab_is_non_indentation(value, position):
            continue
        label = VBRIEF_CONTROL_CHAR_LABELS.get(char)
        if label is None and ord(char) < 32 and char not in "\n\r":
            label = f"U+{ord(char):04X} control character in vBRIEF narrative"
        if label:
            ordered.setdefault(label, None)
    return list(ordered)
466
+
467
+
468
+ def _tab_is_non_indentation(value: str, index: int) -> bool:
469
+ """Treat tabs after prose as suspicious, but allow leading indentation."""
470
+ line_start = value.rfind("\n", 0, index) + 1
471
+ return any(ch not in " \t" for ch in value[line_start:index])
472
+
473
+
474
+ def _json_key_line(text: str, key: str) -> int:
475
+ """Best-effort line number for a JSON object key."""
476
+ match = re.search(rf'"{re.escape(key)}"\s*:', text)
477
+ if match is None:
478
+ return 1
479
+ return text.count("\n", 0, match.start()) + 1
480
+
481
+
482
def _filter_scannable(
    rel_paths: Iterable[str],
    project_root: Path,
    allow_globs: Iterable[str],
) -> list[tuple[str, Path]]:
    """Keep only the rel paths that should actually be scanned.

    A path survives when, in this order, its resolved location lies inside
    the resolved ``project_root``, it is an existing regular file, its suffix
    is in ``SCANNABLE_EXTENSIONS``, and its POSIX form matches none of
    ``allow_globs``. Survivors are returned as ``(posix_rel_path, resolved
    path)`` pairs in input order.

    SLizard P1 (PR #862): containment is decided with
    :meth:`Path.is_relative_to` only (Python 3.9+; this project targets
    3.12+). An earlier draft fell back to
    ``str(full).startswith(str(project_root.resolve()))``, which is
    vulnerable to substring path-traversal: ``project_root=/a/b`` would also
    accept the sibling ``/a/b-evil/file.txt`` because ``/a/b`` is a string
    prefix of ``/a/b-evil``. Segment-wise containment rejects that attack
    class by construction. A path outside the root is dropped silently
    because it cannot represent a tracked file under the working tree the
    gate is scanning.
    """
    globs = list(allow_globs)
    root = project_root.resolve()
    selected: list[tuple[str, Path]] = []
    for rel in rel_paths:
        # Normalize to POSIX form for glob matching (git output already is).
        posix = rel.replace("\\", "/")
        candidate = (project_root / rel).resolve()
        keep = (
            candidate.is_relative_to(root)
            and candidate.is_file()
            and candidate.suffix.lower() in SCANNABLE_EXTENSIONS
            and not _is_allow_listed(posix, globs)
        )
        if keep:
            selected.append((posix, candidate))
    return selected
518
+
519
+
520
def evaluate(
    project_root: Path,
    *,
    mode: str = "all",
    allow_list_path: Path | None = None,
) -> tuple[int, list[Finding], str]:
    """Run the encoding gate and return ``(exit_code, findings, human_message)``.

    This function neither prints nor parses arguments, so tests can drive
    every outcome directly; :func:`main` is the thin CLI wrapper around it.

    Args:
        project_root: Directory handed to the git file listers and used to
            resolve the relative paths they return.
        mode: ``"all"`` to scan tracked files or ``"staged"`` to scan staged
            files.
        allow_list_path: Optional file of extra allow-list globs, appended
            to ``BUILTIN_ALLOW_LIST``.

    Returns:
        ``(0, [], message)`` when nothing was found, ``(1, findings,
        message)`` when at least one finding exists (at most 50 are rendered
        in the message), and ``(2, [], message)`` when the scan could not
        run: unrecognised mode, allow-list missing or unreadable, git
        missing or failing.
    """
    known_modes = {"all", "staged"}
    if mode not in known_modes:
        problem = f"❌ verify_encoding: unrecognised mode '{mode}' (expected 'all' or 'staged')."
        return 2, [], problem

    # FileNotFoundError is a subclass of OSError, so it must be caught first.
    try:
        custom_globs = _load_allow_list(allow_list_path)
    except FileNotFoundError as exc:
        problem = (
            f"❌ verify_encoding: --allow-list file not found: {exc}\n"
            " Recovery: pass an existing path or omit the flag."
        )
        return 2, [], problem
    except OSError as exc:
        problem = (
            f"❌ verify_encoding: --allow-list unreadable: {exc}\n"
            " Recovery: check file permissions."
        )
        return 2, [], problem

    allow_globs = list(BUILTIN_ALLOW_LIST) + custom_globs

    list_files = _git_staged_files if mode == "staged" else _git_tracked_files
    try:
        rel_paths = list_files(project_root)
    except FileNotFoundError:
        problem = (
            "❌ verify_encoding: 'git' executable not found on PATH.\n"
            " Recovery: install git or set DEFT_PYTHON to a python that "
            "can spawn git."
        )
        return 2, [], problem
    except RuntimeError as exc:
        problem = (
            f"❌ verify_encoding: git failed -- {exc}\n"
            " Recovery: ensure --project-root points at a git working tree."
        )
        return 2, [], problem

    candidates = _filter_scannable(rel_paths, project_root, allow_globs)
    findings: list[Finding] = [
        hit for rel, full in candidates for hit in scan_file(rel, full)
    ]

    if not findings:
        ok_message = (
            f"✓ verify_encoding: {len(candidates)} file(s) clean -- no mojibake / "
            "U+FFFD / unexpected-BOM detected (#798)."
        )
        return 0, findings, ok_message

    affected_files = len({hit.path for hit in findings})
    header = (
        f"❌ verify_encoding: detected {len(findings)} mojibake / "
        f"U+FFFD / unexpected-BOM hit(s) across {affected_files} "
        f"file(s) (#798).\n"
        " Root cause: PowerShell 5.1 Get-Content -Raw decodes via the active "
        "Windows codepage (cp1252 or cp437) on the READ side, BEFORE any\n"
        " safe write can preserve the bytes. Fix: rewrite the offending "
        "files with Python pathlib.Path.write_text(text, encoding='utf-8'),\n"
        " re-read from a clean source (git checkout HEAD -- <path>), and "
        "do NOT round-trip through PS 5.1 again. See AGENTS.md ## PowerShell.\n"
        " Allow-list a documented exception via --allow-list <path> "
        "(file with newline-separated glob patterns)."
    )
    # Cap the rendered detail so a badly corrupted tree stays readable.
    body = "\n".join(hit.render() for hit in findings[:50])
    hidden = len(findings) - 50
    if hidden > 0:
        body += f"\n ... and {hidden} more"
    return 1, findings, f"{header}\n{body}"
616
+
617
+
618
+ def _build_parser() -> argparse.ArgumentParser:
619
+ parser = argparse.ArgumentParser(
620
+ prog="verify_encoding.py",
621
+ description=(
622
+ "Deterministic gate against PS 5.1 non-ASCII round-trip "
623
+ "corruption (#798). Scans tracked text files for U+FFFD "
624
+ "replacement chars, the curated CP1252-as-UTF-8 / "
625
+ "CP437-as-UTF-8 mojibake bigram set, and unexpected UTF-8 "
626
+ "BOM on .md/.json/.yml/.yaml/.txt."
627
+ ),
628
+ )
629
+ mode = parser.add_mutually_exclusive_group()
630
+ mode.add_argument(
631
+ "--all",
632
+ dest="mode",
633
+ action="store_const",
634
+ const="all",
635
+ help="Scan all tracked files via 'git ls-files' (default).",
636
+ )
637
+ mode.add_argument(
638
+ "--staged",
639
+ dest="mode",
640
+ action="store_const",
641
+ const="staged",
642
+ help=(
643
+ "Scan only staged files via 'git diff --cached --name-only' "
644
+ "(used by .githooks/pre-commit)."
645
+ ),
646
+ )
647
+ parser.set_defaults(mode="all")
648
+ parser.add_argument(
649
+ "--project-root",
650
+ default=".",
651
+ help="Project root path (default: current working directory).",
652
+ )
653
+ parser.add_argument(
654
+ "--allow-list",
655
+ default=None,
656
+ help=(
657
+ "Path to a file with newline-separated glob patterns of "
658
+ "documented exceptions. Lines starting with # are comments."
659
+ ),
660
+ )
661
+ parser.add_argument(
662
+ "--quiet",
663
+ action="store_true",
664
+ help="Suppress the OK message (errors still print).",
665
+ )
666
+ return parser
667
+
668
+
669
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point.

    Parses ``argv``, runs :func:`evaluate`, and prints its message: to
    stdout on success (unless ``--quiet`` was given), to stderr otherwise.

    Returns:
        The exit code produced by :func:`evaluate`.
    """
    # #814: switch stdout/stderr to UTF-8 before anything is printed. When
    # git invokes this hook on Windows, Python defaults both streams to
    # cp1252 (or cp437); neither can encode the U+2713 success marker or the
    # other non-ASCII glyphs in this script's diagnostics.
    # Kept as two explicit guards to mirror scripts/preflight_branch.py.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")
    if hasattr(sys.stderr, "reconfigure"):
        sys.stderr.reconfigure(encoding="utf-8", errors="replace")

    options = _build_parser().parse_args(argv)
    root = Path(options.project_root).resolve()
    allow_list = Path(options.allow_list).resolve() if options.allow_list else None

    exit_code, _unused_findings, message = evaluate(
        root,
        mode=options.mode,
        allow_list_path=allow_list,
    )
    if exit_code != 0:
        print(message, file=sys.stderr)
    elif not options.quiet:
        print(message)
    return exit_code
696
+
697
+
698
# Script entry: hand main()'s return value to the interpreter as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())