@deftai/directive-content 0.59.0 → 0.61.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190) hide show
  1. package/.githooks/pre-commit +10 -128
  2. package/.githooks/pre-push +8 -108
  3. package/Taskfile.yml +48 -58
  4. package/UPGRADING.md +19 -3
  5. package/docs/assets/directive-lifecycle-diagram.png +0 -0
  6. package/docs/directive-lifecycle.md +73 -0
  7. package/docs/getting-started.md +5 -1
  8. package/package.json +3 -3
  9. package/packs/skills/skills-pack-0.1.json +1 -1
  10. package/packs/strategies/strategies-pack-0.1.json +19 -19
  11. package/scm/github.md +37 -6
  12. package/skills/deft-directive-setup/SKILL.md +24 -15
  13. package/strategies/speckit.md +14 -14
  14. package/strategies/v0-20-contract.md +12 -1
  15. package/tasks/change.yml +16 -31
  16. package/tasks/ci.yml +8 -0
  17. package/tasks/commit.yml +12 -19
  18. package/tasks/core.yml +10 -0
  19. package/tasks/engine.yml +42 -0
  20. package/tasks/framework.yml +3 -0
  21. package/tasks/install.yml +20 -19
  22. package/tasks/migrate.yml +26 -15
  23. package/tasks/project.yml +26 -0
  24. package/tasks/toolchain.yml +15 -5
  25. package/tasks/vbrief.yml +4 -3
  26. package/tasks/verify.yml +12 -14
  27. package/templates/agents-entry.md +1 -1
  28. package/scripts/_agents_md.py +0 -494
  29. package/scripts/_cache_fetch.py +0 -635
  30. package/scripts/_cache_quota.py +0 -529
  31. package/scripts/_cache_refresh.py +0 -163
  32. package/scripts/_cache_validate.py +0 -209
  33. package/scripts/_content_root.py +0 -42
  34. package/scripts/_doctor_state.py +0 -277
  35. package/scripts/_event_detect.py +0 -305
  36. package/scripts/_events.py +0 -514
  37. package/scripts/_lifecycle_hygiene.py +0 -568
  38. package/scripts/_pathspec.py +0 -91
  39. package/scripts/_policy_show_cli.py +0 -266
  40. package/scripts/_precutover.py +0 -92
  41. package/scripts/_project_context.py +0 -224
  42. package/scripts/_project_definition_io.py +0 -164
  43. package/scripts/_relocate_snapshot.py +0 -209
  44. package/scripts/_relocate_states.py +0 -343
  45. package/scripts/_resolve_preflight_path.py +0 -152
  46. package/scripts/_safe_subprocess.py +0 -167
  47. package/scripts/_session_start_hook.py +0 -205
  48. package/scripts/_sor_gate_diff.py +0 -365
  49. package/scripts/_stdio_utf8.py +0 -59
  50. package/scripts/_triage_bootstrap_gitignore.py +0 -904
  51. package/scripts/_triage_classify_cli.py +0 -122
  52. package/scripts/_triage_queue_cli.py +0 -625
  53. package/scripts/_triage_scope_cli.py +0 -343
  54. package/scripts/_triage_scope_drift_cli.py +0 -121
  55. package/scripts/_triage_scope_ignores.py +0 -286
  56. package/scripts/_triage_scope_milestone.py +0 -432
  57. package/scripts/_triage_scope_mutations.py +0 -337
  58. package/scripts/_triage_scope_renderers.py +0 -207
  59. package/scripts/_triage_smoketest_stages.py +0 -674
  60. package/scripts/_triage_subscribe_cli.py +0 -140
  61. package/scripts/_triage_welcome_cli.py +0 -421
  62. package/scripts/_vbrief_build.py +0 -239
  63. package/scripts/_vbrief_fidelity.py +0 -479
  64. package/scripts/_vbrief_legacy.py +0 -589
  65. package/scripts/_vbrief_reconciliation.py +0 -883
  66. package/scripts/_vbrief_routing.py +0 -277
  67. package/scripts/_vbrief_safety.py +0 -778
  68. package/scripts/_vbrief_sources.py +0 -312
  69. package/scripts/_vbrief_speckit.py +0 -262
  70. package/scripts/_vbrief_story_quality.py +0 -353
  71. package/scripts/_vbrief_validation.py +0 -299
  72. package/scripts/build_dist.py +0 -412
  73. package/scripts/cache.py +0 -1078
  74. package/scripts/cache_scanner.py +0 -745
  75. package/scripts/candidates_log.py +0 -432
  76. package/scripts/capacity_backfill.py +0 -680
  77. package/scripts/capacity_show.py +0 -653
  78. package/scripts/ci_local.py +0 -689
  79. package/scripts/code_structure_validate.py +0 -765
  80. package/scripts/codebase_default_extractor.py +0 -495
  81. package/scripts/codebase_map.py +0 -304
  82. package/scripts/codebase_map_fresh.py +0 -104
  83. package/scripts/codebase_projection_registry.py +0 -94
  84. package/scripts/codebase_provider.py +0 -582
  85. package/scripts/doctor.py +0 -2552
  86. package/scripts/framework_commands.py +0 -505
  87. package/scripts/gh_rest.py +0 -882
  88. package/scripts/github_auth_modes.py +0 -437
  89. package/scripts/github_body.py +0 -292
  90. package/scripts/ip_risk.py +0 -531
  91. package/scripts/issue_emit.py +0 -670
  92. package/scripts/issue_ingest.py +0 -1064
  93. package/scripts/migrate_preflight.py +0 -418
  94. package/scripts/migrate_vbrief.py +0 -2677
  95. package/scripts/monitor_pr.py +0 -401
  96. package/scripts/pack_migrate_lessons.py +0 -336
  97. package/scripts/pack_migrate_patterns.py +0 -254
  98. package/scripts/pack_migrate_rules.py +0 -350
  99. package/scripts/pack_migrate_skills.py +0 -423
  100. package/scripts/pack_migrate_strategies.py +0 -311
  101. package/scripts/pack_migrate_swarm_spec.py +0 -250
  102. package/scripts/pack_render.py +0 -434
  103. package/scripts/packs_slice.py +0 -712
  104. package/scripts/platform_capabilities.py +0 -336
  105. package/scripts/policy.py +0 -2826
  106. package/scripts/policy_set.py +0 -324
  107. package/scripts/pr_check_closing_keywords.py +0 -524
  108. package/scripts/pr_check_protected_issues.py +0 -267
  109. package/scripts/pr_merge_readiness.py +0 -1004
  110. package/scripts/pr_wait_mergeable.py +0 -669
  111. package/scripts/prd_render.py +0 -159
  112. package/scripts/preflight_architecture_sor.py +0 -974
  113. package/scripts/preflight_branch.py +0 -289
  114. package/scripts/preflight_cache.py +0 -974
  115. package/scripts/preflight_gh.py +0 -721
  116. package/scripts/preflight_implementation.py +0 -272
  117. package/scripts/preflight_story_start.py +0 -838
  118. package/scripts/preflight_wip_cap.py +0 -149
  119. package/scripts/probe_session.py +0 -545
  120. package/scripts/project_render.py +0 -293
  121. package/scripts/quarantine_ext.py +0 -237
  122. package/scripts/reconcile_issues.py +0 -1442
  123. package/scripts/refresh-path.ps1 +0 -107
  124. package/scripts/release.py +0 -2030
  125. package/scripts/release_e2e.py +0 -1011
  126. package/scripts/release_publish.py +0 -486
  127. package/scripts/release_rollback.py +0 -980
  128. package/scripts/relocate.py +0 -1034
  129. package/scripts/resolve_changelog_unreleased.py +0 -667
  130. package/scripts/resolve_version.py +0 -490
  131. package/scripts/resume_conditions.py +0 -706
  132. package/scripts/ritual_sentinel.py +0 -609
  133. package/scripts/roadmap_render.py +0 -635
  134. package/scripts/rule_ownership_lint.py +0 -325
  135. package/scripts/scm.py +0 -591
  136. package/scripts/scope_audit_log.py +0 -387
  137. package/scripts/scope_decompose.py +0 -654
  138. package/scripts/scope_demote.py +0 -509
  139. package/scripts/scope_lifecycle.py +0 -1126
  140. package/scripts/scope_undo.py +0 -772
  141. package/scripts/session_start.py +0 -406
  142. package/scripts/setup_ghx.py +0 -339
  143. package/scripts/setup_windows.ps1 +0 -220
  144. package/scripts/slice_audit.py +0 -585
  145. package/scripts/slice_record.py +0 -530
  146. package/scripts/slice_record_existing.py +0 -692
  147. package/scripts/slug_normalize.py +0 -178
  148. package/scripts/spec_render.py +0 -477
  149. package/scripts/spec_validate.py +0 -238
  150. package/scripts/subagent_monitor.py +0 -658
  151. package/scripts/swarm_complete_cohort.py +0 -644
  152. package/scripts/swarm_launch.py +0 -1206
  153. package/scripts/swarm_readiness.py +0 -554
  154. package/scripts/swarm_verify_review_clean.py +0 -438
  155. package/scripts/swarm_worktrees.py +0 -497
  156. package/scripts/toolchain-check.py +0 -52
  157. package/scripts/triage_actions.py +0 -871
  158. package/scripts/triage_bootstrap.py +0 -1153
  159. package/scripts/triage_bulk.py +0 -630
  160. package/scripts/triage_classify.py +0 -932
  161. package/scripts/triage_help.py +0 -1685
  162. package/scripts/triage_queue.py +0 -1944
  163. package/scripts/triage_reconcile.py +0 -581
  164. package/scripts/triage_refresh.py +0 -643
  165. package/scripts/triage_scope.py +0 -999
  166. package/scripts/triage_scope_drift.py +0 -575
  167. package/scripts/triage_smoketest.py +0 -396
  168. package/scripts/triage_subscribe.py +0 -399
  169. package/scripts/triage_summary.py +0 -1011
  170. package/scripts/triage_welcome.py +0 -1178
  171. package/scripts/ts_check_lane.py +0 -86
  172. package/scripts/validate-links.py +0 -64
  173. package/scripts/validate_strategy_output.py +0 -212
  174. package/scripts/vbrief_activate.py +0 -228
  175. package/scripts/vbrief_migrate_conformance.py +0 -368
  176. package/scripts/vbrief_reconcile_graph.py +0 -306
  177. package/scripts/vbrief_reconcile_labels.py +0 -460
  178. package/scripts/vbrief_reconcile_umbrellas.py +0 -741
  179. package/scripts/vbrief_validate.py +0 -1144
  180. package/scripts/verify-stubs.py +0 -61
  181. package/scripts/verify_capacity.py +0 -160
  182. package/scripts/verify_encoding.py +0 -699
  183. package/scripts/verify_hooks_installed.py +0 -206
  184. package/scripts/verify_investigation.py +0 -360
  185. package/scripts/verify_judgment_gates.py +0 -827
  186. package/scripts/verify_no_task_runtime.py +0 -171
  187. package/scripts/verify_scm_boundary.py +0 -509
  188. package/scripts/verify_session_ritual.py +0 -389
  189. package/scripts/verify_tools.py +0 -426
  190. package/scripts/verify_vbrief_conformance.py +0 -478
@@ -1,699 +0,0 @@
1
- #!/usr/bin/env python3
2
- """verify_encoding.py -- deterministic gate against PS 5.1 non-ASCII round-trip corruption (#798).
3
-
4
- Pure stdlib, cross-platform. Invoked from:
5
-
6
- - ``.githooks/pre-commit`` via ``--staged`` after ``preflight_branch.py`` (#747)
7
- - ``task verify:encoding`` (aggregated into ``task check``) via ``--all``
8
- - ``uv run python scripts/verify_encoding.py [--staged|--all] [--allow-list <path>]``
9
-
10
- Recurrence chain (this gate elevates the rule from prose tier to deterministic
11
- tier per main.md Rule Authority [AXIOM]):
12
-
13
- - #236 t1.11.1 -- ``Get-Content -Raw`` + BOM-safe write rules in scm/github.md
14
- - #240 t1.11.2 -- Warp multi-line PS here-string rule in scm/github.md
15
- - #283 t1.20.1 -- ``New-Object System.Text.UTF8Encoding $false`` rule in AGENTS.md
16
- - PR #795 (2026-05-01) -- 132-line CHANGELOG mojibake on the same maintainer
17
- with all three rules loaded, because the corruption happened on the READ side
18
- (``Get-Content -Raw`` decodes via the active codepage, typically CP1252 or
19
- CP437 on Windows) BEFORE any safe write could preserve the bytes.
20
-
21
- Detection scope (UTF-8 codepoint sequences that appear after a Windows
22
- codepage round-trip):
23
-
24
- - U+FFFD replacement chars (universal corruption marker).
25
- - CP1252-as-UTF-8 mojibake bigrams (``Â§``, ``Â°``, ``â€™``, ``â€¦``, ``â†’`` ...).
26
- - CP437-as-UTF-8 mojibake bigrams (``Γèù``, ``Γ£ô``, ``ΓÇª``, ``ΓÇö`` ...).
27
- - Unexpected UTF-8 BOM (``EF BB BF``) on text formats where BOM is non-canonical
28
- (.md, .json, .yml, .yaml, .txt).
29
-
30
- False-positive guards:
31
-
32
- - Markdown inline code spans (single backticks) and fenced code blocks (triple
33
- backticks) are stripped before scanning .md files -- recurrence-record prose
34
- legitimately quotes mojibake bytes inside backticks.
35
- - A built-in allow-list skips the #798 brief itself (which documents the
36
- bigram catalog as part of its acceptance criteria).
37
- - ``--allow-list <path>`` accepts a newline-separated list of glob patterns
38
- for project-specific documented exceptions (e.g. regression fixtures).
39
-
40
- Exit codes (three-state, mirrors ``scripts/preflight_branch.py``):
41
-
42
- - ``0`` -- clean: no mojibake / U+FFFD / unexpected BOM detected.
43
- - ``1`` -- corruption found: prints per-hit ``path:line [label] context``.
44
- - ``2`` -- config error: ``--allow-list`` path unreadable, ``--staged``
45
- outside a git repo, or unrecognised CLI shape.
46
- """
47
-
48
- from __future__ import annotations
49
-
50
- import argparse
51
- import fnmatch
52
- import json
53
- import re
54
- import subprocess
55
- import sys
56
- from collections.abc import Iterable
57
- from pathlib import Path
58
-
59
#: Codepoint sequences that signal a Windows codepage round-trip corruption.
#: Each key is the mojibake text produced when the UTF-8 bytes of a canonical
#: character are decoded with the wrong codepage; each value is a short label
#: naming the canonical codepoint that was corrupted. The set is intentionally
#: CONSERVATIVE -- only the bigrams observed in the four-recurrence record
#: (#236, #240, #283, PR #795 / #844) plus the most common Windows-codepage
#: analogues are listed. Adding a pattern here MUST be paired with a
#: parametrized regression test in ``tests/cli/test_verify_encoding.py``.
#:
#: A key must never be the canonical character itself (e.g. a bare U+2019):
#: that would flag every correctly encoded file and miss the real corruption.
#: The cp1252 keys are therefore spelled with ``\u`` escapes so that an
#: encoding-repair pass over this file cannot silently turn a mojibake key
#: back into the character it is supposed to detect.
MOJIBAKE_PATTERNS: dict[str, str] = {
    # CP437-as-UTF-8 (Windows DOS codepage; recurrence record PR #844 / fix #846).
    # Pattern: original UTF-8 bytes E2 XX YY decoded by cp437 yields "Γ" + two cp437 glyphs.
    "Γèù": "U+2297 (⊗) corrupted via cp437 read",
    "Γ£ô": "U+2713 (✓) corrupted via cp437 read",
    # E2 80 A6 read as cp437 is Γ + Ç + ª (U+0393 U+00C7 U+00AA).
    "\u0393\u00c7\u00aa": "U+2026 (…) corrupted via cp437 read",
    "ΓÇö": "U+2014 (—) corrupted via cp437 read",
    "ΓÇô": "U+2013 (–) corrupted via cp437 read",
    "ΓÇó": "U+2022 (•) corrupted via cp437 read",
    "ΓÇÖ": "U+2019 (’) corrupted via cp437 read",
    "ΓÇÿ": "U+2018 (‘) corrupted via cp437 read",
    "ΓÇ£": "U+201C (“) corrupted via cp437 read",
    # NOTE(review): Python's cp437 codec maps byte 0x9D to U+00A5 (yen sign),
    # not to Ø; this key is kept exactly as recorded -- confirm against the
    # recurrence record before changing or extending it.
    "ΓÇØ": "U+201D (”) corrupted via cp437 read",
    "ΓåÆ": "U+2192 (→) corrupted via cp437 read",
    # CP1252-as-UTF-8 (Windows ANSI codepage; recurrence record #236, #240, #283, PR #795).
    # Pattern: original UTF-8 bytes (typically prefixed C2/C3/E2) decoded by cp1252.
    "\u00e2\u20ac\u2122": "U+2019 (’) corrupted via cp1252 read",
    "\u00e2\u20ac\u02dc": "U+2018 (‘) corrupted via cp1252 read",
    "\u00e2\u20ac\u0153": "U+201C (“) corrupted via cp1252 read",
    # Byte 0x9D is undefined in cp1252, so it survives as the C1 control U+009D.
    "\u00e2\u20ac\u009d": "U+201D (”) corrupted via cp1252 read",
    "\u00e2\u20ac\u201c": "U+2013 (–) corrupted via cp1252 read",
    "\u00e2\u20ac\u201d": "U+2014 (—) corrupted via cp1252 read",
    "\u00e2\u20ac\u00a6": "U+2026 (…) corrupted via cp1252 read",
    "\u00e2\u20ac\u00a2": "U+2022 (•) corrupted via cp1252 read",
    "\u00e2\u2020\u2019": "U+2192 (→) corrupted via cp1252 read",
    "\u00c2\u00a7": "U+00A7 (§) corrupted via cp1252 read",
    "\u00c2\u00b0": "U+00B0 (°) corrupted via cp1252 read",
    "\u00c2\u00b4": "U+00B4 (´) corrupted via cp1252 read",
    "\u00c2\u00ad": "U+00AD (soft hyphen) corrupted via cp1252 read",
    "\u00c2\u00a9": "U+00A9 (©) corrupted via cp1252 read",
    "\u00c2\u00ae": "U+00AE (®) corrupted via cp1252 read",
    "\u00c2\u00b1": "U+00B1 (±) corrupted via cp1252 read",
}
99
-
100
#: U+FFFD REPLACEMENT CHARACTER -- what ``bytes.decode(..., errors="replace")``
#: substitutes for input bytes it cannot decode. Kept apart from
#: MOJIBAKE_PATTERNS because spotting U+FFFD does not depend on which codepage
#: caused the damage.
REPLACEMENT_CHAR = chr(0xFFFD)

#: The three-byte UTF-8 BOM (``EF BB BF``). A few formats accept it (.ps1,
#: .csv on Windows); markdown / JSON / YAML / plain text do NOT -- there it
#: corrupts downstream parsers and is the signature of a PS 5.1
#: ``Set-Content -Encoding UTF8`` write.
UTF8_BOM = bytes((0xEF, 0xBB, 0xBF))

#: Extensions for which a leading UTF-8 BOM is non-canonical and gets flagged.
#: Other extensions (.csv, .ps1, .bat) tolerate or expect a BOM.
NO_BOM_EXTENSIONS = frozenset((".md", ".json", ".yml", ".yaml", ".txt"))

#: Control characters that must not hide inside decoded vBRIEF narratives,
#: mapped to the label reported for each. JSON serializes these as escapes
#: (for example ``\u000b``), so the raw text scan cannot see them until the
#: vBRIEF has been parsed.
VBRIEF_CONTROL_CHAR_LABELS: dict[str, str] = {
    "\x08": "U+0008 backspace in vBRIEF narrative",
    "\x09": "U+0009 tab in vBRIEF narrative",
    "\x0b": "U+000B vertical tab in vBRIEF narrative",
    "\x0c": "U+000C form feed in vBRIEF narrative",
}

#: Extensions scanned by default. Conservative -- binary formats and source
#: files where mojibake detection has a lower cost/benefit are left out.
SCANNABLE_EXTENSIONS = frozenset(
    (".md", ".json", ".yml", ".yaml", ".txt", ".py", ".sh", ".ps1", ".toml", ".cfg")
)
141
-
142
#: Path-glob patterns auto-skipped because the file legitimately contains
#: mojibake byte sequences as part of its purpose. Each entry is matched
#: against the path's POSIX form (forward slashes) via ``fnmatch.fnmatchcase``.
#: When a future recurrence-record vBRIEF documents a new bigram, add its
#: rel-path here -- the rule body lives in this gate, NOT in prose.
#:
#: Every rule is repeated for the three layouts the content may live under:
#: the repository root, ``.deft/core/`` and ``deft/``.
_ALLOW_LIST_ROOTS = ("", ".deft/core/", "deft/")
_VBRIEF_STATE_DIRS = ("active", "completed", "cancelled", "pending", "proposed")

BUILTIN_ALLOW_LIST: tuple[str, ...] = (
    # The #798 brief catalogues the bigram set being detected; quoting
    # the bigrams in its narrative is the brief's own acceptance criterion.
    *(
        f"{root}vbrief/{state}/*-798-*.vbrief.json"
        for root in _ALLOW_LIST_ROOTS
        for state in _VBRIEF_STATE_DIRS
    ),
    # history/archive/ preserves historical task / vbrief state byte-for-byte.
    # Pre-existing mojibake in archived artifacts (e.g. v0.20 migration residue)
    # is intentionally retained as a forensic record and MUST NOT be rewritten.
    *(
        f"{root}history/archive/{tail}"
        for root in _ALLOW_LIST_ROOTS
        for tail in ("**", "**/*")
    ),
    # Self-skip: this script and its test file are the canonical catalog of
    # the bigrams being detected. Scanning them would flag every entry in
    # MOJIBAKE_PATTERNS as a hit against the file that defines it. The
    # forward-coverage contract is upheld by tests/cli/test_verify_encoding.py
    # (parametrized over MOJIBAKE_PATTERNS), not by the gate scanning itself.
    *(
        f"{root}{rel}"
        for root in _ALLOW_LIST_ROOTS
        for rel in ("scripts/verify_encoding.py", "tests/cli/test_verify_encoding.py")
    ),
)
186
-
187
#: Markdown inline-code span: single backtick to single backtick on one line,
#: not crossing line boundaries (handles both LF and CRLF). Conservative: the
#: character class excludes backticks, so a span always ends at the nearest
#: closing backtick and a line like `` `foo` and `bar` `` produces two
#: separate matches, not one.
_MD_INLINE_CODE = re.compile(r"`[^`\r\n]*`")

#: Markdown fenced code block: ``` (or ~~~) ... ``` (or ~~~) across multiple
#: lines; the back-reference requires the closing fence to use the same
#: marker as the opening one. CRLF-robust: trailing-whitespace classes
#: include ``\r`` so the ``$`` anchor still matches when the file is CRLF
#: (Python regex MULTILINE ``$`` matches *before* ``\n``, which on CRLF lines
#: leaves the prior ``\r`` needing to be absorbed by the trailing whitespace
#: class). The opening fence allows a language tag (e.g. ``` ```python ```)
#: before the newline.
_MD_FENCED_BLOCK = re.compile(r"(?ms)^[ \t]*(```|~~~)[^\n]*\n.*?^[ \t]*\1[ \t\r]*$")
200
-
201
-
202
class Finding:
    """A single detection: where it was found, what it is, and the offending text.

    Attributes (all set by the constructor, stored in ``__slots__``):
        path: path of the file as reported to the user.
        line: 1-based line number of the hit.
        label: short description of the kind of corruption.
        context: text shown next to the hit in the report.
    """

    __slots__ = ("path", "line", "label", "context")

    def __init__(self, path: str, line: int, label: str, context: str) -> None:
        self.path, self.line = path, line
        self.label, self.context = label, context

    def render(self) -> str:
        """Return the one-line report form; context over 120 chars is cut to 117 + ``...``."""
        shown = self.context
        if len(shown) > 120:
            shown = shown[:117] + "..."
        return f" {self.path}:{self.line} [{self.label}] {shown}"
216
-
217
-
218
def _blank_block(match: re.Match[str]) -> str:
    """Return a replacement for a matched fenced block that keeps its line count.

    Used as the substitution callback when fenced code blocks are removed
    from markdown. The result consists solely of ``\n`` characters, one for
    every ``\n`` inside the matched text, so all content after the fence
    stays on the same line index as in the original document.

    Background (Greptile P1, PR #862): substituting the empty string instead
    dropped the newlines inside the fence, which shifted every later line
    upward. A hit after a fenced block was then reported with the wrong line
    number and context, even though the gate still exited 1.
    """
    kept_newlines = [ch for ch in match.group() if ch == "\n"]
    return "".join(kept_newlines)
235
-
236
-
237
def _strip_markdown_quotes(text: str) -> str:
    """Return markdown ``text`` with fenced code blocks and inline code removed.

    Recurrence-record documentation legitimately quotes mojibake inside
    backticks (for instance CHANGELOG entries describing the corruption that
    was fixed); removing quoted code first keeps the gate from flagging its
    own documentation. Only markdown gets this treatment -- other formats
    are scanned as-is because false positives are far rarer outside prose.

    Two passes, in this order:

    1. Fenced blocks, which may themselves contain backticks. Each block is
       replaced through :func:`_blank_block`, i.e. by as many newlines as it
       contained, so later line numbers still match the original file.
    2. Inline spans, which are simply deleted.
    """
    without_fences = _MD_FENCED_BLOCK.sub(_blank_block, text)
    without_inline = _MD_INLINE_CODE.sub("", without_fences)
    return without_inline
254
-
255
-
256
def _load_allow_list(path: Path | None) -> list[str]:
    """Load glob patterns, one per line, from the allow-list file at ``path``.

    Each line is stripped of surrounding whitespace; empty lines and lines
    whose first non-blank character is ``#`` are dropped. ``None`` means no
    allow-list was given and yields an empty list. The file is read as UTF-8
    with undecodable bytes replaced. Errors from reading the file (such as
    :class:`FileNotFoundError` for a missing path) are not caught here; the
    caller maps them to exit code 2.
    """
    if path is None:
        return []
    content = path.read_text(encoding="utf-8", errors="replace")
    candidates = (entry.strip() for entry in content.splitlines())
    return [glob for glob in candidates if glob and not glob.startswith("#")]
273
-
274
-
275
def _is_allow_listed(rel_path: str, patterns: Iterable[str]) -> bool:
    """Tell whether ``rel_path`` (POSIX form) matches at least one glob.

    Matching is case-sensitive (``fnmatch.fnmatchcase``) and stops at the
    first pattern that matches. An empty ``patterns`` yields ``False``.
    """
    for glob in patterns:
        if fnmatch.fnmatchcase(rel_path, glob):
            return True
    return False
278
-
279
-
280
def _git_tracked_files(project_root: Path) -> list[str]:
    """Return the paths printed by ``git ls-files`` as POSIX-form rel paths.

    ``git`` is run with ``project_root`` as its working directory. ``-z`` is
    passed so git emits NUL-terminated, *verbatim* paths: without it git
    C-quotes any path containing non-ASCII or other "unusual" characters
    (``core.quotePath``), the quoted form does not exist on disk, and such
    files would be dropped later instead of being scanned. The output is
    decoded as UTF-8 explicitly rather than with the locale codec, which on
    Windows is typically a legacy codepage.

    Raises:
        RuntimeError: git exited with a non-zero return code.
        FileNotFoundError: the ``git`` executable could not be started
            (propagated from :func:`subprocess.run`).
    """
    proc = subprocess.run(
        ["git", "ls-files", "-z"],
        cwd=str(project_root),
        capture_output=True,
        text=True,
        encoding="utf-8",
        # Never crash on a path that is not valid UTF-8; it simply will not
        # resolve to a file later on.
        errors="replace",
        check=False,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"git ls-files failed (rc={proc.returncode}): {proc.stderr.strip()}")
    # NUL is the only separator with -z; the trailing terminator leaves an
    # empty final item, which the filter removes.
    return [entry for entry in proc.stdout.split("\0") if entry.strip()]
292
-
293
-
294
def _git_staged_files(project_root: Path) -> list[str]:
    """Return the staged paths from ``git diff --cached --name-only`` as POSIX rel paths.

    ``--diff-filter=ACMR`` limits the result to added, copied, modified and
    renamed entries, so staged deletions are not listed. ``-z`` makes git
    emit NUL-terminated, *verbatim* paths: without it git C-quotes any path
    containing non-ASCII or other "unusual" characters (``core.quotePath``),
    the quoted form does not exist on disk, and such files would be dropped
    later instead of being scanned. The output is decoded as UTF-8
    explicitly rather than with the locale codec.

    Raises:
        RuntimeError: git exited with a non-zero return code.
        FileNotFoundError: the ``git`` executable could not be started
            (propagated from :func:`subprocess.run`).
    """
    proc = subprocess.run(
        ["git", "diff", "--cached", "--name-only", "--diff-filter=ACMR", "-z"],
        cwd=str(project_root),
        capture_output=True,
        text=True,
        encoding="utf-8",
        # Never crash on a path that is not valid UTF-8; it simply will not
        # resolve to a file later on.
        errors="replace",
        check=False,
    )
    if proc.returncode != 0:
        raise RuntimeError(
            f"git diff --cached failed (rc={proc.returncode}): {proc.stderr.strip()}"
        )
    # NUL is the only separator with -z; the trailing terminator leaves an
    # empty final item, which the filter removes.
    return [entry for entry in proc.stdout.split("\0") if entry.strip()]
308
-
309
-
310
def scan_file(rel_path: str, full_path: Path) -> list[Finding]:
    """Scan one file for U+FFFD / mojibake / unexpected BOM.

    Args:
        rel_path: path used in the reported findings and to decide whether
            the vBRIEF narrative check applies.
        full_path: location the bytes are actually read from.

    Returns:
        One :class:`Finding` per hit. A file that cannot be read yields an
        empty list rather than raising -- the gate is intentionally
        permissive on read failures so a single unreadable file does not
        block a whole pre-commit. A file that looks binary (NUL within the
        first 1024 decoded characters) is not text-scanned; only a BOM
        finding, if any, is returned for it.
    """
    findings: list[Finding] = []
    suffix = full_path.suffix.lower()

    try:
        raw = full_path.read_bytes()
    except OSError:
        return findings

    if suffix in NO_BOM_EXTENSIONS and raw.startswith(UTF8_BOM):
        findings.append(
            Finding(
                rel_path,
                1,
                "unexpected UTF-8 BOM",
                "leading bytes EF BB BF on a format where BOM is non-canonical",
            )
        )

    # errors="replace" cannot raise: undecodable bytes become U+FFFD, which
    # the line scan below reports.
    text = raw.decode("utf-8", errors="replace")

    if "\x00" in text[:1024]:
        # Likely binary file -- skip mojibake scan.
        return findings

    # Markdown is scanned with quoted code removed; everything else as-is.
    scan_text = _strip_markdown_quotes(text) if suffix == ".md" else text

    # Detection runs on the stripped line while the finding shows the
    # original line as context. The two line lists are paired by index:
    # fenced-block stripping substitutes one newline for every newline it
    # consumed and inline-span stripping never crosses a line boundary, so
    # index i in both lists refers to the same line of the file.
    original_lines = text.splitlines()
    stripped_lines = original_lines if scan_text == text else scan_text.splitlines()
    # Pad stripped to original length defensively so a regex edge-case
    # (e.g. trailing newline mismatch) doesn't drop late findings.
    missing = len(original_lines) - len(stripped_lines)
    if missing > 0:
        stripped_lines = stripped_lines + [""] * missing
    for lineno, (orig, stripped) in enumerate(
        zip(original_lines, stripped_lines, strict=False), 1
    ):
        findings.extend(_scan_line(rel_path, lineno, stripped, context=orig))

    if _is_vbrief_narrative_control_scope(rel_path):
        findings.extend(_scan_vbrief_narrative_controls(rel_path, text))

    return findings
378
-
379
-
380
def _scan_line(
    rel_path: str,
    lineno: int,
    line: str,
    *,
    context: str | None = None,
) -> list[Finding]:
    """Check a single line and return one finding per kind of corruption in it.

    ``line`` is the text that is searched. ``context`` is the text attached
    to each finding; when omitted, ``line`` itself is used. A U+FFFD hit, if
    present, comes first, followed by one finding for every key of
    ``MOJIBAKE_PATTERNS`` that occurs in ``line``, in dictionary order.
    """
    shown = line if context is None else context
    hits: list[Finding] = []
    if REPLACEMENT_CHAR in line:
        hits.append(Finding(rel_path, lineno, "U+FFFD replacement char", shown))
    hits.extend(
        Finding(rel_path, lineno, label, shown)
        for pattern, label in MOJIBAKE_PATTERNS.items()
        if pattern in line
    )
    return hits
403
-
404
-
405
def _is_vbrief_narrative_control_scope(rel_path: str) -> bool:
    """Tell whether ``rel_path`` is an in-flight vBRIEF that may receive issue ingest.

    True only for names ending in ``.vbrief.json`` that sit under a
    ``vbrief/proposed/`` or ``vbrief/active/`` directory at any depth.
    Backslashes are treated as path separators.
    """
    if not rel_path.endswith(".vbrief.json"):
        return False
    # A leading slash lets a path that *starts* with "vbrief/" match the
    # same "/vbrief/..." markers as a nested one.
    anchored = "/" + rel_path.replace("\\", "/")
    return any(marker in anchored for marker in ("/vbrief/proposed/", "/vbrief/active/"))
412
-
413
-
414
def _scan_vbrief_narrative_controls(rel_path: str, text: str) -> list[Finding]:
    """Report disallowed control characters in decoded ``plan.narratives`` strings.

    The raw-text scan catches mojibake and BOMs but cannot see JSON-escaped
    controls such as ``"\u000b"``, because the bytes in the file are plain
    printable characters. vBRIEF narratives are user-facing Markdown, so
    this parses the document and inspects just that structure after JSON
    decoding.

    Returns an empty list when ``text`` is not valid JSON or when the
    document, its ``plan`` or ``plan.narratives`` is not a JSON object.
    Entries whose value is not a string are ignored. Each finding carries
    the best-effort line of the narrative's key.
    """
    try:
        node = json.loads(text)
    except json.JSONDecodeError:
        return []

    # Descend document -> plan -> narratives, giving up as soon as a level
    # is not a JSON object.
    for field in ("plan", "narratives"):
        if not isinstance(node, dict):
            return []
        node = node.get(field)
    if not isinstance(node, dict):
        return []

    results: list[Finding] = []
    for name, body in node.items():
        if not (isinstance(name, str) and isinstance(body, str)):
            continue
        where = _json_key_line(text, name)
        results.extend(
            Finding(rel_path, where, label, f"plan.narratives.{name} contains {label}")
            for label in _decoded_control_labels(body)
        )
    return results
450
-
451
-
452
def _decoded_control_labels(value: str) -> list[str]:
    """List the labels of disallowed control characters in ``value``, without repeats.

    Labels appear in order of first occurrence. Characters listed in
    ``VBRIEF_CONTROL_CHAR_LABELS`` use their dedicated label; any other
    character below U+0020 except ``\n`` and ``\r`` gets a generic
    ``U+XXXX`` label. A tab is skipped when it is only indentation.
    """
    # A dict keeps insertion order, so it serves as an ordered set here.
    ordered: dict[str, None] = {}
    for position, ch in enumerate(value):
        if ch == "\t" and not _tab_is_non_indentation(value, position):
            continue
        label = VBRIEF_CONTROL_CHAR_LABELS.get(ch)
        if label is None and ord(ch) < 32 and ch not in ("\n", "\r"):
            label = f"U+{ord(ch):04X} control character in vBRIEF narrative"
        if label:
            ordered.setdefault(label, None)
    return list(ordered)
466
-
467
-
468
def _tab_is_non_indentation(value: str, index: int) -> bool:
    """Tell whether the character at ``index`` comes after non-blank text on its line.

    The line starts just after the closest preceding ``\n`` (or at the start
    of ``value``). The result is ``False`` when everything between that point
    and ``index`` is spaces or tabs -- i.e. the tab is leading indentation --
    and ``True`` otherwise.
    """
    line_begin = value.rfind("\n", 0, index) + 1
    before = value[line_begin:index]
    return before.strip(" \t") != ""
472
-
473
-
474
def _json_key_line(text: str, key: str) -> int:
    """Return a best-effort 1-based line number for the JSON object key ``key``.

    Looks for the first occurrence of the quoted key followed by optional
    whitespace and a colon anywhere in ``text``; falls back to line 1 when
    there is none.
    """
    found = re.compile(rf'"{re.escape(key)}"\s*:').search(text)
    if found is None:
        return 1
    return text[: found.start()].count("\n") + 1
480
-
481
-
482
def _filter_scannable(
    rel_paths: Iterable[str],
    project_root: Path,
    allow_globs: Iterable[str],
) -> list[tuple[str, Path]]:
    """Select the paths that should be scanned, as ``(posix_rel_path, resolved_path)``.

    A path is kept only when, in this order, its resolved location lies
    inside the resolved ``project_root``, it is an existing regular file,
    its lower-cased suffix is in ``SCANNABLE_EXTENSIONS``, and its POSIX
    form matches none of ``allow_globs``. Everything else is dropped
    silently; input order is preserved.

    Containment is decided with :meth:`Path.is_relative_to` only (SLizard
    P1, PR #862). An earlier draft fell back to
    ``str(full).startswith(str(project_root.resolve()))``, which is open to
    substring path-traversal: with ``project_root=/a/b`` the sibling
    ``/a/b-evil/file.txt`` passes because ``/a/b`` is a string prefix of
    ``/a/b-evil``. Segment-wise containment rules that out. (Python 3.9+;
    this project targets 3.12+.)
    """
    globs = list(allow_globs)  # materialise once; it is consulted per path
    root = project_root.resolve()
    selected: list[tuple[str, Path]] = []
    for rel in rel_paths:
        # Normalize to POSIX form for glob matching (git output already is).
        posix = rel.replace("\\", "/")
        candidate = (project_root / rel).resolve()
        wanted = (
            candidate.is_relative_to(root)
            and candidate.is_file()
            and candidate.suffix.lower() in SCANNABLE_EXTENSIONS
            and not _is_allow_listed(posix, globs)
        )
        if wanted:
            selected.append((posix, candidate))
    return selected
518
-
519
-
520
def evaluate(
    project_root: Path,
    *,
    mode: str = "all",
    allow_list_path: Path | None = None,
) -> tuple[int, list[Finding], str]:
    """Run the encoding gate and return ``(exit_code, findings, human_message)``.

    Nothing is printed here; :func:`main` does the output. Keeping the two
    apart lets tests drive every state without ``capsys`` plumbing or
    env-var leak.

    Exit codes:

    * ``0`` -- the scanned candidates produced no findings.
    * ``1`` -- at least one finding; the message renders the first 50 and
      summarises the remainder as a count.
    * ``2`` -- the scan could not run: unrecognised ``mode``, allow-list
      file missing or unreadable, ``git`` not found, or ``git`` failing.
    """
    if mode not in {"all", "staged"}:
        return (
            2,
            [],
            f"❌ verify_encoding: unrecognised mode '{mode}' (expected 'all' or 'staged').",
        )

    # FileNotFoundError is an OSError subclass, so it must be caught first to
    # get its more specific recovery hint.
    try:
        extra_patterns = _load_allow_list(allow_list_path)
    except FileNotFoundError as missing:
        problem = (
            f"❌ verify_encoding: --allow-list file not found: {missing}\n"
            " Recovery: pass an existing path or omit the flag."
        )
        return 2, [], problem
    except OSError as unreadable:
        problem = (
            f"❌ verify_encoding: --allow-list unreadable: {unreadable}\n"
            " Recovery: check file permissions."
        )
        return 2, [], problem

    patterns = list(BUILTIN_ALLOW_LIST) + extra_patterns

    try:
        list_paths = _git_staged_files if mode == "staged" else _git_tracked_files
        rel_paths = list_paths(project_root)
    except FileNotFoundError:
        problem = (
            "❌ verify_encoding: 'git' executable not found on PATH.\n"
            " Recovery: install git or set DEFT_PYTHON to a python that "
            "can spawn git."
        )
        return 2, [], problem
    except RuntimeError as git_error:
        problem = (
            f"❌ verify_encoding: git failed -- {git_error}\n"
            " Recovery: ensure --project-root points at a git working tree."
        )
        return 2, [], problem

    candidates = _filter_scannable(rel_paths, project_root, patterns)
    findings: list[Finding] = [
        hit for rel, full in candidates for hit in scan_file(rel, full)
    ]

    if not findings:
        clean = (
            f"✓ verify_encoding: {len(candidates)} file(s) clean -- no mojibake / "
            "U+FFFD / unexpected-BOM detected (#798)."
        )
        return 0, findings, clean

    # Several findings may share a file, so count distinct paths.
    affected_files = len({hit.path for hit in findings})
    header = (
        f"❌ verify_encoding: detected {len(findings)} mojibake / "
        f"U+FFFD / unexpected-BOM hit(s) across {affected_files} "
        f"file(s) (#798).\n"
        " Root cause: PowerShell 5.1 Get-Content -Raw decodes via the active "
        "Windows codepage (cp1252 or cp437) on the READ side, BEFORE any\n"
        " safe write can preserve the bytes. Fix: rewrite the offending "
        "files with Python pathlib.Path.write_text(text, encoding='utf-8'),\n"
        " re-read from a clean source (git checkout HEAD -- <path>), and "
        "do NOT round-trip through PS 5.1 again. See AGENTS.md ## PowerShell.\n"
        " Allow-list a documented exception via --allow-list <path> "
        "(file with newline-separated glob patterns)."
    )
    # Cap the rendered list at 50 entries; anything beyond becomes a count.
    rendered = [hit.render() for hit in findings[:50]]
    overflow = len(findings) - 50
    if overflow > 0:
        rendered.append(f" ... and {overflow} more")
    return 1, findings, header + "\n" + "\n".join(rendered)
616
-
617
-
618
- def _build_parser() -> argparse.ArgumentParser:
619
- parser = argparse.ArgumentParser(
620
- prog="verify_encoding.py",
621
- description=(
622
- "Deterministic gate against PS 5.1 non-ASCII round-trip "
623
- "corruption (#798). Scans tracked text files for U+FFFD "
624
- "replacement chars, the curated CP1252-as-UTF-8 / "
625
- "CP437-as-UTF-8 mojibake bigram set, and unexpected UTF-8 "
626
- "BOM on .md/.json/.yml/.yaml/.txt."
627
- ),
628
- )
629
- mode = parser.add_mutually_exclusive_group()
630
- mode.add_argument(
631
- "--all",
632
- dest="mode",
633
- action="store_const",
634
- const="all",
635
- help="Scan all tracked files via 'git ls-files' (default).",
636
- )
637
- mode.add_argument(
638
- "--staged",
639
- dest="mode",
640
- action="store_const",
641
- const="staged",
642
- help=(
643
- "Scan only staged files via 'git diff --cached --name-only' "
644
- "(used by .githooks/pre-commit)."
645
- ),
646
- )
647
- parser.set_defaults(mode="all")
648
- parser.add_argument(
649
- "--project-root",
650
- default=".",
651
- help="Project root path (default: current working directory).",
652
- )
653
- parser.add_argument(
654
- "--allow-list",
655
- default=None,
656
- help=(
657
- "Path to a file with newline-separated glob patterns of "
658
- "documented exceptions. Lines starting with # are comments."
659
- ),
660
- )
661
- parser.add_argument(
662
- "--quiet",
663
- action="store_true",
664
- help="Suppress the OK message (errors still print).",
665
- )
666
- return parser
667
-
668
-
669
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point; returns the exit code from :func:`evaluate`.

    ``argv`` is handed to ``argparse`` unchanged, so ``None`` means "use the
    process arguments". A zero exit code prints the message to stdout unless
    ``--quiet`` was given; any other code prints it to stderr regardless of
    ``--quiet``.
    """
    # #814: Force UTF-8 stdout/stderr at hook-script entry. Windows Python
    # defaults both streams to cp1252 (or cp437) when git invokes the hook,
    # and neither codepage has a glyph for the U+2713 success marker or the
    # other non-ASCII glyphs in this script's diagnostics. The original
    # comment notes that scripts/preflight_branch.py carries the same guard.
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")

    args = _build_parser().parse_args(argv)
    root = Path(args.project_root).resolve()
    allow_list = None if not args.allow_list else Path(args.allow_list).resolve()

    exit_code, _, message = evaluate(
        root,
        mode=args.mode,
        allow_list_path=allow_list,
    )
    if exit_code != 0:
        print(message, file=sys.stderr)
    elif not args.quiet:
        print(message)
    return exit_code
696
-
697
-
698
# Script entry: run the gate and use its return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())