@deftai/directive-content 0.59.0 → 0.61.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.githooks/pre-commit +10 -128
- package/.githooks/pre-push +8 -108
- package/Taskfile.yml +48 -58
- package/UPGRADING.md +19 -3
- package/docs/assets/directive-lifecycle-diagram.png +0 -0
- package/docs/directive-lifecycle.md +73 -0
- package/docs/getting-started.md +5 -1
- package/package.json +3 -3
- package/packs/skills/skills-pack-0.1.json +1 -1
- package/packs/strategies/strategies-pack-0.1.json +19 -19
- package/scm/github.md +37 -6
- package/skills/deft-directive-setup/SKILL.md +24 -15
- package/strategies/speckit.md +14 -14
- package/strategies/v0-20-contract.md +12 -1
- package/tasks/change.yml +16 -31
- package/tasks/ci.yml +8 -0
- package/tasks/commit.yml +12 -19
- package/tasks/core.yml +10 -0
- package/tasks/engine.yml +42 -0
- package/tasks/framework.yml +3 -0
- package/tasks/install.yml +20 -19
- package/tasks/migrate.yml +26 -15
- package/tasks/project.yml +26 -0
- package/tasks/toolchain.yml +15 -5
- package/tasks/vbrief.yml +4 -3
- package/tasks/verify.yml +12 -14
- package/templates/agents-entry.md +1 -1
- package/scripts/_agents_md.py +0 -494
- package/scripts/_cache_fetch.py +0 -635
- package/scripts/_cache_quota.py +0 -529
- package/scripts/_cache_refresh.py +0 -163
- package/scripts/_cache_validate.py +0 -209
- package/scripts/_content_root.py +0 -42
- package/scripts/_doctor_state.py +0 -277
- package/scripts/_event_detect.py +0 -305
- package/scripts/_events.py +0 -514
- package/scripts/_lifecycle_hygiene.py +0 -568
- package/scripts/_pathspec.py +0 -91
- package/scripts/_policy_show_cli.py +0 -266
- package/scripts/_precutover.py +0 -92
- package/scripts/_project_context.py +0 -224
- package/scripts/_project_definition_io.py +0 -164
- package/scripts/_relocate_snapshot.py +0 -209
- package/scripts/_relocate_states.py +0 -343
- package/scripts/_resolve_preflight_path.py +0 -152
- package/scripts/_safe_subprocess.py +0 -167
- package/scripts/_session_start_hook.py +0 -205
- package/scripts/_sor_gate_diff.py +0 -365
- package/scripts/_stdio_utf8.py +0 -59
- package/scripts/_triage_bootstrap_gitignore.py +0 -904
- package/scripts/_triage_classify_cli.py +0 -122
- package/scripts/_triage_queue_cli.py +0 -625
- package/scripts/_triage_scope_cli.py +0 -343
- package/scripts/_triage_scope_drift_cli.py +0 -121
- package/scripts/_triage_scope_ignores.py +0 -286
- package/scripts/_triage_scope_milestone.py +0 -432
- package/scripts/_triage_scope_mutations.py +0 -337
- package/scripts/_triage_scope_renderers.py +0 -207
- package/scripts/_triage_smoketest_stages.py +0 -674
- package/scripts/_triage_subscribe_cli.py +0 -140
- package/scripts/_triage_welcome_cli.py +0 -421
- package/scripts/_vbrief_build.py +0 -239
- package/scripts/_vbrief_fidelity.py +0 -479
- package/scripts/_vbrief_legacy.py +0 -589
- package/scripts/_vbrief_reconciliation.py +0 -883
- package/scripts/_vbrief_routing.py +0 -277
- package/scripts/_vbrief_safety.py +0 -778
- package/scripts/_vbrief_sources.py +0 -312
- package/scripts/_vbrief_speckit.py +0 -262
- package/scripts/_vbrief_story_quality.py +0 -353
- package/scripts/_vbrief_validation.py +0 -299
- package/scripts/build_dist.py +0 -412
- package/scripts/cache.py +0 -1078
- package/scripts/cache_scanner.py +0 -745
- package/scripts/candidates_log.py +0 -432
- package/scripts/capacity_backfill.py +0 -680
- package/scripts/capacity_show.py +0 -653
- package/scripts/ci_local.py +0 -689
- package/scripts/code_structure_validate.py +0 -765
- package/scripts/codebase_default_extractor.py +0 -495
- package/scripts/codebase_map.py +0 -304
- package/scripts/codebase_map_fresh.py +0 -104
- package/scripts/codebase_projection_registry.py +0 -94
- package/scripts/codebase_provider.py +0 -582
- package/scripts/doctor.py +0 -2552
- package/scripts/framework_commands.py +0 -505
- package/scripts/gh_rest.py +0 -882
- package/scripts/github_auth_modes.py +0 -437
- package/scripts/github_body.py +0 -292
- package/scripts/ip_risk.py +0 -531
- package/scripts/issue_emit.py +0 -670
- package/scripts/issue_ingest.py +0 -1064
- package/scripts/migrate_preflight.py +0 -418
- package/scripts/migrate_vbrief.py +0 -2677
- package/scripts/monitor_pr.py +0 -401
- package/scripts/pack_migrate_lessons.py +0 -336
- package/scripts/pack_migrate_patterns.py +0 -254
- package/scripts/pack_migrate_rules.py +0 -350
- package/scripts/pack_migrate_skills.py +0 -423
- package/scripts/pack_migrate_strategies.py +0 -311
- package/scripts/pack_migrate_swarm_spec.py +0 -250
- package/scripts/pack_render.py +0 -434
- package/scripts/packs_slice.py +0 -712
- package/scripts/platform_capabilities.py +0 -336
- package/scripts/policy.py +0 -2826
- package/scripts/policy_set.py +0 -324
- package/scripts/pr_check_closing_keywords.py +0 -524
- package/scripts/pr_check_protected_issues.py +0 -267
- package/scripts/pr_merge_readiness.py +0 -1004
- package/scripts/pr_wait_mergeable.py +0 -669
- package/scripts/prd_render.py +0 -159
- package/scripts/preflight_architecture_sor.py +0 -974
- package/scripts/preflight_branch.py +0 -289
- package/scripts/preflight_cache.py +0 -974
- package/scripts/preflight_gh.py +0 -721
- package/scripts/preflight_implementation.py +0 -272
- package/scripts/preflight_story_start.py +0 -838
- package/scripts/preflight_wip_cap.py +0 -149
- package/scripts/probe_session.py +0 -545
- package/scripts/project_render.py +0 -293
- package/scripts/quarantine_ext.py +0 -237
- package/scripts/reconcile_issues.py +0 -1442
- package/scripts/refresh-path.ps1 +0 -107
- package/scripts/release.py +0 -2030
- package/scripts/release_e2e.py +0 -1011
- package/scripts/release_publish.py +0 -486
- package/scripts/release_rollback.py +0 -980
- package/scripts/relocate.py +0 -1034
- package/scripts/resolve_changelog_unreleased.py +0 -667
- package/scripts/resolve_version.py +0 -490
- package/scripts/resume_conditions.py +0 -706
- package/scripts/ritual_sentinel.py +0 -609
- package/scripts/roadmap_render.py +0 -635
- package/scripts/rule_ownership_lint.py +0 -325
- package/scripts/scm.py +0 -591
- package/scripts/scope_audit_log.py +0 -387
- package/scripts/scope_decompose.py +0 -654
- package/scripts/scope_demote.py +0 -509
- package/scripts/scope_lifecycle.py +0 -1126
- package/scripts/scope_undo.py +0 -772
- package/scripts/session_start.py +0 -406
- package/scripts/setup_ghx.py +0 -339
- package/scripts/setup_windows.ps1 +0 -220
- package/scripts/slice_audit.py +0 -585
- package/scripts/slice_record.py +0 -530
- package/scripts/slice_record_existing.py +0 -692
- package/scripts/slug_normalize.py +0 -178
- package/scripts/spec_render.py +0 -477
- package/scripts/spec_validate.py +0 -238
- package/scripts/subagent_monitor.py +0 -658
- package/scripts/swarm_complete_cohort.py +0 -644
- package/scripts/swarm_launch.py +0 -1206
- package/scripts/swarm_readiness.py +0 -554
- package/scripts/swarm_verify_review_clean.py +0 -438
- package/scripts/swarm_worktrees.py +0 -497
- package/scripts/toolchain-check.py +0 -52
- package/scripts/triage_actions.py +0 -871
- package/scripts/triage_bootstrap.py +0 -1153
- package/scripts/triage_bulk.py +0 -630
- package/scripts/triage_classify.py +0 -932
- package/scripts/triage_help.py +0 -1685
- package/scripts/triage_queue.py +0 -1944
- package/scripts/triage_reconcile.py +0 -581
- package/scripts/triage_refresh.py +0 -643
- package/scripts/triage_scope.py +0 -999
- package/scripts/triage_scope_drift.py +0 -575
- package/scripts/triage_smoketest.py +0 -396
- package/scripts/triage_subscribe.py +0 -399
- package/scripts/triage_summary.py +0 -1011
- package/scripts/triage_welcome.py +0 -1178
- package/scripts/ts_check_lane.py +0 -86
- package/scripts/validate-links.py +0 -64
- package/scripts/validate_strategy_output.py +0 -212
- package/scripts/vbrief_activate.py +0 -228
- package/scripts/vbrief_migrate_conformance.py +0 -368
- package/scripts/vbrief_reconcile_graph.py +0 -306
- package/scripts/vbrief_reconcile_labels.py +0 -460
- package/scripts/vbrief_reconcile_umbrellas.py +0 -741
- package/scripts/vbrief_validate.py +0 -1144
- package/scripts/verify-stubs.py +0 -61
- package/scripts/verify_capacity.py +0 -160
- package/scripts/verify_encoding.py +0 -699
- package/scripts/verify_hooks_installed.py +0 -206
- package/scripts/verify_investigation.py +0 -360
- package/scripts/verify_judgment_gates.py +0 -827
- package/scripts/verify_no_task_runtime.py +0 -171
- package/scripts/verify_scm_boundary.py +0 -509
- package/scripts/verify_session_ritual.py +0 -389
- package/scripts/verify_tools.py +0 -426
- package/scripts/verify_vbrief_conformance.py +0 -478
|
@@ -1,699 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""verify_encoding.py -- deterministic gate against PS 5.1 non-ASCII round-trip corruption (#798).
|
|
3
|
-
|
|
4
|
-
Pure stdlib, cross-platform. Invoked from:
|
|
5
|
-
|
|
6
|
-
- ``.githooks/pre-commit`` via ``--staged`` after ``preflight_branch.py`` (#747)
|
|
7
|
-
- ``task verify:encoding`` (aggregated into ``task check``) via ``--all``
|
|
8
|
-
- ``uv run python scripts/verify_encoding.py [--staged|--all] [--allow-list <path>]``
|
|
9
|
-
|
|
10
|
-
Recurrence chain (this gate elevates the rule from prose tier to deterministic
|
|
11
|
-
tier per main.md Rule Authority [AXIOM]):
|
|
12
|
-
|
|
13
|
-
- #236 t1.11.1 -- ``Get-Content -Raw`` + BOM-safe write rules in scm/github.md
|
|
14
|
-
- #240 t1.11.2 -- Warp multi-line PS here-string rule in scm/github.md
|
|
15
|
-
- #283 t1.20.1 -- ``New-Object System.Text.UTF8Encoding $false`` rule in AGENTS.md
|
|
16
|
-
- PR #795 (2026-05-01) -- 132-line CHANGELOG mojibake on the same maintainer
|
|
17
|
-
with all three rules loaded, because the corruption happened on the READ side
|
|
18
|
-
(``Get-Content -Raw`` decodes via the active codepage, typically CP1252 or
|
|
19
|
-
CP437 on Windows) BEFORE any safe write could preserve the bytes.
|
|
20
|
-
|
|
21
|
-
Detection scope (UTF-8 codepoint sequences that appear after a Windows
|
|
22
|
-
codepage round-trip):
|
|
23
|
-
|
|
24
|
-
- U+FFFD replacement chars (universal corruption marker).
|
|
25
|
-
- CP1252-as-UTF-8 mojibake bigrams (``Â§``, ``Â°``, ``â€™``, ``â€¦``, ``â†’`` ...).
|
|
26
|
-
- CP437-as-UTF-8 mojibake bigrams (``Γèù``, ``Γ£ô``, ``ΓÇª``, ``ΓÇö`` ...).
|
|
27
|
-
- Unexpected UTF-8 BOM (``EF BB BF``) on text formats where BOM is non-canonical
|
|
28
|
-
(.md, .json, .yml, .yaml, .txt).
|
|
29
|
-
|
|
30
|
-
False-positive guards:
|
|
31
|
-
|
|
32
|
-
- Markdown inline code spans (single backticks) and fenced code blocks (triple
|
|
33
|
-
backticks) are stripped before scanning .md files -- recurrence-record prose
|
|
34
|
-
legitimately quotes mojibake bytes inside backticks.
|
|
35
|
-
- A built-in allow-list skips the #798 brief itself (which documents the
|
|
36
|
-
bigram catalog as part of its acceptance criteria).
|
|
37
|
-
- ``--allow-list <path>`` accepts a newline-separated list of glob patterns
|
|
38
|
-
for project-specific documented exceptions (e.g. regression fixtures).
|
|
39
|
-
|
|
40
|
-
Exit codes (three-state, mirrors ``scripts/preflight_branch.py``):
|
|
41
|
-
|
|
42
|
-
- ``0`` -- clean: no mojibake / U+FFFD / unexpected BOM detected.
|
|
43
|
-
- ``1`` -- corruption found: prints per-hit ``path:line:[label] context``.
|
|
44
|
-
- ``2`` -- config error: ``--allow-list`` path unreadable, ``--staged``
|
|
45
|
-
outside a git repo, or unrecognised CLI shape.
|
|
46
|
-
"""
|
|
47
|
-
|
|
48
|
-
from __future__ import annotations
|
|
49
|
-
|
|
50
|
-
import argparse
|
|
51
|
-
import fnmatch
|
|
52
|
-
import json
|
|
53
|
-
import re
|
|
54
|
-
import subprocess
|
|
55
|
-
import sys
|
|
56
|
-
from collections.abc import Iterable
|
|
57
|
-
from pathlib import Path
|
|
58
|
-
|
|
59
|
-
#: Codepoint sequences that signal a Windows codepage round-trip corruption.
|
|
60
|
-
#: Each entry maps a mojibake bigram to a short label naming the canonical
|
|
61
|
-
#: codepoint that was corrupted. The set is intentionally CONSERVATIVE --
|
|
62
|
-
#: only the bigrams observed in the four-recurrence record (#236, #240, #283,
|
|
63
|
-
#: PR #795 / #844) plus the most common Windows-codepage analogues are listed.
|
|
64
|
-
#: Adding a pattern here MUST be paired with a parametrized regression test
|
|
65
|
-
#: in ``tests/cli/test_verify_encoding.py``.
|
|
66
|
-
MOJIBAKE_PATTERNS: dict[str, str] = {
    # Every key is the exact text produced when the UTF-8 bytes of the
    # canonical character are decoded with the wrong Windows codepage.
    # Keys and the quoted canonical characters are spelled with escapes so
    # the catalog is pure ASCII: a tool that re-encodes or "repairs" this
    # file cannot silently turn a mojibake key back into the legitimate
    # character (which would make the gate flag correct text).
    #
    # CP437-as-UTF-8 (Windows DOS codepage; recurrence record PR #844 / fix #846).
    # Pattern: UTF-8 bytes E2 XX YY decoded by cp437 yield U+0393 (Greek
    # capital gamma) followed by two cp437 glyphs.
    "\u0393\u00e8\u00f9": "U+2297 (\u2297) corrupted via cp437 read",  # E2 8A 97
    "\u0393\u00a3\u00f4": "U+2713 (\u2713) corrupted via cp437 read",  # E2 9C 93
    "\u0393\u00c7\u00aa": "U+2026 (\u2026) corrupted via cp437 read",  # E2 80 A6
    "\u0393\u00c7\u00f6": "U+2014 (\u2014) corrupted via cp437 read",  # E2 80 94
    "\u0393\u00c7\u00f4": "U+2013 (\u2013) corrupted via cp437 read",  # E2 80 93
    "\u0393\u00c7\u00f3": "U+2022 (\u2022) corrupted via cp437 read",  # E2 80 A2
    "\u0393\u00c7\u00d6": "U+2019 (\u2019) corrupted via cp437 read",  # E2 80 99
    "\u0393\u00c7\u00ff": "U+2018 (\u2018) corrupted via cp437 read",  # E2 80 98
    "\u0393\u00c7\u00a3": "U+201C (\u201c) corrupted via cp437 read",  # E2 80 9C
    "\u0393\u00c7\u00a5": "U+201D (\u201d) corrupted via cp437 read",  # E2 80 9D
    # NOTE(review): cp437 maps byte 0x9D to U+00A5 (yen sign), so the entry
    # above is the real decoding. The O-with-stroke spelling below comes from
    # the earlier catalog and is retained so nothing that depended on it
    # regresses -- confirm whether it can be dropped.
    "\u0393\u00c7\u00d8": "U+201D (\u201d) corrupted via cp437 read",
    "\u0393\u00e5\u00c6": "U+2192 (\u2192) corrupted via cp437 read",  # E2 86 92
    # CP1252-as-UTF-8 (Windows ANSI codepage; recurrence record #236, #240, #283, PR #795).
    # Pattern: UTF-8 bytes (prefixed C2 or E2 here) decoded by cp1252.
    "\u00e2\u20ac\u2122": "U+2019 (\u2019) corrupted via cp1252 read",  # E2 80 99
    "\u00e2\u20ac\u02dc": "U+2018 (\u2018) corrupted via cp1252 read",  # E2 80 98
    "\u00e2\u20ac\u0153": "U+201C (\u201c) corrupted via cp1252 read",  # E2 80 9C
    # Byte 0x9D is unassigned in cp1252; the key keeps the raw C1 control
    # U+009D in that position.
    "\u00e2\u20ac\x9d": "U+201D (\u201d) corrupted via cp1252 read",  # E2 80 9D
    "\u00e2\u20ac\u201c": "U+2013 (\u2013) corrupted via cp1252 read",  # E2 80 93
    "\u00e2\u20ac\u201d": "U+2014 (\u2014) corrupted via cp1252 read",  # E2 80 94
    "\u00e2\u20ac\u00a6": "U+2026 (\u2026) corrupted via cp1252 read",  # E2 80 A6
    "\u00e2\u20ac\u00a2": "U+2022 (\u2022) corrupted via cp1252 read",  # E2 80 A2
    "\u00e2\u2020\u2019": "U+2192 (\u2192) corrupted via cp1252 read",  # E2 86 92
    "\u00c2\u00a7": "U+00A7 (\u00a7) corrupted via cp1252 read",  # C2 A7
    "\u00c2\u00b0": "U+00B0 (\u00b0) corrupted via cp1252 read",  # C2 B0
    "\u00c2\u00b4": "U+00B4 (\u00b4) corrupted via cp1252 read",  # C2 B4
    "\u00c2\u00ad": "U+00AD (soft hyphen) corrupted via cp1252 read",  # C2 AD
    "\u00c2\u00a9": "U+00A9 (\u00a9) corrupted via cp1252 read",  # C2 A9
    "\u00c2\u00ae": "U+00AE (\u00ae) corrupted via cp1252 read",  # C2 AE
    "\u00c2\u00b1": "U+00B1 (\u00b1) corrupted via cp1252 read",  # C2 B1
}
|
|
99
|
-
|
|
100
|
-
#: U+FFFD REPLACEMENT CHARACTER -- what ``bytes.decode(..., errors='replace')``
#: substitutes for input bytes it cannot decode. Kept apart from
#: MOJIBAKE_PATTERNS because it signals corruption regardless of which
#: codepage was involved.
REPLACEMENT_CHAR = "\N{REPLACEMENT CHARACTER}"

#: The three-byte UTF-8 byte-order mark (``EF BB BF``). Some formats accept
#: it (.ps1, .csv on Windows); markdown / JSON / YAML / plain text do not --
#: there it corrupts downstream parsers and is the signature of a PS 5.1
#: ``Set-Content -Encoding UTF8`` write.
UTF8_BOM = bytes((0xEF, 0xBB, 0xBF))

#: Extensions for which a leading UTF-8 BOM is non-canonical and gets
#: flagged. Other extensions (.csv, .ps1, .bat) tolerate or expect a BOM.
NO_BOM_EXTENSIONS = frozenset((".md", ".json", ".yml", ".yaml", ".txt"))
|
|
114
|
-
|
|
115
|
-
#: Control characters that must not hide inside decoded vBRIEF narratives.
|
|
116
|
-
#: JSON serializes these as escapes (for example ``\u000b``), so the raw
|
|
117
|
-
#: text scanner below cannot see them until the vBRIEF is parsed.
|
|
118
|
-
VBRIEF_CONTROL_CHAR_LABELS: dict[str, str] = {
    # Keyed by the decoded control character; the value is the finding label.
    chr(0x08): "U+0008 backspace in vBRIEF narrative",
    chr(0x09): "U+0009 tab in vBRIEF narrative",
    chr(0x0B): "U+000B vertical tab in vBRIEF narrative",
    chr(0x0C): "U+000C form feed in vBRIEF narrative",
}
|
|
124
|
-
|
|
125
|
-
#: File extensions to scan by default. Conservative -- excludes binary formats
|
|
126
|
-
#: and source files where the cost/benefit of mojibake detection is lower.
|
|
127
|
-
SCANNABLE_EXTENSIONS = frozenset(
    (
        # documentation and data formats
        ".md", ".json", ".yml", ".yaml", ".txt",
        # scripts
        ".py", ".sh", ".ps1",
        # configuration
        ".toml", ".cfg",
    )
)
|
|
141
|
-
|
|
142
|
-
#: Path-glob patterns auto-skipped because the file legitimately contains
|
|
143
|
-
#: mojibake byte sequences as part of its purpose. Each entry is matched
|
|
144
|
-
#: against the path's POSIX form (forward slashes) via ``fnmatch.fnmatchcase``.
|
|
145
|
-
#: When a future recurrence-record vBRIEF documents a new bigram, append its
|
|
146
|
-
#: rel-path here -- the rule body lives in this gate, NOT in prose.
|
|
147
|
-
BUILTIN_ALLOW_LIST: tuple[str, ...] = (
    # The #798 brief catalogues the bigram set being detected; quoting the
    # bigrams in its narrative is the brief's own acceptance criterion.
    # One glob per lifecycle folder, repeated for each root the content can
    # live under (repo root, ".deft/core/", "deft/").
    *(
        f"{root}vbrief/{state}/*-798-*.vbrief.json"
        for root in ("", ".deft/core/", "deft/")
        for state in ("active", "completed", "cancelled", "pending", "proposed")
    ),
    # history/archive/ preserves historical task / vbrief state byte-for-byte.
    # Pre-existing mojibake in archived artifacts (e.g. v0.20 migration
    # residue) is intentionally retained as a forensic record and MUST NOT
    # be rewritten.
    *(
        f"{root}history/archive/{tail}"
        for root in ("", ".deft/core/", "deft/")
        for tail in ("**", "**/*")
    ),
    # Self-skip: this script and its test file are the canonical catalog of
    # the bigrams being detected, so scanning them would flag every entry of
    # MOJIBAKE_PATTERNS against the file that defines it. Forward coverage is
    # upheld by tests/cli/test_verify_encoding.py (parametrized over
    # MOJIBAKE_PATTERNS), not by the gate scanning itself.
    *(
        f"{root}{member}"
        for root in ("", ".deft/core/", "deft/")
        for member in ("scripts/verify_encoding.py", "tests/cli/test_verify_encoding.py")
    ),
)
|
|
186
|
-
|
|
187
|
-
#: Markdown inline-code span: one backtick, then any run of characters that
#: are neither a backtick nor CR/LF, then the closing backtick. The negated
#: character class (not a lazy quantifier) is what stops the match at the
#: next backtick, so a span never crosses a line boundary and a line like
#: `` `foo` and `bar` `` produces two separate matches, not one.
_MD_INLINE_CODE = re.compile(r"`[^`\r\n]*`")

#: Markdown fenced code block: an opening line of optional spaces/tabs plus
#: ``` or ~~~ (anything may follow on that line, e.g. a language tag), then
#: the shortest possible body, then a closing line made of optional
#: spaces/tabs and the SAME three-character marker (backreference ``\1``).
#: CRLF-robust: the trailing class after the closing marker includes ``\r``
#: because MULTILINE ``$`` matches just before ``\n``, leaving the ``\r`` of
#: a CRLF line to be absorbed explicitly. ``(?ms)`` enables MULTILINE (``^``
#: and ``$`` per line) and DOTALL (``.`` spans newlines). A fence that is
#: never closed does not match at all, so its content is scanned normally.
_MD_FENCED_BLOCK = re.compile(r"(?ms)^[ \t]*(```|~~~)[^\n]*\n.*?^[ \t]*\1[ \t\r]*$")
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
class Finding:
    """A single detection: where it is, what it is, and the offending text.

    Attributes are plain data: ``path`` (reported file path), ``line``
    (1-based line number), ``label`` (short description of the hit) and
    ``context`` (the source line or an explanatory message).
    """

    __slots__ = ("path", "line", "label", "context")

    def __init__(self, path: str, line: int, label: str, context: str) -> None:
        self.path, self.line = path, line
        self.label, self.context = label, context

    def render(self) -> str:
        """Return the one-line report form, truncating long context.

        Context longer than 120 characters is cut to its first 117
        characters followed by ``...``.
        """
        snippet = self.context
        if len(snippet) > 120:
            snippet = snippet[:117] + "..."
        return f" {self.path}:{self.line} [{self.label}] {snippet}"
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
def _blank_block(match: re.Match[str]) -> str:
|
|
219
|
-
"""Replace a fenced code block with the same number of newlines.
|
|
220
|
-
|
|
221
|
-
Greptile P1 (PR #862): the prior implementation used
|
|
222
|
-
``_MD_FENCED_BLOCK.sub("", text)`` which removed the newlines that lived
|
|
223
|
-
INSIDE the matched fence. After substitution every line that followed in
|
|
224
|
-
``scan_text`` shifted upward by the number of consumed newlines, so a
|
|
225
|
-
mojibake hit AFTER a fenced block was reported at the wrong line number
|
|
226
|
-
with the wrong context (and the true line was not reported at all). The
|
|
227
|
-
gate still exited 1 -- corruption did not silently pass -- but the
|
|
228
|
-
diagnostic was misleading.
|
|
229
|
-
|
|
230
|
-
Replacing with ``\n`` * count preserves line-count alignment between
|
|
231
|
-
``original_lines`` and ``stripped_lines`` so the zip in :func:`scan_file`
|
|
232
|
-
pairs each original line with its stripped counterpart at the same index.
|
|
233
|
-
"""
|
|
234
|
-
return "\n" * match.group(0).count("\n")
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
def _strip_markdown_quotes(text: str) -> str:
    """Remove quoted code from markdown so only prose is left to scan.

    Recurrence-record documentation legitimately quotes mojibake inside
    backticks (for example CHANGELOG entries describing a fixed
    corruption); scanning those quotes would make the gate flag its own
    documentation.

    Two passes, in this order:

    1. Fenced blocks become blank lines (see :func:`_blank_block`), which
       keeps later line numbers aligned with the original text. Fences go
       first because their bodies may themselves contain backticks.
    2. Inline single-backtick spans are deleted outright; they never span
       lines, so the line count is unaffected.
    """
    without_fences = _MD_FENCED_BLOCK.sub(_blank_block, text)
    without_spans = _MD_INLINE_CODE.sub("", without_fences)
    return without_spans
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
def _load_allow_list(path: Path | None) -> list[str]:
|
|
257
|
-
"""Read newline-separated glob patterns from ``path``; ignore comments.
|
|
258
|
-
|
|
259
|
-
Lines starting with ``#`` and blank lines are skipped. Returns an empty
|
|
260
|
-
list when ``path`` is ``None``. Raises :class:`FileNotFoundError` when
|
|
261
|
-
a non-``None`` path does not exist (caller maps to exit 2).
|
|
262
|
-
"""
|
|
263
|
-
if path is None:
|
|
264
|
-
return []
|
|
265
|
-
raw = path.read_text(encoding="utf-8", errors="replace")
|
|
266
|
-
out: list[str] = []
|
|
267
|
-
for line in raw.splitlines():
|
|
268
|
-
stripped = line.strip()
|
|
269
|
-
if not stripped or stripped.startswith("#"):
|
|
270
|
-
continue
|
|
271
|
-
out.append(stripped)
|
|
272
|
-
return out
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
def _is_allow_listed(rel_path: str, patterns: Iterable[str]) -> bool:
|
|
276
|
-
"""Return True when ``rel_path`` (POSIX form) matches any glob in patterns."""
|
|
277
|
-
return any(fnmatch.fnmatchcase(rel_path, pat) for pat in patterns)
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
def _git_tracked_files(project_root: Path) -> list[str]:
    """Return the paths printed by ``git ls-files`` as POSIX-form rel paths.

    The command runs with ``-z`` so git emits NUL-terminated, verbatim
    paths. Without it git C-quotes any path containing non-ASCII bytes
    (wrapping it in double quotes with octal escapes), and such an entry
    no longer names a file on disk -- a file with a non-ASCII name would
    then never reach the scanner.

    Output is decoded as UTF-8 rather than with the locale codepage so a
    Windows ANSI/OEM codepage cannot garble path names; ``surrogateescape``
    keeps undecodable bytes round-trippable instead of raising.

    Args:
        project_root: Directory in which git is executed.

    Returns:
        One entry per tracked path; blank entries are dropped.

    Raises:
        RuntimeError: git exited non-zero (stderr is included in the message).
    """
    proc = subprocess.run(
        ["git", "ls-files", "-z"],
        cwd=str(project_root),
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="surrogateescape",
        check=False,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"git ls-files failed (rc={proc.returncode}): {proc.stderr.strip()}")
    # -z terminates every path with NUL, so the final split element is empty.
    return [path for path in proc.stdout.split("\0") if path.strip()]
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
def _git_staged_files(project_root: Path) -> list[str]:
    """Return staged paths from ``git diff --cached --name-only`` as rel paths.

    ``--diff-filter=ACMR`` restricts the listing to added, copied, modified
    and renamed entries. ``-z`` makes git emit NUL-terminated, verbatim
    paths; without it a path containing non-ASCII bytes is C-quoted
    (double quotes plus octal escapes) and no longer names a file on disk,
    so it would never reach the scanner.

    Output is decoded as UTF-8 rather than with the locale codepage so a
    Windows ANSI/OEM codepage cannot garble path names; ``surrogateescape``
    keeps undecodable bytes round-trippable instead of raising.

    Args:
        project_root: Directory in which git is executed.

    Returns:
        One entry per staged path; blank entries are dropped.

    Raises:
        RuntimeError: git exited non-zero, e.g. when run outside a
            repository (stderr is included in the message).
    """
    proc = subprocess.run(
        ["git", "diff", "--cached", "--name-only", "--diff-filter=ACMR", "-z"],
        cwd=str(project_root),
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="surrogateescape",
        check=False,
    )
    if proc.returncode != 0:
        raise RuntimeError(
            f"git diff --cached failed (rc={proc.returncode}): {proc.stderr.strip()}"
        )
    # -z terminates every path with NUL, so the final split element is empty.
    return [path for path in proc.stdout.split("\0") if path.strip()]
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
def scan_file(rel_path: str, full_path: Path) -> list[Finding]:
    """Scan one file for U+FFFD / mojibake / unexpected BOM.

    Args:
        rel_path: Path used in the reported findings and for deciding
            whether the vBRIEF narrative check applies.
        full_path: Location the bytes are actually read from.

    Returns:
        One :class:`Finding` per hit. A file that cannot be read yields an
        empty list rather than raising -- the gate is intentionally
        permissive on read failures so a single unreadable file does not
        block a whole pre-commit. A file with a NUL in its first 1024
        decoded characters is treated as binary: only a BOM finding (if
        any) is returned.
    """
    findings: list[Finding] = []
    suffix = full_path.suffix.lower()

    try:
        raw = full_path.read_bytes()
    except OSError:
        return findings

    if suffix in NO_BOM_EXTENSIONS and raw.startswith(UTF8_BOM):
        findings.append(
            Finding(
                rel_path,
                1,
                "unexpected UTF-8 BOM",
                "leading bytes EF BB BF on a format where BOM is non-canonical",
            )
        )

    # errors="replace" never raises: undecodable bytes become U+FFFD, which
    # the per-line scan reports.
    text = raw.decode("utf-8", errors="replace")

    if "\x00" in text[:1024]:
        # Likely binary file -- skip mojibake scan.
        return findings

    # Markdown is scanned with quoted code removed; everything else as-is.
    scan_text = _strip_markdown_quotes(text) if suffix == ".md" else text

    # Split on "\n" only, NOT str.splitlines(): splitlines() also breaks on
    # VT, FF, U+0085, U+2028 and friends, whereas fenced-block stripping
    # preserves exactly the "\n" characters of each block. Splitting both
    # forms on "\n" therefore keeps them index-aligned even when a fence
    # contains one of those characters, and makes reported line numbers
    # match what git and editors show. (A lone "\r" is not a line break
    # here, consistent with the fence regex.)
    original_lines = text.split("\n")
    stripped_lines = scan_text.split("\n")
    # Defensive padding: never let a shorter stripped form drop late lines.
    if len(stripped_lines) < len(original_lines):
        stripped_lines += [""] * (len(original_lines) - len(stripped_lines))
    for lineno, (orig, stripped) in enumerate(zip(original_lines, stripped_lines), 1):
        # Match against the stripped form, but show the original line
        # (minus the "\r" of a CRLF ending) as diagnostic context.
        findings.extend(_scan_line(rel_path, lineno, stripped, context=orig.rstrip("\r")))

    if _is_vbrief_narrative_control_scope(rel_path):
        findings.extend(_scan_vbrief_narrative_controls(rel_path, text))

    return findings
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
def _scan_line(
    rel_path: str,
    lineno: int,
    line: str,
    *,
    context: str | None = None,
) -> list[Finding]:
    """Check a single line and return its findings.

    ``line`` is the text that is searched; ``context`` (defaulting to
    ``line``) is what gets recorded on each finding. A U+FFFD hit, if any,
    comes first, followed by one finding per matching mojibake pattern in
    catalog order. Each pattern is reported at most once per line.
    """
    shown = line if context is None else context
    hits: list[Finding] = []
    if REPLACEMENT_CHAR in line:
        hits.append(Finding(rel_path, lineno, "U+FFFD replacement char", shown))
    hits.extend(
        Finding(rel_path, lineno, label, shown)
        for bigram, label in MOJIBAKE_PATTERNS.items()
        if bigram in line
    )
    return hits
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
def _is_vbrief_narrative_control_scope(rel_path: str) -> bool:
|
|
406
|
-
"""Return True for in-flight vBRIEF files that may receive issue ingest."""
|
|
407
|
-
if not rel_path.endswith(".vbrief.json"):
|
|
408
|
-
return False
|
|
409
|
-
normalized_path = rel_path.replace("\\", "/")
|
|
410
|
-
normalized = f"/{normalized_path}"
|
|
411
|
-
return "/vbrief/proposed/" in normalized or "/vbrief/active/" in normalized
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
def _scan_vbrief_narrative_controls(rel_path: str, text: str) -> list[Finding]:
    """Flag control characters hidden inside ``plan.narratives`` values.

    The raw-text scan cannot see a control character that JSON stores as an
    escape sequence, because the bytes on disk are printable. This check
    parses the document and inspects the decoded narrative strings instead.

    Returns an empty list when ``text`` is not valid JSON or when the
    ``plan`` / ``narratives`` objects are missing or not JSON objects.
    Entries whose value is not a string are skipped. Each finding is
    anchored at the best-effort line of the narrative's key.
    """
    try:
        document = json.loads(text)
    except json.JSONDecodeError:
        return []

    # Walk document -> plan -> narratives, bailing out on any non-object.
    plan = document.get("plan") if isinstance(document, dict) else None
    narratives = plan.get("narratives") if isinstance(plan, dict) else None
    if not isinstance(narratives, dict):
        return []

    hits: list[Finding] = []
    for name, body in narratives.items():
        if not (isinstance(name, str) and isinstance(body, str)):
            continue
        line_no = _json_key_line(text, name)
        hits.extend(
            Finding(rel_path, line_no, label, f"plan.narratives.{name} contains {label}")
            for label in _decoded_control_labels(body)
        )
    return hits
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
def _decoded_control_labels(value: str) -> list[str]:
    """List the distinct control-character labels found in ``value``.

    Labels appear in order of first occurrence, each at most once.
    Newline and carriage return are always allowed, and a tab is allowed
    when it is part of leading indentation. Characters with a dedicated
    entry in ``VBRIEF_CONTROL_CHAR_LABELS`` use that label; any other
    character below U+0020 gets a generic ``U+XXXX`` label.
    """
    ordered: dict[str, None] = {}  # insertion-ordered set of labels
    for position, char in enumerate(value):
        if char == "\t" and not _tab_is_non_indentation(value, position):
            continue
        label = VBRIEF_CONTROL_CHAR_LABELS.get(char)
        if label is None:
            if ord(char) >= 32 or char in "\n\r":
                continue
            label = f"U+{ord(char):04X} control character in vBRIEF narrative"
        ordered.setdefault(label, None)
    return list(ordered)
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
def _tab_is_non_indentation(value: str, index: int) -> bool:
|
|
469
|
-
"""Treat tabs after prose as suspicious, but allow leading indentation."""
|
|
470
|
-
line_start = value.rfind("\n", 0, index) + 1
|
|
471
|
-
return any(ch not in " \t" for ch in value[line_start:index])
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
def _json_key_line(text: str, key: str) -> int:
|
|
475
|
-
"""Best-effort line number for a JSON object key."""
|
|
476
|
-
match = re.search(rf'"{re.escape(key)}"\s*:', text)
|
|
477
|
-
if match is None:
|
|
478
|
-
return 1
|
|
479
|
-
return text.count("\n", 0, match.start()) + 1
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
def _filter_scannable(
    rel_paths: Iterable[str],
    project_root: Path,
    allow_globs: Iterable[str],
) -> list[tuple[str, Path]]:
    """Keep only the relative paths that should actually be scanned.

    A path survives when, after resolution against *project_root*, it lies
    inside the resolved root, is an existing regular file, has a suffix in
    ``SCANNABLE_EXTENSIONS`` (compared case-insensitively), and is not
    matched by the allow-list. Each survivor is returned as
    ``(posix_relative_path, resolved_absolute_path)``.

    Security note (SLizard P1, PR #862): containment is decided solely with
    :meth:`Path.is_relative_to`, never with a string-prefix test. An earlier
    draft fell back to ``str(full).startswith(str(project_root.resolve()))``,
    which is open to substring path traversal: with ``project_root=/a/b`` a
    sibling such as ``/a/b-evil/file.txt`` would pass because ``/a/b`` is a
    textual prefix of ``/a/b-evil``. Segment-wise comparison (Python 3.9+;
    this project targets 3.12+) rules that class of mistake out. Paths
    outside the root are dropped without a diagnostic, since they cannot be
    tracked files of the working tree being checked.
    """
    patterns = list(allow_globs)
    root = project_root.resolve()
    selected: list[tuple[str, Path]] = []
    for rel in rel_paths:
        # Glob matching is done on forward-slash paths (git already emits them).
        normalized = rel.replace("\\", "/")
        candidate = (project_root / rel).resolve()
        eligible = (
            candidate.is_relative_to(root)
            and candidate.is_file()
            and candidate.suffix.lower() in SCANNABLE_EXTENSIONS
        )
        if eligible and not _is_allow_listed(normalized, patterns):
            selected.append((normalized, candidate))
    return selected
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
def evaluate(
    project_root: Path,
    *,
    mode: str = "all",
    allow_list_path: Path | None = None,
) -> tuple[int, list[Finding], str]:
    """Run the encoding gate and return ``(exit_code, findings, human_message)``.

    Printing and argument parsing are left to :func:`main`, so tests can
    reach every outcome without ``capsys`` plumbing or env-var leakage.

    Exit codes:
        0 -- every candidate file scanned clean.
        1 -- at least one finding; the message lists up to 50 of them.
        2 -- the gate could not run: unknown *mode*, allow-list file missing
             or unreadable, ``git`` not found, or git reporting an error.
    """

    def usage_error(message: str) -> tuple[int, list[Finding], str]:
        # Every "could not run" outcome has the same shape: code 2, no findings.
        return 2, [], message

    valid_modes = {"all", "staged"}
    if mode not in valid_modes:
        return usage_error(
            f"❌ verify_encoding: unrecognised mode '{mode}' (expected 'all' or 'staged')."
        )

    try:
        custom_globs = _load_allow_list(allow_list_path)
    except FileNotFoundError as exc:
        return usage_error(
            f"❌ verify_encoding: --allow-list file not found: {exc}\n"
            " Recovery: pass an existing path or omit the flag."
        )
    except OSError as exc:
        return usage_error(
            f"❌ verify_encoding: --allow-list unreadable: {exc}\n"
            " Recovery: check file permissions."
        )

    # Built-in exceptions come first, followed by the caller-supplied ones.
    allow_globs = list(BUILTIN_ALLOW_LIST) + custom_globs

    try:
        rel_paths = (
            _git_staged_files(project_root)
            if mode == "staged"
            else _git_tracked_files(project_root)
        )
    except FileNotFoundError:
        return usage_error(
            "❌ verify_encoding: 'git' executable not found on PATH.\n"
            " Recovery: install git or set DEFT_PYTHON to a python that "
            "can spawn git."
        )
    except RuntimeError as exc:
        return usage_error(
            f"❌ verify_encoding: git failed -- {exc}\n"
            " Recovery: ensure --project-root points at a git working tree."
        )

    candidates = _filter_scannable(rel_paths, project_root, allow_globs)
    findings: list[Finding] = [
        hit for rel, full in candidates for hit in scan_file(rel, full)
    ]

    if not findings:
        ok_message = (
            f"✓ verify_encoding: {len(candidates)} file(s) clean -- no mojibake / "
            "U+FFFD / unexpected-BOM detected (#798)."
        )
        return 0, findings, ok_message

    affected_paths = {f.path for f in findings}
    header = (
        f"❌ verify_encoding: detected {len(findings)} mojibake / "
        f"U+FFFD / unexpected-BOM hit(s) across {len(affected_paths)} "
        f"file(s) (#798).\n"
        " Root cause: PowerShell 5.1 Get-Content -Raw decodes via the active "
        "Windows codepage (cp1252 or cp437) on the READ side, BEFORE any\n"
        " safe write can preserve the bytes. Fix: rewrite the offending "
        "files with Python pathlib.Path.write_text(text, encoding='utf-8'),\n"
        " re-read from a clean source (git checkout HEAD -- <path>), and "
        "do NOT round-trip through PS 5.1 again. See AGENTS.md ## PowerShell.\n"
        " Allow-list a documented exception via --allow-list <path> "
        "(file with newline-separated glob patterns)."
    )
    # Cap the listing at 50 rendered findings and summarise the remainder.
    shown_limit = 50
    body = "\n".join(f.render() for f in findings[:shown_limit])
    overflow = len(findings) - shown_limit
    if overflow > 0:
        body += f"\n ... and {len(findings) - 50} more"
    return 1, findings, f"{header}\n{body}"
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
def _build_parser() -> argparse.ArgumentParser:
|
|
619
|
-
parser = argparse.ArgumentParser(
|
|
620
|
-
prog="verify_encoding.py",
|
|
621
|
-
description=(
|
|
622
|
-
"Deterministic gate against PS 5.1 non-ASCII round-trip "
|
|
623
|
-
"corruption (#798). Scans tracked text files for U+FFFD "
|
|
624
|
-
"replacement chars, the curated CP1252-as-UTF-8 / "
|
|
625
|
-
"CP437-as-UTF-8 mojibake bigram set, and unexpected UTF-8 "
|
|
626
|
-
"BOM on .md/.json/.yml/.yaml/.txt."
|
|
627
|
-
),
|
|
628
|
-
)
|
|
629
|
-
mode = parser.add_mutually_exclusive_group()
|
|
630
|
-
mode.add_argument(
|
|
631
|
-
"--all",
|
|
632
|
-
dest="mode",
|
|
633
|
-
action="store_const",
|
|
634
|
-
const="all",
|
|
635
|
-
help="Scan all tracked files via 'git ls-files' (default).",
|
|
636
|
-
)
|
|
637
|
-
mode.add_argument(
|
|
638
|
-
"--staged",
|
|
639
|
-
dest="mode",
|
|
640
|
-
action="store_const",
|
|
641
|
-
const="staged",
|
|
642
|
-
help=(
|
|
643
|
-
"Scan only staged files via 'git diff --cached --name-only' "
|
|
644
|
-
"(used by .githooks/pre-commit)."
|
|
645
|
-
),
|
|
646
|
-
)
|
|
647
|
-
parser.set_defaults(mode="all")
|
|
648
|
-
parser.add_argument(
|
|
649
|
-
"--project-root",
|
|
650
|
-
default=".",
|
|
651
|
-
help="Project root path (default: current working directory).",
|
|
652
|
-
)
|
|
653
|
-
parser.add_argument(
|
|
654
|
-
"--allow-list",
|
|
655
|
-
default=None,
|
|
656
|
-
help=(
|
|
657
|
-
"Path to a file with newline-separated glob patterns of "
|
|
658
|
-
"documented exceptions. Lines starting with # are comments."
|
|
659
|
-
),
|
|
660
|
-
)
|
|
661
|
-
parser.add_argument(
|
|
662
|
-
"--quiet",
|
|
663
|
-
action="store_true",
|
|
664
|
-
help="Suppress the OK message (errors still print).",
|
|
665
|
-
)
|
|
666
|
-
return parser
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: parse *argv*, run the gate, print the result.

    The success message goes to stdout unless ``--quiet`` was given; any
    non-zero outcome is always written to stderr. The exit code produced by
    :func:`evaluate` is returned unchanged.
    """
    # #814: switch stdout/stderr to UTF-8 before anything is printed. When
    # git invokes the hook on Windows, Python defaults both streams to
    # cp1252 (or cp437), and neither codepage has a glyph for the U+2713
    # success marker or the other non-ASCII glyphs in this script's
    # diagnostics. Mirrors the block in scripts/preflight_branch.py.
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")

    options = _build_parser().parse_args(argv)
    root = Path(options.project_root).resolve()
    allow_list = Path(options.allow_list).resolve() if options.allow_list else None

    exit_code, _unused_findings, message = evaluate(
        root,
        mode=options.mode,
        allow_list_path=allow_list,
    )
    if exit_code != 0:
        print(message, file=sys.stderr)
    elif not options.quiet:
        print(message)
    return exit_code
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
# Script entry: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|