@deftai/directive-content 0.55.2 → 0.56.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.githooks/pre-commit +143 -0
- package/.githooks/pre-push +121 -0
- package/QUICK-START.md +2 -2
- package/Taskfile.yml +934 -0
- package/UPGRADING.md +47 -1
- package/events/README.md +3 -3
- package/package.json +5 -4
- package/scripts/_agents_md.py +494 -0
- package/scripts/_cache_fetch.py +635 -0
- package/scripts/_cache_quota.py +529 -0
- package/scripts/_cache_refresh.py +163 -0
- package/scripts/_cache_validate.py +209 -0
- package/scripts/_content_root.py +42 -0
- package/scripts/_doctor_state.py +277 -0
- package/scripts/_event_detect.py +305 -0
- package/scripts/_events.py +514 -0
- package/scripts/_lifecycle_hygiene.py +568 -0
- package/scripts/_pathspec.py +91 -0
- package/scripts/_policy_show_cli.py +266 -0
- package/scripts/_precutover.py +92 -0
- package/scripts/_project_context.py +224 -0
- package/scripts/_project_definition_io.py +164 -0
- package/scripts/_relocate_snapshot.py +209 -0
- package/scripts/_relocate_states.py +343 -0
- package/scripts/_resolve_preflight_path.py +152 -0
- package/scripts/_safe_subprocess.py +167 -0
- package/scripts/_session_start_hook.py +205 -0
- package/scripts/_sor_gate_diff.py +365 -0
- package/scripts/_stdio_utf8.py +59 -0
- package/scripts/_triage_bootstrap_gitignore.py +904 -0
- package/scripts/_triage_classify_cli.py +122 -0
- package/scripts/_triage_queue_cli.py +625 -0
- package/scripts/_triage_scope_cli.py +343 -0
- package/scripts/_triage_scope_drift_cli.py +121 -0
- package/scripts/_triage_scope_ignores.py +286 -0
- package/scripts/_triage_scope_milestone.py +432 -0
- package/scripts/_triage_scope_mutations.py +337 -0
- package/scripts/_triage_scope_renderers.py +207 -0
- package/scripts/_triage_smoketest_stages.py +674 -0
- package/scripts/_triage_subscribe_cli.py +140 -0
- package/scripts/_triage_welcome_cli.py +421 -0
- package/scripts/_vbrief_build.py +239 -0
- package/scripts/_vbrief_fidelity.py +479 -0
- package/scripts/_vbrief_legacy.py +589 -0
- package/scripts/_vbrief_reconciliation.py +883 -0
- package/scripts/_vbrief_routing.py +277 -0
- package/scripts/_vbrief_safety.py +778 -0
- package/scripts/_vbrief_sources.py +312 -0
- package/scripts/_vbrief_speckit.py +262 -0
- package/scripts/_vbrief_story_quality.py +353 -0
- package/scripts/_vbrief_validation.py +299 -0
- package/scripts/build_dist.py +412 -0
- package/scripts/cache.py +1078 -0
- package/scripts/cache_scanner.py +745 -0
- package/scripts/candidates_log.py +432 -0
- package/scripts/capacity_backfill.py +680 -0
- package/scripts/capacity_show.py +653 -0
- package/scripts/ci_local.py +689 -0
- package/scripts/code_structure_validate.py +765 -0
- package/scripts/codebase_default_extractor.py +495 -0
- package/scripts/codebase_map.py +304 -0
- package/scripts/codebase_map_fresh.py +104 -0
- package/scripts/codebase_projection_registry.py +94 -0
- package/scripts/codebase_provider.py +582 -0
- package/scripts/doctor.py +2257 -0
- package/scripts/framework_commands.py +505 -0
- package/scripts/gh_rest.py +882 -0
- package/scripts/github_auth_modes.py +437 -0
- package/scripts/github_body.py +292 -0
- package/scripts/ip_risk.py +531 -0
- package/scripts/issue_emit.py +670 -0
- package/scripts/issue_ingest.py +1064 -0
- package/scripts/migrate_preflight.py +418 -0
- package/scripts/migrate_vbrief.py +2677 -0
- package/scripts/monitor_pr.py +401 -0
- package/scripts/pack_migrate_lessons.py +336 -0
- package/scripts/pack_migrate_patterns.py +254 -0
- package/scripts/pack_migrate_rules.py +350 -0
- package/scripts/pack_migrate_skills.py +423 -0
- package/scripts/pack_migrate_strategies.py +311 -0
- package/scripts/pack_migrate_swarm_spec.py +250 -0
- package/scripts/pack_render.py +434 -0
- package/scripts/packs_slice.py +712 -0
- package/scripts/platform_capabilities.py +336 -0
- package/scripts/policy.py +2826 -0
- package/scripts/policy_set.py +324 -0
- package/scripts/pr_check_closing_keywords.py +524 -0
- package/scripts/pr_check_protected_issues.py +267 -0
- package/scripts/pr_merge_readiness.py +1004 -0
- package/scripts/pr_wait_mergeable.py +669 -0
- package/scripts/prd_render.py +159 -0
- package/scripts/preflight_architecture_sor.py +974 -0
- package/scripts/preflight_branch.py +289 -0
- package/scripts/preflight_cache.py +974 -0
- package/scripts/preflight_gh.py +721 -0
- package/scripts/preflight_implementation.py +272 -0
- package/scripts/preflight_story_start.py +838 -0
- package/scripts/preflight_wip_cap.py +149 -0
- package/scripts/probe_session.py +545 -0
- package/scripts/project_render.py +293 -0
- package/scripts/quarantine_ext.py +237 -0
- package/scripts/reconcile_issues.py +1442 -0
- package/scripts/refresh-path.ps1 +107 -0
- package/scripts/release.py +2030 -0
- package/scripts/release_e2e.py +1011 -0
- package/scripts/release_publish.py +486 -0
- package/scripts/release_rollback.py +980 -0
- package/scripts/relocate.py +1034 -0
- package/scripts/resolve_changelog_unreleased.py +667 -0
- package/scripts/resolve_version.py +490 -0
- package/scripts/resume_conditions.py +706 -0
- package/scripts/ritual_sentinel.py +609 -0
- package/scripts/roadmap_render.py +635 -0
- package/scripts/rule_ownership_lint.py +325 -0
- package/scripts/scm.py +591 -0
- package/scripts/scope_audit_log.py +387 -0
- package/scripts/scope_decompose.py +654 -0
- package/scripts/scope_demote.py +509 -0
- package/scripts/scope_lifecycle.py +1126 -0
- package/scripts/scope_undo.py +772 -0
- package/scripts/session_start.py +406 -0
- package/scripts/setup_ghx.py +339 -0
- package/scripts/setup_windows.ps1 +220 -0
- package/scripts/slice_audit.py +585 -0
- package/scripts/slice_record.py +530 -0
- package/scripts/slice_record_existing.py +692 -0
- package/scripts/slug_normalize.py +178 -0
- package/scripts/spec_render.py +477 -0
- package/scripts/spec_validate.py +238 -0
- package/scripts/subagent_monitor.py +658 -0
- package/scripts/swarm_complete_cohort.py +644 -0
- package/scripts/swarm_launch.py +1206 -0
- package/scripts/swarm_readiness.py +554 -0
- package/scripts/swarm_verify_review_clean.py +438 -0
- package/scripts/swarm_worktrees.py +497 -0
- package/scripts/toolchain-check.py +52 -0
- package/scripts/triage_actions.py +871 -0
- package/scripts/triage_bootstrap.py +1153 -0
- package/scripts/triage_bulk.py +630 -0
- package/scripts/triage_classify.py +932 -0
- package/scripts/triage_help.py +1685 -0
- package/scripts/triage_queue.py +1944 -0
- package/scripts/triage_reconcile.py +581 -0
- package/scripts/triage_refresh.py +643 -0
- package/scripts/triage_scope.py +999 -0
- package/scripts/triage_scope_drift.py +575 -0
- package/scripts/triage_smoketest.py +396 -0
- package/scripts/triage_subscribe.py +399 -0
- package/scripts/triage_summary.py +1011 -0
- package/scripts/triage_welcome.py +1178 -0
- package/scripts/ts_check_lane.py +86 -0
- package/scripts/validate-links.py +64 -0
- package/scripts/validate_strategy_output.py +212 -0
- package/scripts/vbrief_activate.py +228 -0
- package/scripts/vbrief_migrate_conformance.py +368 -0
- package/scripts/vbrief_reconcile_graph.py +306 -0
- package/scripts/vbrief_reconcile_labels.py +460 -0
- package/scripts/vbrief_reconcile_umbrellas.py +741 -0
- package/scripts/vbrief_validate.py +1195 -0
- package/scripts/verify-stubs.py +61 -0
- package/scripts/verify_capacity.py +160 -0
- package/scripts/verify_encoding.py +699 -0
- package/scripts/verify_hooks_installed.py +206 -0
- package/scripts/verify_investigation.py +360 -0
- package/scripts/verify_judgment_gates.py +827 -0
- package/scripts/verify_no_task_runtime.py +171 -0
- package/scripts/verify_scm_boundary.py +509 -0
- package/scripts/verify_session_ritual.py +389 -0
- package/scripts/verify_tools.py +426 -0
- package/scripts/verify_vbrief_conformance.py +478 -0
- package/tasks/architecture.yml +13 -0
- package/tasks/cache.yml +69 -0
- package/tasks/capacity.yml +38 -0
- package/tasks/change.yml +46 -0
- package/tasks/changelog.yml +24 -0
- package/tasks/ci.yml +49 -0
- package/tasks/codebase.yml +47 -0
- package/tasks/commit.yml +30 -0
- package/tasks/core.yml +126 -0
- package/tasks/deployments.yml +54 -0
- package/tasks/framework.yml +74 -0
- package/tasks/install.yml +60 -0
- package/tasks/issue.yml +50 -0
- package/tasks/migrate.yml +73 -0
- package/tasks/packs.yml +92 -0
- package/tasks/policy.yml +75 -0
- package/tasks/pr.yml +89 -0
- package/tasks/prd.yml +39 -0
- package/tasks/project.yml +27 -0
- package/tasks/reconcile.yml +32 -0
- package/tasks/relocate.yml +56 -0
- package/tasks/roadmap.yml +28 -0
- package/tasks/scm.yml +126 -0
- package/tasks/scope-undo.yml +36 -0
- package/tasks/scope.yml +141 -0
- package/tasks/session.yml +19 -0
- package/tasks/setup.yml +37 -0
- package/tasks/slice.yml +69 -0
- package/tasks/spec.yml +41 -0
- package/tasks/swarm.yml +85 -0
- package/tasks/toolchain.yml +13 -0
- package/tasks/triage-actions.yml +94 -0
- package/tasks/triage-bootstrap.yml +43 -0
- package/tasks/triage-bulk.yml +75 -0
- package/tasks/triage-classify.yml +30 -0
- package/tasks/triage-queue.yml +50 -0
- package/tasks/triage-reconcile.yml +29 -0
- package/tasks/triage-scope-drift.yml +29 -0
- package/tasks/triage-scope.yml +31 -0
- package/tasks/triage-smoketest.yml +33 -0
- package/tasks/triage-subscribe.yml +36 -0
- package/tasks/triage-summary.yml +29 -0
- package/tasks/triage-welcome.yml +32 -0
- package/tasks/ts.yml +328 -0
- package/tasks/vbrief.yml +206 -0
- package/tasks/verify.yml +292 -0
- package/templates/agents-entry.md +1 -1
|
@@ -0,0 +1,745 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
r"""cache_scanner.py -- quarantine scanner v2 for the unified cache (#883 Story 2).
|
|
3
|
+
|
|
4
|
+
Public surface
|
|
5
|
+
--------------
|
|
6
|
+
|
|
7
|
+
``scan(content_md: str) -> ScanResult``
|
|
8
|
+
Run the three baseline categories over ``content_md`` and return a
|
|
9
|
+
structured :class:`ScanResult` carrying ``passed`` (False iff any
|
|
10
|
+
hard-fail severity flag fires), the per-category ``flags`` list, and
|
|
11
|
+
the ``transformed_content`` that the cache layer should persist as
|
|
12
|
+
``content.md`` when ``passed`` is True.
|
|
13
|
+
|
|
14
|
+
``SCANNER_VERSION``
|
|
15
|
+
Module-level SemVer string. Bumped per the documented rule:
|
|
16
|
+
|
|
17
|
+
- patch (``2.0.x``) -- pattern additions to an existing category
|
|
18
|
+
- minor (``2.x.0``) -- new category landed (e.g. shell-cmd-injection)
|
|
19
|
+
OR a material detection-policy change that alters which bodies flag
|
|
20
|
+
(e.g. v2.1.0 #949 strict-signal injection-heading tuning)
|
|
21
|
+
- major (``x.0.0``) -- semantic rewrite (e.g. cache:put hard-fails on
|
|
22
|
+
every fence-and-pass match instead of writing content.md)
|
|
23
|
+
|
|
24
|
+
Scanner v2 baseline categories
|
|
25
|
+
------------------------------
|
|
26
|
+
|
|
27
|
+
1. ``injection-heading`` -- severity ``fence-and-pass``. Tuned in v2.1.0 for
|
|
28
|
+
precision against organic GitHub issue templates (#949). The detector
|
|
29
|
+
no longer fires on bare imperative-shaped headings (``## STEP 1``,
|
|
30
|
+
``## Action items``, ``## Important notes``, ``## Task list``,
|
|
31
|
+
``## Background``, ...). It now requires a *structural* injection
|
|
32
|
+
signal before flagging: either (a) an instruction-override / role-hijack
|
|
33
|
+
phrase in the heading text -- ``IGNORE/DISREGARD/FORGET PREVIOUS``,
|
|
34
|
+
``SYSTEM:`` / ``ASSISTANT:`` / ``USER:`` / ``AGENT:`` / ``OVERRIDE:`` /
|
|
35
|
+
``DIRECTIVE:`` / ``ROLE:`` / ``INSTRUCTION(S):`` / ``PROMPT:`` /
|
|
36
|
+
``TOOL:`` / ``FUNCTION:`` at the heading's start -- or (b) a shell
|
|
37
|
+
vector inside the heading's body (``curl ... | sh``, ``wget ... | sh``,
|
|
38
|
+
``base64 -d``, ``eval``, ``sh -c``, ```eval `cmd``` ``). Plain-prose
|
|
39
|
+
lines with the same instruction-override phrasing also flag. The
|
|
40
|
+
structural-signal check is the sole gate -- there is no allowlist
|
|
41
|
+
short-circuit, so a benign-template heading whose tail smuggles an
|
|
42
|
+
injection phrase (e.g. ``## STEP 1 - Ignore previous instructions``)
|
|
43
|
+
still flags. ``quarantine_ext`` keeps its broader policy untouched --
|
|
44
|
+
this category owns its own detection + wrapping. The flag carries
|
|
45
|
+
``match_count`` = number of detected sections.
|
|
46
|
+
|
|
47
|
+
2. ``credentials`` -- severity ``hard-fail``. A curated regex set covering
|
|
48
|
+
the canonical exfiltratable secret shapes (``gh[pousr]_``, ``sk-`` /
|
|
49
|
+
``sk-ant-``, ``xox[bp]-``, ``AKIA``, PEM private-key headers, ``Bearer``
|
|
50
|
+
tokens, JWTs). When any pattern matches, ``passed`` is set to ``False``
|
|
51
|
+
and ``cache:put`` declines to write ``content.md``. The flag's
|
|
52
|
+
``detail`` field carries the pattern label (e.g. ``"github-pat"``)
|
|
53
|
+
NOT the matched bytes -- a redacted descriptor only, so the audit log
|
|
54
|
+
never persists the secret it caught.
|
|
55
|
+
|
|
56
|
+
3. ``invisible-unicode`` -- severity ``strip-and-pass``. A codepoint
|
|
57
|
+
membership test against the canonical bidi / zero-width / tag character
|
|
58
|
+
set (U+200B-200F, U+202A-202E, U+2060, U+2066-2069, U+FEFF,
|
|
59
|
+
U+E0000-U+E007F). Matched codepoints are stripped from
|
|
60
|
+
``transformed_content`` and the flag's ``match_count`` field records
|
|
61
|
+
how many were removed (the precise codepoint set is summarised in
|
|
62
|
+
``detail`` as a comma-separated list of ``U+XXXX`` labels).
|
|
63
|
+
|
|
64
|
+
Order of operations
|
|
65
|
+
-------------------
|
|
66
|
+
|
|
67
|
+
Within a single :func:`scan` call:
|
|
68
|
+
|
|
69
|
+
1. Invisible-unicode strip runs FIRST so subsequent categories scan the
|
|
70
|
+
visible-only text. A credential token that smuggles itself across a
|
|
71
|
+
word boundary using a U+200B (e.g. ``gh\u200bp_<...>``) would otherwise
|
|
72
|
+
slip past the credentials regex; stripping first closes that hole.
|
|
73
|
+
|
|
74
|
+
2. Credentials regex runs on the stripped text. The flag is recorded
|
|
75
|
+
immediately; we do NOT short-circuit the scan even when ``passed``
|
|
76
|
+
becomes False, because the meta.json audit trail is more useful with
|
|
77
|
+
the full flag list.
|
|
78
|
+
|
|
79
|
+
3. Injection-heading wrap runs LAST on the stripped text. The transform
|
|
80
|
+
is applied unconditionally; ``transformed_content`` is the
|
|
81
|
+
strip-then-fence output regardless of ``passed``. (Callers that ignore
|
|
82
|
+
the transform on hard-fail are fine -- ``cache:put`` writes
|
|
83
|
+
raw.json + meta.json only when ``passed`` is False, never the
|
|
84
|
+
transformed_content.)
|
|
85
|
+
|
|
86
|
+
CLI
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
The module is callable as a script for ad-hoc inspection:
|
|
90
|
+
|
|
91
|
+
python scripts/cache_scanner.py [<input-file>]
|
|
92
|
+
|
|
93
|
+
Reads input file (or stdin), runs :func:`scan`, and writes the JSON
|
|
94
|
+
representation of the :class:`ScanResult` to stdout. Exit code is
|
|
95
|
+
0 when scan_result.passed is True, 2 when False -- mirrors the cache:put
|
|
96
|
+
exit-code contract so a caller piping content through the scanner gets
|
|
97
|
+
an actionable signal without having to parse the JSON.
|
|
98
|
+
"""
|
|
99
|
+
|
|
100
|
+
from __future__ import annotations
|
|
101
|
+
|
|
102
|
+
import json
|
|
103
|
+
import re
|
|
104
|
+
import sys
|
|
105
|
+
from dataclasses import asdict, dataclass, field
|
|
106
|
+
from datetime import UTC, datetime
|
|
107
|
+
from pathlib import Path
|
|
108
|
+
|
|
109
|
+
# Make ``scripts`` importable when this file is invoked via
|
|
110
|
+
# ``python scripts/cache_scanner.py`` from a Taskfile dispatch.
|
|
111
|
+
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
|
112
|
+
|
|
113
|
+
# ---------------------------------------------------------------------------
|
|
114
|
+
# Constants
|
|
115
|
+
# ---------------------------------------------------------------------------
|
|
116
|
+
|
|
117
|
+
#: Module-level scanner SemVer. The version is persisted into every
|
|
118
|
+
#: meta.json scan_result.scanner_version field on cache:put so a future
|
|
119
|
+
#: cache:doctor --rescan (deferred to v2) can detect entries written by
|
|
120
|
+
#: an older scanner and re-run them. Bump rules in module docstring.
|
|
121
|
+
#:
|
|
122
|
+
#: 2.0.0 -- baseline (3 categories on, injection-heading reused
|
|
123
|
+
#: quarantine_ext.SUSPICIOUS_TOKENS).
|
|
124
|
+
#: 2.1.0 -- injection-heading detector tuned for precision (#949): tighter
|
|
125
|
+
#: structural-signal policy (instruction-override / role-hijack / body
|
|
126
|
+
#: shell-vector) reduces the false-positive rate from ~85% to <20% on
|
|
127
|
+
#: organic deftai/directive issue bodies. No schema break; existing
|
|
128
|
+
#: meta.json + audit-log records remain valid.
|
|
129
|
+
SCANNER_VERSION: str = "2.1.0"  # add a changelog line to the comment above whenever this is bumped
|
|
130
|
+
|
|
131
|
+
#: Categories baselined in scanner v2. Frozen tuple so the ordering
|
|
132
|
+
#: matches the meta.json ScanFlag.category enum in
|
|
133
|
+
#: vbrief/schemas/cache-meta.schema.json.
|
|
134
|
+
CATEGORIES: tuple[str, ...] = (
    "injection-heading",  # severity: fence-and-pass
    "credentials",  # severity: hard-fail
    "invisible-unicode",  # severity: strip-and-pass
)
|
|
139
|
+
|
|
140
|
+
#: Severity per category. Per-category severity is a documented epic
|
|
141
|
+
#: departure from the design doc's uniform hard-fail; rationale lives in
|
|
142
|
+
#: vbrief/active/.../883-deft-cache-quarantine-v1.vbrief.json under
|
|
143
|
+
#: metadata.x-tracking.design_doc_departures.
|
|
144
|
+
SEVERITY_BY_CATEGORY: dict[str, str] = {
    # One entry per name in ``CATEGORIES``; only "hard-fail" is documented
    # (module docstring) as turning ``ScanResult.passed`` to False.
    "injection-heading": "fence-and-pass",  # section is fenced, content still passes
    "credentials": "hard-fail",  # content.md is not written
    "invisible-unicode": "strip-and-pass",  # codepoints removed, content still passes
}
|
|
149
|
+
|
|
150
|
+
# ---------------------------------------------------------------------------
|
|
151
|
+
# Credentials patterns
|
|
152
|
+
# ---------------------------------------------------------------------------
|
|
153
|
+
|
|
154
|
+
#: Curated regex set for the credentials category. Each entry pairs a
|
|
155
|
+
#: short label (carried into ScanFlag.detail) with a compiled regex. The
|
|
156
|
+
#: label is what the audit log persists -- the matched secret itself is
|
|
157
|
+
#: NEVER persisted (per cache-meta.schema.json's ScanFlag.detail
|
|
158
|
+
#: redaction rule). Patterns are anchored loose-but-specific: tight
|
|
159
|
+
#: enough to avoid false positives in benign prose, loose enough to
|
|
160
|
+
#: catch real-world variations.
|
|
161
|
+
#:
|
|
162
|
+
#: Layout: list of (label, compiled-regex) tuples. Order is the order
|
|
163
|
+
#: emitted into flags; not security-critical, but kept consistent so
|
|
164
|
+
#: tests can pin offsets without flake.
|
|
165
|
+
_CREDENTIAL_PATTERNS: list[tuple[str, re.Pattern[str]]] = [
    # GitHub personal-access tokens. The five prefixes (``ghp_``, ``gho_``,
    # ``ghu_``, ``ghs_``, ``ghr_``) cover personal / oauth / user-to-server
    # / server-to-server / refresh tokens respectively. The 30+ trailing
    # alphanumeric run is the documented gh format.
    ("github-pat", re.compile(r"\bgh[pousr]_[A-Za-z0-9]{30,}\b")),
    # Anthropic API key (sk-ant-...). Listed BEFORE the generic ``sk-``
    # OpenAI pattern so the more-specific match wins (re.search is
    # iteration-order independent but the per-flag label depends on
    # which pattern fired first; sk-ant should win for clarity).
    ("anthropic-api-key", re.compile(r"\bsk-ant-[A-Za-z0-9_-]{20,}\b")),
    # OpenAI API key (sk-...). The 20+ trailing run keeps the pattern
    # specific enough to skip false positives like ``sk-discovery`` or
    # ``sk-rules`` that show up in non-token prose.
    # NOTE(review): the character class excludes ``-`` and ``_``, so a key
    # whose body has a separator within the first 20 characters after
    # ``sk-`` (e.g. an ``sk-proj-...`` shape) is not matched by this
    # entry -- confirm whether that gap is intended.
    ("openai-api-key", re.compile(r"\bsk-[A-Za-z0-9]{20,}\b")),
    # Slack tokens. ``xoxb-`` (bot) and ``xoxp-`` (user) are the two
    # commonly-leaked variants; ``xoxa-`` / ``xoxs-`` are session-scoped
    # and out of v1 scope.
    ("slack-token", re.compile(r"\bxox[bp]-[A-Za-z0-9-]{20,}\b")),
    # AWS access-key-id. The ``AKIA`` prefix + exactly-16 A-Z0-9 run is
    # the canonical AWS IAM access-key shape; ``ASIA`` (session keys)
    # is intentionally NOT covered in v1 because session keys are
    # short-lived and the false-positive rate against codenames is high.
    ("aws-access-key", re.compile(r"\bAKIA[0-9A-Z]{16}\b")),
    # PEM private key BEGIN header. Matches RSA / DSA / EC / generic
    # ``PRIVATE KEY`` variants (``OPENSSH PRIVATE KEY`` is the modern
    # ssh-keygen default).
    (
        "pem-private-key",
        re.compile(
            r"-----BEGIN (?:RSA |DSA |EC |OPENSSH |ENCRYPTED )?PRIVATE KEY-----"
        ),
    ),
    # Bearer authorization header. The 20+ run guards against the
    # word "Bearer" used in benign prose (e.g. "the Bearer of bad news").
    (
        "bearer-token",
        re.compile(r"\bBearer\s+[A-Za-z0-9_.~+/=-]{20,}\b"),
    ),
    # JWT shape: three base64url segments separated by dots. The
    # ``eyJ`` prefix is the base64url encoding of the JSON ``{"`` header
    # opener -- effectively unique to JWTs.
    (
        "jwt",
        re.compile(r"\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b"),
    ),
]
|
|
212
|
+
|
|
213
|
+
# ---------------------------------------------------------------------------
|
|
214
|
+
# Invisible-unicode codepoints
|
|
215
|
+
# ---------------------------------------------------------------------------
|
|
216
|
+
|
|
217
|
+
#: Codepoint set for the invisible-unicode category. Each codepoint that
|
|
218
|
+
#: appears here is stripped from the content and counted against the
|
|
219
|
+
#: invisible-unicode flag. The set covers:
|
|
220
|
+
#:
|
|
221
|
+
#: - U+200B..U+200F -- zero-width space, zero-width non-joiner, joiner,
|
|
222
|
+
#: left-to-right mark, right-to-left mark.
|
|
223
|
+
#: - U+202A..U+202E -- LRE, RLE, PDF, LRO, RLO (bidi overrides; the
|
|
224
|
+
#: well-known "trojan source" attack vector).
|
|
225
|
+
#: - U+2060 -- word joiner (zero-width non-breaking).
|
|
226
|
+
#: - U+2066..U+2069 -- LRI, RLI, FSI, PDI (isolates; #2024-bidi-attack
|
|
227
|
+
#: vector).
|
|
228
|
+
#: - U+FEFF -- byte-order mark / zero-width no-break space.
|
|
229
|
+
#: - U+E0000..U+E007F -- tag characters / language-tag block (Unicode
|
|
230
|
+
#: "tag" plane; abused for invisible exfiltration).
|
|
231
|
+
_INVISIBLE_RANGES: tuple[tuple[int, int], ...] = (
    # Each pair is an INCLUSIVE (low, high) codepoint range; single
    # codepoints are written as a degenerate range with low == high.
    (0x200B, 0x200F),  # zero-width space / joiners, LRM, RLM
    (0x202A, 0x202E),  # bidi embeddings and overrides
    (0x2060, 0x2060),  # word joiner
    (0x2066, 0x2069),  # bidi isolates
    (0xFEFF, 0xFEFF),  # byte-order mark / zero-width no-break space
    (0xE0000, 0xE007F),  # tag characters
)
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _is_invisible(ch: str) -> bool:
    """Report whether the single character ``ch`` belongs to the strip set.

    The character's codepoint is compared against every inclusive
    ``(low, high)`` pair in ``_INVISIBLE_RANGES``; the first range that
    contains it settles the answer.
    """
    codepoint = ord(ch)
    for low, high in _INVISIBLE_RANGES:
        if low <= codepoint <= high:
            return True
    return False
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
# ---------------------------------------------------------------------------
|
|
248
|
+
# Injection-heading detection (#949 tuning)
|
|
249
|
+
# ---------------------------------------------------------------------------
|
|
250
|
+
|
|
251
|
+
#: Structural injection signal detected within a heading TEXT. A heading
|
|
252
|
+
#: that triggers any of these patterns is treated as a real injection
|
|
253
|
+
#: vector. The patterns are deliberately narrow so generic prose ("the
|
|
254
|
+
#: user clicks ...", "system requires ...") does NOT fire.
|
|
255
|
+
#:
|
|
256
|
+
#: 1. Override / disregard phrases -- the canonical prompt-injection
|
|
257
|
+
#: opener ("ignore previous instructions", "disregard the above",
|
|
258
|
+
#: "forget prior"). Word-boundary on the verb; the operand requires
|
|
259
|
+
#: one of {previous, prior, above, earlier, all, your, preceding, original, system}.
|
|
260
|
+
#: 2. Role-hijack prefix at the START of the heading text -- ``SYSTEM:``,
|
|
261
|
+
#: ``ASSISTANT:``, ``USER:``, ``AGENT:``, ``TOOL:``, ``FUNCTION:``,
|
|
262
|
+
#: ``OVERRIDE:``, ``DIRECTIVE:``, ``ROLE:``, ``PROMPT:``,
|
|
263
|
+
#: ``INSTRUCTION:``, ``INSTRUCTIONS:``. The colon-anchored shape is
|
|
264
|
+
#: the distinctive injection vector; ``System Requirements`` / ``User
|
|
265
|
+
#: Story`` / etc. do NOT match because they lack the colon.
|
|
266
|
+
_INJECTION_OVERRIDE_RE: re.Pattern[str] = re.compile(
    # Verb (ignore / disregard / forget / override / bypass) on a word
    # boundary, whitespace, then an optional determiner (the / all / any).
    r"\b(?:ignore|disregard|forget|override|bypass)\s+(?:the\s+|all\s+|any\s+)?"
    # Operand the verb must target. ``all`` appears both as a determiner
    # and as an operand, so "ignore all" on its own already matches.
    r"(?:previous|prior|above|earlier|all|your|preceding|original|system)\b",
    # Case-insensitive: "IGNORE PREVIOUS" and "ignore previous" both match.
    re.IGNORECASE,
)
|
|
271
|
+
|
|
272
|
+
# Bare role / directive words. They are only treated as a signal when
# ``_HEADING_ROLE_PREFIX_RE`` (built from this tuple) finds one at the very
# start of the heading text, followed by optional whitespace and a colon.
_HEADING_ROLE_PREFIXES: tuple[str, ...] = (
    "SYSTEM",
    "ASSISTANT",
    "USER",
    "AGENT",
    "TOOL",
    "FUNCTION",
    "OVERRIDE",
    "DIRECTIVE",
    "ROLE",
    "PROMPT",
    "INSTRUCTION",
    "INSTRUCTIONS",
)
|
|
286
|
+
|
|
287
|
+
_HEADING_ROLE_PREFIX_RE: re.Pattern[str] = re.compile(
    # ``^`` + alternation of the escaped prefixes + ``\s*:``. The prefix must
    # open the text and be followed (after optional whitespace) by a colon.
    # ``INSTRUCTION`` precedes ``INSTRUCTIONS`` in the alternation; the
    # plural still matches because the engine backtracks to the longer
    # alternative when the colon is not found after the singular.
    r"^(?:" + "|".join(re.escape(p) for p in _HEADING_ROLE_PREFIXES) + r")\s*:",
    # Case-insensitive: ``system:`` matches the same as ``SYSTEM:``.
    re.IGNORECASE,
)
|
|
291
|
+
|
|
292
|
+
#: Body-context shell-vector regex. When a heading's body (the lines
|
|
293
|
+
#: between the heading and the next heading) contains any of these
|
|
294
|
+
#: patterns, the heading is treated as a pre-roll for a shell-injection
|
|
295
|
+
#: vector and flagged. This is the ONLY body-context signal the v2.1.0
|
|
296
|
+
#: injection-heading detector consumes; the dedicated shell-cmd-injection
|
|
297
|
+
#: scanner category that would do this body-wide is intentionally
|
|
298
|
+
#: deferred (#949 follow-up).
|
|
299
|
+
_BODY_VECTOR_RE: re.Pattern[str] = re.compile(
    # Shell set kept consistent across all three sub-patterns (pipe-to-
    # shell, ``sh -c``, ``/bin/sh -c``) so a vector like ``ksh -c '...'``
    # or ``/bin/ksh -c '...'`` is not silently passed through; ksh was
    # previously only listed in the pipe-to-shell alternative which left
    # a blind spot the other two branches did not cover. Refs PR #957
    # Greptile review on commit 5acfa8a.
    #
    # Alternative 1: a downloader piped into a shell on the same line.
    # ``[^|\n]*`` stops at the first pipe, so the shell must be the first
    # pipeline stage after the downloader.
    r"(?:curl|wget|fetch)\s+[^|\n]*\|\s*(?:sh|bash|zsh|ksh)\b"
    # Alternative 2: base64 invoked with a decode flag.
    r"|\bbase64\s+(?:-d|--decode|-D)\b"
    # Alternative 3: ``eval`` followed by ``(``, ``$``, a quote or a backtick.
    r"|\beval\s*[\(\$\"'`]"
    # Alternative 4: ``<shell> -c`` followed by a quoted command string.
    r"|\b(?:sh|bash|zsh|ksh)\s+-c\s+[\"']"
    # Alternative 5: absolute-path form of alternative 4.
    # NOTE(review): ``\b`` directly before ``/`` only holds when the previous
    # character is a word character, so this branch cannot match
    # ``/bin/sh -c`` at line start or after whitespace. Those inputs are
    # still caught because alternative 4 matches the ``sh -c "`` tail, which
    # makes this branch redundant rather than a detection gap.
    r"|\b/bin/(?:sh|bash|zsh|ksh)\s+-c\s+[\"']",
    re.IGNORECASE,
)
|
|
313
|
+
|
|
314
|
+
#: Heading regex (mirrors quarantine_ext._HEADING_RE). 1-6 hashes plus
|
|
315
|
+
#: at least one space; setext-style ``===`` / ``---`` is intentionally
|
|
316
|
+
#: out of scope (vanishingly rare in GitHub issue bodies, multi-line
|
|
317
|
+
#: lookahead would complicate the iteration).
|
|
318
|
+
_HEADING_RE: re.Pattern[str] = re.compile(r"^(#{1,6})\s+(.*\S.*)$")  # group 1: hashes; group 2: heading text (needs a non-space char)
|
|
319
|
+
|
|
320
|
+
#: Code-fence delimiter regex.
|
|
321
|
+
_FENCE_RE: re.Pattern[str] = re.compile(r"^(```|~~~)")  # column-0 only; group 1 is always exactly three delimiter chars
|
|
322
|
+
|
|
323
|
+
#: Quarantine fence labels (mirror quarantine_ext for downstream-grep
|
|
324
|
+
#: compatibility -- the literal ``quarantined`` label is the contract).
|
|
325
|
+
_QUARANTINE_FENCE_OPEN: str = "```quarantined"  # opening line: fence plus the ``quarantined`` info string
_QUARANTINE_FENCE_CLOSE: str = "```"  # closing line: bare fence, no info string
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def _heading_text(line: str) -> str | None:
    """Extract the text of an ATX heading line.

    Returns the part of ``line`` that follows the leading ``#`` run,
    with surrounding whitespace trimmed, or ``None`` when ``line`` does
    not match ``_HEADING_RE``.
    """
    parsed = _HEADING_RE.match(line)
    return parsed.group(2).strip() if parsed is not None else None
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def _heading_signal(text: str) -> bool:
    """Decide whether heading ``text`` carries a structural injection signal.

    Two signals count: an override phrase anywhere in the text
    (``_INJECTION_OVERRIDE_RE``), or a role-hijack prefix at the start of
    the text (``_HEADING_ROLE_PREFIX_RE``). The prefix check is only
    evaluated when the override phrase is absent.
    """
    has_override_phrase = _INJECTION_OVERRIDE_RE.search(text) is not None
    return has_override_phrase or _HEADING_ROLE_PREFIX_RE.match(text) is not None
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def _body_has_shell_vector(body_lines: list[str]) -> bool:
    """Tell whether any line outside a code fence matches ``_BODY_VECTOR_RE``.

    Lines inside a fenced block are ignored so that a document which
    merely illustrates a shell command in a fenced example is not
    flagged. Fence tracking works as follows:

    - A line matching ``_FENCE_RE`` while no fence is open opens one and
      records the three-character delimiter.
    - While a fence is open, only a line that equals that delimiter after
      right-trimming closes it; a fence line carrying an info string
      (e.g. a language tag) leaves the fence open.
    - Fence lines themselves are never tested against the shell-vector
      pattern.

    NOTE(review): because the recorded delimiter is always three
    characters, a fence opened with four or more backticks/tildes is not
    closed by a matching four-character line, and every later line in
    ``body_lines`` is then treated as fenced. Confirm whether the caller's
    own fence tracking behaves the same way before changing this.
    """
    open_delim: str | None = None
    for raw_line in body_lines:
        marker = _FENCE_RE.match(raw_line)
        if marker is None:
            # Ordinary line: only inspected when we are outside any fence.
            if open_delim is None and _BODY_VECTOR_RE.search(raw_line):
                return True
            continue
        if open_delim is None:
            open_delim = marker.group(1)
        elif raw_line.rstrip() == open_delim:
            open_delim = None
    return False
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
# ---------------------------------------------------------------------------
|
|
375
|
+
# Result dataclasses
|
|
376
|
+
# ---------------------------------------------------------------------------
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
@dataclass
class ScanFlag:
    """One scanner finding. Mirrors vbrief/schemas/cache-meta.schema.json $defs/ScanFlag."""

    # Scanner category that produced the finding (see ``CATEGORIES``).
    category: str
    # Severity string for that category (see ``SEVERITY_BY_CATEGORY``).
    severity: str
    # Human-readable descriptor. For credentials this is the pattern label,
    # never the matched secret.
    detail: str
    # Number of matches behind this flag. ``ScanResult.to_meta_dict`` omits
    # the key when the value is 0.
    match_count: int = 0
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
@dataclass
class ScanResult:
    """Outcome of one scanner pass.

    Carries the overall verdict (``passed``), the scanner version that
    produced it, the list of findings, the transformed text and a
    timestamp string. Only ``passed`` and ``scanner_version`` are
    required; the remaining fields default to empty values.
    """

    passed: bool
    scanner_version: str
    flags: list[ScanFlag] = field(default_factory=list)
    transformed_content: str = ""
    scanned_at: str = ""

    def to_meta_dict(self) -> dict[str, object]:
        """Build the ``scan_result`` portion of meta.json.

        The returned mapping holds ``passed``, ``scanned_at``,
        ``scanner_version`` and ``flags``; ``transformed_content`` is
        deliberately left out. Each flag is rendered as a plain dict and
        its ``match_count`` key is dropped when the count is zero. The
        surrounding envelope (source, key, fetched_at, ...) is the cache
        layer's responsibility, not the scanner's.
        """
        rendered_flags: list[dict[str, object]] = []
        for entry in self.flags:
            record = asdict(entry)
            # A zero count carries no information, so the key is omitted.
            if not record.get("match_count"):
                record.pop("match_count", None)
            rendered_flags.append(record)
        return {
            "passed": self.passed,
            "scanned_at": self.scanned_at,
            "scanner_version": self.scanner_version,
            "flags": rendered_flags,
        }
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
# ---------------------------------------------------------------------------
|
|
418
|
+
# Strip-then-flag helpers
|
|
419
|
+
# ---------------------------------------------------------------------------
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def _strip_invisible(text: str) -> tuple[str, list[str]]:
|
|
423
|
+
"""Strip invisible-unicode codepoints; return ``(stripped_text, removed_labels)``.
|
|
424
|
+
|
|
425
|
+
``removed_labels`` is a list of unique ``U+XXXX`` labels for the
|
|
426
|
+
codepoints that were removed; the order matches first-occurrence
|
|
427
|
+
in the input. The list is what the ScanFlag.detail field summarises.
|
|
428
|
+
"""
|
|
429
|
+
if not text:
|
|
430
|
+
return text, []
|
|
431
|
+
out_chars: list[str] = []
|
|
432
|
+
seen: dict[int, str] = {}
|
|
433
|
+
for ch in text:
|
|
434
|
+
if _is_invisible(ch):
|
|
435
|
+
cp = ord(ch)
|
|
436
|
+
if cp not in seen:
|
|
437
|
+
seen[cp] = f"U+{cp:04X}"
|
|
438
|
+
continue
|
|
439
|
+
out_chars.append(ch)
|
|
440
|
+
return "".join(out_chars), list(seen.values())
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def _detect_credentials(text: str) -> list[ScanFlag]:
|
|
444
|
+
"""Return one :class:`ScanFlag` per pattern that matched in ``text``.
|
|
445
|
+
|
|
446
|
+
The detail string carries the pattern label (e.g. ``"github-pat"``)
|
|
447
|
+
NOT the matched bytes -- the secret itself is never persisted into
|
|
448
|
+
the audit log. ``match_count`` records how many distinct matches
|
|
449
|
+
fired for that pattern.
|
|
450
|
+
"""
|
|
451
|
+
flags: list[ScanFlag] = []
|
|
452
|
+
if not text:
|
|
453
|
+
return flags
|
|
454
|
+
for label, pattern in _CREDENTIAL_PATTERNS:
|
|
455
|
+
matches = pattern.findall(text)
|
|
456
|
+
if not matches:
|
|
457
|
+
continue
|
|
458
|
+
flags.append(
|
|
459
|
+
ScanFlag(
|
|
460
|
+
category="credentials",
|
|
461
|
+
severity="hard-fail",
|
|
462
|
+
detail=f"matched credentials pattern: {label}",
|
|
463
|
+
match_count=len(matches),
|
|
464
|
+
)
|
|
465
|
+
)
|
|
466
|
+
return flags
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def _detect_injection_heading(text: str) -> tuple[str, ScanFlag | None]:
|
|
470
|
+
"""Wrap suspicious sections in ``quarantined`` fences using v2.1.0 policy.
|
|
471
|
+
|
|
472
|
+
Detection rule (#949 tuning):
|
|
473
|
+
|
|
474
|
+
1. **Heading structural signal.** When the heading text contains an
|
|
475
|
+
instruction-override phrase (``IGNORE PREVIOUS`` /
|
|
476
|
+
``DISREGARD ABOVE`` / ``OVERRIDE ALL``) OR the heading text starts
|
|
477
|
+
with a role-hijack prefix (``SYSTEM:`` / ``ASSISTANT:`` /
|
|
478
|
+
``USER:`` / ``AGENT:`` / ``OVERRIDE:`` / ``DIRECTIVE:`` /
|
|
479
|
+
``ROLE:`` / ``INSTRUCTION(S):`` / ``PROMPT:`` / ``TOOL:`` /
|
|
480
|
+
``FUNCTION:``) we wrap the heading + section. The signal check is
|
|
481
|
+
evaluated on the full heading text with no allowlist short-circuit
|
|
482
|
+
so a benign-template heading whose tail smuggles an injection
|
|
483
|
+
phrase (e.g. ``## STEP 1 - Ignore previous instructions``) still
|
|
484
|
+
flags.
|
|
485
|
+
2. **Body shell vector.** When the heading's body contains a
|
|
486
|
+
shell-injection vector (``curl ... | sh`` / ``base64 -d`` /
|
|
487
|
+
``eval`` -- including ``eval `cmd``` `` backtick form -- /
|
|
488
|
+
``sh -c``) we wrap the heading + section even when the heading
|
|
489
|
+
text itself is benign-looking.
|
|
490
|
+
3. **Inline (non-heading) injection.** Any line outside a heading
|
|
491
|
+
that carries an instruction-override phrase or a body shell
|
|
492
|
+
vector is wrapped on its own.
|
|
493
|
+
4. **Idempotency.** Lines inside an existing fenced code block (any
|
|
494
|
+
```` ``` ```` / ``~~~`` opener) are passed through verbatim, so a
|
|
495
|
+
previously-wrapped ``quarantined`` block is a no-op on re-scan.
|
|
496
|
+
|
|
497
|
+
Returns ``(transformed_content, flag)``. ``flag.match_count`` is the
|
|
498
|
+
number of distinct sections wrapped (NOT individual tokens) -- the
|
|
499
|
+
audit log's primary signal under the new policy is "how many
|
|
500
|
+
injection-shaped sections did we observe", which is the value
|
|
501
|
+
operators reach for when triaging detector noise.
|
|
502
|
+
"""
|
|
503
|
+
if not text:
|
|
504
|
+
return text, None
|
|
505
|
+
|
|
506
|
+
lines = text.splitlines()
|
|
507
|
+
out: list[str] = []
|
|
508
|
+
in_fence: str | None = None
|
|
509
|
+
sections_wrapped = 0
|
|
510
|
+
i = 0
|
|
511
|
+
|
|
512
|
+
while i < len(lines):
|
|
513
|
+
line = lines[i]
|
|
514
|
+
|
|
515
|
+
# Existing fenced code blocks pass through verbatim (idempotent
|
|
516
|
+
# on re-scan; the v1 ``quarantine_body`` semantic is preserved).
|
|
517
|
+
# Closer detection requires the line to be ONLY the fence delim
|
|
518
|
+
# (after right-trim) -- per CommonMark a closing fence carries no
|
|
519
|
+
# info string, so ``` ```python ``` ``` is an OPENER for a nested
|
|
520
|
+
# block, not a closer for the outer one. The naive
|
|
521
|
+
# ``line.startswith(in_fence)`` check would otherwise drop the
|
|
522
|
+
# outer fence prematurely on the nested opener and re-process the
|
|
523
|
+
# nested block's content as live, breaking the idempotency
|
|
524
|
+
# guarantee on previously-quarantined bodies that happen to
|
|
525
|
+
# contain an embedded code example.
|
|
526
|
+
fence_match = _FENCE_RE.match(line)
|
|
527
|
+
if fence_match:
|
|
528
|
+
delim = fence_match.group(1)
|
|
529
|
+
if in_fence is None:
|
|
530
|
+
in_fence = delim
|
|
531
|
+
elif line.rstrip() == in_fence:
|
|
532
|
+
in_fence = None
|
|
533
|
+
out.append(line)
|
|
534
|
+
i += 1
|
|
535
|
+
continue
|
|
536
|
+
if in_fence is not None:
|
|
537
|
+
out.append(line)
|
|
538
|
+
i += 1
|
|
539
|
+
continue
|
|
540
|
+
|
|
541
|
+
heading_text = _heading_text(line)
|
|
542
|
+
if heading_text is not None:
|
|
543
|
+
# Determine the section span (this heading down to but not
|
|
544
|
+
# including the next heading; nested fences are consumed
|
|
545
|
+
# whole so an unbalanced opener never splits a section).
|
|
546
|
+
section_end = i + 1
|
|
547
|
+
while section_end < len(lines):
|
|
548
|
+
nxt = lines[section_end]
|
|
549
|
+
nested_fence = _FENCE_RE.match(nxt)
|
|
550
|
+
if nested_fence:
|
|
551
|
+
section_end += 1
|
|
552
|
+
nested = nxt[:3]
|
|
553
|
+
# Same closer-vs-nested-opener disambiguation as the
|
|
554
|
+
# outer fence loop above: a closing fence MUST be
|
|
555
|
+
# only the delim chars (no info string).
|
|
556
|
+
while (
|
|
557
|
+
section_end < len(lines)
|
|
558
|
+
and lines[section_end].rstrip() != nested
|
|
559
|
+
):
|
|
560
|
+
section_end += 1
|
|
561
|
+
section_end += 1 # consume the closer
|
|
562
|
+
continue
|
|
563
|
+
if _HEADING_RE.match(nxt):
|
|
564
|
+
break
|
|
565
|
+
section_end += 1
|
|
566
|
+
|
|
567
|
+
body_lines = lines[i + 1 : section_end]
|
|
568
|
+
|
|
569
|
+
# Structural-signal check is the sole gate on the heading
|
|
570
|
+
# text (no allowlist short-circuit -- a benign-template
|
|
571
|
+
# heading whose tail smuggles an injection phrase like
|
|
572
|
+
# ``## STEP 1 - Ignore previous instructions`` would
|
|
573
|
+
# otherwise pass through unwrapped). The body shell-vector
|
|
574
|
+
# check fires independently so a clean heading that smuggles
|
|
575
|
+
# ``curl ... | sh`` in the body is still flagged.
|
|
576
|
+
heading_signal = _heading_signal(heading_text)
|
|
577
|
+
body_signal = _body_has_shell_vector(body_lines)
|
|
578
|
+
|
|
579
|
+
if heading_signal or body_signal:
|
|
580
|
+
out.append(_QUARANTINE_FENCE_OPEN)
|
|
581
|
+
out.extend(lines[i:section_end])
|
|
582
|
+
out.append(_QUARANTINE_FENCE_CLOSE)
|
|
583
|
+
sections_wrapped += 1
|
|
584
|
+
i = section_end
|
|
585
|
+
continue
|
|
586
|
+
|
|
587
|
+
# Heading itself is fine; pass it through and let the
|
|
588
|
+
# per-line scan below handle inline body content.
|
|
589
|
+
out.append(line)
|
|
590
|
+
i += 1
|
|
591
|
+
continue
|
|
592
|
+
|
|
593
|
+
# Non-heading line: wrap if it carries an inline injection
|
|
594
|
+
# phrase or a shell vector. We deliberately do NOT wrap on a
|
|
595
|
+
# bare ``IMPORTANT:`` / ``STEP`` / etc. token in prose -- those
|
|
596
|
+
# are noise tokens under the v2.1.0 policy.
|
|
597
|
+
if _INJECTION_OVERRIDE_RE.search(line) or _BODY_VECTOR_RE.search(
|
|
598
|
+
line
|
|
599
|
+
):
|
|
600
|
+
out.append(_QUARANTINE_FENCE_OPEN)
|
|
601
|
+
out.append(line)
|
|
602
|
+
out.append(_QUARANTINE_FENCE_CLOSE)
|
|
603
|
+
sections_wrapped += 1
|
|
604
|
+
i += 1
|
|
605
|
+
continue
|
|
606
|
+
|
|
607
|
+
out.append(line)
|
|
608
|
+
i += 1
|
|
609
|
+
|
|
610
|
+
suffix = "\n" if text.endswith("\n") else ""
|
|
611
|
+
wrapped_text = "\n".join(out) + suffix
|
|
612
|
+
|
|
613
|
+
if sections_wrapped == 0:
|
|
614
|
+
return wrapped_text, None
|
|
615
|
+
return wrapped_text, ScanFlag(
|
|
616
|
+
category="injection-heading",
|
|
617
|
+
severity="fence-and-pass",
|
|
618
|
+
detail=(
|
|
619
|
+
f"wrapped {sections_wrapped} injection-shaped section(s) in"
|
|
620
|
+
" `quarantined` fence (v2.1.0 strict-signal policy)"
|
|
621
|
+
),
|
|
622
|
+
match_count=sections_wrapped,
|
|
623
|
+
)
|
|
624
|
+
|
|
625
|
+
|
|
626
|
+
# ---------------------------------------------------------------------------
|
|
627
|
+
# Public scan API
|
|
628
|
+
# ---------------------------------------------------------------------------
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
def scan(content_md: str, *, scanned_at: str | None = None) -> ScanResult:
    """Run scanner v2 over ``content_md`` and return a :class:`ScanResult`.

    Args:
        content_md: Untrusted markdown body (e.g. an issue body fetched
            via ``scm:issue:view --json body``).
        scanned_at: Optional override for the scanned_at timestamp. When
            ``None`` the current UTC time is used. Tests pass an explicit
            value for deterministic snapshots.

    Returns:
        A :class:`ScanResult` carrying:

        - ``passed``: ``False`` iff any flag with severity ``hard-fail``
          fired (in this module only the credentials detector emits that
          severity).
        - ``flags``: per-category findings in the order
          (invisible-unicode, credentials, injection-heading).
        - ``transformed_content``: the strip-then-fence output. Callers
          treat this as the canonical content.md when ``passed`` is True;
          when ``passed`` is False, the cache layer skips the
          content.md write entirely.
        - ``scanner_version`` / ``scanned_at``: timestamp + version
          stamps for the meta.json scan_result envelope.
    """
    timestamp = scanned_at if scanned_at is not None else _utc_now_iso()
    flags: list[ScanFlag] = []

    # 1. Strip invisibles first so subsequent regexes see the visible-only
    #    surface (a U+200B-smuggled credential token would otherwise dodge
    #    the credentials regex).
    stripped, removed_labels = _strip_invisible(content_md)
    if removed_labels:
        # match_count is the COUNT of stripped codepoints, not the
        # cardinality of distinct labels -- a body with 17 U+200B chars
        # surfaces 17, not 1. Stripping only ever removes whole
        # characters, so the length difference is exactly that count and
        # no second classification pass over the input is needed.
        total_stripped = len(content_md) - len(stripped)
        flags.append(
            ScanFlag(
                category="invisible-unicode",
                severity="strip-and-pass",
                detail=(
                    f"stripped {total_stripped} invisible-unicode codepoint(s): "
                    + ", ".join(removed_labels)
                ),
                match_count=total_stripped,
            )
        )

    # 2. Credentials regex on the stripped text. We do NOT short-circuit
    #    on first match -- meta.json audit value comes from the full flag
    #    list, so we run every pattern.
    flags.extend(_detect_credentials(stripped))

    # 3. Injection-heading wrap on the stripped text. Idempotent on
    #    already-wrapped content (quarantine_body's #583 contract).
    wrapped, inj_flag = _detect_injection_heading(stripped)
    if inj_flag is not None:
        flags.append(inj_flag)

    passed = not any(f.severity == "hard-fail" for f in flags)
    return ScanResult(
        passed=passed,
        scanner_version=SCANNER_VERSION,
        flags=flags,
        transformed_content=wrapped,
        scanned_at=timestamp,
    )
|
|
698
|
+
|
|
699
|
+
|
|
700
|
+
def _utc_now_iso() -> str:
|
|
701
|
+
"""Return current UTC time as an RFC-3339 / ISO-8601 string with ``Z`` suffix."""
|
|
702
|
+
# The cache-meta.schema.json dateTime guard requires a ``Z`` or
|
|
703
|
+
# +HH:MM suffix; ``datetime.isoformat()`` emits ``+00:00`` which
|
|
704
|
+
# doesn't match the schema's regex. We replace the suffix
|
|
705
|
+
# manually so the scan output validates without an extra normalisation
|
|
706
|
+
# pass at the caller.
|
|
707
|
+
return datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
708
|
+
|
|
709
|
+
|
|
710
|
+
# ---------------------------------------------------------------------------
|
|
711
|
+
# CLI
|
|
712
|
+
# ---------------------------------------------------------------------------
|
|
713
|
+
|
|
714
|
+
|
|
715
|
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point. Reads input file (or stdin), emits JSON ScanResult.

    Args:
        argv: Argument vector without the program name; defaults to
            ``sys.argv[1:]``. The first argument, when present, is the
            path of a UTF-8 file to scan; with no arguments the content
            is read from stdin. ``-h`` / ``--help`` as first argument
            prints the module docstring.

    Returns:
        ``0`` when the scan passed (or help was printed); ``2`` when at
        least one hard-fail flag fired. Mirrors the cache:put exit-code
        contract so a caller piping content through ``cache_scanner.py``
        gets an actionable signal without parsing the JSON.

    Raises:
        OSError: The input file cannot be read (propagated unchanged).
        UnicodeDecodeError: The input file is not valid UTF-8.
    """
    args = list(argv if argv is not None else sys.argv[1:])
    if args and args[0] in {"-h", "--help"}:
        sys.stdout.write(__doc__ or "")
        return 0

    if args:
        text = Path(args[0]).read_text(encoding="utf-8")
    else:
        text = sys.stdin.read()

    result = scan(text)
    # Unlike ``ScanResult.to_meta_dict`` the CLI payload keeps every flag
    # field (including a zero ``match_count``) and adds the transformed
    # content, so the output is self-contained.
    payload = {
        "passed": result.passed,
        "scanner_version": result.scanner_version,
        "scanned_at": result.scanned_at,
        "flags": [asdict(f) for f in result.flags],
        "transformed_content": result.transformed_content,
    }
    sys.stdout.write(json.dumps(payload, indent=2, ensure_ascii=False) + "\n")
    return 0 if result.passed else 2
|
|
742
|
+
|
|
743
|
+
|
|
744
|
+
# Script entry: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())
|