@deftai/directive-content 0.55.2 → 0.56.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (217) hide show
  1. package/.githooks/pre-commit +143 -0
  2. package/.githooks/pre-push +121 -0
  3. package/QUICK-START.md +2 -2
  4. package/Taskfile.yml +934 -0
  5. package/UPGRADING.md +47 -1
  6. package/events/README.md +3 -3
  7. package/package.json +5 -4
  8. package/scripts/_agents_md.py +494 -0
  9. package/scripts/_cache_fetch.py +635 -0
  10. package/scripts/_cache_quota.py +529 -0
  11. package/scripts/_cache_refresh.py +163 -0
  12. package/scripts/_cache_validate.py +209 -0
  13. package/scripts/_content_root.py +42 -0
  14. package/scripts/_doctor_state.py +277 -0
  15. package/scripts/_event_detect.py +305 -0
  16. package/scripts/_events.py +514 -0
  17. package/scripts/_lifecycle_hygiene.py +568 -0
  18. package/scripts/_pathspec.py +91 -0
  19. package/scripts/_policy_show_cli.py +266 -0
  20. package/scripts/_precutover.py +92 -0
  21. package/scripts/_project_context.py +224 -0
  22. package/scripts/_project_definition_io.py +164 -0
  23. package/scripts/_relocate_snapshot.py +209 -0
  24. package/scripts/_relocate_states.py +343 -0
  25. package/scripts/_resolve_preflight_path.py +152 -0
  26. package/scripts/_safe_subprocess.py +167 -0
  27. package/scripts/_session_start_hook.py +205 -0
  28. package/scripts/_sor_gate_diff.py +365 -0
  29. package/scripts/_stdio_utf8.py +59 -0
  30. package/scripts/_triage_bootstrap_gitignore.py +904 -0
  31. package/scripts/_triage_classify_cli.py +122 -0
  32. package/scripts/_triage_queue_cli.py +625 -0
  33. package/scripts/_triage_scope_cli.py +343 -0
  34. package/scripts/_triage_scope_drift_cli.py +121 -0
  35. package/scripts/_triage_scope_ignores.py +286 -0
  36. package/scripts/_triage_scope_milestone.py +432 -0
  37. package/scripts/_triage_scope_mutations.py +337 -0
  38. package/scripts/_triage_scope_renderers.py +207 -0
  39. package/scripts/_triage_smoketest_stages.py +674 -0
  40. package/scripts/_triage_subscribe_cli.py +140 -0
  41. package/scripts/_triage_welcome_cli.py +421 -0
  42. package/scripts/_vbrief_build.py +239 -0
  43. package/scripts/_vbrief_fidelity.py +479 -0
  44. package/scripts/_vbrief_legacy.py +589 -0
  45. package/scripts/_vbrief_reconciliation.py +883 -0
  46. package/scripts/_vbrief_routing.py +277 -0
  47. package/scripts/_vbrief_safety.py +778 -0
  48. package/scripts/_vbrief_sources.py +312 -0
  49. package/scripts/_vbrief_speckit.py +262 -0
  50. package/scripts/_vbrief_story_quality.py +353 -0
  51. package/scripts/_vbrief_validation.py +299 -0
  52. package/scripts/build_dist.py +412 -0
  53. package/scripts/cache.py +1078 -0
  54. package/scripts/cache_scanner.py +745 -0
  55. package/scripts/candidates_log.py +432 -0
  56. package/scripts/capacity_backfill.py +680 -0
  57. package/scripts/capacity_show.py +653 -0
  58. package/scripts/ci_local.py +689 -0
  59. package/scripts/code_structure_validate.py +765 -0
  60. package/scripts/codebase_default_extractor.py +495 -0
  61. package/scripts/codebase_map.py +304 -0
  62. package/scripts/codebase_map_fresh.py +104 -0
  63. package/scripts/codebase_projection_registry.py +94 -0
  64. package/scripts/codebase_provider.py +582 -0
  65. package/scripts/doctor.py +2257 -0
  66. package/scripts/framework_commands.py +505 -0
  67. package/scripts/gh_rest.py +882 -0
  68. package/scripts/github_auth_modes.py +437 -0
  69. package/scripts/github_body.py +292 -0
  70. package/scripts/ip_risk.py +531 -0
  71. package/scripts/issue_emit.py +670 -0
  72. package/scripts/issue_ingest.py +1064 -0
  73. package/scripts/migrate_preflight.py +418 -0
  74. package/scripts/migrate_vbrief.py +2677 -0
  75. package/scripts/monitor_pr.py +401 -0
  76. package/scripts/pack_migrate_lessons.py +336 -0
  77. package/scripts/pack_migrate_patterns.py +254 -0
  78. package/scripts/pack_migrate_rules.py +350 -0
  79. package/scripts/pack_migrate_skills.py +423 -0
  80. package/scripts/pack_migrate_strategies.py +311 -0
  81. package/scripts/pack_migrate_swarm_spec.py +250 -0
  82. package/scripts/pack_render.py +434 -0
  83. package/scripts/packs_slice.py +712 -0
  84. package/scripts/platform_capabilities.py +336 -0
  85. package/scripts/policy.py +2826 -0
  86. package/scripts/policy_set.py +324 -0
  87. package/scripts/pr_check_closing_keywords.py +524 -0
  88. package/scripts/pr_check_protected_issues.py +267 -0
  89. package/scripts/pr_merge_readiness.py +1004 -0
  90. package/scripts/pr_wait_mergeable.py +669 -0
  91. package/scripts/prd_render.py +159 -0
  92. package/scripts/preflight_architecture_sor.py +974 -0
  93. package/scripts/preflight_branch.py +289 -0
  94. package/scripts/preflight_cache.py +974 -0
  95. package/scripts/preflight_gh.py +721 -0
  96. package/scripts/preflight_implementation.py +272 -0
  97. package/scripts/preflight_story_start.py +838 -0
  98. package/scripts/preflight_wip_cap.py +149 -0
  99. package/scripts/probe_session.py +545 -0
  100. package/scripts/project_render.py +293 -0
  101. package/scripts/quarantine_ext.py +237 -0
  102. package/scripts/reconcile_issues.py +1442 -0
  103. package/scripts/refresh-path.ps1 +107 -0
  104. package/scripts/release.py +2030 -0
  105. package/scripts/release_e2e.py +1011 -0
  106. package/scripts/release_publish.py +486 -0
  107. package/scripts/release_rollback.py +980 -0
  108. package/scripts/relocate.py +1034 -0
  109. package/scripts/resolve_changelog_unreleased.py +667 -0
  110. package/scripts/resolve_version.py +490 -0
  111. package/scripts/resume_conditions.py +706 -0
  112. package/scripts/ritual_sentinel.py +609 -0
  113. package/scripts/roadmap_render.py +635 -0
  114. package/scripts/rule_ownership_lint.py +325 -0
  115. package/scripts/scm.py +591 -0
  116. package/scripts/scope_audit_log.py +387 -0
  117. package/scripts/scope_decompose.py +654 -0
  118. package/scripts/scope_demote.py +509 -0
  119. package/scripts/scope_lifecycle.py +1126 -0
  120. package/scripts/scope_undo.py +772 -0
  121. package/scripts/session_start.py +406 -0
  122. package/scripts/setup_ghx.py +339 -0
  123. package/scripts/setup_windows.ps1 +220 -0
  124. package/scripts/slice_audit.py +585 -0
  125. package/scripts/slice_record.py +530 -0
  126. package/scripts/slice_record_existing.py +692 -0
  127. package/scripts/slug_normalize.py +178 -0
  128. package/scripts/spec_render.py +477 -0
  129. package/scripts/spec_validate.py +238 -0
  130. package/scripts/subagent_monitor.py +658 -0
  131. package/scripts/swarm_complete_cohort.py +644 -0
  132. package/scripts/swarm_launch.py +1206 -0
  133. package/scripts/swarm_readiness.py +554 -0
  134. package/scripts/swarm_verify_review_clean.py +438 -0
  135. package/scripts/swarm_worktrees.py +497 -0
  136. package/scripts/toolchain-check.py +52 -0
  137. package/scripts/triage_actions.py +871 -0
  138. package/scripts/triage_bootstrap.py +1153 -0
  139. package/scripts/triage_bulk.py +630 -0
  140. package/scripts/triage_classify.py +932 -0
  141. package/scripts/triage_help.py +1685 -0
  142. package/scripts/triage_queue.py +1944 -0
  143. package/scripts/triage_reconcile.py +581 -0
  144. package/scripts/triage_refresh.py +643 -0
  145. package/scripts/triage_scope.py +999 -0
  146. package/scripts/triage_scope_drift.py +575 -0
  147. package/scripts/triage_smoketest.py +396 -0
  148. package/scripts/triage_subscribe.py +399 -0
  149. package/scripts/triage_summary.py +1011 -0
  150. package/scripts/triage_welcome.py +1178 -0
  151. package/scripts/ts_check_lane.py +86 -0
  152. package/scripts/validate-links.py +64 -0
  153. package/scripts/validate_strategy_output.py +212 -0
  154. package/scripts/vbrief_activate.py +228 -0
  155. package/scripts/vbrief_migrate_conformance.py +368 -0
  156. package/scripts/vbrief_reconcile_graph.py +306 -0
  157. package/scripts/vbrief_reconcile_labels.py +460 -0
  158. package/scripts/vbrief_reconcile_umbrellas.py +741 -0
  159. package/scripts/vbrief_validate.py +1195 -0
  160. package/scripts/verify-stubs.py +61 -0
  161. package/scripts/verify_capacity.py +160 -0
  162. package/scripts/verify_encoding.py +699 -0
  163. package/scripts/verify_hooks_installed.py +206 -0
  164. package/scripts/verify_investigation.py +360 -0
  165. package/scripts/verify_judgment_gates.py +827 -0
  166. package/scripts/verify_no_task_runtime.py +171 -0
  167. package/scripts/verify_scm_boundary.py +509 -0
  168. package/scripts/verify_session_ritual.py +389 -0
  169. package/scripts/verify_tools.py +426 -0
  170. package/scripts/verify_vbrief_conformance.py +478 -0
  171. package/tasks/architecture.yml +13 -0
  172. package/tasks/cache.yml +69 -0
  173. package/tasks/capacity.yml +38 -0
  174. package/tasks/change.yml +46 -0
  175. package/tasks/changelog.yml +24 -0
  176. package/tasks/ci.yml +49 -0
  177. package/tasks/codebase.yml +47 -0
  178. package/tasks/commit.yml +30 -0
  179. package/tasks/core.yml +126 -0
  180. package/tasks/deployments.yml +54 -0
  181. package/tasks/framework.yml +74 -0
  182. package/tasks/install.yml +60 -0
  183. package/tasks/issue.yml +50 -0
  184. package/tasks/migrate.yml +73 -0
  185. package/tasks/packs.yml +92 -0
  186. package/tasks/policy.yml +75 -0
  187. package/tasks/pr.yml +89 -0
  188. package/tasks/prd.yml +39 -0
  189. package/tasks/project.yml +27 -0
  190. package/tasks/reconcile.yml +32 -0
  191. package/tasks/relocate.yml +56 -0
  192. package/tasks/roadmap.yml +28 -0
  193. package/tasks/scm.yml +126 -0
  194. package/tasks/scope-undo.yml +36 -0
  195. package/tasks/scope.yml +141 -0
  196. package/tasks/session.yml +19 -0
  197. package/tasks/setup.yml +37 -0
  198. package/tasks/slice.yml +69 -0
  199. package/tasks/spec.yml +41 -0
  200. package/tasks/swarm.yml +85 -0
  201. package/tasks/toolchain.yml +13 -0
  202. package/tasks/triage-actions.yml +94 -0
  203. package/tasks/triage-bootstrap.yml +43 -0
  204. package/tasks/triage-bulk.yml +75 -0
  205. package/tasks/triage-classify.yml +30 -0
  206. package/tasks/triage-queue.yml +50 -0
  207. package/tasks/triage-reconcile.yml +29 -0
  208. package/tasks/triage-scope-drift.yml +29 -0
  209. package/tasks/triage-scope.yml +31 -0
  210. package/tasks/triage-smoketest.yml +33 -0
  211. package/tasks/triage-subscribe.yml +36 -0
  212. package/tasks/triage-summary.yml +29 -0
  213. package/tasks/triage-welcome.yml +32 -0
  214. package/tasks/ts.yml +328 -0
  215. package/tasks/vbrief.yml +206 -0
  216. package/tasks/verify.yml +292 -0
  217. package/templates/agents-entry.md +1 -1
@@ -0,0 +1,745 @@
1
+ #!/usr/bin/env python3
2
+ r"""cache_scanner.py -- quarantine scanner v2 for the unified cache (#883 Story 2).
3
+
4
+ Public surface
5
+ --------------
6
+
7
+ ``scan(content_md: str) -> ScanResult``
8
+ Run the three baseline categories over ``content_md`` and return a
9
+ structured :class:`ScanResult` carrying ``passed`` (False iff any
10
+ hard-fail severity flag fires), the per-category ``flags`` list, and
11
+ the ``transformed_content`` that the cache layer should persist as
12
+ ``content.md`` when ``passed`` is True.
13
+
14
+ ``SCANNER_VERSION``
15
+ Module-level SemVer string. Bumped per the documented rule:
16
+
17
+ - patch (``2.0.x``) -- pattern additions to an existing category
18
+ - minor (``2.x.0``) -- new category landed (e.g. shell-cmd-injection)
19
+ OR a material detection-policy change that alters which bodies flag
20
+ (e.g. v2.1.0 #949 strict-signal injection-heading tuning)
21
+ - major (``x.0.0``) -- semantic rewrite (e.g. cache:put hard-fails on
22
+ every fence-and-pass match instead of writing content.md)
23
+
24
+ Scanner v2 baseline categories
25
+ ------------------------------
26
+
27
+ 1. ``injection-heading`` -- severity ``fence-and-pass``. Tuned in v2.1.0 for
28
+ precision against organic GitHub issue templates (#949). The detector
29
+ no longer fires on bare imperative-shaped headings (``## STEP 1``,
30
+ ``## Action items``, ``## Important notes``, ``## Task list``,
31
+ ``## Background``, ...). It now requires a *structural* injection
32
+ signal before flagging: either (a) an instruction-override / role-hijack
33
+ phrase in the heading text -- ``IGNORE/DISREGARD/FORGET PREVIOUS``,
34
+ ``SYSTEM:`` / ``ASSISTANT:`` / ``USER:`` / ``AGENT:`` / ``OVERRIDE:`` /
35
+ ``DIRECTIVE:`` / ``ROLE:`` / ``INSTRUCTION(S):`` / ``PROMPT:`` /
36
+ ``TOOL:`` / ``FUNCTION:`` at the heading's start -- or (b) a shell
37
+ vector inside the heading's body (``curl ... | sh``, ``wget ... | sh``,
38
+ ``base64 -d``, ``eval``, ``sh -c``, ```eval `cmd``` ``). Plain-prose
39
+ lines with the same instruction-override phrasing also flag. The
40
+ structural-signal check is the sole gate -- there is no allowlist
41
+ short-circuit, so a benign-template heading whose tail smuggles an
42
+ injection phrase (e.g. ``## STEP 1 - Ignore previous instructions``)
43
+ still flags. ``quarantine_ext`` keeps its broader policy untouched --
44
+ this category owns its own detection + wrapping. The flag carries
45
+ ``match_count`` = number of detected sections.
46
+
47
+ 2. ``credentials`` -- severity ``hard-fail``. A curated regex set covering
48
+ the canonical exfiltratable secret shapes (``gh[pousr]_``, ``sk-`` /
49
+ ``sk-ant-``, ``xox[bp]-``, ``AKIA``, PEM private-key headers, ``Bearer``
50
+ tokens, JWTs). When any pattern matches, ``passed`` is set to ``False``
51
+ and ``cache:put`` declines to write ``content.md``. The flag's
52
+ ``detail`` field carries the pattern label (e.g. ``"github-pat"``)
53
+ NOT the matched bytes -- a redacted descriptor only, so the audit log
54
+ never persists the secret it caught.
55
+
56
+ 3. ``invisible-unicode`` -- severity ``strip-and-pass``. A codepoint
57
+ membership test against the canonical bidi / zero-width / tag character
58
+ set (U+200B-200F, U+202A-202E, U+2060, U+2066-2069, U+FEFF,
59
+ U+E0000-U+E007F). Matched codepoints are stripped from
60
+ ``transformed_content`` and the flag's ``match_count`` field records
61
+ how many were removed (the precise codepoint set is summarised in
62
+ ``detail`` as a comma-separated list of ``U+XXXX`` labels).
63
+
64
+ Order of operations
65
+ -------------------
66
+
67
+ Within a single :func:`scan` call:
68
+
69
+ 1. Invisible-unicode strip runs FIRST so subsequent categories scan the
70
+ visible-only text. A credential token that smuggles itself across a
71
+ word boundary using a U+200B (e.g. ``gh\u200bp_<...>``) would otherwise
72
+ slip past the credentials regex; stripping first closes that hole.
73
+
74
+ 2. Credentials regex runs on the stripped text. The flag is recorded
75
+ immediately; we do NOT short-circuit the scan even when ``passed``
76
+ becomes False, because the meta.json audit trail is more useful with
77
+ the full flag list.
78
+
79
+ 3. Injection-heading wrap runs LAST on the stripped text. The transform
80
+ is applied unconditionally; ``transformed_content`` is the
81
+ strip-then-fence output regardless of ``passed``. (Callers that ignore
82
+ the transform on hard-fail are fine -- ``cache:put`` writes
83
+ raw.json + meta.json only when ``passed`` is False, never the
84
+ transformed_content.)
85
+
86
+ CLI
87
+ ---
88
+
89
+ The module is callable as a script for ad-hoc inspection:
90
+
91
+ python scripts/cache_scanner.py [<input-file>]
92
+
93
+ Reads input file (or stdin), runs :func:`scan`, and writes the JSON
94
+ representation of the :class:`ScanResult` to stdout. Exit code is
95
+ 0 when scan_result.passed is True, 2 when False -- mirrors the cache:put
96
+ exit-code contract so a caller piping content through the scanner gets
97
+ an actionable signal without having to parse the JSON.
98
+ """
99
+
100
+ from __future__ import annotations
101
+
102
+ import json
103
+ import re
104
+ import sys
105
+ from dataclasses import asdict, dataclass, field
106
+ from datetime import UTC, datetime
107
+ from pathlib import Path
108
+
109
+ # Make ``scripts`` importable when this file is invoked via
110
+ # ``python scripts/cache_scanner.py`` from a Taskfile dispatch.
111
+ sys.path.insert(0, str(Path(__file__).resolve().parent))
112
+
113
+ # ---------------------------------------------------------------------------
114
+ # Constants
115
+ # ---------------------------------------------------------------------------
116
+
117
+ #: Module-level scanner SemVer. The version is persisted into every
118
+ #: meta.json scan_result.scanner_version field on cache:put so a future
119
+ #: cache:doctor --rescan (deferred to v2) can detect entries written by
120
+ #: an older scanner and re-run them. Bump rules in module docstring.
121
+ #:
122
+ #: 2.0.0 -- baseline (3 categories on, injection-heading reused
123
+ #: quarantine_ext.SUSPICIOUS_TOKENS).
124
+ #: 2.1.0 -- injection-heading detector tuned for precision (#949): tighter
125
+ #: structural-signal policy (instruction-override / role-hijack / body
126
+ #: shell-vector) reduces the false-positive rate from ~85% to <20% on
127
+ #: organic deftai/directive issue bodies. No schema break; existing
128
+ #: meta.json + audit-log records remain valid.
129
+ SCANNER_VERSION: str = "2.1.0"
130
+
131
+ #: Categories baselined in scanner v2. Frozen tuple so the ordering
132
+ #: matches the meta.json ScanFlag.category enum in
133
+ #: vbrief/schemas/cache-meta.schema.json.
134
+ CATEGORIES: tuple[str, ...] = (
135
+ "injection-heading",
136
+ "credentials",
137
+ "invisible-unicode",
138
+ )
139
+
140
+ #: Severity per category. Per-category severity is a documented epic
141
+ #: departure from the design doc's uniform hard-fail; rationale lives in
142
+ #: vbrief/active/.../883-deft-cache-quarantine-v1.vbrief.json under
143
+ #: metadata.x-tracking.design_doc_departures.
144
+ SEVERITY_BY_CATEGORY: dict[str, str] = {
145
+ "injection-heading": "fence-and-pass",
146
+ "credentials": "hard-fail",
147
+ "invisible-unicode": "strip-and-pass",
148
+ }
149
+
150
+ # ---------------------------------------------------------------------------
151
+ # Credentials patterns
152
+ # ---------------------------------------------------------------------------
153
+
154
+ #: Curated regex set for the credentials category. Each entry pairs a
155
+ #: short label (carried into ScanFlag.detail) with a compiled regex. The
156
+ #: label is what the audit log persists -- the matched secret itself is
157
+ #: NEVER persisted (per cache-meta.schema.json's ScanFlag.detail
158
+ #: redaction rule). Patterns are anchored loose-but-specific: tight
159
+ #: enough to avoid false positives in benign prose, loose enough to
160
+ #: catch real-world variations.
161
+ #:
162
+ #: Layout: list of (label, compiled-regex) tuples. Order is the order
163
+ #: emitted into flags; not security-critical, but kept consistent so
164
+ #: tests can pin offsets without flake.
165
+ _CREDENTIAL_PATTERNS: list[tuple[str, re.Pattern[str]]] = [
166
+ # GitHub personal-access tokens. The four prefixes (``ghp_``, ``gho_``,
167
+ # ``ghu_``, ``ghs_``, ``ghr_``) cover personal / oauth / user-to-server
168
+ # / server-to-server / refresh tokens respectively. The 30+ trailing
169
+ # alphanumeric run is the documented gh format.
170
+ ("github-pat", re.compile(r"\bgh[pousr]_[A-Za-z0-9]{30,}\b")),
171
+ # Anthropic API key (sk-ant-...). Listed BEFORE the generic ``sk-``
172
+ # OpenAI pattern so the more-specific match wins (re.search is
173
+ # iteration-order independent but the per-flag label depends on
174
+ # which pattern fired first; sk-ant should win for clarity).
175
+ ("anthropic-api-key", re.compile(r"\bsk-ant-[A-Za-z0-9_-]{20,}\b")),
176
+ # OpenAI API key (sk-...). The 20+ trailing run keeps the pattern
177
+ # specific enough to skip false positives like ``sk-discovery`` or
178
+ # ``sk-rules`` that show up in non-token prose.
179
+ ("openai-api-key", re.compile(r"\bsk-[A-Za-z0-9]{20,}\b")),
180
+ # Slack tokens. ``xoxb-`` (bot) and ``xoxp-`` (user) are the two
181
+ # commonly-leaked variants; ``xoxa-`` / ``xoxs-`` are session-scoped
182
+ # and out of v1 scope.
183
+ ("slack-token", re.compile(r"\bxox[bp]-[A-Za-z0-9-]{20,}\b")),
184
+ # AWS access-key-id. The ``AKIA`` prefix + exactly-16 A-Z0-9 run is
185
+ # the canonical AWS IAM access-key shape; ``ASIA`` (session keys)
186
+ # is intentionally NOT covered in v1 because session keys are
187
+ # short-lived and the false-positive rate against codenames is high.
188
+ ("aws-access-key", re.compile(r"\bAKIA[0-9A-Z]{16}\b")),
189
+ # PEM private key BEGIN header. Matches RSA / DSA / EC / generic
190
+ # ``PRIVATE KEY`` variants (``OPENSSH PRIVATE KEY`` is the modern
191
+ # ssh-keygen default).
192
+ (
193
+ "pem-private-key",
194
+ re.compile(
195
+ r"-----BEGIN (?:RSA |DSA |EC |OPENSSH |ENCRYPTED )?PRIVATE KEY-----"
196
+ ),
197
+ ),
198
+ # Bearer authorization header. The 20+ run guards against the
199
+ # word "Bearer" used in benign prose (e.g. "the Bearer of bad news").
200
+ (
201
+ "bearer-token",
202
+ re.compile(r"\bBearer\s+[A-Za-z0-9_.~+/=-]{20,}\b"),
203
+ ),
204
+ # JWT shape: three base64url segments separated by dots. The
205
+ # ``eyJ`` prefix is the base64url encoding of the JSON ``{"`` header
206
+ # opener -- effectively unique to JWTs.
207
+ (
208
+ "jwt",
209
+ re.compile(r"\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b"),
210
+ ),
211
+ ]
212
+
213
+ # ---------------------------------------------------------------------------
214
+ # Invisible-unicode codepoints
215
+ # ---------------------------------------------------------------------------
216
+
217
+ #: Codepoint set for the invisible-unicode category. Each codepoint that
218
+ #: appears here is stripped from the content and counted against the
219
+ #: invisible-unicode flag. The set covers:
220
+ #:
221
+ #: - U+200B..U+200F -- zero-width space, zero-width non-joiner, joiner,
222
+ #: left-to-right mark, right-to-left mark.
223
+ #: - U+202A..U+202E -- LRE, RLE, PDF, LRO, RLO (bidi overrides; the
224
+ #: well-known "trojan source" attack vector).
225
+ #: - U+2060 -- word joiner (zero-width non-breaking).
226
+ #: - U+2066..U+2069 -- LRI, RLI, FSI, PDI (isolates; #2024-bidi-attack
227
+ #: vector).
228
+ #: - U+FEFF -- byte-order mark / zero-width no-break space.
229
+ #: - U+E0000..U+E007F -- tag characters / language-tag block (Unicode
230
+ #: "tag" plane; abused for invisible exfiltration).
231
+ _INVISIBLE_RANGES: tuple[tuple[int, int], ...] = (
232
+ (0x200B, 0x200F),
233
+ (0x202A, 0x202E),
234
+ (0x2060, 0x2060),
235
+ (0x2066, 0x2069),
236
+ (0xFEFF, 0xFEFF),
237
+ (0xE0000, 0xE007F),
238
+ )
239
+
240
+
241
def _is_invisible(ch: str) -> bool:
    """Report whether ``ch`` belongs to the invisible-unicode strip set.

    ``ch`` is passed to :func:`ord`, so it must be a single character.
    Membership is an inclusive range test against ``_INVISIBLE_RANGES``.
    """
    codepoint = ord(ch)
    for first, last in _INVISIBLE_RANGES:
        if first <= codepoint <= last:
            return True
    return False
245
+
246
+
247
+ # ---------------------------------------------------------------------------
248
+ # Injection-heading detection (#949 tuning)
249
+ # ---------------------------------------------------------------------------
250
+
251
+ #: Structural injection signal detected within a heading TEXT. A heading
252
+ #: that triggers any of these patterns is treated as a real injection
253
+ #: vector. The patterns are deliberately narrow so generic prose ("the
254
+ #: user clicks ...", "system requires ...") does NOT fire.
255
+ #:
256
+ #: 1. Override / disregard phrases -- the canonical prompt-injection
257
+ #: opener ("ignore previous instructions", "disregard the above",
258
+ #: "forget prior"). Word-boundary on the verb; the operand requires
259
+ #: one of {previous, prior, above, all, earlier, your}.
260
+ #: 2. Role-hijack prefix at the START of the heading text -- ``SYSTEM:``,
261
+ #: ``ASSISTANT:``, ``USER:``, ``AGENT:``, ``TOOL:``, ``FUNCTION:``,
262
+ #: ``OVERRIDE:``, ``DIRECTIVE:``, ``ROLE:``, ``PROMPT:``,
263
+ #: ``INSTRUCTION:``, ``INSTRUCTIONS:``. The colon-anchored shape is
264
+ #: the distinctive injection vector; ``System Requirements`` / ``User
265
+ #: Story`` / etc. do NOT match because they lack the colon.
266
+ _INJECTION_OVERRIDE_RE: re.Pattern[str] = re.compile(
267
+ r"\b(?:ignore|disregard|forget|override|bypass)\s+(?:the\s+|all\s+|any\s+)?"
268
+ r"(?:previous|prior|above|earlier|all|your|preceding|original|system)\b",
269
+ re.IGNORECASE,
270
+ )
271
+
272
+ _HEADING_ROLE_PREFIXES: tuple[str, ...] = (
273
+ "SYSTEM",
274
+ "ASSISTANT",
275
+ "USER",
276
+ "AGENT",
277
+ "TOOL",
278
+ "FUNCTION",
279
+ "OVERRIDE",
280
+ "DIRECTIVE",
281
+ "ROLE",
282
+ "PROMPT",
283
+ "INSTRUCTION",
284
+ "INSTRUCTIONS",
285
+ )
286
+
287
+ _HEADING_ROLE_PREFIX_RE: re.Pattern[str] = re.compile(
288
+ r"^(?:" + "|".join(re.escape(p) for p in _HEADING_ROLE_PREFIXES) + r")\s*:",
289
+ re.IGNORECASE,
290
+ )
291
+
292
+ #: Body-context shell-vector regex. When a heading's body (the lines
293
+ #: between the heading and the next heading) contains any of these
294
+ #: patterns, the heading is treated as a pre-roll for a shell-injection
295
+ #: vector and flagged. This is the ONLY body-context signal the v2.1.0
296
+ #: injection-heading detector consumes; the dedicated shell-cmd-injection
297
+ #: scanner category that would do this body-wide is intentionally
298
+ #: deferred (#949 follow-up).
299
+ _BODY_VECTOR_RE: re.Pattern[str] = re.compile(
300
+ # Shell set kept consistent across all three sub-patterns (pipe-to-
301
+ # shell, ``sh -c``, ``/bin/sh -c``) so a vector like ``ksh -c '...'``
302
+ # or ``/bin/ksh -c '...'`` is not silently passed through; ksh was
303
+ # previously only listed in the pipe-to-shell alternative which left
304
+ # a blind spot the other two branches did not cover. Refs PR #957
305
+ # Greptile review on commit 5acfa8a.
306
+ r"(?:curl|wget|fetch)\s+[^|\n]*\|\s*(?:sh|bash|zsh|ksh)\b"
307
+ r"|\bbase64\s+(?:-d|--decode|-D)\b"
308
+ r"|\beval\s*[\(\$\"'`]"
309
+ r"|\b(?:sh|bash|zsh|ksh)\s+-c\s+[\"']"
310
+ r"|\b/bin/(?:sh|bash|zsh|ksh)\s+-c\s+[\"']",
311
+ re.IGNORECASE,
312
+ )
313
+
314
+ #: Heading regex (mirrors quarantine_ext._HEADING_RE). 1-6 hashes plus
315
+ #: at least one space; setext-style ``===`` / ``---`` is intentionally
316
+ #: out of scope (vanishingly rare in GitHub issue bodies, multi-line
317
+ #: lookahead would complicate the iteration).
318
+ _HEADING_RE: re.Pattern[str] = re.compile(r"^(#{1,6})\s+(.*\S.*)$")
319
+
320
+ #: Code-fence delimiter regex.
321
+ _FENCE_RE: re.Pattern[str] = re.compile(r"^(```|~~~)")
322
+
323
+ #: Quarantine fence labels (mirror quarantine_ext for downstream-grep
324
+ #: compatibility -- the literal ``quarantined`` label is the contract).
325
+ _QUARANTINE_FENCE_OPEN: str = "```quarantined"
326
+ _QUARANTINE_FENCE_CLOSE: str = "```"
327
+
328
+
329
def _heading_text(line: str) -> str | None:
    """Extract the text of an ATX heading line.

    Returns the part after the ``#`` prefix with surrounding whitespace
    trimmed, or ``None`` when ``line`` does not match ``_HEADING_RE``.
    """
    if (parsed := _HEADING_RE.match(line)) is not None:
        return parsed.group(2).strip()
    return None
335
+
336
+
337
def _heading_signal(text: str) -> bool:
    """Decide whether heading text carries a structural injection signal.

    True when ``text`` contains an instruction-override phrase anywhere
    (``_INJECTION_OVERRIDE_RE``) or starts with a colon-terminated
    role-hijack prefix (``_HEADING_ROLE_PREFIX_RE``).
    """
    has_override_phrase = _INJECTION_OVERRIDE_RE.search(text) is not None
    return has_override_phrase or _HEADING_ROLE_PREFIX_RE.match(text) is not None
342
+
343
+
344
def _body_has_shell_vector(body_lines: list[str]) -> bool:
    """Return True iff any non-fenced line in ``body_lines`` matches the shell vector.

    Lines inside a fenced code block are skipped, so a legitimate technical
    doc that merely illustrates a shell command inside a fenced example
    (e.g. a ``## Steps to reproduce`` body showing ``curl ... | sh`` in a
    ``sh`` block) does not false-positive.

    Fence tracking follows the CommonMark rules that matter here:

    * a line matching ``_FENCE_RE`` while no block is open is an opener;
      its fence character and run length are remembered;
    * a closer is a line made up solely of that same fence character
      (after right-trim, i.e. no info string) and at least as long as the
      opener's run. A line such as a backtick fence followed by ``python``
      inside an open block is therefore NOT a closer.

    Tracking the run length (rather than only the first three delimiter
    characters) matters for detection: with a fixed three-character
    comparison, a block opened or closed with four or more fence
    characters never closed, and every later body line -- including a real
    shell vector -- was silently skipped.

    NOTE(review): the fence handling in ``_detect_injection_heading`` is
    described as the sibling of this state machine; confirm it applies the
    same run-length rule.

    Args:
        body_lines: The lines between a heading and the next heading.

    Returns:
        True when at least one line outside every fenced block matches
        ``_BODY_VECTOR_RE``; False otherwise (including for an empty list).
    """
    open_char: str | None = None  # fence character of the open block, if any
    open_len = 0  # length of the opener's fence run
    for ln in body_lines:
        fence_match = _FENCE_RE.match(ln)
        if fence_match:
            fence_char = fence_match.group(1)[0]
            trimmed = ln.rstrip()
            # Length of the leading run of the fence character.
            run_len = len(trimmed) - len(trimmed.lstrip(fence_char))
            if open_char is None:
                open_char, open_len = fence_char, run_len
            elif (
                fence_char == open_char
                and run_len == len(trimmed)  # nothing but fence characters
                and run_len >= open_len
            ):
                open_char = None
            # Fence lines themselves are never scanned for vectors.
            continue
        if open_char is not None:
            continue
        if _BODY_VECTOR_RE.search(ln):
            return True
    return False
372
+
373
+
374
+ # ---------------------------------------------------------------------------
375
+ # Result dataclasses
376
+ # ---------------------------------------------------------------------------
377
+
378
+
379
@dataclass
class ScanFlag:
    """A single scanner finding.

    Field layout mirrors ``$defs/ScanFlag`` in
    vbrief/schemas/cache-meta.schema.json.
    """

    # Which scanner category produced the finding.
    category: str
    # Severity label attached to the finding.
    severity: str
    # Human-readable descriptor of the finding.
    detail: str
    # Number of matches behind this flag; defaults to 0.
    match_count: int = 0
387
+
388
+
389
@dataclass
class ScanResult:
    """Aggregate outcome of one scan."""

    # Overall verdict of the scan.
    passed: bool
    # Version string of the scanner that produced this result.
    scanner_version: str
    # Individual findings; a fresh empty list per instance by default.
    flags: list[ScanFlag] = field(default_factory=list)
    # Content after the scanner's transforms; empty string by default.
    transformed_content: str = ""
    # Timestamp string of the scan; empty string by default.
    scanned_at: str = ""

    def to_meta_dict(self) -> dict[str, object]:
        """Build the ``scan_result`` portion of meta.json.

        Only ``passed``, ``scanned_at``, ``scanner_version`` and ``flags``
        are emitted; ``transformed_content`` is not part of the output.
        Each flag is rendered through ``asdict`` and its ``match_count``
        key is left out when the value is falsy (i.e. 0).

        The cache layer wraps this in the source/key/fetched_at/... envelope
        before persisting; the scanner does not build the full meta.json
        because TTL / fetched_at are cache-layer concerns.
        """
        rendered_flags: list[dict[str, object]] = []
        for flag in self.flags:
            entry = asdict(flag)
            # Drop a zero/falsy match_count; every other key is kept as-is.
            if "match_count" in entry and not entry["match_count"]:
                del entry["match_count"]
            rendered_flags.append(entry)
        return {
            "passed": self.passed,
            "scanned_at": self.scanned_at,
            "scanner_version": self.scanner_version,
            "flags": rendered_flags,
        }
415
+
416
+
417
+ # ---------------------------------------------------------------------------
418
+ # Strip-then-flag helpers
419
+ # ---------------------------------------------------------------------------
420
+
421
+
422
def _strip_invisible(text: str) -> tuple[str, list[str]]:
    """Remove invisible-unicode characters from ``text``.

    Returns ``(stripped_text, removed_labels)``. ``removed_labels`` holds
    one ``U+XXXX`` label (uppercase hex, at least four digits) per distinct
    codepoint removed, ordered by first occurrence in the input. Falsy
    input is returned unchanged together with an empty label list.
    """
    if not text:
        return text, []
    kept: list[str] = []
    # Insertion-ordered: codepoint -> label, so each label is listed once.
    labels: dict[int, str] = {}
    for ch in text:
        if not _is_invisible(ch):
            kept.append(ch)
        else:
            labels.setdefault(ord(ch), f"U+{ord(ch):04X}")
    return "".join(kept), list(labels.values())
441
+
442
+
443
def _detect_credentials(text: str) -> list[ScanFlag]:
    """Build one :class:`ScanFlag` for every credential pattern found in ``text``.

    Patterns are tried in ``_CREDENTIAL_PATTERNS`` order and the resulting
    flags keep that order. ``detail`` carries only the pattern label (e.g.
    ``"github-pat"``), never the matched text, so the secret does not end
    up in the audit log. ``match_count`` is the number of non-overlapping
    matches ``findall`` reports for that pattern. Falsy input yields an
    empty list.
    """
    if not text:
        return []
    hit_counts = (
        (label, len(pattern.findall(text))) for label, pattern in _CREDENTIAL_PATTERNS
    )
    return [
        ScanFlag(
            category="credentials",
            severity="hard-fail",
            detail=f"matched credentials pattern: {label}",
            match_count=count,
        )
        for label, count in hit_counts
        if count
    ]
467
+
468
+
469
def _detect_injection_heading(text: str) -> tuple[str, ScanFlag | None]:
    """Wrap suspicious sections in ``quarantined`` fences using v2.1.0 policy.

    Detection rules (#949 tuning):

    1. **Heading structural signal.** When the heading text contains an
       instruction-override phrase (``IGNORE PREVIOUS`` /
       ``DISREGARD ABOVE`` / ``OVERRIDE ALL``) OR starts with a
       role-hijack prefix (``SYSTEM:`` / ``ASSISTANT:`` / ``USER:`` /
       ``AGENT:`` / ``OVERRIDE:`` / ``DIRECTIVE:`` / ``ROLE:`` /
       ``INSTRUCTION(S):`` / ``PROMPT:`` / ``TOOL:`` / ``FUNCTION:``) the
       heading and its section are wrapped. The check runs on the full
       heading text with no allowlist short-circuit, so a benign-template
       heading whose tail smuggles an injection phrase (e.g.
       ``## STEP 1 - Ignore previous instructions``) still flags.
    2. **Body shell vector.** When the heading's body contains a
       shell-injection vector (``curl ... | sh`` / ``base64 -d`` /
       ``eval``, including the backtick command-substitution form /
       ``sh -c``) the heading and its section are wrapped even when the
       heading text itself looks benign.
    3. **Inline (non-heading) injection.** Any line outside a wrapped
       section that carries an instruction-override phrase or a body
       shell vector is wrapped on its own.
    4. **Idempotency.** Lines inside an existing fenced code block are
       passed through verbatim, so a previously wrapped ``quarantined``
       block is left untouched on re-scan.

    Args:
        text: Markdown to inspect. A falsy value is returned unchanged
            with no flag.

    Returns:
        ``(transformed_content, flag)``. ``flag`` is ``None`` when nothing
        was wrapped. ``flag.match_count`` is the number of sections
        wrapped (NOT individual tokens). Line terminators are normalised
        to ``"\\n"`` because the text is split with ``splitlines()`` and
        re-joined; a trailing ``"\\n"`` is preserved.
    """
    if not text:
        return text, None

    lines = text.splitlines()
    out: list[str] = []
    in_fence: str | None = None
    sections_wrapped = 0
    i = 0

    while i < len(lines):
        line = lines[i]

        # Existing fenced code blocks pass through verbatim (idempotent on
        # re-scan). A closer must be ONLY the fence delimiter after
        # right-trimming -- per CommonMark a closing fence carries no info
        # string, so a fence line with an info string seen while already
        # inside a fence is treated as nested content, not as the closer.
        fence_match = _FENCE_RE.match(line)
        if fence_match:
            delim = fence_match.group(1)
            if in_fence is None:
                in_fence = delim
            elif line.rstrip() == in_fence:
                in_fence = None
            out.append(line)
            i += 1
            continue
        if in_fence is not None:
            out.append(line)
            i += 1
            continue

        heading_text = _heading_text(line)
        if heading_text is not None:
            # Determine the section span: this heading down to (but not
            # including) the next heading. Fenced blocks are consumed
            # whole so a heading-looking line inside a fence never splits
            # the section.
            section_end = i + 1
            while section_end < len(lines):
                nxt = lines[section_end]
                nested_fence = _FENCE_RE.match(nxt)
                if nested_fence:
                    section_end += 1
                    # Use the delimiter captured by _FENCE_RE -- the same
                    # value the top-level fence tracking stores in
                    # ``in_fence`` -- instead of a fixed three-character
                    # slice of the line, so both loops agree on which
                    # line closes the fence.
                    nested = nested_fence.group(1)
                    while (
                        section_end < len(lines)
                        and lines[section_end].rstrip() != nested
                    ):
                        section_end += 1
                    # Consume the closer; clamp so an unclosed fence does
                    # not push the cursor past the end of the document.
                    section_end = min(section_end + 1, len(lines))
                    continue
                if _HEADING_RE.match(nxt):
                    break
                section_end += 1

            body_lines = lines[i + 1 : section_end]

            # The structural-signal check is the sole gate on the heading
            # text; the body shell-vector check fires independently so a
            # clean heading that smuggles ``curl ... | sh`` in its body is
            # still flagged.
            heading_signal = _heading_signal(heading_text)
            body_signal = _body_has_shell_vector(body_lines)

            if heading_signal or body_signal:
                out.append(_QUARANTINE_FENCE_OPEN)
                out.extend(lines[i:section_end])
                out.append(_QUARANTINE_FENCE_CLOSE)
                sections_wrapped += 1
                i = section_end
                continue

            # Heading itself is fine; pass it through and let the
            # per-line scan below handle its body line by line.
            out.append(line)
            i += 1
            continue

        # Non-heading line: wrap if it carries an inline injection phrase
        # or a shell vector. Bare ``IMPORTANT:`` / ``STEP`` style tokens
        # in prose are deliberately NOT wrapped under the v2.1.0 policy.
        if _INJECTION_OVERRIDE_RE.search(line) or _BODY_VECTOR_RE.search(
            line
        ):
            out.append(_QUARANTINE_FENCE_OPEN)
            out.append(line)
            out.append(_QUARANTINE_FENCE_CLOSE)
            sections_wrapped += 1
            i += 1
            continue

        out.append(line)
        i += 1

    # splitlines() drops the final terminator; restore a trailing newline
    # when the input had one.
    suffix = "\n" if text.endswith("\n") else ""
    wrapped_text = "\n".join(out) + suffix

    if sections_wrapped == 0:
        return wrapped_text, None
    return wrapped_text, ScanFlag(
        category="injection-heading",
        severity="fence-and-pass",
        detail=(
            f"wrapped {sections_wrapped} injection-shaped section(s) in"
            " `quarantined` fence (v2.1.0 strict-signal policy)"
        ),
        match_count=sections_wrapped,
    )
624
+
625
+
626
+ # ---------------------------------------------------------------------------
627
+ # Public scan API
628
+ # ---------------------------------------------------------------------------
629
+
630
+
631
def scan(content_md: str, *, scanned_at: str | None = None) -> ScanResult:
    """Scan untrusted markdown and return a :class:`ScanResult`.

    The pipeline runs three stages in order: strip invisible-unicode
    codepoints, detect credentials on the stripped text, then wrap
    injection-shaped sections of the stripped text.

    Args:
        content_md: Untrusted markdown body (e.g. an issue body fetched
            via ``scm:issue:view --json body``).
        scanned_at: Optional timestamp override. When ``None`` the value
            of ``_utc_now_iso()`` is used; tests pass an explicit value
            for deterministic snapshots.

    Returns:
        A :class:`ScanResult` with:

        - ``passed``: ``False`` iff any flag has severity ``"hard-fail"``
          (in this function only the credentials stage emits those).
        - ``flags``: findings in the order invisible-unicode,
          credentials, injection-heading.
        - ``transformed_content``: the strip-then-fence output.
        - ``scanner_version`` / ``scanned_at``: version and timestamp
          stamps.
    """
    # Resolve the timestamp before any scanning work, as the first step.
    timestamp = _utc_now_iso() if scanned_at is None else scanned_at
    findings: list[ScanFlag] = []

    # Stage 1: strip invisibles first so the later regexes operate on the
    # visible-only surface (a token split by U+200B would otherwise dodge
    # the credentials patterns).
    visible, removed_labels = _strip_invisible(content_md)
    if removed_labels:
        # Count every stripped character in the ORIGINAL text, not the
        # number of distinct labels: 17 U+200B characters report 17.
        stripped_count = len(
            [ch for ch in (content_md or "") if _is_invisible(ch)]
        )
        label_summary = ", ".join(removed_labels)
        findings.append(
            ScanFlag(
                category="invisible-unicode",
                severity="strip-and-pass",
                detail=(
                    f"stripped {stripped_count} invisible-unicode codepoint(s): "
                    + label_summary
                ),
                match_count=stripped_count,
            )
        )

    # Stage 2: run every credentials pattern (no short-circuit) so the
    # full list of findings is available for auditing.
    findings += _detect_credentials(visible)

    # Stage 3: fence injection-shaped sections of the stripped text.
    fenced, injection_flag = _detect_injection_heading(visible)
    if injection_flag is not None:
        findings.append(injection_flag)

    no_hard_fail = all(flag.severity != "hard-fail" for flag in findings)
    return ScanResult(
        passed=no_hard_fail,
        scanner_version=SCANNER_VERSION,
        flags=findings,
        transformed_content=fenced,
        scanned_at=timestamp,
    )
698
+
699
+
700
+ def _utc_now_iso() -> str:
701
+ """Return current UTC time as an RFC-3339 / ISO-8601 string with ``Z`` suffix."""
702
+ # The cache-meta.schema.json dateTime guard requires a ``Z`` or
703
+ # +HH:MM suffix; ``datetime.isoformat()`` emits ``+00:00`` which
704
+ # doesn't match the schema's regex. We replace the suffix
705
+ # manually so the scan output validates without an extra normalisation
706
+ # pass at the caller.
707
+ return datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
708
+
709
+
710
+ # ---------------------------------------------------------------------------
711
+ # CLI
712
+ # ---------------------------------------------------------------------------
713
+
714
+
715
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: scan a file (or stdin) and print JSON.

    Args:
        argv: Argument list; defaults to ``sys.argv[1:]`` when ``None``.
            A leading ``-h`` / ``--help`` prints the module docstring.
            Otherwise the first argument, if present, is the path of a
            UTF-8 file to scan; with no arguments stdin is read.

    Returns:
        ``0`` when the scan passed (or help was shown); ``2`` when the
        scan did not pass, so a caller piping content through this
        script gets an actionable exit status without parsing the JSON.
    """
    cli_args = list(sys.argv[1:] if argv is None else argv)
    if cli_args and cli_args[0] in {"-h", "--help"}:
        sys.stdout.write(__doc__ or "")
        return 0

    if cli_args:
        source_text = Path(cli_args[0]).read_text(encoding="utf-8")
    else:
        source_text = sys.stdin.read()

    outcome = scan(source_text)
    report = {
        "passed": outcome.passed,
        "scanner_version": outcome.scanner_version,
        "scanned_at": outcome.scanned_at,
        "flags": [asdict(flag) for flag in outcome.flags],
        "transformed_content": outcome.transformed_content,
    }
    # Emit the JSON document followed by a single trailing newline.
    sys.stdout.write(json.dumps(report, indent=2, ensure_ascii=False) + "\n")
    if outcome.passed:
        return 0
    return 2
742
+
743
+
744
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())