@onlooker-community/ecosystem 0.18.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. package/.claude-plugin/marketplace.json +13 -0
  2. package/.claude-plugin/plugin.json +1 -1
  3. package/.release-please-manifest.json +3 -2
  4. package/CHANGELOG.md +7 -0
  5. package/CLAUDE.md +1 -0
  6. package/package.json +2 -2
  7. package/plugins/warden/.claude-plugin/plugin.json +14 -0
  8. package/plugins/warden/CHANGELOG.md +10 -0
  9. package/plugins/warden/config.json +51 -0
  10. package/plugins/warden/docs/adr/001-detect-after-ingest-gate-before-action.md +62 -0
  11. package/plugins/warden/docs/design.md +123 -0
  12. package/plugins/warden/hooks/hooks.json +73 -0
  13. package/plugins/warden/scripts/hooks/warden-post-tool-use.sh +201 -0
  14. package/plugins/warden/scripts/hooks/warden-pre-tool-use.sh +94 -0
  15. package/plugins/warden/scripts/hooks/warden-session-start.sh +52 -0
  16. package/plugins/warden/scripts/lib/warden-cli.sh +124 -0
  17. package/plugins/warden/scripts/lib/warden-config.sh +79 -0
  18. package/plugins/warden/scripts/lib/warden-evaluator.sh +246 -0
  19. package/plugins/warden/scripts/lib/warden-events.sh +85 -0
  20. package/plugins/warden/scripts/lib/warden-gate-state.sh +105 -0
  21. package/plugins/warden/scripts/lib/warden-patterns.sh +132 -0
  22. package/plugins/warden/scripts/lib/warden-sanitizer.sh +80 -0
  23. package/plugins/warden/scripts/lib/warden-scanner.sh +119 -0
  24. package/plugins/warden/scripts/lib/warden-ulid.sh +50 -0
  25. package/plugins/warden/skills/warden/SKILL.md +49 -0
  26. package/release-please-config.json +16 -0
  27. package/test/bats/warden-config.bats +54 -0
  28. package/test/bats/warden-events.bats +85 -0
  29. package/test/bats/warden-gate-state.bats +67 -0
  30. package/test/bats/warden-patterns.bats +58 -0
  31. package/test/bats/warden-sanitizer.bats +53 -0
  32. package/test/bats/warden-scanner.bats +56 -0
  33. package/test/bats/warden-ulid.bats +30 -0
@@ -0,0 +1,105 @@
1
+ #!/usr/bin/env bash
2
+ # Session-scoped content gate state for Warden.
3
+ #
4
+ # The gate is a single JSON lock per session under
5
+ # $ONLOOKER_DIR/warden/sessions/<session_id>/gate.json
6
+ #
7
+ # Absent file or {"state":"open"} → gate open (writes/edits/bash allowed).
8
+ # {"state":"closed", ...} → gate closed (those operations are blocked).
9
+ #
10
+ # The gate is closed by the detection hook on a positive scan and cleared
11
+ # ONLY by the user via the /warden skill (clear_policy: user_override_only).
12
+ #
13
+ # Exposes:
14
+ # warden_gate_dir <session_id>
15
+ # warden_gate_file <session_id>
16
+ # warden_gate_is_closed <session_id> # return 0 if closed
17
+ # warden_gate_close <session_id> <threat_json> # write closed lock
18
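+ # warden_gate_read <session_id> # echo gate JSON (empty if the lock file is absent)
19
+ # warden_gate_threat <session_id> # echo stored threat object (empty if no lock file or no threat recorded)
20
+ # warden_gate_clear <session_id> # remove lock; echo prior threat object
+ # warden_list_closed_sessions # echo ids of sessions whose gate is closed, one per line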
21
+
22
# Print the directory that holds one session's gate state.
#
# Arguments: $1 - session id
# Globals:   ONLOOKER_DIR (read) - state root; ~/.onlooker when unset or empty
# Outputs:   <root>/warden/sessions/<session_id> on stdout, no trailing newline
warden_gate_dir() {
  local sid="$1"
  local root="${ONLOOKER_DIR:-${HOME}/.onlooker}"
  printf '%s/warden/sessions/%s' "$root" "$sid"
}
27
+
28
# Print the path of a session's gate lock file (gate.json inside the
# directory reported by warden_gate_dir).
#
# Arguments: $1 - session id
# Outputs:   the lock file path on stdout, no trailing newline
warden_gate_file() {
  local sid="$1"
  local gate_dir
  gate_dir=$(warden_gate_dir "$sid")
  printf '%s/gate.json' "$gate_dir"
}
32
+
33
# Report whether a session's gate is closed.
#
# Arguments: $1 - session id
# Returns:   0 when the lock file exists and its .state is "closed";
#            1 when the file is missing, jq cannot read it, or the state is
#            anything else (a missing .state is treated as "open").
warden_gate_is_closed() {
  local lock
  lock=$(warden_gate_file "$1")

  if [[ ! -f "$lock" ]]; then
    return 1
  fi

  local gate_state
  if ! gate_state=$(jq -r '.state // "open"' "$lock" 2>/dev/null); then
    return 1
  fi

  [[ "$gate_state" == "closed" ]]
}
42
+
43
# Echo the raw contents of a session's gate lock file.
#
# Arguments: $1 - session id
# Outputs:   the file contents on stdout; nothing when the file is absent
# Returns:   1 when the lock file does not exist, otherwise cat's status
warden_gate_read() {
  local lock
  lock=$(warden_gate_file "$1")

  if [[ -f "$lock" ]]; then
    cat "$lock" 2>/dev/null
  else
    printf ''
    return 1
  fi
}
50
+
51
# Echo the threat object stored in a session's gate lock, as compact JSON.
#
# Arguments: $1 - session id
# Outputs:   the .threat value on stdout; nothing when the lock file is
#            absent or holds no threat
# Returns:   1 when the lock file does not exist, otherwise jq's status
warden_gate_threat() {
  local lock
  lock=$(warden_gate_file "$1")

  if [[ ! -f "$lock" ]]; then
    printf ''
    return 1
  fi

  jq -c '.threat // empty' "$lock" 2>/dev/null
}
58
+
59
# Close the gate by writing a "closed" lock for the session.
#
# Arguments:
#   $1 - session id
#   $2 - threat object (JSON) to record; empty means {}
# Returns: 0 once the lock is in place, 1 if the directory or file cannot be
#          written.
#
# Closing must not depend on the threat payload being well-formed: if jq
# rejects $2, the lock is still written with an empty threat object rather
# than leaving the gate open after a positive detection. The lock is written
# to a temporary file and renamed into place so a concurrent reader never
# sees a partially written (unparseable) gate.json.
warden_gate_close() {
  local session_id="$1"
  local threat_json="${2:-}"
  [[ -n "$threat_json" ]] || threat_json='{}'

  local dir file now
  dir=$(warden_gate_dir "$session_id")
  file=$(warden_gate_file "$session_id")
  mkdir -p "$dir" 2>/dev/null || return 1

  now=$(date +%s 2>/dev/null) || now=0
  [[ "$now" =~ ^[0-9]+$ ]] || now=0

  local out
  if ! out=$(jq -n \
    --argjson ts "$now" \
    --argjson threat "$threat_json" \
    '{state:"closed", closed_at:$ts, threat:$threat}' 2>/dev/null); then
    # Malformed threat JSON (or jq unavailable): still close, without details.
    out=$(printf '{"state":"closed","closed_at":%s,"threat":{}}' "$now")
  fi

  local tmp
  tmp=$(mktemp "${file}.XXXXXX" 2>/dev/null) || return 1
  if ! printf '%s\n' "$out" > "$tmp" 2>/dev/null; then
    rm -f -- "$tmp" 2>/dev/null
    return 1
  fi
  if ! mv -f -- "$tmp" "$file" 2>/dev/null; then
    rm -f -- "$tmp" 2>/dev/null
    return 1
  fi
}
76
+
77
# Print the id of every session whose gate lock says "closed", one per line.
# The /warden skill uses this to find the active gate when CLAUDE_SESSION_ID
# is not available in its environment.
#
# Globals: ONLOOKER_DIR (read) - state root; ~/.onlooker when unset or empty
# Returns: 0, including when the sessions directory does not exist
warden_list_closed_sessions() {
  local root="${ONLOOKER_DIR:-${HOME}/.onlooker}"
  local sessions_dir="${root}/warden/sessions"

  if [[ ! -d "$sessions_dir" ]]; then
    return 0
  fi

  local lock gate_state session_dir
  for lock in "$sessions_dir"/*/gate.json; do
    # An unmatched glob stays literal; the -f test filters it out.
    if [[ ! -f "$lock" ]]; then
      continue
    fi
    if ! gate_state=$(jq -r '.state // "open"' "$lock" 2>/dev/null); then
      continue
    fi
    if [[ "$gate_state" == "closed" ]]; then
      # The session id is the name of the directory holding gate.json.
      session_dir="${lock%/gate.json}"
      printf '%s\n' "${session_dir##*/}"
    fi
  done
}
93
+
94
# Clear the gate: remove the session's lock file and echo the threat object
# it held (used for the cleared event).
#
# Arguments: $1 - session id
# Outputs:   the prior .threat as compact JSON; empty when none was recorded
#            or the lock could not be parsed
# Returns:   1 when no lock file exists or it cannot be removed, else 0.
#            The lock's "state" field is not inspected.
warden_gate_clear() {
  local lock
  lock=$(warden_gate_file "$1")

  if [[ ! -f "$lock" ]]; then
    return 1
  fi

  local recorded
  if ! recorded=$(jq -c '.threat // empty' "$lock" 2>/dev/null); then
    recorded=""
  fi

  if ! rm -f "$lock" 2>/dev/null; then
    return 1
  fi

  printf '%s' "$recorded"
}
@@ -0,0 +1,132 @@
1
+ #!/usr/bin/env bash
2
+ # Deterministic injection-pattern floor for Warden.
3
+ #
4
+ # Classifies a block of ingested content against a curated set of
5
+ # prompt-injection signatures, mapped to the five schema threat_types:
6
+ # prompt_injection · instruction_override · credential_exfiltration
7
+ # command_injection · social_engineering
8
+ #
9
+ # Two severities:
10
+ # strong — explicit, high-precision phrasing. Closes the gate on its own.
11
+ # weak — heuristic suspicion. Below the close threshold; escalates to the
12
+ # evaluator (when escalation is enabled) rather than closing alone.
13
+ #
14
+ # Exposes:
15
+ # warden_pattern_classify <content>
16
+ # → JSON {"severity":"strong|weak|none","threat_type":"<t>",
17
+ # "matched_pattern":"<regex>","hit_count":<n>}
18
+ #
19
+ # The matched_pattern is retained for the local gate record only — it is NOT
20
+ # emitted in warden.threat.detected (the schema forbids extra fields there).
21
+
22
+ # Each entry: "threat_type|regex" (extended regex, matched case-insensitively).
23
+
24
# Strong signatures. Each element is "threat_type|regex": everything before
# the FIRST '|' is the threat type, everything after it is an extended regex
# (which may itself contain '|' alternations). The matchers in this file run
# each regex through `grep -iE`, i.e. case-insensitively and line by line.
_WARDEN_STRONG_PATTERNS=(
  # instruction_override — explicit attempts to discard the standing prompt.
  'instruction_override|ignore (all (of )?)?(the )?(previous|prior|above|preceding|earlier) (instructions|prompts|directions|context)'
  'instruction_override|disregard (all |the |any )?(previous|prior|above|earlier|preceding) (instructions|directions|prompts|rules)'
  'instruction_override|forget (everything|all (of )?your|all (the )?previous|your (previous )?instructions)'
  'instruction_override|(override|overrule|bypass) (your|the|all) (instructions|system prompt|guardrails|safety)'
  'instruction_override|do not (follow|obey|comply with) (the|your|any) (previous|prior|system|earlier) (instructions|prompt)'
  # prompt_injection — role / system-prompt hijack and delimiter spoofing.
  'prompt_injection|you are now (a|an|the|in|no longer|free|developer mode)'
  'prompt_injection|new (instructions|system prompt|directives?)( for you)?:'
  'prompt_injection|(reveal|print|repeat|disclose|show) (me )?(your|the) (system prompt|initial instructions|hidden instructions)'
  'prompt_injection|<\/?(system|instructions?|assistant|im_start|im_end)>'
  'prompt_injection|\[/?INST\]|<<SYS>>|<\|im_(start|end)\|>'
  # credential_exfiltration — moving secrets out.
  'credential_exfiltration|(send|post|upload|transmit|exfiltrate|email|leak|forward) .{0,40}(api[ _-]?key|password|secret|token|credential|\.env|private key|ssh key)'
  'credential_exfiltration|(curl|wget|fetch|http) .{0,80}(api[ _-]?key|token|secret|password|\$\{?[A-Z_]*(KEY|TOKEN|SECRET|PASSWORD))'
  'credential_exfiltration|(AWS_SECRET_ACCESS_KEY|ANTHROPIC_API_KEY|GITHUB_TOKEN|OPENAI_API_KEY|cat .{0,30}\.env)'
  # command_injection — get the agent to run attacker-chosen commands.
  'command_injection|(curl|wget) .{0,120}\| ?(ba)?sh'
  'command_injection|base64 (--?d|--decode) .{0,80}\| ?(ba)?sh'
  'command_injection|rm -rf (/|~|\$HOME|\*)'
  'command_injection|(run|execute|eval) (the following|this) (command|code|script|payload)'
)
47
+
48
# Weak (heuristic) signatures, in the same "threat_type|regex" format as the
# strong list: the threat type ends at the first '|', the rest is an extended
# regex matched case-insensitively.
_WARDEN_WEAK_PATTERNS=(
  # social_engineering — pressure and false authority.
  'social_engineering|do not (tell|inform|notify|alert) (the )?(user|human|operator)'
  'social_engineering|without (asking|informing|telling|notifying) (the )?(user|anyone)'
  'social_engineering|i am (your|the) (developer|administrator|admin|owner|creator|operator)'
  'social_engineering|as an? (authorized|trusted|admin|administrator|privileged) (user|agent|developer)'
  'social_engineering|this is (urgent|critical|an emergency|time.?sensitive)'
  # prompt_injection — softer instruction-shaped imperatives in fetched text.
  'prompt_injection|(important|attention|note to|message for|hey) (ai|assistant|claude|chatbot|llm|model)'
  'prompt_injection|(please |kindly )?(now )?(follow|execute|carry out) (these|the following) (instructions|steps|commands)'
  # command_injection — dynamic-evaluation call shapes: eval(...), exec(...),
  # system(...) with up to 60 characters of arguments on one line.
  'command_injection|(eval|exec|system)\(.{0,60}\)'
)
61
+
62
# Run one pattern list against the content and report the first hit.
#
# Arguments:
#   $1  - content to scan
#   $2… - pattern entries, each "threat_type|regex"
# Outputs: "threat_type|matched_regex" for the first entry that matches
# Returns: 0 on a match, 1 if nothing matches
#
# The content is fed to grep through a here-string rather than
# `printf … | grep -q`. `grep -q` exits as soon as it finds a match; with a
# pipe, the writer is then killed by SIGPIPE (or fails with EPIPE) on content
# larger than the pipe buffer, and under `set -o pipefail` the pipeline
# reports failure — turning a real match into "no match".
_warden_first_match() {
  local content="$1"
  shift
  local entry threat regex
  for entry in "$@"; do
    # Split on the first '|' only; the regex may contain further '|'.
    threat="${entry%%|*}"
    regex="${entry#*|}"
    if grep -iqE -- "$regex" <<<"$content" 2>/dev/null; then
      printf '%s|%s' "$threat" "$regex"
      return 0
    fi
  done
  return 1
}
78
+
79
# Count how many entries in a pattern list match the content (signal strength
# for borderline calls).
#
# Arguments:
#   $1  - content to scan
#   $2… - pattern entries, each "threat_type|regex"
# Outputs: the number of matching entries, as a decimal integer
#
# grep reads the content from a here-string, not from `printf … |`: `grep -q`
# exits on the first match, which on large content breaks the pipe and, under
# `set -o pipefail`, would make a matching entry count as a miss.
_warden_count_matches() {
  local content="$1"
  shift
  local entry regex count=0
  for entry in "$@"; do
    regex="${entry#*|}"
    if grep -iqE -- "$regex" <<<"$content" 2>/dev/null; then
      count=$((count + 1))
    fi
  done
  printf '%d' "$count"
}
92
+
93
# Classify content against the strong list first, then the weak list, and
# echo a JSON verdict:
#   {"severity":"strong|weak|none","threat_type":…,"matched_pattern":…,"hit_count":…}
#
# Arguments: $1 - content to classify
# The first tier with a hit decides the verdict; hit_count is the number of
# matching entries within that tier. If jq fails, a hand-built JSON object is
# printed instead, with matched_pattern left empty.
warden_pattern_classify() {
  local content="$1"
  local tier hit threat_type matched hits

  for tier in strong weak; do
    case "$tier" in
      strong)
        hit=$(_warden_first_match "$content" "${_WARDEN_STRONG_PATTERNS[@]}") || hit=""
        ;;
      weak)
        hit=$(_warden_first_match "$content" "${_WARDEN_WEAK_PATTERNS[@]}") || hit=""
        ;;
    esac

    if [[ -z "$hit" ]]; then
      continue
    fi

    # hit is "threat_type|regex"; split on the first '|' only.
    threat_type="${hit%%|*}"
    matched="${hit#*|}"

    if [[ "$tier" == "strong" ]]; then
      hits=$(_warden_count_matches "$content" "${_WARDEN_STRONG_PATTERNS[@]}")
    else
      hits=$(_warden_count_matches "$content" "${_WARDEN_WEAK_PATTERNS[@]}")
    fi

    jq -n \
      --arg sev "$tier" \
      --arg t "$threat_type" \
      --arg p "$matched" \
      --argjson n "$hits" \
      '{severity:$sev, threat_type:$t, matched_pattern:$p, hit_count:$n}' 2>/dev/null \
      || printf '{"severity":"%s","threat_type":"%s","matched_pattern":"","hit_count":%s}' "$tier" "$threat_type" "$hits"
    return 0
  done

  printf '{"severity":"none","threat_type":"none","matched_pattern":"","hit_count":0}'
}
@@ -0,0 +1,80 @@
1
+ #!/usr/bin/env bash
2
+ # Input sanitization for Warden evaluator-bound content.
3
+ #
4
+ # Applied before any ingested content is interpolated into the escalation
5
+ # evaluator prompt. The content warden scans is, by definition, untrusted —
6
+ # so before it is shown to the evaluator it must be neutralized against a
7
+ # second-order injection (content that tries to talk the evaluator out of
8
+ # flagging it).
9
+ #
10
+ # Exposes:
11
+ # warden_sanitize <string> <max_chars> # echoes sanitized, truncated string
12
+ #
13
+ # Sanitization steps (applied in order):
14
+ # 1. Null-byte removal
15
+ # 2. Control-character removal (0x00–0x1F and 0x7F, except \t and \n)
16
+ # 3. Prompt-delimiter stripping (evaluator prompt tag sequences → [STRIPPED])
17
+ # 4. Truncation to max_chars
18
+
19
# Tags that, if present in scanned content, would inject into the evaluator prompt.
# warden_sanitize replaces every occurrence of each entry, in the order listed,
# with the literal text [STRIPPED]. Entries are matched as exact,
# case-sensitive literals (not patterns).
_WARDEN_STRIP_SEQUENCES=(
  "<source_content>"
  "</source_content>"
  "<instructions>"
  "</instructions>"
  "<|"
  "[INST]"
  "[/INST]"
  "<<SYS>>"
  "<</SYS>>"
)
31
+
32
# Delete ASCII control characters from a string, keeping only tab (0x09) and
# newline (0x0A). Removed ranges: 0x00–0x08, 0x0B–0x1F (this includes carriage
# return) and 0x7F (DEL).
#
# Arguments: $1 - input string
# Outputs:   the filtered string on stdout
_warden_strip_control_chars() {
  local raw="$1"
  tr -d '\000-\010\013-\037\177' 2>/dev/null < <(printf '%s' "$raw")
}
39
+
40
# Replace every occurrence of a literal string with [STRIPPED].
#
# Arguments:
#   $1 - input string
#   $2 - literal needle to replace
# Outputs: the rewritten string on stdout
#
# Bash's own ${var//needle/repl} is used instead of sed because the needles
# contain '/', '[' and '|', which would clash with sed's delimiter and regex
# syntax. Quoting the needle inside the expansion makes it match literally
# rather than as a glob.
_warden_strip_literal() {
  local haystack="$1"
  local needle="$2"
  local replaced
  replaced="${haystack//"$needle"/[STRIPPED]}"
  printf '%s' "$replaced"
}
51
+
52
# Truncate a string to at most max_chars characters in total.
#
# Arguments:
#   $1 - input string (may span several lines)
#   $2 - maximum number of characters to keep. 0, empty, negative, or any
#        value that is not a plain decimal integer means "do not truncate".
# Outputs: the (possibly shortened) string on stdout
#
# Why not cut(1): `cut -c 1-N` applies the limit to each line separately, so
# multi-line content was never capped overall. Bash substring expansion counts
# across the whole string, newlines included.
#
# Why validate first: [[ "$x" -le 0 ]] evaluates x as an arithmetic
# expression, so a value such as 'a[$(cmd)]' would run cmd. The limit is
# supplied by the caller from configuration, so it is checked against a
# digits-only pattern before it is used as a number.
_warden_truncate() {
  local input="$1"
  local max_chars="${2:-0}"

  # Not a plain integer, or too long to fit safely in shell arithmetic:
  # treat as "no limit" (any string is shorter than a 16-digit limit anyway).
  if [[ ! "$max_chars" =~ ^[0-9]+$ ]] || ((${#max_chars} > 15)); then
    printf '%s' "$input"
    return
  fi

  # 10# forces base 10 so a leading zero is not read as octal.
  local limit=$((10#$max_chars))
  if ((limit == 0)); then
    printf '%s' "$input"
    return
  fi

  printf '%s' "${input:0:limit}"
}
62
+
63
# Full sanitization pipeline: strip control characters, replace every entry
# of _WARDEN_STRIP_SEQUENCES with [STRIPPED], then truncate.
#
# Arguments:
#   $1 - raw input string
#   $2 - max chars (0 = no truncation)
# Outputs: the sanitized string on stdout
warden_sanitize() {
  local raw="$1"
  local limit="${2:-0}"

  local cleaned
  cleaned=$(_warden_strip_control_chars "$raw")

  # Apply the strip sequences in their declared order.
  local idx
  for idx in "${!_WARDEN_STRIP_SEQUENCES[@]}"; do
    cleaned=$(_warden_strip_literal "$cleaned" "${_WARDEN_STRIP_SEQUENCES[idx]}")
  done

  cleaned=$(_warden_truncate "$cleaned" "$limit")
  printf '%s' "$cleaned"
}
@@ -0,0 +1,119 @@
1
+ #!/usr/bin/env bash
2
+ # Hybrid scanner orchestration for Warden.
3
+ #
4
+ # Combines the deterministic pattern floor (warden-patterns.sh) with optional
5
+ # LLM escalation (warden-evaluator.sh):
6
+ #
7
+ # strong pattern hit → detected immediately (no model call)
8
+ # weak pattern hit → escalate to the evaluator when enabled; otherwise
9
+ # fall back to the weak-pattern confidence
10
+ # no hit → clean (no model call)
11
+ #
12
+ # On evaluator error the scanner falls back to the pattern verdict, so a model
13
+ # outage degrades coverage but never silently closes the gate on every read.
14
+ #
15
+ # Depends on (sourced by the caller):
16
+ # warden-config.sh · warden-patterns.sh · warden-sanitizer.sh · warden-evaluator.sh
17
+ #
18
+ # Exposes:
19
+ # warden_scan <source_type> <content>
20
+ # → JSON {"detected":bool, "threat_type":"<t>", "confidence":<f>,
21
+ # "matched_pattern":"<p>", "method":"<m>", "rationale":"<str>"}
22
+
23
# awk-based float >= comparison. Returns 0 (true) if $1 >= $2.
#
# Arguments:
#   $1 - left operand  (defaults to 0 when unset or empty)
#   $2 - right operand (defaults to 0 when unset or empty)
#
# Values are passed via `awk -v` (data), never interpolated into the program
# string: thresholds can originate from repo-level .claude/settings.json, which
# is untrusted under warden's threat model.
#
# Both operands are coerced with `+ 0` before comparing. Without that, awk
# falls back to a STRING comparison whenever either value does not look like a
# number, so a threshold such as "abc" would sort above every confidence and
# silently prevent the gate from ever closing. With the coercion, non-numeric
# input counts as 0.
_warden_ge() {
  awk -v a="${1:-0}" -v b="${2:-0}" 'BEGIN { exit !((a + 0) >= (b + 0)) }' 2>/dev/null
}
32
+
33
# Escape a string for use inside a JSON string literal (fallback path only).
# Backslash, double quote, newline, carriage return and tab are escaped; any
# other control character is dropped.
_warden_scan_json_escape() {
  local s="$1"
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s" | tr -d '\001-\037'
}

# Build the scan verdict object and echo it as JSON:
#   {detected, threat_type, confidence, matched_pattern, method, rationale}
#
# Arguments:
#   $1 - detected    ("true"; anything else is emitted as false)
#   $2 - threat type
#   $3 - confidence  (JSON number; anything else is emitted as 0)
#   $4 - matched pattern
#   $5 - detection method
#   $6 - rationale text
#
# detected and confidence are normalized BEFORE jq runs. Previously a
# non-numeric confidence made jq fail and dropped into a printf fallback that
# pasted the rationale (text produced from untrusted content) into the JSON
# unescaped — yielding invalid JSON or letting that text add its own keys.
# The fallback, still used if jq itself is unavailable, now escapes every
# string field.
_warden_scan_result() {
  local detected="$1" threat="$2" confidence="${3:-0}" pattern="$4" method="$5" rationale="$6"

  [[ "$detected" == "true" ]] || detected="false"

  local json_number='^-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][-+]?[0-9]+)?$'
  [[ "$confidence" =~ $json_number ]] || confidence=0

  jq -n \
    --argjson detected "$detected" \
    --arg t "$threat" \
    --argjson c "$confidence" \
    --arg p "$pattern" \
    --arg m "$method" \
    --arg r "$rationale" \
    '{detected:$detected, threat_type:$t, confidence:$c, matched_pattern:$p, method:$m, rationale:$r}' \
    2>/dev/null \
    || printf '{"detected":%s,"threat_type":"%s","confidence":%s,"matched_pattern":"%s","method":"%s","rationale":"%s"}' \
      "$detected" \
      "$(_warden_scan_json_escape "$threat")" \
      "$confidence" \
      "$(_warden_scan_json_escape "$pattern")" \
      "$(_warden_scan_json_escape "$method")" \
      "$(_warden_scan_json_escape "$rationale")"
}
47
+
48
# Scan one piece of ingested content and echo the verdict JSON produced by
# _warden_scan_result.
#
# Arguments:
#   $1 - source type (passed through to the evaluator)
#   $2 - content to scan
#
# Flow: no pattern hit → clean; strong hit → verdict from the strong-pattern
# confidence; weak hit → ask the evaluator when escalation is enabled, and
# fall back to the weak-pattern confidence when escalation is off or the
# evaluator returns neither "injection" nor "clean". In every branch
# "detected" is true only when the confidence reaches close_threshold.
warden_scan() {
  local source_type="$1"
  local content="$2"

  # Thresholds, with built-in defaults for anything the config leaves empty.
  local close_threshold strong_conf weak_conf
  close_threshold=$(warden_config_get '.warden.detection.close_threshold')
  : "${close_threshold:=0.65}"
  strong_conf=$(warden_config_get '.warden.detection.strong_pattern_confidence')
  : "${strong_conf:=0.9}"
  weak_conf=$(warden_config_get '.warden.detection.weak_pattern_confidence')
  : "${weak_conf:=0.5}"

  # Deterministic pattern floor.
  local classify severity threat pattern
  classify=$(warden_pattern_classify "$content")
  severity=$(jq -r '.severity // "none"' 2>/dev/null <<<"$classify") || severity="none"
  threat=$(jq -r '.threat_type // "none"' 2>/dev/null <<<"$classify") || threat="none"
  pattern=$(jq -r '.matched_pattern // ""' 2>/dev/null <<<"$classify") || pattern=""

  local detected
  case "$severity" in
    none)
      # Clean: no signal at all.
      _warden_scan_result false "none" 0 "" "none" "no injection pattern matched"
      return 0
      ;;
    strong)
      # Strong: explicit, high-precision phrasing.
      if _warden_ge "$strong_conf" "$close_threshold"; then
        detected="true"
      else
        detected="false"
      fi
      _warden_scan_result "$detected" "$threat" "$strong_conf" "$pattern" "pattern_strong" "matched a strong injection signature"
      return 0
      ;;
  esac

  # Weak: borderline. Escalate to the evaluator when enabled.
  local escalation_enabled
  escalation_enabled=$(warden_config_get '.warden.escalation.enabled')
  : "${escalation_enabled:=true}"

  if [[ "$escalation_enabled" == "true" ]]; then
    local max_chars excerpt
    max_chars=$(warden_config_get '.warden.scan.max_content_chars')
    : "${max_chars:=20000}"
    excerpt=$(warden_sanitize "$content" "$max_chars")

    local eval_result decision eval_conf eval_threat eval_rationale
    eval_result=$(warden_evaluate "$source_type" "$excerpt" "$threat")
    decision=$(jq -r '.decision // "error"' 2>/dev/null <<<"$eval_result") || decision="error"
    eval_conf=$(jq -r '.confidence // 0' 2>/dev/null <<<"$eval_result") || eval_conf="0"
    eval_threat=$(jq -r '.threat_type // "none"' 2>/dev/null <<<"$eval_result") || eval_threat="none"
    eval_rationale=$(jq -r '.rationale // ""' 2>/dev/null <<<"$eval_result") || eval_rationale=""

    case "$decision" in
      injection)
        # Keep the pattern's threat type when the evaluator names none.
        if [[ "$eval_threat" == "none" || -z "$eval_threat" ]]; then
          eval_threat="$threat"
        fi
        if _warden_ge "$eval_conf" "$close_threshold"; then
          detected="true"
        else
          detected="false"
        fi
        _warden_scan_result "$detected" "$eval_threat" "$eval_conf" "$pattern" "escalation" "$eval_rationale"
        return 0
        ;;
      clean)
        _warden_scan_result false "none" "$eval_conf" "$pattern" "escalation" "evaluator judged the borderline content clean"
        return 0
        ;;
    esac
    # Any other decision (e.g. "error") falls through to the weak verdict.
  fi

  # Weak fallback: no escalation, or the evaluator gave no usable decision.
  if _warden_ge "$weak_conf" "$close_threshold"; then
    detected="true"
  else
    detected="false"
  fi
  _warden_scan_result "$detected" "$threat" "$weak_conf" "$pattern" "pattern_weak" "weak injection signal; escalation unavailable"
  return 0
}
@@ -0,0 +1,50 @@
1
+ #!/usr/bin/env bash
2
+ # Minimal ULID generator for Warden threat_id values.
3
+ #
4
+ # Spec: https://github.com/ulid/spec
5
+ # - 48-bit timestamp (ms since epoch) → 10 chars Crockford Base32
6
+ # - 80-bit randomness → 16 chars Crockford Base32
7
+ # - lexicographically sortable, time-ordered
8
+
9
+ _WARDEN_ULID_ALPHABET="0123456789ABCDEFGHJKMNPQRSTVWXYZ"
10
+
11
+ _warden_ulid_encode() {
12
+ local n="$1"
13
+ local len="$2"
14
+ local out=""
15
+ local i
16
+ for ((i = 0; i < len; i++)); do
17
+ out="${_WARDEN_ULID_ALPHABET:$((n % 32)):1}${out}"
18
+ n=$((n / 32))
19
+ done
20
+ printf '%s' "$out"
21
+ }
22
+
23
# Generate a 26-character ULID: 10 characters of millisecond timestamp
# followed by 16 characters of randomness, all Crockford Base32.
#
# Outputs: the ULID on stdout, no trailing newline
#
# Timestamp sources are tried in order and each result is VALIDATED, because
# `date +%s%3N` exits 0 even where %N is unsupported (its output then contains
# literal characters such as "3N", which would break the arithmetic in the
# encoder):
#   1. date +%s%3N   - accepted only if it is exactly 13 digits
#   2. python3       - accepted only if it is all digits
#   3. date +%s * 1000 (second resolution; 0 if even that fails)
warden_ulid() {
  local now_ms="" candidate

  candidate=$(date +%s%3N 2>/dev/null) || candidate=""
  if [[ "$candidate" =~ ^[0-9]{13}$ ]]; then
    now_ms="$candidate"
  fi

  if [[ -z "$now_ms" ]]; then
    candidate=$(python3 -c 'import time; print(int(time.time() * 1000))' 2>/dev/null) || candidate=""
    if [[ "$candidate" =~ ^[0-9]+$ ]]; then
      now_ms="$candidate"
    fi
  fi

  if [[ -z "$now_ms" ]]; then
    candidate=$(date +%s 2>/dev/null) || candidate=""
    [[ "$candidate" =~ ^[0-9]+$ ]] || candidate=0
    now_ms=$((candidate * 1000))
  fi

  # 80 bits of randomness, handled as two 40-bit halves (8 Base32 chars each).
  local rand_hex rand_hi rand_lo
  rand_hex=$(openssl rand -hex 10 2>/dev/null) || rand_hex=""
  if [[ "$rand_hex" =~ ^[0-9a-fA-F]{20}$ ]]; then
    rand_hi=$((16#${rand_hex:0:10}))
    rand_lo=$((16#${rand_hex:10:10}))
  else
    # No usable openssl output: build each half from three $RANDOM draws
    # (15 + 15 + 8 bits), masked to 40 bits.
    rand_hi=$((RANDOM * 32768 + RANDOM))
    rand_lo=$((RANDOM * 32768 + RANDOM))
    rand_hi=$(((rand_hi * 256 + RANDOM % 256) & ((1 << 40) - 1)))
    rand_lo=$(((rand_lo * 256 + RANDOM % 256) & ((1 << 40) - 1)))
  fi

  local ts_part hi_part lo_part
  ts_part=$(_warden_ulid_encode "$now_ms" 10)
  hi_part=$(_warden_ulid_encode "$rand_hi" 8)
  lo_part=$(_warden_ulid_encode "$rand_lo" 8)

  printf '%s%s%s' "$ts_part" "$hi_part" "$lo_part"
}
@@ -0,0 +1,49 @@
1
+ ---
2
+ name: warden
3
+ description: Inspect and control the Warden content gate. Shows whether the session's content gate is open or closed and the threat that closed it (`/warden` or `/warden status`), and explicitly clears a closed gate to re-enable Write/Edit/Bash (`/warden clear`). Clearing is the only sanctioned way to reopen the gate — it records a user override in the warden.* event stream. Use when Warden has blocked a write/edit/bash operation, or when the user asks to check or clear the content gate.
4
+ ---
5
+
6
+ # Warden: Content Gate Control
7
+
8
+ You are operating the **Warden** content gate — the user-facing control surface for the gate that Warden's hooks open and close automatically.
9
+
10
+ Warden enforces Meta's **Agents Rule of Two**: an agent should hold at most two of {access to private data, ability to take external actions, processing of untrusted content}. When Warden's detection hook finds an injection pattern in content ingested via WebFetch or Read, it closes a session-scoped gate that revokes the *external actions* property — blocking Write, Edit, MultiEdit, and Bash until the user explicitly clears it. This skill is that explicit clear (and a status readout).
11
+
12
+ ## Parse the request
13
+
14
+ Read the user's argument after `/warden`:
15
+
16
+ - no argument, or `status` → **status** action
17
+ - `clear`, `reopen`, `override`, `unblock` → **clear** action
18
+
19
+ If the user passed a session id explicitly (rare), capture it as the optional second argument.
20
+
21
+ ## Run the control surface
22
+
23
+ Source the plugin helpers and invoke `warden_cli`. Run this in a single bash call:
24
+
25
+ ```bash
26
+ set -uo pipefail
27
+ source "$CLAUDE_PLUGIN_ROOT/scripts/lib/warden-config.sh"
28
+ source "$CLAUDE_PLUGIN_ROOT/scripts/lib/warden-events.sh"
29
+ source "$CLAUDE_PLUGIN_ROOT/scripts/lib/warden-gate-state.sh"
30
+ source "$CLAUDE_PLUGIN_ROOT/scripts/lib/warden-cli.sh"
31
+
32
+ # action is "status" or "clear"; SESSION_ID_ARG is optional and usually empty.
33
+ warden_cli "<action>" "${SESSION_ID_ARG:-}"
34
+ ```
35
+
36
+ `warden_cli` resolves the session automatically: it prefers `$CLAUDE_SESSION_ID`, falls back to the single closed gate if exactly one exists, and reports ambiguity if several sessions have closed gates (re-run with an explicit session id in that case).
37
+
38
+ ## Behavior
39
+
40
+ - **status** — prints whether the gate is OPEN or CLOSED. When closed, prints the recorded threat: `threat_type`, `source_type`, source URL/path, confidence, detection method, matched pattern, and the flagged snippet (if storage is enabled).
41
+ - **clear** — verifies the gate is closed, removes the lock, and emits `warden.threat.cleared` with `cleared_by: user_override`. This re-enables Write, Edit, MultiEdit, and Bash for the session.
42
+
43
+ ## After clearing
44
+
45
+ When you clear the gate on the user's behalf:
46
+
47
+ 1. Confirm the gate is reopened and name the source that triggered it.
48
+ 2. Remind the user briefly that the flagged content is still in the conversation context — clearing the gate does not remove it. If they have not reviewed the source, suggest they do before continuing with external actions.
49
+ 3. Do not clear a gate the user has not asked you to clear. Closing is automatic; clearing is always a deliberate user decision.
@@ -142,6 +142,22 @@
142
142
  "jsonpath": "$.version"
143
143
  }
144
144
  ]
145
+ },
146
+ "plugins/warden": {
147
+ "changelog-path": "CHANGELOG.md",
148
+ "release-type": "simple",
149
+ "bump-minor-pre-major": true,
150
+ "bump-patch-for-minor-pre-major": false,
151
+ "component": "warden",
152
+ "draft": false,
153
+ "prerelease": false,
154
+ "extra-files": [
155
+ {
156
+ "type": "json",
157
+ "path": ".claude-plugin/plugin.json",
158
+ "jsonpath": "$.version"
159
+ }
160
+ ]
145
161
  }
146
162
  },
147
163
  "$schema": "https://raw.githubusercontent.com/googleapis/release-please/main/schemas/config.json"
@@ -0,0 +1,54 @@
1
#!/usr/bin/env bats
# Tests for warden-config.sh: the enabled flag, user-level vs repo-level
# settings.json precedence, deep-merge of defaults, and JSON array reads.

# Load the shared test helpers, then source the config library under test
# from the warden plugin directory.
setup() {
  source "${BATS_TEST_DIRNAME}/../helpers/setup.bash"
  setup_test_env

  PLUGIN_ROOT="${REPO_ROOT}/plugins/warden"
  export CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT"
  # shellcheck disable=SC1091
  source "${PLUGIN_ROOT}/scripts/lib/warden-config.sh"
}

# With no settings files written, warden_config_enabled must report failure.
@test "warden is disabled by default" {
  warden_config_load ""
  run warden_config_enabled
  [ "$status" -ne 0 ]
}

# Writing enabled:true to ~/.claude/settings.json turns warden on.
# NOTE(review): this writes under ${HOME}; presumably setup_test_env points
# HOME at a scratch directory — confirm in helpers/setup.bash.
@test "user-level settings.json can enable warden" {
  mkdir -p "${HOME}/.claude"
  printf '%s\n' '{"warden":{"enabled":true}}' > "${HOME}/.claude/settings.json"
  warden_config_load ""
  run warden_config_enabled
  [ "$status" -eq 0 ]
}

# User level says enabled:true, repo level says enabled:false; loading with
# the repo path must leave warden disabled.
@test "repo-level settings.json overrides user-level" {
  mkdir -p "${HOME}/.claude"
  printf '%s\n' '{"warden":{"enabled":true}}' > "${HOME}/.claude/settings.json"
  local repo="${BATS_TEST_TMPDIR}/repo"
  mkdir -p "${repo}/.claude"
  printf '%s\n' '{"warden":{"enabled":false}}' > "${repo}/.claude/settings.json"
  warden_config_load "$repo"
  run warden_config_enabled
  [ "$status" -ne 0 ]
}

# An overlay that sets only a couple of keys must not wipe the other defaults.
@test "defaults are preserved when an overlay sets only some keys" {
  mkdir -p "${HOME}/.claude"
  printf '%s\n' '{"warden":{"enabled":true,"escalation":{"enabled":false}}}' > "${HOME}/.claude/settings.json"
  warden_config_load ""
  # escalation.enabled overridden to false…
  [ "$(warden_config_get '.warden.escalation.enabled')" = "false" ]
  # …but shipped defaults survive the deep merge.
  [ "$(warden_config_get '.warden.detection.close_threshold')" = "0.65" ]
  [ "$(warden_config_get '.warden.scan.max_content_chars')" = "20000" ]
}

# warden_config_get_json must return the sources list as a JSON array that
# contains both "web_fetch" and "file_read".
@test "config_get_json returns arrays" {
  warden_config_load ""
  run warden_config_get_json '.warden.scan.sources'
  [ "$status" -eq 0 ]
  printf '%s' "$output" | jq -e 'index("web_fetch") != null and index("file_read") != null' >/dev/null
}