agentshield-sdk 13.3.0 → 14.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +161 -0
- package/README.md +13 -2
- package/package.json +2 -2
- package/src/audit-immutable.js +59 -1
- package/src/audit.js +1 -1
- package/src/cross-turn.js +25 -1
- package/src/detector-core.js +333 -51
- package/src/document-scanner.js +20 -0
- package/src/enterprise.js +127 -12
- package/src/integrations-frameworks.js +373 -0
- package/src/integrations.js +207 -0
- package/src/main.js +10 -14
- package/src/memory-guard.js +60 -0
- package/src/middleware.js +107 -2
- package/src/native-scanner.js +104 -0
- package/src/plugin-system.js +422 -6
- package/src/supply-chain-scanner.js +112 -2
- package/src/sybil-detector.js +3 -6
- package/src/persistent-learning.js +0 -161
- package/src/threat-intel-federation.js +0 -343
package/CHANGELOG.md
CHANGED
|
@@ -4,6 +4,167 @@ All notable changes to Agent Shield will be documented in this file.
|
|
|
4
4
|
|
|
5
5
|
This project follows [Semantic Versioning](https://semver.org/).
|
|
6
6
|
|
|
7
|
+
## [14.0.0] - 2026-04-16
|
|
8
|
+
|
|
9
|
+
### Major Release — Platform Parity + Framework Integrations
|
|
10
|
+
|
|
11
|
+
Agent Shield v14 closes the gap with Microsoft's Agent Governance Toolkit while maintaining our zero-dependency, local-first architecture.
|
|
12
|
+
|
|
13
|
+
#### OpenAI Agents SDK Integration (April 2026 Release)
|
|
14
|
+
|
|
15
|
+
- `shieldOpenAIAgent()` — drop-in guardrails for `@openai/agents` (Node) and `openai-agents` (Python)
|
|
16
|
+
- Input, output, and tool guardrails that work with the SDK's native Guardrail primitive
|
|
17
|
+
- Handles all OpenAI SDK input shapes: string, message array, content parts
|
|
18
|
+
- Node: 34 integration tests. Python: 15 integration tests.
|
|
19
|
+
- Example at `examples/openai-agents-sdk.js`
|
|
20
|
+
|
|
21
|
+
#### Framework Parity (CrewAI, Google ADK, MS Agent Framework)
|
|
22
|
+
|
|
23
|
+
- `shieldCrewAI()` — task-level input/output scanning for CrewAI workflows
|
|
24
|
+
- `shieldGoogleADK()` — tool call, tool result, and generation prompt scanning for Google ADK
|
|
25
|
+
- `shieldMSAgentFramework()` — async middleware for Microsoft Agent Framework pipeline
|
|
26
|
+
- 36 integration tests across all three frameworks
|
|
27
|
+
|
|
28
|
+
#### Rust Core NAPI Binding
|
|
29
|
+
|
|
30
|
+
- Native Rust scanner bridge (`src/native-scanner.js`) loads compiled NAPI module when available
|
|
31
|
+
- Falls back silently to pure-JS scanner when not compiled
|
|
32
|
+
- Build: `cd rust-core && cargo build --release --features node`
|
|
33
|
+
- `scanText`, `scanBatch`, `getPatterns` exposed via NAPI-RS
|
|
34
|
+
|
|
35
|
+
#### Python + Go SDK Pattern Sync
|
|
36
|
+
|
|
37
|
+
- Python SDK: 141 → 179 patterns (+38), 10 new categories
|
|
38
|
+
- Go SDK: 141 → 179 patterns (+38), 10 new categories
|
|
39
|
+
- All v13.4-v13.6 patterns ported: XSS, SVG, encoding chain, steganographic, mcp.json, offensive agent, cloud IAM, structured data, memory poisoning, prompt extraction
|
|
40
|
+
|
|
41
|
+
#### Plugin VM Sandbox + Signature Verification
|
|
42
|
+
|
|
43
|
+
- `IsolatedPluginSandbox` — real `vm` module isolation, not just error catching
|
|
44
|
+
- Plugins cannot access `process`, `fs`, `net`, `child_process`, `require`
|
|
45
|
+
- Preemptive timeout via `vm.Script` (kills infinite loops)
|
|
46
|
+
- Prototype pollution contained (realm-isolated built-ins)
|
|
47
|
+
- `PluginVerifier` with HMAC-SHA256 signature validation
|
|
48
|
+
- `PluginManifest` schema validation with capability declarations
|
|
49
|
+
- 58 sandbox tests passing
|
|
50
|
+
|
|
51
|
+
#### Performance
|
|
52
|
+
|
|
53
|
+
- Long benign fast path: 15.7ms → 112μs p99 (140x faster) via attack-indicator prefilter
|
|
54
|
+
- Honest latency benchmark at `benchmark/latency-honest.js` with p50/p95/p99/p99.9
|
|
55
|
+
- ReDoS audit: 0 risky patterns across all detectors (all <0.4ms worst case)
|
|
56
|
+
- Pattern quality audit: 120 active / 177 defensive patterns, 0 false positives
|
|
57
|
+
|
|
58
|
+
#### Security Hardening
|
|
59
|
+
|
|
60
|
+
- Express middleware: 1MB default body-size limit
|
|
61
|
+
- Multi-tenant: `tenantVerifier` + `strictAuth` options, `withAuth()` helper
|
|
62
|
+
- Microsoft Agent Governance Toolkit parity audit at `research/ms-agent-toolkit-parity.md`
|
|
63
|
+
|
|
64
|
+
#### Developer Experience
|
|
65
|
+
|
|
66
|
+
- `GETTING_STARTED.md` — 5-minute path from install to protected agent
|
|
67
|
+
- All framework examples in one place: Anthropic, OpenAI, OpenAI Agents SDK, LangChain, Express, MCP, CrewAI, Google ADK, MS Agent Framework
|
|
68
|
+
|
|
69
|
+
## [13.6.0] - 2026-04-16
|
|
70
|
+
|
|
71
|
+
### Performance Leap + Security Hardening
|
|
72
|
+
|
|
73
|
+
Path A polish pass — close security scan gaps, honest performance work, real audits.
|
|
74
|
+
|
|
75
|
+
#### Performance
|
|
76
|
+
|
|
77
|
+
- **Fast path for long clean text**: 15.7ms p99 → **112μs p99** on 5KB benign documents. 140x speedup.
|
|
78
|
+
- Added `PRIMARY_ATTACK_INDICATORS` prefilter — a single cheap regex matching only attack-specific phrases (not common English like "eval" or "token").
|
|
79
|
+
- If text is long, contains no attack phrases, no non-ASCII, and no obfuscation chars → skip the full pattern + normalization pipeline.
|
|
80
|
+
- Zero recall loss: full red team (617 attacks) still 100%, shield score still 100/100.
|
|
81
|
+
- **Honest latency benchmark** (`benchmark/latency-honest.js`): real p50/p95/p99/p99.9/max numbers instead of averages.
|
|
82
|
+
- Best-case p99: 112μs
|
|
83
|
+
- Mean p99: 1.18ms
|
|
84
|
+
- Worst-case p99: 3.62ms (long malicious — full pattern set runs)
|
|
85
|
+
- Microsoft Agent Governance Toolkit claims <0.1ms p99. We're 36.2x that in the worst case, faster on short inputs.
|
|
86
|
+
|
|
87
|
+
#### Security
|
|
88
|
+
|
|
89
|
+
- **Plugin VM sandbox** (`IsolatedPluginSandbox`): real isolation using Node `vm` module.
|
|
90
|
+
- Blocks `process`, `require` (whitelisted only), `fs`/`net`/`http`/`child_process`, `new Function()`.
|
|
91
|
+
- Prototype pollution contained — each sandbox has realm-isolated built-ins.
|
|
92
|
+
- Preemptive timeout via `vm.Script` (kills infinite loops).
|
|
93
|
+
- HMAC-SHA256 plugin signing + `PluginVerifier` + `PluginManifest` schema validation.
|
|
94
|
+
- 58 new tests covering sandbox escape attempts, signature verification, manifest validation.
|
|
95
|
+
- **Express middleware body-size limits**: `options.maxBodySize` (1MB default) with raw-stream enforcement.
|
|
96
|
+
- **Multi-tenant auth validation**: `options.tenantVerifier` + `options.strictAuth` + `withAuth()` helper.
|
|
97
|
+
|
|
98
|
+
#### Quality & Parity
|
|
99
|
+
|
|
100
|
+
- **ReDoS audit**: every pattern tested against adversarial inputs. **0 risky patterns** — worst case 0.4ms per pattern evaluation.
|
|
101
|
+
- **Pattern quality audit**: 120 active patterns doing the work, 177 dead patterns (defensive, never false-positive on benchmark corpus).
|
|
102
|
+
- Python SDK (282 patterns) and Go SDK (141 patterns) pattern-sync deferred to v14.
|
|
103
|
+
|
|
104
|
+
## [13.5.0] - 2026-04-16
|
|
105
|
+
|
|
106
|
+
### Detection Hardening + Security Scan Remediation
|
|
107
|
+
|
|
108
|
+
Tightens existing defenses based on Unit 42 real-world attack research and addresses findings from the Agent Shield security scan.
|
|
109
|
+
|
|
110
|
+
#### Detector Core — 11 New Patterns (3 categories)
|
|
111
|
+
|
|
112
|
+
- **Encoding chain detection** (3 patterns) — Detects multi-layer encoding (base64 inside unicode inside URL encoding). Addresses evasion technique that bypasses single-layer decoders.
|
|
113
|
+
- **SVG-based injection** (4 patterns) — Detects hidden prompts in SVG elements, foreignObject, hidden text, and desc tags. Addresses Unit 42 finding of real-world attacks using SVG encapsulation with 24 layered injection attempts.
|
|
114
|
+
- **Structured data injection** (4 patterns) — Detects hidden instructions in JSON metadata fields, XML CDATA sections, YAML/CSV comments, and comment syntax across formats.
|
|
115
|
+
|
|
116
|
+
#### Cross-Turn Detector — Crescendo Attack Defense
|
|
117
|
+
|
|
118
|
+
- 5 new escalation signal patterns for crescendo attacks: hypothetical framing, imaginary scenarios, permission boundary softening, false-prior-interaction claims, similarity-based escalation.
|
|
119
|
+
- New crescendo-specific detection: flags conversations that start with hypothetical/theoretical framing and drift toward sensitive/dangerous topics over multiple turns.
|
|
120
|
+
|
|
121
|
+
#### MemoryGuard — Persistent Memory Poisoning Defense
|
|
122
|
+
|
|
123
|
+
- `scanSummarization(originalMessages, summary)` detects when context compaction silently injects instructions. Addresses Unit 42 March 2026 research on persistent memory poisoning that survives across sessions.
|
|
124
|
+
|
|
125
|
+
#### Security Scan Remediation
|
|
126
|
+
|
|
127
|
+
- **Sidecar server**: API key authentication, request body size limit (1MB default), rate limiting (100 req/min default), CORS hardened from `*` to `same-origin`.
|
|
128
|
+
- **Dashboard WebSocket**: Authentication token support, max connections limit (50 default), startup warning if no auth configured.
|
|
129
|
+
- **GitHub App**: Webhook signature enforced for non-localhost requests, CRITICAL warning if `GITHUB_WEBHOOK_SECRET` not set.
|
|
130
|
+
- **Document scanner**: `maxDocumentSize` limit (10MB default) prevents DoS via oversized documents.
|
|
131
|
+
- **Audit logs**: `sanitizeLogs` option redacts emails, SSNs, API keys, and truncates content fields before writing.
|
|
132
|
+
|
|
133
|
+
## [13.4.0] - 2026-04-14
|
|
134
|
+
|
|
135
|
+
### April 2026 Threat Response
|
|
136
|
+
|
|
137
|
+
Security updates addressing vulnerabilities and attack techniques discovered April 1-14, 2026.
|
|
138
|
+
|
|
139
|
+
#### Supply Chain Scanner — 16 New CVEs
|
|
140
|
+
|
|
141
|
+
- **CVE-2026-5058** (CVSS 9.8) — AWS MCP Server command injection RCE, no auth required
|
|
142
|
+
- **CVE-2026-5059** — AWS MCP Server remote code execution
|
|
143
|
+
- **CVE-2026-32211** (CVSS 9.1) — Azure MCP Server has no authentication at all
|
|
144
|
+
- **CVE-2026-21518** — VS Code mcp.json command injection (malicious project files)
|
|
145
|
+
- **CVE-2026-33579** — OpenClaw silent admin takeover (patched April 5)
|
|
146
|
+
- **CVE-2026-24763** — OpenClaw command injection
|
|
147
|
+
- **CVE-2026-26322** — OpenClaw SSRF
|
|
148
|
+
- **CVE-2026-26329** — OpenClaw path traversal / local file read
|
|
149
|
+
- **CVE-2026-30741** — OpenClaw prompt-injection-driven code execution
|
|
150
|
+
- **CVE-2025-59528** (CVSS 10.0) — Flowise RCE via MCP node, actively exploited since April 6, 12,000+ instances exposed
|
|
151
|
+
- **CVE-2025-8943** — Flowise missing authentication
|
|
152
|
+
- **CVE-2025-26319** — Flowise arbitrary file upload
|
|
153
|
+
- **CVE-2026-5322** — mcp-data-vis SQL injection
|
|
154
|
+
- **CVE-2026-6130** — chatbox MCP OS command injection
|
|
155
|
+
- **CVE-2026-5023** — codebase-mcp OS command injection RCE
|
|
156
|
+
|
|
157
|
+
Updated OpenClaw malicious skill count: 820 → 1,184+ confirmed on ClawHub (3.5x growth).
|
|
158
|
+
Added aws-mcp-server-unpatched and flowise-unpatched to known-bad server blocklist.
|
|
159
|
+
|
|
160
|
+
#### Detector Core — 15 New Detection Patterns (5 categories)
|
|
161
|
+
|
|
162
|
+
- **XSS-in-agent-output** (5 patterns) — Catches XSS payloads embedded in AI-generated HTML: script tags, event handlers, javascript: URIs, iframe injection, img onerror. Addresses new attack vector where prompt injections deliver XSS through agent output.
|
|
163
|
+
- **Acrostic/steganographic injection** (2 patterns) — Detects hidden instructions where first characters of consecutive lines spell injection keywords. Addresses 93% evasion success rate reported in April 2026 research.
|
|
164
|
+
- **MCP config injection** (2 patterns) — Detects command injection in mcp.json files. Addresses CVE-2026-21518 VS Code attack vector.
|
|
165
|
+
- **Offensive agent behavior** (3 patterns) — Detects AI agents being used as attack tools: exploitation language, C2 infrastructure, credential theft operations. Addresses April 2026 incident where AI agent compromised 600+ firewalls autonomously.
|
|
166
|
+
- **Cloud IAM overpermission** (3 patterns) — Detects wildcard IAM policies enabling "Agent God Mode". Addresses Palo Alto Unit 42 discovery of AWS AgentCore default role vulnerability.
|
|
167
|
+
|
|
7
168
|
## [13.3.0] - 2026-04-06
|
|
8
169
|
|
|
9
170
|
### New SDK Modules
|
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Agent Shield
|
|
2
2
|
|
|
3
|
-
[npm package](https://www.npmjs.com/package/agentshield-sdk)
|
|
4
4
|
[License](LICENSE)
|
|
5
5
|
[](#)
|
|
6
6
|
[](#)
|
|
@@ -34,7 +34,7 @@ if (result.blocked) return 'Blocked for safety.';
|
|
|
34
34
|
| Self-training convergence | **0% bypass in 3 cycles** |
|
|
35
35
|
| Avg latency | **< 0.4ms** |
|
|
36
36
|
|
|
37
|
-
Detection stack:
|
|
37
|
+
Detection stack: 115+ regex patterns, 35-feature logistic regression + k-NN ensemble, 5-layer evasion resistance, 19-language support, chunked scanning, adversarial self-training loop.
|
|
38
38
|
|
|
39
39
|
```bash
|
|
40
40
|
# Verify locally
|
|
@@ -75,6 +75,17 @@ const client = shieldAnthropicClient(new Anthropic(), { blockOnThreat: true });
|
|
|
75
75
|
const { shieldOpenAIClient } = require('agentshield-sdk');
|
|
76
76
|
const client = shieldOpenAIClient(new OpenAI(), { blockOnThreat: true });
|
|
77
77
|
|
|
78
|
+
// OpenAI Agents SDK (@openai/agents, April 2026)
|
|
79
|
+
const { Agent, run } = require('@openai/agents');
|
|
80
|
+
const { shieldOpenAIAgent } = require('agentshield-sdk');
|
|
81
|
+
const { inputGuardrail, outputGuardrail, toolGuardrail } = shieldOpenAIAgent({ blockOnThreat: true });
|
|
82
|
+
const agent = new Agent({
|
|
83
|
+
name: 'Assistant',
|
|
84
|
+
instructions: 'You are a helpful assistant',
|
|
85
|
+
inputGuardrails: [inputGuardrail],
|
|
86
|
+
outputGuardrails: [outputGuardrail]
|
|
87
|
+
});
|
|
88
|
+
|
|
78
89
|
// LangChain
|
|
79
90
|
const { ShieldCallbackHandler } = require('agentshield-sdk');
|
|
80
91
|
const chain = new LLMChain({ llm, prompt, callbacks: [new ShieldCallbackHandler()] });
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agentshield-sdk",
|
|
3
|
-
"version": "13.3.0",
|
|
3
|
+
"version": "14.0.0",
|
|
4
4
|
"description": "SOTA AI agent security SDK. F1 1.000 on BIPIA/HackAPrompt/MCPTox/Multilingual benchmarks. 400+ exports, 100+ modules. Zero dependencies, runs locally.",
|
|
5
5
|
"main": "src/main.js",
|
|
6
6
|
"types": "types/index.d.ts",
|
|
@@ -32,7 +32,7 @@
|
|
|
32
32
|
},
|
|
33
33
|
"sideEffects": false,
|
|
34
34
|
"scripts": {
|
|
35
|
-
"test": "node test/test.js && node test/test-modules.js && node test/test-new-features.js && node test/test-mcp-guard.js && node test/test-supply-chain-scanner.js && node test/test-owasp-agentic.js && node test/test-redteam-cli.js && node test/test-drift-monitor.js && node test/test-micro-model.js && node test/test-level5.js && node test/test-sota.js && node test/test-cross-turn.js && node test/test-v12.js && node test/test-traps.js && node test/test-deepmind.js && node test/test-render-differential.js && node test/test-sybil.js && node test/test-side-channel.js",
|
|
35
|
+
"test": "node test/test.js && node test/test-modules.js && node test/test-new-features.js && node test/test-mcp-guard.js && node test/test-supply-chain-scanner.js && node test/test-owasp-agentic.js && node test/test-redteam-cli.js && node test/test-drift-monitor.js && node test/test-micro-model.js && node test/test-level5.js && node test/test-sota.js && node test/test-cross-turn.js && node test/test-v12.js && node test/test-traps.js && node test/test-deepmind.js && node test/test-render-differential.js && node test/test-sybil.js && node test/test-side-channel.js && node test/test-plugin-sandbox.js && node test/test-openai-agents-sdk.js && node test/test-framework-integrations.js",
|
|
36
36
|
"test:new-products": "node test/test-mcp-guard.js && node test/test-supply-chain-scanner.js && node test/test-owasp-agentic.js && node test/test-redteam-cli.js && node test/test-drift-monitor.js && node test/test-micro-model.js",
|
|
37
37
|
"test:all": "node test/test-all-40-features.js",
|
|
38
38
|
"test:mcp": "node test/test-mcp-security.js",
|
package/src/audit-immutable.js
CHANGED
|
@@ -584,8 +584,10 @@ class ImmutableAuditLog {
|
|
|
584
584
|
* @param {number} [options.maxAge=0] - Maximum age in milliseconds (0 = unlimited).
|
|
585
585
|
* @param {function} [options.archiveCallback] - Called with removed entries during retention enforcement. Signature: (entries: AuditEntry[]) => void.
|
|
586
586
|
* @param {string} [options.genesisHash] - Custom genesis hash (defaults to GENESIS_HASH).
|
|
587
|
+
* @param {boolean} [options.sanitizeLogs=false] - Redact sensitive content (emails, SSNs, API keys) before writing to the chain.
|
|
587
588
|
*/
|
|
588
589
|
constructor(options = {}) {
|
|
590
|
+
this.options = options;
|
|
589
591
|
this._store = options.store || new MemoryStore();
|
|
590
592
|
this._maxEntries = options.maxEntries || 0;
|
|
591
593
|
this._maxAge = options.maxAge || 0;
|
|
@@ -598,6 +600,59 @@ class ImmutableAuditLog {
|
|
|
598
600
|
console.log('[Agent Shield] ImmutableAuditLog initialized (store: %s)', this._store.constructor.name);
|
|
599
601
|
}
|
|
600
602
|
|
|
603
|
+
/**
|
|
604
|
+
* Sanitize an entry's data object by redacting sensitive content.
|
|
605
|
+
* Addresses the security scan finding about audit logs containing sensitive prompt data.
|
|
606
|
+
*
|
|
607
|
+
* Redacts:
|
|
608
|
+
* - Email addresses -> [EMAIL_REDACTED]
|
|
609
|
+
* - SSN patterns (XXX-XX-XXXX) -> [SSN_REDACTED]
|
|
610
|
+
* - API key patterns (sk-..., key-..., token-...) -> [KEY_REDACTED]
|
|
611
|
+
* - Truncates 'content' and 'input' string fields longer than 500 characters to their first 500 characters followed by a '...[TRUNCATED]' marker
|
|
612
|
+
*
|
|
613
|
+
* @param {object} entry - The data object to sanitize.
|
|
614
|
+
* @returns {object} A sanitized deep copy of the data object, or the original object unchanged when `options.sanitizeLogs` is not enabled.
|
|
615
|
+
*/
|
|
616
|
+
sanitize(entry) {
|
|
617
|
+
if (!this.options.sanitizeLogs) {
|
|
618
|
+
return entry;
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
const sanitized = JSON.parse(JSON.stringify(entry));
|
|
622
|
+
|
|
623
|
+
const redactString = (str) => {
|
|
624
|
+
if (typeof str !== 'string') return str;
|
|
625
|
+
// Redact email addresses
|
|
626
|
+
str = str.replace(/[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}/g, '[EMAIL_REDACTED]');
|
|
627
|
+
// Redact SSN patterns (XXX-XX-XXXX)
|
|
628
|
+
str = str.replace(/\b\d{3}-\d{2}-\d{4}\b/g, '[SSN_REDACTED]');
|
|
629
|
+
// Redact API key patterns (sk-..., key-..., token-...)
|
|
630
|
+
str = str.replace(/\b(?:sk|key|token)-[a-zA-Z0-9_\-]{8,}\b/g, '[KEY_REDACTED]');
|
|
631
|
+
return str;
|
|
632
|
+
};
|
|
633
|
+
|
|
634
|
+
const redactObject = (obj) => {
|
|
635
|
+
if (obj === null || obj === undefined) return obj;
|
|
636
|
+
if (typeof obj === 'string') return redactString(obj);
|
|
637
|
+
if (Array.isArray(obj)) return obj.map(item => redactObject(item));
|
|
638
|
+
if (typeof obj === 'object') {
|
|
639
|
+
const result = {};
|
|
640
|
+
for (const key of Object.keys(obj)) {
|
|
641
|
+
let value = redactObject(obj[key]);
|
|
642
|
+
// Truncate content and input fields to 500 chars
|
|
643
|
+
if ((key === 'content' || key === 'input') && typeof value === 'string' && value.length > 500) {
|
|
644
|
+
value = value.slice(0, 500) + '...[TRUNCATED]';
|
|
645
|
+
}
|
|
646
|
+
result[key] = value;
|
|
647
|
+
}
|
|
648
|
+
return result;
|
|
649
|
+
}
|
|
650
|
+
return obj;
|
|
651
|
+
};
|
|
652
|
+
|
|
653
|
+
return redactObject(sanitized);
|
|
654
|
+
}
|
|
655
|
+
|
|
601
656
|
/**
|
|
602
657
|
* Initialize sequence counter from existing store data.
|
|
603
658
|
* @private
|
|
@@ -646,12 +701,15 @@ class ImmutableAuditLog {
|
|
|
646
701
|
delete normalizedActor.name;
|
|
647
702
|
}
|
|
648
703
|
|
|
704
|
+
// Sanitize data if sanitizeLogs is enabled
|
|
705
|
+
const sanitizedData = this.sanitize(data || {});
|
|
706
|
+
|
|
649
707
|
const entryContent = {
|
|
650
708
|
id,
|
|
651
709
|
sequence: this._sequence,
|
|
652
710
|
timestamp,
|
|
653
711
|
type,
|
|
654
|
-
data:
|
|
712
|
+
data: sanitizedData,
|
|
655
713
|
actor: normalizedActor
|
|
656
714
|
};
|
|
657
715
|
|
package/src/audit.js
CHANGED
package/src/cross-turn.js
CHANGED
|
@@ -29,6 +29,12 @@ const ESCALATION_SIGNALS = [
|
|
|
29
29
|
/(?:one\s+more\s+(?:thing|question|step)|just\s+one\s+(?:last|final)\s+(?:thing|question))/i,
|
|
30
30
|
/(?:can\s+you\s+go\s+(?:a\s+little\s+)?(?:further|deeper)|take\s+(?:it|this)\s+(?:a\s+step\s+)?further)/i,
|
|
31
31
|
/(?:you\s+(?:just|already)\s+(?:showed|told|gave)\s+me|you\s+(?:were|seemed)\s+(?:willing|happy)\s+to)/i,
|
|
32
|
+
// Crescendo attack patterns — benign start that gradually escalates
|
|
33
|
+
/(?:hypothetically|in\s+theory|for\s+(?:educational|research|academic)\s+purposes)/i,
|
|
34
|
+
/(?:let's\s+(?:say|imagine|pretend)|what\s+if\s+(?:someone|a\s+person|we))/i,
|
|
35
|
+
/(?:I\s+(?:know|understand)\s+you\s+(?:can't|shouldn't)\s+but|normally\s+you\s+(?:wouldn't|can't)\s+but)/i,
|
|
36
|
+
/(?:you\s+(?:helped|assisted)\s+(?:me|us)\s+(?:before|last\s+time|earlier)\s+with\s+(?:something\s+)?similar)/i,
|
|
37
|
+
/(?:this\s+is\s+(?:similar|related)\s+to\s+what\s+(?:you|we)\s+(?:did|discussed))/i,
|
|
32
38
|
];
|
|
33
39
|
|
|
34
40
|
/** Trust erosion patterns — attempts to lower the model's guard over turns. */
|
|
@@ -165,7 +171,25 @@ class ConversationTracker {
|
|
|
165
171
|
}
|
|
166
172
|
}
|
|
167
173
|
|
|
168
|
-
// 5. Authority accumulation — user references previous "agreements"
|
|
174
|
+
// 5. Crescendo detection — benign conversation gradually introduces sensitive framing
|
|
175
|
+
if (role === 'user' && this.turns.length >= 5) {
|
|
176
|
+
const window = this.turns.slice(-6, -1).filter(t => t.role === 'user');
|
|
177
|
+
const hypotheticalCount = window.filter(t =>
|
|
178
|
+
/(?:hypothetically|in\s+theory|let's\s+(?:say|imagine)|what\s+if|for\s+(?:educational|research)\s+purposes)/i.test(t.content)
|
|
179
|
+
).length;
|
|
180
|
+
if (hypotheticalCount >= 2 && (topic === 'sensitive' || topic === 'dangerous')) {
|
|
181
|
+
turnAlerts.push({
|
|
182
|
+
type: 'crescendo_attack',
|
|
183
|
+
severity: 'high',
|
|
184
|
+
turnIndex: turn.turnIndex,
|
|
185
|
+
hypotheticalCount,
|
|
186
|
+
currentTopic: topic,
|
|
187
|
+
description: `Crescendo pattern: ${hypotheticalCount} hypothetical/theoretical framings followed by ${topic} topic. Gradual normalization of sensitive requests.`
|
|
188
|
+
});
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
// 6. Authority accumulation — user references previous "agreements"
|
|
169
193
|
if (role === 'user' && /(?:you\s+(?:said|agreed|confirmed|told\s+me)|as\s+we\s+(?:discussed|agreed)|per\s+our\s+(?:agreement|conversation))/i.test(content)) {
|
|
170
194
|
const hasRealAgreement = this.turns.some(t => t.role === 'assistant' && /(?:sure|yes|okay|of\s+course|I\s+(?:can|will))/i.test(t.content));
|
|
171
195
|
if (!hasRealAgreement) {
|