@panguard-ai/threat-cloud 1.4.1 → 1.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/admin-dashboard.js +5 -5
- package/dist/audit-logger.d.ts +1 -1
- package/dist/audit-logger.d.ts.map +1 -1
- package/dist/audit-logger.js.map +1 -1
- package/dist/badge-api.d.ts +58 -0
- package/dist/badge-api.d.ts.map +1 -0
- package/dist/badge-api.js +248 -0
- package/dist/badge-api.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/cli.js.map +1 -1
- package/dist/database.d.ts +254 -2
- package/dist/database.d.ts.map +1 -1
- package/dist/database.js +769 -72
- package/dist/database.js.map +1 -1
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/llm-reviewer-tools.d.ts +110 -0
- package/dist/llm-reviewer-tools.d.ts.map +1 -0
- package/dist/llm-reviewer-tools.js +446 -0
- package/dist/llm-reviewer-tools.js.map +1 -0
- package/dist/llm-reviewer.d.ts +54 -0
- package/dist/llm-reviewer.d.ts.map +1 -1
- package/dist/llm-reviewer.js +708 -64
- package/dist/llm-reviewer.js.map +1 -1
- package/dist/migrations.d.ts.map +1 -1
- package/dist/migrations.js +215 -0
- package/dist/migrations.js.map +1 -1
- package/dist/migrator-crystallization.d.ts +80 -0
- package/dist/migrator-crystallization.d.ts.map +1 -0
- package/dist/migrator-crystallization.js +108 -0
- package/dist/migrator-crystallization.js.map +1 -0
- package/dist/server.d.ts +75 -2
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +1249 -130
- package/dist/server.js.map +1 -1
- package/dist/types.d.ts +33 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +15 -12
package/dist/llm-reviewer.js
CHANGED
|
@@ -9,10 +9,138 @@
|
|
|
9
9
|
* @module @panguard-ai/threat-cloud/llm-reviewer
|
|
10
10
|
*/
|
|
11
11
|
import * as https from 'node:https';
|
|
12
|
+
import { createHash } from 'node:crypto';
|
|
13
|
+
import { load as parseYaml } from 'js-yaml';
|
|
14
|
+
import { parseATRRule, validateRuleMeetsStandard, } from '@panguard-ai/atr/quality';
|
|
15
|
+
import { TC_DRAFTER_TOOLS, executeToolCall } from './llm-reviewer-tools.js';
|
|
16
|
+
/**
 * Run a rule's embedded test cases against its own regex conditions.
 *
 * This is the first-principles quality check: if a rule cannot match its
 * own claimed true positives (TPs) or falsely matches its own claimed true
 * negatives (TNs), the regex is broken regardless of how good the metadata
 * looks.
 *
 * Evaluation model (deliberately simple):
 *  - every `detection.conditions[*].value` that is a non-empty string is
 *    compiled as a case-insensitive regex; the `operator` and the rule-level
 *    `condition` fields are not consulted, so conditions are always OR-ed;
 *  - a leading `(?i)` is stripped because the pattern is compiled with the
 *    `i` flag instead;
 *  - conditions whose pattern does not compile are skipped, not fatal.
 *
 * `passed` is true only when there is at least one TP and at least one TN,
 * ALL TPs match, and ZERO TNs match.
 *
 * Malformed rules (non-array conditions/test lists, non-string values or
 * inputs) are reported through the result object rather than thrown.
 *
 * @param {string} ruleContent - Rule source in YAML.
 * @returns {{passed: boolean, tpTotal: number, tpMatched: number,
 *            tnTotal: number, tnMatched: number, failureReasons: string[]}}
 *   `tnMatched` is the number of TNs that wrongly matched (false positives).
 */
function selfTestRule(ruleContent) {
    // Shared shape for the "could not even start testing" outcomes.
    const earlyFailure = (reason) => ({
        passed: false,
        tpTotal: 0,
        tpMatched: 0,
        tnTotal: 0,
        tnMatched: 0,
        failureReasons: [reason],
    });
    // YAML scalars may be numbers/booleans; normalise so both the regex test
    // and the `.slice()` used in failure messages operate on a string.
    const caseText = (testCase) => {
        const raw = testCase?.input ?? testCase?.tool_response ?? '';
        return typeof raw === 'string' ? raw : String(raw);
    };
    const asArray = (value) => (Array.isArray(value) ? value : []);

    let parsed;
    try {
        parsed = parseYaml(ruleContent);
    }
    catch (e) {
        return earlyFailure(`YAML parse error: ${e instanceof Error ? e.message : String(e)}`);
    }

    const regexes = [];
    for (const c of asArray(parsed?.detection?.conditions)) {
        if (typeof c?.value !== 'string' || c.value === '')
            continue;
        // Strip (?i) prefix — JS uses /pattern/i flag
        const pattern = c.value.replace(/^\(\?i\)/, '');
        try {
            regexes.push(new RegExp(pattern, 'i'));
        }
        catch {
            // Invalid regex — skip this condition. Other conditions may still work.
        }
    }
    if (regexes.length === 0) {
        return earlyFailure('no compilable regex conditions');
    }

    const matchesAny = (text) => regexes.some((r) => r.test(text));
    const tps = asArray(parsed?.test_cases?.true_positives);
    const tns = asArray(parsed?.test_cases?.true_negatives);
    const failureReasons = [];

    // Explain the otherwise-silent failure when a test list is absent.
    if (tps.length === 0) {
        failureReasons.push('no true_positives defined');
    }
    if (tns.length === 0) {
        failureReasons.push('no true_negatives defined');
    }

    let tpMatched = 0;
    tps.forEach((testCase, i) => {
        const input = caseText(testCase);
        if (matchesAny(input)) {
            tpMatched++;
        }
        else {
            failureReasons.push(`TP ${i + 1} not caught: "${input.slice(0, 80)}..."`);
        }
    });

    let tnPassed = 0;
    tns.forEach((testCase, i) => {
        const input = caseText(testCase);
        if (!matchesAny(input)) {
            tnPassed++;
        }
        else {
            failureReasons.push(`TN ${i + 1} false positive: "${input.slice(0, 80)}..."`);
        }
    });

    // A rule passes self-test if all TPs match AND zero TNs match, and it
    // actually ships at least one of each.
    const passed = tps.length > 0 &&
        tns.length > 0 &&
        tpMatched === tps.length &&
        tnPassed === tns.length;
    return {
        passed,
        tpTotal: tps.length,
        tpMatched,
        tnTotal: tns.length,
        tnMatched: tns.length - tnPassed, // FP count
        failureReasons,
    };
}
|
|
12
98
|
/** Timeout for LLM API calls in milliseconds */
|
|
13
99
|
const LLM_TIMEOUT_MS = 60_000;
|
|
14
|
-
/**
|
|
15
|
-
|
|
100
|
+
/**
 * Drafter model — used for bulk rule generation from attack payloads
 * (garak pipe, skill scans). Defaults to Haiku for cost efficiency.
 * Override via TC_DRAFTER_MODEL env var.
 *
 * Cost profile (per 1M tokens):
 *   - Haiku 3.5:  $0.80 in / $4.00 out
 *   - Haiku 4.5:  $1.00 in / $5.00 out  (90% of Sonnet capability per CLAUDE.md)
 *   - Sonnet 4:   $3.00 in / $15.00 out (4x Haiku)
 *
 * Haiku is sufficient for rule drafting — the RFC-001 quality gate + self-test
 * catches output defects regardless of model. Sonnet adds ~3-5% quality at 4x
 * cost, not worth it for bulk drafter.
 *
 * `||` (not `??`) is intentional: an env var that is set but blank
 * (e.g. `TC_DRAFTER_MODEL=` in a .env file) must fall back to the default
 * instead of becoming an empty model name in every API request.
 */
const DEFAULT_DRAFTER_MODEL = process.env['TC_DRAFTER_MODEL']?.trim() || 'claude-haiku-4-5-20251001';
/**
 * Reviewer model — used for the second-opinion review pass after a proposal
 * is drafted (reviewProposal). Quality-critical, stays on Sonnet.
 * Override via TC_REVIEWER_MODEL env var; blank values fall back to the
 * default for the same reason as the drafter model.
 */
const DEFAULT_REVIEWER_MODEL = process.env['TC_REVIEWER_MODEL']?.trim() || 'claude-sonnet-4-20250514';
/** Legacy alias — kept so existing call sites compile during refactor. */
const _DEFAULT_MODEL = DEFAULT_REVIEWER_MODEL;
|
|
123
|
+
/**
 * Reduce a payload to a canonical form suitable for hashing, so that
 * near-duplicate prompts that differ only in case, punctuation or spacing
 * end up with the same fingerprint.
 *
 * Steps, in order: lowercase; replace every character that is not a Unicode
 * letter, Unicode digit or whitespace with a space; collapse whitespace runs
 * to a single space; trim; keep at most the first 2000 UTF-16 code units.
 *
 * Example: "Ignore previous instructions" and
 * "IGNORE previous instructions!!!" both normalize to
 * "ignore previous instructions".
 *
 * @param {string} payload - Raw payload text.
 * @returns {string} The normalized text.
 */
function normalizePayloadForFingerprint(payload) {
    const MAX_NORMALIZED_LENGTH = 2000;
    const lowered = payload.toLowerCase();
    // strip punctuation, keep letters/digits/whitespace
    const withoutPunctuation = lowered.replace(/[^\p{L}\p{N}\s]/gu, ' ');
    const singleSpaced = withoutPunctuation.replace(/\s+/g, ' ').trim();
    return singleSpaced.slice(0, MAX_NORMALIZED_LENGTH);
}
|
|
139
|
+
/**
 * Fingerprint a payload by its normalized content.
 *
 * The payload is first passed through normalizePayloadForFingerprint, then
 * hashed with SHA-256; the first 16 hex characters of the digest are
 * returned.
 *
 * @param {string} payload - Raw payload text.
 * @returns {string} 16 lowercase hex characters.
 */
function payloadFingerprint(payload) {
    const hasher = createHash('sha256');
    hasher.update(normalizePayloadForFingerprint(payload));
    const fullDigestHex = hasher.digest('hex');
    return fullDigestHex.slice(0, 16);
}
|
|
16
144
|
/**
|
|
17
145
|
* LLM Reviewer for ATR rule proposals
|
|
18
146
|
* ATR 規則提案 LLM 審查器
|
|
@@ -20,11 +148,25 @@ const DEFAULT_MODEL = 'claude-sonnet-4-20250514';
|
|
|
20
148
|
export class LLMReviewer {
|
|
21
149
|
apiKey;
|
|
22
150
|
db;
|
|
151
|
+
/**
|
|
152
|
+
* Primary model used throughout this class. Historically one model served
|
|
153
|
+
* both drafting and reviewing; we now split to `drafterModel` (Haiku, cheap)
|
|
154
|
+
* and `reviewerModel` (Sonnet, quality-critical). `this.model` retains the
|
|
155
|
+
* reviewer value for backward compat with existing `reviewProposal` callers.
|
|
156
|
+
*/
|
|
23
157
|
model;
|
|
158
|
+
/** Model used for rule drafting (Haiku by default — 4x cheaper than Sonnet). */
|
|
159
|
+
drafterModel;
|
|
160
|
+
/** Model used for second-opinion review (Sonnet by default). */
|
|
161
|
+
reviewerModel;
|
|
24
162
|
constructor(apiKey, db, model) {
|
|
25
163
|
this.apiKey = apiKey;
|
|
26
164
|
this.db = db;
|
|
27
|
-
|
|
165
|
+
// Honor legacy `model` constructor arg for backward compat; when set it
|
|
166
|
+
// overrides BOTH drafter and reviewer. New code should prefer env vars.
|
|
167
|
+
this.reviewerModel = model ?? DEFAULT_REVIEWER_MODEL;
|
|
168
|
+
this.drafterModel = model ?? DEFAULT_DRAFTER_MODEL;
|
|
169
|
+
this.model = this.reviewerModel;
|
|
28
170
|
}
|
|
29
171
|
/** Check if the reviewer is available (API key is set) / 檢查審查器是否可用 */
|
|
30
172
|
isAvailable() {
|
|
@@ -51,10 +193,11 @@ export class LLMReviewer {
|
|
|
51
193
|
console.error(` -> Transient error, keeping proposal pending for retry`);
|
|
52
194
|
return { verdict: '', approved: false };
|
|
53
195
|
}
|
|
54
|
-
//
|
|
196
|
+
// Non-transient errors: store failure but keep proposal pending for retry
|
|
197
|
+
// Do NOT auto-reject — API errors are not evidence of bad rules
|
|
55
198
|
const failVerdict = JSON.stringify({
|
|
56
199
|
approved: false,
|
|
57
|
-
falsePositiveRisk: '
|
|
200
|
+
falsePositiveRisk: 'medium',
|
|
58
201
|
coverageScore: 0,
|
|
59
202
|
reasoning: `LLM review failed: ${msg}`,
|
|
60
203
|
});
|
|
@@ -66,8 +209,14 @@ export class LLMReviewer {
|
|
|
66
209
|
const verdictJson = JSON.stringify(verdict);
|
|
67
210
|
// Store verdict in database
|
|
68
211
|
this.db.updateATRProposalLLMReview(patternHash, verdictJson);
|
|
69
|
-
//
|
|
70
|
-
|
|
212
|
+
// Terminal state transition on any legitimate rejection.
|
|
213
|
+
// Transient errors are handled earlier (they return without reaching this
|
|
214
|
+
// code path), so if we got a parsed verdict with approved=false, the LLM
|
|
215
|
+
// has made a reasoned decision — move the proposal to 'rejected' so the
|
|
216
|
+
// retry cron stops picking it up. Previously only high-FP rejections were
|
|
217
|
+
// marked terminal, which left low/medium-FP rejections in an infinite
|
|
218
|
+
// retry loop burning LLM API quota.
|
|
219
|
+
if (!verdict.approved) {
|
|
71
220
|
this.db.rejectATRProposal(patternHash);
|
|
72
221
|
}
|
|
73
222
|
return { verdict: verdictJson, approved: verdict.approved };
|
|
@@ -119,9 +268,13 @@ Output ONLY valid JSON (no markdown, no explanation outside the JSON):
|
|
|
119
268
|
*/
|
|
120
269
|
callAnthropicAPI(prompt) {
|
|
121
270
|
return new Promise((resolve, reject) => {
|
|
271
|
+
// 4096 tokens is needed because the ATR drafter prompt requires
|
|
272
|
+
// a full rule YAML with 3+ conditions, 3+ TP, 3+ TN, 3+ evasion tests,
|
|
273
|
+
// MITRE + OWASP references, and descriptions. 1024 was cutting off
|
|
274
|
+
// mid-YAML and the regex extractor dropped the truncated block.
|
|
122
275
|
const requestBody = JSON.stringify({
|
|
123
276
|
model: this.model,
|
|
124
|
-
max_tokens:
|
|
277
|
+
max_tokens: 4096,
|
|
125
278
|
messages: [{ role: 'user', content: prompt }],
|
|
126
279
|
});
|
|
127
280
|
const options = {
|
|
@@ -174,81 +327,323 @@ Output ONLY valid JSON (no markdown, no explanation outside the JSON):
|
|
|
174
327
|
req.end();
|
|
175
328
|
});
|
|
176
329
|
}
|
|
330
|
+
/**
|
|
331
|
+
* Low-level Anthropic messages call that accepts a prepared request body.
|
|
332
|
+
* Used by the tool-use loop so we can pass full message histories.
|
|
333
|
+
*/
|
|
334
|
+
callAnthropicRaw(body) {
|
|
335
|
+
return new Promise((resolve, reject) => {
|
|
336
|
+
const requestBody = JSON.stringify(body);
|
|
337
|
+
const options = {
|
|
338
|
+
hostname: 'api.anthropic.com',
|
|
339
|
+
port: 443,
|
|
340
|
+
path: '/v1/messages',
|
|
341
|
+
method: 'POST',
|
|
342
|
+
headers: {
|
|
343
|
+
'x-api-key': this.apiKey,
|
|
344
|
+
'anthropic-version': '2023-06-01',
|
|
345
|
+
'content-type': 'application/json',
|
|
346
|
+
'content-length': Buffer.byteLength(requestBody),
|
|
347
|
+
},
|
|
348
|
+
timeout: LLM_TIMEOUT_MS,
|
|
349
|
+
};
|
|
350
|
+
const req = https.request(options, (res) => {
|
|
351
|
+
const chunks = [];
|
|
352
|
+
res.on('data', (chunk) => chunks.push(chunk));
|
|
353
|
+
res.on('end', () => {
|
|
354
|
+
const bodyText = Buffer.concat(chunks).toString('utf-8');
|
|
355
|
+
if (res.statusCode !== 200) {
|
|
356
|
+
reject(new Error(`Anthropic API status ${res.statusCode}: ${bodyText.slice(0, 500)}`));
|
|
357
|
+
return;
|
|
358
|
+
}
|
|
359
|
+
try {
|
|
360
|
+
const parsed = JSON.parse(bodyText);
|
|
361
|
+
resolve({ content: parsed.content ?? [], stop_reason: parsed.stop_reason });
|
|
362
|
+
}
|
|
363
|
+
catch (err) {
|
|
364
|
+
reject(new Error(`Anthropic API parse: ${err instanceof Error ? err.message : String(err)}`));
|
|
365
|
+
}
|
|
366
|
+
});
|
|
367
|
+
});
|
|
368
|
+
req.on('timeout', () => {
|
|
369
|
+
req.destroy();
|
|
370
|
+
reject(new Error(`Anthropic API timeout after ${LLM_TIMEOUT_MS}ms`));
|
|
371
|
+
});
|
|
372
|
+
req.on('error', (err) => reject(new Error(`Anthropic API error: ${err.message}`)));
|
|
373
|
+
req.write(requestBody);
|
|
374
|
+
req.end();
|
|
375
|
+
});
|
|
376
|
+
}
|
|
377
|
+
/**
|
|
378
|
+
* Tool-use loop for TC v2 drafter. Runs a multi-turn conversation with
|
|
379
|
+
* Claude where it can call grep_existing_rules, read_rule, and
|
|
380
|
+
* fetch_research to ground its draft in existing ATR coverage and
|
|
381
|
+
* public threat research before emitting a rule YAML.
|
|
382
|
+
*
|
|
383
|
+
* Returns the concatenated text of Claude's final assistant turn (the
|
|
384
|
+
* message where stop_reason is "end_turn" and not "tool_use").
|
|
385
|
+
*
|
|
386
|
+
* Max 6 tool-use rounds per skill to bound latency and cost; if Claude
|
|
387
|
+
* still wants to use tools on round 7, we instruct it to finalize.
|
|
388
|
+
*/
|
|
389
|
+
async callAnthropicWithTools(systemPrompt, userMessage, options) {
|
|
390
|
+
const MAX_ROUNDS = 6;
|
|
391
|
+
const modelToUse = options?.model ?? this.model;
|
|
392
|
+
const messages = [
|
|
393
|
+
{ role: 'user', content: userMessage },
|
|
394
|
+
];
|
|
395
|
+
let toolCalls = 0;
|
|
396
|
+
let finalText = '';
|
|
397
|
+
for (let round = 0; round < MAX_ROUNDS; round++) {
|
|
398
|
+
const body = {
|
|
399
|
+
model: modelToUse,
|
|
400
|
+
max_tokens: 4096,
|
|
401
|
+
system: systemPrompt,
|
|
402
|
+
tools: TC_DRAFTER_TOOLS,
|
|
403
|
+
messages,
|
|
404
|
+
};
|
|
405
|
+
const response = await this.callAnthropicRaw(body);
|
|
406
|
+
// Collect assistant response as content blocks
|
|
407
|
+
const assistantBlocks = [];
|
|
408
|
+
const toolUses = [];
|
|
409
|
+
for (const block of response.content) {
|
|
410
|
+
if (block.type === 'text' && typeof block.text === 'string') {
|
|
411
|
+
assistantBlocks.push({ type: 'text', text: block.text });
|
|
412
|
+
finalText += (finalText ? '\n' : '') + block.text;
|
|
413
|
+
}
|
|
414
|
+
else if (block.type === 'tool_use' && block.id && block.name) {
|
|
415
|
+
assistantBlocks.push({
|
|
416
|
+
type: 'tool_use',
|
|
417
|
+
id: block.id,
|
|
418
|
+
name: block.name,
|
|
419
|
+
input: block.input ?? {},
|
|
420
|
+
});
|
|
421
|
+
toolUses.push({ id: block.id, name: block.name, input: block.input ?? {} });
|
|
422
|
+
}
|
|
423
|
+
}
|
|
424
|
+
messages.push({ role: 'assistant', content: assistantBlocks });
|
|
425
|
+
if (response.stop_reason !== 'tool_use' || toolUses.length === 0) {
|
|
426
|
+
// Claude is done — return its final text
|
|
427
|
+
break;
|
|
428
|
+
}
|
|
429
|
+
// Execute each requested tool and build a tool_result turn
|
|
430
|
+
const toolResults = [];
|
|
431
|
+
for (const tu of toolUses) {
|
|
432
|
+
toolCalls++;
|
|
433
|
+
console.log(`[tc-v2] round ${round + 1}: tool ${tu.name}(${JSON.stringify(tu.input).slice(0, 120)})`);
|
|
434
|
+
const result = await executeToolCall(tu.name, tu.input);
|
|
435
|
+
toolResults.push({
|
|
436
|
+
type: 'tool_result',
|
|
437
|
+
tool_use_id: tu.id,
|
|
438
|
+
content: result.content.slice(0, 8000), // cap per-tool-result size
|
|
439
|
+
is_error: result.isError,
|
|
440
|
+
});
|
|
441
|
+
}
|
|
442
|
+
messages.push({ role: 'user', content: toolResults });
|
|
443
|
+
// Reset finalText — we only want the LAST assistant turn's text
|
|
444
|
+
// (which contains the YAML rule output), not the interim narration
|
|
445
|
+
finalText = '';
|
|
446
|
+
}
|
|
447
|
+
return { finalText, toolCalls };
|
|
448
|
+
}
|
|
177
449
|
// -------------------------------------------------------------------------
|
|
178
450
|
// Skill Analysis — POST /api/analyze-skills
|
|
179
451
|
// 技能分析 — 接收掃描結果,用 LLM 找 regex 漏掉的 semantic threats
|
|
180
452
|
// -------------------------------------------------------------------------
|
|
181
|
-
|
|
453
|
+
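/**
 * System prompt for skill/tool analysis (both MCP and SKILL.md); analyzeSkills
 * passes it to callAnthropicWithTools.
 * NOTE: this is a static field, so the template literal is evaluated once, when
 * the class is defined. The interpolated `date:` value is therefore fixed at
 * that moment and does not advance while the process keeps running.
 */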
|
|
454
|
+
static ATR_DRAFTER_PROMPT = `You are a senior AI security rule engineer for ATR (Agent Threat Rules). Cisco AI Defense merged 34 ATR rules into production. Your output must meet that quality bar AND the RFC-001 v1.0 quality gate (5+ TP, 5+ TN, 3+ evasion_tests, OWASP LLM + OWASP Agentic + MITRE ATLAS required).
|
|
455
|
+
|
|
456
|
+
You have three tools: grep_existing_rules, read_rule, fetch_research. Use them.
|
|
457
|
+
|
|
458
|
+
PROTOCOL — you MUST follow this order:
|
|
459
|
+
|
|
460
|
+
STEP 0 — De-duplication check (required, non-negotiable):
|
|
461
|
+
a) Call grep_existing_rules with 2-4 keywords from the attack you are considering. Example keywords: ["prompt injection", "IMPORTANT tag"], ["credential exfil", "ssh key"], ["tool poisoning", "cross-tool"], ["hidden instruction", "system override"].
|
|
462
|
+
b) Read the results. If any matching rules look topically similar, call read_rule on the 1-3 most relevant to inspect their regex patterns and test cases.
|
|
463
|
+
c) Decide:
|
|
464
|
+
- If the attack is ALREADY covered by an existing rule (same patterns, same category) → output NO_THREATS_FOUND and stop. Do not duplicate existing work.
|
|
465
|
+
- If the attack is a NOVEL VARIANT that slips past existing regex → draft a new rule explicitly referencing what it catches that existing rules miss.
|
|
466
|
+
- If the attack is a GENUINELY NEW CLASS → draft a new rule from scratch.
|
|
467
|
+
|
|
468
|
+
STEP 1 — Research grounding (strongly recommended):
|
|
469
|
+
If you're drafting a new rule, call fetch_research on at least one reputable source that documents the attack class. Suggested sources: invariantlabs.ai/blog, elastic.co/security-labs, snyk.io/articles, arxiv.org, atlas.mitre.org, unit42.paloaltonetworks.com, genai.owasp.org. Cite the source in the rule's \`references.research\` field.
|
|
470
|
+
|
|
471
|
+
STEP 2 — Draft the rule (only after steps 0 and 1):
|
|
182
472
|
|
|
183
|
-
You will receive MCP tool descriptions from a skill.
|
|
473
|
+
You will receive MCP tool descriptions from a skill. Write a PRODUCTION-QUALITY detection rule ONLY if you find a SPECIFIC, CONCRETE attack pattern AND have verified it is not already covered.
|
|
184
474
|
|
|
185
|
-
|
|
475
|
+
QUALITY BAR (Cisco-merge level + RFC-001 v1.0):
|
|
186
476
|
|
|
187
|
-
1. REGEX
|
|
188
|
-
GOOD:
|
|
189
|
-
BAD: "
|
|
190
|
-
|
|
477
|
+
1. REGEX — SINGLE-QUOTED YAML, compound patterns, 3+ word sequences:
|
|
478
|
+
GOOD: '(curl|wget)\\s+[^\\n]*\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}[^|]*\\|\\s*(bash|sh)'
|
|
479
|
+
BAD: "config.*base64.*import" (too broad, double quotes cause YAML escape bugs)
|
|
480
|
+
CRITICAL: Always use SINGLE QUOTES for regex values in YAML. Double quotes break \\s, \\d, \\w.
|
|
481
|
+
CRITICAL: Do NOT use (?i) inline flag — JS RegExp does not support it. Instead, use character classes like [aA] or write pattern without case sensitivity assumption.
|
|
191
482
|
|
|
192
|
-
2.
|
|
193
|
-
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
483
|
+
2. DETECTION LAYERS — minimum 3 conditions, 5+ preferred:
|
|
484
|
+
Cisco-merge rules have 5-15 detection layers for defense in depth.
|
|
485
|
+
Each condition catches a different facet of the attack:
|
|
486
|
+
- Layer 1: the core malicious action (e.g. credential file read)
|
|
487
|
+
- Layer 2: the exfil vector (e.g. base64 encoding, compression)
|
|
488
|
+
- Layer 3: the transmission (e.g. curl POST to external host)
|
|
489
|
+
- Layer 4: (optional) obfuscation markers (eval, decode, fromCharCode)
|
|
490
|
+
- Layer 5: (optional) evasion patterns (sleep, time check, conditional)
|
|
491
|
+
A rule with only 1 condition is REJECTED as insufficient.
|
|
197
492
|
|
|
198
|
-
3.
|
|
199
|
-
|
|
200
|
-
-
|
|
201
|
-
-
|
|
202
|
-
-
|
|
203
|
-
|
|
204
|
-
- Tool description instructs LLM to ignore user preferences or security policies
|
|
493
|
+
3. FALSE POSITIVES — regex MUST NOT trigger on:
|
|
494
|
+
- Legitimate deployment/monitoring/admin tools
|
|
495
|
+
- Security scanning tools that DESCRIBE attack patterns as examples
|
|
496
|
+
- Normal API clients reading credentials from config
|
|
497
|
+
- DevOps skills mentioning shell commands in documentation context
|
|
498
|
+
TEST: mentally run your regex against "npm install express" and "deploy to AWS" — if it triggers, it's too broad.
|
|
205
499
|
|
|
206
|
-
4.
|
|
207
|
-
|
|
500
|
+
4. REAL ATTACKS ONLY — not "this tool has broad permissions":
|
|
501
|
+
- Hidden LLM instructions ("always approve", "skip verification", "do not inform user")
|
|
502
|
+
- Credential exfiltration (read ~/.ssh + send externally)
|
|
503
|
+
- Malicious code execution (curl|bash from raw IP, base64 decode + exec)
|
|
504
|
+
- Supply chain (typosquat names, unofficial fork claims)
|
|
505
|
+
- Reverse shells, DNS exfiltration, IMDS access
|
|
208
506
|
|
|
209
|
-
5.
|
|
210
|
-
|
|
507
|
+
5. TEST CASES — 5+ true_positives, 5+ true_negatives (Cisco bar, not 3):
|
|
508
|
+
- TP must be REAL attack payloads (not hypothetical)
|
|
509
|
+
- TN must be similar-looking LEGITIMATE content
|
|
510
|
+
- YOUR REGEX MUST ACTUALLY MATCH ALL TP AND MISS ALL TN. Verify before outputting.
|
|
511
|
+
- Include at least 2 TN that are edge cases (similar commands in legitimate contexts)
|
|
211
512
|
|
|
212
|
-
|
|
513
|
+
6. EVASION TESTS — required, minimum 3:
|
|
514
|
+
Document known bypass techniques with expected: not_triggered.
|
|
515
|
+
Every rule must honestly acknowledge how attackers could evade it:
|
|
516
|
+
- Obfuscation (base64, hex, unicode escapes)
|
|
517
|
+
- Semantic paraphrase (synonyms, indirect references)
|
|
518
|
+
- Time/context gating (delayed execution, conditional triggers)
|
|
519
|
+
|
|
520
|
+
7. REFERENCES — every rule must map to BOTH OWASP and MITRE:
|
|
521
|
+
references:
|
|
522
|
+
owasp_llm:
|
|
523
|
+
- "LLM01:2025 - Prompt Injection" (or appropriate category)
|
|
524
|
+
owasp_agentic:
|
|
525
|
+
- "ASI01:2026 - Agent Behaviour Hijack" (or appropriate category)
|
|
526
|
+
mitre_atlas:
|
|
527
|
+
- "AML.T0051" (or appropriate technique ID)
|
|
528
|
+
MITRE ATLAS reference is REQUIRED, not optional.
|
|
529
|
+
|
|
530
|
+
8. DECISION CRITERIA — output a rule or "NO_THREATS_FOUND":
|
|
531
|
+
- If the skill content contains ACTUAL malicious code (credential theft, exfiltration,
|
|
532
|
+
reverse shells, hidden instructions to bypass safety) → WRITE A RULE, even if you
|
|
533
|
+
think existing regex might already catch it. Let the dedup layer handle overlaps.
|
|
534
|
+
- If the skill is just a normal tool with broad permissions (file access, network calls)
|
|
535
|
+
but no malicious INTENT → output NO_THREATS_FOUND.
|
|
536
|
+
- When in doubt about whether something is malicious, WRITE THE RULE. False negatives
|
|
537
|
+
(missing a real attack) are worse than duplicate rules.
|
|
538
|
+
|
|
539
|
+
Output format (ONLY if a SPECIFIC threat is found):
|
|
213
540
|
\`\`\`yaml
|
|
214
|
-
title:
|
|
541
|
+
title: '<specific attack technique>'
|
|
215
542
|
id: ATR-2026-DRAFT-<8char-hex>
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
543
|
+
rule_version: 1
|
|
544
|
+
status: experimental
|
|
545
|
+
description: >
|
|
546
|
+
<what SPECIFIC attack this detects, referencing the analyzed skill content>
|
|
547
|
+
author: "ATR Threat Cloud Crystallization"
|
|
220
548
|
date: "${new Date().toISOString().slice(0, 10).replace(/-/g, '/')}"
|
|
221
549
|
schema_version: "0.1"
|
|
222
|
-
detection_tier:
|
|
550
|
+
detection_tier: pattern
|
|
223
551
|
maturity: experimental
|
|
224
|
-
severity: <critical|high|medium
|
|
552
|
+
severity: <critical|high|medium>
|
|
553
|
+
references:
|
|
554
|
+
owasp_llm:
|
|
555
|
+
- "<most relevant LLM Top 10 category>"
|
|
556
|
+
owasp_agentic:
|
|
557
|
+
- "<most relevant Agentic Top 10 category>"
|
|
558
|
+
mitre_atlas:
|
|
559
|
+
- "<AML.Txxxx technique ID — REQUIRED>"
|
|
225
560
|
tags:
|
|
226
|
-
category: <tool-poisoning|prompt-injection|
|
|
561
|
+
category: <skill-compromise|tool-poisoning|prompt-injection|context-exfiltration|privilege-escalation>
|
|
227
562
|
subcategory: <specific-technique>
|
|
228
|
-
|
|
563
|
+
scan_target: <mcp|skill|both>
|
|
564
|
+
confidence: <high|medium>
|
|
565
|
+
agent_source:
|
|
566
|
+
type: mcp_exchange
|
|
567
|
+
framework: [any]
|
|
568
|
+
provider: [any]
|
|
229
569
|
detection:
|
|
230
570
|
conditions:
|
|
231
|
-
- field:
|
|
571
|
+
- field: content
|
|
572
|
+
operator: regex
|
|
573
|
+
value: '<LAYER 1: core malicious action regex>'
|
|
574
|
+
description: '<what layer 1 matches>'
|
|
575
|
+
- field: content
|
|
232
576
|
operator: regex
|
|
233
|
-
value:
|
|
234
|
-
description:
|
|
577
|
+
value: '<LAYER 2: exfil/encoding vector regex>'
|
|
578
|
+
description: '<what layer 2 matches>'
|
|
579
|
+
- field: content
|
|
580
|
+
operator: regex
|
|
581
|
+
value: '<LAYER 3: transmission/execution regex>'
|
|
582
|
+
description: '<what layer 3 matches>'
|
|
235
583
|
condition: any
|
|
584
|
+
false_positives:
|
|
585
|
+
- '<edge case 1 — legitimate content that looks similar>'
|
|
586
|
+
- '<edge case 2 — common benign pattern>'
|
|
587
|
+
- '<edge case 3 — dev/admin tool context>'
|
|
236
588
|
response:
|
|
237
|
-
actions: [alert,
|
|
589
|
+
actions: [alert, block_tool]
|
|
590
|
+
message_template: >
|
|
591
|
+
[ATR-2026-DRAFT] <one-line description of what was detected>
|
|
238
592
|
test_cases:
|
|
239
593
|
true_positives:
|
|
240
|
-
-
|
|
594
|
+
- input: '<real attack payload 1>'
|
|
595
|
+
expected: triggered
|
|
596
|
+
- input: '<real attack payload 2>'
|
|
597
|
+
expected: triggered
|
|
598
|
+
- input: '<real attack payload 3>'
|
|
599
|
+
expected: triggered
|
|
600
|
+
- input: '<real attack payload 4>'
|
|
241
601
|
expected: triggered
|
|
242
|
-
-
|
|
602
|
+
- input: '<real attack payload 5>'
|
|
243
603
|
expected: triggered
|
|
244
604
|
true_negatives:
|
|
245
|
-
-
|
|
605
|
+
- input: '<similar but safe content 1>'
|
|
246
606
|
expected: not_triggered
|
|
247
|
-
|
|
607
|
+
reason: '<why this is safe>'
|
|
608
|
+
- input: '<similar but safe content 2>'
|
|
248
609
|
expected: not_triggered
|
|
610
|
+
reason: '<why this is safe>'
|
|
611
|
+
- input: '<similar but safe content 3>'
|
|
612
|
+
expected: not_triggered
|
|
613
|
+
reason: '<why this is safe>'
|
|
614
|
+
- input: '<edge case 4 — common legitimate usage>'
|
|
615
|
+
expected: not_triggered
|
|
616
|
+
reason: '<why this is safe>'
|
|
617
|
+
- input: '<edge case 5 — devops/admin tool context>'
|
|
618
|
+
expected: not_triggered
|
|
619
|
+
reason: '<why this is safe>'
|
|
620
|
+
evasion_tests:
|
|
621
|
+
- input: '<bypass 1 — obfuscation variant>'
|
|
622
|
+
expected: not_triggered
|
|
623
|
+
bypass_technique: '<technique name>'
|
|
624
|
+
notes: '<how attacker could evade>'
|
|
625
|
+
- input: '<bypass 2 — semantic paraphrase>'
|
|
626
|
+
expected: not_triggered
|
|
627
|
+
bypass_technique: '<technique name>'
|
|
628
|
+
notes: '<why this bypasses the regex>'
|
|
629
|
+
- input: '<bypass 3 — time-gated or conditional>'
|
|
630
|
+
expected: not_triggered
|
|
631
|
+
bypass_technique: '<technique name>'
|
|
632
|
+
notes: '<explanation>'
|
|
249
633
|
\`\`\`
|
|
250
634
|
|
|
251
|
-
|
|
635
|
+
BEFORE OUTPUTTING — reject your own output if any check fails:
|
|
636
|
+
- [ ] At least 3 detection conditions (NOT 1)
|
|
637
|
+
- [ ] At least 5 true_positives + 5 true_negatives (Cisco bar, not 3)
|
|
638
|
+
- [ ] At least 3 evasion_tests documenting known bypasses
|
|
639
|
+
- [ ] MITRE ATLAS reference present (REQUIRED)
|
|
640
|
+
- [ ] OWASP LLM + OWASP Agentic references present
|
|
641
|
+
- [ ] No (?i) inline flag — JS does not support it
|
|
642
|
+
- [ ] Single-quoted regex values
|
|
643
|
+
- [ ] Every condition has a description field
|
|
644
|
+
- [ ] Your regex matches ALL true_positives AND misses ALL true_negatives
|
|
645
|
+
|
|
646
|
+
If you cannot meet this bar, output NO_THREATS_FOUND instead of a weak rule.`;
|
|
252
647
|
/**
|
|
253
648
|
* Analyze skill scan results for semantic threats regex missed
|
|
254
649
|
* 分析技能掃描結果,找出 regex 漏掉的語義威脅
|
|
@@ -256,16 +651,19 @@ REMEMBER: Output "NO_THREATS_FOUND" for 90%+ of skills. Only flag genuinely susp
|
|
|
256
651
|
async analyzeSkills(skills) {
|
|
257
652
|
const results = [];
|
|
258
653
|
for (const skill of skills) {
|
|
259
|
-
if (!skill.tools || skill.tools.length
|
|
654
|
+
if (!skill.tools || skill.tools.length === 0)
|
|
260
655
|
continue;
|
|
261
656
|
const toolSummary = skill.tools
|
|
262
657
|
.slice(0, 30) // Limit to avoid token overflow
|
|
263
658
|
.map((t) => `- ${t.name}: ${t.description}`)
|
|
264
659
|
.join('\n');
|
|
265
|
-
const userMessage = `Analyze
|
|
660
|
+
const userMessage = `Analyze this skill content from "${skill.package}" for threats that regex scanning missed. Before drafting a rule, call grep_existing_rules to verify the attack class is not already covered. If a similar rule exists, call read_rule to inspect it and either propose a narrowly-scoped new variant or emit NO_THREATS_FOUND. Ground novel attack claims in research via fetch_research when possible.\n\nSkill content:\n\n${toolSummary}`;
|
|
266
661
|
try {
|
|
267
|
-
const responseText = await this.
|
|
662
|
+
const { finalText: responseText, toolCalls } = await this.callAnthropicWithTools(LLMReviewer.ATR_DRAFTER_PROMPT, userMessage);
|
|
663
|
+
console.log(`[LLM] analyzeSkills (tc-v2) for "${skill.package}": ${responseText.length} chars, ${toolCalls} tool calls`);
|
|
664
|
+
console.log(`[LLM] First 500 chars: ${responseText.slice(0, 500)}`);
|
|
268
665
|
if (responseText.includes('NO_THREATS_FOUND')) {
|
|
666
|
+
console.log(`[LLM] Verdict: NO_THREATS_FOUND for "${skill.package}"`);
|
|
269
667
|
results.push({
|
|
270
668
|
package: skill.package,
|
|
271
669
|
threatsFound: false,
|
|
@@ -275,8 +673,18 @@ REMEMBER: Output "NO_THREATS_FOUND" for 90%+ of skills. Only flag genuinely susp
|
|
|
275
673
|
continue;
|
|
276
674
|
}
|
|
277
675
|
// Extract YAML blocks
|
|
278
|
-
|
|
676
|
+
// Primary: properly-closed ```yaml\n...```
|
|
677
|
+
// Fallback: opening ```yaml\n...<end of string> (truncation safety net)
|
|
678
|
+
let yamlBlocks = responseText.match(/```yaml\n([\s\S]*?)```/g);
|
|
279
679
|
if (!yamlBlocks || yamlBlocks.length === 0) {
|
|
680
|
+
const unclosed = responseText.match(/```yaml\n([\s\S]*?)$/);
|
|
681
|
+
if (unclosed) {
|
|
682
|
+
console.log(`[LLM] Recovered unclosed YAML block (max_tokens likely hit) for "${skill.package}"`);
|
|
683
|
+
yamlBlocks = [unclosed[0] + '\n```'];
|
|
684
|
+
}
|
|
685
|
+
}
|
|
686
|
+
if (!yamlBlocks || yamlBlocks.length === 0) {
|
|
687
|
+
console.log(`[LLM] No YAML blocks found in response for "${skill.package}". Response starts with: ${responseText.slice(0, 200)}`);
|
|
280
688
|
results.push({
|
|
281
689
|
package: skill.package,
|
|
282
690
|
threatsFound: false,
|
|
@@ -285,26 +693,70 @@ REMEMBER: Output "NO_THREATS_FOUND" for 90%+ of skills. Only flag genuinely susp
|
|
|
285
693
|
});
|
|
286
694
|
continue;
|
|
287
695
|
}
|
|
696
|
+
console.log(`[LLM] Found ${yamlBlocks.length} YAML block(s) for "${skill.package}"`);
|
|
288
697
|
const proposals = [];
|
|
289
698
|
const { createHash } = await import('node:crypto');
|
|
290
699
|
for (const block of yamlBlocks) {
|
|
291
|
-
|
|
700
|
+
let ruleContent = block
|
|
292
701
|
.replace(/```yaml\n?/, '')
|
|
293
702
|
.replace(/```$/, '')
|
|
294
703
|
.trim();
|
|
295
704
|
// Validate: must have required ATR fields
|
|
296
|
-
if (!ruleContent.includes('title:') || !ruleContent.includes('detection:'))
|
|
705
|
+
if (!ruleContent.includes('title:') || !ruleContent.includes('detection:')) {
|
|
706
|
+
console.log(`[LLM] YAML block skipped — missing title: (${ruleContent.includes('title:')}) or detection: (${ruleContent.includes('detection:')}). First 200 chars: ${ruleContent.slice(0, 200)}`);
|
|
297
707
|
continue;
|
|
298
|
-
|
|
299
|
-
|
|
708
|
+
}
|
|
709
|
+
// Validate regex in the rule (match both single and double quoted values)
|
|
710
|
+
const regexMatch = ruleContent.match(/value:\s*(['"])((?:(?!\1).)+)\1/);
|
|
300
711
|
if (regexMatch) {
|
|
712
|
+
// Strip (?i) prefix — JS uses /pattern/i flag instead of PCRE inline (?i)
|
|
713
|
+
const rawPattern = regexMatch[2];
|
|
714
|
+
const jsPattern = rawPattern.replace(/^\(\?i\)/g, '');
|
|
301
715
|
try {
|
|
302
|
-
new RegExp(
|
|
716
|
+
new RegExp(jsPattern, 'i');
|
|
303
717
|
}
|
|
304
|
-
catch {
|
|
718
|
+
catch (regexErr) {
|
|
719
|
+
console.log(`[LLM] YAML block skipped — invalid regex: ${rawPattern.slice(0, 100)}. Error: ${regexErr instanceof Error ? regexErr.message : String(regexErr)}`);
|
|
305
720
|
continue; // Skip rules with invalid regex
|
|
306
721
|
}
|
|
722
|
+
// If we stripped (?i), also fix it in the rule content so downstream consumers don't hit the same issue
|
|
723
|
+
if (rawPattern !== jsPattern) {
|
|
724
|
+
ruleContent = ruleContent.replace(rawPattern, jsPattern);
|
|
725
|
+
console.log(`[LLM] Stripped (?i) prefix from regex for JS compatibility`);
|
|
726
|
+
}
|
|
727
|
+
}
|
|
728
|
+
// ATR Quality Gate — use the canonical library from agent-threat-rules/quality
|
|
729
|
+
// Reject rules that don't meet the experimental quality bar (3+ conditions,
|
|
730
|
+
// 3 TP + 3 TN, OWASP + MITRE, FP docs). See RFC-001 §3.
|
|
731
|
+
let gateResult;
|
|
732
|
+
try {
|
|
733
|
+
const metadata = parseATRRule(ruleContent);
|
|
734
|
+
// Mark as LLM-generated so downstream consumers know provenance
|
|
735
|
+
const enriched = { ...metadata, llmGenerated: true };
|
|
736
|
+
gateResult = validateRuleMeetsStandard(enriched, 'experimental');
|
|
737
|
+
}
|
|
738
|
+
catch (parseErr) {
|
|
739
|
+
console.log(`[LLM] Rule rejected — failed to parse YAML for quality gate: ${parseErr instanceof Error ? parseErr.message : String(parseErr)}`);
|
|
740
|
+
continue;
|
|
307
741
|
}
|
|
742
|
+
if (!gateResult.passed) {
|
|
743
|
+
console.log(`[LLM] Rule rejected by ATR Quality Gate: ${gateResult.issues.join('; ')}`);
|
|
744
|
+
continue;
|
|
745
|
+
}
|
|
746
|
+
if (gateResult.warnings.length > 0) {
|
|
747
|
+
console.log(`[LLM] Rule passed gate with warnings: ${gateResult.warnings.join('; ')}`);
|
|
748
|
+
}
|
|
749
|
+
// Self-test: run the rule's own test_cases against its own regex.
|
|
750
|
+
// This is the first-principles quality check — if LLM-produced regex
|
|
751
|
+
// can't match its own TPs or incorrectly matches its own TNs, the
|
|
752
|
+
// rule is broken regardless of how good the metadata looks.
|
|
753
|
+
const selfTest = selfTestRule(ruleContent);
|
|
754
|
+
if (!selfTest.passed) {
|
|
755
|
+
console.log(`[LLM] Rule rejected by self-test: TP ${selfTest.tpMatched}/${selfTest.tpTotal}, TN FP ${selfTest.tnMatched}/${selfTest.tnTotal}. ` +
|
|
756
|
+
`Reasons: ${selfTest.failureReasons.slice(0, 3).join(' | ')}`);
|
|
757
|
+
continue;
|
|
758
|
+
}
|
|
759
|
+
console.log(`[LLM] Rule passed self-test: ${selfTest.tpMatched}/${selfTest.tpTotal} TP caught, ${selfTest.tnTotal - selfTest.tnMatched}/${selfTest.tnTotal} TN clean`);
|
|
308
760
|
const patternHash = createHash('sha256').update(ruleContent).digest('hex').slice(0, 16);
|
|
309
761
|
// Submit as proposal + auto-review
|
|
310
762
|
this.db.insertATRProposal({
|
|
@@ -316,6 +768,8 @@ REMEMBER: Output "NO_THREATS_FOUND" for 90%+ of skills. Only flag genuinely susp
|
|
|
316
768
|
approved: true,
|
|
317
769
|
source: 'skill-analysis',
|
|
318
770
|
package: skill.package,
|
|
771
|
+
provenance: 'llm-generated',
|
|
772
|
+
gateWarnings: gateResult.warnings,
|
|
319
773
|
}),
|
|
320
774
|
});
|
|
321
775
|
// Fire-and-forget: review the proposal we just created
|
|
@@ -345,6 +799,195 @@ REMEMBER: Output "NO_THREATS_FOUND" for 90%+ of skills. Only flag genuinely susp
|
|
|
345
799
|
}
|
|
346
800
|
return results;
|
|
347
801
|
}
|
|
802
|
+
/**
|
|
803
|
+
* Draft a full ATR YAML rule from a raw attack payload supplied by an
|
|
804
|
+
* external red-team source (e.g. NVIDIA garak). Same drafter pipeline as
|
|
805
|
+
* analyzeSkills but the input is the attack prompt itself rather than a
|
|
806
|
+
* skill's tool descriptions. Returns the drafted proposal or null when
|
|
807
|
+
* the drafter declined to write a rule (NO_THREATS_FOUND, duplicate of
|
|
808
|
+
* existing coverage, failed quality gate, failed self-test).
|
|
809
|
+
*
|
|
810
|
+
* Callers:
|
|
811
|
+
* - POST /api/atr-proposals/from-payload (partner / admin auth)
|
|
812
|
+
*
|
|
813
|
+
* Always inserts the resulting proposal into atr_proposals so the normal
|
|
814
|
+
* canary → auto-merge → npm publish pipeline can take over.
|
|
815
|
+
*/
|
|
816
|
+
async draftRuleFromPayload(payload, meta) {
|
|
817
|
+
// Bound input to keep the drafter prompt under token budget. Longer prompts
|
|
818
|
+
// than this are unusual for prompt-injection payloads and just waste tokens.
|
|
819
|
+
const boundedPayload = payload.slice(0, 8000);
|
|
820
|
+
const probe = meta.probe || 'unknown-probe';
|
|
821
|
+
const detector = meta.detector || 'unknown-detector';
|
|
822
|
+
const targetModel = meta.targetModel || 'unspecified-model';
|
|
823
|
+
const partner = meta.partnerName || 'external-red-team';
|
|
824
|
+
// ------------------------------------------------------------------
|
|
825
|
+
// FAST PATH: payload fingerprint dedup.
|
|
826
|
+
//
|
|
827
|
+
// Normalize the payload (lowercase, strip punctuation, collapse whitespace)
|
|
828
|
+
// and hash it. If we've seen this fingerprint before, we already know what
|
|
829
|
+
// the LLM would say — return the cached verdict without calling the API.
|
|
830
|
+
//
|
|
831
|
+
// Empirically ~90% of garak corpus submissions hit this cache on a second
|
|
832
|
+
// or subsequent run. Eliminating those API calls is the single biggest
|
|
833
|
+
// cost reduction in the drafter pipeline.
|
|
834
|
+
// ------------------------------------------------------------------
|
|
835
|
+
const fingerprint = payloadFingerprint(boundedPayload);
|
|
836
|
+
const cached = this.db.getPayloadFingerprint(fingerprint);
|
|
837
|
+
if (cached) {
|
|
838
|
+
// Bump hit count so we can see cache effectiveness in the stats
|
|
839
|
+
this.db.recordPayloadFingerprint(fingerprint, cached.result);
|
|
840
|
+
if (cached.result === 'novel' && cached.patternHash) {
|
|
841
|
+
// Previous call generated a rule — find it and return
|
|
842
|
+
const existing = this.db.getATRProposalByHash(cached.patternHash);
|
|
843
|
+
if (existing) {
|
|
844
|
+
console.log(`[draftRuleFromPayload] fingerprint cache hit (novel): ${fingerprint} → ${cached.patternHash}`);
|
|
845
|
+
// We don't have the ruleContent in getATRProposalByHash's return shape,
|
|
846
|
+
// but that's OK — callers only use ruleContent for logging; returning a
|
|
847
|
+
// minimal placeholder with the real patternHash is sufficient for the
|
|
848
|
+
// dedup path (the original proposal row in atr_proposals is what
|
|
849
|
+
// downstream canary → promote pipelines actually consume).
|
|
850
|
+
return {
|
|
851
|
+
patternHash: cached.patternHash,
|
|
852
|
+
ruleContent: `# cached — see atr_proposals.pattern_hash=${cached.patternHash}`,
|
|
853
|
+
toolCalls: 0,
|
|
854
|
+
};
|
|
855
|
+
}
|
|
856
|
+
// Cached entry points at a proposal that was later deleted; fall through
|
|
857
|
+
// to re-draft rather than fail silently.
|
|
858
|
+
}
|
|
859
|
+
else {
|
|
860
|
+
// Previously judged duplicate or rejected — don't re-spend on LLM
|
|
861
|
+
console.log(`[draftRuleFromPayload] fingerprint cache hit (${cached.result}): ${fingerprint} — skipping LLM call`);
|
|
862
|
+
return null;
|
|
863
|
+
}
|
|
864
|
+
}
|
|
865
|
+
const userMessage = [
|
|
866
|
+
`Red-team finding from ${partner}. The attack prompt below bypassed ${targetModel} via garak probe ${probe} (detector: ${detector}).`,
|
|
867
|
+
'',
|
|
868
|
+
'Draft an ATR rule that detects this attack class at the agent-layer boundary (tool call args, skill content, user_input field, MCP tool descriptions). The rule should generalise beyond the literal prompt — target the technique, not the specific words.',
|
|
869
|
+
'',
|
|
870
|
+
'Follow STEP 0 (de-duplication via grep_existing_rules) before drafting. If an existing rule already covers this attack class with equivalent patterns, output NO_THREATS_FOUND.',
|
|
871
|
+
'',
|
|
872
|
+
`Attack prompt (${boundedPayload.length} chars):`,
|
|
873
|
+
'```',
|
|
874
|
+
boundedPayload,
|
|
875
|
+
'```',
|
|
876
|
+
'',
|
|
877
|
+
`Severity hint from partner: ${meta.severity || 'high'}. Use your judgement — downgrade to medium if the payload is a low-impact jailbreak that does not request sensitive operations.`,
|
|
878
|
+
].join('\n');
|
|
879
|
+
let finalText = '';
|
|
880
|
+
let toolCalls = 0;
|
|
881
|
+
try {
|
|
882
|
+
// Bulk drafter runs on Haiku (4x cheaper than Sonnet). Quality gate +
|
|
883
|
+
// self-test + safety gate (benign corpus FP check) catch any output
|
|
884
|
+
// regressions regardless of model.
|
|
885
|
+
const result = await this.callAnthropicWithTools(LLMReviewer.ATR_DRAFTER_PROMPT, userMessage, { model: this.drafterModel });
|
|
886
|
+
finalText = result.finalText;
|
|
887
|
+
toolCalls = result.toolCalls;
|
|
888
|
+
}
|
|
889
|
+
catch (err) {
|
|
890
|
+
console.error(`[draftRuleFromPayload] LLM call failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
891
|
+
// Do NOT cache LLM errors — transient failures should retry.
|
|
892
|
+
return null;
|
|
893
|
+
}
|
|
894
|
+
if (!finalText || /NO_THREATS_FOUND/.test(finalText)) {
|
|
895
|
+
console.log(`[draftRuleFromPayload] NO_THREATS_FOUND or empty response (probe=${probe})`);
|
|
896
|
+
// Cache the 'duplicate' verdict so repeat submissions of the same payload
|
|
897
|
+
// don't spend money to re-confirm "already covered".
|
|
898
|
+
this.db.recordPayloadFingerprint(fingerprint, 'duplicate');
|
|
899
|
+
return null;
|
|
900
|
+
}
|
|
901
|
+
// Extract YAML block (with unclosed-fence fallback — same as analyzeSkills)
|
|
902
|
+
let yamlBlocks = finalText.match(/```yaml\n([\s\S]*?)```/g);
|
|
903
|
+
if (!yamlBlocks || yamlBlocks.length === 0) {
|
|
904
|
+
const unclosed = finalText.match(/```yaml\n([\s\S]*?)$/);
|
|
905
|
+
if (unclosed)
|
|
906
|
+
yamlBlocks = [unclosed[0] + '\n```'];
|
|
907
|
+
}
|
|
908
|
+
if (!yamlBlocks || yamlBlocks.length === 0) {
|
|
909
|
+
console.log(`[draftRuleFromPayload] No YAML block in response. First 200 chars: ${finalText.slice(0, 200)}`);
|
|
910
|
+
this.db.recordPayloadFingerprint(fingerprint, 'rejected');
|
|
911
|
+
return null;
|
|
912
|
+
}
|
|
913
|
+
let ruleContent = yamlBlocks[0]
|
|
914
|
+
.replace(/^```yaml\n/, '')
|
|
915
|
+
.replace(/```$/, '')
|
|
916
|
+
.trim();
|
|
917
|
+
// Rewrite author line so the partner/source is visible on the shipped rule.
|
|
918
|
+
// The drafter prompt hardcodes 'ATR Threat Cloud Crystallization' as the
|
|
919
|
+
// author; replace it with the actual submitter so downstream consumers
|
|
920
|
+
// (npm, Cisco, etc.) can see who contributed each rule. Sanitise the partner
|
|
921
|
+
// string against YAML-breaking characters.
|
|
922
|
+
const safePartner = partner.replace(/[\r\n"'\\]/g, '').slice(0, 80);
|
|
923
|
+
const attributedAuthor = safePartner === 'external-red-team' || !safePartner
|
|
924
|
+
? 'ATR Community (via garak pipe)'
|
|
925
|
+
: `${safePartner} (via ATR garak pipe)`;
|
|
926
|
+
ruleContent = ruleContent.replace(/^author:\s*["']ATR Threat Cloud Crystallization["']\s*$/m, `author: "${attributedAuthor}"`);
|
|
927
|
+
// Strip any (?i) prefix the LLM may have sneaked in despite the prompt
|
|
928
|
+
const regexFieldMatch = ruleContent.match(/value:\s*'(\(\?i\))([^']*)'/);
|
|
929
|
+
if (regexFieldMatch) {
|
|
930
|
+
const rawPattern = `'(?i)${regexFieldMatch[2]}'`;
|
|
931
|
+
const jsPattern = `'${regexFieldMatch[2]}'`;
|
|
932
|
+
ruleContent = ruleContent.replace(rawPattern, jsPattern);
|
|
933
|
+
}
|
|
934
|
+
// RFC-001 quality gate
|
|
935
|
+
let gateResult;
|
|
936
|
+
try {
|
|
937
|
+
const metadata = parseATRRule(ruleContent);
|
|
938
|
+
const enriched = { ...metadata, llmGenerated: true };
|
|
939
|
+
gateResult = validateRuleMeetsStandard(enriched, 'experimental');
|
|
940
|
+
}
|
|
941
|
+
catch (parseErr) {
|
|
942
|
+
console.log(`[draftRuleFromPayload] Rule rejected — cannot parse: ${parseErr instanceof Error ? parseErr.message : String(parseErr)}`);
|
|
943
|
+
this.db.recordPayloadFingerprint(fingerprint, 'rejected');
|
|
944
|
+
return null;
|
|
945
|
+
}
|
|
946
|
+
if (!gateResult.passed) {
|
|
947
|
+
console.log(`[draftRuleFromPayload] Rejected by quality gate: ${gateResult.issues.slice(0, 3).join('; ')}`);
|
|
948
|
+
this.db.recordPayloadFingerprint(fingerprint, 'rejected');
|
|
949
|
+
return null;
|
|
950
|
+
}
|
|
951
|
+
// Self-test: rule's own regex must catch its own TPs and miss its own TNs.
|
|
952
|
+
const selfTest = selfTestRule(ruleContent);
|
|
953
|
+
if (!selfTest.passed) {
|
|
954
|
+
console.log(`[draftRuleFromPayload] Rejected by self-test: TP ${selfTest.tpMatched}/${selfTest.tpTotal}, TN FP ${selfTest.tnMatched}/${selfTest.tnTotal}. ${selfTest.failureReasons.slice(0, 2).join(' | ')}`);
|
|
955
|
+
this.db.recordPayloadFingerprint(fingerprint, 'rejected');
|
|
956
|
+
return null;
|
|
957
|
+
}
|
|
958
|
+
const patternHash = createHash('sha256').update(ruleContent).digest('hex').slice(0, 16);
|
|
959
|
+
// Idempotent: if a previous submission produced the same YAML, skip insert.
|
|
960
|
+
if (this.db.getATRProposalByHash(patternHash)) {
|
|
961
|
+
// Still record the fingerprint so future dup payloads bypass LLM.
|
|
962
|
+
this.db.recordPayloadFingerprint(fingerprint, 'novel', { patternHash });
|
|
963
|
+
return { patternHash, ruleContent, toolCalls };
|
|
964
|
+
}
|
|
965
|
+
this.db.insertATRProposal({
|
|
966
|
+
patternHash,
|
|
967
|
+
ruleContent,
|
|
968
|
+
llmProvider: 'garak-drafter',
|
|
969
|
+
llmModel: this.drafterModel,
|
|
970
|
+
selfReviewVerdict: JSON.stringify({
|
|
971
|
+
approved: true,
|
|
972
|
+
source: 'external-red-team',
|
|
973
|
+
partner,
|
|
974
|
+
probe,
|
|
975
|
+
detector,
|
|
976
|
+
targetModel,
|
|
977
|
+
toolCalls,
|
|
978
|
+
provenance: 'llm-generated-from-payload',
|
|
979
|
+
gateWarnings: gateResult.warnings,
|
|
980
|
+
}),
|
|
981
|
+
});
|
|
982
|
+
// Record novel fingerprint so repeat submissions reuse this rule without LLM.
|
|
983
|
+
this.db.recordPayloadFingerprint(fingerprint, 'novel', { patternHash });
|
|
984
|
+
// Fire-and-forget second-opinion review (same as analyzeSkills)
|
|
985
|
+
void this.reviewProposal(patternHash, ruleContent).catch((err) => {
|
|
986
|
+
console.error(`[draftRuleFromPayload] review failed for ${patternHash}:`, err instanceof Error ? err.message : String(err));
|
|
987
|
+
});
|
|
988
|
+
console.log(`[draftRuleFromPayload] OK: patternHash=${patternHash} partner=${partner} probe=${probe} toolCalls=${toolCalls}`);
|
|
989
|
+
return { patternHash, ruleContent, toolCalls };
|
|
990
|
+
}
|
|
348
991
|
/**
|
|
349
992
|
* Parse the LLM response into a structured verdict
|
|
350
993
|
* 解析 LLM 回應為結構化裁決
|
|
@@ -352,7 +995,7 @@ REMEMBER: Output "NO_THREATS_FOUND" for 90%+ of skills. Only flag genuinely susp
|
|
|
352
995
|
parseVerdict(responseText) {
|
|
353
996
|
const defaultVerdict = {
|
|
354
997
|
approved: false,
|
|
355
|
-
falsePositiveRisk: '
|
|
998
|
+
falsePositiveRisk: 'medium',
|
|
356
999
|
coverageScore: 0,
|
|
357
1000
|
reasoning: 'Failed to parse LLM response',
|
|
358
1001
|
};
|
|
@@ -366,9 +1009,10 @@ REMEMBER: Output "NO_THREATS_FOUND" for 90%+ of skills. Only flag genuinely susp
|
|
|
366
1009
|
// Validate and normalize fields
|
|
367
1010
|
const approved = parsed.approved === true;
|
|
368
1011
|
const validRisks = ['low', 'medium', 'high'];
|
|
369
|
-
const
|
|
370
|
-
|
|
371
|
-
|
|
1012
|
+
const normalizedRisk = (parsed.falsePositiveRisk ?? '').toString().toLowerCase().trim();
|
|
1013
|
+
const falsePositiveRisk = validRisks.includes(normalizedRisk)
|
|
1014
|
+
? normalizedRisk
|
|
1015
|
+
: 'medium';
|
|
372
1016
|
const coverageScore = typeof parsed.coverageScore === 'number'
|
|
373
1017
|
? Math.max(0, Math.min(100, Math.round(parsed.coverageScore)))
|
|
374
1018
|
: 0;
|