@panguard-ai/threat-cloud 1.4.2 → 1.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,10 +9,138 @@
9
9
  * @module @panguard-ai/threat-cloud/llm-reviewer
10
10
  */
11
11
  import * as https from 'node:https';
12
+ import { createHash } from 'node:crypto';
13
+ import { load as parseYaml } from 'js-yaml';
14
+ import { parseATRRule, validateRuleMeetsStandard, } from '@panguard-ai/atr/quality';
15
+ import { TC_DRAFTER_TOOLS, executeToolCall } from './llm-reviewer-tools.js';
16
/**
 * Run a rule's embedded test cases against its own regex conditions.
 * This is the first-principles quality check: if a rule cannot match its
 * own claimed TPs or falsely matches its own claimed TNs, the regex is
 * broken regardless of how good the metadata looks.
 *
 * Returns `passed: true` only if ALL TPs match AND zero TNs match AND the
 * rule declares at least one TP and at least one TN (a rule with no test
 * cases proves nothing, so it never passes).
 *
 * Malformed rule shapes never throw: a `conditions` / `true_positives` /
 * `true_negatives` entry that is not a list is treated as empty, condition
 * values that are not non-empty strings are ignored, and test inputs are
 * coerced to strings before matching.
 *
 * @param {string} ruleContent - Rule YAML text.
 * @returns {{passed: boolean, tpTotal: number, tpMatched: number, tnTotal: number, tnMatched: number, failureReasons: string[]}}
 *   `tnMatched` is the number of true negatives that were (wrongly) matched,
 *   i.e. the false-positive count.
 */
function selfTestRule(ruleContent) {
    // Shared shape for "could not even run the test cases" outcomes.
    const emptyFailure = (reason) => ({
        passed: false,
        tpTotal: 0,
        tpMatched: 0,
        tnTotal: 0,
        tnMatched: 0,
        failureReasons: [reason],
    });
    let parsed;
    try {
        parsed = parseYaml(ruleContent);
    }
    catch (e) {
        return emptyFailure(`YAML parse error: ${e instanceof Error ? e.message : String(e)}`);
    }
    // YAML authored by an LLM can put a map or scalar where a list is expected;
    // iterating that with for...of would throw, so anything non-array counts as empty.
    const listOf = (value) => (Array.isArray(value) ? value : []);
    const regexes = [];
    for (const c of listOf(parsed?.detection?.conditions)) {
        // Only non-empty string values can be regex sources (YAML may yield numbers or lists).
        if (typeof c?.value !== 'string' || c.value === '')
            continue;
        // Strip (?i) prefix — JS uses /pattern/i flag
        const pattern = c.value.replace(/^\(\?i\)/, '');
        try {
            regexes.push(new RegExp(pattern, 'i'));
        }
        catch {
            // Invalid regex — deliberately skip this condition. Other conditions may still work.
        }
    }
    if (regexes.length === 0) {
        return emptyFailure('no compilable regex conditions');
    }
    const matchesAny = (text) => regexes.some((r) => r.test(text));
    // Test cases may carry their payload under `input` or `tool_response`.
    // String() guards the later .slice() against numeric/boolean YAML scalars.
    const caseText = (testCase) => String(testCase?.input ?? testCase?.tool_response ?? '');
    const tps = listOf(parsed?.test_cases?.true_positives);
    const tns = listOf(parsed?.test_cases?.true_negatives);
    const failureReasons = [];
    let tpMatched = 0;
    tps.forEach((testCase, i) => {
        const input = caseText(testCase);
        if (matchesAny(input)) {
            tpMatched++;
        }
        else {
            failureReasons.push(`TP ${i + 1} not caught: "${input.slice(0, 80)}..."`);
        }
    });
    let tnPassed = 0;
    tns.forEach((testCase, i) => {
        const input = caseText(testCase);
        if (!matchesAny(input)) {
            tnPassed++;
        }
        else {
            failureReasons.push(`TN ${i + 1} false positive: "${input.slice(0, 80)}..."`);
        }
    });
    // A rule passes self-test if all TPs match AND zero TNs match, and both lists are non-empty.
    const passed = tps.length > 0 && tns.length > 0 && tpMatched === tps.length && tnPassed === tns.length;
    return {
        passed,
        tpTotal: tps.length,
        tpMatched,
        tnTotal: tns.length,
        tnMatched: tns.length - tnPassed, // FP count
        failureReasons,
    };
}
12
98
  /** Timeout for LLM API calls in milliseconds */
13
99
  const LLM_TIMEOUT_MS = 60_000;
14
- /** Default model for review */
15
- const DEFAULT_MODEL = 'claude-sonnet-4-20250514';
100
/**
 * Resolve a model ID from an environment variable.
 *
 * An unset, empty, or whitespace-only variable falls back to `fallback`.
 * (`??` alone would accept `TC_DRAFTER_MODEL=` as the model ID `""`, which
 * is then sent verbatim in the API request body.)
 *
 * @param {string} envVar - Name of the environment variable to read.
 * @param {string} fallback - Model ID used when the variable has no usable value.
 * @returns {string} The trimmed override, or `fallback`.
 */
function modelFromEnv(envVar, fallback) {
    const override = process.env[envVar]?.trim();
    return override ? override : fallback;
}
/**
 * Drafter model — used for bulk rule generation from attack payloads
 * (garak pipe, skill scans). Defaults to Haiku for cost efficiency.
 * Override via TC_DRAFTER_MODEL env var.
 *
 * Cost profile (per 1M tokens):
 * - Haiku 3.5: $0.80 in / $4.00 out
 * - Haiku 4.5: $1.00 in / $5.00 out (90% of Sonnet capability per CLAUDE.md)
 * - Sonnet 4: $3.00 in / $15.00 out (3x Haiku 4.5, 3.75x Haiku 3.5)
 *
 * Haiku is sufficient for rule drafting — the RFC-001 quality gate + self-test
 * catches output defects regardless of model. Sonnet adds ~3-5% quality at
 * roughly 3x the cost of the default Haiku 4.5, not worth it for bulk drafter.
 */
const DEFAULT_DRAFTER_MODEL = modelFromEnv('TC_DRAFTER_MODEL', 'claude-haiku-4-5-20251001');
/**
 * Reviewer model — used for the second-opinion review pass after a proposal
 * is drafted (reviewProposal). Quality-critical, stays on Sonnet.
 * Override via TC_REVIEWER_MODEL env var.
 */
const DEFAULT_REVIEWER_MODEL = modelFromEnv('TC_REVIEWER_MODEL', 'claude-sonnet-4-20250514');
/** Legacy alias — kept so existing call sites compile during refactor. */
const _DEFAULT_MODEL = DEFAULT_REVIEWER_MODEL;
123
/**
 * Canonicalize a payload before hashing it into a fingerprint.
 *
 * Steps, in order: lowercase; replace every character that is not a Unicode
 * letter, Unicode digit, or whitespace with a space; collapse whitespace runs
 * to a single space; trim; keep at most the first 2000 characters (UTF-16
 * code units, not bytes).
 *
 * Example: "Ignore previous instructions" and "IGNORE previous
 * instructions!!!" both normalize to "ignore previous instructions", so
 * near-duplicate prompts hash identically.
 *
 * @param {string} payload - Raw payload text.
 * @returns {string} The normalized text, at most 2000 characters long.
 */
function normalizePayloadForFingerprint(payload) {
    const MAX_CHARS = 2000;
    const lowered = payload.toLowerCase();
    // strip punctuation, keep letters/digits/whitespace
    const lettersAndDigits = lowered.replace(/[^\p{L}\p{N}\s]/gu, ' ');
    const singleSpaced = lettersAndDigits.replace(/\s+/g, ' ').trim();
    return singleSpaced.slice(0, MAX_CHARS);
}
139
/**
 * Fingerprint a payload: SHA-256 over its normalized form (see
 * normalizePayloadForFingerprint), hex-encoded and cut to the first
 * 16 hex characters.
 *
 * @param {string} payload - Raw payload text.
 * @returns {string} 16 lowercase hex characters.
 */
function payloadFingerprint(payload) {
    const hasher = createHash('sha256');
    hasher.update(normalizePayloadForFingerprint(payload));
    const hexDigest = hasher.digest('hex');
    return hexDigest.slice(0, 16);
}
16
144
  /**
17
145
  * LLM Reviewer for ATR rule proposals
18
146
  * ATR 規則提案 LLM 審查器
@@ -20,11 +148,25 @@ const DEFAULT_MODEL = 'claude-sonnet-4-20250514';
20
148
  export class LLMReviewer {
21
149
  apiKey;
22
150
  db;
151
+ /**
152
+ * Primary model used throughout this class. Historically one model served
153
+ * both drafting and reviewing; we now split to `drafterModel` (Haiku, cheap)
154
+ * and `reviewerModel` (Sonnet, quality-critical). `this.model` retains the
155
+ * reviewer value for backward compat with existing `reviewProposal` callers.
156
+ */
23
157
  model;
158
+ /** Model used for rule drafting (Haiku by default — 4x cheaper than Sonnet). */
159
+ drafterModel;
160
+ /** Model used for second-opinion review (Sonnet by default). */
161
+ reviewerModel;
24
162
  constructor(apiKey, db, model) {
25
163
  this.apiKey = apiKey;
26
164
  this.db = db;
27
- this.model = model ?? DEFAULT_MODEL;
165
+ // Honor legacy `model` constructor arg for backward compat; when set it
166
+ // overrides BOTH drafter and reviewer. New code should prefer env vars.
167
+ this.reviewerModel = model ?? DEFAULT_REVIEWER_MODEL;
168
+ this.drafterModel = model ?? DEFAULT_DRAFTER_MODEL;
169
+ this.model = this.reviewerModel;
28
170
  }
29
171
  /** Check if the reviewer is available (API key is set) / 檢查審查器是否可用 */
30
172
  isAvailable() {
@@ -51,10 +193,11 @@ export class LLMReviewer {
51
193
  console.error(` -> Transient error, keeping proposal pending for retry`);
52
194
  return { verdict: '', approved: false };
53
195
  }
54
- // Permanent errors: store failure verdict
196
+ // Non-transient errors: store failure but keep proposal pending for retry
197
+ // Do NOT auto-reject — API errors are not evidence of bad rules
55
198
  const failVerdict = JSON.stringify({
56
199
  approved: false,
57
- falsePositiveRisk: 'high',
200
+ falsePositiveRisk: 'medium',
58
201
  coverageScore: 0,
59
202
  reasoning: `LLM review failed: ${msg}`,
60
203
  });
@@ -66,8 +209,14 @@ export class LLMReviewer {
66
209
  const verdictJson = JSON.stringify(verdict);
67
210
  // Store verdict in database
68
211
  this.db.updateATRProposalLLMReview(patternHash, verdictJson);
69
- // If high false positive risk AND not approved, reject the proposal
70
- if (!verdict.approved && verdict.falsePositiveRisk === 'high') {
212
+ // Terminal state transition on any legitimate rejection.
213
+ // Transient errors are handled earlier (they return without reaching this
214
+ // code path), so if we got a parsed verdict with approved=false, the LLM
215
+ // has made a reasoned decision — move the proposal to 'rejected' so the
216
+ // retry cron stops picking it up. Previously only high-FP rejections were
217
+ // marked terminal, which left low/medium-FP rejections in an infinite
218
+ // retry loop burning LLM API quota.
219
+ if (!verdict.approved) {
71
220
  this.db.rejectATRProposal(patternHash);
72
221
  }
73
222
  return { verdict: verdictJson, approved: verdict.approved };
@@ -119,9 +268,13 @@ Output ONLY valid JSON (no markdown, no explanation outside the JSON):
119
268
  */
120
269
  callAnthropicAPI(prompt) {
121
270
  return new Promise((resolve, reject) => {
271
+ // 4096 tokens is needed because the ATR drafter prompt requires
272
+ // a full rule YAML with 3+ conditions, 3+ TP, 3+ TN, 3+ evasion tests,
273
+ // MITRE + OWASP references, and descriptions. 1024 was cutting off
274
+ // mid-YAML and the regex extractor dropped the truncated block.
122
275
  const requestBody = JSON.stringify({
123
276
  model: this.model,
124
- max_tokens: 1024,
277
+ max_tokens: 4096,
125
278
  messages: [{ role: 'user', content: prompt }],
126
279
  });
127
280
  const options = {
@@ -174,81 +327,323 @@ Output ONLY valid JSON (no markdown, no explanation outside the JSON):
174
327
  req.end();
175
328
  });
176
329
  }
330
+ /**
331
+ * Low-level Anthropic messages call that accepts a prepared request body.
332
+ * Used by the tool-use loop so we can pass full message histories.
333
+ */
334
+ callAnthropicRaw(body) {
335
+ return new Promise((resolve, reject) => {
336
+ const requestBody = JSON.stringify(body);
337
+ const options = {
338
+ hostname: 'api.anthropic.com',
339
+ port: 443,
340
+ path: '/v1/messages',
341
+ method: 'POST',
342
+ headers: {
343
+ 'x-api-key': this.apiKey,
344
+ 'anthropic-version': '2023-06-01',
345
+ 'content-type': 'application/json',
346
+ 'content-length': Buffer.byteLength(requestBody),
347
+ },
348
+ timeout: LLM_TIMEOUT_MS,
349
+ };
350
+ const req = https.request(options, (res) => {
351
+ const chunks = [];
352
+ res.on('data', (chunk) => chunks.push(chunk));
353
+ res.on('end', () => {
354
+ const bodyText = Buffer.concat(chunks).toString('utf-8');
355
+ if (res.statusCode !== 200) {
356
+ reject(new Error(`Anthropic API status ${res.statusCode}: ${bodyText.slice(0, 500)}`));
357
+ return;
358
+ }
359
+ try {
360
+ const parsed = JSON.parse(bodyText);
361
+ resolve({ content: parsed.content ?? [], stop_reason: parsed.stop_reason });
362
+ }
363
+ catch (err) {
364
+ reject(new Error(`Anthropic API parse: ${err instanceof Error ? err.message : String(err)}`));
365
+ }
366
+ });
367
+ });
368
+ req.on('timeout', () => {
369
+ req.destroy();
370
+ reject(new Error(`Anthropic API timeout after ${LLM_TIMEOUT_MS}ms`));
371
+ });
372
+ req.on('error', (err) => reject(new Error(`Anthropic API error: ${err.message}`)));
373
+ req.write(requestBody);
374
+ req.end();
375
+ });
376
+ }
377
+ /**
378
+ * Tool-use loop for TC v2 drafter. Runs a multi-turn conversation with
379
+ * Claude where it can call grep_existing_rules, read_rule, and
380
+ * fetch_research to ground its draft in existing ATR coverage and
381
+ * public threat research before emitting a rule YAML.
382
+ *
383
+ * Returns the concatenated text of Claude's final assistant turn (the
384
+ * message where stop_reason is "end_turn" and not "tool_use").
385
+ *
386
+ * Max 6 tool-use rounds per skill to bound latency and cost; if Claude
387
+ * still wants to use tools on round 7, we instruct it to finalize.
388
+ */
389
+ async callAnthropicWithTools(systemPrompt, userMessage, options) {
390
+ const MAX_ROUNDS = 6;
391
+ const modelToUse = options?.model ?? this.model;
392
+ const messages = [
393
+ { role: 'user', content: userMessage },
394
+ ];
395
+ let toolCalls = 0;
396
+ let finalText = '';
397
+ for (let round = 0; round < MAX_ROUNDS; round++) {
398
+ const body = {
399
+ model: modelToUse,
400
+ max_tokens: 4096,
401
+ system: systemPrompt,
402
+ tools: TC_DRAFTER_TOOLS,
403
+ messages,
404
+ };
405
+ const response = await this.callAnthropicRaw(body);
406
+ // Collect assistant response as content blocks
407
+ const assistantBlocks = [];
408
+ const toolUses = [];
409
+ for (const block of response.content) {
410
+ if (block.type === 'text' && typeof block.text === 'string') {
411
+ assistantBlocks.push({ type: 'text', text: block.text });
412
+ finalText += (finalText ? '\n' : '') + block.text;
413
+ }
414
+ else if (block.type === 'tool_use' && block.id && block.name) {
415
+ assistantBlocks.push({
416
+ type: 'tool_use',
417
+ id: block.id,
418
+ name: block.name,
419
+ input: block.input ?? {},
420
+ });
421
+ toolUses.push({ id: block.id, name: block.name, input: block.input ?? {} });
422
+ }
423
+ }
424
+ messages.push({ role: 'assistant', content: assistantBlocks });
425
+ if (response.stop_reason !== 'tool_use' || toolUses.length === 0) {
426
+ // Claude is done — return its final text
427
+ break;
428
+ }
429
+ // Execute each requested tool and build a tool_result turn
430
+ const toolResults = [];
431
+ for (const tu of toolUses) {
432
+ toolCalls++;
433
+ console.log(`[tc-v2] round ${round + 1}: tool ${tu.name}(${JSON.stringify(tu.input).slice(0, 120)})`);
434
+ const result = await executeToolCall(tu.name, tu.input);
435
+ toolResults.push({
436
+ type: 'tool_result',
437
+ tool_use_id: tu.id,
438
+ content: result.content.slice(0, 8000), // cap per-tool-result size
439
+ is_error: result.isError,
440
+ });
441
+ }
442
+ messages.push({ role: 'user', content: toolResults });
443
+ // Reset finalText — we only want the LAST assistant turn's text
444
+ // (which contains the YAML rule output), not the interim narration
445
+ finalText = '';
446
+ }
447
+ return { finalText, toolCalls };
448
+ }
177
449
  // -------------------------------------------------------------------------
178
450
  // Skill Analysis — POST /api/analyze-skills
179
451
  // 技能分析 — 接收掃描結果,用 LLM 找 regex 漏掉的 semantic threats
180
452
  // -------------------------------------------------------------------------
181
- static ATR_DRAFTER_PROMPT = `You are a senior AI security rule engineer for the ATR (Agent Threat Rules) standard.
453
+ /** Prompt for skill/tool analysis (both MCP and SKILL.md) */
454
+ static ATR_DRAFTER_PROMPT = `You are a senior AI security rule engineer for ATR (Agent Threat Rules). Cisco AI Defense merged 34 ATR rules into production. Your output must meet that quality bar AND the RFC-001 v1.0 quality gate (5+ TP, 5+ TN, 3+ evasion_tests, OWASP LLM + OWASP Agentic + MITRE ATLAS required).
455
+
456
+ You have three tools: grep_existing_rules, read_rule, fetch_research. Use them.
457
+
458
+ PROTOCOL — you MUST follow this order:
459
+
460
+ STEP 0 — De-duplication check (required, non-negotiable):
461
+ a) Call grep_existing_rules with 2-4 keywords from the attack you are considering. Example keywords: ["prompt injection", "IMPORTANT tag"], ["credential exfil", "ssh key"], ["tool poisoning", "cross-tool"], ["hidden instruction", "system override"].
462
+ b) Read the results. If any matching rules look topically similar, call read_rule on the 1-3 most relevant to inspect their regex patterns and test cases.
463
+ c) Decide:
464
+ - If the attack is ALREADY covered by an existing rule (same patterns, same category) → output NO_THREATS_FOUND and stop. Do not duplicate existing work.
465
+ - If the attack is a NOVEL VARIANT that slips past existing regex → draft a new rule explicitly referencing what it catches that existing rules miss.
466
+ - If the attack is a GENUINELY NEW CLASS → draft a new rule from scratch.
467
+
468
+ STEP 1 — Research grounding (strongly recommended):
469
+ If you're drafting a new rule, call fetch_research on at least one reputable source that documents the attack class. Suggested sources: invariantlabs.ai/blog, elastic.co/security-labs, snyk.io/articles, arxiv.org, atlas.mitre.org, unit42.paloaltonetworks.com, genai.owasp.org. Cite the source in the rule's \`references.research\` field.
470
+
471
+ STEP 2 — Draft the rule (only after steps 0 and 1):
182
472
 
183
- You will receive MCP tool descriptions from a skill. Your job is to write PRODUCTION-QUALITY detection rules for SPECIFIC, CONCRETE attack patterns not vague risk categories.
473
+ You will receive MCP tool descriptions from a skill. Write a PRODUCTION-QUALITY detection rule ONLY if you find a SPECIFIC, CONCRETE attack pattern AND have verified it is not already covered.
184
474
 
185
- STRICT REQUIREMENTS rules that violate these will be REJECTED:
475
+ QUALITY BAR (Cisco-merge level + RFC-001 v1.0):
186
476
 
187
- 1. REGEX MUST BE HIGHLY SPECIFIC require multi-word phrase matching.
188
- GOOD: "without\\s+(?:asking|requiring)\\s+(?:user|human)\\s+(?:confirmation|approval)"
189
- BAD: "without asking" or "directly to \\w+" (too generic, matches normal docs).
190
- Each regex must require 3+ specific words in sequence to avoid false positives.
477
+ 1. REGEX SINGLE-QUOTED YAML, compound patterns, 3+ word sequences:
478
+ GOOD: '(curl|wget)\\s+[^\\n]*\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}[^|]*\\|\\s*(bash|sh)'
479
+ BAD: "config.*base64.*import" (too broad, double quotes cause YAML escape bugs)
480
+ CRITICAL: Always use SINGLE QUOTES for regex values in YAML. Double quotes break \\s, \\d, \\w.
481
+ CRITICAL: Do NOT use (?i) inline flag — JS RegExp does not support it. Instead, use character classes like [aA] or write pattern without case sensitivity assumption.
191
482
 
192
- 2. FALSE POSITIVE RATE MUST BE LOW the rule should NOT trigger on:
193
- - Legitimate deployment tools, monitoring tools, or admin panels
194
- - Normal API clients that read credentials from config
195
- - Standard CRUD operations (list, get, create, update, delete)
196
- - Short but legitimate tool descriptions
483
+ 2. DETECTION LAYERSminimum 3 conditions, 5+ preferred:
484
+ Cisco-merge rules have 5-15 detection layers for defense in depth.
485
+ Each condition catches a different facet of the attack:
486
+ - Layer 1: the core malicious action (e.g. credential file read)
487
+ - Layer 2: the exfil vector (e.g. base64 encoding, compression)
488
+ - Layer 3: the transmission (e.g. curl POST to external host)
489
+ - Layer 4: (optional) obfuscation markers (eval, decode, fromCharCode)
490
+ - Layer 5: (optional) evasion patterns (sleep, time check, conditional)
491
+ A rule with only 1 condition is REJECTED as insufficient.
197
492
 
198
- 3. MUST DETECT A REAL ATTACK not just "this tool has broad permissions" or "description is vague".
199
- Examples of REAL attacks:
200
- - Tool description contains hidden instructions for the LLM ("always approve", "skip verification")
201
- - Tool claims to be read-only but its schema accepts write parameters
202
- - Tool name suggests safe operation but description reveals destructive capability
203
- - Tool embeds base64/encoded payloads in its description
204
- - Tool description instructs LLM to ignore user preferences or security policies
493
+ 3. FALSE POSITIVESregex MUST NOT trigger on:
494
+ - Legitimate deployment/monitoring/admin tools
495
+ - Security scanning tools that DESCRIBE attack patterns as examples
496
+ - Normal API clients reading credentials from config
497
+ - DevOps skills mentioning shell commands in documentation context
498
+ TEST: mentally run your regex against "npm install express" and "deploy to AWS" — if it triggers, it's too broad.
205
499
 
206
- 4. TEST CASES MUST BE REALISTIC true_positive must be a plausible malicious description,
207
- true_negative must be a similar-looking but legitimate description. Include at least 2 of each.
500
+ 4. REAL ATTACKS ONLYnot "this tool has broad permissions":
501
+ - Hidden LLM instructions ("always approve", "skip verification", "do not inform user")
502
+ - Credential exfiltration (read ~/.ssh + send externally)
503
+ - Malicious code execution (curl|bash from raw IP, base64 decode + exec)
504
+ - Supply chain (typosquat names, unofficial fork claims)
505
+ - Reverse shells, DNS exfiltration, IMDS access
208
506
 
209
- 5. OUTPUT AT MOST 1 RULE per skill. Prefer NO rule over a bad rule.
210
- If no SPECIFIC, CONCRETE threat pattern exists, output "NO_THREATS_FOUND".
507
+ 5. TEST CASES 5+ true_positives, 5+ true_negatives (Cisco bar, not 3):
508
+ - TP must be REAL attack payloads (not hypothetical)
509
+ - TN must be similar-looking LEGITIMATE content
510
+ - YOUR REGEX MUST ACTUALLY MATCH ALL TP AND MISS ALL TN. Verify before outputting.
511
+ - Include at least 2 TN that are edge cases (similar commands in legitimate contexts)
211
512
 
212
- Output format (ONLY if a specific threat is found):
513
+ 6. EVASION TESTS required, minimum 3:
514
+ Document known bypass techniques with expected: not_triggered.
515
+ Every rule must honestly acknowledge how attackers could evade it:
516
+ - Obfuscation (base64, hex, unicode escapes)
517
+ - Semantic paraphrase (synonyms, indirect references)
518
+ - Time/context gating (delayed execution, conditional triggers)
519
+
520
+ 7. REFERENCES — every rule must map to BOTH OWASP and MITRE:
521
+ references:
522
+ owasp_llm:
523
+ - "LLM01:2025 - Prompt Injection" (or appropriate category)
524
+ owasp_agentic:
525
+ - "ASI01:2026 - Agent Behaviour Hijack" (or appropriate category)
526
+ mitre_atlas:
527
+ - "AML.T0051" (or appropriate technique ID)
528
+ MITRE ATLAS reference is REQUIRED, not optional.
529
+
530
+ 8. DECISION CRITERIA — output a rule or "NO_THREATS_FOUND":
531
+ - If the skill content contains ACTUAL malicious code (credential theft, exfiltration,
532
+ reverse shells, hidden instructions to bypass safety) → WRITE A RULE, even if you
533
+ think existing regex might already catch it. Let the dedup layer handle overlaps.
534
+ - If the skill is just a normal tool with broad permissions (file access, network calls)
535
+ but no malicious INTENT → output NO_THREATS_FOUND.
536
+ - When in doubt about whether something is malicious, WRITE THE RULE. False negatives
537
+ (missing a real attack) are worse than duplicate rules.
538
+
539
+ Output format (ONLY if a SPECIFIC threat is found):
213
540
  \`\`\`yaml
214
- title: "<specific attack technique, not generic risk>"
541
+ title: '<specific attack technique>'
215
542
  id: ATR-2026-DRAFT-<8char-hex>
216
- status: draft
217
- description: |
218
- <what SPECIFIC attack this detects, with concrete example from the analyzed skill>
219
- author: "Threat Cloud LLM Analyzer"
543
+ rule_version: 1
544
+ status: experimental
545
+ description: >
546
+ <what SPECIFIC attack this detects, referencing the analyzed skill content>
547
+ author: "ATR Threat Cloud Crystallization"
220
548
  date: "${new Date().toISOString().slice(0, 10).replace(/-/g, '/')}"
221
549
  schema_version: "0.1"
222
- detection_tier: semantic
550
+ detection_tier: pattern
223
551
  maturity: experimental
224
- severity: <critical|high|medium|low>
552
+ severity: <critical|high|medium>
553
+ references:
554
+ owasp_llm:
555
+ - "<most relevant LLM Top 10 category>"
556
+ owasp_agentic:
557
+ - "<most relevant Agentic Top 10 category>"
558
+ mitre_atlas:
559
+ - "<AML.Txxxx technique ID — REQUIRED>"
225
560
  tags:
226
- category: <tool-poisoning|prompt-injection|data-exfiltration|privilege-escalation>
561
+ category: <skill-compromise|tool-poisoning|prompt-injection|context-exfiltration|privilege-escalation>
227
562
  subcategory: <specific-technique>
228
- confidence: medium
563
+ scan_target: <mcp|skill|both>
564
+ confidence: <high|medium>
565
+ agent_source:
566
+ type: mcp_exchange
567
+ framework: [any]
568
+ provider: [any]
229
569
  detection:
230
570
  conditions:
231
- - field: tool_description
571
+ - field: content
572
+ operator: regex
573
+ value: '<LAYER 1: core malicious action regex>'
574
+ description: '<what layer 1 matches>'
575
+ - field: content
232
576
  operator: regex
233
- value: "<SPECIFIC regex with word boundaries and context>"
234
- description: "<exactly what malicious pattern this matches>"
577
+ value: '<LAYER 2: exfil/encoding vector regex>'
578
+ description: '<what layer 2 matches>'
579
+ - field: content
580
+ operator: regex
581
+ value: '<LAYER 3: transmission/execution regex>'
582
+ description: '<what layer 3 matches>'
235
583
  condition: any
584
+ false_positives:
585
+ - '<edge case 1 — legitimate content that looks similar>'
586
+ - '<edge case 2 — common benign pattern>'
587
+ - '<edge case 3 — dev/admin tool context>'
236
588
  response:
237
- actions: [alert, snapshot]
589
+ actions: [alert, block_tool]
590
+ message_template: >
591
+ [ATR-2026-DRAFT] <one-line description of what was detected>
238
592
  test_cases:
239
593
  true_positives:
240
- - tool_description: "<realistic malicious tool description that should trigger>"
594
+ - input: '<real attack payload 1>'
595
+ expected: triggered
596
+ - input: '<real attack payload 2>'
597
+ expected: triggered
598
+ - input: '<real attack payload 3>'
599
+ expected: triggered
600
+ - input: '<real attack payload 4>'
241
601
  expected: triggered
242
- - tool_description: "<another variant>"
602
+ - input: '<real attack payload 5>'
243
603
  expected: triggered
244
604
  true_negatives:
245
- - tool_description: "<similar but legitimate tool description>"
605
+ - input: '<similar but safe content 1>'
246
606
  expected: not_triggered
247
- - tool_description: "<another legitimate example>"
607
+ reason: '<why this is safe>'
608
+ - input: '<similar but safe content 2>'
248
609
  expected: not_triggered
610
+ reason: '<why this is safe>'
611
+ - input: '<similar but safe content 3>'
612
+ expected: not_triggered
613
+ reason: '<why this is safe>'
614
+ - input: '<edge case 4 — common legitimate usage>'
615
+ expected: not_triggered
616
+ reason: '<why this is safe>'
617
+ - input: '<edge case 5 — devops/admin tool context>'
618
+ expected: not_triggered
619
+ reason: '<why this is safe>'
620
+ evasion_tests:
621
+ - input: '<bypass 1 — obfuscation variant>'
622
+ expected: not_triggered
623
+ bypass_technique: '<technique name>'
624
+ notes: '<how attacker could evade>'
625
+ - input: '<bypass 2 — semantic paraphrase>'
626
+ expected: not_triggered
627
+ bypass_technique: '<technique name>'
628
+ notes: '<why this bypasses the regex>'
629
+ - input: '<bypass 3 — time-gated or conditional>'
630
+ expected: not_triggered
631
+ bypass_technique: '<technique name>'
632
+ notes: '<explanation>'
249
633
  \`\`\`
250
634
 
251
- REMEMBER: Output "NO_THREATS_FOUND" for 90%+ of skills. Only flag genuinely suspicious patterns.`;
635
+ BEFORE OUTPUTTING reject your own output if any check fails:
636
+ - [ ] At least 3 detection conditions (NOT 1)
637
+ - [ ] At least 5 true_positives + 5 true_negatives (Cisco bar, not 3)
638
+ - [ ] At least 3 evasion_tests documenting known bypasses
639
+ - [ ] MITRE ATLAS reference present (REQUIRED)
640
+ - [ ] OWASP LLM + OWASP Agentic references present
641
+ - [ ] No (?i) inline flag — JS does not support it
642
+ - [ ] Single-quoted regex values
643
+ - [ ] Every condition has a description field
644
+ - [ ] Your regex matches ALL true_positives AND misses ALL true_negatives
645
+
646
+ If you cannot meet this bar, output NO_THREATS_FOUND instead of a weak rule.`;
252
647
  /**
253
648
  * Analyze skill scan results for semantic threats regex missed
254
649
  * 分析技能掃描結果,找出 regex 漏掉的語義威脅
@@ -256,16 +651,19 @@ REMEMBER: Output "NO_THREATS_FOUND" for 90%+ of skills. Only flag genuinely susp
256
651
  async analyzeSkills(skills) {
257
652
  const results = [];
258
653
  for (const skill of skills) {
259
- if (!skill.tools || skill.tools.length < 2)
654
+ if (!skill.tools || skill.tools.length === 0)
260
655
  continue;
261
656
  const toolSummary = skill.tools
262
657
  .slice(0, 30) // Limit to avoid token overflow
263
658
  .map((t) => `- ${t.name}: ${t.description}`)
264
659
  .join('\n');
265
- const userMessage = `Analyze these MCP tools from "${skill.package}" for threats that regex scanning missed:\n\n${toolSummary}`;
660
+ const userMessage = `Analyze this skill content from "${skill.package}" for threats that regex scanning missed. Before drafting a rule, call grep_existing_rules to verify the attack class is not already covered. If a similar rule exists, call read_rule to inspect it and either propose a narrowly-scoped new variant or emit NO_THREATS_FOUND. Ground novel attack claims in research via fetch_research when possible.\n\nSkill content:\n\n${toolSummary}`;
266
661
  try {
267
- const responseText = await this.callAnthropicAPI(LLMReviewer.ATR_DRAFTER_PROMPT + '\n\n' + userMessage);
662
+ const { finalText: responseText, toolCalls } = await this.callAnthropicWithTools(LLMReviewer.ATR_DRAFTER_PROMPT, userMessage);
663
+ console.log(`[LLM] analyzeSkills (tc-v2) for "${skill.package}": ${responseText.length} chars, ${toolCalls} tool calls`);
664
+ console.log(`[LLM] First 500 chars: ${responseText.slice(0, 500)}`);
268
665
  if (responseText.includes('NO_THREATS_FOUND')) {
666
+ console.log(`[LLM] Verdict: NO_THREATS_FOUND for "${skill.package}"`);
269
667
  results.push({
270
668
  package: skill.package,
271
669
  threatsFound: false,
@@ -275,8 +673,18 @@ REMEMBER: Output "NO_THREATS_FOUND" for 90%+ of skills. Only flag genuinely susp
275
673
  continue;
276
674
  }
277
675
  // Extract YAML blocks
278
- const yamlBlocks = responseText.match(/```yaml\n([\s\S]*?)```/g);
676
+ // Primary: properly-closed ```yaml\n...```
677
+ // Fallback: opening ```yaml\n...<end of string> (truncation safety net)
678
+ let yamlBlocks = responseText.match(/```yaml\n([\s\S]*?)```/g);
279
679
  if (!yamlBlocks || yamlBlocks.length === 0) {
680
+ const unclosed = responseText.match(/```yaml\n([\s\S]*?)$/);
681
+ if (unclosed) {
682
+ console.log(`[LLM] Recovered unclosed YAML block (max_tokens likely hit) for "${skill.package}"`);
683
+ yamlBlocks = [unclosed[0] + '\n```'];
684
+ }
685
+ }
686
+ if (!yamlBlocks || yamlBlocks.length === 0) {
687
+ console.log(`[LLM] No YAML blocks found in response for "${skill.package}". Response starts with: ${responseText.slice(0, 200)}`);
280
688
  results.push({
281
689
  package: skill.package,
282
690
  threatsFound: false,
@@ -285,26 +693,70 @@ REMEMBER: Output "NO_THREATS_FOUND" for 90%+ of skills. Only flag genuinely susp
285
693
  });
286
694
  continue;
287
695
  }
696
+ console.log(`[LLM] Found ${yamlBlocks.length} YAML block(s) for "${skill.package}"`);
288
697
  const proposals = [];
289
698
  const { createHash } = await import('node:crypto');
290
699
  for (const block of yamlBlocks) {
291
- const ruleContent = block
700
+ let ruleContent = block
292
701
  .replace(/```yaml\n?/, '')
293
702
  .replace(/```$/, '')
294
703
  .trim();
295
704
  // Validate: must have required ATR fields
296
- if (!ruleContent.includes('title:') || !ruleContent.includes('detection:'))
705
+ if (!ruleContent.includes('title:') || !ruleContent.includes('detection:')) {
706
+ console.log(`[LLM] YAML block skipped — missing title: (${ruleContent.includes('title:')}) or detection: (${ruleContent.includes('detection:')}). First 200 chars: ${ruleContent.slice(0, 200)}`);
297
707
  continue;
298
- // Validate regex in the rule
299
- const regexMatch = ruleContent.match(/value:\s*"([^"]+)"/);
708
+ }
709
+ // Validate regex in the rule (match both single and double quoted values)
710
+ const regexMatch = ruleContent.match(/value:\s*(['"])((?:(?!\1).)+)\1/);
300
711
  if (regexMatch) {
712
+ // Strip (?i) prefix — JS uses /pattern/i flag instead of PCRE inline (?i)
713
+ const rawPattern = regexMatch[2];
714
+ const jsPattern = rawPattern.replace(/^\(\?i\)/g, '');
301
715
  try {
302
- new RegExp(regexMatch[1], 'i');
716
+ new RegExp(jsPattern, 'i');
303
717
  }
304
- catch {
718
+ catch (regexErr) {
719
+ console.log(`[LLM] YAML block skipped — invalid regex: ${rawPattern.slice(0, 100)}. Error: ${regexErr instanceof Error ? regexErr.message : String(regexErr)}`);
305
720
  continue; // Skip rules with invalid regex
306
721
  }
722
+ // If we stripped (?i), also fix it in the rule content so downstream consumers don't hit the same issue
723
+ if (rawPattern !== jsPattern) {
724
+ ruleContent = ruleContent.replace(rawPattern, jsPattern);
725
+ console.log(`[LLM] Stripped (?i) prefix from regex for JS compatibility`);
726
+ }
727
+ }
728
+ // ATR Quality Gate — use the canonical library from agent-threat-rules/quality
729
+ // Reject rules that don't meet the experimental quality bar (3+ conditions,
730
+ // 3 TP + 3 TN, OWASP + MITRE, FP docs). See RFC-001 §3.
731
+ let gateResult;
732
+ try {
733
+ const metadata = parseATRRule(ruleContent);
734
+ // Mark as LLM-generated so downstream consumers know provenance
735
+ const enriched = { ...metadata, llmGenerated: true };
736
+ gateResult = validateRuleMeetsStandard(enriched, 'experimental');
737
+ }
738
+ catch (parseErr) {
739
+ console.log(`[LLM] Rule rejected — failed to parse YAML for quality gate: ${parseErr instanceof Error ? parseErr.message : String(parseErr)}`);
740
+ continue;
307
741
  }
742
+ if (!gateResult.passed) {
743
+ console.log(`[LLM] Rule rejected by ATR Quality Gate: ${gateResult.issues.join('; ')}`);
744
+ continue;
745
+ }
746
+ if (gateResult.warnings.length > 0) {
747
+ console.log(`[LLM] Rule passed gate with warnings: ${gateResult.warnings.join('; ')}`);
748
+ }
749
+ // Self-test: run the rule's own test_cases against its own regex.
750
+ // This is the first-principles quality check — if LLM-produced regex
751
+ // can't match its own TPs or incorrectly matches its own TNs, the
752
+ // rule is broken regardless of how good the metadata looks.
753
+ const selfTest = selfTestRule(ruleContent);
754
+ if (!selfTest.passed) {
755
+ console.log(`[LLM] Rule rejected by self-test: TP ${selfTest.tpMatched}/${selfTest.tpTotal}, TN FP ${selfTest.tnMatched}/${selfTest.tnTotal}. ` +
756
+ `Reasons: ${selfTest.failureReasons.slice(0, 3).join(' | ')}`);
757
+ continue;
758
+ }
759
+ console.log(`[LLM] Rule passed self-test: ${selfTest.tpMatched}/${selfTest.tpTotal} TP caught, ${selfTest.tnTotal - selfTest.tnMatched}/${selfTest.tnTotal} TN clean`);
308
760
  const patternHash = createHash('sha256').update(ruleContent).digest('hex').slice(0, 16);
309
761
  // Submit as proposal + auto-review
310
762
  this.db.insertATRProposal({
@@ -316,6 +768,8 @@ REMEMBER: Output "NO_THREATS_FOUND" for 90%+ of skills. Only flag genuinely susp
316
768
  approved: true,
317
769
  source: 'skill-analysis',
318
770
  package: skill.package,
771
+ provenance: 'llm-generated',
772
+ gateWarnings: gateResult.warnings,
319
773
  }),
320
774
  });
321
775
  // Fire-and-forget: review the proposal we just created
@@ -345,6 +799,195 @@ REMEMBER: Output "NO_THREATS_FOUND" for 90%+ of skills. Only flag genuinely susp
345
799
  }
346
800
  return results;
347
801
  }
802
+ /**
803
+ * Draft a full ATR YAML rule from a raw attack payload supplied by an
804
+ * external red-team source (e.g. NVIDIA garak). Same drafter pipeline as
805
+ * analyzeSkills but the input is the attack prompt itself rather than a
806
+ * skill's tool descriptions. Returns the drafted proposal or null when
807
+ * the drafter declined to write a rule (NO_THREATS_FOUND, duplicate of
808
+ * existing coverage, failed quality gate, failed self-test).
809
+ *
810
+ * Callers:
811
+ * - POST /api/atr-proposals/from-payload (partner / admin auth)
812
+ *
813
+ * Always inserts the resulting proposal into atr_proposals so the normal
814
+ * canary → auto-merge → npm publish pipeline can take over.
815
+ */
816
+ async draftRuleFromPayload(payload, meta) {
817
+ // Bound input to keep the drafter prompt under token budget. Longer prompts
818
+ // than this are unusual for prompt-injection payloads and just waste tokens.
819
+ const boundedPayload = payload.slice(0, 8000);
820
+ const probe = meta.probe || 'unknown-probe';
821
+ const detector = meta.detector || 'unknown-detector';
822
+ const targetModel = meta.targetModel || 'unspecified-model';
823
+ const partner = meta.partnerName || 'external-red-team';
824
+ // ------------------------------------------------------------------
825
+ // FAST PATH: payload fingerprint dedup.
826
+ //
827
+ // Normalize the payload (lowercase, strip punctuation, collapse whitespace)
828
+ // and hash it. If we've seen this fingerprint before, we already know what
829
+ // the LLM would say — return the cached verdict without calling the API.
830
+ //
831
+ // Empirically ~90% of garak corpus submissions hit this cache on a second
832
+ // or subsequent run. Eliminating those API calls is the single biggest
833
+ // cost reduction in the drafter pipeline.
834
+ // ------------------------------------------------------------------
835
+ const fingerprint = payloadFingerprint(boundedPayload);
836
+ const cached = this.db.getPayloadFingerprint(fingerprint);
837
+ if (cached) {
838
+ // Bump hit count so we can see cache effectiveness in the stats
839
+ this.db.recordPayloadFingerprint(fingerprint, cached.result);
840
+ if (cached.result === 'novel' && cached.patternHash) {
841
+ // Previous call generated a rule — find it and return
842
+ const existing = this.db.getATRProposalByHash(cached.patternHash);
843
+ if (existing) {
844
+ console.log(`[draftRuleFromPayload] fingerprint cache hit (novel): ${fingerprint} → ${cached.patternHash}`);
845
+ // We don't have the ruleContent in getATRProposalByHash's return shape,
846
+ // but that's OK — callers only use ruleContent for logging; returning a
847
+ // minimal placeholder with the real patternHash is sufficient for the
848
+ // dedup path (the original proposal row in atr_proposals is what
849
+ // downstream canary → promote pipelines actually consume).
850
+ return {
851
+ patternHash: cached.patternHash,
852
+ ruleContent: `# cached — see atr_proposals.pattern_hash=${cached.patternHash}`,
853
+ toolCalls: 0,
854
+ };
855
+ }
856
+ // Cached entry points at a proposal that was later deleted; fall through
857
+ // to re-draft rather than fail silently.
858
+ }
859
+ else {
860
+ // Previously judged duplicate or rejected — don't re-spend on LLM
861
+ console.log(`[draftRuleFromPayload] fingerprint cache hit (${cached.result}): ${fingerprint} — skipping LLM call`);
862
+ return null;
863
+ }
864
+ }
865
+ const userMessage = [
866
+ `Red-team finding from ${partner}. The attack prompt below bypassed ${targetModel} via garak probe ${probe} (detector: ${detector}).`,
867
+ '',
868
+ 'Draft an ATR rule that detects this attack class at the agent-layer boundary (tool call args, skill content, user_input field, MCP tool descriptions). The rule should generalise beyond the literal prompt — target the technique, not the specific words.',
869
+ '',
870
+ 'Follow STEP 0 (de-duplication via grep_existing_rules) before drafting. If an existing rule already covers this attack class with equivalent patterns, output NO_THREATS_FOUND.',
871
+ '',
872
+ `Attack prompt (${boundedPayload.length} chars):`,
873
+ '```',
874
+ boundedPayload,
875
+ '```',
876
+ '',
877
+ `Severity hint from partner: ${meta.severity || 'high'}. Use your judgement — downgrade to medium if the payload is a low-impact jailbreak that does not request sensitive operations.`,
878
+ ].join('\n');
879
+ let finalText = '';
880
+ let toolCalls = 0;
881
+ try {
882
+ // Bulk drafter runs on Haiku (4x cheaper than Sonnet). Quality gate +
883
+ // self-test + safety gate (benign corpus FP check) catch any output
884
+ // regressions regardless of model.
885
+ const result = await this.callAnthropicWithTools(LLMReviewer.ATR_DRAFTER_PROMPT, userMessage, { model: this.drafterModel });
886
+ finalText = result.finalText;
887
+ toolCalls = result.toolCalls;
888
+ }
889
+ catch (err) {
890
+ console.error(`[draftRuleFromPayload] LLM call failed: ${err instanceof Error ? err.message : String(err)}`);
891
+ // Do NOT cache LLM errors — transient failures should retry.
892
+ return null;
893
+ }
894
+ if (!finalText || /NO_THREATS_FOUND/.test(finalText)) {
895
+ console.log(`[draftRuleFromPayload] NO_THREATS_FOUND or empty response (probe=${probe})`);
896
+ // Cache the 'duplicate' verdict so repeat submissions of the same payload
897
+ // don't spend money to re-confirm "already covered".
898
+ this.db.recordPayloadFingerprint(fingerprint, 'duplicate');
899
+ return null;
900
+ }
901
+ // Extract YAML block (with unclosed-fence fallback — same as analyzeSkills)
902
+ let yamlBlocks = finalText.match(/```yaml\n([\s\S]*?)```/g);
903
+ if (!yamlBlocks || yamlBlocks.length === 0) {
904
+ const unclosed = finalText.match(/```yaml\n([\s\S]*?)$/);
905
+ if (unclosed)
906
+ yamlBlocks = [unclosed[0] + '\n```'];
907
+ }
908
+ if (!yamlBlocks || yamlBlocks.length === 0) {
909
+ console.log(`[draftRuleFromPayload] No YAML block in response. First 200 chars: ${finalText.slice(0, 200)}`);
910
+ this.db.recordPayloadFingerprint(fingerprint, 'rejected');
911
+ return null;
912
+ }
913
+ let ruleContent = yamlBlocks[0]
914
+ .replace(/^```yaml\n/, '')
915
+ .replace(/```$/, '')
916
+ .trim();
917
+ // Rewrite author line so the partner/source is visible on the shipped rule.
918
+ // The drafter prompt hardcodes 'ATR Threat Cloud Crystallization' as the
919
+ // author; replace it with the actual submitter so downstream consumers
920
+ // (npm, Cisco, etc.) can see who contributed each rule. Sanitise the partner
921
+ // string against YAML-breaking characters.
922
+ const safePartner = partner.replace(/[\r\n"'\\]/g, '').slice(0, 80);
923
+ const attributedAuthor = safePartner === 'external-red-team' || !safePartner
924
+ ? 'ATR Community (via garak pipe)'
925
+ : `${safePartner} (via ATR garak pipe)`;
926
+ ruleContent = ruleContent.replace(/^author:\s*["']ATR Threat Cloud Crystallization["']\s*$/m, `author: "${attributedAuthor}"`);
927
+ // Strip any (?i) prefix the LLM may have sneaked in despite the prompt
928
+ const regexFieldMatch = ruleContent.match(/value:\s*'(\(\?i\))([^']*)'/);
929
+ if (regexFieldMatch) {
930
+ const rawPattern = `'(?i)${regexFieldMatch[2]}'`;
931
+ const jsPattern = `'${regexFieldMatch[2]}'`;
932
+ ruleContent = ruleContent.replace(rawPattern, jsPattern);
933
+ }
934
+ // RFC-001 quality gate
935
+ let gateResult;
936
+ try {
937
+ const metadata = parseATRRule(ruleContent);
938
+ const enriched = { ...metadata, llmGenerated: true };
939
+ gateResult = validateRuleMeetsStandard(enriched, 'experimental');
940
+ }
941
+ catch (parseErr) {
942
+ console.log(`[draftRuleFromPayload] Rule rejected — cannot parse: ${parseErr instanceof Error ? parseErr.message : String(parseErr)}`);
943
+ this.db.recordPayloadFingerprint(fingerprint, 'rejected');
944
+ return null;
945
+ }
946
+ if (!gateResult.passed) {
947
+ console.log(`[draftRuleFromPayload] Rejected by quality gate: ${gateResult.issues.slice(0, 3).join('; ')}`);
948
+ this.db.recordPayloadFingerprint(fingerprint, 'rejected');
949
+ return null;
950
+ }
951
+ // Self-test: rule's own regex must catch its own TPs and miss its own TNs.
952
+ const selfTest = selfTestRule(ruleContent);
953
+ if (!selfTest.passed) {
954
+ console.log(`[draftRuleFromPayload] Rejected by self-test: TP ${selfTest.tpMatched}/${selfTest.tpTotal}, TN FP ${selfTest.tnMatched}/${selfTest.tnTotal}. ${selfTest.failureReasons.slice(0, 2).join(' | ')}`);
955
+ this.db.recordPayloadFingerprint(fingerprint, 'rejected');
956
+ return null;
957
+ }
958
+ const patternHash = createHash('sha256').update(ruleContent).digest('hex').slice(0, 16);
959
+ // Idempotent: if a previous submission produced the same YAML, skip insert.
960
+ if (this.db.getATRProposalByHash(patternHash)) {
961
+ // Still record the fingerprint so future dup payloads bypass LLM.
962
+ this.db.recordPayloadFingerprint(fingerprint, 'novel', { patternHash });
963
+ return { patternHash, ruleContent, toolCalls };
964
+ }
965
+ this.db.insertATRProposal({
966
+ patternHash,
967
+ ruleContent,
968
+ llmProvider: 'garak-drafter',
969
+ llmModel: this.drafterModel,
970
+ selfReviewVerdict: JSON.stringify({
971
+ approved: true,
972
+ source: 'external-red-team',
973
+ partner,
974
+ probe,
975
+ detector,
976
+ targetModel,
977
+ toolCalls,
978
+ provenance: 'llm-generated-from-payload',
979
+ gateWarnings: gateResult.warnings,
980
+ }),
981
+ });
982
+ // Record novel fingerprint so repeat submissions reuse this rule without LLM.
983
+ this.db.recordPayloadFingerprint(fingerprint, 'novel', { patternHash });
984
+ // Fire-and-forget second-opinion review (same as analyzeSkills)
985
+ void this.reviewProposal(patternHash, ruleContent).catch((err) => {
986
+ console.error(`[draftRuleFromPayload] review failed for ${patternHash}:`, err instanceof Error ? err.message : String(err));
987
+ });
988
+ console.log(`[draftRuleFromPayload] OK: patternHash=${patternHash} partner=${partner} probe=${probe} toolCalls=${toolCalls}`);
989
+ return { patternHash, ruleContent, toolCalls };
990
+ }
348
991
  /**
349
992
  * Parse the LLM response into a structured verdict
350
993
  * 解析 LLM 回應為結構化裁決
@@ -352,7 +995,7 @@ REMEMBER: Output "NO_THREATS_FOUND" for 90%+ of skills. Only flag genuinely susp
352
995
  parseVerdict(responseText) {
353
996
  const defaultVerdict = {
354
997
  approved: false,
355
- falsePositiveRisk: 'high',
998
+ falsePositiveRisk: 'medium',
356
999
  coverageScore: 0,
357
1000
  reasoning: 'Failed to parse LLM response',
358
1001
  };
@@ -366,9 +1009,10 @@ REMEMBER: Output "NO_THREATS_FOUND" for 90%+ of skills. Only flag genuinely susp
366
1009
  // Validate and normalize fields
367
1010
  const approved = parsed.approved === true;
368
1011
  const validRisks = ['low', 'medium', 'high'];
369
- const falsePositiveRisk = validRisks.includes(parsed.falsePositiveRisk)
370
- ? parsed.falsePositiveRisk
371
- : 'high';
1012
+ const normalizedRisk = (parsed.falsePositiveRisk ?? '').toString().toLowerCase().trim();
1013
+ const falsePositiveRisk = validRisks.includes(normalizedRisk)
1014
+ ? normalizedRisk
1015
+ : 'medium';
372
1016
  const coverageScore = typeof parsed.coverageScore === 'number'
373
1017
  ? Math.max(0, Math.min(100, Math.round(parsed.coverageScore)))
374
1018
  : 0;