watchmyagents 0.8.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -3
- package/package.json +7 -3
- package/scripts/agents.js +218 -0
- package/scripts/fetch-anthropic.js +82 -55
- package/scripts/service.js +7 -5
- package/scripts/shield.js +91 -94
- package/src/sources/anthropic-managed.js +18 -0
- package/src/typology-weights.json +88 -0
- package/src/typology.js +398 -0
package/README.md
CHANGED
|
@@ -105,14 +105,15 @@ Each entry carries: `id`, `agent_id`, `framework`, `timestamp`, `action_type`, `
|
|
|
105
105
|
### `wma-fetch` — pull events from Anthropic Managed Agents
|
|
106
106
|
|
|
107
107
|
```bash
|
|
108
|
-
wma-fetch --agent-id <agent_id> [--session-id <sess_id>] [--since 1h]
|
|
108
|
+
wma-fetch (--agent-id <agent_id> | --all-agents) [--session-id <sess_id>] [--since 1h]
|
|
109
109
|
[--log-dir ./watchmyagents-logs] [--dump-raw]
|
|
110
110
|
[--watch [--interval 5m] [--upload]]
|
|
111
111
|
```
|
|
112
112
|
|
|
113
113
|
| Flag | Effect |
|
|
114
114
|
|---|---|
|
|
115
|
-
| `--agent-id agent_xxx` |
|
|
115
|
+
| `--agent-id agent_xxx` | Anthropic agent identifier (required unless `--all-agents`) |
|
|
116
|
+
| `--all-agents` | **Fleet mode** (requires `--watch`) — discover ALL agents under the key and watch them in a single process |
|
|
116
117
|
| `--since 1h` / `24h` / `7d` | Fetch window (default: all) |
|
|
117
118
|
| `--session-id sesn_xxx` | Limit to a single session |
|
|
118
119
|
| `--log-dir ./logs` | Where to write NDJSON (default `./watchmyagents-logs`) |
|
|
@@ -167,6 +168,21 @@ wma-inspect [path]
|
|
|
167
168
|
|
|
168
169
|
Outputs sections aligned with security audit needs: tokens summary, by-tool / by-action-type breakdowns, top tool destinations (URLs / queries), action-sequence transitions, tool error rates, p50/p95/max latency per tool, rate metrics.
|
|
169
170
|
|
|
171
|
+
### `wma-agents` — discover + classify your agents (typology)
|
|
172
|
+
|
|
173
|
+
Lists every Managed Agent under your key and classifies each one's **typology**
|
|
174
|
+
(one of 10 Guardian Core archetypes) from its OBSERVED behaviour in your local
|
|
175
|
+
logs — which drives the cold-start Shield template. Modèle C: reads local logs
|
|
176
|
+
only (tool-category fractions, never raw content) and transmits nothing.
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
wma-agents list [--log-dir ~/.watchmyagents/logs] [--json]
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
With fewer than ~50 observed events an agent stays `generic` (cold start) and
|
|
183
|
+
refines as activity accumulates. Re-classification to a *less strict* type is
|
|
184
|
+
gated (raised confidence + longer window) to resist mimicry-evasion.
|
|
185
|
+
|
|
170
186
|
## Automating — continuous monitoring
|
|
171
187
|
|
|
172
188
|
### `wma-service` — install as an always-on service (recommended)
|
|
@@ -180,7 +196,7 @@ export WMA_API_KEY="wma_..."
|
|
|
180
196
|
export WMA_FORTRESS_BASE_URL="https://<project>.supabase.co/functions/v1"
|
|
181
197
|
export WMA_SIGNALS_SALT="..." # stable per-customer salt
|
|
182
198
|
|
|
183
|
-
wma-service install --agent-id agent_01ABC... --interval 5m [--with-shield]
|
|
199
|
+
wma-service install (--agent-id agent_01ABC... | --all-agents) [--interval 5m] [--with-shield]
|
|
184
200
|
wma-service status
|
|
185
201
|
wma-service uninstall [--with-shield]
|
|
186
202
|
```
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "watchmyagents",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.9.0",
|
|
4
4
|
"description": "Security observability + real-time policy enforcement for AI agents. Local-first NDJSON capture with a continuous Watch daemon that auto-uploads anonymized signals, Shield CLI that blocks policy violations live (with policies pulled from Fortress cloud), anonymizer producing signals-only payloads, bidirectional sync with WatchMyAgents Fortress, and one-command install as an always-on launchd/systemd service — closing the recursive Watch→Guardian→Shield security loop.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"files": [
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
"scripts/anonymize.js",
|
|
12
12
|
"scripts/upload-fortress.js",
|
|
13
13
|
"scripts/service.js",
|
|
14
|
+
"scripts/agents.js",
|
|
14
15
|
"README.md",
|
|
15
16
|
"SECURITY.md",
|
|
16
17
|
"LICENSE"
|
|
@@ -21,15 +22,18 @@
|
|
|
21
22
|
"wma-shield": "scripts/shield.js",
|
|
22
23
|
"wma-anonymize": "scripts/anonymize.js",
|
|
23
24
|
"wma-upload-fortress": "scripts/upload-fortress.js",
|
|
24
|
-
"wma-service": "scripts/service.js"
|
|
25
|
+
"wma-service": "scripts/service.js",
|
|
26
|
+
"wma-agents": "scripts/agents.js"
|
|
25
27
|
},
|
|
26
28
|
"scripts": {
|
|
29
|
+
"test": "node --test",
|
|
27
30
|
"inspect": "node scripts/inspect.js",
|
|
28
31
|
"fetch": "node scripts/fetch-anthropic.js",
|
|
29
32
|
"shield": "node scripts/shield.js",
|
|
30
33
|
"anonymize": "node scripts/anonymize.js",
|
|
31
34
|
"upload-fortress": "node scripts/upload-fortress.js",
|
|
32
|
-
"service": "node scripts/service.js"
|
|
35
|
+
"service": "node scripts/service.js",
|
|
36
|
+
"agents": "node scripts/agents.js"
|
|
33
37
|
},
|
|
34
38
|
"engines": {
|
|
35
39
|
"node": ">=18.0.0"
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// wma-agents — discover all Managed Agents under your key and classify each
|
|
3
|
+
// agent's typology from its OBSERVED behaviour (for Shield template selection).
|
|
4
|
+
//
|
|
5
|
+
// Usage:
|
|
6
|
+
// wma-agents [list] [--log-dir ~/.watchmyagents/logs] [--json]
|
|
7
|
+
//
|
|
8
|
+
// Reads the local Watch logs (NEVER leaves the machine — Modèle C) and derives
|
|
9
|
+
// the anonymized behavioural FEATURE VECTOR per the typology spec:
|
|
10
|
+
// per-tool-category FRACTIONS (f_*), boolean local flags (flag_*), aux ratios
|
|
11
|
+
// (aux_*), and n_events. It then calls classifyAgentType() and prints the
|
|
12
|
+
// schema-conformant result. With <50 events an agent is `generic` (cold start)
|
|
13
|
+
// and refines as activity accumulates.
|
|
14
|
+
//
|
|
15
|
+
// Modèle C invariant: only counts/ratios/flags are computed here — never raw
|
|
16
|
+
// prompt/output content, never the agent display name. Nothing is transmitted.
|
|
17
|
+
//
|
|
18
|
+
// ANTHROPIC_API_KEY from env (or --api-key, discouraged).
|
|
19
|
+
|
|
20
|
+
import os from 'node:os';
|
|
21
|
+
import { readdir, stat } from 'node:fs/promises';
|
|
22
|
+
import { createReadStream } from 'node:fs';
|
|
23
|
+
import { createInterface } from 'node:readline';
|
|
24
|
+
import { join, resolve } from 'node:path';
|
|
25
|
+
import { listAgents } from '../src/sources/anthropic-managed.js';
|
|
26
|
+
import { classifyAgentType } from '../src/typology.js';
|
|
27
|
+
import { isValidAgentId, assertSafePathSegment } from '../src/validate.js';
|
|
28
|
+
|
|
29
|
+
/**
 * Minimal argv parser for the wma-agents CLI.
 *
 * Supported forms:
 *   --flag            → out.flag = true
 *   --key value       → out.key = 'value' (the next token is consumed unless it starts with "--")
 *   --key=value       → out.key = 'value'
 *   anything else     → appended to the positional list out._
 *
 * @param {string[]} argv - arguments after the script name.
 * @returns {{ _: string[] } & Record<string, string | true>} parsed options.
 */
function parseArgs(argv) {
  const out = { _: [] };
  // Keys that must never be overwritten by user input: `_` holds the
  // positionals (clobbering it would make the next push() throw) and
  // `__proto__` is not a real data slot on a plain object.
  const reserved = new Set(['_', '__proto__']);
  const assign = (key, value) => { if (!reserved.has(key)) out[key] = value; };

  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    if (!token.startsWith('--')) { out._.push(token); continue; }

    const body = token.slice(2);
    const eq = body.indexOf('=');
    if (eq > 0) {
      // --key=value: without this branch the whole "key=value" text became a
      // boolean flag name and the intended option was silently ignored.
      assign(body.slice(0, eq), body.slice(eq + 1));
      continue;
    }

    const next = argv[i + 1];
    if (next == null || next.startsWith('--')) {
      assign(body, true);
    } else {
      assign(body, next);
      i++;
    }
  }
  return out;
}
|
|
40
|
+
/**
 * Print a fatal error on stderr and terminate the process.
 * @param {string} msg - text shown after the "error: " prefix.
 * @param {number} [code=1] - process exit status.
 */
function die(msg, code = 1) {
  const line = `error: ${msg}\n`;
  process.stderr.write(line);
  process.exit(code);
}
|
|
41
|
+
/**
 * Write one tagged status line to stdout.
 * @param {string} msg - text shown after the "[wma-agents] " tag.
 */
function info(msg) {
  const line = `[wma-agents] ${msg}\n`;
  process.stdout.write(line);
}
|
|
42
|
+
|
|
43
|
+
// Action types that represent a TOOL invocation (the denominator for f_* tool
|
|
44
|
+
// fractions). Confirmed produced by src/sources/anthropic-managed.js.
|
|
45
|
+
const TOOL_ACTIONS = new Set(['tool_use', 'mcp_tool_use', 'custom_tool_use']);
|
|
46
|
+
|
|
47
|
+
// ──────────────────────────────────────────────────────────────────────────
|
|
48
|
+
// Tool-name → category mapping (Modèle C: name-based, no content). Managed
|
|
49
|
+
// Agents expose tools as an opaque bundle, so tool_name is free-text. We match
|
|
50
|
+
// the confirmed built-ins (web_search, web_fetch, bash) plus best-effort
|
|
51
|
+
// regexes for common tool names. A tool that matches nothing contributes to the
|
|
52
|
+
// denominator but to no category (honest: unknown ≠ inferred).
|
|
53
|
+
// ──────────────────────────────────────────────────────────────────────────
|
|
54
|
+
const CATEGORY_RULES = [
|
|
55
|
+
// category, matcher (lower-cased tool_name)
|
|
56
|
+
['search', (n) => /(^|_)web_search$|(^|_)search($|_)|google|brave/.test(n)],
|
|
57
|
+
['browser', (n) => /web_fetch|browser|playwright|puppeteer|navigate|screenshot/.test(n)],
|
|
58
|
+
['http', (n) => /(^|_)http|fetch_url|curl|request|webhook|api_call/.test(n)],
|
|
59
|
+
['code', (n) => /bash|shell|terminal|code_exec|exec_|python|node_run|run_code|interpreter/.test(n)],
|
|
60
|
+
['database', (n) => /sql|query_db|database|postgres|mysql|mongo|redis|bigquery|snowflake/.test(n)],
|
|
61
|
+
['email', (n) => /email|gmail|smtp|sendmail|mailgun|outlook/.test(n)],
|
|
62
|
+
['payment', (n) => /payment|charge|transfer|invoice|stripe|paypal|payout|refund|checkout/.test(n)],
|
|
63
|
+
['secret', (n) => /secret|vault|credential|kms|keychain|token_get/.test(n)],
|
|
64
|
+
['memory', (n) => /memory|retriev|vector|(^|_)rag($|_)|knowledge|embed|pinecone|chroma/.test(n)],
|
|
65
|
+
['file', (n) => /editor|str_replace|read_file|write_file|create_file|file_io|(^|_)file($|_)|fs_/.test(n)],
|
|
66
|
+
];
|
|
67
|
+
|
|
68
|
+
// Best-effort deploy detection (spec discriminator devops_infra vs coding).
|
|
69
|
+
const DEPLOY_RE = /deploy|terraform|kubectl|helm|(^|_)release($|_)|ansible|pulumi|cloudformation/;
|
|
70
|
+
|
|
71
|
+
/**
 * Map a tool name to the first category in CATEGORY_RULES whose matcher
 * accepts it. Matching is done on the lower-cased name; a missing name is
 * treated as the empty string.
 *
 * @param {unknown} toolName - tool name as recorded in the log entry.
 * @returns {string | null} category label, or null when no rule matches.
 */
function categoryOf(toolName) {
  const lowered = String(toolName || '').toLowerCase();
  const hit = CATEGORY_RULES.find(([, matches]) => matches(lowered));
  return hit ? hit[0] : null;
}
|
|
76
|
+
|
|
77
|
+
// Aggregate raw counts from an agent's local NDJSON logs (Modèle C: counts only).
|
|
78
|
+
/**
 * Stream one NDJSON file and hand every parsed object record to `onRecord`.
 * Best-effort by design: blank lines, unparseable lines, non-object JSON
 * values and read errors are skipped so one bad file cannot abort the scan.
 *
 * @param {string} path - file to read.
 * @param {(record: object) => void} onRecord - called once per object record.
 * @returns {Promise<void>} resolves when the file is exhausted or unreadable.
 */
function scanNdjson(path, onRecord) {
  return new Promise((done) => {
    const input = createReadStream(path, { encoding: 'utf8' });
    const rl = createInterface({ input, crlfDelay: Infinity });
    // The stream needs its own 'error' listener: without one, an unreadable
    // entry (directory named *.ndjson, permission denied, file removed
    // mid-scan) is an unhandled 'error' event that kills the process.
    input.on('error', () => { rl.close(); done(); });
    rl.on('error', done);
    rl.on('close', done);
    rl.on('line', (line) => {
      if (!line.trim()) return;
      let record;
      try { record = JSON.parse(line); } catch { return; }
      // JSON.parse('null') is valid JSON; dereferencing it would throw inside
      // this event handler and crash the process.
      if (record === null || typeof record !== 'object') return;
      onRecord(record);
    });
  });
}

/**
 * Aggregate raw counts from an agent's local NDJSON logs (Modèle C: counts only).
 *
 * Reads every `*.ndjson` file in `<logDir>/<agentId>` except those whose name
 * starts with `raw-`, and tallies:
 *   - actionCounts:   action_type → number of entries
 *   - categoryCounts: tool category (via categoryOf) → number of tool entries
 *   - toolEvents:     entries whose action_type is in TOOL_ACTIONS
 *   - deployUses:     tool entries whose lower-cased tool_name matches DEPLOY_RE
 *
 * @param {string} logDir - root log directory.
 * @param {string} agentId - agent identifier, used as the sub-directory name.
 * @returns {Promise<{actionCounts: Object<string, number>, categoryCounts: Object<string, number>, toolEvents: number, deployUses: number, hasLogs: boolean}>}
 *   `hasLogs` is false when the directory is missing, unreadable, or holds no
 *   eligible files.
 */
async function aggregate(logDir, agentId) {
  // Maps instead of plain objects: action_type comes from log content, and a
  // key such as "constructor" on a plain object would read an inherited value
  // and turn the counter into a string.
  const actionTally = new Map();
  const categoryTally = new Map();
  const bump = (tally, key) => tally.set(key, (tally.get(key) || 0) + 1);
  let toolEvents = 0; // denominator for f_* fractions
  let deployUses = 0;

  const snapshot = (hasLogs) => ({
    actionCounts: Object.fromEntries(actionTally),
    categoryCounts: Object.fromEntries(categoryTally),
    toolEvents,
    deployUses,
    hasLogs,
  });

  const dir = join(logDir, agentId);
  const s = await stat(dir).catch(() => null);
  if (!s || !s.isDirectory()) return snapshot(false);
  let names;
  try { names = await readdir(dir); } catch { return snapshot(false); }
  const files = names.filter((n) => n.endsWith('.ndjson') && !n.startsWith('raw-'));
  if (files.length === 0) return snapshot(false);

  for (const f of files) {
    await scanNdjson(join(dir, f), (e) => {
      // String() mirrors the key coercion a plain-object counter would apply.
      if (e.action_type) bump(actionTally, String(e.action_type));
      if (TOOL_ACTIONS.has(e.action_type)) {
        toolEvents += 1;
        const cat = categoryOf(e.tool_name);
        if (cat) bump(categoryTally, cat);
        if (DEPLOY_RE.test(String(e.tool_name || '').toLowerCase())) deployUses += 1;
      }
    });
  }
  return snapshot(true);
}
|
|
110
|
+
|
|
111
|
+
// Features that the WMA NDJSON logs CANNOT reliably expose today (opaque tool
|
|
112
|
+
// names / no behavioural signal / content off-limits under Modèle C). They
|
|
113
|
+
// default to 0/false unless a tool name happens to match a CATEGORY_RULES pattern (f_* only); the caller prints a one-line note.
|
|
114
|
+
const NON_DERIVABLE = [
|
|
115
|
+
'f_database', 'f_email', 'f_payment', 'f_secret', 'f_memory',
|
|
116
|
+
'flag_internal_sys', 'flag_on_behalf', 'aux_untrusted', 'aux_sensitive',
|
|
117
|
+
];
|
|
118
|
+
|
|
119
|
+
// Build the canonical anonymized FEATURE VECTOR from the aggregated counts.
|
|
120
|
+
// Fractions = category_count / toolEvents. n_events = total observed events.
|
|
121
|
+
/**
 * Build the anonymized feature vector from aggregated counts.
 *
 * Tool-category fractions (f_code … f_file) are category_count / toolEvents;
 * event-type fractions (f_handoff, f_user_msg) are over all counted events;
 * n_events is the sum of all action counts.
 *
 * Defensive against malformed aggregates: missing tables are treated as
 * empty, counts that are not positive finite numbers are ignored, and every
 * fraction is clamped to at most 1.
 *
 * @param {{actionCounts?: Object<string, number>, categoryCounts?: Object<string, number>, toolEvents?: number, deployUses?: number}} agg
 * @returns {Object<string, number>} feature vector (f_*, flag_*, aux_*, n_events).
 */
function buildFeatures(agg) {
  const asTable = (t) => (t !== null && typeof t === 'object' ? t : {});
  const asCount = (v) => (typeof v === 'number' && Number.isFinite(v) && v > 0 ? v : 0);
  // Own-property lookup only, so inherited members never count as data.
  const read = (table, key) => (Object.hasOwn(table, key) ? asCount(table[key]) : 0);

  const source = asTable(agg);
  const actionCounts = asTable(source.actionCounts);
  const categoryCounts = asTable(source.categoryCounts);
  const toolEvents = asCount(source.toolEvents);
  const deployUses = asCount(source.deployUses);

  const nEvents = Object.keys(actionCounts).reduce((sum, k) => sum + read(actionCounts, k), 0);
  const frac = (c) => (toolEvents > 0 ? Math.min(1, read(categoryCounts, c) / toolEvents) : 0);
  const eventFrac = (...types) => (nEvents > 0
    ? Math.min(1, types.reduce((sum, t) => sum + read(actionCounts, t), 0) / nEvents)
    : 0);

  // f_handoff / f_user_msg are derived from event TYPE (not tool category).
  const handoff = eventFrac('thread_message_sent', 'thread_message_received', 'thread_created');
  const userMsg = eventFrac('user_message');

  // aux_autonomy ≈ 1 − (human-in-the-loop event share): user_message,
  // user_interrupt and tool_confirmation are counted as human involvement.
  // Heuristic.
  const hitlShare = eventFrac('user_message', 'user_interrupt', 'tool_confirmation');
  const auxAutonomy = nEvents > 0 ? Math.max(0, 1 - hitlShare) : 0;

  return {
    // tool-category fractions (over tool uses)
    f_code: frac('code'),
    f_browser: frac('browser'),
    f_database: frac('database'),
    f_http: frac('http'),
    f_email: frac('email'),
    f_payment: frac('payment'),
    f_secret: frac('secret'),
    f_search: frac('search'),
    f_memory: frac('memory'),
    f_file: frac('file'),
    // event-type fractions (over all events)
    f_handoff: handoff,
    f_user_msg: userMsg,
    // discriminator flags: only flag_deploy is derived from the counts
    flag_deploy: deployUses > 0 ? 1 : 0,
    flag_internal_sys: 0,
    flag_on_behalf: 0,
    // aux ratios
    aux_autonomy: auxAutonomy,
    aux_untrusted: 0,
    aux_sensitive: 0,
    // window size
    n_events: nEvents,
  };
}
|
|
168
|
+
|
|
169
|
+
/**
 * CLI entry point: list the agents under the API key, classify each one from
 * its local logs, and print the result as JSON (--json) or as a text report.
 *
 * Exits via die() on an unknown command, a missing API key, a value-less
 * --api-key/--log-dir, or a failed agent listing.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));
  if (args._[0] && args._[0] !== 'list') die(`unknown command "${args._[0]}" (only "list" supported)`);

  // parseArgs stores `true` for a flag given without a value. Reject that
  // here: `true` would otherwise reach resolve() (TypeError) or be used as
  // the API key.
  for (const flag of ['api-key', 'log-dir']) {
    if (args[flag] === true) die(`--${flag} requires a value`);
  }

  const apiKey = args['api-key'] || process.env.ANTHROPIC_API_KEY;
  if (!apiKey) die('--api-key or ANTHROPIC_API_KEY required');
  if (args['api-key']) process.stderr.write('[wma-agents] WARNING: --api-key is visible in shell history; prefer ANTHROPIC_API_KEY env\n');
  const logDir = resolve(args['log-dir'] || join(os.homedir(), '.watchmyagents', 'logs'));
  const asJson = !!args.json;

  let agents;
  try { agents = await listAgents(apiKey); }
  catch (e) { die(`failed to list agents: ${e.message}`); }

  const results = [];
  for (const a of agents) {
    if (!a.id || !isValidAgentId(a.id)) continue;
    assertSafePathSegment(a.id, 'agent id');
    const agg = await aggregate(logDir, a.id);
    const features = buildFeatures(agg);
    features.agent_id = a.id;
    // No prior state threaded here (single-shot CLI snapshot).
    const cls = classifyAgentType(features);
    results.push({
      id: a.id,
      name: a.name || '(unnamed)', // shown for the human only — NOT a classification signal
      hasLogs: agg.hasLogs,
      ...cls,
    });
  }

  // JSON.stringify escapes control characters, so the raw name is safe here.
  if (asJson) { process.stdout.write(JSON.stringify(results, null, 2) + '\n'); return; }

  // The agent name is customer-set text. Strip C0/DEL/C1 control characters
  // before writing it to the terminal so it cannot inject escape sequences or
  // forge extra report lines.
  const printable = (s) => [...String(s ?? '')]
    .filter((ch) => {
      const cp = ch.codePointAt(0);
      return cp >= 32 && !(cp >= 127 && cp <= 159);
    })
    .join('');

  info(`discovered ${results.length} agent(s) - classified from local logs in ${logDir}`);
  info(`Modele C: features below default to 0 (logs don't expose them): ${NON_DERIVABLE.join(', ')}`);
  process.stdout.write('\n');
  for (const r of results) {
    const mods = (r.modifiers && r.modifiers.length) ? ` [+${r.modifiers.join(',')}]` : '';
    // Read evidence through one guarded reference so every field access
    // tolerates a result without an evidence object.
    const evidence = r.evidence ?? {};
    const overlay = evidence.payment_overlay ? ' (+transactional overlay)' : '';
    process.stdout.write(` ${printable(r.name)}\n`);
    process.stdout.write(` ${r.id}\n`);
    process.stdout.write(` -> ${r.classified_type} (conf ${Math.round(r.confidence * 100)}%, ${r.stage})${mods}${overlay}\n`);
    process.stdout.write(` evidence: ${evidence.window_events} events, top2=${evidence.top2_type}, margin=${evidence.margin}\n`);
    if (!r.hasLogs) process.stdout.write(' (no local logs yet - cold start)\n');
    process.stdout.write('\n');
  }
  info('type drives the cold-start Shield template (Guardian Core §8). The global-baseline floor applies regardless of classification.');
}
|
|
217
|
+
|
|
218
|
+
main().catch((e) => { process.stderr.write(`error: ${e.stack || e.message}\n`); process.exit(1); });
|
|
@@ -31,7 +31,7 @@ import { SignalsAggregator } from '../src/anonymizer.js';
|
|
|
31
31
|
import { resolveFortressBase, fortressEndpoint } from '../src/fortress/url.js';
|
|
32
32
|
import { isValidAgentId, isValidSessionId, assertSafePathSegment } from '../src/validate.js';
|
|
33
33
|
import {
|
|
34
|
-
getAgent, listSessions, fetchSessionEntries, fetchRawEvents,
|
|
34
|
+
getAgent, listAgents, listSessions, fetchSessionEntries, fetchRawEvents,
|
|
35
35
|
} from '../src/sources/anthropic-managed.js';
|
|
36
36
|
|
|
37
37
|
function parseArgs(argv) {
|
|
@@ -70,6 +70,9 @@ function parseSince(s) {
|
|
|
70
70
|
/**
 * Print a message on stderr and terminate the process.
 * @param {string} msg - text to print; a newline is appended.
 * @param {number} [code=1] - process exit status.
 */
function die(msg, code = 1) {
  const line = `${msg}\n`;
  process.stderr.write(line);
  process.exit(code);
}
|
|
71
71
|
/**
 * Write one tagged status line to stdout.
 * @param {string} msg - text shown after the "[wma-fetch] " tag.
 */
function info(msg) {
  const line = `[wma-fetch] ${msg}\n`;
  process.stdout.write(line);
}
|
|
72
72
|
/**
 * Write one tagged warning line to stderr.
 * @param {string} msg - text shown after the warning tag.
 */
function warn(msg) {
  const line = `[wma-fetch] ⚠️ ${msg}\n`;
  process.stderr.write(line);
}
|
|
73
|
+
// Strip control chars + truncate a customer-set agent name before it goes into
|
|
74
|
+
// a log line or the Fortress display_name (defense-in-depth vs log/payload injection).
|
|
75
|
+
/**
 * Sanitize a customer-set label: drop control characters, cap the length at
 * 60 UTF-16 code units, then trim surrounding whitespace.
 *
 * Removed characters: C0 controls (< U+0020), DEL (U+007F), C1 controls
 * (U+0080–U+009F, which include the single-character CSI escape introducer),
 * and the line/paragraph separators U+2028/U+2029.
 *
 * Truncation works on whole code points, so the result never ends in half of
 * a surrogate pair.
 *
 * @param {unknown} s - raw label; null/undefined become the empty string.
 * @returns {string} sanitized label, at most 60 UTF-16 code units long.
 */
function cleanLabel(s) {
  const MAX_UNITS = 60;
  const kept = [];
  let units = 0;
  for (const ch of String(s ?? '')) {
    const cp = ch.codePointAt(0);
    const isControl = cp < 32 || (cp >= 127 && cp <= 159) || cp === 0x2028 || cp === 0x2029;
    if (isControl) continue;
    // Stop before a character that would overflow the cap instead of cutting it.
    if (units + ch.length > MAX_UNITS) break;
    kept.push(ch);
    units += ch.length;
  }
  return kept.join('').trim();
}
|
|
73
76
|
|
|
74
77
|
function resolveModel(agent) {
|
|
75
78
|
const raw = agent.model || agent.config?.model || null;
|
|
@@ -198,10 +201,14 @@ async function fetchOneShot({ apiKey, agentId, model, logDir, since, sessionId,
|
|
|
198
201
|
process.stdout.write(`[wma-fetch] inspect with: npx wma-inspect ${logDir}\n`);
|
|
199
202
|
}
|
|
200
203
|
|
|
201
|
-
// ── CONTINUOUS / DAEMON
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
const
|
|
204
|
+
// ── CONTINUOUS / DAEMON (single agent or whole fleet) ───────────────────────
|
|
205
|
+
// `agents` = [{ agentId, model, displayName }]. One process watches them all.
|
|
206
|
+
async function runWatch({ apiKey, agents, logDir, intervalMs, uploadCtx }) {
|
|
207
|
+
const seenIds = new Set(); // stable Anthropic event ids already captured
|
|
208
|
+
for (const ag of agents) {
|
|
209
|
+
for (const id of await preloadSeenIds(logDir, ag.agentId)) seenIds.add(id);
|
|
210
|
+
}
|
|
211
|
+
const loggers = new Map(); // sessionId → Logger (session ids are globally unique)
|
|
205
212
|
const ended = new Set(); // sessions we've already closed with session_end
|
|
206
213
|
|
|
207
214
|
const ac = new AbortController();
|
|
@@ -209,56 +216,62 @@ async function runWatch({ apiKey, agentId, model, displayName, logDir, intervalM
|
|
|
209
216
|
process.on('SIGINT', shutdown);
|
|
210
217
|
process.on('SIGTERM', shutdown);
|
|
211
218
|
|
|
212
|
-
|
|
219
|
+
const fleet = agents.length > 1;
|
|
220
|
+
info(`watch mode — ${agents.length} agent(s), interval ${Math.round(intervalMs / 1000)}s, upload ${uploadCtx ? 'ON' : 'OFF'}, ${seenIds.size} known events preloaded`);
|
|
213
221
|
|
|
214
222
|
while (!ac.signal.aborted) {
|
|
215
223
|
const since = new Date(Date.now() - 24 * 3600 * 1000);
|
|
216
|
-
let sessions = [];
|
|
217
|
-
try { sessions = await listSessions(apiKey, { agentId, since }); }
|
|
218
|
-
catch (e) { warn(`listSessions failed: ${e.message}`); }
|
|
219
|
-
|
|
220
224
|
let cycleNew = 0;
|
|
221
|
-
for (const s of sessions) {
|
|
222
|
-
if (!s.id || ended.has(s.id)) continue;
|
|
223
|
-
let logger = loggers.get(s.id);
|
|
224
|
-
if (!logger) { logger = new Logger({ logDir, agentId, sessionId: s.id, silent: true }); loggers.set(s.id, logger); }
|
|
225
225
|
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
const written = await logger.write(entry);
|
|
233
|
-
fresh.push(written);
|
|
234
|
-
if (entry.action_type === 'state_transition'
|
|
235
|
-
&& entry.output?.scope === 'session'
|
|
236
|
-
&& entry.output?.state === 'terminated') sawTerminated = true;
|
|
237
|
-
}
|
|
238
|
-
} catch (e) { warn(`session ${s.id}: fetch failed: ${e.message}`); continue; }
|
|
226
|
+
for (const ag of agents) {
|
|
227
|
+
if (ac.signal.aborted) break;
|
|
228
|
+
const tag = fleet ? `[${ag.displayName}] ` : '';
|
|
229
|
+
let sessions = [];
|
|
230
|
+
try { sessions = await listSessions(apiKey, { agentId: ag.agentId, since }); }
|
|
231
|
+
catch (e) { warn(`${tag}listSessions failed: ${e.message}`); continue; }
|
|
239
232
|
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
233
|
+
for (const s of sessions) {
|
|
234
|
+
if (!s.id || ended.has(s.id)) continue;
|
|
235
|
+
let logger = loggers.get(s.id);
|
|
236
|
+
if (!logger) { logger = new Logger({ logDir, agentId: ag.agentId, sessionId: s.id, silent: true }); loggers.set(s.id, logger); }
|
|
243
237
|
|
|
244
|
-
|
|
238
|
+
const fresh = [];
|
|
239
|
+
let sawTerminated = false;
|
|
245
240
|
try {
|
|
246
|
-
const
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
241
|
+
for await (const entry of fetchSessionEntries({ apiKey, agentId: ag.agentId, sessionId: s.id, model: ag.model })) {
|
|
242
|
+
if (entry.id && seenIds.has(entry.id)) continue;
|
|
243
|
+
if (entry.id) seenIds.add(entry.id);
|
|
244
|
+
const written = await logger.write(entry);
|
|
245
|
+
fresh.push(written);
|
|
246
|
+
if (entry.action_type === 'state_transition'
|
|
247
|
+
&& entry.output?.scope === 'session'
|
|
248
|
+
&& entry.output?.state === 'terminated') sawTerminated = true;
|
|
249
|
+
}
|
|
250
|
+
} catch (e) { warn(`${tag}session ${s.id.slice(0, 16)}…: fetch failed: ${e.message}`); continue; }
|
|
251
|
+
|
|
252
|
+
if (fresh.length === 0) continue;
|
|
253
|
+
cycleNew += fresh.length;
|
|
254
|
+
info(`${tag}session ${s.id.slice(0, 16)}…: +${fresh.length} new event(s)`);
|
|
250
255
|
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
256
|
+
if (uploadCtx) {
|
|
257
|
+
try {
|
|
258
|
+
const resp = await uploadSignals(uploadCtx, ag.agentId, ag.displayName, fresh);
|
|
259
|
+
if (resp?.signal_id) info(` ↑ signals uploaded (signal_id ${resp.signal_id})`);
|
|
260
|
+
} catch (e) { warn(` signals upload failed: ${e.message}`); }
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
if (sawTerminated) {
|
|
264
|
+
const tracker = new TokenTracker();
|
|
265
|
+
for (const e of fresh) tracker.record(e);
|
|
266
|
+
const stats = tracker.stats().total;
|
|
267
|
+
await logger.write({
|
|
268
|
+
action_type: 'session_end', framework: 'anthropic-managed', status: 'ok', model: ag.model,
|
|
269
|
+
session_tokens: { input: stats.input, output: stats.output, cache_read: stats.cache_read, cache_creation: stats.cache_creation, total: stats.sum },
|
|
270
|
+
session_cost_usd: stats.cost_usd || null,
|
|
271
|
+
});
|
|
272
|
+
ended.add(s.id);
|
|
273
|
+
info(`${tag}session ${s.id.slice(0, 16)}… terminated — closed`);
|
|
274
|
+
}
|
|
262
275
|
}
|
|
263
276
|
}
|
|
264
277
|
|
|
@@ -275,10 +288,12 @@ async function main() {
|
|
|
275
288
|
const logDir = resolve(args['log-dir'] || './watchmyagents-logs');
|
|
276
289
|
const watch = !!args.watch;
|
|
277
290
|
const upload = !!args.upload;
|
|
291
|
+
const allAgents = !!args['all-agents'];
|
|
278
292
|
|
|
279
293
|
if (!apiKey) die('error: --api-key or ANTHROPIC_API_KEY required');
|
|
280
|
-
if (!agentId) die('error: --agent-id required (
|
|
281
|
-
if (!
|
|
294
|
+
if (!allAgents && !agentId) die('error: --agent-id required (or --all-agents for fleet mode)');
|
|
295
|
+
if (allAgents && !watch) die('error: --all-agents requires --watch (fleet daemon). For a one-shot, target a single --agent-id.');
|
|
296
|
+
if (agentId && !isValidAgentId(agentId)) {
|
|
282
297
|
die(`error: --agent-id has invalid format (expected "agent_" + alphanumeric, got "${agentId}")`);
|
|
283
298
|
}
|
|
284
299
|
const sessionIdArg = args['session-id'];
|
|
@@ -303,18 +318,30 @@ async function main() {
|
|
|
303
318
|
uploadCtx = { apiKey: wmaKey, salt, url: fortressEndpoint(base, 'ingest-signals') };
|
|
304
319
|
}
|
|
305
320
|
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
321
|
+
// Resolve the agent list: the whole fleet (--all-agents) or a single agent.
|
|
322
|
+
let agents;
|
|
323
|
+
if (allAgents) {
|
|
324
|
+
info('discovering agents (fleet mode)…');
|
|
325
|
+
const all = await listAgents(apiKey).catch((e) => die(`failed to list agents: ${e.message}`));
|
|
326
|
+
agents = all
|
|
327
|
+
.filter((a) => a.id && isValidAgentId(a.id))
|
|
328
|
+
.map((a) => ({ agentId: a.id, model: resolveModel(a), displayName: cleanLabel(a.name || a.id) }));
|
|
329
|
+
if (agents.length === 0) die('error: no agents found under this API key');
|
|
330
|
+
info(`fleet: ${agents.length} agent(s) — ${agents.map((a) => a.displayName).join(', ')}`);
|
|
331
|
+
} else {
|
|
332
|
+
info(`resolving agent ${agentId}…`);
|
|
333
|
+
const agent = await getAgent(apiKey, agentId).catch((e) => die(`failed to GET agent: ${e.message}`));
|
|
334
|
+
agents = [{ agentId, model: resolveModel(agent), displayName: cleanLabel(agent.name || agentId) }];
|
|
335
|
+
info(`model: ${agents[0].model || '(unknown)'}`);
|
|
336
|
+
}
|
|
311
337
|
|
|
312
338
|
if (watch) {
|
|
313
339
|
const intervalMs = parseDurationMs(args.interval, 5 * 60_000);
|
|
314
|
-
await runWatch({ apiKey,
|
|
340
|
+
await runWatch({ apiKey, agents, logDir, intervalMs, uploadCtx });
|
|
315
341
|
} else {
|
|
316
342
|
const since = args.since ? parseSince(args.since) : null;
|
|
317
|
-
|
|
343
|
+
const a = agents[0];
|
|
344
|
+
await fetchOneShot({ apiKey, agentId: a.agentId, model: a.model, logDir, since, sessionId: args['session-id'], dumpRaw: !!args['dump-raw'] });
|
|
318
345
|
}
|
|
319
346
|
}
|
|
320
347
|
|
package/scripts/service.js
CHANGED
|
@@ -251,9 +251,10 @@ function linuxUninstallOne(label) {
|
|
|
251
251
|
|
|
252
252
|
// ── Commands ────────────────────────────────────────────────────────────--
|
|
253
253
|
function cmdInstall(args) {
|
|
254
|
+
const allAgents = !!args['all-agents'];
|
|
254
255
|
const agentId = args['agent-id'];
|
|
255
|
-
if (!agentId) die('--agent-id required (
|
|
256
|
-
if (!isValidAgentId(agentId)) die(`--agent-id invalid format (expected "agent_" + alphanumeric, got "${agentId}")`);
|
|
256
|
+
if (!allAgents && !agentId) die('--agent-id required (or --all-agents to cover the whole fleet)');
|
|
257
|
+
if (agentId && !isValidAgentId(agentId)) die(`--agent-id invalid format (expected "agent_" + alphanumeric, got "${agentId}")`);
|
|
257
258
|
const interval = args.interval || '5m';
|
|
258
259
|
if (!/^\d+[smhd]$/.test(interval)) die(`--interval invalid format (expected like 30s, 5m, 1h, 2d; got "${interval}")`);
|
|
259
260
|
const logDir = args['log-dir'] || LOG_DIR_DEFAULT;
|
|
@@ -262,14 +263,15 @@ function cmdInstall(args) {
|
|
|
262
263
|
if (PLATFORM !== 'darwin' && PLATFORM !== 'linux') {
|
|
263
264
|
die(`unsupported platform "${PLATFORM}". Supported: macOS (launchd), Linux (systemd).\n` +
|
|
264
265
|
' Run the daemon manually or wrap it in your own process manager:\n' +
|
|
265
|
-
` wma-fetch --agent-id ${agentId} --watch --upload --interval ${interval}`);
|
|
266
|
+
` wma-fetch ${allAgents ? '--all-agents' : `--agent-id ${agentId}`} --watch --upload --interval ${interval}`);
|
|
266
267
|
}
|
|
267
268
|
|
|
268
269
|
mkdirSync(logDir, { recursive: true, mode: 0o700 });
|
|
269
270
|
writeEnvFile();
|
|
270
271
|
|
|
271
|
-
const
|
|
272
|
-
const
|
|
272
|
+
const target = allAgents ? ['--all-agents'] : ['--agent-id', agentId];
|
|
273
|
+
const watchArgs = [...target, '--watch', '--upload', '--interval', interval, '--log-dir', logDir];
|
|
274
|
+
const shieldArgs = [...target, '--policies-source', 'fortress', '--log-dir', logDir];
|
|
273
275
|
|
|
274
276
|
if (PLATFORM === 'darwin') {
|
|
275
277
|
macInstallOne(WATCH_LABEL, FETCH_SCRIPT, watchArgs);
|
package/scripts/shield.js
CHANGED
|
@@ -33,7 +33,7 @@ import {
|
|
|
33
33
|
getAgentConfig, detectAlwaysAsk,
|
|
34
34
|
} from '../src/shield/enforce.js';
|
|
35
35
|
import { DecisionLogger } from '../src/shield/decisions.js';
|
|
36
|
-
import { listSessions } from '../src/sources/anthropic-managed.js';
|
|
36
|
+
import { listSessions, listAgents } from '../src/sources/anthropic-managed.js';
|
|
37
37
|
import { FortressPolicySource, postDecision } from '../src/shield/sources/fortress.js';
|
|
38
38
|
import { resolveFortressBase } from '../src/fortress/url.js';
|
|
39
39
|
import { isValidAgentId, isValidSessionId } from '../src/validate.js';
|
|
@@ -423,10 +423,15 @@ async function main() {
|
|
|
423
423
|
explicitUrl: args['fortress-url'],
|
|
424
424
|
});
|
|
425
425
|
const logDir = resolve(args['log-dir'] || './watchmyagents-logs');
|
|
426
|
+
const allAgents = !!args['all-agents'];
|
|
426
427
|
|
|
427
428
|
if (!apiKey) die('error: --api-key or ANTHROPIC_API_KEY required');
|
|
428
|
-
if (!agentId) die('error: --agent-id required');
|
|
429
|
-
if (
|
|
429
|
+
if (!allAgents && !agentId) die('error: --agent-id required (or --all-agents for fleet mode)');
|
|
430
|
+
if (allAgents && singleSessionId) die('error: --all-agents is incompatible with --session-id');
|
|
431
|
+
if (allAgents && policiesSource !== 'fortress') {
|
|
432
|
+
die('error: --all-agents requires --policies-source fortress (per-agent policies).');
|
|
433
|
+
}
|
|
434
|
+
if (agentId && !isValidAgentId(agentId)) {
|
|
430
435
|
die(`error: --agent-id has invalid format (expected "agent_" + alphanumeric, got "${agentId}")`);
|
|
431
436
|
}
|
|
432
437
|
// --session-id ends up in the Anthropic SSE URL path (src/shield/stream.js).
|
|
@@ -435,120 +440,112 @@ async function main() {
|
|
|
435
440
|
die(`error: --session-id has invalid format (expected "sesn_" + alphanumeric, got "${singleSessionId}")`);
|
|
436
441
|
}
|
|
437
442
|
|
|
438
|
-
//
|
|
439
|
-
|
|
440
|
-
let
|
|
441
|
-
|
|
443
|
+
// Validate the policy source config once (shared across the fleet). For local
|
|
444
|
+
// mode the ruleset is loaded once and shared by every agent.
|
|
445
|
+
let sharedLocalRuleset = null;
|
|
442
446
|
if (policiesSource === 'fortress') {
|
|
443
447
|
if (!wmaApiKey) die('error: --policies-source fortress requires --wma-api-key or WMA_API_KEY env');
|
|
444
448
|
if (!fortressBase) die('error: --policies-source fortress requires --fortress-base-url or WMA_FORTRESS_BASE_URL env');
|
|
445
449
|
if (!/^wma_[a-f0-9]{32}$/i.test(wmaApiKey)) warn(`WMA_API_KEY format looks unusual (expected wma_<32hex>).`);
|
|
446
|
-
|
|
447
|
-
fortressPolicies = new FortressPolicySource({
|
|
448
|
-
apiKey: wmaApiKey,
|
|
449
|
-
base: fortressBase,
|
|
450
|
-
anthropicAgentId: agentId,
|
|
451
|
-
refreshIntervalMs: 5 * 60_000,
|
|
452
|
-
onError: (e) => warn(`policy refresh failed (keeping cached): ${e.message}`),
|
|
453
|
-
onRefresh: ({ policies, fetched_at, initial }) => {
|
|
454
|
-
info(`policies ${initial ? 'loaded' : 'refreshed'} from Fortress — ${policies.length} active (fetched_at: ${fetched_at})`);
|
|
455
|
-
},
|
|
456
|
-
});
|
|
457
|
-
try {
|
|
458
|
-
await fortressPolicies.start();
|
|
459
|
-
} catch (e) {
|
|
460
|
-
die(`error fetching policies from Fortress: ${e.message}\n` +
|
|
461
|
-
` Check WMA_FORTRESS_BASE_URL and WMA_API_KEY.`);
|
|
462
|
-
}
|
|
463
|
-
ruleset = fortressPolicies.current();
|
|
464
450
|
} else if (policiesSource === 'local') {
|
|
465
451
|
if (!policyPath) die('error: --policies-source local requires --policy <path-to-policies.json>');
|
|
466
|
-
try {
|
|
467
|
-
|
|
468
|
-
} catch (e) {
|
|
469
|
-
die(`error loading policies: ${e.message}`);
|
|
470
|
-
}
|
|
452
|
+
try { sharedLocalRuleset = await loadPolicies(resolve(policyPath)); }
|
|
453
|
+
catch (e) { die(`error loading policies: ${e.message}`); }
|
|
471
454
|
} else {
|
|
472
455
|
die('error: --policy <path> OR --policies-source fortress required');
|
|
473
456
|
}
|
|
474
457
|
|
|
475
|
-
|
|
476
|
-
let
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
? `Fortress (${fortressBase})`
|
|
486
|
-
: policyPath;
|
|
487
|
-
info(`armed — ${ruleset.policies.length} policies loaded from ${sourceLabel}`);
|
|
488
|
-
info(`default action when no rule matches: ${ruleset.default.action}`);
|
|
489
|
-
info(`agent: ${agentId}${agentMeta?.name ? ` "${agentMeta.name}"` : ''}`);
|
|
490
|
-
info(`enforcement mode: ${mode}`);
|
|
491
|
-
if (mode === 'interrupt') {
|
|
492
|
-
warn('DEGRADED mode — Shield will interrupt AFTER a violating tool runs.');
|
|
493
|
-
warn(`For pre-execution blocking, run: wma-shield --setup-guide --agent-id ${agentId}`);
|
|
458
|
+
// Resolve the agent list: whole fleet (--all-agents) or a single agent.
|
|
459
|
+
let agentIds;
|
|
460
|
+
if (allAgents) {
|
|
461
|
+
info('discovering agents (fleet mode)…');
|
|
462
|
+
const all = await listAgents(apiKey).catch((e) => die(`failed to list agents: ${e.message}`));
|
|
463
|
+
agentIds = all.map((a) => a.id).filter((id) => id && isValidAgentId(id));
|
|
464
|
+
if (agentIds.length === 0) die('error: no agents found under this API key');
|
|
465
|
+
info(`fleet: ${agentIds.length} agent(s)`);
|
|
466
|
+
} else {
|
|
467
|
+
agentIds = [agentId];
|
|
494
468
|
}
|
|
469
|
+
const fleet = agentIds.length > 1;
|
|
495
470
|
|
|
496
|
-
//
|
|
497
|
-
|
|
498
|
-
const
|
|
499
|
-
const
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
return loggers.get(sessionId);
|
|
471
|
+
// Shared infra: one shutdown signal, one fortress-source registry, one pusher.
|
|
472
|
+
const ac = new AbortController();
|
|
473
|
+
const fortressSources = [];
|
|
474
|
+
const shutdown = (sig) => {
|
|
475
|
+
info(`${sig} received, shutting down…`);
|
|
476
|
+
for (const fp of fortressSources) fp.stop();
|
|
477
|
+
ac.abort();
|
|
504
478
|
};
|
|
479
|
+
process.on('SIGINT', () => shutdown('SIGINT'));
|
|
480
|
+
process.on('SIGTERM', () => shutdown('SIGTERM'));
|
|
505
481
|
|
|
506
|
-
// Optional Fortress decision pusher
|
|
507
|
-
//
|
|
508
|
-
// and-forget extra channel if both are set.
|
|
482
|
+
// Optional Fortress decision pusher (each ctx carries its own agent id, so a
|
|
483
|
+
// single shared pusher tags decisions with the right agent).
|
|
509
484
|
const canPushToFortress = !!(wmaApiKey && fortressBase);
|
|
510
485
|
const pushDecisionToFortress = canPushToFortress
|
|
511
486
|
? async (decisionData) => {
|
|
512
|
-
try {
|
|
513
|
-
|
|
514
|
-
} catch (e) {
|
|
515
|
-
warn(`Fortress decision push failed: ${e.message}`);
|
|
516
|
-
}
|
|
487
|
+
try { await postDecision({ apiKey: wmaApiKey, base: fortressBase, decision: decisionData }); }
|
|
488
|
+
catch (e) { warn(`Fortress decision push failed: ${e.message}`); }
|
|
517
489
|
}
|
|
518
490
|
: null;
|
|
519
491
|
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
492
|
+
// Per-agent SETUP (separate from the long-running phase so we can COUNT how
|
|
493
|
+
// many actually armed). In fleet mode a per-agent startup failure is skipped
|
|
494
|
+
// (warn) instead of killing the fleet. Returns the agent's ctx, or null if skipped.
|
|
495
|
+
async function setupAgent(aid) {
|
|
496
|
+
const tag = fleet ? `[${aid.slice(0, 16)}…] ` : '';
|
|
497
|
+
let fortressPolicies = null;
|
|
498
|
+
let ruleset = sharedLocalRuleset;
|
|
499
|
+
if (policiesSource === 'fortress') {
|
|
500
|
+
fortressPolicies = new FortressPolicySource({
|
|
501
|
+
apiKey: wmaApiKey, base: fortressBase, anthropicAgentId: aid, refreshIntervalMs: 5 * 60_000,
|
|
502
|
+
onError: (e) => warn(`${tag}policy refresh failed (keeping cached): ${e.message}`),
|
|
503
|
+
onRefresh: ({ policies, fetched_at, initial }) => info(`${tag}policies ${initial ? 'loaded' : 'refreshed'} from Fortress — ${policies.length} active (fetched_at: ${fetched_at})`),
|
|
504
|
+
});
|
|
505
|
+
try { await fortressPolicies.start(); }
|
|
506
|
+
catch (e) {
|
|
507
|
+
if (fleet) { warn(`${tag}skipped — policy fetch failed: ${e.message}`); return null; }
|
|
508
|
+
die(`error fetching policies from Fortress: ${e.message}\n Check WMA_FORTRESS_BASE_URL and WMA_API_KEY.`);
|
|
509
|
+
}
|
|
510
|
+
fortressSources.push(fortressPolicies);
|
|
511
|
+
ruleset = fortressPolicies.current();
|
|
512
|
+
}
|
|
531
513
|
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
apiKey,
|
|
535
|
-
|
|
536
|
-
get ruleset() {
|
|
537
|
-
return fortressPolicies ? fortressPolicies.current() : ruleset;
|
|
538
|
-
},
|
|
539
|
-
mode,
|
|
540
|
-
decisions,
|
|
541
|
-
pushDecisionToFortress,
|
|
542
|
-
signalsSalt,
|
|
543
|
-
signal: ac.signal,
|
|
544
|
-
};
|
|
514
|
+
let mode = 'interrupt';
|
|
515
|
+
let agentMeta = null;
|
|
516
|
+
try { agentMeta = await getAgentConfig(apiKey, aid); if (detectAlwaysAsk(agentMeta)) mode = 'tool_confirmation'; }
|
|
517
|
+
catch (e) { warn(`${tag}could not fetch agent config (${e.message}). Defaulting to interrupt mode.`); }
|
|
545
518
|
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
519
|
+
info(`${tag}armed — ${ruleset.policies.length} policies · default ${ruleset.default.action} · mode ${mode}${agentMeta?.name ? ` · "${agentMeta.name}"` : ''}`);
|
|
520
|
+
if (mode === 'interrupt' && !fleet) {
|
|
521
|
+
warn('DEGRADED mode — Shield will interrupt AFTER a violating tool runs.');
|
|
522
|
+
warn(`For pre-execution blocking, run: wma-shield --setup-guide --agent-id ${aid}`);
|
|
523
|
+
}
|
|
524
|
+
|
|
525
|
+
const loggers = new Map();
|
|
526
|
+
const decisions = (sessionId) => {
|
|
527
|
+
if (!loggers.has(sessionId)) loggers.set(sessionId, new DecisionLogger({ logDir, agentId: aid, sessionId }));
|
|
528
|
+
return loggers.get(sessionId);
|
|
529
|
+
};
|
|
530
|
+
return {
|
|
531
|
+
apiKey, agentId: aid,
|
|
532
|
+
get ruleset() { return fortressPolicies ? fortressPolicies.current() : ruleset; },
|
|
533
|
+
mode, decisions, pushDecisionToFortress, signalsSalt, signal: ac.signal,
|
|
534
|
+
};
|
|
535
|
+
}
|
|
536
|
+
|
|
537
|
+
// Phase 1: arm every agent. Fail LOUD if none armed (otherwise the process would
|
|
538
|
+
// exit silently and — under launchd/systemd — restart-loop without a clear cause).
|
|
539
|
+
const ctxs = (await Promise.all(agentIds.map(setupAgent))).filter(Boolean);
|
|
540
|
+
if (ctxs.length === 0) {
|
|
541
|
+
die(`error: no agents could be armed (${agentIds.length} discovered; all policy fetches failed). Check WMA_API_KEY / WMA_FORTRESS_BASE_URL.`);
|
|
551
542
|
}
|
|
543
|
+
if (fleet) info(`armed ${ctxs.length}/${agentIds.length} agent(s); watching.`);
|
|
544
|
+
|
|
545
|
+
// Phase 2: run each agent's loop (blocks until SIGINT/SIGTERM).
|
|
546
|
+
await Promise.all(ctxs.map((ctx) => (
|
|
547
|
+
singleSessionId ? runSessionWorker({ sessionId: singleSessionId, ctx }) : runAgentWide(ctx)
|
|
548
|
+
)));
|
|
552
549
|
}
|
|
553
550
|
|
|
554
551
|
main().catch(e => {
|
|
@@ -77,6 +77,24 @@ export async function getAgent(apiKey, agentId) {
|
|
|
77
77
|
return getWithRetry(apiKey, `/v1/agents/${agentId}`);
|
|
78
78
|
}
|
|
79
79
|
|
|
80
|
+
/**
 * Collect every Managed Agent visible to the API key, following pagination.
 * Used for fleet mode (watch/shield/service --all-agents) and agent discovery.
 *
 * Pages are requested from `/v1/agents` with a `limit` query parameter; each
 * follow-up request passes the id of the last agent received as `after_id`.
 * Paging stops when the response has no truthy `has_more`, when a page is
 * empty, or when the last item of a page carries no usable id.
 *
 * @param {string} apiKey  API key forwarded to getWithRetry.
 * @param {object} [options]
 * @param {number} [options.limit=100]  Page size sent as the `limit` parameter.
 * @returns {Promise<object[]>} All agent objects, in the order they were returned.
 */
export async function listAgents(apiKey, { limit = 100 } = {}) {
  const collected = [];
  let cursor = null;
  for (;;) {
    const query = new URLSearchParams({ limit: String(limit) });
    if (cursor) query.set('after_id', cursor);

    const response = await getWithRetry(apiKey, `/v1/agents?${query}`);
    const batch = response.data || [];
    for (const agent of batch) collected.push(agent);

    const exhausted = !response.has_more || batch.length === 0;
    if (exhausted) return collected;

    // Without a cursor we cannot ask for the next page, so stop here.
    cursor = batch[batch.length - 1]?.id;
    if (!cursor) return collected;
  }
}
|
|
97
|
+
|
|
80
98
|
export async function listSessions(apiKey, { agentId, since, limit = 100 } = {}) {
|
|
81
99
|
const sessions = [];
|
|
82
100
|
let after = null;
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$comment": "WatchMyAgents — typology classifier weights + thresholds (Guardian Core, agent-typology-classification.spec.md §3/§4/§5). INVARIANT: weights and thresholds live HERE, never hardcoded in typology.js ('poids de signature en config, pas en dur'). Calibrate on labelled real traffic. Modèle C: all inputs are anonymized behavioural fractions/flags only.",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"updated_at": "2026-05-29T00:00:00Z",
|
|
5
|
+
|
|
6
|
+
"thresholds": {
|
|
7
|
+
"$comment": "§4 'Seuils par défaut (à calibrer)' + §5 downgrade asymmetry.",
|
|
8
|
+
"n_events_min": 50,
|
|
9
|
+
"confidence_min": 0.70,
|
|
10
|
+
"margin_min": 0.15,
|
|
11
|
+
"stable_windows": 3,
|
|
12
|
+
"downgrade_confidence_min": 0.85,
|
|
13
|
+
"downgrade_windows": 5,
|
|
14
|
+
"untrusted_modifier_min": 0.1,
|
|
15
|
+
"sensitive_modifier_min": 0.0,
|
|
16
|
+
"payment_overlay_min": 0.0,
|
|
17
|
+
"autonomy_modifier_min": 0.5,
|
|
18
|
+
"$comment_tie": "§8 conservative tie-break: when |score(top1)-score(top2)| <= tie_epsilon (a near/exact tie between two REAL types with real signal), select the STRICTER of the two rather than falling to the more-permissive generic — 'dans le doute, on reste sur le plus protecteur'. Set to 0 for exact-tie only.",
|
|
19
|
+
"tie_epsilon": 0.0
|
|
20
|
+
},
|
|
21
|
+
|
|
22
|
+
"confidence_sigmoid": {
|
|
23
|
+
"$comment": "§4 confidence = sigmoid(a·top1.score + b·margin + c·log(n_events)). All three coefficients live in config; a naive impl that only used top1.score would be wrong.",
|
|
24
|
+
"a": 4.0,
|
|
25
|
+
"b": 6.0,
|
|
26
|
+
"c": 0.6,
|
|
27
|
+
"bias": -3.5
|
|
28
|
+
},
|
|
29
|
+
|
|
30
|
+
"strictness_rank": {
|
|
31
|
+
"$comment": "§5 restriction ranking — derived from each template's baseline_policies enforcement severity (isolate>block>require_approval>throttle>monitor>warn). Higher rank = STRICTER. Drives re-classification asymmetry: to a stricter rank = normal threshold; to a looser rank = downgrade gate (conf>=0.85 AND 5 windows). NOT alphabetical.",
|
|
32
|
+
"devops_infra": 10,
|
|
33
|
+
"transactional_financial": 9,
|
|
34
|
+
"workflow_backoffice": 8,
|
|
35
|
+
"coding": 7,
|
|
36
|
+
"orchestrator": 6,
|
|
37
|
+
"browser_web": 5,
|
|
38
|
+
"personal_assistant": 4,
|
|
39
|
+
"data_rag": 3,
|
|
40
|
+
"generic": 2,
|
|
41
|
+
"customer_facing": 1
|
|
42
|
+
},
|
|
43
|
+
|
|
44
|
+
"features": {
|
|
45
|
+
"$comment": "Canonical anonymized feature keys (Modèle C). Fractions f_* in [0,1]; flag_* in {0,1}; aux_* in [0,1]. Order is informational only — scoring is key-addressed.",
|
|
46
|
+
"fractions": ["f_code", "f_browser", "f_database", "f_http", "f_email", "f_payment", "f_secret", "f_search", "f_memory", "f_handoff", "f_user_msg", "f_file"],
|
|
47
|
+
"flags": ["flag_deploy", "flag_internal_sys", "flag_on_behalf"],
|
|
48
|
+
"aux": ["aux_autonomy", "aux_untrusted", "aux_sensitive"]
|
|
49
|
+
},
|
|
50
|
+
|
|
51
|
+
"weights": {
|
|
52
|
+
"$comment": "w[type][feature] — signature weights (§3). Positive = signal for the type; negative = signal against. flag_* are the REQUIRED discriminators for the 3 inseparable pairs (coding/devops, data_rag/workflow, personal_assistant/workflow). 'generic' has no positive weights (pure fallback).",
|
|
53
|
+
|
|
54
|
+
"coding": {
|
|
55
|
+
"f_code": 1.0, "f_file": 0.5, "f_search": 0.3, "f_secret": 0.1,
|
|
56
|
+
"flag_deploy": -0.9
|
|
57
|
+
},
|
|
58
|
+
"devops_infra": {
|
|
59
|
+
"f_code": 0.7, "f_secret": 0.6, "f_file": 0.2,
|
|
60
|
+
"flag_deploy": 1.2
|
|
61
|
+
},
|
|
62
|
+
"data_rag": {
|
|
63
|
+
"f_database": 0.8, "f_search": 0.35, "f_memory": 0.7, "aux_untrusted": 0.2,
|
|
64
|
+
"flag_internal_sys": -0.7
|
|
65
|
+
},
|
|
66
|
+
"customer_facing": {
|
|
67
|
+
"f_user_msg": 1.0, "f_handoff": 0.3, "f_email": 0.2
|
|
68
|
+
},
|
|
69
|
+
"browser_web": {
|
|
70
|
+
"f_browser": 1.0, "f_http": 0.6, "f_search": 0.7
|
|
71
|
+
},
|
|
72
|
+
"orchestrator": {
|
|
73
|
+
"f_handoff": 1.2, "f_code": -0.2, "f_browser": -0.2, "f_database": -0.2
|
|
74
|
+
},
|
|
75
|
+
"workflow_backoffice": {
|
|
76
|
+
"f_database": 0.6, "f_http": 0.5, "f_file": 0.2,
|
|
77
|
+
"flag_internal_sys": 0.9, "flag_on_behalf": -0.6
|
|
78
|
+
},
|
|
79
|
+
"personal_assistant": {
|
|
80
|
+
"f_email": 0.8, "f_file": 0.4, "f_user_msg": 0.3,
|
|
81
|
+
"flag_on_behalf": 1.0
|
|
82
|
+
},
|
|
83
|
+
"transactional_financial": {
|
|
84
|
+
"f_payment": 1.5
|
|
85
|
+
},
|
|
86
|
+
"generic": {}
|
|
87
|
+
}
|
|
88
|
+
}
|
package/src/typology.js
ADDED
|
@@ -0,0 +1,398 @@
|
|
|
1
|
+
// Agent typology classifier — maps an agent's OBSERVED behaviour to one of the
|
|
2
|
+
// 10 Guardian Core archetypes, for Shield template selection / refinement.
|
|
3
|
+
//
|
|
4
|
+
// Source of truth: GUARDIAN CORE/agent-typology-classification.spec.md (v0.1) +
|
|
5
|
+
// GUARDIAN CORE/schemas/agent-classification.schema.json. classifyAgentType()
|
|
6
|
+
// returns an object conforming EXACTLY to that schema.
|
|
7
|
+
//
|
|
8
|
+
// Why behaviour, not config: Anthropic Managed Agents expose their tools as an
|
|
9
|
+
// opaque bundle (`agent_toolset_20260401`), so static config can't tell a
|
|
10
|
+
// researcher from a coder. We classify from anonymized behavioural signals
|
|
11
|
+
// (Modèle C): per-tool-category FRACTIONS (f_*), boolean local flags (flag_*),
|
|
12
|
+
// and aux ratios (aux_*). NEVER raw content — no prompts, no outputs, no names.
|
|
13
|
+
//
|
|
14
|
+
// ──────────────────────────────────────────────────────────────────────────
|
|
15
|
+
// GLOBAL-BASELINE INDEPENDENCE (spec §1, §5 — INVARIANT, read this):
|
|
16
|
+
// The `global-baseline` (5 mandatory fail_closed floors) ALWAYS applies,
|
|
17
|
+
// regardless of the result — or absence — of classification. A bad
|
|
18
|
+
// classification degrades REFINEMENT, never the FLOOR. This classifier MUST
|
|
19
|
+
// NEVER gate, relax, or sit on the critical path of those floors. Nothing
|
|
20
|
+
// returned here can disable a floor. Template swaps bring new *probabilistic*
|
|
21
|
+
// policies in via `shadow` first; mandatory floors are never relaxed during
|
|
22
|
+
// the transition.
|
|
23
|
+
// ──────────────────────────────────────────────────────────────────────────
|
|
24
|
+
//
|
|
25
|
+
// INVARIANTS enforced here:
|
|
26
|
+
// 1. Modèle C — inputs are anonymized fractions/flags/aux ONLY.
|
|
27
|
+
// 2. Weights + thresholds come from config (typology-weights.json), never
|
|
28
|
+
// hardcoded in the logic below.
|
|
29
|
+
// 3. No easy downgrade — moving to a LESS strict template needs a raised
|
|
30
|
+
// confidence (0.85) AND a longer window (5), per the strictness ranking.
|
|
31
|
+
// 4. global-baseline is independent of classification (see banner above).
|
|
32
|
+
|
|
33
|
+
import { readFileSync } from 'node:fs';
|
|
34
|
+
import { fileURLToPath } from 'node:url';
|
|
35
|
+
import { dirname, join } from 'node:path';
|
|
36
|
+
|
|
37
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
38
|
+
|
|
39
|
+
// The 10 archetypes (schema `classified_type` enum — keep this exact order and
// spelling). 'generic' comes last; it is the fallback type.
export const ARCHETYPES = [
  'coding',
  'devops_infra',
  'data_rag',
  'customer_facing',
  'browser_web',
  'orchestrator',
  'workflow_backoffice',
  'personal_assistant',
  'transactional_financial',
  'generic',
];
|
|
45
|
+
|
|
46
|
+
// Modifiers (schema enum). A modifier can ONLY add restrictions, so it may be
// activated immediately — no asymmetry / hysteresis is applied (spec §6).
export const MODIFIERS = [
  'autonomy',
  'untrusted_input',
  'data_sensitivity',
  'regulated',
];
|
|
49
|
+
|
|
50
|
+
// ── Config (weights + thresholds + strictness ranking) — cached after load. ──
// INVARIANT 2: no weight or threshold is hardcoded in the logic of this module;
// everything that influences the decision is read from this config object.
let _config = null;

/**
 * Read the typology config from disk, reusing the cached object when the same
 * path was loaded last.
 *
 * The parsed object is tagged with a `__path` property recording where it came
 * from; that tag is what the cache check compares against.
 *
 * @param {string} [path]  JSON file to read (defaults to typology-weights.json
 *                         next to this module).
 * @returns {object} The parsed (and cached) config object.
 */
export function loadConfig(path = join(__dirname, 'typology-weights.json')) {
  const cached = _config;
  if (cached && cached.__path === path) {
    return cached;
  }
  const parsed = JSON.parse(readFileSync(path, 'utf8'));
  parsed.__path = path;
  _config = parsed;
  return parsed;
}

/**
 * Test seam: install a config object directly instead of reading it from disk.
 * A shallow copy tagged `__path: '<injected>'` becomes the cached config.
 *
 * @param {object} cfg  Config to install.
 * @returns {object} The cached copy.
 */
export function setConfig(cfg) {
  const injected = { ...cfg, __path: '<injected>' };
  _config = injected;
  return injected;
}
|
|
63
|
+
|
|
64
|
+
// Logistic function: maps any real x into (0, 1).
const sigmoid = (x) => {
  const decay = Math.exp(-x);
  return 1 / (1 + decay);
};

// Restrict x to the closed interval [0, 1]. NaN passes through as NaN.
const clamp01 = (x) => {
  const upperBounded = Math.min(1, x);
  return Math.max(0, upperBounded);
};
|
|
66
|
+
|
|
67
|
+
// Strictness lookup used by the re-classification asymmetry. A higher rank
// means a stricter template: moving to a rank >= the current one is an
// "upgrade" (or lateral move); moving to a strictly LOWER rank is a
// "downgrade" and is gated. Types with no finite rank in the config rank 0.
function strictnessOf(cfg, type) {
  const ranks = cfg.strictness_rank || {};
  const rank = ranks[type];
  if (!Number.isFinite(rank)) return 0;
  return rank;
}
|
|
74
|
+
|
|
75
|
+
/**
 * Produce the canonical feature vector from a loosely-typed features object.
 *
 * Only keys listed in the config (`features.fractions`, `features.flags`,
 * `features.aux`) are kept. Each value is coerced with Number() and clamped to
 * [0,1] (the schema requires every feature_vector value in [0,1]). Anything
 * missing or non-finite becomes 0 — Modèle C: an absent signal means
 * "not observed", it is never inferred from content.
 *
 * @param {object} cfg       Config holding the `features` key lists.
 * @param {object} features  Raw behavioural signals (may be null/undefined).
 * @returns {object} Map of feature key → number in [0,1].
 */
function normalizeFeatures(cfg, features) {
  const declared = cfg.features;
  const keys = [
    ...(declared?.fractions || []),
    ...(declared?.flags || []),
    ...(declared?.aux || []),
  ];

  const vector = {};
  keys.forEach((key) => {
    const numeric = Number(features?.[key]);
    if (!Number.isFinite(numeric)) {
      vector[key] = 0;
    } else {
      // Clamp into [0,1].
      vector[key] = Math.min(1, Math.max(0, numeric));
    }
  });
  return vector;
}
|
|
93
|
+
|
|
94
|
+
/**
 * Weighted sum for one archetype: score(type) = Σ_i w[type][i] · feature_i
 * (spec §4). Weights that do not coerce to a usable number, and features
 * missing from the vector, contribute 0.
 *
 * @param {object} weightsForType  feature key → weight (may be undefined).
 * @param {object} fv              Normalized feature vector.
 * @returns {number} The score.
 */
function scoreType(weightsForType, fv) {
  return Object.entries(weightsForType || {}).reduce(
    (total, [feature, weight]) => total + (Number(weight) || 0) * (fv[feature] || 0),
    0,
  );
}
|
|
102
|
+
|
|
103
|
+
/**
 * Score every non-generic archetype and return them sorted best-first.
 * The caller reads top1/top2 from the first two entries.
 *
 * Ordering: higher score first. Tie-break (spec §8): on EQUAL scores the
 * STRICTER type (higher strictness_rank) comes first — the conservative
 * choice. 'generic' is left out of the ranking; it is the fallback only.
 *
 * @param {object} cfg  Config providing `weights` and `strictness_rank`.
 * @param {object} fv   Normalized feature vector.
 * @returns {Array<{type: string, score: number}>} Sorted candidates.
 */
function rankTypes(cfg, fv) {
  const candidates = [];
  for (const type of ARCHETYPES) {
    if (type === 'generic') continue;
    candidates.push({ type, score: scoreType(cfg.weights?.[type], fv) });
  }

  const bestFirst = (left, right) => {
    if (left.score === right.score) {
      // Equal scores → stricter (higher strictness_rank) first.
      return strictnessOf(cfg, right.type) - strictnessOf(cfg, left.type);
    }
    return right.score - left.score;
  };
  candidates.sort(bestFirst);
  return candidates;
}
|
|
119
|
+
|
|
120
|
+
/**
|
|
121
|
+
* classifyAgentType(features[, prior][, opts]) → object conforming EXACTLY to
|
|
122
|
+
* agent-classification.schema.json.
|
|
123
|
+
*
|
|
124
|
+
* @param {object} features Anonymized behavioural signals (Modèle C):
|
|
125
|
+
* agent_id {string} pass-through identifier (no content)
|
|
126
|
+
* f_code,f_browser,… {number} per-category FRACTIONS in [0,1]
|
|
127
|
+
* flag_deploy,… {0|1|bool} local discriminator flags (no content)
|
|
128
|
+
* aux_autonomy,… {number} aux ratios in [0,1]
|
|
129
|
+
* n_events {number} events in the current sliding window
|
|
130
|
+
* @param {object} [prior] Previous classification result (the caller
|
|
131
|
+
* threads this so the state machine + asymmetry work across windows). Reads:
|
|
132
|
+
* classified_type, stage, windows_consistent, strictness_rank,
|
|
133
|
+
* last_reclassified_at.
|
|
134
|
+
* @param {object} [opts]
|
|
135
|
+
* regulated {boolean} tenant/Fortress flag (config-driven, NOT
|
|
136
|
+
* behavioural) → adds the `regulated` modifier
|
|
137
|
+
* now {string} ISO timestamp seam for tests
|
|
138
|
+
* config {object} inject config (else loaded from disk)
|
|
139
|
+
* @returns {object} schema-conformant classification result
|
|
140
|
+
*/
|
|
141
|
+
export function classifyAgentType(features = {}, prior = null, opts = {}) {
|
|
142
|
+
const cfg = opts.config ? setConfig(opts.config) : loadConfig();
|
|
143
|
+
const th = cfg.thresholds || {};
|
|
144
|
+
const sg = cfg.confidence_sigmoid || {};
|
|
145
|
+
const now = opts.now || new Date().toISOString();
|
|
146
|
+
|
|
147
|
+
const agent_id = String(features.agent_id ?? prior?.agent_id ?? '');
|
|
148
|
+
const fv = normalizeFeatures(cfg, features);
|
|
149
|
+
// Floor + finiteness guard: the schema declares window_events as integer.
|
|
150
|
+
// Non-finite (Infinity/NaN) → 0 so it can't saturate confidence via log(n).
|
|
151
|
+
const _rawN = Number(features.n_events);
|
|
152
|
+
const nEvents = Number.isFinite(_rawN) ? Math.max(0, Math.floor(_rawN)) : 0;
|
|
153
|
+
|
|
154
|
+
// ── Score every archetype, find top1 / top2 / margin (spec §4). ──────────
|
|
155
|
+
const ranked = rankTypes(cfg, fv);
|
|
156
|
+
const top1 = ranked[0] || { type: 'generic', score: 0 };
|
|
157
|
+
const top2 = ranked[1] || { type: 'generic', score: 0 };
|
|
158
|
+
const margin = top1.score - top2.score;
|
|
159
|
+
|
|
160
|
+
// confidence = sigmoid(a·top1.score + b·margin + c·log(n_events) + bias).
|
|
161
|
+
// All three terms (top1 score, margin, log n_events) are folded in — NOT just
|
|
162
|
+
// top1.score. Coefficients a/b/c/bias come from config.
|
|
163
|
+
const logN = Math.log(Math.max(1, nEvents));
|
|
164
|
+
const confidence = clamp01(
|
|
165
|
+
sigmoid((sg.a || 0) * top1.score + (sg.b || 0) * margin + (sg.c || 0) * logN + (sg.bias || 0)),
|
|
166
|
+
);
|
|
167
|
+
|
|
168
|
+
// ── Candidate type per the gates (spec §4). ──────────────────────────────
|
|
169
|
+
// n_events < MIN_EVENTS → generic (cold-start)
|
|
170
|
+
// OR confidence < CONF_THRESHOLD → generic
|
|
171
|
+
// OR margin < MARGIN_MIN → generic
|
|
172
|
+
// else → top1.type
|
|
173
|
+
let candidate;
|
|
174
|
+
const belowMinEvents = nEvents < th.n_events_min;
|
|
175
|
+
const lowConfidence = confidence < th.confidence_min;
|
|
176
|
+
const lowMargin = margin < th.margin_min;
|
|
177
|
+
|
|
178
|
+
// Conservative tie-break (spec §8): "en cas d'égalité, choisir le plus strict
|
|
179
|
+
// (conservateur)". When the top two are a near-TIE (|margin| ≤ tie_epsilon)
|
|
180
|
+
// between two REAL types and there is real signal (top1.score > 0), dropping
|
|
181
|
+
// to generic would RELAX protection — so instead we keep the STRICTER of the
|
|
182
|
+
// tied pair. rankTypes() already sorts the stricter type first on an exact
|
|
183
|
+
// tie, so top1 IS the stricter one here. This applies only on a true tie; a
|
|
184
|
+
// genuinely ambiguous low-signal window (no tie, just a small margin) still
|
|
185
|
+
// falls back to generic via the margin gate below.
|
|
186
|
+
const tieEps = th.tie_epsilon ?? 0;
|
|
187
|
+
const isTie = top1.score > 0 && top2.type !== 'generic' && Math.abs(margin) <= tieEps;
|
|
188
|
+
|
|
189
|
+
if (belowMinEvents) candidate = 'generic';
|
|
190
|
+
else if (isTie) candidate = top1.type; // stricter-wins, conservative
|
|
191
|
+
else if (lowConfidence || lowMargin) candidate = 'generic';
|
|
192
|
+
else candidate = top1.type;
|
|
193
|
+
|
|
194
|
+
// ── State machine + re-classification asymmetry (spec §5). ───────────────
|
|
195
|
+
// We accept the prior state as input so the CALLER threads it across windows;
|
|
196
|
+
// this function is otherwise pure for a given (features, prior).
|
|
197
|
+
const priorType = prior?.classified_type || 'generic';
|
|
198
|
+
const priorStage = prior?.stage || 'cold_start';
|
|
199
|
+
const priorWindows = Math.max(0, Math.floor(Number(prior?.windows_consistent) || 0));
|
|
200
|
+
const priorReclassAt = prior?.last_reclassified_at || null;
|
|
201
|
+
// Last real (non-generic) type, threaded across generic gaps. Closes the
|
|
202
|
+
// generic-laundering downgrade bypass: a strict→generic→looser-real sequence
|
|
203
|
+
// must still face the downgrade gate against the ORIGINAL strict rank.
|
|
204
|
+
const priorLastReal = prior?.last_real_type || (priorType !== 'generic' ? priorType : null);
|
|
205
|
+
// The candidate the prior window(s) were already accumulating toward (if any).
|
|
206
|
+
// The caller threads this so a pending change builds consecutive evidence
|
|
207
|
+
// across windows instead of resetting every window.
|
|
208
|
+
const priorPending = prior?.pending_type || null;
|
|
209
|
+
|
|
210
|
+
let classified_type = priorType;
|
|
211
|
+
let stage = priorStage;
|
|
212
|
+
let windows_consistent = priorWindows;
|
|
213
|
+
let last_reclassified_at = priorReclassAt;
|
|
214
|
+
// pending_type: the candidate we are accumulating consecutive windows toward
|
|
215
|
+
// but have not yet committed (hysteresis / asymmetry not satisfied). Surfaced
|
|
216
|
+
// in the result so the caller can thread it back next window.
|
|
217
|
+
let pending_type = null;
|
|
218
|
+
let pending_windows = 0;
|
|
219
|
+
|
|
220
|
+
if (belowMinEvents) {
|
|
221
|
+
// A low-traffic window must NOT collapse an established type. An adversary
|
|
222
|
+
// could throttle below MIN_EVENTS to shed a strict template (downgrade
|
|
223
|
+
// bypass). If we already hold a real type, RETAIN it (freeze the window
|
|
224
|
+
// count); only a genuinely cold agent (no prior real type) stays generic.
|
|
225
|
+
if (priorType !== 'generic') {
|
|
226
|
+
classified_type = priorType;
|
|
227
|
+
stage = priorStage;
|
|
228
|
+
windows_consistent = priorWindows;
|
|
229
|
+
} else {
|
|
230
|
+
classified_type = 'generic';
|
|
231
|
+
stage = 'cold_start';
|
|
232
|
+
windows_consistent = 0;
|
|
233
|
+
}
|
|
234
|
+
} else if (candidate === priorType) {
|
|
235
|
+
// Same type as last window → accumulate consistency (hysteresis).
|
|
236
|
+
windows_consistent = priorWindows + 1;
|
|
237
|
+
// provisional → stable after STABLE_WINDOWS consecutive consistent windows.
|
|
238
|
+
if (classified_type !== 'generic' && stage !== 'stable' &&
|
|
239
|
+
windows_consistent >= th.stable_windows) {
|
|
240
|
+
stage = 'stable';
|
|
241
|
+
} else if (classified_type !== 'generic' && stage === 'cold_start') {
|
|
242
|
+
stage = 'provisional';
|
|
243
|
+
}
|
|
244
|
+
} else {
|
|
245
|
+
// Type would CHANGE relative to the prior. Decide whether the change is
|
|
246
|
+
// allowed THIS window, or whether we must accumulate more evidence.
|
|
247
|
+
const toRank = strictnessOf(cfg, candidate);
|
|
248
|
+
// Reference rank for downgrade detection: the prior REAL type, or — across a
|
|
249
|
+
// generic gap — the last real type before we fell to generic. Using the
|
|
250
|
+
// last-real reference closes the generic-laundering bypass (strict → generic
|
|
251
|
+
// → looser-real must still face the downgrade gate against the strict rank).
|
|
252
|
+
const refType = (priorType !== 'generic') ? priorType : (priorLastReal || 'generic');
|
|
253
|
+
const refRank = strictnessOf(cfg, refType);
|
|
254
|
+
// A "downgrade" = moving to a real template STRICTLY LESS strict than the
|
|
255
|
+
// reference. Upgrading / lateral is NOT a downgrade. Falling back TO generic
|
|
256
|
+
// is handled below (never relaxes the floor — the global-baseline always applies).
|
|
257
|
+
const isDowngrade = candidate !== 'generic' && refType !== 'generic' && toRank < refRank;
|
|
258
|
+
|
|
259
|
+
// Required consecutive-consistent-window count BEFORE applying the change.
|
|
260
|
+
// Leaving generic (cold_start → provisional): the FIRST window clearing
|
|
261
|
+
// the gates commits — provisional = "1er type au-dessus du seuil" (§5).
|
|
262
|
+
// Upgrade / lateral (real → real, equal-or-stricter): normal hysteresis
|
|
263
|
+
// STABLE_WINDOWS, never relaxes the floor.
|
|
264
|
+
// Downgrade (real → LESS strict): longer DOWNGRADE_WINDOWS AND a raised
|
|
265
|
+
// confidence floor (anti mimicry-evasion: an agent must not soften its
|
|
266
|
+
// protection by imitating a more permissive type).
|
|
267
|
+
const leavingGeneric = priorType === 'generic';
|
|
268
|
+
// Leaving generic is fast (1 window) ONLY when it is not a net downgrade vs
|
|
269
|
+
// the last real type. A net downgrade — even laundered through generic —
|
|
270
|
+
// takes the full gate: longer window AND raised confidence (anti-evasion).
|
|
271
|
+
const neededWindows = isDowngrade ? th.downgrade_windows
|
|
272
|
+
: (leavingGeneric ? 1 : th.stable_windows);
|
|
273
|
+
const neededConfidence = isDowngrade ? th.downgrade_confidence_min : th.confidence_min;
|
|
274
|
+
|
|
275
|
+
// Consecutive consistent windows toward THIS candidate. If the prior window
|
|
276
|
+
// was already accumulating toward the same candidate, continue the count;
|
|
277
|
+
// otherwise this is the first window of a fresh pending change.
|
|
278
|
+
const accWindows = (priorPending === candidate)
|
|
279
|
+
? Math.max(0, Math.floor(Number(prior?.pending_windows) || 0)) + 1
|
|
280
|
+
: 1;
|
|
281
|
+
|
|
282
|
+
if (candidate === 'generic') {
|
|
283
|
+
// Falling back to generic is never a security relaxation we must gate —
|
|
284
|
+
// the global-baseline floor still applies — but we still respect
|
|
285
|
+
// hysteresis so a single noisy window can't flap us out of a real type.
|
|
286
|
+
if (priorType === 'generic') {
|
|
287
|
+
windows_consistent = priorWindows + 1;
|
|
288
|
+
classified_type = 'generic';
|
|
289
|
+
stage = 'cold_start';
|
|
290
|
+
} else {
|
|
291
|
+
// Accumulate toward dropping the type, but keep the (stricter) prior
|
|
292
|
+
// until hysteresis is satisfied — conservative.
|
|
293
|
+
if (accWindows >= th.stable_windows) {
|
|
294
|
+
classified_type = 'generic';
|
|
295
|
+
stage = 'cold_start';
|
|
296
|
+
windows_consistent = 1;
|
|
297
|
+
last_reclassified_at = now;
|
|
298
|
+
} else {
|
|
299
|
+
pending_type = 'generic';
|
|
300
|
+
pending_windows = accWindows;
|
|
301
|
+
// classified_type / stage / windows_consistent unchanged (stay put).
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
} else if (confidence >= neededConfidence && accWindows >= neededWindows) {
|
|
305
|
+
// Enough consecutive evidence (counting the current window) to commit the
|
|
306
|
+
// change. The caller threads pending_type/pending_windows so consecutive
|
|
307
|
+
// windows toward the same candidate accumulate.
|
|
308
|
+
classified_type = candidate;
|
|
309
|
+
// A freshly committed type always lands in 'provisional'; it climbs to
|
|
310
|
+
// 'stable' only after STABLE_WINDOWS consecutive same-type windows.
|
|
311
|
+
stage = 'provisional';
|
|
312
|
+
windows_consistent = 1;
|
|
313
|
+
last_reclassified_at = now;
|
|
314
|
+
} else {
|
|
315
|
+
// Not enough evidence yet → keep the prior (stricter-by-default) type and
|
|
316
|
+
// record the pending candidate so the next window can build on it. We do
|
|
317
|
+
// NOT touch windows_consistent of the committed type (it still applies).
|
|
318
|
+
pending_type = candidate;
|
|
319
|
+
pending_windows = accWindows;
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
// Stage sanity: generic is always cold_start.
|
|
324
|
+
if (classified_type === 'generic') stage = 'cold_start';
|
|
325
|
+
|
|
326
|
+
// Last real (non-generic) type — threaded so a generic gap doesn't erase the
|
|
327
|
+
// downgrade reference (see priorLastReal). Persists across generic windows.
|
|
328
|
+
const last_real_type = (classified_type !== 'generic') ? classified_type : (priorLastReal || null);
|
|
329
|
+
|
|
330
|
+
// ── Modifiers (spec §6): additive restrictions, no asymmetry/hysteresis. ──
|
|
331
|
+
const modifiers = [];
|
|
332
|
+
const autonomyLevel = String(features.autonomy_level ?? features.aux_autonomy_level ?? '');
|
|
333
|
+
const auxAutonomy = Number(features.aux_autonomy) || 0;
|
|
334
|
+
// autonomy: explicit level in {act_with_approval, autonomous}, or a high ratio.
|
|
335
|
+
if (['act_with_approval', 'autonomous'].includes(autonomyLevel) || auxAutonomy >= (th.autonomy_modifier_min ?? 0.5)) {
|
|
336
|
+
modifiers.push('autonomy');
|
|
337
|
+
}
|
|
338
|
+
if ((fv.aux_untrusted || 0) > (th.untrusted_modifier_min ?? 0.1)) {
|
|
339
|
+
modifiers.push('untrusted_input');
|
|
340
|
+
}
|
|
341
|
+
if ((fv.aux_sensitive || 0) > (th.sensitive_modifier_min ?? 0)) {
|
|
342
|
+
modifiers.push('data_sensitivity');
|
|
343
|
+
}
|
|
344
|
+
// regulated is tenant/Fortress config — NOT behavioural.
|
|
345
|
+
if (opts.regulated === true) modifiers.push('regulated');
|
|
346
|
+
|
|
347
|
+
// ── Payment overlay (spec §3/§5/§6): f_payment > 0 FORCES the transactional
|
|
348
|
+
// profile even when another base type dominates. It is an OVERLAY, not a
|
|
349
|
+
// winner-take-all reclassification: the base type stays, and we surface the
|
|
350
|
+
// overlay in evidence so the Shield layer adds the confirmation/limit
|
|
351
|
+
// policies. Reducing f_payment to flee transactional_financial is neutralized
|
|
352
|
+
// by the downgrade asymmetry + the always-on floor.
|
|
353
|
+
//
|
|
354
|
+
// It is surfaced in `evidence.payment_overlay`, NOT in `modifiers[]`: the
|
|
355
|
+
// schema's modifiers enum is fixed to {autonomy, untrusted_input,
|
|
356
|
+
// data_sensitivity, regulated} — "transactional" is not a legal modifier
|
|
357
|
+
// value, so emitting it there would violate the schema. evidence has no
|
|
358
|
+
// additionalProperties:false, so it is the schema-legal carrier for the overlay.
|
|
359
|
+
const paymentOverlay = (fv.f_payment || 0) > (th.payment_overlay_min ?? 0);
|
|
360
|
+
|
|
361
|
+
// ── Evidence (schema-shaped). ────────────────────────────────────────────
|
|
362
|
+
const evidence = {
|
|
363
|
+
window_events: nEvents,
|
|
364
|
+
top2_type: top2.type,
|
|
365
|
+
margin: Number(margin.toFixed(6)),
|
|
366
|
+
};
|
|
367
|
+
// Extra evidence keys are schema-legal (evidence has no additionalProperties:
|
|
368
|
+
// false). Surface the decision context for audit — never raw content.
|
|
369
|
+
if (paymentOverlay) {
|
|
370
|
+
evidence.payment_overlay = {
|
|
371
|
+
active: true,
|
|
372
|
+
f_payment: fv.f_payment,
|
|
373
|
+
adds: 'transactional_financial confirmation/limit policies (overlay, base type unchanged)',
|
|
374
|
+
};
|
|
375
|
+
}
|
|
376
|
+
evidence.confidence_terms = { top1_score: Number(top1.score.toFixed(6)), margin: Number(margin.toFixed(6)), log_n_events: Number(logN.toFixed(6)) };
|
|
377
|
+
|
|
378
|
+
return {
|
|
379
|
+
agent_id,
|
|
380
|
+
classified_type,
|
|
381
|
+
confidence: Number(confidence.toFixed(6)),
|
|
382
|
+
stage,
|
|
383
|
+
modifiers,
|
|
384
|
+
evidence,
|
|
385
|
+
feature_vector: fv,
|
|
386
|
+
windows_consistent,
|
|
387
|
+
strictness_rank: strictnessOf(cfg, classified_type),
|
|
388
|
+
...(last_reclassified_at ? { last_reclassified_at } : {}),
|
|
389
|
+
// Hysteresis carry-over (schema-legal extras: root has no
|
|
390
|
+
// additionalProperties:false). The caller threads these back as part of the
|
|
391
|
+
// `prior` next window so a pending change accumulates consecutive evidence,
|
|
392
|
+
// and so the downgrade reference survives a generic gap (anti-evasion).
|
|
393
|
+
...(pending_type ? { pending_type, pending_windows } : {}),
|
|
394
|
+
...(last_real_type ? { last_real_type } : {}),
|
|
395
|
+
};
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
// Expose classifyAgentType as this module's default export.
export default classifyAgentType;
|