watchmyagents 0.8.2 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -105,14 +105,15 @@ Each entry carries: `id`, `agent_id`, `framework`, `timestamp`, `action_type`, `
105
105
  ### `wma-fetch` — pull events from Anthropic Managed Agents
106
106
 
107
107
  ```bash
108
- wma-fetch --agent-id <agent_id> [--session-id <sess_id>] [--since 1h]
108
+ wma-fetch (--agent-id <agent_id> | --all-agents) [--session-id <sess_id>] [--since 1h]
109
109
  [--log-dir ./watchmyagents-logs] [--dump-raw]
110
110
  [--watch [--interval 5m] [--upload]]
111
111
  ```
112
112
 
113
113
  | Flag | Effect |
114
114
  |---|---|
115
- | `--agent-id agent_xxx` | Required — Anthropic agent identifier |
115
+ | `--agent-id agent_xxx` | Anthropic agent identifier (required unless `--all-agents`) |
116
+ | `--all-agents` | **Fleet mode** (requires `--watch`) — discover ALL agents under the key and watch them in a single process |
116
117
  | `--since 1h` / `24h` / `7d` | Fetch window (default: all) |
117
118
  | `--session-id sesn_xxx` | Limit to a single session |
118
119
  | `--log-dir ./logs` | Where to write NDJSON (default `./watchmyagents-logs`) |
@@ -167,6 +168,21 @@ wma-inspect [path]
167
168
 
168
169
  Outputs sections aligned with security audit needs: tokens summary, by-tool / by-action-type breakdowns, top tool destinations (URLs / queries), action-sequence transitions, tool error rates, p50/p95/max latency per tool, rate metrics.
169
170
 
171
+ ### `wma-agents` — discover + classify your agents (typology)
172
+
173
+ Lists every Managed Agent under your key and classifies each one's **typology**
174
+ (one of 10 Guardian Core archetypes) from its OBSERVED behaviour in your local
175
+ logs — which drives the cold-start Shield template. Modèle C: reads local logs
176
+ only (tool-category fractions, never raw content) and transmits nothing.
177
+
178
+ ```bash
179
+ wma-agents list [--log-dir ~/.watchmyagents/logs] [--json]
180
+ ```
181
+
182
+ With fewer than ~50 observed events an agent stays `generic` (cold start) and
183
+ refines as activity accumulates. Re-classification to a *less strict* type is
184
+ gated (raised confidence + longer window) to resist mimicry-evasion.
185
+
170
186
  ## Automating — continuous monitoring
171
187
 
172
188
  ### `wma-service` — install as an always-on service (recommended)
@@ -180,7 +196,7 @@ export WMA_API_KEY="wma_..."
180
196
  export WMA_FORTRESS_BASE_URL="https://<project>.supabase.co/functions/v1"
181
197
  export WMA_SIGNALS_SALT="..." # stable per-customer salt
182
198
 
183
- wma-service install --agent-id agent_01ABC... --interval 5m [--with-shield]
199
+ wma-service install (--agent-id agent_01ABC... | --all-agents) [--interval 5m] [--with-shield]
184
200
  wma-service status
185
201
  wma-service uninstall [--with-shield]
186
202
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "watchmyagents",
3
- "version": "0.8.2",
3
+ "version": "0.9.0",
4
4
  "description": "Security observability + real-time policy enforcement for AI agents. Local-first NDJSON capture with a continuous Watch daemon that auto-uploads anonymized signals, Shield CLI that blocks policy violations live (with policies pulled from Fortress cloud), anonymizer producing signals-only payloads, bidirectional sync with WatchMyAgents Fortress, and one-command install as an always-on launchd/systemd service — closing the recursive Watch→Guardian→Shield security loop.",
5
5
  "type": "module",
6
6
  "files": [
@@ -11,6 +11,7 @@
11
11
  "scripts/anonymize.js",
12
12
  "scripts/upload-fortress.js",
13
13
  "scripts/service.js",
14
+ "scripts/agents.js",
14
15
  "README.md",
15
16
  "SECURITY.md",
16
17
  "LICENSE"
@@ -21,15 +22,18 @@
21
22
  "wma-shield": "scripts/shield.js",
22
23
  "wma-anonymize": "scripts/anonymize.js",
23
24
  "wma-upload-fortress": "scripts/upload-fortress.js",
24
- "wma-service": "scripts/service.js"
25
+ "wma-service": "scripts/service.js",
26
+ "wma-agents": "scripts/agents.js"
25
27
  },
26
28
  "scripts": {
29
+ "test": "node --test",
27
30
  "inspect": "node scripts/inspect.js",
28
31
  "fetch": "node scripts/fetch-anthropic.js",
29
32
  "shield": "node scripts/shield.js",
30
33
  "anonymize": "node scripts/anonymize.js",
31
34
  "upload-fortress": "node scripts/upload-fortress.js",
32
- "service": "node scripts/service.js"
35
+ "service": "node scripts/service.js",
36
+ "agents": "node scripts/agents.js"
33
37
  },
34
38
  "engines": {
35
39
  "node": ">=18.0.0"
@@ -0,0 +1,218 @@
1
+ #!/usr/bin/env node
2
+ // wma-agents — discover all Managed Agents under your key and classify each
3
+ // agent's typology from its OBSERVED behaviour (for Shield template selection).
4
+ //
5
+ // Usage:
6
+ // wma-agents [list] [--log-dir ~/.watchmyagents/logs] [--json]
7
+ //
8
+ // Reads the local Watch logs (NEVER leaves the machine — Modèle C) and derives
9
+ // the anonymized behavioural FEATURE VECTOR per the typology spec:
10
+ // per-tool-category FRACTIONS (f_*), boolean local flags (flag_*), aux ratios
11
+ // (aux_*), and n_events. It then calls classifyAgentType() and prints the
12
+ // schema-conformant result. With <50 events an agent is `generic` (cold start)
13
+ // and refines as activity accumulates.
14
+ //
15
+ // Modèle C invariant: only counts/ratios/flags are computed here — never raw
16
+ // prompt/output content, never the agent display name. Nothing is transmitted.
17
+ //
18
+ // ANTHROPIC_API_KEY from env (or --api-key, discouraged).
19
+
20
+ import os from 'node:os';
21
+ import { readdir, stat } from 'node:fs/promises';
22
+ import { createReadStream } from 'node:fs';
23
+ import { createInterface } from 'node:readline';
24
+ import { join, resolve } from 'node:path';
25
+ import { listAgents } from '../src/sources/anthropic-managed.js';
26
+ import { classifyAgentType } from '../src/typology.js';
27
+ import { isValidAgentId, assertSafePathSegment } from '../src/validate.js';
28
+
29
/**
 * Minimal argv parser.
 *
 * `--key value` stores the string value, a `--key` that is last or is followed
 * by another `--flag` stores `true`, and every other token is collected, in
 * order, under `_`.
 *
 * @param {string[]} argv - arguments without the node/script prefix
 * @returns {Object} parsed flags plus the positional list in `_`
 */
function parseArgs(argv) {
  const parsed = { _: [] };
  let idx = 0;
  while (idx < argv.length) {
    const token = argv[idx];
    idx += 1;
    if (!token.startsWith('--')) {
      parsed._.push(token);
      continue;
    }
    const key = token.slice(2);
    const next = argv[idx];
    const hasValue = next != null && !next.startsWith('--');
    parsed[key] = hasValue ? next : true;
    if (hasValue) idx += 1; // the value token has been consumed
  }
  return parsed;
}
40
/**
 * Print `error: <msg>` to stderr and terminate the process.
 *
 * @param {string} msg - text shown after the "error: " prefix
 * @param {number} [code=1] - process exit code
 */
function die(msg, code = 1) {
  const line = `error: ${msg}\n`;
  process.stderr.write(line);
  process.exit(code);
}
41
/**
 * Write one `[wma-agents]`-prefixed line to stdout.
 *
 * @param {string} msg - message text (a newline is appended)
 */
function info(msg) {
  const line = `[wma-agents] ${msg}\n`;
  process.stdout.write(line);
}
42
+
43
// action_type values counted as a tool invocation. Events of these types form
// the denominator of the per-category f_* tool fractions.
const TOOL_ACTIONS = new Set([
  'tool_use',
  'mcp_tool_use',
  'custom_tool_use',
]);
46
+
47
// ---------------------------------------------------------------------------
// Tool-name → category mapping. Classification is by tool NAME only (never by
// content). Rules are tried top to bottom against the lower-cased tool name and
// the first match wins; a name that matches no rule has no category (it still
// counts towards the tool-use denominator in aggregate()).
// ---------------------------------------------------------------------------

// Build one [category, matcher] rule from a regular expression.
const categoryRule = (category, pattern) => [category, (n) => pattern.test(n)];

const CATEGORY_RULES = [
  categoryRule('search', /(^|_)web_search$|(^|_)search($|_)|google|brave/),
  categoryRule('browser', /web_fetch|browser|playwright|puppeteer|navigate|screenshot/),
  categoryRule('http', /(^|_)http|fetch_url|curl|request|webhook|api_call/),
  categoryRule('code', /bash|shell|terminal|code_exec|exec_|python|node_run|run_code|interpreter/),
  categoryRule('database', /sql|query_db|database|postgres|mysql|mongo|redis|bigquery|snowflake/),
  categoryRule('email', /email|gmail|smtp|sendmail|mailgun|outlook/),
  categoryRule('payment', /payment|charge|transfer|invoice|stripe|paypal|payout|refund|checkout/),
  categoryRule('secret', /secret|vault|credential|kms|keychain|token_get/),
  categoryRule('memory', /memory|retriev|vector|(^|_)rag($|_)|knowledge|embed|pinecone|chroma/),
  categoryRule('file', /editor|str_replace|read_file|write_file|create_file|file_io|(^|_)file($|_)|fs_/),
];

// Best-effort detection of deploy-style tool names (feeds flag_deploy).
const DEPLOY_RE = /deploy|terraform|kubectl|helm|(^|_)release($|_)|ansible|pulumi|cloudformation/;

/**
 * Map a tool name to its category.
 *
 * @param {*} toolName - tool name from a log entry; falsy values are treated as ''
 * @returns {string|null} the first matching category, or null when nothing matches
 */
function categoryOf(toolName) {
  const lowered = String(toolName || '').toLowerCase();
  const hit = CATEGORY_RULES.find(([, matches]) => matches(lowered));
  return hit ? hit[0] : null;
}
76
+
77
/**
 * Aggregate raw counts from one agent's local NDJSON logs (counts only — no
 * entry content is retained).
 *
 * Reads every `*.ndjson` file in `<logDir>/<agentId>` except those whose name
 * starts with `raw-`. Blank lines, lines that are not valid JSON, and lines
 * that parse to something other than an object are skipped.
 *
 * @param {string} logDir - root log directory
 * @param {string} agentId - agent id, used as the sub-directory name
 * @returns {Promise<{actionCounts: Object, categoryCounts: Object,
 *   toolEvents: number, deployUses: number, hasLogs: boolean}>}
 *   `hasLogs` is false when the directory is missing, unreadable, or holds no
 *   matching file.
 */
async function aggregate(logDir, agentId) {
  // action_type comes from log content. A null-prototype map keeps a value such
  // as "constructor" from colliding with Object.prototype and turning the
  // count into a string.
  const actionCounts = Object.create(null); // action_type → count
  const categoryCounts = {}; // tool category → count (keys come from categoryOf)
  let toolEvents = 0; // denominator for f_* fractions
  let deployUses = 0;
  const snapshot = (hasLogs) => ({ actionCounts, categoryCounts, toolEvents, deployUses, hasLogs });

  const dir = join(logDir, agentId);
  const s = await stat(dir).catch(() => null);
  if (!s || !s.isDirectory()) return snapshot(false);
  let names;
  try { names = await readdir(dir); } catch { return snapshot(false); }
  const files = names.filter((n) => n.endsWith('.ndjson') && !n.startsWith('raw-'));
  if (files.length === 0) return snapshot(false);

  for (const f of files) {
    await new Promise((res) => {
      const rl = createInterface({ input: createReadStream(join(dir, f), { encoding: 'utf8' }), crlfDelay: Infinity });
      rl.on('line', (line) => {
        if (!line.trim()) return;
        let e;
        try { e = JSON.parse(line); } catch { return; }
        // `null` and scalars are valid JSON but not log entries; reading a
        // property of null here would throw inside the event callback.
        if (e === null || typeof e !== 'object') return;
        const actionType = e.action_type;
        if (typeof actionType !== 'string' || actionType === '') return;
        actionCounts[actionType] = (actionCounts[actionType] || 0) + 1;
        if (!TOOL_ACTIONS.has(actionType)) return;
        toolEvents += 1;
        const cat = categoryOf(e.tool_name);
        if (cat) categoryCounts[cat] = (categoryCounts[cat] || 0) + 1;
        if (DEPLOY_RE.test(String(e.tool_name || '').toLowerCase())) deployUses += 1;
      });
      // Best-effort: a file that fails mid-read contributes what was read so far.
      rl.on('close', res);
      rl.on('error', res);
    });
  }
  return snapshot(true);
}
110
+
111
// Feature names reported to the user as not reliably derivable from the local
// NDJSON logs. main() prints this list in its one-line note.
const NON_DERIVABLE = [
  'f_database',
  'f_email',
  'f_payment',
  'f_secret',
  'f_memory',
  'flag_internal_sys',
  'flag_on_behalf',
  'aux_untrusted',
  'aux_sensitive',
];
118
+
119
/**
 * Turn aggregated counts into the feature vector passed to the classifier.
 *
 * - `f_<category>`: category count / toolEvents (0 when there are no tool events)
 * - `f_handoff`, `f_user_msg`: share of ALL events having the given action types
 * - `flag_deploy`: 1 when at least one deploy-style tool use was counted
 * - `aux_autonomy`: 1 − share of user_message / user_interrupt /
 *   tool_confirmation events, floored at 0 (0 when there are no events)
 * - `flag_internal_sys`, `flag_on_behalf`, `aux_untrusted`, `aux_sensitive`: always 0
 * - `n_events`: sum of all action-type counts
 *
 * @param {{actionCounts: Object, categoryCounts: Object, toolEvents: number,
 *   deployUses: number}} agg - output of aggregate()
 * @returns {Object} feature vector
 */
function buildFeatures(agg) {
  const { actionCounts, categoryCounts, toolEvents, deployUses } = agg;

  let nEvents = 0;
  for (const count of Object.values(actionCounts)) nEvents += count;

  // Share of tool uses that fall in one category.
  const toolShare = (category) => {
    if (!(toolEvents > 0)) return 0;
    return (categoryCounts[category] || 0) / toolEvents;
  };

  // Share of all events whose action_type is one of `types`.
  const eventShare = (...types) => {
    if (!(nEvents > 0)) return 0;
    let total = 0;
    for (const type of types) total += actionCounts[type] || 0;
    return total / nEvents;
  };

  const toolCategories = [
    'code', 'browser', 'database', 'http', 'email',
    'payment', 'secret', 'search', 'memory', 'file',
  ];
  const toolFractions = Object.fromEntries(
    toolCategories.map((category) => [`f_${category}`, toolShare(category)]),
  );

  // Human-in-the-loop share drives the autonomy heuristic.
  const humanShare = eventShare('user_message', 'user_interrupt', 'tool_confirmation');
  let autonomy = 0;
  if (nEvents > 0) autonomy = Math.max(0, 1 - humanShare);

  return {
    // tool-category fractions (over tool uses)
    ...toolFractions,
    // event-type fractions (over all events)
    f_handoff: eventShare('thread_message_sent', 'thread_message_received', 'thread_created'),
    f_user_msg: eventShare('user_message'),
    // discriminator flags — only flag_deploy is computed from the logs
    flag_deploy: deployUses > 0 ? 1 : 0,
    flag_internal_sys: 0,
    flag_on_behalf: 0,
    // aux ratios
    aux_autonomy: autonomy,
    aux_untrusted: 0,
    aux_sensitive: 0,
    // window size
    n_events: nEvents,
  };
}
168
+
169
// Remove C0/C1 control characters and DEL from a customer-set agent name before
// it is echoed to the terminal, so a name cannot inject escape sequences or
// extra lines. Used for the human-readable listing only; --json output keeps the
// raw name (JSON.stringify escapes control characters itself).
function printableName(name) {
  return [...String(name ?? '')]
    .filter((ch) => {
      const cp = ch.codePointAt(0);
      return cp >= 32 && !(cp >= 127 && cp <= 159);
    })
    .join('');
}

/**
 * CLI entry point: list the agents visible to the API key, classify each one
 * from its local logs, and print the result (JSON with --json, otherwise a
 * human-readable listing).
 *
 * Exits through die() on an unknown command, a value-less --api-key/--log-dir,
 * a missing API key, or a failed agent listing.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));
  const [command] = args._;
  if (command && command !== 'list') die(`unknown command "${command}" (only "list" supported)`);

  // parseArgs stores `true` for a flag given without a value; reject that here
  // instead of passing a boolean on to resolve() / listAgents().
  for (const flag of ['api-key', 'log-dir']) {
    if (args[flag] === true) die(`--${flag} requires a value`);
  }

  const apiKey = args['api-key'] || process.env.ANTHROPIC_API_KEY;
  if (!apiKey) die('--api-key or ANTHROPIC_API_KEY required');
  if (args['api-key']) process.stderr.write('[wma-agents] WARNING: --api-key is visible in shell history; prefer ANTHROPIC_API_KEY env\n');
  const logDir = resolve(args['log-dir'] || join(os.homedir(), '.watchmyagents', 'logs'));
  const asJson = !!args.json;

  let agents;
  try { agents = await listAgents(apiKey); }
  catch (e) { die(`failed to list agents: ${e.message}`); }

  const results = [];
  for (const a of agents) {
    if (!a?.id || !isValidAgentId(a.id)) continue;
    // The id becomes a directory name under logDir.
    assertSafePathSegment(a.id, 'agent id');
    const agg = await aggregate(logDir, a.id);
    const features = buildFeatures(agg);
    features.agent_id = a.id;
    // Single-shot snapshot: no prior classification state is passed in.
    const cls = classifyAgentType(features);
    results.push({
      id: a.id,
      name: a.name || '(unnamed)', // shown for the human only — NOT a classification signal
      hasLogs: agg.hasLogs,
      ...cls,
    });
  }

  if (asJson) { process.stdout.write(JSON.stringify(results, null, 2) + '\n'); return; }

  info(`discovered ${results.length} agent(s) - classified from local logs in ${logDir}`);
  info(`Modele C: features below default to 0 (logs don't expose them): ${NON_DERIVABLE.join(', ')}`);
  process.stdout.write('\n');
  for (const r of results) {
    const mods = (r.modifiers && r.modifiers.length) ? ` [+${r.modifiers.join(',')}]` : '';
    const overlay = r.evidence?.payment_overlay ? ' (+transactional overlay)' : '';
    process.stdout.write(` ${printableName(r.name) || '(unnamed)'}\n`);
    process.stdout.write(` ${r.id}\n`);
    process.stdout.write(` -> ${r.classified_type} (conf ${Math.round(r.confidence * 100)}%, ${r.stage})${mods}${overlay}\n`);
    // evidence is optional above, so treat it as optional here as well.
    if (r.evidence) {
      process.stdout.write(` evidence: ${r.evidence.window_events} events, top2=${r.evidence.top2_type}, margin=${r.evidence.margin}\n`);
    }
    if (!r.hasLogs) process.stdout.write(' (no local logs yet - cold start)\n');
    process.stdout.write('\n');
  }
  info('type drives the cold-start Shield template (Guardian Core §8). The global-baseline floor applies regardless of classification.');
}

main().catch((e) => { process.stderr.write(`error: ${e.stack || e.message}\n`); process.exit(1); });
@@ -31,7 +31,7 @@ import { SignalsAggregator } from '../src/anonymizer.js';
31
31
  import { resolveFortressBase, fortressEndpoint } from '../src/fortress/url.js';
32
32
  import { isValidAgentId, isValidSessionId, assertSafePathSegment } from '../src/validate.js';
33
33
  import {
34
- getAgent, listSessions, fetchSessionEntries, fetchRawEvents,
34
+ getAgent, listAgents, listSessions, fetchSessionEntries, fetchRawEvents,
35
35
  } from '../src/sources/anthropic-managed.js';
36
36
 
37
37
  function parseArgs(argv) {
@@ -70,6 +70,9 @@ function parseSince(s) {
70
70
/**
 * Print `msg` to stderr and terminate the process.
 *
 * @param {string} msg - full message (callers add their own "error:" prefix)
 * @param {number} [code=1] - process exit code
 */
function die(msg, code = 1) {
  const line = `${msg}\n`;
  process.stderr.write(line);
  process.exit(code);
}
71
71
/**
 * Write one `[wma-fetch]`-prefixed line to stdout.
 *
 * @param {string} msg - message text (a newline is appended)
 */
function info(msg) {
  const line = `[wma-fetch] ${msg}\n`;
  process.stdout.write(line);
}
72
72
/**
 * Write one `[wma-fetch]`-prefixed warning line to stderr.
 *
 * @param {string} msg - message text (a newline is appended)
 */
function warn(msg) {
  const line = `[wma-fetch] ⚠️ ${msg}\n`;
  process.stderr.write(line);
}
73
/**
 * Sanitize a customer-set agent name before it goes into a log line or the
 * Fortress display_name (defense-in-depth against log/payload injection).
 *
 * Removes C0 controls, DEL, C1 controls (U+0080–U+009F) and the Unicode
 * line/paragraph separators, then truncates to at most 60 UTF-16 code units
 * and trims surrounding whitespace. Truncation works on whole code points, so
 * a surrogate pair is never cut in half.
 *
 * @param {*} s - raw label; null/undefined become ''
 * @returns {string} sanitized label, at most 60 UTF-16 code units long
 */
function cleanLabel(s) {
  const MAX_UNITS = 60;
  let out = '';
  for (const ch of String(s ?? '')) { // iterates by code point
    const cp = ch.codePointAt(0);
    const isControl = cp < 32 || (cp >= 127 && cp <= 159) || cp === 0x2028 || cp === 0x2029;
    if (isControl) continue;
    // Stop before a character that would overflow the limit rather than
    // slicing through it.
    if (out.length + ch.length > MAX_UNITS) break;
    out += ch;
  }
  return out.trim();
}
73
76
 
74
77
  function resolveModel(agent) {
75
78
  const raw = agent.model || agent.config?.model || null;
@@ -198,10 +201,14 @@ async function fetchOneShot({ apiKey, agentId, model, logDir, since, sessionId,
198
201
  process.stdout.write(`[wma-fetch] inspect with: npx wma-inspect ${logDir}\n`);
199
202
  }
200
203
 
201
- // ── CONTINUOUS / DAEMON ─────────────────────────────────────────────────────
202
- async function runWatch({ apiKey, agentId, model, displayName, logDir, intervalMs, uploadCtx }) {
203
- const seenIds = await preloadSeenIds(logDir, agentId);
204
- const loggers = new Map(); // sessionId Logger (persists sequence across cycles)
204
+ // ── CONTINUOUS / DAEMON (single agent or whole fleet) ───────────────────────
205
+ // `agents` = [{ agentId, model, displayName }]. One process watches them all.
206
+ async function runWatch({ apiKey, agents, logDir, intervalMs, uploadCtx }) {
207
+ const seenIds = new Set(); // stable Anthropic event ids already captured
208
+ for (const ag of agents) {
209
+ for (const id of await preloadSeenIds(logDir, ag.agentId)) seenIds.add(id);
210
+ }
211
+ const loggers = new Map(); // sessionId → Logger (session ids are globally unique)
205
212
  const ended = new Set(); // sessions we've already closed with session_end
206
213
 
207
214
  const ac = new AbortController();
@@ -209,56 +216,62 @@ async function runWatch({ apiKey, agentId, model, displayName, logDir, intervalM
209
216
  process.on('SIGINT', shutdown);
210
217
  process.on('SIGTERM', shutdown);
211
218
 
212
- info(`watch mode interval ${Math.round(intervalMs / 1000)}s, upload ${uploadCtx ? 'ON' : 'OFF'}, ${seenIds.size} known events preloaded`);
219
+ const fleet = agents.length > 1;
220
+ info(`watch mode — ${agents.length} agent(s), interval ${Math.round(intervalMs / 1000)}s, upload ${uploadCtx ? 'ON' : 'OFF'}, ${seenIds.size} known events preloaded`);
213
221
 
214
222
  while (!ac.signal.aborted) {
215
223
  const since = new Date(Date.now() - 24 * 3600 * 1000);
216
- let sessions = [];
217
- try { sessions = await listSessions(apiKey, { agentId, since }); }
218
- catch (e) { warn(`listSessions failed: ${e.message}`); }
219
-
220
224
  let cycleNew = 0;
221
- for (const s of sessions) {
222
- if (!s.id || ended.has(s.id)) continue;
223
- let logger = loggers.get(s.id);
224
- if (!logger) { logger = new Logger({ logDir, agentId, sessionId: s.id, silent: true }); loggers.set(s.id, logger); }
225
225
 
226
- const fresh = [];
227
- let sawTerminated = false;
228
- try {
229
- for await (const entry of fetchSessionEntries({ apiKey, agentId, sessionId: s.id, model })) {
230
- if (entry.id && seenIds.has(entry.id)) continue;
231
- if (entry.id) seenIds.add(entry.id);
232
- const written = await logger.write(entry);
233
- fresh.push(written);
234
- if (entry.action_type === 'state_transition'
235
- && entry.output?.scope === 'session'
236
- && entry.output?.state === 'terminated') sawTerminated = true;
237
- }
238
- } catch (e) { warn(`session ${s.id}: fetch failed: ${e.message}`); continue; }
226
+ for (const ag of agents) {
227
+ if (ac.signal.aborted) break;
228
+ const tag = fleet ? `[${ag.displayName}] ` : '';
229
+ let sessions = [];
230
+ try { sessions = await listSessions(apiKey, { agentId: ag.agentId, since }); }
231
+ catch (e) { warn(`${tag}listSessions failed: ${e.message}`); continue; }
239
232
 
240
- if (fresh.length === 0) continue;
241
- cycleNew += fresh.length;
242
- info(`session ${s.id.slice(0, 16)}…: +${fresh.length} new event(s)`);
233
+ for (const s of sessions) {
234
+ if (!s.id || ended.has(s.id)) continue;
235
+ let logger = loggers.get(s.id);
236
+ if (!logger) { logger = new Logger({ logDir, agentId: ag.agentId, sessionId: s.id, silent: true }); loggers.set(s.id, logger); }
243
237
 
244
- if (uploadCtx) {
238
+ const fresh = [];
239
+ let sawTerminated = false;
245
240
  try {
246
- const resp = await uploadSignals(uploadCtx, agentId, displayName, fresh);
247
- if (resp?.signal_id) info(` ↑ signals uploaded (signal_id ${resp.signal_id})`);
248
- } catch (e) { warn(` signals upload failed: ${e.message}`); }
249
- }
241
+ for await (const entry of fetchSessionEntries({ apiKey, agentId: ag.agentId, sessionId: s.id, model: ag.model })) {
242
+ if (entry.id && seenIds.has(entry.id)) continue;
243
+ if (entry.id) seenIds.add(entry.id);
244
+ const written = await logger.write(entry);
245
+ fresh.push(written);
246
+ if (entry.action_type === 'state_transition'
247
+ && entry.output?.scope === 'session'
248
+ && entry.output?.state === 'terminated') sawTerminated = true;
249
+ }
250
+ } catch (e) { warn(`${tag}session ${s.id.slice(0, 16)}…: fetch failed: ${e.message}`); continue; }
251
+
252
+ if (fresh.length === 0) continue;
253
+ cycleNew += fresh.length;
254
+ info(`${tag}session ${s.id.slice(0, 16)}…: +${fresh.length} new event(s)`);
250
255
 
251
- if (sawTerminated) {
252
- const tracker = new TokenTracker();
253
- for (const e of fresh) tracker.record(e);
254
- const stats = tracker.stats().total;
255
- await logger.write({
256
- action_type: 'session_end', framework: 'anthropic-managed', status: 'ok', model,
257
- session_tokens: { input: stats.input, output: stats.output, cache_read: stats.cache_read, cache_creation: stats.cache_creation, total: stats.sum },
258
- session_cost_usd: stats.cost_usd || null,
259
- });
260
- ended.add(s.id);
261
- info(`session ${s.id.slice(0, 16)}… terminated — closed`);
256
+ if (uploadCtx) {
257
+ try {
258
+ const resp = await uploadSignals(uploadCtx, ag.agentId, ag.displayName, fresh);
259
+ if (resp?.signal_id) info(` ↑ signals uploaded (signal_id ${resp.signal_id})`);
260
+ } catch (e) { warn(` signals upload failed: ${e.message}`); }
261
+ }
262
+
263
+ if (sawTerminated) {
264
+ const tracker = new TokenTracker();
265
+ for (const e of fresh) tracker.record(e);
266
+ const stats = tracker.stats().total;
267
+ await logger.write({
268
+ action_type: 'session_end', framework: 'anthropic-managed', status: 'ok', model: ag.model,
269
+ session_tokens: { input: stats.input, output: stats.output, cache_read: stats.cache_read, cache_creation: stats.cache_creation, total: stats.sum },
270
+ session_cost_usd: stats.cost_usd || null,
271
+ });
272
+ ended.add(s.id);
273
+ info(`${tag}session ${s.id.slice(0, 16)}… terminated — closed`);
274
+ }
262
275
  }
263
276
  }
264
277
 
@@ -275,10 +288,12 @@ async function main() {
275
288
  const logDir = resolve(args['log-dir'] || './watchmyagents-logs');
276
289
  const watch = !!args.watch;
277
290
  const upload = !!args.upload;
291
+ const allAgents = !!args['all-agents'];
278
292
 
279
293
  if (!apiKey) die('error: --api-key or ANTHROPIC_API_KEY required');
280
- if (!agentId) die('error: --agent-id required (e.g. agent_01ABC...)');
281
- if (!isValidAgentId(agentId)) {
294
+ if (!allAgents && !agentId) die('error: --agent-id required (or --all-agents for fleet mode)');
295
+ if (allAgents && !watch) die('error: --all-agents requires --watch (fleet daemon). For a one-shot, target a single --agent-id.');
296
+ if (agentId && !isValidAgentId(agentId)) {
282
297
  die(`error: --agent-id has invalid format (expected "agent_" + alphanumeric, got "${agentId}")`);
283
298
  }
284
299
  const sessionIdArg = args['session-id'];
@@ -303,18 +318,30 @@ async function main() {
303
318
  uploadCtx = { apiKey: wmaKey, salt, url: fortressEndpoint(base, 'ingest-signals') };
304
319
  }
305
320
 
306
- info(`resolving agent ${agentId}…`);
307
- const agent = await getAgent(apiKey, agentId).catch((e) => die(`failed to GET agent: ${e.message}`));
308
- const model = resolveModel(agent);
309
- const displayName = agent.name || agentId;
310
- info(`model: ${model || '(unknown)'}`);
321
+ // Resolve the agent list: the whole fleet (--all-agents) or a single agent.
322
+ let agents;
323
+ if (allAgents) {
324
+ info('discovering agents (fleet mode)…');
325
+ const all = await listAgents(apiKey).catch((e) => die(`failed to list agents: ${e.message}`));
326
+ agents = all
327
+ .filter((a) => a.id && isValidAgentId(a.id))
328
+ .map((a) => ({ agentId: a.id, model: resolveModel(a), displayName: cleanLabel(a.name || a.id) }));
329
+ if (agents.length === 0) die('error: no agents found under this API key');
330
+ info(`fleet: ${agents.length} agent(s) — ${agents.map((a) => a.displayName).join(', ')}`);
331
+ } else {
332
+ info(`resolving agent ${agentId}…`);
333
+ const agent = await getAgent(apiKey, agentId).catch((e) => die(`failed to GET agent: ${e.message}`));
334
+ agents = [{ agentId, model: resolveModel(agent), displayName: cleanLabel(agent.name || agentId) }];
335
+ info(`model: ${agents[0].model || '(unknown)'}`);
336
+ }
311
337
 
312
338
  if (watch) {
313
339
  const intervalMs = parseDurationMs(args.interval, 5 * 60_000);
314
- await runWatch({ apiKey, agentId, model, displayName, logDir, intervalMs, uploadCtx });
340
+ await runWatch({ apiKey, agents, logDir, intervalMs, uploadCtx });
315
341
  } else {
316
342
  const since = args.since ? parseSince(args.since) : null;
317
- await fetchOneShot({ apiKey, agentId, model, logDir, since, sessionId: args['session-id'], dumpRaw: !!args['dump-raw'] });
343
+ const a = agents[0];
344
+ await fetchOneShot({ apiKey, agentId: a.agentId, model: a.model, logDir, since, sessionId: args['session-id'], dumpRaw: !!args['dump-raw'] });
318
345
  }
319
346
  }
320
347
 
@@ -251,9 +251,10 @@ function linuxUninstallOne(label) {
251
251
 
252
252
  // ── Commands ────────────────────────────────────────────────────────────--
253
253
  function cmdInstall(args) {
254
+ const allAgents = !!args['all-agents'];
254
255
  const agentId = args['agent-id'];
255
- if (!agentId) die('--agent-id required (e.g. agent_01ABC...)');
256
- if (!isValidAgentId(agentId)) die(`--agent-id invalid format (expected "agent_" + alphanumeric, got "${agentId}")`);
256
+ if (!allAgents && !agentId) die('--agent-id required (or --all-agents to cover the whole fleet)');
257
+ if (agentId && !isValidAgentId(agentId)) die(`--agent-id invalid format (expected "agent_" + alphanumeric, got "${agentId}")`);
257
258
  const interval = args.interval || '5m';
258
259
  if (!/^\d+[smhd]$/.test(interval)) die(`--interval invalid format (expected like 30s, 5m, 1h, 2d; got "${interval}")`);
259
260
  const logDir = args['log-dir'] || LOG_DIR_DEFAULT;
@@ -262,14 +263,15 @@ function cmdInstall(args) {
262
263
  if (PLATFORM !== 'darwin' && PLATFORM !== 'linux') {
263
264
  die(`unsupported platform "${PLATFORM}". Supported: macOS (launchd), Linux (systemd).\n` +
264
265
  ' Run the daemon manually or wrap it in your own process manager:\n' +
265
- ` wma-fetch --agent-id ${agentId} --watch --upload --interval ${interval}`);
266
+ ` wma-fetch ${allAgents ? '--all-agents' : `--agent-id ${agentId}`} --watch --upload --interval ${interval}`);
266
267
  }
267
268
 
268
269
  mkdirSync(logDir, { recursive: true, mode: 0o700 });
269
270
  writeEnvFile();
270
271
 
271
- const watchArgs = ['--agent-id', agentId, '--watch', '--upload', '--interval', interval, '--log-dir', logDir];
272
- const shieldArgs = ['--agent-id', agentId, '--policies-source', 'fortress', '--log-dir', logDir];
272
+ const target = allAgents ? ['--all-agents'] : ['--agent-id', agentId];
273
+ const watchArgs = [...target, '--watch', '--upload', '--interval', interval, '--log-dir', logDir];
274
+ const shieldArgs = [...target, '--policies-source', 'fortress', '--log-dir', logDir];
273
275
 
274
276
  if (PLATFORM === 'darwin') {
275
277
  macInstallOne(WATCH_LABEL, FETCH_SCRIPT, watchArgs);
package/scripts/shield.js CHANGED
@@ -33,7 +33,7 @@ import {
33
33
  getAgentConfig, detectAlwaysAsk,
34
34
  } from '../src/shield/enforce.js';
35
35
  import { DecisionLogger } from '../src/shield/decisions.js';
36
- import { listSessions } from '../src/sources/anthropic-managed.js';
36
+ import { listSessions, listAgents } from '../src/sources/anthropic-managed.js';
37
37
  import { FortressPolicySource, postDecision } from '../src/shield/sources/fortress.js';
38
38
  import { resolveFortressBase } from '../src/fortress/url.js';
39
39
  import { isValidAgentId, isValidSessionId } from '../src/validate.js';
@@ -423,10 +423,15 @@ async function main() {
423
423
  explicitUrl: args['fortress-url'],
424
424
  });
425
425
  const logDir = resolve(args['log-dir'] || './watchmyagents-logs');
426
+ const allAgents = !!args['all-agents'];
426
427
 
427
428
  if (!apiKey) die('error: --api-key or ANTHROPIC_API_KEY required');
428
- if (!agentId) die('error: --agent-id required');
429
- if (!isValidAgentId(agentId)) {
429
+ if (!allAgents && !agentId) die('error: --agent-id required (or --all-agents for fleet mode)');
430
+ if (allAgents && singleSessionId) die('error: --all-agents is incompatible with --session-id');
431
+ if (allAgents && policiesSource !== 'fortress') {
432
+ die('error: --all-agents requires --policies-source fortress (per-agent policies).');
433
+ }
434
+ if (agentId && !isValidAgentId(agentId)) {
430
435
  die(`error: --agent-id has invalid format (expected "agent_" + alphanumeric, got "${agentId}")`);
431
436
  }
432
437
  // --session-id ends up in the Anthropic SSE URL path (src/shield/stream.js).
@@ -435,120 +440,112 @@ async function main() {
435
440
  die(`error: --session-id has invalid format (expected "sesn_" + alphanumeric, got "${singleSessionId}")`);
436
441
  }
437
442
 
438
- // Policies source: --policies-source fortress | local (default infers from --policy)
439
- let ruleset; // for 'local' mode: static; for 'fortress': initial snapshot
440
- let fortressPolicies; // FortressPolicySource instance, used as ground truth at runtime
441
-
443
+ // Validate the policy source config once (shared across the fleet). For local
444
+ // mode the ruleset is loaded once and shared by every agent.
445
+ let sharedLocalRuleset = null;
442
446
  if (policiesSource === 'fortress') {
443
447
  if (!wmaApiKey) die('error: --policies-source fortress requires --wma-api-key or WMA_API_KEY env');
444
448
  if (!fortressBase) die('error: --policies-source fortress requires --fortress-base-url or WMA_FORTRESS_BASE_URL env');
445
449
  if (!/^wma_[a-f0-9]{32}$/i.test(wmaApiKey)) warn(`WMA_API_KEY format looks unusual (expected wma_<32hex>).`);
446
-
447
- fortressPolicies = new FortressPolicySource({
448
- apiKey: wmaApiKey,
449
- base: fortressBase,
450
- anthropicAgentId: agentId,
451
- refreshIntervalMs: 5 * 60_000,
452
- onError: (e) => warn(`policy refresh failed (keeping cached): ${e.message}`),
453
- onRefresh: ({ policies, fetched_at, initial }) => {
454
- info(`policies ${initial ? 'loaded' : 'refreshed'} from Fortress — ${policies.length} active (fetched_at: ${fetched_at})`);
455
- },
456
- });
457
- try {
458
- await fortressPolicies.start();
459
- } catch (e) {
460
- die(`error fetching policies from Fortress: ${e.message}\n` +
461
- ` Check WMA_FORTRESS_BASE_URL and WMA_API_KEY.`);
462
- }
463
- ruleset = fortressPolicies.current();
464
450
  } else if (policiesSource === 'local') {
465
451
  if (!policyPath) die('error: --policies-source local requires --policy <path-to-policies.json>');
466
- try {
467
- ruleset = await loadPolicies(resolve(policyPath));
468
- } catch (e) {
469
- die(`error loading policies: ${e.message}`);
470
- }
452
+ try { sharedLocalRuleset = await loadPolicies(resolve(policyPath)); }
453
+ catch (e) { die(`error loading policies: ${e.message}`); }
471
454
  } else {
472
455
  die('error: --policy <path> OR --policies-source fortress required');
473
456
  }
474
457
 
475
- let mode = 'interrupt';
476
- let agentMeta = null;
477
- try {
478
- agentMeta = await getAgentConfig(apiKey, agentId);
479
- if (detectAlwaysAsk(agentMeta)) mode = 'tool_confirmation';
480
- } catch (e) {
481
- warn(`could not fetch agent config (${e.message}). Defaulting to interrupt mode.`);
482
- }
483
-
484
- const sourceLabel = policiesSource === 'fortress'
485
- ? `Fortress (${fortressBase})`
486
- : policyPath;
487
- info(`armed — ${ruleset.policies.length} policies loaded from ${sourceLabel}`);
488
- info(`default action when no rule matches: ${ruleset.default.action}`);
489
- info(`agent: ${agentId}${agentMeta?.name ? ` "${agentMeta.name}"` : ''}`);
490
- info(`enforcement mode: ${mode}`);
491
- if (mode === 'interrupt') {
492
- warn('DEGRADED mode — Shield will interrupt AFTER a violating tool runs.');
493
- warn(`For pre-execution blocking, run: wma-shield --setup-guide --agent-id ${agentId}`);
458
+ // Resolve the agent list: whole fleet (--all-agents) or a single agent.
459
+ let agentIds;
460
+ if (allAgents) {
461
+ info('discovering agents (fleet mode)…');
462
+ const all = await listAgents(apiKey).catch((e) => die(`failed to list agents: ${e.message}`));
463
+ agentIds = all.map((a) => a.id).filter((id) => id && isValidAgentId(id));
464
+ if (agentIds.length === 0) die('error: no agents found under this API key');
465
+ info(`fleet: ${agentIds.length} agent(s)`);
466
+ } else {
467
+ agentIds = [agentId];
494
468
  }
469
+ const fleet = agentIds.length > 1;
495
470
 
496
- // Per-session DecisionLogger factory (each session gets its own to keep
497
- // sequence numbers monotonic per session).
498
- const loggers = new Map();
499
- const decisions = (sessionId) => {
500
- if (!loggers.has(sessionId)) {
501
- loggers.set(sessionId, new DecisionLogger({ logDir, agentId, sessionId }));
502
- }
503
- return loggers.get(sessionId);
471
+ // Shared infra: one shutdown signal, one fortress-source registry, one pusher.
472
+ const ac = new AbortController();
473
+ const fortressSources = [];
474
+ const shutdown = (sig) => {
475
+ info(`${sig} received, shutting down…`);
476
+ for (const fp of fortressSources) fp.stop();
477
+ ac.abort();
504
478
  };
479
+ process.on('SIGINT', () => shutdown('SIGINT'));
480
+ process.on('SIGTERM', () => shutdown('SIGTERM'));
505
481
 
506
- // Optional Fortress decision pusher only active if we have a wma key + base.
507
- // In 'fortress' mode this is always available. In 'local' mode it's a fire-
508
- // and-forget extra channel if both are set.
482
+ // Optional Fortress decision pusher (each ctx carries its own agent id, so a
483
+ // single shared pusher tags decisions with the right agent).
509
484
  const canPushToFortress = !!(wmaApiKey && fortressBase);
510
485
  const pushDecisionToFortress = canPushToFortress
511
486
  ? async (decisionData) => {
512
- try {
513
- await postDecision({ apiKey: wmaApiKey, base: fortressBase, decision: decisionData });
514
- } catch (e) {
515
- warn(`Fortress decision push failed: ${e.message}`);
516
- }
487
+ try { await postDecision({ apiKey: wmaApiKey, base: fortressBase, decision: decisionData }); }
488
+ catch (e) { warn(`Fortress decision push failed: ${e.message}`); }
517
489
  }
518
490
  : null;
519
491
 
520
- const ac = new AbortController();
521
- process.on('SIGINT', () => {
522
- info('SIGINT received, shutting down…');
523
- if (fortressPolicies) fortressPolicies.stop();
524
- ac.abort();
525
- });
526
- process.on('SIGTERM', () => {
527
- info('SIGTERM received, shutting down…');
528
- if (fortressPolicies) fortressPolicies.stop();
529
- ac.abort();
530
- });
492
+ // Per-agent SETUP (separate from the long-running phase so we can COUNT how
493
+ // many actually armed). In fleet mode a per-agent startup failure is skipped
494
+ // (warn) instead of killing the fleet. Returns the agent's ctx, or null if skipped.
495
+ async function setupAgent(aid) {
496
+ const tag = fleet ? `[${aid.slice(0, 16)}…] ` : '';
497
+ let fortressPolicies = null;
498
+ let ruleset = sharedLocalRuleset;
499
+ if (policiesSource === 'fortress') {
500
+ fortressPolicies = new FortressPolicySource({
501
+ apiKey: wmaApiKey, base: fortressBase, anthropicAgentId: aid, refreshIntervalMs: 5 * 60_000,
502
+ onError: (e) => warn(`${tag}policy refresh failed (keeping cached): ${e.message}`),
503
+ onRefresh: ({ policies, fetched_at, initial }) => info(`${tag}policies ${initial ? 'loaded' : 'refreshed'} from Fortress — ${policies.length} active (fetched_at: ${fetched_at})`),
504
+ });
505
+ try { await fortressPolicies.start(); }
506
+ catch (e) {
507
+ if (fleet) { warn(`${tag}skipped — policy fetch failed: ${e.message}`); return null; }
508
+ die(`error fetching policies from Fortress: ${e.message}\n Check WMA_FORTRESS_BASE_URL and WMA_API_KEY.`);
509
+ }
510
+ fortressSources.push(fortressPolicies);
511
+ ruleset = fortressPolicies.current();
512
+ }
531
513
 
532
- // ctx exposes a getter for the live ruleset so workers see policy refreshes.
533
- const ctx = {
534
- apiKey,
535
- agentId,
536
- get ruleset() {
537
- return fortressPolicies ? fortressPolicies.current() : ruleset;
538
- },
539
- mode,
540
- decisions,
541
- pushDecisionToFortress,
542
- signalsSalt,
543
- signal: ac.signal,
544
- };
514
+ let mode = 'interrupt';
515
+ let agentMeta = null;
516
+ try { agentMeta = await getAgentConfig(apiKey, aid); if (detectAlwaysAsk(agentMeta)) mode = 'tool_confirmation'; }
517
+ catch (e) { warn(`${tag}could not fetch agent config (${e.message}). Defaulting to interrupt mode.`); }
545
518
 
546
- if (singleSessionId) {
547
- info(`single-session mode attached to ${singleSessionId}`);
548
- await runSessionWorker({ sessionId: singleSessionId, ctx });
549
- } else {
550
- await runAgentWide(ctx);
519
+ info(`${tag}armed — ${ruleset.policies.length} policies · default ${ruleset.default.action} · mode ${mode}${agentMeta?.name ? ` · "${agentMeta.name}"` : ''}`);
520
+ if (mode === 'interrupt' && !fleet) {
521
+ warn('DEGRADED mode Shield will interrupt AFTER a violating tool runs.');
522
+ warn(`For pre-execution blocking, run: wma-shield --setup-guide --agent-id ${aid}`);
523
+ }
524
+
525
+ const loggers = new Map();
526
+ const decisions = (sessionId) => {
527
+ if (!loggers.has(sessionId)) loggers.set(sessionId, new DecisionLogger({ logDir, agentId: aid, sessionId }));
528
+ return loggers.get(sessionId);
529
+ };
530
+ return {
531
+ apiKey, agentId: aid,
532
+ get ruleset() { return fortressPolicies ? fortressPolicies.current() : ruleset; },
533
+ mode, decisions, pushDecisionToFortress, signalsSalt, signal: ac.signal,
534
+ };
535
+ }
536
+
537
+ // Phase 1: arm every agent. Fail LOUD if none armed (otherwise the process would
538
+ // exit silently and — under launchd/systemd — restart-loop without a clear cause).
539
+ const ctxs = (await Promise.all(agentIds.map(setupAgent))).filter(Boolean);
540
+ if (ctxs.length === 0) {
541
+ die(`error: no agents could be armed (${agentIds.length} discovered; all policy fetches failed). Check WMA_API_KEY / WMA_FORTRESS_BASE_URL.`);
551
542
  }
543
+ if (fleet) info(`armed ${ctxs.length}/${agentIds.length} agent(s); watching.`);
544
+
545
+ // Phase 2: run each agent's loop (blocks until SIGINT/SIGTERM).
546
+ await Promise.all(ctxs.map((ctx) => (
547
+ singleSessionId ? runSessionWorker({ sessionId: singleSessionId, ctx }) : runAgentWide(ctx)
548
+ )));
552
549
  }
553
550
 
554
551
  main().catch(e => {
@@ -77,6 +77,24 @@ export async function getAgent(apiKey, agentId) {
77
77
  return getWithRetry(apiKey, `/v1/agents/${agentId}`);
78
78
  }
79
79
 
80
/**
 * List every Managed Agent under the API key, following pagination to the end.
 * Used for fleet mode (watch/shield/service --all-agents) and agent discovery.
 *
 * @param {string} apiKey - key handed through to getWithRetry.
 * @param {{ limit?: number }} [options] - `limit` is the page size sent to the API (default 100).
 * @returns {Promise<object[]>} agent records in the order the API returned them.
 */
export async function listAgents(apiKey, { limit = 100 } = {}) {
  const agents = [];
  let after = null;
  while (true) {
    const qs = new URLSearchParams({ limit: String(limit) });
    if (after) qs.set('after_id', after);
    const data = await getWithRetry(apiKey, `/v1/agents?${qs}`);
    const page = data.data || [];
    const lastId = page[page.length - 1]?.id;
    // Stalled-cursor guard: a page that ends on the very id we just sent as the
    // cursor means the listing did not advance. Stop here instead of looping
    // forever and piling up duplicate records.
    if (after && lastId === after) break;
    for (const agent of page) agents.push(agent);
    if (!data.has_more || page.length === 0 || !lastId) break;
    after = lastId;
  }
  return agents;
}
97
+
80
98
  export async function listSessions(apiKey, { agentId, since, limit = 100 } = {}) {
81
99
  const sessions = [];
82
100
  let after = null;
@@ -0,0 +1,88 @@
1
+ {
2
+ "$comment": "WatchMyAgents — typology classifier weights + thresholds (Guardian Core, agent-typology-classification.spec.md §3/§4/§5). INVARIANT: weights and thresholds live HERE, never hardcoded in typology.js ('poids de signature en config, pas en dur'). Calibrate on labelled real traffic. Modèle C: all inputs are anonymized behavioural fractions/flags only.",
3
+ "version": "0.1.0",
4
+ "updated_at": "2026-05-29T00:00:00Z",
5
+
6
+ "thresholds": {
7
+ "$comment": "§4 'Seuils par défaut (à calibrer)' + §5 downgrade asymmetry.",
8
+ "n_events_min": 50,
9
+ "confidence_min": 0.70,
10
+ "margin_min": 0.15,
11
+ "stable_windows": 3,
12
+ "downgrade_confidence_min": 0.85,
13
+ "downgrade_windows": 5,
14
+ "untrusted_modifier_min": 0.1,
15
+ "sensitive_modifier_min": 0.0,
16
+ "payment_overlay_min": 0.0,
17
+ "autonomy_modifier_min": 0.5,
18
+ "$comment_tie": "§8 conservative tie-break: when |score(top1)-score(top2)| <= tie_epsilon (a near/exact tie between two REAL types with real signal), select the STRICTER of the two rather than falling to the more-permissive generic — 'dans le doute, on reste sur le plus protecteur'. Set to 0 for exact-tie only.",
19
+ "tie_epsilon": 0.0
20
+ },
21
+
22
+ "confidence_sigmoid": {
23
+ "$comment": "§4 confidence = sigmoid(a·top1.score + b·margin + c·log(n_events)). All three coefficients live in config; a naive impl that only used top1.score would be wrong.",
24
+ "a": 4.0,
25
+ "b": 6.0,
26
+ "c": 0.6,
27
+ "bias": -3.5
28
+ },
29
+
30
+ "strictness_rank": {
31
+ "$comment": "§5 restriction ranking — derived from each template's baseline_policies enforcement severity (isolate>block>require_approval>throttle>monitor>warn). Higher rank = STRICTER. Drives re-classification asymmetry: to a stricter rank = normal threshold; to a looser rank = downgrade gate (conf>=0.85 AND 5 windows). NOT alphabetical.",
32
+ "devops_infra": 10,
33
+ "transactional_financial": 9,
34
+ "workflow_backoffice": 8,
35
+ "coding": 7,
36
+ "orchestrator": 6,
37
+ "browser_web": 5,
38
+ "personal_assistant": 4,
39
+ "data_rag": 3,
40
+ "generic": 2,
41
+ "customer_facing": 1
42
+ },
43
+
44
+ "features": {
45
+ "$comment": "Canonical anonymized feature keys (Modèle C). Fractions f_* in [0,1]; flag_* in {0,1}; aux_* in [0,1]. Order is informational only — scoring is key-addressed.",
46
+ "fractions": ["f_code", "f_browser", "f_database", "f_http", "f_email", "f_payment", "f_secret", "f_search", "f_memory", "f_handoff", "f_user_msg", "f_file"],
47
+ "flags": ["flag_deploy", "flag_internal_sys", "flag_on_behalf"],
48
+ "aux": ["aux_autonomy", "aux_untrusted", "aux_sensitive"]
49
+ },
50
+
51
+ "weights": {
52
+ "$comment": "w[type][feature] — signature weights (§3). Positive = signal for the type; negative = signal against. flag_* are the REQUIRED discriminators for the 3 inseparable pairs (coding/devops, data_rag/workflow, personal_assistant/workflow). 'generic' has no positive weights (pure fallback).",
53
+
54
+ "coding": {
55
+ "f_code": 1.0, "f_file": 0.5, "f_search": 0.3, "f_secret": 0.1,
56
+ "flag_deploy": -0.9
57
+ },
58
+ "devops_infra": {
59
+ "f_code": 0.7, "f_secret": 0.6, "f_file": 0.2,
60
+ "flag_deploy": 1.2
61
+ },
62
+ "data_rag": {
63
+ "f_database": 0.8, "f_search": 0.35, "f_memory": 0.7, "aux_untrusted": 0.2,
64
+ "flag_internal_sys": -0.7
65
+ },
66
+ "customer_facing": {
67
+ "f_user_msg": 1.0, "f_handoff": 0.3, "f_email": 0.2
68
+ },
69
+ "browser_web": {
70
+ "f_browser": 1.0, "f_http": 0.6, "f_search": 0.7
71
+ },
72
+ "orchestrator": {
73
+ "f_handoff": 1.2, "f_code": -0.2, "f_browser": -0.2, "f_database": -0.2
74
+ },
75
+ "workflow_backoffice": {
76
+ "f_database": 0.6, "f_http": 0.5, "f_file": 0.2,
77
+ "flag_internal_sys": 0.9, "flag_on_behalf": -0.6
78
+ },
79
+ "personal_assistant": {
80
+ "f_email": 0.8, "f_file": 0.4, "f_user_msg": 0.3,
81
+ "flag_on_behalf": 1.0
82
+ },
83
+ "transactional_financial": {
84
+ "f_payment": 1.5
85
+ },
86
+ "generic": {}
87
+ }
88
+ }
@@ -0,0 +1,398 @@
1
+ // Agent typology classifier — maps an agent's OBSERVED behaviour to one of the
2
+ // 10 Guardian Core archetypes, for Shield template selection / refinement.
3
+ //
4
+ // Source of truth: GUARDIAN CORE/agent-typology-classification.spec.md (v0.1) +
5
+ // GUARDIAN CORE/schemas/agent-classification.schema.json. classifyAgentType()
6
+ // returns an object conforming EXACTLY to that schema.
7
+ //
8
+ // Why behaviour, not config: Anthropic Managed Agents expose their tools as an
9
+ // opaque bundle (`agent_toolset_20260401`), so static config can't tell a
10
+ // researcher from a coder. We classify from anonymized behavioural signals
11
+ // (Modèle C): per-tool-category FRACTIONS (f_*), boolean local flags (flag_*),
12
+ // and aux ratios (aux_*). NEVER raw content — no prompts, no outputs, no names.
13
+ //
14
+ // ──────────────────────────────────────────────────────────────────────────
15
+ // GLOBAL-BASELINE INDEPENDENCE (spec §1, §5 — INVARIANT, read this):
16
+ // The `global-baseline` (5 mandatory fail_closed floors) ALWAYS applies,
17
+ // regardless of the result — or absence — of classification. A bad
18
+ // classification degrades REFINEMENT, never the FLOOR. This classifier MUST
19
+ // NEVER gate, relax, or sit on the critical path of those floors. Nothing
20
+ // returned here can disable a floor. Template swaps bring new *probabilistic*
21
+ // policies in via `shadow` first; mandatory floors are never relaxed during
22
+ // the transition.
23
+ // ──────────────────────────────────────────────────────────────────────────
24
+ //
25
+ // INVARIANTS enforced here:
26
+ // 1. Modèle C — inputs are anonymized fractions/flags/aux ONLY.
27
+ // 2. Weights + thresholds come from config (typology-weights.json), never
28
+ // hardcoded in the logic below.
29
+ // 3. No easy downgrade — moving to a LESS strict template needs a raised
30
+ // confidence (0.85) AND a longer window (5), per the strictness ranking.
31
+ // 4. global-baseline is independent of classification (see banner above).
32
+
33
+ import { readFileSync } from 'node:fs';
34
+ import { fileURLToPath } from 'node:url';
35
+ import { dirname, join } from 'node:path';
36
+
37
+ const __dirname = dirname(fileURLToPath(import.meta.url));
38
+
39
// The 10 archetypes, in the order and spelling of the schema's
// `classified_type` enum.
export const ARCHETYPES = [
  'coding',
  'devops_infra',
  'data_rag',
  'customer_facing',
  'browser_web',
  'orchestrator',
  'workflow_backoffice',
  'personal_assistant',
  'transactional_financial',
  'generic',
];

// Modifiers (schema enum). They only ever ADD restrictions, so they can be
// activated immediately — no asymmetry / hysteresis (spec §6).
export const MODIFIERS = [
  'autonomy',
  'untrusted_input',
  'data_sensitivity',
  'regulated',
];
49
+
50
// ── Config (weights + thresholds + strictness ranking) — loaded once. ───────
// INVARIANT 2: nothing below hardcodes a weight or threshold; everything that
// influences the decision is read from this file.
let _config = null;

/**
 * Load the classifier config from a JSON file and cache it.
 *
 * The cache holds a single config: a repeat call with the same path returns
 * the cached object, a call with a different path re-reads from disk and
 * replaces it. The loaded object is tagged with `__path` so the cache can tell
 * which file it came from.
 *
 * @param {string} [path] - JSON file to read; defaults to typology-weights.json
 *   next to this module.
 * @returns {object} the parsed config, with `__path` set to `path`.
 * @throws {Error} the original read/parse error (same class and `code`), with
 *   the offending path prefixed to its message.
 * @throws {TypeError} when the JSON root is not a plain object.
 */
export function loadConfig(path = join(__dirname, 'typology-weights.json')) {
  if (_config && path === _config.__path) return _config;
  let raw;
  try {
    raw = JSON.parse(readFileSync(path, 'utf8'));
  } catch (err) {
    // Rethrow the same error object so callers still see its class / code,
    // but say which file could not be loaded.
    err.message = `typology config ${path}: ${err.message}`;
    throw err;
  }
  // A null root used to crash on the __path assignment with an opaque message,
  // and an array root was silently accepted as a config with no thresholds.
  if (raw === null || typeof raw !== 'object' || Array.isArray(raw)) {
    throw new TypeError(`typology config ${path}: root must be a JSON object`);
  }
  raw.__path = path;
  _config = raw;
  return _config;
}

/**
 * Test/seam: inject a config object directly, replacing whatever is cached.
 * The caller's object is shallow-copied, not mutated.
 *
 * @param {object} cfg - config to install.
 * @returns {object} the installed copy, tagged with `__path: '<injected>'`.
 */
export function setConfig(cfg) {
  _config = { ...cfg, __path: '<injected>' };
  return _config;
}
63
+
64
// Logistic function: squashes any real x into the 0..1 range; sigmoid(0) is 0.5.
const sigmoid = (x) => 1 / (1 + Math.exp(-x));

// Clamp a value into [0, 1]. Math.min / Math.max propagate NaN, and NaN
// compares false against every threshold, so an un-clamped NaN would slip past
// any `value < minimum` gate. NaN therefore collapses to 0 (the conservative
// end); ±Infinity still clamp to the nearest bound.
const clamp01 = (x) => {
  const value = Number(x);
  return Number.isNaN(value) ? 0 : Math.max(0, Math.min(1, value));
};
66
+
67
// Strictness lookup used by the re-classification asymmetry. A higher rank
// means a stricter template: moving to an equal-or-higher rank is an upgrade
// (or lateral move); moving to a strictly lower rank is a downgrade (gated).
// Types with no finite numeric rank in cfg.strictness_rank count as 0.
function strictnessOf(cfg, type) {
  const rank = (cfg.strictness_rank || {})[type];
  if (!Number.isFinite(rank)) return 0;
  return rank;
}
74
+
75
/**
 * Build the canonical feature vector from a loose features object.
 *
 * Only the keys declared in cfg.features (fractions, then flags, then aux) are
 * emitted. Each value is coerced with Number() and clamped to [0,1]; anything
 * that is missing or not a finite number becomes 0 — an absent signal is "not
 * observed", never inferred.
 *
 * @param {object} cfg - config whose `features` lists the legal keys.
 * @param {object} [features] - loose input; extra keys are ignored.
 * @returns {Object<string, number>} one entry per declared key, each in [0,1].
 */
function normalizeFeatures(cfg, features) {
  const declared = cfg.features;
  const keys = [
    ...(declared?.fractions || []),
    ...(declared?.flags || []),
    ...(declared?.aux || []),
  ];
  return keys.reduce((vector, key) => {
    const value = Number(features?.[key]);
    vector[key] = Number.isFinite(value) ? clamp01(value) : 0;
    return vector;
  }, {});
}
93
+
94
/**
 * score(type) = Σ_i w[type][i] · feature_i (spec §4).
 *
 * @param {Object<string, number>} [weightsForType] - feature → weight; a
 *   missing table scores 0, and non-numeric weights contribute 0.
 * @param {Object<string, number>} fv - feature vector; absent features count as 0.
 * @returns {number} the weighted sum.
 */
function scoreType(weightsForType, fv) {
  return Object.entries(weightsForType || {}).reduce(
    (total, [feature, weight]) => total + (Number(weight) || 0) * (fv[feature] || 0),
    0,
  );
}
102
+
103
/**
 * Score every archetype except 'generic' and return them best-first.
 *
 * 'generic' is left out of the ranking — it is the fallback only. Entries are
 * ordered by descending score; on EQUAL scores the STRICTER type (higher
 * strictness rank) comes first, the conservative tie-break of spec §8.
 *
 * @param {object} cfg - config providing `weights` and the strictness ranking.
 * @param {Object<string, number>} fv - normalized feature vector.
 * @returns {{type: string, score: number}[]} the sorted list; callers read the
 *   top candidates from its first entries.
 */
function rankTypes(cfg, fv) {
  const ranked = [];
  for (const type of ARCHETYPES) {
    if (type === 'generic') continue;
    ranked.push({ type, score: scoreType(cfg.weights?.[type], fv) });
  }
  return ranked.sort((left, right) => (
    left.score === right.score
      // tie → stricter (higher strictness rank) first
      ? strictnessOf(cfg, right.type) - strictnessOf(cfg, left.type)
      : right.score - left.score
  ));
}
119
+
120
+ /**
121
+ * classifyAgentType(features[, prior][, opts]) → object conforming EXACTLY to
122
+ * agent-classification.schema.json.
123
+ *
124
+ * @param {object} features Anonymized behavioural signals (Modèle C):
125
+ * agent_id {string} pass-through identifier (no content)
126
+ * f_code,f_browser,… {number} per-category FRACTIONS in [0,1]
127
+ * flag_deploy,… {0|1|bool} local discriminator flags (no content)
128
+ * aux_autonomy,… {number} aux ratios in [0,1]
129
+ * n_events {number} events in the current sliding window
130
+ * @param {object} [prior] Previous classification result (the caller
131
+ * threads this so the state machine + asymmetry work across windows). Reads:
132
+ * classified_type, stage, windows_consistent, strictness_rank,
133
+ * last_reclassified_at.
134
+ * @param {object} [opts]
135
+ * regulated {boolean} tenant/Fortress flag (config-driven, NOT
136
+ * behavioural) → adds the `regulated` modifier
137
+ * now {string} ISO timestamp seam for tests
138
+ * config {object} inject config (else loaded from disk)
139
+ * @returns {object} schema-conformant classification result
140
+ */
141
+ export function classifyAgentType(features = {}, prior = null, opts = {}) {
142
+ const cfg = opts.config ? setConfig(opts.config) : loadConfig();
143
+ const th = cfg.thresholds || {};
144
+ const sg = cfg.confidence_sigmoid || {};
145
+ const now = opts.now || new Date().toISOString();
146
+
147
+ const agent_id = String(features.agent_id ?? prior?.agent_id ?? '');
148
+ const fv = normalizeFeatures(cfg, features);
149
+ // Floor + finiteness guard: the schema declares window_events as integer.
150
+ // Non-finite (Infinity/NaN) → 0 so it can't saturate confidence via log(n).
151
+ const _rawN = Number(features.n_events);
152
+ const nEvents = Number.isFinite(_rawN) ? Math.max(0, Math.floor(_rawN)) : 0;
153
+
154
+ // ── Score every archetype, find top1 / top2 / margin (spec §4). ──────────
155
+ const ranked = rankTypes(cfg, fv);
156
+ const top1 = ranked[0] || { type: 'generic', score: 0 };
157
+ const top2 = ranked[1] || { type: 'generic', score: 0 };
158
+ const margin = top1.score - top2.score;
159
+
160
+ // confidence = sigmoid(a·top1.score + b·margin + c·log(n_events) + bias).
161
+ // All three terms (top1 score, margin, log n_events) are folded in — NOT just
162
+ // top1.score. Coefficients a/b/c/bias come from config.
163
+ const logN = Math.log(Math.max(1, nEvents));
164
+ const confidence = clamp01(
165
+ sigmoid((sg.a || 0) * top1.score + (sg.b || 0) * margin + (sg.c || 0) * logN + (sg.bias || 0)),
166
+ );
167
+
168
+ // ── Candidate type per the gates (spec §4). ──────────────────────────────
169
+ // n_events < MIN_EVENTS → generic (cold-start)
170
+ // OR confidence < CONF_THRESHOLD → generic
171
+ // OR margin < MARGIN_MIN → generic
172
+ // else → top1.type
173
+ let candidate;
174
+ const belowMinEvents = nEvents < th.n_events_min;
175
+ const lowConfidence = confidence < th.confidence_min;
176
+ const lowMargin = margin < th.margin_min;
177
+
178
+ // Conservative tie-break (spec §8): "en cas d'égalité, choisir le plus strict
179
+ // (conservateur)". When the top two are a near-TIE (|margin| ≤ tie_epsilon)
180
+ // between two REAL types and there is real signal (top1.score > 0), dropping
181
+ // to generic would RELAX protection — so instead we keep the STRICTER of the
182
+ // tied pair. rankTypes() already sorts the stricter type first on an exact
183
+ // tie, so top1 IS the stricter one here. This applies only on a true tie; a
184
+ // genuinely ambiguous low-signal window (no tie, just a small margin) still
185
+ // falls back to generic via the margin gate below.
186
+ const tieEps = th.tie_epsilon ?? 0;
187
+ const isTie = top1.score > 0 && top2.type !== 'generic' && Math.abs(margin) <= tieEps;
188
+
189
+ if (belowMinEvents) candidate = 'generic';
190
+ else if (isTie) candidate = top1.type; // stricter-wins, conservative
191
+ else if (lowConfidence || lowMargin) candidate = 'generic';
192
+ else candidate = top1.type;
193
+
194
+ // ── State machine + re-classification asymmetry (spec §5). ───────────────
195
+ // We accept the prior state as input so the CALLER threads it across windows;
196
+ // this function is otherwise pure for a given (features, prior).
197
+ const priorType = prior?.classified_type || 'generic';
198
+ const priorStage = prior?.stage || 'cold_start';
199
+ const priorWindows = Math.max(0, Math.floor(Number(prior?.windows_consistent) || 0));
200
+ const priorReclassAt = prior?.last_reclassified_at || null;
201
+ // Last real (non-generic) type, threaded across generic gaps. Closes the
202
+ // generic-laundering downgrade bypass: a strict→generic→looser-real sequence
203
+ // must still face the downgrade gate against the ORIGINAL strict rank.
204
+ const priorLastReal = prior?.last_real_type || (priorType !== 'generic' ? priorType : null);
205
+ // The candidate the prior window(s) were already accumulating toward (if any).
206
+ // The caller threads this so a pending change builds consecutive evidence
207
+ // across windows instead of resetting every window.
208
+ const priorPending = prior?.pending_type || null;
209
+
210
+ let classified_type = priorType;
211
+ let stage = priorStage;
212
+ let windows_consistent = priorWindows;
213
+ let last_reclassified_at = priorReclassAt;
214
+ // pending_type: the candidate we are accumulating consecutive windows toward
215
+ // but have not yet committed (hysteresis / asymmetry not satisfied). Surfaced
216
+ // in the result so the caller can thread it back next window.
217
+ let pending_type = null;
218
+ let pending_windows = 0;
219
+
220
+ if (belowMinEvents) {
221
+ // A low-traffic window must NOT collapse an established type. An adversary
222
+ // could throttle below MIN_EVENTS to shed a strict template (downgrade
223
+ // bypass). If we already hold a real type, RETAIN it (freeze the window
224
+ // count); only a genuinely cold agent (no prior real type) stays generic.
225
+ if (priorType !== 'generic') {
226
+ classified_type = priorType;
227
+ stage = priorStage;
228
+ windows_consistent = priorWindows;
229
+ } else {
230
+ classified_type = 'generic';
231
+ stage = 'cold_start';
232
+ windows_consistent = 0;
233
+ }
234
+ } else if (candidate === priorType) {
235
+ // Same type as last window → accumulate consistency (hysteresis).
236
+ windows_consistent = priorWindows + 1;
237
+ // provisional → stable after STABLE_WINDOWS consecutive consistent windows.
238
+ if (classified_type !== 'generic' && stage !== 'stable' &&
239
+ windows_consistent >= th.stable_windows) {
240
+ stage = 'stable';
241
+ } else if (classified_type !== 'generic' && stage === 'cold_start') {
242
+ stage = 'provisional';
243
+ }
244
+ } else {
245
+ // Type would CHANGE relative to the prior. Decide whether the change is
246
+ // allowed THIS window, or whether we must accumulate more evidence.
247
+ const toRank = strictnessOf(cfg, candidate);
248
+ // Reference rank for downgrade detection: the prior REAL type, or — across a
249
+ // generic gap — the last real type before we fell to generic. Using the
250
+ // last-real reference closes the generic-laundering bypass (strict → generic
251
+ // → looser-real must still face the downgrade gate against the strict rank).
252
+ const refType = (priorType !== 'generic') ? priorType : (priorLastReal || 'generic');
253
+ const refRank = strictnessOf(cfg, refType);
254
+ // A "downgrade" = moving to a real template STRICTLY LESS strict than the
255
+ // reference. Upgrading / lateral is NOT a downgrade. Falling back TO generic
256
+ // is handled below (never relaxes the floor — the global-baseline always applies).
257
+ const isDowngrade = candidate !== 'generic' && refType !== 'generic' && toRank < refRank;
258
+
259
+ // Required consecutive-consistent-window count BEFORE applying the change.
260
+ // Leaving generic (cold_start → provisional): the FIRST window clearing
261
+ // the gates commits — provisional = "1er type au-dessus du seuil" (§5).
262
+ // Upgrade / lateral (real → real, equal-or-stricter): normal hysteresis
263
+ // STABLE_WINDOWS, never relaxes the floor.
264
+ // Downgrade (real → LESS strict): longer DOWNGRADE_WINDOWS AND a raised
265
+ // confidence floor (anti mimicry-evasion: an agent must not soften its
266
+ // protection by imitating a more permissive type).
267
+ const leavingGeneric = priorType === 'generic';
268
+ // Leaving generic is fast (1 window) ONLY when it is not a net downgrade vs
269
+ // the last real type. A net downgrade — even laundered through generic —
270
+ // takes the full gate: longer window AND raised confidence (anti-evasion).
271
+ const neededWindows = isDowngrade ? th.downgrade_windows
272
+ : (leavingGeneric ? 1 : th.stable_windows);
273
+ const neededConfidence = isDowngrade ? th.downgrade_confidence_min : th.confidence_min;
274
+
275
+ // Consecutive consistent windows toward THIS candidate. If the prior window
276
+ // was already accumulating toward the same candidate, continue the count;
277
+ // otherwise this is the first window of a fresh pending change.
278
+ const accWindows = (priorPending === candidate)
279
+ ? Math.max(0, Math.floor(Number(prior?.pending_windows) || 0)) + 1
280
+ : 1;
281
+
282
+ if (candidate === 'generic') {
283
+ // Falling back to generic is never a security relaxation we must gate —
284
+ // the global-baseline floor still applies — but we still respect
285
+ // hysteresis so a single noisy window can't flap us out of a real type.
286
+ if (priorType === 'generic') {
287
+ windows_consistent = priorWindows + 1;
288
+ classified_type = 'generic';
289
+ stage = 'cold_start';
290
+ } else {
291
+ // Accumulate toward dropping the type, but keep the (stricter) prior
292
+ // until hysteresis is satisfied — conservative.
293
+ if (accWindows >= th.stable_windows) {
294
+ classified_type = 'generic';
295
+ stage = 'cold_start';
296
+ windows_consistent = 1;
297
+ last_reclassified_at = now;
298
+ } else {
299
+ pending_type = 'generic';
300
+ pending_windows = accWindows;
301
+ // classified_type / stage / windows_consistent unchanged (stay put).
302
+ }
303
+ }
304
+ } else if (confidence >= neededConfidence && accWindows >= neededWindows) {
305
+ // Enough consecutive evidence (counting the current window) to commit the
306
+ // change. The caller threads pending_type/pending_windows so consecutive
307
+ // windows toward the same candidate accumulate.
308
+ classified_type = candidate;
309
+ // A freshly committed type always lands in 'provisional'; it climbs to
310
+ // 'stable' only after STABLE_WINDOWS consecutive same-type windows.
311
+ stage = 'provisional';
312
+ windows_consistent = 1;
313
+ last_reclassified_at = now;
314
+ } else {
315
+ // Not enough evidence yet → keep the prior (stricter-by-default) type and
316
+ // record the pending candidate so the next window can build on it. We do
317
+ // NOT touch windows_consistent of the committed type (it still applies).
318
+ pending_type = candidate;
319
+ pending_windows = accWindows;
320
+ }
321
+ }
322
+
323
+ // Stage sanity: generic is always cold_start.
324
+ if (classified_type === 'generic') stage = 'cold_start';
325
+
326
+ // Last real (non-generic) type — threaded so a generic gap doesn't erase the
327
+ // downgrade reference (see priorLastReal). Persists across generic windows.
328
+ const last_real_type = (classified_type !== 'generic') ? classified_type : (priorLastReal || null);
329
+
330
+ // ── Modifiers (spec §6): additive restrictions, no asymmetry/hysteresis. ──
331
+ const modifiers = [];
332
+ const autonomyLevel = String(features.autonomy_level ?? features.aux_autonomy_level ?? '');
333
+ const auxAutonomy = Number(features.aux_autonomy) || 0;
334
+ // autonomy: explicit level in {act_with_approval, autonomous}, or a high ratio.
335
+ if (['act_with_approval', 'autonomous'].includes(autonomyLevel) || auxAutonomy >= (th.autonomy_modifier_min ?? 0.5)) {
336
+ modifiers.push('autonomy');
337
+ }
338
+ if ((fv.aux_untrusted || 0) > (th.untrusted_modifier_min ?? 0.1)) {
339
+ modifiers.push('untrusted_input');
340
+ }
341
+ if ((fv.aux_sensitive || 0) > (th.sensitive_modifier_min ?? 0)) {
342
+ modifiers.push('data_sensitivity');
343
+ }
344
+ // regulated is tenant/Fortress config — NOT behavioural.
345
+ if (opts.regulated === true) modifiers.push('regulated');
346
+
347
+ // ── Payment overlay (spec §3/§5/§6): f_payment > 0 FORCES the transactional
348
+ // profile even when another base type dominates. It is an OVERLAY, not a
349
+ // winner-take-all reclassification: the base type stays, and we surface the
350
+ // overlay in evidence so the Shield layer adds the confirmation/limit
351
+ // policies. Reducing f_payment to flee transactional_financial is neutralized
352
+ // by the downgrade asymmetry + the always-on floor.
353
+ //
354
+ // It is surfaced in `evidence.payment_overlay`, NOT in `modifiers[]`: the
355
+ // schema's modifiers enum is fixed to {autonomy, untrusted_input,
356
+ // data_sensitivity, regulated} — "transactional" is not a legal modifier
357
+ // value, so emitting it there would violate the schema. evidence has no
358
+ // additionalProperties:false, so it is the schema-legal carrier for the overlay.
359
+ const paymentOverlay = (fv.f_payment || 0) > (th.payment_overlay_min ?? 0);
360
+
361
+ // ── Evidence (schema-shaped). ────────────────────────────────────────────
362
+ const evidence = {
363
+ window_events: nEvents,
364
+ top2_type: top2.type,
365
+ margin: Number(margin.toFixed(6)),
366
+ };
367
+ // Extra evidence keys are schema-legal (evidence has no additionalProperties:
368
+ // false). Surface the decision context for audit — never raw content.
369
+ if (paymentOverlay) {
370
+ evidence.payment_overlay = {
371
+ active: true,
372
+ f_payment: fv.f_payment,
373
+ adds: 'transactional_financial confirmation/limit policies (overlay, base type unchanged)',
374
+ };
375
+ }
376
+ evidence.confidence_terms = { top1_score: Number(top1.score.toFixed(6)), margin: Number(margin.toFixed(6)), log_n_events: Number(logN.toFixed(6)) };
377
+
378
+ return {
379
+ agent_id,
380
+ classified_type,
381
+ confidence: Number(confidence.toFixed(6)),
382
+ stage,
383
+ modifiers,
384
+ evidence,
385
+ feature_vector: fv,
386
+ windows_consistent,
387
+ strictness_rank: strictnessOf(cfg, classified_type),
388
+ ...(last_reclassified_at ? { last_reclassified_at } : {}),
389
+ // Hysteresis carry-over (schema-legal extras: root has no
390
+ // additionalProperties:false). The caller threads these back as part of the
391
+ // `prior` next window so a pending change accumulates consecutive evidence,
392
+ // and so the downgrade reference survives a generic gap (anti-evasion).
393
+ ...(pending_type ? { pending_type, pending_windows } : {}),
394
+ ...(last_real_type ? { last_real_type } : {}),
395
+ };
396
+ }
397
+
398
+ export default classifyAgentType; // expose the agent-type classifier as this module's default export