@polygraphso/litmus 0.9.1 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -30,16 +30,28 @@ and the grade is capped at **B** for that run.
30
30
  ```bash
31
31
  polygraphso-litmus litmus <registry-ref | https-url | path-to-mcp> # grade a server
32
32
  polygraphso-litmus litmus --json <ref> # machine-readable evidence bundle
33
+ polygraphso-litmus litmus --timeout <seconds> <ref> # cap the whole run (default 900s)
33
34
  polygraphso-litmus check <ref> # look up a published grade
34
35
  ```
35
36
 
36
37
  Examples:
37
38
 
38
39
  ```bash
39
- polygraphso-litmus litmus npm/@modelcontextprotocol/server-filesystem
40
+ # a remote https target runs no local code — graded directly
40
41
  polygraphso-litmus litmus https://example.com/mcp
42
+
43
+ # a registry ref or local file launches the TARGET's own code. Grade it sandboxed:
44
+ LITMUS_STDIO_ISOLATION=docker polygraphso-litmus litmus npm/@modelcontextprotocol/server-filesystem
45
+ # …or, without Docker, opt in to running it on this host:
46
+ polygraphso-litmus litmus --unsafe-host-exec npm/@modelcontextprotocol/server-filesystem
41
47
  ```
42
48
 
49
+ **Host-execution safety.** Grading a registry ref (`npm/…`, `pypi/…`) or a local
50
+ path **launches the target's own code**. By default the CLI refuses to do that on
51
+ your host: set `LITMUS_STDIO_ISOLATION=docker` to run the target only inside the
52
+ hardened sandbox, or pass `--unsafe-host-exec` to accept host execution. Remote
53
+ `https://` targets run no local code and need neither.
54
+
43
55
  The `litmus` command exits non-zero on a failing grade (D/F), so it can gate CI scripts.
44
56
 
45
57
  To dispute a published grade, just re-run `litmus` against the same server: the harness is
@@ -54,7 +66,9 @@ MCP-capable client. It exposes two tools:
54
66
  and return the grade and the evidence. Optional **`bearer`** (and `header`
55
67
  entries, each `"Key: Value"`) grade a token-gated `https://` MCP target — sent
56
68
  to that origin only, ignored for stdio/local targets, the same plumbing as the
57
- CLI's `--bearer` / `--header`.
69
+ CLI's `--bearer` / `--header`. Grading a registry ref or local path launches the
70
+ target's own code, so it requires **`unsafe_host_exec: true`** unless
71
+ `LITMUS_STDIO_ISOLATION=docker` is set (the MCP mirror of `--unsafe-host-exec`).
58
72
  - **`verify_attestation`** — passively read a server's *already-published* grade
59
73
  before trusting or paying it.
60
74
 
@@ -0,0 +1,215 @@
1
+ import {
2
+ CATEGORY_META,
3
+ canonicalStringify
4
+ } from "./chunk-X3P26XGS.js";
5
+
6
+ // ../cli/src/litmus.ts
7
+ import { existsSync } from "fs";
8
+ import { createRequire } from "module";
9
+ import * as path from "path";
10
+
11
+ // ../cli/src/format.ts
12
/**
 * Render an evidence bundle as the human-readable CLI report.
 *
 * Output, in order: methodology + server ref, resolved version (when present), one
 * label/status line plus one description line per category, up to three
 * high-severity C-01 findings when C-01 failed, the shortened tool-definition
 * fingerprint, the grade, and its rationale.
 *
 * A category code that is missing from CATEGORY_META (for example a bundle produced
 * by a different methodology version) is rendered with the bare code as its label
 * and no description line, instead of throwing.
 *
 * @param {object} b - Evidence bundle (reads methodologyVersion, serverRef,
 *   resolvedVersion, categories, toolDefsFingerprint, grade, gradeRationale).
 * @returns {string} The report text, terminated by a newline.
 */
function formatBundle(b) {
  // Own-property lookup so codes such as "constructor" cannot resolve to inherited members.
  const metaFor = (code) =>
    Object.prototype.hasOwnProperty.call(CATEGORY_META, code)
      ? CATEGORY_META[code]
      : { label: String(code), description: "" };
  const lines = [];
  lines.push(`\u2192 ${b.methodologyVersion} \xB7 ${b.serverRef}`);
  if (b.resolvedVersion) lines.push(`\u2192 version ${b.resolvedVersion}`);
  lines.push("\u2192 checks");
  // Pad every label to the widest one so the status column lines up.
  const labelWidth = Math.max(0, ...b.categories.map((c) => metaFor(c.code).label.length));
  for (const c of b.categories) {
    const { label, description } = metaFor(c.code);
    lines.push(` ${c.code} ${label.padEnd(labelWidth)} ${c.status}`);
    if (description) lines.push(` ${description}`);
  }
  const c01 = b.categories.find((c) => c.code === "C-01");
  if (c01?.status === "fail") {
    const highs = c01.probes.flatMap((p) => p.findings).filter((f) => f.severity === "high");
    // Only the first three high-severity findings are shown in the text report.
    for (const f of highs.slice(0, 3)) {
      lines.push(` \u26A0 ${f.tool ?? "?"}: ${f.kind} \u2014 ${truncate(f.match, 64)}`);
    }
  }
  lines.push(`\u2192 fingerprint ${shortFp(b.toolDefsFingerprint)}`);
  lines.push(`\u2192 grade: ${b.grade}`);
  lines.push(` ${b.gradeRationale}`);
  return lines.join("\n") + "\n";
}
35
/**
 * Abbreviate a fingerprint for display.
 *
 * Strings longer than 14 characters become the first 6 characters, an ellipsis,
 * and the last 4 characters; shorter strings are returned unchanged.
 *
 * @param {string} fp - Fingerprint text.
 * @returns {string} The abbreviated (or original) fingerprint.
 */
function shortFp(fp) {
  if (fp.length <= 14) return fp;
  const head = fp.slice(0, 6);
  const tail = fp.slice(-4);
  return `${head}\u2026${tail}`;
}
38
/**
 * Cut a string down to at most `n` characters, appending an ellipsis when it was cut.
 *
 * @param {string} s - Text to shorten.
 * @param {number} n - Maximum number of characters kept from `s`.
 * @returns {string} `s` unchanged when it already fits, otherwise its first `n`
 *   characters followed by "…".
 */
function truncate(s, n) {
  if (s.length <= n) return s;
  return s.slice(0, n) + "\u2026";
}
41
+
42
+ // ../cli/src/litmus.ts
43
// Default wall-clock ceiling for a whole litmus run: 900 seconds (15 minutes), in milliseconds.
var DEFAULT_RUN_TIMEOUT_MS = 900 * 1000;
44
/**
 * Entry point for the `litmus` CLI command: parse flags, gate host execution,
 * run the probes against the target, and print the result.
 *
 * Diagnostics and progress go to stderr; the report (text, or canonical JSON with
 * `--json`) goes to stdout.
 *
 * @param {string[]} args - Command-line arguments for the subcommand.
 * @returns {Promise<number>} Exit code: 0 for a passing grade; 1 for a D/F grade or
 *   when the run throws; 2 for a missing target, a refused host execution, or a
 *   declined confirmation prompt.
 */
async function runLitmusCli(args) {
  const say = (text) => process.stderr.write(text);
  const json = args.includes("--json");
  const flags = parseAuthFlags(args);
  const target = flags.positionals[0];
  if (!target) {
    say(
      'usage: polygraphso litmus [--json] [--bearer <token>] [--header "Key: Value"] [--allow-state-changing] [--unsafe-host-exec] [--timeout <seconds>] <registry-ref | https-url | path-to-mcp>\n'
    );
    return 2;
  }

  const input = resolveTarget(target);
  // Anything that is not an http(s) URL string is launched as a local process.
  const isStdio = typeof input !== "string" || !/^https?:\/\//i.test(input);
  const interactive = !!(process.stdin.isTTY && process.stdout.isTTY);
  const probes = await import("./src-4L5VHLRF.js");

  // Docker is only probed when a confirmation prompt could actually be shown.
  let dockerAvailable = false;
  if (isStdio && interactive) {
    dockerAvailable = await probes.isDockerAvailable();
  }

  const decision = checkHostExec(input, {
    optIn: flags.unsafeHostExec,
    dockerAvailable,
    interactive
  });
  if (decision.action === "refuse") {
    say(`\u2192 litmus: ${decision.refuse}\n`);
    return 2;
  }
  if (decision.action === "confirm") {
    const accepted = await promptYesNo(decision.prompt, decision.defaultYes);
    if (!accepted) {
      say("\u2192 litmus: cancelled.\n");
      return 2;
    }
  }

  const isolation = decision.isolation;
  if (decision.warn) say(`\u2192 ${decision.warn}\n`);
  if (!json) say(`\u2192 running litmus against ${target} \u2026 (~20\u201360s)\n`);

  // Progress lines are suppressed in --json mode.
  const onProgress = (done, total, label) => {
    if (json) return;
    say(` \u2192 [${done}/${total}] ${label}\n`);
  };

  const runOptions = {
    headers: flags.headers,
    allowStateChanging: flags.allowStateChanging,
    timeoutMs: flags.timeoutMs,
    onProgress
  };
  if (isolation) runOptions.isolation = isolation;

  try {
    const bundle = await probes.runLitmus(input, runOptions);
    const rendered = json ? canonicalStringify(bundle) + "\n" : formatBundle(bundle);
    process.stdout.write(rendered);
    return ["D", "F"].includes(bundle.grade) ? 1 : 0;
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    say(`\u2192 litmus failed: ${reason}\n`);
    return 1;
  }
}
94
/**
 * Ask a yes/no question on the terminal and interpret the reply.
 *
 * Reads from stdin and writes the prompt to stderr. The readline interface is
 * always closed, even if reading the answer fails.
 *
 * @param {string} prompt - Text shown to the user.
 * @param {boolean} defaultYes - Result used when the user just presses Enter.
 * @returns {Promise<boolean>} Whether the reply counts as a yes.
 */
async function promptYesNo(prompt, defaultYes) {
  const readline = await import("readline/promises");
  const rl = readline.createInterface({ input: process.stdin, output: process.stderr });
  let reply;
  try {
    reply = await rl.question(prompt);
  } finally {
    rl.close();
  }
  return isAffirmative(reply, defaultYes);
}
103
/**
 * Parse the litmus command-line flags into run options.
 *
 * Recognized flags: `--allow-state-changing`, `--unsafe-host-exec`,
 * `--timeout <s>` / `--timeout=<s>`, `--bearer <t>` / `--bearer=<t>`,
 * `--header "Key: Value"` / `--header=Key: Value`. `--json` and any other
 * `--flag` are skipped here; everything else is collected as a positional.
 *
 * Header names are matched case-insensitively when a later entry replaces an
 * earlier one (HTTP field names are case-insensitive), so
 * `--header "authorization: …"` overrides the bearer-derived `Authorization`
 * instead of being sent alongside it. The spelling of the last entry is kept.
 *
 * @param {string[]} args - Raw command-line arguments.
 * @param {Record<string, string | undefined>} [env=process.env] - Environment;
 *   `LITMUS_BEARER` supplies the bearer token unless `--bearer` overrides it.
 * @returns {{headers: Record<string, string>, allowStateChanging: boolean,
 *   unsafeHostExec: boolean, timeoutMs: number, positionals: string[]}}
 */
function parseAuthFlags(args, env = process.env) {
  const headers = {};
  const headerArgs = [];
  let allowStateChanging = false;
  let unsafeHostExec = false;
  let timeoutMs = DEFAULT_RUN_TIMEOUT_MS;
  let bearer = env.LITMUS_BEARER || void 0;
  const positionals = [];

  // Store a header, dropping any earlier entry whose name differs only by case.
  const setHeader = (name, value) => {
    const lower = name.toLowerCase();
    for (const existing of Object.keys(headers)) {
      if (existing !== name && existing.toLowerCase() === lower) delete headers[existing];
    }
    headers[name] = value;
  };

  for (let i = 0; i < args.length; i++) {
    const a = args[i];
    if (a === "--json") continue;
    if (a === "--allow-state-changing") {
      allowStateChanging = true;
    } else if (a === "--unsafe-host-exec") {
      unsafeHostExec = true;
    } else if (a === "--timeout") {
      // An invalid or missing value keeps the current timeout.
      timeoutMs = timeoutSecondsToMs(args[++i]) ?? timeoutMs;
    } else if (a.startsWith("--timeout=")) {
      timeoutMs = timeoutSecondsToMs(a.slice("--timeout=".length)) ?? timeoutMs;
    } else if (a === "--bearer") {
      bearer = args[++i] ?? bearer;
    } else if (a.startsWith("--bearer=")) {
      bearer = a.slice("--bearer=".length);
    } else if (a === "--header") {
      const v = args[++i];
      if (v) headerArgs.push(v);
    } else if (a.startsWith("--header=")) {
      headerArgs.push(a.slice("--header=".length));
    } else if (a.startsWith("--")) {
      // Unrecognized flags are ignored rather than treated as the target.
    } else {
      positionals.push(a);
    }
  }

  if (bearer) setHeader("Authorization", `Bearer ${bearer}`);
  for (const h of headerArgs) {
    const idx = h.indexOf(":");
    if (idx === -1) continue; // not "Key: Value" — skip
    const key = h.slice(0, idx).trim();
    const value = h.slice(idx + 1).trim();
    if (key) setHeader(key, value);
  }
  return { headers, allowStateChanging, unsafeHostExec, timeoutMs, positionals };
}
146
/**
 * Convert a `--timeout` value in seconds to whole milliseconds.
 *
 * The result is clamped to the range a timer can represent: a positive value
 * below one millisecond becomes 1 (it previously floored to 0), and a value
 * above 2^31-1 ms is capped there, because Node timers treat out-of-range
 * delays as 1 ms.
 *
 * @param {string | undefined} v - Raw flag value.
 * @returns {number | undefined} Milliseconds, or undefined when the value is
 *   missing, not a finite number, or not positive.
 */
function timeoutSecondsToMs(v) {
  if (!v) return void 0;
  const sec = Number(v);
  if (!Number.isFinite(sec) || sec <= 0) return void 0;
  const MAX_TIMER_MS = 2 ** 31 - 1;
  const ms = Math.floor(sec * 1e3);
  return Math.min(MAX_TIMER_MS, Math.max(1, ms));
}
151
/**
 * Decide whether a target may be launched, and under which isolation.
 *
 * Decision order:
 *  1. an http(s) URL string runs no local code            -> allow
 *  2. env LITMUS_STDIO_ISOLATION === "docker"             -> allow, isolation "docker"
 *  3. explicit opt-in                                     -> allow on host, with a warning
 *  4. interactive session: Docker available               -> confirm (default yes), "docker"
 *                          no Docker                      -> confirm (default no), "none"
 *  5. otherwise                                           -> refuse, with instructions
 *
 * @param {string | object} input - Resolved target; a non-string or non-URL value
 *   is treated as a locally launched process.
 * @param {{optIn: boolean, dockerAvailable: boolean, interactive: boolean,
 *   optInHint?: string, env?: Record<string, string | undefined>}} gate
 * @returns {object} `{ action: "allow" | "confirm" | "refuse", ... }` with
 *   `isolation`, `warn`, `prompt`, `defaultYes`, or `refuse` as applicable.
 */
function checkHostExec(input, gate) {
  const optInHint = gate.optInHint === undefined ? "--unsafe-host-exec" : gate.optInHint;
  const env = gate.env === undefined ? process.env : gate.env;

  const isRemoteUrl = typeof input === "string" && /^https?:\/\//i.test(input);
  if (isRemoteUrl) {
    return { action: "allow" };
  }
  if (env.LITMUS_STDIO_ISOLATION === "docker") {
    return { action: "allow", isolation: "docker" };
  }

  const why = "this launches the target's own code; without Docker isolation it runs on THIS host";
  const warn = `\u26A0 unsafe host execution \u2014 ${why}.`;

  if (gate.optIn) {
    return { action: "allow", isolation: "none", warn };
  }

  if (!gate.interactive) {
    // Nobody to ask: refuse and explain both ways forward.
    const refuse = [
      `refusing host execution \u2014 ${why}.`,
      "\u2022 sandboxed (recommended): set LITMUS_STDIO_ISOLATION=docker (requires Docker)",
      `\u2022 accept the risk: re-run with ${optInHint}`
    ].join("\n");
    return { action: "refuse", refuse };
  }

  if (gate.dockerAvailable) {
    return {
      action: "confirm",
      isolation: "docker",
      defaultYes: true,
      prompt: "Docker detected \u2014 the target will run sandboxed (recommended). Proceed? [Y/n] "
    };
  }

  const prompt =
    "No Docker found \u2014 this would run the target's own code on THIS host, unsandboxed.\n" +
    'Type "yes" to proceed, or set LITMUS_STDIO_ISOLATION=docker to sandbox: ';
  return {
    action: "confirm",
    isolation: "none",
    defaultYes: false,
    prompt,
    warn
  };
}
184
/**
 * Interpret a typed reply to a yes/no prompt.
 *
 * @param {string} answer - Raw reply; surrounding whitespace and case are ignored.
 * @param {boolean} defaultYes - Result for an empty reply.
 * @returns {boolean} `defaultYes` for an empty reply, otherwise true only for
 *   "y" or "yes".
 */
function isAffirmative(answer, defaultYes) {
  const normalized = answer.trim().toLowerCase();
  if (normalized.length === 0) return defaultYes;
  return ["y", "yes"].includes(normalized);
}
189
/**
 * Turn the user-supplied target into what the harness should connect to.
 *
 * - An http(s) URL is returned unchanged.
 * - An existing local path becomes a launch descriptor that runs it with the
 *   current Node executable; `.ts` / `.mts` / `.cts` files are run through the
 *   tsx CLI.
 * - Anything else is returned unchanged.
 *
 * @param {string} target - URL, local path, or registry ref.
 * @returns {string | {command: string, args: string[], serverRef: string}}
 */
function resolveTarget(target) {
  const isUrl = /^https?:\/\//i.test(target);
  if (isUrl || !existsSync(target)) return target;

  const abs = path.resolve(target);
  const isTypeScript = [".ts", ".mts", ".cts"].some((ext) => abs.endsWith(ext));
  const args = isTypeScript ? [tsxCli(), abs] : [abs];
  return { command: process.execPath, args, serverRef: target };
}
200
/**
 * Locate the tsx command-line entry file by reading the `bin` field of the
 * installed `tsx` package's package.json.
 *
 * `bin` may be a string or an object with a `tsx` entry; when it is an object
 * without that entry, or missing altogether, "./dist/cli.mjs" is used. (A missing
 * `bin` previously threw a TypeError instead of reaching that fallback.)
 *
 * @returns {string} Path to the tsx CLI script.
 * @throws If `tsx/package.json` cannot be resolved from this module.
 */
function tsxCli() {
  const require2 = createRequire(import.meta.url);
  const pkgJsonPath = require2.resolve("tsx/package.json");
  const dir = path.dirname(pkgJsonPath);
  const bin = require2(pkgJsonPath).bin;
  const rel = typeof bin === "string" ? bin : bin?.tsx ?? "./dist/cli.mjs";
  return path.join(dir, rel);
}
208
+
209
+ export {
210
+ DEFAULT_RUN_TIMEOUT_MS,
211
+ runLitmusCli,
212
+ parseAuthFlags,
213
+ checkHostExec,
214
+ resolveTarget
215
+ };
@@ -3,7 +3,7 @@ import {
3
3
  METHODOLOGY_VERSION,
4
4
  parseServerRef,
5
5
  serverKey
6
- } from "./chunk-44R4ZYOE.js";
6
+ } from "./chunk-X3P26XGS.js";
7
7
 
8
8
  // ../probes/src/harness.ts
9
9
  import { execFile as execFile3 } from "child_process";
@@ -451,6 +451,10 @@ import { execFile as execFile2 } from "child_process";
451
451
  import { promisify } from "util";
452
452
  import { randomUUID as randomUUID3 } from "crypto";
453
453
  var execFileP = promisify(execFile2);
454
+ var TARGET_STDERR = process.env.LITMUS_DEBUG ? "inherit" : "pipe";
455
/**
 * Call `resume()` on the transport's stderr stream, when it has one.
 *
 * Does nothing if `transport.stderr` is null/undefined or has no `resume` method.
 * NOTE(review): presumably this drains the piped child stderr so its output is
 * dropped rather than left to accumulate — confirm against the transport's
 * stream type.
 *
 * @param {{stderr?: {resume?: Function} | null}} transport
 */
function discardStderr(transport) {
  const stream = transport.stderr;
  if (stream == null) return;
  const resume = stream.resume;
  if (resume == null) return;
  resume.call(stream);
}
454
458
  var CLIENT_INFO = { name: "polygraph-litmus", version: "0.0.0" };
455
459
  async function connectTarget(input, opts = {}) {
456
460
  const isolated = opts.isolation === "docker";
@@ -464,6 +468,7 @@ async function connectTarget(input, opts = {}) {
464
468
  command: input.command,
465
469
  args: input.args ?? [],
466
470
  env: { ...getDefaultEnvironment(), ...opts.seedEnv ?? {}, ...input.env ?? {} },
471
+ stderr: TARGET_STDERR,
467
472
  ...input.cwd ?? opts.seedCwd ? { cwd: input.cwd ?? opts.seedCwd } : {}
468
473
  });
469
474
  const cmdline = [input.command, ...input.args ?? []].join(" ");
@@ -497,6 +502,7 @@ async function connectTarget(input, opts = {}) {
497
502
  command: launch.command,
498
503
  args: launch.args,
499
504
  env: { ...getDefaultEnvironment(), ...opts.seedEnv ?? {} },
505
+ stderr: TARGET_STDERR,
500
506
  ...opts.seedCwd ? { cwd: opts.seedCwd } : {}
501
507
  });
502
508
  const client = await connectOrThrow(transport);
@@ -518,14 +524,14 @@ async function connectHostNpm(ref, parsed, opts) {
518
524
  const binNames = await fetchNpmBins(spec, parsed.name);
519
525
  if (!binNames || binNames.length === 0) {
520
526
  const args = ["-y", spec];
521
- const transport = new StdioClientTransport({ command: "npx", args, env, ...cwd });
527
+ const transport = new StdioClientTransport({ command: "npx", args, env, stderr: TARGET_STDERR, ...cwd });
522
528
  const client = await connectOrThrow(transport);
523
529
  return makeResult(client, "stdio", { kind: "stdio", command: ["npx", ...args].join(" "), url: null }, serverRefVal, resolvedVersion, []);
524
530
  }
525
531
  const candidates = orderBinCandidates(binNames, parsed.name);
526
532
  const { result } = await probeForMcpBin(ref, candidates, async (bin) => {
527
533
  const args = ["-y", "-p", spec, bin];
528
- const transport = new StdioClientTransport({ command: "npx", args, env, ...cwd });
534
+ const transport = new StdioClientTransport({ command: "npx", args, env, stderr: TARGET_STDERR, ...cwd });
529
535
  const client = await tryConnect(transport);
530
536
  return client ? { client, descriptor: { kind: "stdio", command: ["npx", ...args].join(" "), url: null } } : null;
531
537
  });
@@ -562,8 +568,9 @@ async function connectIsolatedNpm(ref, parsed, opts) {
562
568
  const transport = new StdioClientTransport({
563
569
  command: launch.command,
564
570
  args: namedArgs,
565
- env: getDefaultEnvironment()
571
+ env: getDefaultEnvironment(),
566
572
  // default env only: no host secrets, no canaries
573
+ stderr: TARGET_STDERR
567
574
  });
568
575
  const client = await tryConnect(transport);
569
576
  if (!client) {
@@ -608,6 +615,7 @@ async function tryConnect(transport) {
608
615
  const client = new Client(CLIENT_INFO, { capabilities: {} });
609
616
  try {
610
617
  await withConnectTimeout(client.connect(transport), transport);
618
+ discardStderr(transport);
611
619
  return client;
612
620
  } catch {
613
621
  try {
@@ -620,6 +628,7 @@ async function tryConnect(transport) {
620
628
  async function connectOrThrow(transport) {
621
629
  const client = new Client(CLIENT_INFO, { capabilities: {} });
622
630
  await withConnectTimeout(client.connect(transport), transport);
631
+ discardStderr(transport);
623
632
  return client;
624
633
  }
625
634
  function makeResult(client, kind, descriptor, serverRef, resolvedVersion, teardownExtra) {
@@ -1536,12 +1545,14 @@ async function runEgressProbe(ref, opts) {
1536
1545
  if (staged) await staged.cleanup();
1537
1546
  }
1538
1547
  }
1548
/**
 * Run `exercise` once per tool returned by `enumerateTools(client)`, one at a
 * time and in listing order.
 *
 * Each tool is passed as `{ name, description, inputSchema }`, with a missing
 * description normalized to "" and a missing input schema to null.
 *
 * @param {object} client - Client handed to `enumerateTools`.
 * @param {(def: {name: string, description: string, inputSchema: unknown}) => unknown} exercise
 *   Callback awaited for each tool before moving to the next.
 * @returns {Promise<void>}
 */
async function exerciseSurface(client, exercise) {
  const tools = await enumerateTools(client);
  for (const tool of tools) {
    const def = {
      name: tool.name,
      description: tool.description ?? "",
      inputSchema: tool.inputSchema ?? null
    };
    await exercise(def);
  }
}
1539
1553
  async function collectEgress(conn, sink, declaredEgress, baselineAllowlist) {
1540
1554
  try {
1541
- const { tools } = await conn.client.listTools();
1542
- for (const t of tools) {
1543
- await exerciseTool(conn.client, { name: t.name, description: t.description ?? "", inputSchema: t.inputSchema ?? null });
1544
- }
1555
+ await exerciseSurface(conn.client, (def) => exerciseTool(conn.client, def));
1545
1556
  } finally {
1546
1557
  await conn.teardown();
1547
1558
  }
@@ -2025,7 +2036,7 @@ async function runLitmus(target, opts = {}) {
2025
2036
  const isolation = opts.isolation ?? (process.env.LITMUS_STDIO_ISOLATION === "docker" ? "docker" : "none");
2026
2037
  const ranAt = (/* @__PURE__ */ new Date()).toISOString();
2027
2038
  const baselineAllowlist = [...DEFAULT_EGRESS_BASELINE, ...parseAllowlistEnv(process.env.LITMUS_EGRESS_ALLOWLIST)];
2028
- const dockerAvailable = await checkDocker();
2039
+ const dockerAvailable = await isDockerAvailable();
2029
2040
  const canaries = mintCanaries();
2030
2041
  const seedEnv = canaryEnv(canaries);
2031
2042
  const isHttp = typeof target === "string" && /^https?:\/\//i.test(target);
@@ -2167,7 +2178,7 @@ function withTimeout(p, ms, label) {
2167
2178
  })
2168
2179
  ]);
2169
2180
  }
2170
- function checkDocker() {
2181
+ function isDockerAvailable() {
2171
2182
  return new Promise((resolve) => {
2172
2183
  const child = execFile3("docker", ["info"], { timeout: 4e3 }, (err) => resolve(!err));
2173
2184
  child.on("error", () => resolve(false));
@@ -2362,6 +2373,12 @@ function overBroadTrigger(description) {
2362
2373
  }
2363
2374
 
2364
2375
  // ../probes/src/skills/grade-skill.ts
2376
// Plain-English label and one-line description for each skill category code,
// so skill output is legible without knowing the S-codes.
var SKILL_CATEGORY_META = {
  "S-01": {
    label: "prompt injection / context poisoning",
    description: "whether the skill body tries to hijack the agent"
  },
  "S-03": {
    label: "data-exfiltration instructions",
    description: "whether it instructs the agent to leak secrets"
  },
  "S-04": {
    label: "dangerous bundled commands",
    description: "whether it ships dangerous executable commands"
  },
  "S-05": {
    label: "tool / permission overreach",
    description: "whether it claims more capability than it needs"
  }
};
2365
2382
  var DISQUALIFYING = /* @__PURE__ */ new Set(["S-01", "S-03"]);
2366
2383
  var CAPPING = /* @__PURE__ */ new Set(["S-04", "S-05"]);
2367
2384
  function gradeSkillCategories(categories) {
@@ -2626,6 +2643,8 @@ export {
2626
2643
  gradeFromCategories,
2627
2644
  assembleBundle,
2628
2645
  runLitmus,
2646
+ enumerateTools,
2647
+ isDockerAvailable,
2629
2648
  SkillLoadError,
2630
2649
  loadSkill,
2631
2650
  stripExamples,
@@ -2634,6 +2653,7 @@ export {
2634
2653
  exfilInstruction,
2635
2654
  dangerousCommand,
2636
2655
  overBroadTrigger,
2656
+ SKILL_CATEGORY_META,
2637
2657
  gradeSkillCategories,
2638
2658
  SKILL_METHODOLOGY_VERSION,
2639
2659
  SKILL_BUNDLE_SCHEMA_VERSION,
@@ -1,22 +1,26 @@
1
1
  import {
2
+ DEFAULT_RUN_TIMEOUT_MS,
3
+ checkHostExec,
2
4
  parseAuthFlags,
3
5
  resolveTarget
4
- } from "./chunk-BUKDFSDO.js";
6
+ } from "./chunk-EMMCE3LC.js";
5
7
  import {
8
+ SKILL_CATEGORY_META,
6
9
  SKILL_METHODOLOGY_VERSION,
7
10
  runLitmus,
8
11
  runSkillLitmus,
9
12
  runSkillQuality,
10
13
  runSkillQualityJudged
11
- } from "./chunk-RYJXVMCT.js";
14
+ } from "./chunk-NPYDTMQ7.js";
12
15
  import {
16
+ CATEGORY_META,
13
17
  CATEGORY_STATUS_UINT8,
14
18
  METHODOLOGY_VERSION,
15
19
  parseServerRef,
16
20
  parseSkillRef,
17
21
  serverKey,
18
22
  skillKey
19
- } from "./chunk-44R4ZYOE.js";
23
+ } from "./chunk-X3P26XGS.js";
20
24
 
21
25
  // ../onchain/src/networks.ts
22
26
  var NETWORKS = {
@@ -299,24 +303,37 @@ var RUN_LITMUS_TOOL_DESCRIPTION = [
299
303
  var runLitmusInputShape = {
300
304
  server_ref: z.string().min(1).max(512).describe("What to grade: a registry ref (npm/@scope/server), an https:// MCP URL, or a local path to an MCP entry file."),
301
305
  bearer: z.string().min(1).max(8192).optional().describe("Bearer token for a token-gated https:// MCP server. Sent as `Authorization: Bearer <token>` to the target origin only. Ignored for stdio/local targets."),
302
- header: z.array(z.string()).max(20).optional().describe('Extra HTTP headers for a gated https:// target, each "Key: Value" (e.g. "X-Api-Key: \u2026"). Overrides the bearer-derived Authorization for the same key. Ignored for stdio/local targets.')
306
+ header: z.array(z.string()).max(20).optional().describe('Extra HTTP headers for a gated https:// target, each "Key: Value" (e.g. "X-Api-Key: \u2026"). Overrides the bearer-derived Authorization for the same key. Ignored for stdio/local targets.'),
307
+ unsafe_host_exec: z.boolean().optional().describe("Required to grade a registry ref or local path: it launches the target's own code, and without Docker isolation that runs on THIS host. Set true to accept host execution. Ignored for https:// targets or when LITMUS_STDIO_ISOLATION=docker."),
308
+ timeout_seconds: z.number().int().positive().max(3600).optional().describe("Aggregate wall-clock ceiling for the whole run, in seconds (default 900). Bounds a hostile server that stretches the run across many tools/probes.")
303
309
  };
304
310
  var PROGRESS_TOTAL = 5;
305
- async function handleRunLitmus({ server_ref, bearer, header }, extra) {
311
+ async function handleRunLitmus({ server_ref, bearer, header, unsafe_host_exec, timeout_seconds }, extra) {
306
312
  try {
307
313
  const argv = [
308
314
  ...bearer ? ["--bearer", bearer] : [],
309
315
  ...(header ?? []).flatMap((h) => ["--header", h])
310
316
  ];
311
317
  const { headers } = parseAuthFlags(argv, {});
318
+ const input = resolveTarget(server_ref);
319
+ const decision = checkHostExec(input, {
320
+ optIn: unsafe_host_exec ?? false,
321
+ dockerAvailable: false,
322
+ interactive: false,
323
+ optInHint: 'set "unsafe_host_exec": true'
324
+ });
325
+ if (decision.action === "refuse") {
326
+ return { isError: true, content: [{ type: "text", text: `run_litmus refused: ${decision.refuse}` }] };
327
+ }
312
328
  const progressToken = extra._meta?.progressToken;
313
329
  const sendProgress = progressToken !== void 0 ? (progress, message) => void extra.sendNotification({
314
330
  method: "notifications/progress",
315
331
  params: { progressToken, progress, total: PROGRESS_TOTAL, message }
316
332
  }) : void 0;
317
333
  sendProgress?.(0, `Connecting to ${server_ref}\u2026`);
318
- const bundle = await runLitmus(resolveTarget(server_ref), {
334
+ const bundle = await runLitmus(input, {
319
335
  ...Object.keys(headers).length ? { headers } : {},
336
+ timeoutMs: timeout_seconds ? timeout_seconds * 1e3 : DEFAULT_RUN_TIMEOUT_MS,
320
337
  ...sendProgress ? { onProgress: (done, _total, label) => sendProgress(done, label) } : {}
321
338
  });
322
339
  const payload = summarize(bundle);
@@ -326,18 +343,19 @@ async function handleRunLitmus({ server_ref, bearer, header }, extra) {
326
343
  return { isError: true, content: [{ type: "text", text: `run_litmus failed: ${message}` }] };
327
344
  }
328
345
  }
329
- var CATEGORY_LABEL = {
330
- "C-01": "tool-output injection",
331
- "C-02": "permission / egress overreach",
332
- "C-03": "sensitive-data handling",
333
- "C-04": "adversarial-input handling"
334
- };
335
346
  function summarize(b) {
336
347
  const find = (code) => b.categories.find((c) => c.code === code);
337
348
  const categories = ["C-01", "C-02", "C-03", "C-04"].map((code) => {
338
349
  const c = find(code);
339
350
  const findings = c?.status === "fail" ? c.probes.flatMap((p) => p.findings).filter((f) => f.severity === "high").slice(0, 5).map((f) => ({ tool: f.tool, kind: f.kind, match: truncate(f.match, 120), host: f.host, port: f.port })) : [];
340
- return { code, check: CATEGORY_LABEL[code], status: c?.status ?? "unknown", reason: c?.reason ?? null, findings };
351
+ return {
352
+ code,
353
+ check: CATEGORY_META[code].label,
354
+ description: CATEGORY_META[code].description,
355
+ status: c?.status ?? "unknown",
356
+ reason: c?.reason ?? null,
357
+ findings
358
+ };
341
359
  });
342
360
  return {
343
361
  grade: b.grade,
@@ -407,15 +425,11 @@ async function handleRunSkillLitmus({ skill_ref }, ctx = {}) {
407
425
  function errorResult(message) {
408
426
  return { isError: true, content: [{ type: "text", text: `run_skill_litmus failed: ${message}` }] };
409
427
  }
410
- var CATEGORY_LABEL2 = {
411
- "S-01": "prompt injection / context poisoning",
412
- "S-03": "data-exfiltration instructions",
413
- "S-04": "dangerous bundled commands"
414
- };
415
428
  function summarize2(b) {
416
429
  const categories = b.categories.map((c) => ({
417
430
  code: c.code,
418
- check: CATEGORY_LABEL2[c.code] ?? c.code,
431
+ check: SKILL_CATEGORY_META[c.code]?.label ?? c.code,
432
+ description: SKILL_CATEGORY_META[c.code]?.description ?? null,
419
433
  status: c.status,
420
434
  reason: c.reason ?? null,
421
435
  findings: c.status === "fail" ? c.findings.filter((f) => f.severity === "high").slice(0, 5).map((f) => ({ kind: f.kind, match: truncate2(f.match, 120), file: f.file })) : []
@@ -1,6 +1,12 @@
1
1
  // ../core/src/types.ts
2
2
  var METHODOLOGY_VERSION = "litmus-v5";
3
3
  var BUNDLE_SCHEMA_VERSION = "1.4.0";
4
// Plain-English label and one-line description for each probe category code,
// so CLI and MCP output is legible without knowing the probe IDs.
var CATEGORY_META = {
  "C-01": {
    label: "tool-output injection",
    description: "whether it tries to hijack the caller through tool output"
  },
  "C-02": {
    label: "permission / egress overreach",
    description: "whether it reaches the network beyond what it declares"
  },
  "C-03": {
    label: "sensitive-data handling",
    description: "whether it leaks planted secrets it was handed"
  },
  "C-04": {
    label: "adversarial-input handling",
    description: "whether it stays stable on malformed or hostile input"
  }
};
4
10
  var CATEGORY_STATUS_UINT8 = {
5
11
  pass: 0,
6
12
  fail: 1,
@@ -174,6 +180,7 @@ function sortDeep(value, depth = 0) {
174
180
  export {
175
181
  METHODOLOGY_VERSION,
176
182
  BUNDLE_SCHEMA_VERSION,
183
+ CATEGORY_META,
177
184
  CATEGORY_STATUS_UINT8,
178
185
  ServerRefParseError,
179
186
  parseServerRef,
package/dist/cli-skill.js CHANGED
@@ -1,31 +1,18 @@
1
1
  #!/usr/bin/env node
2
2
  import {
3
+ SKILL_CATEGORY_META,
3
4
  judgeFromEnv,
4
5
  runSkillLitmus,
5
6
  runSkillQuality,
6
7
  runSkillQualityJudged
7
- } from "./chunk-RYJXVMCT.js";
8
- import "./chunk-44R4ZYOE.js";
8
+ } from "./chunk-NPYDTMQ7.js";
9
+ import "./chunk-X3P26XGS.js";
9
10
 
10
11
  // src/cli-skill.ts
11
12
  import { statSync } from "fs";
12
- var HELP = `polygraphso-litmus-skill \u2014 static safety grades for Claude Code skills.
13
-
14
- usage:
15
- polygraphso-litmus-skill [--json] <path-to-skill-dir>
16
- polygraphso-litmus-skill --help
17
-
18
- The skill dir must contain a SKILL.md. The safety letter is a STATIC scan (no
19
- execution); an A means the static checks were clean, not that the skill is
20
- behaviorally safe.
21
13
 
22
- It also prints a separate, advisory quality signal. The optional LLM-judged
23
- axes (honesty, coherence) run only if you provide your own key \u2014 set
24
- LITMUS_LLM_API_KEY and LITMUS_LLM_MODEL (and LITMUS_LLM_BASE_URL for a non-OpenAI
25
- endpoint). Without a key only the deterministic well-formedness checks run.
26
- More at https://polygraph.so
27
- `;
28
- function render(b) {
14
+ // src/format-skill.ts
15
+ function formatSkillSafety(b) {
29
16
  const lines = [
30
17
  `grade: ${b.grade} (${b.methodologyVersion})`,
31
18
  `${b.gradeRationale}`,
@@ -34,8 +21,11 @@ function render(b) {
34
21
  "",
35
22
  "categories:"
36
23
  ];
24
+ const labelWidth = Math.max(0, ...b.categories.map((c) => SKILL_CATEGORY_META[c.code].label.length));
37
25
  for (const c of b.categories) {
38
- lines.push(` ${c.code} ${c.status}${c.reason ? ` (${c.reason})` : ""}`);
26
+ const { label, description } = SKILL_CATEGORY_META[c.code];
27
+ lines.push(` ${c.code} ${label.padEnd(labelWidth)} ${c.status}${c.reason ? ` (${c.reason})` : ""}`);
28
+ lines.push(` ${description}`);
39
29
  if (c.status === "fail") {
40
30
  for (const f of c.findings.filter((x) => x.severity === "high").slice(0, 5)) {
41
31
  lines.push(` ! ${f.kind}${f.file ? ` [${f.file}]` : ""}: ${f.match}`);
@@ -51,6 +41,24 @@ function render(b) {
51
41
  lines.push("", b.disclaimer);
52
42
  return lines.join("\n") + "\n";
53
43
  }
44
+
45
+ // src/cli-skill.ts
46
+ var HELP = `polygraphso-litmus-skill \u2014 static safety grades for Claude Code skills.
47
+
48
+ usage:
49
+ polygraphso-litmus-skill [--json] <path-to-skill-dir>
50
+ polygraphso-litmus-skill --help
51
+
52
+ The skill dir must contain a SKILL.md. The safety letter is a STATIC scan (no
53
+ execution); an A means the static checks were clean, not that the skill is
54
+ behaviorally safe.
55
+
56
+ It also prints a separate, advisory quality signal. The optional LLM-judged
57
+ axes (honesty, coherence) run only if you provide your own key \u2014 set
58
+ LITMUS_LLM_API_KEY and LITMUS_LLM_MODEL (and LITMUS_LLM_BASE_URL for a non-OpenAI
59
+ endpoint). Without a key only the deterministic well-formedness checks run.
60
+ More at https://polygraph.so
61
+ `;
54
62
  function renderQuality(q) {
55
63
  const lines = ["", `quality (advisory, separate from the grade): ${q.verdict}`];
56
64
  for (const c of q.checks) lines.push(` ${c.status === "pass" ? "\xB7" : "!"} ${c.id}: ${c.detail}`);
@@ -87,7 +95,7 @@ async function main(argv) {
87
95
  const judge = judgeFromEnv();
88
96
  const quality = judge ? await runSkillQualityJudged(target, judge, { skillRef: target }) : runSkillQuality(target, { skillRef: target });
89
97
  process.stdout.write(
90
- json ? JSON.stringify({ safety, quality }, null, 2) + "\n" : render(safety) + renderQuality(quality)
98
+ json ? JSON.stringify({ safety, quality }, null, 2) + "\n" : formatSkillSafety(safety) + renderQuality(quality)
91
99
  );
92
100
  return 0;
93
101
  }
package/dist/cli.js CHANGED
@@ -1,11 +1,11 @@
1
1
  #!/usr/bin/env node
2
2
  import {
3
3
  runLitmusCli
4
- } from "./chunk-BUKDFSDO.js";
4
+ } from "./chunk-EMMCE3LC.js";
5
5
  import {
6
6
  parseServerRef,
7
7
  serverKey
8
- } from "./chunk-44R4ZYOE.js";
8
+ } from "./chunk-X3P26XGS.js";
9
9
 
10
10
  // src/cli.ts
11
11
  import { readFileSync } from "fs";
package/dist/index.d.ts CHANGED
@@ -37,6 +37,15 @@ declare const METHODOLOGY_VERSION: "litmus-v5";
37
37
  * `harness.stdioIsolation`; older remain valid. */
38
38
  declare const BUNDLE_SCHEMA_VERSION: "1.4.0";
39
39
  type CategoryCode = "C-01" | "C-02" | "C-03" | "C-04";
40
+ /**
41
+ * Plain-English label + one-line description for each probe category, so CLI and
42
+ * MCP output is legible without knowing the probe IDs. The single source of these
43
+ * strings — both renderers and the MCP `run_litmus` summary read from here.
44
+ */
45
+ declare const CATEGORY_META: Record<CategoryCode, {
46
+ label: string;
47
+ description: string;
48
+ }>;
40
49
  /** Probe IDs carry their family number (1=injection, 2=permission,
41
50
  * 3=adversarial-input, 4=sensitive). 1.3 (second-order injection) added in v5. */
42
51
  type ProbeId = "1.1" | "1.2" | "1.3" | "2.1" | "2.2" | "3.1" | "3.2" | "4.1" | "4.2";
@@ -323,6 +332,38 @@ interface RunLitmusOptions {
323
332
  onProgress?: (done: number, total: number, label: string) => void;
324
333
  }
325
334
  declare function runLitmus(target: TargetInput, opts?: RunLitmusOptions): Promise<EvidenceBundle>;
335
+ /** The fields of a `tools/list` entry the harness reads. */
336
+ interface ListedTool {
337
+ name: string;
338
+ description?: string;
339
+ inputSchema?: unknown;
340
+ annotations?: unknown;
341
+ }
342
+ interface ListToolsClient {
343
+ listTools(params?: {
344
+ cursor?: string;
345
+ }): Promise<{
346
+ tools?: ListedTool[];
347
+ nextCursor?: string;
348
+ }>;
349
+ }
350
+ /**
351
+ * Follow `tools/list` pagination to the end, accumulating the full tool surface.
352
+ * The MCP SDK's `listTools()` returns a single page and does not auto-paginate,
353
+ * so a server can park a tool (e.g. `transfer_funds`) or a poisoned description
354
+ * behind a `nextCursor` — invisible to a one-page lister, yet served to a real
355
+ * agent. We enumerate every page so the fingerprint and grade cover what the
356
+ * agent actually gets, and **fail closed**: if the server is still paginating
357
+ * past the gradable cap, we refuse rather than grade a partial surface.
358
+ */
359
+ declare function enumerateTools(client: ListToolsClient, opts?: {
360
+ maxTools?: number;
361
+ maxBytes?: number;
362
+ listTimeoutMs?: number;
363
+ }): Promise<ListedTool[]>;
364
+ /** True if a Docker daemon is reachable (governs C-02 / probe 4.2, and the CLI's
365
+ * detect-and-confirm sandbox prompt). */
366
+ declare function isDockerAvailable(): Promise<boolean>;
326
367
 
327
368
  /**
328
369
  * Tool-surface fingerprint (litmus-test-v1 §6, technical-design §3).
@@ -441,6 +482,15 @@ declare function hasHighSeverity(findings: readonly Finding[]): boolean;
441
482
  */
442
483
 
443
484
  type SkillCategoryCode = "S-01" | "S-03" | "S-04" | "S-05";
485
+ /**
486
+ * Plain-English label + one-line description for each skill category, so the skill
487
+ * CLI/MCP output is legible without knowing the S-codes. The single source of these
488
+ * strings — both the renderer and the MCP `run_skill_litmus` summary read from here.
489
+ */
490
+ declare const SKILL_CATEGORY_META: Record<SkillCategoryCode, {
491
+ label: string;
492
+ description: string;
493
+ }>;
444
494
  interface SkillCategoryResult {
445
495
  code: SkillCategoryCode;
446
496
  status: CategoryStatus;
@@ -961,23 +1011,27 @@ declare const runLitmusInputShape: {
961
1011
  server_ref: z.ZodString;
962
1012
  bearer: z.ZodOptional<z.ZodString>;
963
1013
  header: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
1014
+ unsafe_host_exec: z.ZodOptional<z.ZodBoolean>;
1015
+ timeout_seconds: z.ZodOptional<z.ZodNumber>;
964
1016
  };
965
- declare function handleRunLitmus({ server_ref, bearer, header }: {
1017
+ declare function handleRunLitmus({ server_ref, bearer, header, unsafe_host_exec, timeout_seconds }: {
966
1018
  server_ref: string;
967
1019
  bearer?: string;
968
1020
  header?: string[];
1021
+ unsafe_host_exec?: boolean;
1022
+ timeout_seconds?: number;
969
1023
  }, extra: RequestHandlerExtra<ServerRequest, ServerNotification>): Promise<{
1024
+ isError: true;
970
1025
  content: {
971
1026
  type: "text";
972
1027
  text: string;
973
1028
  }[];
974
- isError?: undefined;
975
1029
  } | {
976
- isError: true;
977
1030
  content: {
978
1031
  type: "text";
979
1032
  text: string;
980
1033
  }[];
1034
+ isError?: undefined;
981
1035
  }>;
982
1036
 
983
1037
  /**
@@ -1057,6 +1111,10 @@ interface ParsedLitmusFlags {
1057
1111
  headers: Record<string, string>;
1058
1112
  /** Whether to actively call state-changing tools (opt-in). */
1059
1113
  allowStateChanging: boolean;
1114
+ /** Opt-in to run a stdio target's code on the host without Docker isolation. */
1115
+ unsafeHostExec: boolean;
1116
+ /** Aggregate wall-clock ceiling (ms) — `--timeout <seconds>`, else the default. */
1117
+ timeoutMs: number;
1060
1118
  /** Non-flag arguments, in order (positionals[0] is the target). */
1061
1119
  positionals: string[];
1062
1120
  }
@@ -1072,4 +1130,4 @@ declare function parseAuthFlags(args: readonly string[], env?: NodeJS.ProcessEnv
1072
1130
  /** A target is an https URL, a local MCP entry file, or a registry ref. */
1073
1131
  declare function resolveTarget(target: string): string | StdioCommand;
1074
1132
 
1075
- export { type AttestationView, BUNDLE_SCHEMA_VERSION, type BundleInput, CATEGORY_STATUS_UINT8, type CategoryCode, type CategoryResult, type CategoryStatus, type ConnectOptions, type ConnectedTarget, DEFAULT_PASSING, type EvidenceBundle, type Finding, type FindingKind, type FingerprintResult, type GateAction, type GateDecision, type Grade, type HarnessInfo, type Judge, type JudgeOptions, type JudgedQuality, LITMUS_SCHEMA, LITMUS_SKILL_SCHEMA, type LitmusAttestationFields, type LitmusGrade, type RunLitmusOptions as LitmusOptions, type LoadedSkill, METHODOLOGY_VERSION, NETWORKS, type Network, type NetworkConfig, type OnchainLitmusAttestation, type OnchainSkillAttestation, type OpenAICompatConfig, type ParsedLitmusFlags, type ParsedServerRef, type ParsedSkillRef, type ProbeContext, type ProbeId, type ProbeResult, type ProbeStatus, type QualityBundle, type QualityCheck, type QualityCheckStatus, type QualityVerdict, RUN_LITMUS_TOOL_DESCRIPTION, RUN_LITMUS_TOOL_NAME, RUN_LITMUS_TOOL_TITLE, RUN_SKILL_LITMUS_TOOL_DESCRIPTION, RUN_SKILL_LITMUS_TOOL_NAME, RUN_SKILL_LITMUS_TOOL_TITLE, type Registry, type RunLitmusOptions, type RunSkillLitmusOptions, type RunSkillQualityOptions, SKILL_BUNDLE_SCHEMA_VERSION, SKILL_METHODOLOGY_VERSION, SKILL_QUALITY_VERSION, ServerRefParseError, type Severity, type SkillAttestationFields, type SkillCategoryCode, type SkillCategoryResult, type SkillEvidenceBundle, type SkillFile, type SkillGrade, type SkillGradeForAttestation, SkillLoadError, SkillRefParseError, type SkillSource, type StdioCommand, type TargetDescriptor, type TargetInput, type TargetKind, type ToolAnnotations, type ToolDef, type ToolSafety, VERIFY_SKILL_TOOL_DESCRIPTION, VERIFY_SKILL_TOOL_NAME, VERIFY_SKILL_TOOL_TITLE, assembleBundle, canaryMatch, canonicalStringify, classifyTool, connectTarget, dangerousCommand, decodeLitmusAttestation, decodeSkillAttestation, encodeLitmusAttestation, encodeSkillAttestation, encodeSkillAttestationFields, exfilInstruction, fingerprintToolDefs, 
formatServerRef, formatSkillRef, gateDecision, gradeFromCategories, gradeSkillCategories, handleRunLitmus, handleRunSkillLitmus, handleVerifySkill, hasHighSeverity, instructionMimicry, internalsLeak, invisibleUnicode, judgeFromEnv, judgeSkillQuality, litmusFields, litmusSchemaUID, liveFingerprint, loadSkill, markdownTricks, networkConfig, openAICompatJudge, overBroadTrigger, parseAuthFlags, parseServerRef, parseSkillRef, readAttestation, readSkillAttestation, resolveTarget, rpcUrl, runLitmus, runLitmusInputShape, runSkillLitmus, runSkillLitmusInputShape, runSkillQuality, runSkillQualityJudged, selectedNetwork, serverKey, skillAttestationFields, skillInjection, skillInjectionFails, skillKey, skillSchemaUID, stateChangingToolNames, stripExamples, verifySkillInputShape };
1133
+ export { type AttestationView, BUNDLE_SCHEMA_VERSION, type BundleInput, CATEGORY_META, CATEGORY_STATUS_UINT8, type CategoryCode, type CategoryResult, type CategoryStatus, type ConnectOptions, type ConnectedTarget, DEFAULT_PASSING, type EvidenceBundle, type Finding, type FindingKind, type FingerprintResult, type GateAction, type GateDecision, type Grade, type HarnessInfo, type Judge, type JudgeOptions, type JudgedQuality, LITMUS_SCHEMA, LITMUS_SKILL_SCHEMA, type ListToolsClient, type LitmusAttestationFields, type LitmusGrade, type RunLitmusOptions as LitmusOptions, type LoadedSkill, METHODOLOGY_VERSION, NETWORKS, type Network, type NetworkConfig, type OnchainLitmusAttestation, type OnchainSkillAttestation, type OpenAICompatConfig, type ParsedLitmusFlags, type ParsedServerRef, type ParsedSkillRef, type ProbeContext, type ProbeId, type ProbeResult, type ProbeStatus, type QualityBundle, type QualityCheck, type QualityCheckStatus, type QualityVerdict, RUN_LITMUS_TOOL_DESCRIPTION, RUN_LITMUS_TOOL_NAME, RUN_LITMUS_TOOL_TITLE, RUN_SKILL_LITMUS_TOOL_DESCRIPTION, RUN_SKILL_LITMUS_TOOL_NAME, RUN_SKILL_LITMUS_TOOL_TITLE, type Registry, type RunLitmusOptions, type RunSkillLitmusOptions, type RunSkillQualityOptions, SKILL_BUNDLE_SCHEMA_VERSION, SKILL_CATEGORY_META, SKILL_METHODOLOGY_VERSION, SKILL_QUALITY_VERSION, ServerRefParseError, type Severity, type SkillAttestationFields, type SkillCategoryCode, type SkillCategoryResult, type SkillEvidenceBundle, type SkillFile, type SkillGrade, type SkillGradeForAttestation, SkillLoadError, SkillRefParseError, type SkillSource, type StdioCommand, type TargetDescriptor, type TargetInput, type TargetKind, type ToolAnnotations, type ToolDef, type ToolSafety, VERIFY_SKILL_TOOL_DESCRIPTION, VERIFY_SKILL_TOOL_NAME, VERIFY_SKILL_TOOL_TITLE, assembleBundle, canaryMatch, canonicalStringify, classifyTool, connectTarget, dangerousCommand, decodeLitmusAttestation, decodeSkillAttestation, encodeLitmusAttestation, encodeSkillAttestation, 
encodeSkillAttestationFields, enumerateTools, exfilInstruction, fingerprintToolDefs, formatServerRef, formatSkillRef, gateDecision, gradeFromCategories, gradeSkillCategories, handleRunLitmus, handleRunSkillLitmus, handleVerifySkill, hasHighSeverity, instructionMimicry, internalsLeak, invisibleUnicode, isDockerAvailable, judgeFromEnv, judgeSkillQuality, litmusFields, litmusSchemaUID, liveFingerprint, loadSkill, markdownTricks, networkConfig, openAICompatJudge, overBroadTrigger, parseAuthFlags, parseServerRef, parseSkillRef, readAttestation, readSkillAttestation, resolveTarget, rpcUrl, runLitmus, runLitmusInputShape, runSkillLitmus, runSkillLitmusInputShape, runSkillQuality, runSkillQualityJudged, selectedNetwork, serverKey, skillAttestationFields, skillInjection, skillInjectionFails, skillKey, skillSchemaUID, stateChangingToolNames, stripExamples, verifySkillInputShape };
package/dist/index.js CHANGED
@@ -31,13 +31,14 @@ import {
31
31
  skillAttestationFields,
32
32
  skillSchemaUID,
33
33
  verifySkillInputShape
34
- } from "./chunk-Z66GKAQD.js";
34
+ } from "./chunk-TK4EI66E.js";
35
35
  import {
36
36
  parseAuthFlags,
37
37
  resolveTarget
38
- } from "./chunk-BUKDFSDO.js";
38
+ } from "./chunk-EMMCE3LC.js";
39
39
  import {
40
40
  SKILL_BUNDLE_SCHEMA_VERSION,
41
+ SKILL_CATEGORY_META,
41
42
  SKILL_METHODOLOGY_VERSION,
42
43
  SKILL_QUALITY_VERSION,
43
44
  SkillLoadError,
@@ -46,6 +47,7 @@ import {
46
47
  classifyTool,
47
48
  connectTarget,
48
49
  dangerousCommand,
50
+ enumerateTools,
49
51
  exfilInstruction,
50
52
  fingerprintToolDefs,
51
53
  gradeFromCategories,
@@ -54,6 +56,7 @@ import {
54
56
  instructionMimicry,
55
57
  internalsLeak,
56
58
  invisibleUnicode,
59
+ isDockerAvailable,
57
60
  judgeFromEnv,
58
61
  judgeSkillQuality,
59
62
  loadSkill,
@@ -68,9 +71,10 @@ import {
68
71
  skillInjectionFails,
69
72
  stateChangingToolNames,
70
73
  stripExamples
71
- } from "./chunk-RYJXVMCT.js";
74
+ } from "./chunk-NPYDTMQ7.js";
72
75
  import {
73
76
  BUNDLE_SCHEMA_VERSION,
77
+ CATEGORY_META,
74
78
  CATEGORY_STATUS_UINT8,
75
79
  METHODOLOGY_VERSION,
76
80
  ServerRefParseError,
@@ -82,7 +86,7 @@ import {
82
86
  parseSkillRef,
83
87
  serverKey,
84
88
  skillKey
85
- } from "./chunk-44R4ZYOE.js";
89
+ } from "./chunk-X3P26XGS.js";
86
90
 
87
91
  // ../agent/src/gate.ts
88
92
  function sameServer(a, b) {
@@ -112,22 +116,25 @@ function gateDecision(attestation, live, passing = DEFAULT_PASSING, now = BigInt
112
116
  const versionNote = attestation.resolvedVersion ? ` (graded version ${attestation.resolvedVersion})` : "";
113
117
  return { action: "pay", reason: `grade ${attestation.overallGrade}; live fingerprint matches${versionNote}` };
114
118
  }
119
+ async function fingerprintLiveSurface(client) {
120
+ const defs = (await enumerateTools(client)).map((t) => ({
121
+ name: t.name,
122
+ description: t.description ?? "",
123
+ inputSchema: t.inputSchema ?? null
124
+ }));
125
+ return fingerprintToolDefs(defs).fingerprint;
126
+ }
115
127
  async function liveFingerprint(target) {
116
128
  const conn = await connectTarget(target);
117
129
  try {
118
- const { tools } = await conn.client.listTools();
119
- const defs = (tools ?? []).map((t) => ({
120
- name: t.name,
121
- description: t.description ?? "",
122
- inputSchema: t.inputSchema ?? null
123
- }));
124
- return { fingerprint: fingerprintToolDefs(defs).fingerprint, serverRef: conn.serverRef };
130
+ return { fingerprint: await fingerprintLiveSurface(conn.client), serverRef: conn.serverRef };
125
131
  } finally {
126
132
  await conn.teardown();
127
133
  }
128
134
  }
129
135
  export {
130
136
  BUNDLE_SCHEMA_VERSION,
137
+ CATEGORY_META,
131
138
  CATEGORY_STATUS_UINT8,
132
139
  DEFAULT_PASSING,
133
140
  LITMUS_SCHEMA,
@@ -141,6 +148,7 @@ export {
141
148
  RUN_SKILL_LITMUS_TOOL_NAME,
142
149
  RUN_SKILL_LITMUS_TOOL_TITLE,
143
150
  SKILL_BUNDLE_SCHEMA_VERSION,
151
+ SKILL_CATEGORY_META,
144
152
  SKILL_METHODOLOGY_VERSION,
145
153
  SKILL_QUALITY_VERSION,
146
154
  ServerRefParseError,
@@ -160,6 +168,7 @@ export {
160
168
  encodeLitmusAttestation,
161
169
  encodeSkillAttestation,
162
170
  encodeSkillAttestationFields,
171
+ enumerateTools,
163
172
  exfilInstruction,
164
173
  fingerprintToolDefs,
165
174
  formatServerRef,
@@ -174,6 +183,7 @@ export {
174
183
  instructionMimicry,
175
184
  internalsLeak,
176
185
  invisibleUnicode,
186
+ isDockerAvailable,
177
187
  judgeFromEnv,
178
188
  judgeSkillQuality,
179
189
  litmusFields,
package/dist/mcp.js CHANGED
@@ -20,12 +20,12 @@ import {
20
20
  runSkillLitmusInputShape,
21
21
  verifyInputShape,
22
22
  verifySkillInputShape
23
- } from "./chunk-Z66GKAQD.js";
24
- import "./chunk-BUKDFSDO.js";
23
+ } from "./chunk-TK4EI66E.js";
24
+ import "./chunk-EMMCE3LC.js";
25
25
  import {
26
26
  judgeFromEnv
27
- } from "./chunk-RYJXVMCT.js";
28
- import "./chunk-44R4ZYOE.js";
27
+ } from "./chunk-NPYDTMQ7.js";
28
+ import "./chunk-X3P26XGS.js";
29
29
 
30
30
  // src/mcp.ts
31
31
  import { realpathSync } from "fs";
@@ -1,5 +1,6 @@
1
1
  import {
2
2
  SKILL_BUNDLE_SCHEMA_VERSION,
3
+ SKILL_CATEGORY_META,
3
4
  SKILL_METHODOLOGY_VERSION,
4
5
  SKILL_QUALITY_VERSION,
5
6
  SkillLoadError,
@@ -8,6 +9,7 @@ import {
8
9
  classifyTool,
9
10
  connectTarget,
10
11
  dangerousCommand,
12
+ enumerateTools,
11
13
  exfilInstruction,
12
14
  fingerprintToolDefs,
13
15
  gradeFromCategories,
@@ -16,6 +18,7 @@ import {
16
18
  instructionMimicry,
17
19
  internalsLeak,
18
20
  invisibleUnicode,
21
+ isDockerAvailable,
19
22
  judgeFromEnv,
20
23
  judgeSkillQuality,
21
24
  loadSkill,
@@ -30,10 +33,11 @@ import {
30
33
  skillInjectionFails,
31
34
  stateChangingToolNames,
32
35
  stripExamples
33
- } from "./chunk-RYJXVMCT.js";
34
- import "./chunk-44R4ZYOE.js";
36
+ } from "./chunk-NPYDTMQ7.js";
37
+ import "./chunk-X3P26XGS.js";
35
38
  export {
36
39
  SKILL_BUNDLE_SCHEMA_VERSION,
40
+ SKILL_CATEGORY_META,
37
41
  SKILL_METHODOLOGY_VERSION,
38
42
  SKILL_QUALITY_VERSION,
39
43
  SkillLoadError,
@@ -42,6 +46,7 @@ export {
42
46
  classifyTool,
43
47
  connectTarget,
44
48
  dangerousCommand,
49
+ enumerateTools,
45
50
  exfilInstruction,
46
51
  fingerprintToolDefs,
47
52
  gradeFromCategories,
@@ -50,6 +55,7 @@ export {
50
55
  instructionMimicry,
51
56
  internalsLeak,
52
57
  invisibleUnicode,
58
+ isDockerAvailable,
53
59
  judgeFromEnv,
54
60
  judgeSkillQuality,
55
61
  loadSkill,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@polygraphso/litmus",
3
- "version": "0.9.1",
3
+ "version": "0.11.0",
4
4
  "description": "Behavioral litmus harness for MCP servers — grade a server A–F (tool-output injection, egress, sensitive-data, adversarial-input) with reproducible, content-addressed evidence. Ships a CLI and an MCP server with a run_litmus tool for AI agents.",
5
5
  "license": "Apache-2.0",
6
6
  "homepage": "https://polygraph.so",
@@ -65,9 +65,9 @@
65
65
  "@polygraph/core": "0.0.0",
66
66
  "@polygraph/onchain": "0.0.0",
67
67
  "@polygraph/agent": "0.0.0",
68
- "@polygraph/probes": "0.0.0",
68
+ "@polygraph/mcp": "0.0.0",
69
69
  "@polygraph/cli": "0.0.0",
70
- "@polygraph/mcp": "0.0.0"
70
+ "@polygraph/probes": "0.0.0"
71
71
  },
72
72
  "publishConfig": {
73
73
  "access": "public"
@@ -1,118 +0,0 @@
1
- import {
2
- canonicalStringify
3
- } from "./chunk-44R4ZYOE.js";
4
-
5
- // ../cli/src/litmus.ts
6
- import { existsSync } from "fs";
7
- import { createRequire } from "module";
8
- import * as path from "path";
9
-
10
- // ../cli/src/format.ts
11
- function formatBundle(b) {
12
- const status = (code) => b.categories.find((c) => c.code === code)?.status ?? "?";
13
- const lines = [];
14
- lines.push(`\u2192 ${b.methodologyVersion} \xB7 ${b.serverRef}`);
15
- if (b.resolvedVersion) lines.push(`\u2192 version ${b.resolvedVersion}`);
16
- lines.push(`\u2192 C-01 ${status("C-01")} \xB7 C-02 ${status("C-02")} \xB7 C-03 ${status("C-03")} \xB7 C-04 ${status("C-04")}`);
17
- const c01 = b.categories.find((c) => c.code === "C-01");
18
- if (c01?.status === "fail") {
19
- const highs = c01.probes.flatMap((p) => p.findings).filter((f) => f.severity === "high");
20
- for (const f of highs.slice(0, 3)) {
21
- lines.push(` \u26A0 ${f.tool ?? "?"}: ${f.kind} \u2014 ${truncate(f.match, 64)}`);
22
- }
23
- }
24
- lines.push(`\u2192 fingerprint ${shortFp(b.toolDefsFingerprint)}`);
25
- lines.push(`\u2192 grade: ${b.grade}`);
26
- lines.push(` ${b.gradeRationale}`);
27
- return lines.join("\n") + "\n";
28
- }
29
- function shortFp(fp) {
30
- return fp.length > 14 ? `${fp.slice(0, 6)}\u2026${fp.slice(-4)}` : fp;
31
- }
32
- function truncate(s, n) {
33
- return s.length > n ? `${s.slice(0, n)}\u2026` : s;
34
- }
35
-
36
- // ../cli/src/litmus.ts
37
- async function runLitmusCli(args) {
38
- const json = args.includes("--json");
39
- const { headers, allowStateChanging, positionals } = parseAuthFlags(args);
40
- const target = positionals[0];
41
- if (!target) {
42
- process.stderr.write(
43
- 'usage: polygraphso litmus [--json] [--bearer <token>] [--header "Key: Value"] [--allow-state-changing] <registry-ref | https-url | path-to-mcp>\n'
44
- );
45
- return 2;
46
- }
47
- const { runLitmus } = await import("./src-TMJOIVGB.js");
48
- const input = resolveTarget(target);
49
- try {
50
- const bundle = await runLitmus(input, { headers, allowStateChanging });
51
- process.stdout.write(json ? canonicalStringify(bundle) + "\n" : formatBundle(bundle));
52
- return bundle.grade === "D" || bundle.grade === "F" ? 1 : 0;
53
- } catch (err) {
54
- process.stderr.write(`\u2192 litmus failed: ${err instanceof Error ? err.message : String(err)}
55
- `);
56
- return 1;
57
- }
58
- }
59
- function parseAuthFlags(args, env = process.env) {
60
- const headers = {};
61
- const headerArgs = [];
62
- let allowStateChanging = false;
63
- let bearer = env.LITMUS_BEARER || void 0;
64
- const positionals = [];
65
- for (let i = 0; i < args.length; i++) {
66
- const a = args[i];
67
- if (a === "--json") continue;
68
- if (a === "--allow-state-changing") {
69
- allowStateChanging = true;
70
- } else if (a === "--bearer") {
71
- bearer = args[++i] ?? bearer;
72
- } else if (a.startsWith("--bearer=")) {
73
- bearer = a.slice("--bearer=".length);
74
- } else if (a === "--header") {
75
- const v = args[++i];
76
- if (v) headerArgs.push(v);
77
- } else if (a.startsWith("--header=")) {
78
- headerArgs.push(a.slice("--header=".length));
79
- } else if (a.startsWith("--")) {
80
- } else {
81
- positionals.push(a);
82
- }
83
- }
84
- if (bearer) headers["Authorization"] = `Bearer ${bearer}`;
85
- for (const h of headerArgs) {
86
- const idx = h.indexOf(":");
87
- if (idx === -1) continue;
88
- const key = h.slice(0, idx).trim();
89
- const value = h.slice(idx + 1).trim();
90
- if (key) headers[key] = value;
91
- }
92
- return { headers, allowStateChanging, positionals };
93
- }
94
- function resolveTarget(target) {
95
- if (/^https?:\/\//i.test(target)) return target;
96
- if (existsSync(target)) {
97
- const abs = path.resolve(target);
98
- if (abs.endsWith(".ts") || abs.endsWith(".mts") || abs.endsWith(".cts")) {
99
- return { command: process.execPath, args: [tsxCli(), abs], serverRef: target };
100
- }
101
- return { command: process.execPath, args: [abs], serverRef: target };
102
- }
103
- return target;
104
- }
105
- function tsxCli() {
106
- const require2 = createRequire(import.meta.url);
107
- const pkgJsonPath = require2.resolve("tsx/package.json");
108
- const dir = path.dirname(pkgJsonPath);
109
- const bin = require2(pkgJsonPath).bin;
110
- const rel = typeof bin === "string" ? bin : bin.tsx ?? "./dist/cli.mjs";
111
- return path.join(dir, rel);
112
- }
113
-
114
- export {
115
- runLitmusCli,
116
- parseAuthFlags,
117
- resolveTarget
118
- };