npm - cowork-harness - Versions diffs - 0.1.0 - Mend

cowork-harness 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

package/.env.example +16 -0
package/CHANGELOG.md +190 -0
package/LICENSE +21 -0
package/README.md +470 -0
package/baselines/desktop-1.11847.5.json +78 -0
package/baselines/desktop-1.12603.1.json +140 -0
package/baselines/prompts/desktop-1.12603.1/host-loop-append.md +8 -0
package/baselines/prompts/desktop-1.12603.1/subagent-append-vm.md +3 -0
package/baselines/prompts/desktop-1.12603.1/system-prompt-append.md +18 -0
package/dist/agent/session.js +465 -0
package/dist/assert.js +159 -0
package/dist/baseline.js +87 -0
package/dist/boundary.js +114 -0
package/dist/canary/grants.js +37 -0
package/dist/cli.js +1107 -0
package/dist/decide/decider.js +521 -0
package/dist/decide/external-channel.js +262 -0
package/dist/decide/llm-transport.js +52 -0
package/dist/dotenv.js +52 -0
package/dist/egress/proxy.js +138 -0
package/dist/egress/sidecar.js +125 -0
package/dist/hostloop/provenance.js +110 -0
package/dist/hostloop/workspace-handler.js +226 -0
package/dist/loop-decision.js +62 -0
package/dist/prompt.js +43 -0
package/dist/run/cassette.js +420 -0
package/dist/run/chat.js +194 -0
package/dist/run/envelope.js +31 -0
package/dist/run/execute.js +533 -0
package/dist/run/renderer.js +179 -0
package/dist/run/run.js +347 -0
package/dist/run/trace-view.js +227 -0
package/dist/runtime/argv.js +126 -0
package/dist/runtime/container.js +76 -0
package/dist/runtime/host-env.js +28 -0
package/dist/runtime/hostloop.js +129 -0
package/dist/runtime/lima.js +177 -0
package/dist/runtime/microvm.js +151 -0
package/dist/runtime/protocol.js +79 -0
package/dist/runtime/stage.js +52 -0
package/dist/secrets.js +42 -0
package/dist/session.js +315 -0
package/dist/sync/cowork-sync.js +215 -0
package/dist/types.js +127 -0
package/docker/Dockerfile.agent +31 -0
package/docker/Dockerfile.proxy +12 -0
package/docker/compose.yml +31 -0
package/fixtures/subagent-grants.json +5 -0
package/package.json +70 -0

package/dist/assert.js ADDED Viewed

@@ -0,0 +1,159 @@
+import { existsSync } from "node:fs";
+import { join } from "node:path";
+/**
+ * Boundary-aware host matching: `host` must equal `needle` exactly or be a proper subdomain of it.
+ * `evilanthropic.com` does NOT match `anthropic.com`; `x.anthropic.com` does.
+ */
+export function hostMatches(host, needle) {
+    return host === needle || host.endsWith("." + needle);
+}
+export function evaluate(assertions, ctx) {
+    return assertions.map((a) => check(a, ctx));
+}
+/**
+ * #5: evaluate EVERY present key (AND semantics) — a multi-key assertion passes iff all of its
+ * keys pass. (The previous first-key-wins `if (a.X) return …` chain silently ignored every key
+ * after the first.) The per-key logic is unchanged; each branch now PUSHES its result instead of
+ * returning. The first failing key supplies the surfaced message. On the replay lane, keys that
+ * cannot be evaluated (filesystem/egress, or question/gate when controlOut is absent) are stripped
+ * from the object BEFORE this runs (see replayCassette), so AND never straddles replay classes.
+ */
+function check(a, ctx) {
+    const results = [];
+    const ok = () => ({ pass: true });
+    const fail = (message) => ({ pass: false, message });
+    if (a.transcript_contains !== undefined)
+        results.push(ctx.transcript.includes(a.transcript_contains) ? ok() : fail(`transcript missing "${a.transcript_contains}"`));
+    if (a.transcript_not_contains !== undefined)
+        results.push(!ctx.transcript.includes(a.transcript_not_contains) ? ok() : fail(`transcript unexpectedly contains "${a.transcript_not_contains}"`));
+    // Fuzzy content for stochastic prose. All regex-building assertions are try/catch-wrapped —
+    // `evaluate()` is a bare `.map(check)` with no error boundary, so a malformed pattern must be a
+    // clean assertion failure, not an uncaught throw. Case-insensitive ("i").
+    if (a.transcript_matches !== undefined) {
+        let re;
+        try {
+            re = new RegExp(a.transcript_matches, "i");
+        }
+        catch (e) {
+            results.push(fail(`transcript_matches: bad regex "${a.transcript_matches}": ${String(e.message)}`));
+        }
+        if (re)
+            results.push(re.test(ctx.transcript) ? ok() : fail(`transcript did not match /${a.transcript_matches}/i`));
+    }
+    if (a.transcript_not_matches !== undefined) {
+        let re;
+        try {
+            re = new RegExp(a.transcript_not_matches, "i");
+        }
+        catch (e) {
+            results.push(fail(`transcript_not_matches: bad regex "${a.transcript_not_matches}": ${String(e.message)}`));
+        }
+        if (re)
+            results.push(!re.test(ctx.transcript) ? ok() : fail(`transcript unexpectedly matched /${a.transcript_not_matches}/i`));
+    }
+    if (a.file_exists !== undefined)
+        results.push(existsSync(join(ctx.workRoot, a.file_exists)) ? ok() : fail(`file not found: ${a.file_exists} (under ${ctx.workRoot})`));
+    if (a.user_visible_artifact !== undefined) {
+        const p = a.user_visible_artifact;
+        const visible = ctx.userVisiblePrefixes.some((pre) => p === pre || p.startsWith(pre + "/"));
+        if (!visible)
+            results.push(fail(`"${p}" is not under a user-visible prefix (${ctx.userVisiblePrefixes.join(", ")}) — invisible to the user in Cowork`));
+        else
+            results.push(existsSync(join(ctx.workRoot, p)) ? ok() : fail(`user-visible artifact not found: ${p}`));
+    }
+    if (a.tool_called !== undefined)
+        results.push(ctx.toolsCalled.has(a.tool_called) ? ok() : fail(`tool not called: ${a.tool_called}`));
+    if (a.tool_not_called !== undefined)
+        results.push(!ctx.toolsCalled.has(a.tool_not_called) ? ok() : fail(`tool unexpectedly called: ${a.tool_not_called}`));
+    if (a.subagent_tool_used !== undefined)
+        results.push(ctx.subagentTools.has(a.subagent_tool_used) ? ok() : fail(`sub-agent did not use: ${a.subagent_tool_used}`));
+    if (a.subagent_tool_absent !== undefined)
+        results.push(!ctx.subagentTools.has(a.subagent_tool_absent) ? ok() : fail(`sub-agent unexpectedly used: ${a.subagent_tool_absent}`));
+    if (a.subagent_dispatched !== undefined) {
+        // Match the agentType OR the description — skills often dispatch with only a `description`
+        // (no subagent_type → agentType "unknown"), so name-matching alone would miss those (O1).
+        let rx;
+        try {
+            rx = new RegExp(a.subagent_dispatched, "i");
+        }
+        catch (e) {
+            results.push(fail(`subagent_dispatched: bad regex "${a.subagent_dispatched}": ${String(e.message)}`));
+        }
+        if (rx)
+            results.push(ctx.subagents.some((s) => rx.test(s.agentType) || rx.test(s.description ?? ""))
+                ? ok()
+                : fail(`no sub-agent matching "${a.subagent_dispatched}" was dispatched (by type or description)`));
+    }
+    if (a.subagent_declared_but_unused !== undefined) {
+        const t = a.subagent_declared_but_unused;
+        // #25 / B2: declared a tool but never USED it — the observable proxy for the v0.3.0 fabrication
+        // class. Previously also required `toolsUsed.length === 0`, which let "declared Bash, used Read"
+        // pass; dropping that clause catches the broader declared-but-unused case.
+        const culprit = ctx.subagents.find((s) => s.declaredTools.includes(t) && !s.toolsUsed.includes(t));
+        results.push(culprit
+            ? fail(`sub-agent "${culprit.agentType}" declared "${t}" but never used it (used: ${culprit.toolsUsed.join(", ") || "none"})`)
+            : ok());
+    }
+    if (a.dispatch_count_max !== undefined)
+        results.push(ctx.subagents.length <= a.dispatch_count_max
+            ? ok()
+            : fail(`dispatched ${ctx.subagents.length} sub-agents, max ${a.dispatch_count_max} (SPEC §10 cap {global:3})`));
+    if (a.egress_denied !== undefined)
+        results.push(ctx.egress.some((e) => hostMatches(e.host, a.egress_denied) && e.decision === "deny")
+            ? ok()
+            : fail(`expected egress denied: ${a.egress_denied}`));
+    if (a.egress_allowed !== undefined)
+        results.push(ctx.egress.some((e) => hostMatches(e.host, a.egress_allowed) && e.decision === "allow")
+            ? ok()
+            : fail(`expected egress allowed: ${a.egress_allowed}`));
+    if (a.no_delete_in_outputs !== undefined)
+        results.push(ctx.outputsDeletes.length === 0
+            ? ok()
+            : fail(`delete op(s) touched outputs (forbidden in Cowork): ${ctx.outputsDeletes.slice(0, 3).join("; ")}`));
+    if (a.self_heal_ran !== undefined)
+        results.push(ctx.selfHealRan === a.self_heal_ran ? ok() : fail(`self_heal_ran was ${ctx.selfHealRan}, expected ${a.self_heal_ran}`));
+    if (a.transcript_no_host_path !== undefined)
+        results.push(!ctx.hostPathLeaked === a.transcript_no_host_path ? ok() : fail(`host path leaked into model-visible text: ${ctx.hostPathLeaked}`));
+    if (a.question_asked !== undefined) {
+        let rx;
+        try {
+            rx = new RegExp(a.question_asked, "i");
+        }
+        catch (e) {
+            results.push(fail(`question_asked: bad regex "${a.question_asked}": ${String(e.message)}`));
+        }
+        if (rx)
+            results.push(ctx.questions.some((q) => rx.test(q)) ? ok() : fail(`no question matched: ${a.question_asked}`));
+    }
+    if (a.questions_count_max !== undefined)
+        results.push(ctx.questions.length <= a.questions_count_max ? ok() : fail(`asked ${ctx.questions.length} questions, max ${a.questions_count_max}`));
+    if (a.gate_answers_delivered !== undefined) {
+        // #19: passes iff every answered gate's tool_result was OBSERVED and non-error. On a finished
+        // run/cassette, an unobserved delivery (delivered=null) is NOT neutral — it is absence of the
+        // evidence the assertion requires, so it fails loud ("no silent false-greens"). `delivered:
+        // false` is a real errored tool_result; `null` is "no tool_result observed for this gate".
+        if (a.gate_answers_delivered) {
+            const bad = ctx.gateDeliveries.filter((g) => g.delivered !== true);
+            results.push(bad.length === 0
+                ? ok()
+                : fail(`gate answer(s) not confirmed delivered to the model: ${bad
+                    .map((g) => `"${g.question}" (${g.delivered === false
+                    ? (g.error ?? "tool error")
+                    : g.reason === "no-pairing-metadata"
+                        ? "no pairing metadata — gate had no toolUseId"
+                        : "delivery unobserved — no tool_result for this gate"})`)
+                    .join("; ")}`));
+        }
+        else {
+            // inverse: expect a CONFIRMED delivery failure (a real errored tool_result), not merely unobserved.
+            const failedConfirmed = ctx.gateDeliveries.filter((g) => g.delivered === false);
+            results.push(failedConfirmed.length > 0 ? ok() : fail(`expected a confirmed gate-delivery failure but none was observed`));
+        }
+    }
+    if (a.result !== undefined)
+        results.push(ctx.result === a.result ? ok() : fail(`result was ${ctx.result}, expected ${a.result}`));
+    if (results.length === 0)
+        return { assertion: a, pass: false, message: "empty assertion" };
+    const firstFail = results.find((r) => !r.pass);
+    return firstFail ? { assertion: a, pass: false, message: firstFail.message } : { assertion: a, pass: true };
+}

package/dist/baseline.js ADDED Viewed

@@ -0,0 +1,87 @@
+import { readFileSync, readdirSync, existsSync } from "node:fs";
+import { join, resolve, isAbsolute } from "node:path";
+import { homedir } from "node:os";
+import { fileURLToPath } from "node:url";
+import { PlatformBaseline } from "./types.js";
+/** Directory that holds the `desktop-*.json` baselines: `baselines/` under the parent of this module's directory. */
+export const BASELINES_DIR = join(fileURLToPath(new URL("..", import.meta.url)), "baselines");
+/** Resolve the host path to the staged agent ELF (COWORK_AGENT_BINARY override > baseline.stagedPath). */
+export function resolveAgentBinary(baseline) {
+    const override = process.env.COWORK_AGENT_BINARY;
+    if (override) {
+        if (!existsSync(override))
+            throw new Error(`COWORK_AGENT_BINARY not found: ${override}`);
+        return resolve(override);
+    }
+    const staged = (baseline.agentBinary?.stagedPath ?? "").replace(/^~(?=$|\/)/, homedir());
+    if (!staged || !existsSync(staged)) {
+        throw new Error(`Staged agent binary not found at "${staged}". It is extracted from your Claude Desktop install ` +
+            `(claude-code-vm/<ver>/claude). Open Cowork once to stage it, or set COWORK_AGENT_BINARY to its path.`);
+    }
+    return resolve(staged);
+}
+/**
+ * Resolve a baseline by `latest`, an absolute path, or a name under `baselines/`. A non-absolute name
+ * is treated as a BARE FILENAME resolved under BASELINES_DIR — both `desktop-x` and `desktop-x.json`
+ * load from there regardless of cwd. Use an absolute path for an out-of-tree baseline.
+ */
+export function loadBaseline(name) {
+    const file = name === "latest"
+        ? latestBaselineFile()
+        : isAbsolute(name)
+            ? name
+            : join(BASELINES_DIR, name.endsWith(".json") ? name : `${name}.json`);
+    const raw = JSON.parse(readFileSync(file, "utf8"));
+    return PlatformBaseline.parse(raw);
+}
+/**
+ * Compare two `desktop-<version>.json` filenames numerically by version segment.
+ * Returns negative if a < b, zero if equal, positive if a > b.
+ * Example: compareBaselineVersions("desktop-1.9.json", "desktop-1.10.json") < 0
+ */
+export function compareBaselineVersions(a, b) {
+    // Strip the "desktop-" prefix and ".json" suffix to get the raw version string.
+    const versionOf = (f) => f.replace(/^desktop-/, "").replace(/\.json$/, "");
+    // A non-numeric segment (e.g. "1.0.0-beta") → parseInt NaN → NaN-0 = NaN corrupts the whole sort.
+    // Coerce a non-number to 0 so the comparison stays total.
+    const seg = (f) => versionOf(f)
+        .split(".")
+        .map((s) => {
+        const n = parseInt(s, 10);
+        return Number.isNaN(n) ? 0 : n;
+    });
+    const segA = seg(a);
+    const segB = seg(b);
+    const len = Math.max(segA.length, segB.length);
+    for (let i = 0; i < len; i++) {
+        const diff = (segA[i] ?? 0) - (segB[i] ?? 0);
+        if (diff !== 0)
+            return diff;
+    }
+    return 0;
+}
+function latestBaselineFile() {
+    const files = readdirSync(BASELINES_DIR).filter((f) => f.startsWith("desktop-") && f.endsWith(".json"));
+    if (files.length === 0)
+        throw new Error(`No baselines in ${BASELINES_DIR}; run \`cowork-harness sync\` first.`);
+    // Use numeric/semver-aware sort so desktop-1.10.json > desktop-1.9.json (not lexical).
+    files.sort(compareBaselineVersions);
+    return join(BASELINES_DIR, files[files.length - 1]);
+}
+/**
+ * Expand the mount layout for a concrete session id.
+ * cwd/sessionRoot = the session root (e.g. /sessions/<id>); mounts sit under mntRoot
+ * (/sessions/<id>/mnt) and are returned as ABSOLUTE guest paths.
+ */
+export function resolveMounts(baseline, sessionId, projectId = "proj1") {
+    const subst = (s) => s.replace("{sessionId}", sessionId).replace("{projectId}", projectId);
+    const cwd = subst(baseline.mountLayout.cwd);
+    const sessionRoot = subst(baseline.mountLayout.sessionRoot);
+    const mntRoot = subst(baseline.mountLayout.mntRoot ?? `${baseline.mountLayout.sessionRoot}/mnt`);
+    return {
+        cwd,
+        sessionRoot,
+        mntRoot,
+        configDir: `${mntRoot}/.claude`,
+        mounts: baseline.mountLayout.mounts.map((m) => ({ ...m, mountPath: `${mntRoot}/${subst(m.mountPath)}` })),
+    };
+}

package/dist/boundary.js ADDED Viewed

@@ -0,0 +1,114 @@
+import { spawnSync } from "node:child_process";
+import { mkdtempSync } from "node:fs";
+import { tmpdir, userInfo, homedir } from "node:os";
+import { join } from "node:path";
+import { startEgressSidecar } from "./egress/sidecar.js";
+/**
+ * The allowlist the boundary sidecar seeds — baseline invariants PLUS the session's egress additions
+ * (so the self-test exercises the same boundary a `--session`/scenario run would). `unrestricted` widens
+ * to `*`, mirroring buildLaunchPlan's egress resolution. Pure → unit-testable without Docker.
+ */
+export function boundaryAllowList(baseline, session) {
+    if (session?.unrestricted)
+        return ["*"];
+    return [...baseline.network.allowDomains, ...(session?.extraAllow ?? [])];
+}
+export function runBoundaryChecks(baseline, session) {
+    const runtime = process.env.COWORK_CONTAINER_RUNTIME ?? "docker";
+    const image = process.env.COWORK_AGENT_IMAGE ?? "cowork-agent-base:1";
+    const results = [];
+    // Stand up the real per-run boundary (internal network + allowlist proxy), exactly
+    // what a container-fidelity scenario uses. Tear it down at the end.
+    const runId = `bchk${process.hrtime.bigint().toString(36)}`;
+    const sidecar = startEgressSidecar(boundaryAllowList(baseline, session), mkdtempSync(join(tmpdir(), "cowork-bchk-")), runId);
+    const network = sidecar.network;
+    const proxy = sidecar.proxyUrl;
+    const probe = (shell, withProxy = false) => spawnSync(runtime, [
+        "run",
+        "--rm",
+        "--platform",
+        "linux/arm64",
+        "--network",
+        network,
+        ...(withProxy ? ["-e", `HTTPS_PROXY=${proxy}`, "-e", `HTTP_PROXY=${proxy}`] : []),
+        "--entrypoint",
+        "sh",
+        image,
+        "-c",
+        shell,
+    ], { encoding: "utf8", timeout: 30_000 });
+    // 1. Host filesystem is NOT visible (no /Users, no host home bind).
+    {
+        const r = probe(`ls /Users 2>&1 || true; ls /host 2>&1 || true`);
+        const out = (r.stdout ?? "") + (r.stderr ?? "");
+        const blocked = isHostFsSealed(out);
+        results.push({
+            check: "host-fs-sealed",
+            expectation: "host paths (/Users, /host) invisible",
+            pass: blocked,
+            detail: out.trim().slice(0, 200),
+        });
+    }
+    // 2. Direct (non-proxied) egress is impossible — no route off the internal net.
+    {
+        const r = probe(`curl -sS -m 5 -o /dev/null http://example.com && echo REACHED || echo BLOCKED`);
+        const out = ((r.stdout ?? "") + (r.stderr ?? "")).trim();
+        results.push({
+            check: "direct-egress-denied",
+            expectation: "no route to internet without proxy",
+            pass: /BLOCKED/.test(out) && !/REACHED/.test(out),
+            detail: out,
+        });
+    }
+    // 3. Non-allowlisted egress via the proxy is refused (403).
+    {
+        const r = probe(`curl -sS -m 5 -o /dev/null https://example.com && echo REACHED || echo BLOCKED`, true);
+        const out = ((r.stdout ?? "") + (r.stderr ?? "")).trim();
+        results.push({
+            check: "allowlist-enforced",
+            expectation: "off-list host refused by proxy",
+            pass: /BLOCKED|403/.test(out) && !/REACHED/.test(out),
+            detail: out.slice(0, 200),
+        });
+    }
+    // 4. Allowlisted egress via the proxy works (so the agent can reach inference).
+    {
+        const r = probe(`curl -sS -m 8 -o /dev/null https://api.anthropic.com && echo OK || echo FAIL`, true);
+        const out = ((r.stdout ?? "") + (r.stderr ?? "")).trim();
+        results.push({
+            check: "allowlist-permits",
+            expectation: "allowlisted host reachable via proxy",
+            pass: /OK/.test(out),
+            detail: out.slice(0, 200),
+        });
+    }
+    sidecar.teardown();
+    return results;
+}
+/** Escape regex metacharacters in a literal so it can be embedded in a RegExp. */
+function escapeRegex(s) {
+    return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+}
+/**
+ * #35: host-fs-sealed pass criterion, made environment-agnostic. The old guard hard-coded the repo
+ * owner's username (`yaniv`) in the negative-match, so a real host-path leak on another developer's
+ * machine (their username) would not be caught. Build the negative guard from the ACTUAL
+ * environment — `os.userInfo().username`, `os.homedir()`, plus the literal host roots `/Users/` and
+ * `/opt/cowork/` — escaping regex metacharacters in the dynamic parts.
+ *
+ * Sealed (pass) ⇔ the probe output looks like a denial ("No such file" etc.) AND contains NONE of
+ * the host markers (a leaked username/homedir/host root would mean the host fs is visible).
+ */
+export function isHostFsSealed(probeOutput, env) {
+    const username = env?.username ?? userInfo().username;
+    const home = env?.homedir ?? homedir();
+    const markers = [escapeRegex(username), escapeRegex(home), "/Users/", "/opt/cowork/"].filter(Boolean);
+    const hostMarker = new RegExp(markers.join("|"));
+    const denied = /No such file|cannot access|not found/i.test(probeOutput);
+    return denied && !hostMarker.test(probeOutput);
+}
+export function formatBoundary(results) {
+    const lines = results.map((r) => `${r.pass ? "PASS" : "FAIL"}  ${r.check.padEnd(22)} — ${r.expectation}${r.pass ? "" : `\n        got: ${r.detail}`}`);
+    const allPass = results.every((r) => r.pass);
+    return `Boundary parity: ${allPass ? "ALL CONSTRAINTS ENFORCED" : "GAPS FOUND"}\n` + lines.join("\n");
+}

package/dist/canary/grants.js ADDED Viewed

@@ -0,0 +1,37 @@
+import { readFileSync } from "node:fs";
+import { fileURLToPath } from "node:url";
+import { join } from "node:path";
+export function loadGrantMap(path) {
+    const p = path ?? join(fileURLToPath(new URL("../..", import.meta.url)), "fixtures", "subagent-grants.json");
+    // #44: a read/parse failure previously `catch { return {} }`, silently disabling drift
+    // detection — corruption is exactly when the B2 canary must fire, so THROW loud instead.
+    let parsed;
+    try {
+        parsed = JSON.parse(readFileSync(p, "utf8"));
+    }
+    catch (e) {
+        throw new Error(`corrupt subagent-grants fixture at ${p}: ${e.message} — run 'cowork-harness sync' to regenerate`);
+    }
+    // #44: `.grants ?? {}` previously coerced a missing/non-object key to an empty map (silent
+    // empty drift). Validate it instead so a malformed fixture is a loud error, not a no-op.
+    const grants = parsed?.grants;
+    if (grants === null || typeof grants !== "object" || Array.isArray(grants)) {
+        throw new Error(`corrupt subagent-grants fixture at ${p}: missing or non-object ".grants" — run 'cowork-harness sync' to regenerate`);
+    }
+    return grants;
+}
+/** Verify dispatched sub-agents against the committed map. Unknown agentTypes are NOT asserted
+ *  (recorded as `unknown` upstream) so we never assert a false invariant. */
+export function verifyGrants(subagents, map) {
+    const drift = [];
+    for (const s of subagents) {
+        const expected = map[s.agentType];
+        if (expected === undefined)
+            continue;
+        const a = [...s.declaredTools].sort();
+        const e = [...expected].sort();
+        if (JSON.stringify(a) !== JSON.stringify(e))
+            drift.push({ agentType: s.agentType, expected: e, actual: a });
+    }
+    return drift;
+}