npm - @ishlabs/cli - Versions diffs - 0.9.0 → 0.10.0 - Mend

@ishlabs/cli 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

package/README.md +54 -5
package/dist/commands/ask.d.ts +12 -0
package/dist/commands/ask.js +127 -2
package/dist/commands/chat.d.ts +17 -0
package/dist/commands/chat.js +589 -0
package/dist/commands/iteration.js +134 -14
package/dist/commands/secret.d.ts +20 -0
package/dist/commands/secret.js +246 -0
package/dist/commands/study-run.d.ts +38 -0
package/dist/commands/study-run.js +199 -80
package/dist/commands/study-tester.js +17 -2
package/dist/commands/study.js +309 -37
package/dist/commands/workspace.js +81 -0
package/dist/config.d.ts +3 -0
package/dist/connect.d.ts +3 -0
package/dist/connect.js +346 -22
package/dist/index.js +64 -6
package/dist/lib/alias-hydrate.d.ts +42 -0
package/dist/lib/alias-hydrate.js +175 -0
package/dist/lib/alias-store.d.ts +1 -0
package/dist/lib/alias-store.js +28 -1
package/dist/lib/auth.js +4 -2
package/dist/lib/chat-endpoint-formatters.d.ts +39 -0
package/dist/lib/chat-endpoint-formatters.js +104 -0
package/dist/lib/command-helpers.d.ts +18 -0
package/dist/lib/command-helpers.js +105 -3
package/dist/lib/docs.js +542 -17
package/dist/lib/modality.d.ts +42 -0
package/dist/lib/modality.js +192 -0
package/dist/lib/output.d.ts +41 -0
package/dist/lib/output.js +453 -19
package/dist/lib/paths.d.ts +1 -0
package/dist/lib/paths.js +3 -0
package/dist/lib/skill-content.js +182 -12
package/dist/lib/types.d.ts +15 -0
package/package.json +1 -1

package/dist/lib/output.js CHANGED Viewed

@@ -113,6 +113,21 @@ function renderBare(value) {
 const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
 const TIMESTAMP_KEYS = new Set(["created_at", "updated_at"]);
 const PAGINATION_KEYS = new Set(["items", "total", "returned", "limit", "offset", "has_more"]);
+// Keys whose UUID value must survive the lean-JSON pass even though it's the
+// same shape leanJson strips elsewhere. These are load-bearing for agent
+// follow-up calls and were forcing agents to pass `--verbose` (C5-Bug4).
+const UUID_KEYS_TO_KEEP = new Set([
+    // ask: which variant the tester picked — the load-bearing field for "who picked what".
+    "variant_pick_id",
+]);
+// Keys whose value must pass through leanJson untouched (no UUID stripping,
+// no empty-array drop, no nested recursion). The entire shape is contract:
+// every variant id key and every tester id in its array is load-bearing,
+// and unpicked variants must surface as `[]` rather than disappear.
+const LEAN_PASSTHROUGH_KEYS = new Set([
+    // Pattern H: variant_id → [tester_id, ...] for drill-in audience discovery.
+    "pick_buckets",
+]);
 /**
  * Strip UUID-valued fields, null/undefined values, and timestamps.
  * Preserves alias, name, label, status, and other meaningful fields.
@@ -143,6 +158,19 @@ function leanJson(data, keepIds = false) {
                 result[key] = value;
             continue;
         }
+        // Allowlisted UUID-valued keys (load-bearing for agent follow-up calls).
+        if (UUID_KEYS_TO_KEEP.has(key)) {
+            if (value !== null && value !== undefined)
+                result[key] = value;
+            continue;
+        }
+        // Pass-through keys (e.g. pick_buckets). The entire nested shape is
+        // contract — copy verbatim with no UUID-stripping or empty-array drop.
+        if (LEAN_PASSTHROUGH_KEYS.has(key)) {
+            if (value !== null && value !== undefined)
+                result[key] = value;
+            continue;
+        }
         // Strip null/undefined
         if (value === null || value === undefined)
             continue;
@@ -405,10 +433,10 @@ function suggestionsForError(err) {
         if (/no auth token|run "ish login"|session expired/i.test(err.message)) {
             return ["Run `ish login` to authenticate"];
         }
-        if (/no workspace set/i.test(err.message)) {
+        if (/no (active )?workspace( set)?/i.test(err.message)) {
             return ["Run `ish workspace list` then `ish workspace use <alias>`"];
         }
-        if (/no study set/i.test(err.message)) {
+        if (/no (active )?study( set)?/i.test(err.message)) {
             return ["Run `ish study list` then `ish study use <alias>`"];
         }
         if (/invalid id/i.test(err.message)) {
@@ -437,10 +465,31 @@ export function outputError(err, json) {
             if (Array.isArray(body.suggestions))
                 bodySuggestions = body.suggestions;
         }
-        const mergedSuggestions = bodySuggestions
-            ? Array.from(new Set([...bodySuggestions.map(String), ...suggestions]))
-            : suggestions;
+        // Call sites can also tag suggestions onto the error instance (e.g.
+        // `ask run --new` overrides `retryable: false` and adds an "inspect
+        // ask list before retrying" hint). Merge those alongside body /
+        // suggestionsForError so agents see the full picture.
+        const taggedSuggestions = err.suggestions;
+        const taggedSuggestionList = Array.isArray(taggedSuggestions)
+            ? taggedSuggestions.map(String)
+            : [];
+        const mergedSuggestions = Array.from(new Set([
+            ...(bodySuggestions ? bodySuggestions.map(String) : []),
+            ...taggedSuggestionList,
+            ...suggestions,
+        ]));
         const limitDetail = err.error_code === "usage_limit_reached" ? structuredDetail(err) : undefined;
+        // B7 / Pattern G: dispatch-attempt failures tag the seeded testers
+        // onto the thrown ApiError so the agent can resume without
+        // re-seeding (which would create duplicates). Surface alongside
+        // the error envelope so machine-readable consumers see them.
+        const tagged = err;
+        const seededIds = Array.isArray(tagged.seeded_but_not_dispatched_ids)
+            ? tagged.seeded_but_not_dispatched_ids
+            : undefined;
+        const seededAliases = Array.isArray(tagged.seeded_but_not_dispatched_aliases)
+            ? tagged.seeded_but_not_dispatched_aliases
+            : undefined;
         if (json) {
             console.error(JSON.stringify({
                 error: err.message,
@@ -454,6 +503,8 @@ export function outputError(err, json) {
                     max: limitDetail.max,
                     upgrade_url: limitDetail.upgrade_url,
                 }),
+                ...(seededIds && { seeded_but_not_dispatched_ids: seededIds }),
+                ...(seededAliases && { seeded_but_not_dispatched_aliases: seededAliases }),
                 ...(bodyErrors !== undefined && { errors: bodyErrors }),
                 ...(mergedSuggestions.length > 0 && { suggestions: mergedSuggestions }),
             }));
@@ -499,17 +550,56 @@ export function outputError(err, json) {
         }
     }
     else if (err instanceof Error) {
+        // Some errors carry a structured `error_kind` (e.g. TunnelInactive from
+        // `chat endpoint test`, ConfirmationRequired from destructive --json
+        // gating) — surface it in the JSON envelope so agents branch on it
+        // without parsing prose. `example` rides along for self-correction
+        // hints (e.g. "<the same command with --yes appended>").
+        //
+        // Errors that pre-declare their own `error_code` + `retryable`
+        // (e.g. WaitTimeoutError from study run/wait timer expiry) bypass
+        // the generic `client_error` mapping so an agent can branch on
+        // `wait_timeout` vs `client_error` without parsing prose. The
+        // optional `progress` field rides along verbatim — agents need to
+        // know how far the wait got (M8 + M9).
+        const tagged = err;
+        const errorKind = typeof tagged.error_kind === "string" ? tagged.error_kind : undefined;
+        const example = typeof tagged.example === "string" ? tagged.example : undefined;
+        const errorCode = typeof tagged.error_code === "string" ? tagged.error_code : "client_error";
+        const retryable = typeof tagged.retryable === "boolean" ? tagged.retryable : false;
+        const progress = tagged.progress ?? undefined;
+        const seededIds = Array.isArray(tagged.seeded_but_not_dispatched_ids)
+            ? tagged.seeded_but_not_dispatched_ids
+            : undefined;
+        const seededAliases = Array.isArray(tagged.seeded_but_not_dispatched_aliases)
+            ? tagged.seeded_but_not_dispatched_aliases
+            : undefined;
+        // Errors can pre-attach `suggestions: string[]` (e.g.
+        // `noActiveContextError` for Pattern A — "no active study; pass --study
+        // or run study use"). Merge alongside the code-based suggestions so the
+        // envelope carries both without dropping either.
+        const taggedSuggestions = Array.isArray(tagged.suggestions)
+            ? tagged.suggestions.filter((s) => typeof s === "string")
+            : [];
+        const mergedSuggestions = [...new Set([...suggestions, ...taggedSuggestions])];
         if (json) {
             console.error(JSON.stringify({
                 error: err.message,
-                error_code: "client_error",
-                retryable: false,
-                ...(suggestions.length > 0 && { suggestions }),
+                error_code: errorCode,
+                retryable,
+                ...(errorKind && { error_kind: errorKind }),
+                ...(example && { example }),
+                ...(progress !== undefined && { progress }),
+                ...(seededIds && { seeded_but_not_dispatched_ids: seededIds }),
+                ...(seededAliases && { seeded_but_not_dispatched_aliases: seededAliases }),
+                ...(mergedSuggestions.length > 0 && { suggestions: mergedSuggestions }),
             }));
         }
         else {
             console.error(`Error: ${err.message}`);
-            for (const s of suggestions)
+            if (example)
+                console.error(`  → Example: ${example}`);
+            for (const s of mergedSuggestions)
                 console.error(`  → ${s}`);
         }
     }
@@ -764,7 +854,9 @@ function buildStudyResultsEnvelope(study) {
             total: sentimentTotal,
         }
         : null;
-    // Group interview answers by question for easy parsing.
+    // Group interview answers by question for easy parsing. Each answer row
+    // includes the tester's session-level `sentiment` (M10) so agents can read
+    // sentiment per answer without round-tripping `study tester <id>` per row.
     const questions = Array.isArray(study.interview_questions) ? study.interview_questions : [];
     const interviewAnswers = questions.map((q) => {
         const qObj = q;
@@ -773,10 +865,11 @@ function buildStudyResultsEnvelope(study) {
             const a = t.interviewAnswers.find((x) => x.questionId === qObj.id);
             if (a) {
                 answers.push({
-                    tester_alias: t.id ? deterministicAlias(ALIAS_PREFIX.tester, t.id) : null,
+                    tester_alias: t.alias || null,
                     tester_name: t.name,
                     iteration: t.iterationLabel,
                     answer: a.answer,
+                    sentiment: t.summarySentiment,
                 });
             }
         }
@@ -794,11 +887,13 @@ function buildStudyResultsEnvelope(study) {
     // agents can act on a failed run without re-fetching every tester.
     const failedCount = allTesters.filter((t) => t.status.toLowerCase() === "failed").length;
     const testerRows = allTesters.map((t) => ({
-        alias: t.id ? deterministicAlias(ALIAS_PREFIX.tester, t.id) : null,
+        alias: t.alias || null,
         name: t.name,
         iteration: t.iterationLabel,
         status: t.status,
         interaction_count: t.interactionCount,
+        sentiment: t.summarySentiment,
+        ...(t.summaryComment && { comment: t.summaryComment }),
         ...(t.errorMessage && { error_message: t.errorMessage }),
     }));
     return {
@@ -888,6 +983,210 @@ export function formatStudyResults(study, json) {
         console.log("\nRun `ish tester get <id> --json` for full interaction details.");
     }
 }
+/**
+ * `study results --summary` projection. Drops interview_answers + per-tester
+ * interaction breakdowns; keeps headline counters, sentiment histogram, and a
+ * per-tester {alias, status, sentiment, comment} row. Useful for agents that
+ * need to branch on outcome without paying for the full envelope.
+ *
+ * Input: a study object; this function reads `id`, `name` and `modality`
+ * directly and hands the whole object to `collectTesters` for the tester list.
+ *
+ * Output: `{ study: { alias, name, modality }, tester_count, completed_count,
+ * failed_count, sentiment, testers }`.
+ *   - `study.alias` is `null` when the study has no `id`.
+ *   - `sentiment` is `{ counts, total }`, or `null` when no sentiment labels
+ *     were counted across the testers.
+ *   - each `testers` row also carries `name`; `comment` and `error_message`
+ *     are only present when the tester has a truthy value for them.
+ */
+export function buildStudyResultsSummary(study) {
+    const allTesters = collectTesters(study);
+    const studyAlias = study.id
+        ? deterministicAlias(ALIAS_PREFIX.study, String(study.id))
+        : null;
+    const completedCount = allTesters.filter((t) => t.status === "completed" || t.status === "complete").length; // NOTE(review): exact-case match, unlike failedCount on the next line which lowercases first — confirm statuses are always lowercase
+    const failedCount = allTesters.filter((t) => t.status.toLowerCase() === "failed").length;
+    const sentimentCounts = {};
+    let sentimentTotal = 0;
+    for (const t of allTesters) {
+        for (const [label, count] of Object.entries(t.sentimentCounts)) { // fold each tester's per-label counts into one study-wide histogram
+            sentimentCounts[label] = (sentimentCounts[label] || 0) + count;
+            sentimentTotal += count;
+        }
+    }
+    const sentiment = sentimentTotal > 0 ? { counts: sentimentCounts, total: sentimentTotal } : null;
+    const testers = allTesters.map((t) => ({
+        alias: t.alias || null, // empty-string alias is reported as null
+        name: t.name,
+        status: t.status,
+        sentiment: t.summarySentiment,
+        ...(t.summaryComment && { comment: t.summaryComment }),
+        ...(t.errorMessage && { error_message: t.errorMessage }),
+    }));
+    return {
+        study: {
+            alias: studyAlias,
+            name: study.name || null,
+            modality: study.modality || null,
+        },
+        tester_count: allTesters.length,
+        completed_count: completedCount,
+        failed_count: failedCount,
+        sentiment,
+        testers,
+    };
+}
+/**
+ * `study results --transcript <tester_id>` projection. Mirrors the schema
+ * MCP's `get_chat_transcript` returns (`src/ish_mcp/projections.py:
+ * build_chat_transcript`) so callers see the same shape regardless of
+ * surface. Tester turns whose action carries no text (e.g. select_option)
+ * surface `text: null`; intent lives on `action_type` + `option_label`.
+ * Bot turns with a `bot_reply.failure` block surface `failure` and
+ * `text: null` and don't count toward `unique_bot_replies`.
+ *
+ * Input: a tester object; reads `id`, `tester_profile`, `interactions`,
+ * `tester_summary` and `instance_name`. A non-array `interactions` is
+ * treated as an empty list.
+ *
+ * Output: `{ tester_id, tester_alias, instance_name, modality, transcript,
+ * unique_bot_replies, tester_summary, name }`.
+ *   - `transcript` entries are either
+ *     `{ role: "bot", text, turn_index, failure }` or
+ *     `{ role: "tester", text, turn_index, action_type, option_label, sentiment }`.
+ *   - one interaction can contribute both entries; the bot entry is pushed
+ *     first.
+ *   - `unique_bot_replies` is the number of distinct non-empty bot texts from
+ *     replies without a failure.
+ *   - `tester_summary` is `null` when the tester has none.
+ */
+export function buildChatTranscript(tester) {
+    const id = String(tester.id || "");
+    const alias = id ? deterministicAlias(ALIAS_PREFIX.tester, id) : null;
+    const profile = tester.tester_profile;
+    const interactions = Array.isArray(tester.interactions) ? tester.interactions : [];
+    // Sort by timestamp then created_at so agent doesn't need to re-sort. Works on a copy; the caller's array is not reordered.
+    const sorted = [...interactions].sort((a, b) => {
+        const aIx = a;
+        const bIx = b;
+        const at = String(aIx.timestamp ?? aIx.created_at ?? ""); // keys are compared as strings — presumably ISO-8601 so lexicographic order is chronological; TODO confirm
+        const bt = String(bIx.timestamp ?? bIx.created_at ?? "");
+        if (at < bt)
+            return -1;
+        if (at > bt)
+            return 1;
+        const ac = String(aIx.created_at ?? ""); // tie-break on created_at when the primary keys are equal
+        const bc = String(bIx.created_at ?? "");
+        return ac < bc ? -1 : ac > bc ? 1 : 0;
+    });
+    const transcript = [];
+    const uniqueBotReplies = new Set();
+    for (const interaction of sorted) {
+        const ix = interaction;
+        // Tester turn — derive role/action from the interaction itself.
+        // Backend shape (post a880eba rename):
+        //   ix.actor in {"ai", "human", "user"} (user is the actual end-user;
+        //   we don't surface those in the transcript).
+        //   ix.actions: [{action_type, data: {type, turn_index, text?, wire_text?,
+        //                                     option_label?, said_instead?, ...}}]
+        //   ix.bot_reply: {text?, failure?}
+        // The tester's actual message text is nested under `action.data` —
+        // earlier versions of this builder read off the action top-level
+        // (`action.text`, `action.type`), which silently produced
+        // `text: null` on every turn (PC-C3 finding #3).
+        const actions = Array.isArray(ix.actions) ? ix.actions : [];
+        const action0 = actions.length > 0
+            ? actions[0] // only the first action of an interaction is inspected
+            : ix.action;
+        const actionData = action0?.data;
+        const actionType = action0?.action_type
+            ? String(action0.action_type)
+            : (actionData?.type ? String(actionData.type) : null);
+        // Turn index lives on the action.data, not the interaction itself.
+        // Fall back through interaction.turn_index for older payloads.
+        const turnIndex = typeof actionData?.turn_index === "number"
+            ? actionData.turn_index
+            : typeof ix.turn_index === "number"
+                ? ix.turn_index
+                : typeof ix.turn === "number" ? ix.turn : 0; // 0 when no numeric turn index is found anywhere
+        const botReply = ix.bot_reply;
+        if (botReply && typeof botReply === "object") { // bot entry is emitted regardless of ix.actor
+            const failure = botReply.failure;
+            const text = failure ? null : (botReply.text != null ? String(botReply.text) : null);
+            transcript.push({
+                role: "bot",
+                text,
+                turn_index: turnIndex,
+                failure: failure ?? null,
+            });
+            if (!failure && text != null && text.length > 0) {
+                uniqueBotReplies.add(text);
+            }
+        }
+        const actor = String(ix.actor ?? ix.interaction_type ?? "");
+        if (actor === "ai" || actor === "human") { // any other actor value yields no tester entry
+            // Resolve the tester's literal text from action.data, preferring
+            // the canonical wire_text the backend exposes for every action
+            // shape (send_text, select_option, ignore_offered, …) so the
+            // transcript carries the actual content on every turn — D2.
+            // Priority order:
+            //   1. action.data.text          (send_text)
+            //   2. action.data.wire_text     (universal canonical for all shapes)
+            //   3. action.data.said_instead  (ignore_offered)
+            //   4. action.text               (legacy / older payloads)
+            const dataText = actionData?.text;
+            const dataWireText = actionData?.wire_text;
+            const dataSaidInstead = actionData?.said_instead;
+            let text = null;
+            if (typeof dataText === "string" && dataText.length > 0)
+                text = dataText;
+            else if (typeof dataWireText === "string" && dataWireText.length > 0)
+                text = dataWireText;
+            else if (typeof dataSaidInstead === "string" && dataSaidInstead.length > 0)
+                text = dataSaidInstead;
+            else if (typeof action0?.text === "string" && action0.text.length > 0)
+                text = action0.text;
+            const dataOption = actionData?.option_label;
+            const optionLabel = typeof dataOption === "string" && dataOption.length > 0 // first non-empty string of data.option_label, action.option_label, action.element_label; else null
+                ? dataOption
+                : (typeof action0?.option_label === "string" && action0.option_label.length > 0
+                    ? action0.option_label
+                    : (typeof action0?.element_label === "string" && action0.element_label.length > 0
+                        ? action0.element_label
+                        : null));
+            const sentimentObj = ix.sentiment;
+            transcript.push({
+                role: "tester",
+                text,
+                turn_index: turnIndex,
+                action_type: actionType,
+                option_label: optionLabel,
+                sentiment: sentimentObj?.label ? String(sentimentObj.label) : null,
+            });
+        }
+    }
+    const summary = tester.tester_summary;
+    return {
+        tester_id: id || null,
+        tester_alias: alias,
+        instance_name: tester.instance_name ?? null,
+        modality: "chat", // hard-coded; not read from the tester object
+        transcript,
+        unique_bot_replies: uniqueBotReplies.size,
+        tester_summary: summary
+            ? {
+                comment: summary.comment ?? null,
+                sentiment: summary.sentiment ?? null, // NOTE(review): raw value passed through, whereas buildTesterSummary reads sentiment.label — confirm this matches the intended schema
+            }
+            : null,
+        name: profile?.name ?? null,
+    };
+}
+/**
+ * `study tester --summary` projection. Drops the action timeline; keeps the
+ * headline (alias, status, sentiment, comment, error_message). Useful for
+ * the common "did this tester finish, what did they say" check that's
+ * currently buried under the full interactions array.
+ *
+ * Input: a tester object; reads `id`, `tester_profile`, `tester_summary`,
+ * `interactions`, `instance_name`, `iteration_id`, `status`, `error_message`
+ * and `error_kind`.
+ *
+ * Output: `{ tester: { alias, name, iteration_id, status }, interaction_count,
+ * sentiment, comment }`, plus `error_message` / `error_kind` (stringified)
+ * only when the tester carries a truthy value for them.
+ *   - `tester.alias` is `null` when the tester has no `id`.
+ *   - `sentiment` is the summary sentiment's `label`, or `null`.
+ *   - `interaction_count` is 0 when `interactions` is not an array.
+ */
+export function buildTesterSummary(tester) {
+    const id = String(tester.id || "");
+    const alias = id ? deterministicAlias(ALIAS_PREFIX.tester, id) : null;
+    const profile = tester.tester_profile;
+    const summary = tester.tester_summary;
+    const summarySentiment = summary?.sentiment;
+    const interactions = Array.isArray(tester.interactions) ? tester.interactions : [];
+    const out = {
+        tester: {
+            alias,
+            name: profile?.name ?? tester.instance_name ?? null, // profile name wins; instance_name is the fallback
+            iteration_id: tester.iteration_id ?? null,
+            status: tester.status ?? null,
+        },
+        interaction_count: interactions.length,
+        sentiment: summarySentiment?.label ?? null,
+        comment: summary?.comment ?? null,
+    };
+    if (tester.error_message)
+        out.error_message = String(tester.error_message);
+    if (tester.error_kind)
+        out.error_kind = String(tester.error_kind);
+    return out;
+}
 function collectTesters(study) {
     const iterations = Array.isArray(study.iterations) ? study.iterations : [];
     const testers = [];
@@ -909,14 +1208,20 @@ function collectTesters(study) {
                 }
             }
             const answers = Array.isArray(t.interview_answers) ? t.interview_answers : [];
+            const summary = t.tester_summary;
+            const summarySentimentObj = summary?.sentiment;
+            const id = String(t.id || "");
             testers.push({
-                id: String(t.id || ""),
+                id,
                 name: String(profile?.name || t.instance_name || "Unknown"),
+                alias: id ? deterministicAlias(ALIAS_PREFIX.tester, id) : "",
                 iterationLabel: iterLabel,
                 status: String(t.status || "-"),
                 errorMessage: t.error_message ? String(t.error_message) : null,
                 interactionCount: interactions.length,
                 sentimentCounts,
+                summarySentiment: summarySentimentObj?.label ? String(summarySentimentObj.label) : null,
+                summaryComment: summary?.comment ? String(summary.comment) : null,
                 interviewAnswers: answers.map((a) => ({
                     questionId: String(a.question_id || ""),
                     answer: a.answer,
@@ -1111,6 +1416,23 @@ export function formatSimulationPoll(results, json, isMedia = false) {
             String(r.interaction_count ?? "0"),
         ];
     }));
+    // Pattern A (cli half): list per-tester error_message under the table so
+    // agents see why a simulation failed without re-fetching every tester.
+    // Truncate to 200 chars; full text is available via --json or
+    // `ish study tester get <id>`.
+    const failedRows = results.filter((r) => {
+        const status = String(r.status || "").toLowerCase();
+        return (status === "failed" || status === "errored") && r.error_message;
+    });
+    if (failedRows.length > 0) {
+        console.log("\nFailed simulations:");
+        for (const r of failedRows) {
+            const id = String(r.id || r.tester_id || "");
+            const alias = aliasMap.get(id) || id;
+            const name = String(r.tester_name || "Unknown");
+            console.log(`  ${alias} (${name}): ${truncate(String(r.error_message), 200)}`);
+        }
+    }
 }
 // --- Ask formatting ---
 function variantLetter(index) {
@@ -1324,6 +1646,9 @@ function computeVariantStats(round) {
     }
     return stats;
 }
+const ERROR_RATE_REFUSE_THRESHOLD = 0.5;
+const N_HIGH_CONFIDENCE_FLOOR = 10;
+const N_MEDIUM_CONFIDENCE_FLOOR = 3;
 // When tester_profile and tester_profile_snapshot share all overlapping fields
 // (the common case — snapshot only diverges if the profile was edited after
 // dispatch), drop the redundant content from the snapshot and keep only the
@@ -1366,6 +1691,12 @@ function dedupeTesterSnapshot(tester) {
 }
 // Shape per-variant stats into a machine-readable aggregates object so agents
 // running A/B tests can read the verdict without parsing prose.
+//
+// P6: the winner block carries `n` (completed-response sample size) and a
+// `confidence` summary derived from completion ratio + tied-ness. When the
+// error rate is high enough that calling a winner would be misleading, the
+// winner block is REPLACED by a `{ refused: true, reason }` envelope — agents
+// should run `ish ask retry` first and re-read.
 function buildAggregates(round, stats) {
     if (stats.length === 0)
         return undefined;
@@ -1374,6 +1705,19 @@ function buildAggregates(round, stats) {
     if (!wantsPick && !wantsRatings)
         return undefined;
     const out = {};
+    // Use the same source of truth as the denormalized counts so winner.n is
+    // consistent with `responses_complete` reported on the round envelope.
+    const responses = Array.isArray(round.responses) ? round.responses : [];
+    let completedTotal = 0;
+    let erroredTotal = 0;
+    for (const r of responses) {
+        const status = r.status;
+        if (status === "completed")
+            completedTotal++;
+        else if (status === "errored")
+            erroredTotal++;
+    }
+    const dispatchedTotal = completedTotal + erroredTotal;
     if (wantsPick) {
         const picks = {};
         let topCount = -1;
@@ -1391,8 +1735,65 @@ function buildAggregates(round, stats) {
             }
         }
         out.picks = picks;
+        // Pattern H: pick_buckets keyed by variant id (the value `add_ask_round
+        // --subset-variant` accepts) → tester_ids that picked it. Pre-seed with
+        // every declared variant id so unpicked variants surface as empty
+        // arrays. Only completed responses with a resolvable variant_pick_id
+        // contribute; an errored response carrying a stale variant_pick_id
+        // would otherwise drag a tester into a drill-in audience whose pick
+        // we can't trust.
+        const variants = Array.isArray(round.variants) ? round.variants : [];
+        const variantIdSet = new Set();
+        const pickBuckets = {};
+        for (const v of variants) {
+            const vid = v.id;
+            if (typeof vid === "string" && vid.length > 0) {
+                variantIdSet.add(vid);
+                pickBuckets[vid] = [];
+            }
+        }
+        if (variantIdSet.size > 0) {
+            for (const r of responses) {
+                const resp = r;
+                if (resp.status !== "completed")
+                    continue;
+                const vpid = resp.variant_pick_id;
+                const tid = resp.tester_id;
+                if (typeof vpid === "string"
+                    && variantIdSet.has(vpid)
+                    && typeof tid === "string"
+                    && tid.length > 0) {
+                    pickBuckets[vpid].push(tid);
+                }
+            }
+            out.pick_buckets = pickBuckets;
+        }
         if (topCount > 0) {
-            out.winner = { letter: topLetter, count: topCount, tied };
+            // Refuse the winner when more than half of dispatched responses errored.
+            // Calling A or B with a 4/5 failure rate would mislead the agent into
+            // treating one tester's pick as a verdict.
+            if (dispatchedTotal > 0
+                && erroredTotal / dispatchedTotal > ERROR_RATE_REFUSE_THRESHOLD) {
+                out.winner = {
+                    refused: true,
+                    reason: "error_rate_too_high",
+                    errored: erroredTotal,
+                    total: dispatchedTotal,
+                };
+            }
+            else {
+                out.winner = {
+                    label: topLetter,
+                    count: topCount,
+                    tied,
+                    n: completedTotal,
+                    confidence: deriveWinnerConfidence({
+                        n: completedTotal,
+                        errored: erroredTotal,
+                        tied,
+                    }),
+                };
+            }
         }
     }
     if (wantsRatings) {
@@ -1410,6 +1811,30 @@ function buildAggregates(round, stats) {
     }
     return out;
 }
+/**
+ * Derive a coarse confidence label from sample size + tied-ness + error mix.
+ *
+ * Rules (lowest wins):
+ *   - low:    n < 3 OR tied OR any errored response (we have visible failures)
+ *   - medium: 3 <= n < 10 (small sample but clean)
+ *   - high:   n >= 10 AND no errored responses AND not tied
+ *
+ * Tuned for the typical 5-tester ask: a clean 5/5 lands at "medium" (you
+ * can probably trust the lean), 1/5 with no errors lands at "low" (you
+ * need more data), 5/5 with a tie lands at "low" (no winner to call).
+ *
+ * The 3 and 10 above are the module constants N_MEDIUM_CONFIDENCE_FLOOR and
+ * N_HIGH_CONFIDENCE_FLOOR; the checks below run in order and the first match
+ * returns.
+ *
+ * Input: `args` with `n` (number, completed-response sample size), `errored`
+ * (number of errored responses) and `tied` (truthy when the top pick is tied).
+ *
+ * Output: one of the strings "low", "medium" or "high".
+ */
+export function deriveWinnerConfidence(args) {
+    const { n, errored, tied } = args;
+    if (n < N_MEDIUM_CONFIDENCE_FLOOR)
+        return "low";
+    if (tied)
+        return "low";
+    if (errored > 0)
+        return "low";
+    if (n < N_HIGH_CONFIDENCE_FLOOR)
+        return "medium";
+    return "high"; // reached only when n >= N_HIGH_CONFIDENCE_FLOOR, not tied, and nothing errored
+}
 function buildCrossRoundSummary(rounds) {
     if (rounds.length < 2)
         return undefined;
@@ -1555,11 +1980,20 @@ export function formatAskResults(ask, json, roundFilter) {
         }
         const headers = ["ROUND", "WINNER", ...Array.from(letters).sort()];
         const rows = crossRound.rounds.map((entry) => {
-            const winnerCell = entry.winner
-                ? entry.winner.tied
-                    ? `${entry.winner.letter} (tied)`
-                    : entry.winner.letter
-                : "-";
+            let winnerCell;
+            if (!entry.winner) {
+                winnerCell = "-";
+            }
+            else if ("refused" in entry.winner) {
+                // Too many failures to call a winner this round; agent should retry.
+                winnerCell = `(refused: ${entry.winner.errored}/${entry.winner.total} errored)`;
+            }
+            else {
+                const base = entry.winner.tied
+                    ? `${entry.winner.label} (tied)`
+                    : entry.winner.label;
+                winnerCell = `${base} · ${entry.winner.confidence}`;
+            }
             return [
                 `R${entry.round_number}`,
                 winnerCell,

package/dist/lib/paths.d.ts CHANGED Viewed

@@ -12,3 +12,4 @@ export declare function binDir(): string;
 export declare function browsersDir(): string;
 export declare function simulationsDir(): string;
 export declare function cloudflaredBin(): string;
+export declare function connectLockPath(): string;

package/dist/lib/paths.js CHANGED Viewed

@@ -34,3 +34,6 @@ export function cloudflaredBin() {
     const exe = process.platform === "win32" ? "cloudflared.exe" : "cloudflared";
     return path.join(binDir(), exe);
 }
+export function connectLockPath() { // returns the path of the file named "connect.lock" directly under rootDir()
+    return path.join(rootDir(), "connect.lock");
+}