@ishlabs/cli 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/README.md +54 -5
  2. package/dist/commands/ask.d.ts +12 -0
  3. package/dist/commands/ask.js +127 -2
  4. package/dist/commands/chat.d.ts +17 -0
  5. package/dist/commands/chat.js +655 -0
  6. package/dist/commands/iteration.js +134 -14
  7. package/dist/commands/secret.d.ts +20 -0
  8. package/dist/commands/secret.js +246 -0
  9. package/dist/commands/study-run.d.ts +38 -0
  10. package/dist/commands/study-run.js +199 -80
  11. package/dist/commands/study-tester.js +17 -2
  12. package/dist/commands/study.js +309 -37
  13. package/dist/commands/workspace.js +81 -0
  14. package/dist/config.d.ts +3 -0
  15. package/dist/connect.d.ts +3 -0
  16. package/dist/connect.js +346 -22
  17. package/dist/index.js +64 -6
  18. package/dist/lib/alias-hydrate.d.ts +42 -0
  19. package/dist/lib/alias-hydrate.js +175 -0
  20. package/dist/lib/alias-store.d.ts +1 -0
  21. package/dist/lib/alias-store.js +28 -1
  22. package/dist/lib/auth.js +4 -2
  23. package/dist/lib/chat-endpoint-formatters.d.ts +74 -0
  24. package/dist/lib/chat-endpoint-formatters.js +154 -0
  25. package/dist/lib/chat-endpoint-templates.d.ts +35 -0
  26. package/dist/lib/chat-endpoint-templates.js +210 -0
  27. package/dist/lib/command-helpers.d.ts +18 -0
  28. package/dist/lib/command-helpers.js +105 -3
  29. package/dist/lib/docs.js +641 -17
  30. package/dist/lib/modality.d.ts +42 -0
  31. package/dist/lib/modality.js +192 -0
  32. package/dist/lib/output.d.ts +41 -0
  33. package/dist/lib/output.js +453 -19
  34. package/dist/lib/paths.d.ts +1 -0
  35. package/dist/lib/paths.js +3 -0
  36. package/dist/lib/skill-content.d.ts +18 -0
  37. package/dist/lib/skill-content.js +223 -12
  38. package/dist/lib/types.d.ts +15 -0
  39. package/package.json +2 -2
package/dist/lib/output.js CHANGED
@@ -113,6 +113,21 @@ function renderBare(value) {
113
113
  const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
114
114
  const TIMESTAMP_KEYS = new Set(["created_at", "updated_at"]);
115
115
  const PAGINATION_KEYS = new Set(["items", "total", "returned", "limit", "offset", "has_more"]);
116
+ // Keys whose UUID value must survive the lean-JSON pass even though it's the
117
+ // same shape leanJson strips elsewhere. These are load-bearing for agent
118
+ // follow-up calls and were forcing agents to pass `--verbose` (C5-Bug4).
119
+ const UUID_KEYS_TO_KEEP = new Set([
120
+ // ask: which variant the tester picked — the load-bearing field for "who picked what".
121
+ "variant_pick_id",
122
+ ]);
123
+ // Keys whose value must pass through leanJson untouched (no UUID stripping,
124
+ // no empty-array drop, no nested recursion). The entire shape is contract:
125
+ // every variant id key and every tester id in its array is load-bearing,
126
+ // and unpicked variants must surface as `[]` rather than disappear.
127
+ const LEAN_PASSTHROUGH_KEYS = new Set([
128
+ // Pattern H: variant_id → [tester_id, ...] for drill-in audience discovery.
129
+ "pick_buckets",
130
+ ]);
116
131
  /**
117
132
  * Strip UUID-valued fields, null/undefined values, and timestamps.
118
133
  * Preserves alias, name, label, status, and other meaningful fields.
@@ -143,6 +158,19 @@ function leanJson(data, keepIds = false) {
143
158
  result[key] = value;
144
159
  continue;
145
160
  }
161
+ // Allowlisted UUID-valued keys (load-bearing for agent follow-up calls).
162
+ if (UUID_KEYS_TO_KEEP.has(key)) {
163
+ if (value !== null && value !== undefined)
164
+ result[key] = value;
165
+ continue;
166
+ }
167
+ // Pass-through keys (e.g. pick_buckets). The entire nested shape is
168
+ // contract — copy verbatim with no UUID-stripping or empty-array drop.
169
+ if (LEAN_PASSTHROUGH_KEYS.has(key)) {
170
+ if (value !== null && value !== undefined)
171
+ result[key] = value;
172
+ continue;
173
+ }
146
174
  // Strip null/undefined
147
175
  if (value === null || value === undefined)
148
176
  continue;
@@ -405,10 +433,10 @@ function suggestionsForError(err) {
405
433
  if (/no auth token|run "ish login"|session expired/i.test(err.message)) {
406
434
  return ["Run `ish login` to authenticate"];
407
435
  }
408
- if (/no workspace set/i.test(err.message)) {
436
+ if (/no (active )?workspace( set)?/i.test(err.message)) {
409
437
  return ["Run `ish workspace list` then `ish workspace use <alias>`"];
410
438
  }
411
- if (/no study set/i.test(err.message)) {
439
+ if (/no (active )?study( set)?/i.test(err.message)) {
412
440
  return ["Run `ish study list` then `ish study use <alias>`"];
413
441
  }
414
442
  if (/invalid id/i.test(err.message)) {
@@ -437,10 +465,31 @@ export function outputError(err, json) {
437
465
  if (Array.isArray(body.suggestions))
438
466
  bodySuggestions = body.suggestions;
439
467
  }
440
- const mergedSuggestions = bodySuggestions
441
- ? Array.from(new Set([...bodySuggestions.map(String), ...suggestions]))
442
- : suggestions;
468
+ // Call sites can also tag suggestions onto the error instance (e.g.
469
+ // `ask run --new` overrides `retryable: false` and adds an "inspect
470
+ // ask list before retrying" hint). Merge those alongside body /
471
+ // suggestionsForError so agents see the full picture.
472
+ const taggedSuggestions = err.suggestions;
473
+ const taggedSuggestionList = Array.isArray(taggedSuggestions)
474
+ ? taggedSuggestions.map(String)
475
+ : [];
476
+ const mergedSuggestions = Array.from(new Set([
477
+ ...(bodySuggestions ? bodySuggestions.map(String) : []),
478
+ ...taggedSuggestionList,
479
+ ...suggestions,
480
+ ]));
443
481
  const limitDetail = err.error_code === "usage_limit_reached" ? structuredDetail(err) : undefined;
482
+ // B7 / Pattern G: dispatch-attempt failures tag the seeded testers
483
+ // onto the thrown ApiError so the agent can resume without
484
+ // re-seeding (which would create duplicates). Surface alongside
485
+ // the error envelope so machine-readable consumers see them.
486
+ const tagged = err;
487
+ const seededIds = Array.isArray(tagged.seeded_but_not_dispatched_ids)
488
+ ? tagged.seeded_but_not_dispatched_ids
489
+ : undefined;
490
+ const seededAliases = Array.isArray(tagged.seeded_but_not_dispatched_aliases)
491
+ ? tagged.seeded_but_not_dispatched_aliases
492
+ : undefined;
444
493
  if (json) {
445
494
  console.error(JSON.stringify({
446
495
  error: err.message,
@@ -454,6 +503,8 @@ export function outputError(err, json) {
454
503
  max: limitDetail.max,
455
504
  upgrade_url: limitDetail.upgrade_url,
456
505
  }),
506
+ ...(seededIds && { seeded_but_not_dispatched_ids: seededIds }),
507
+ ...(seededAliases && { seeded_but_not_dispatched_aliases: seededAliases }),
457
508
  ...(bodyErrors !== undefined && { errors: bodyErrors }),
458
509
  ...(mergedSuggestions.length > 0 && { suggestions: mergedSuggestions }),
459
510
  }));
@@ -499,17 +550,56 @@ export function outputError(err, json) {
499
550
  }
500
551
  }
501
552
  else if (err instanceof Error) {
553
+ // Some errors carry a structured `error_kind` (e.g. TunnelInactive from
554
+ // `chat endpoint test`, ConfirmationRequired from destructive --json
555
+ // gating) — surface it in the JSON envelope so agents branch on it
556
+ // without parsing prose. `example` rides along for self-correction
557
+ // hints (e.g. "<the same command with --yes appended>").
558
+ //
559
+ // Errors that pre-declare their own `error_code` + `retryable`
560
+ // (e.g. WaitTimeoutError from study run/wait timer expiry) bypass
561
+ // the generic `client_error` mapping so an agent can branch on
562
+ // `wait_timeout` vs `client_error` without parsing prose. The
563
+ // optional `progress` field rides along verbatim — agents need to
564
+ // know how far the wait got (M8 + M9).
565
+ const tagged = err;
566
+ const errorKind = typeof tagged.error_kind === "string" ? tagged.error_kind : undefined;
567
+ const example = typeof tagged.example === "string" ? tagged.example : undefined;
568
+ const errorCode = typeof tagged.error_code === "string" ? tagged.error_code : "client_error";
569
+ const retryable = typeof tagged.retryable === "boolean" ? tagged.retryable : false;
570
+ const progress = tagged.progress ?? undefined;
571
+ const seededIds = Array.isArray(tagged.seeded_but_not_dispatched_ids)
572
+ ? tagged.seeded_but_not_dispatched_ids
573
+ : undefined;
574
+ const seededAliases = Array.isArray(tagged.seeded_but_not_dispatched_aliases)
575
+ ? tagged.seeded_but_not_dispatched_aliases
576
+ : undefined;
577
+ // Errors can pre-attach `suggestions: string[]` (e.g.
578
+ // `noActiveContextError` for Pattern A — "no active study; pass --study
579
+ // or run study use"). Merge alongside the code-based suggestions so the
580
+ // envelope carries both without dropping either.
581
+ const taggedSuggestions = Array.isArray(tagged.suggestions)
582
+ ? tagged.suggestions.filter((s) => typeof s === "string")
583
+ : [];
584
+ const mergedSuggestions = [...new Set([...suggestions, ...taggedSuggestions])];
502
585
  if (json) {
503
586
  console.error(JSON.stringify({
504
587
  error: err.message,
505
- error_code: "client_error",
506
- retryable: false,
507
- ...(suggestions.length > 0 && { suggestions }),
588
+ error_code: errorCode,
589
+ retryable,
590
+ ...(errorKind && { error_kind: errorKind }),
591
+ ...(example && { example }),
592
+ ...(progress !== undefined && { progress }),
593
+ ...(seededIds && { seeded_but_not_dispatched_ids: seededIds }),
594
+ ...(seededAliases && { seeded_but_not_dispatched_aliases: seededAliases }),
595
+ ...(mergedSuggestions.length > 0 && { suggestions: mergedSuggestions }),
508
596
  }));
509
597
  }
510
598
  else {
511
599
  console.error(`Error: ${err.message}`);
512
- for (const s of suggestions)
600
+ if (example)
601
+ console.error(` → Example: ${example}`);
602
+ for (const s of mergedSuggestions)
513
603
  console.error(` → ${s}`);
514
604
  }
515
605
  }
@@ -764,7 +854,9 @@ function buildStudyResultsEnvelope(study) {
764
854
  total: sentimentTotal,
765
855
  }
766
856
  : null;
767
- // Group interview answers by question for easy parsing.
857
+ // Group interview answers by question for easy parsing. Each answer row
858
+ // includes the tester's session-level `sentiment` (M10) so agents can read
859
+ // sentiment per answer without round-tripping `study tester <id>` per row.
768
860
  const questions = Array.isArray(study.interview_questions) ? study.interview_questions : [];
769
861
  const interviewAnswers = questions.map((q) => {
770
862
  const qObj = q;
@@ -773,10 +865,11 @@ function buildStudyResultsEnvelope(study) {
773
865
  const a = t.interviewAnswers.find((x) => x.questionId === qObj.id);
774
866
  if (a) {
775
867
  answers.push({
776
- tester_alias: t.id ? deterministicAlias(ALIAS_PREFIX.tester, t.id) : null,
868
+ tester_alias: t.alias || null,
777
869
  tester_name: t.name,
778
870
  iteration: t.iterationLabel,
779
871
  answer: a.answer,
872
+ sentiment: t.summarySentiment,
780
873
  });
781
874
  }
782
875
  }
@@ -794,11 +887,13 @@ function buildStudyResultsEnvelope(study) {
794
887
  // agents can act on a failed run without re-fetching every tester.
795
888
  const failedCount = allTesters.filter((t) => t.status.toLowerCase() === "failed").length;
796
889
  const testerRows = allTesters.map((t) => ({
797
- alias: t.id ? deterministicAlias(ALIAS_PREFIX.tester, t.id) : null,
890
+ alias: t.alias || null,
798
891
  name: t.name,
799
892
  iteration: t.iterationLabel,
800
893
  status: t.status,
801
894
  interaction_count: t.interactionCount,
895
+ sentiment: t.summarySentiment,
896
+ ...(t.summaryComment && { comment: t.summaryComment }),
802
897
  ...(t.errorMessage && { error_message: t.errorMessage }),
803
898
  }));
804
899
  return {
@@ -888,6 +983,210 @@ export function formatStudyResults(study, json) {
888
983
  console.log("\nRun `ish tester get <id> --json` for full interaction details.");
889
984
  }
890
985
  }
/**
 * `study results --summary` projection.
 *
 * Produces a compact envelope: study header (alias, name, modality), headline
 * counters, an aggregated sentiment histogram, and one row per tester with
 * {alias, name, status, sentiment} plus `comment` / `error_message` when
 * present. Interview answers and interaction breakdowns are not included.
 *
 * @param study Raw study payload; testers are gathered via `collectTesters`.
 * @returns The summary envelope described above. `sentiment` is `null` when
 *   no tester reported any sentiment counts.
 */
export function buildStudyResultsSummary(study) {
    const allTesters = collectTesters(study);
    const studyAlias = study.id
        ? deterministicAlias(ALIAS_PREFIX.study, String(study.id))
        : null;
    // Normalise status casing once so both counters apply the same rule.
    // Previously only the failed counter lower-cased the status, so a
    // "Completed" tester was counted by neither bucket.
    let completedCount = 0;
    let failedCount = 0;
    for (const t of allTesters) {
        const status = t.status.toLowerCase();
        if (status === "completed" || status === "complete") {
            completedCount++;
        }
        else if (status === "failed") {
            failedCount++;
        }
    }
    // Fold every tester's per-label sentiment counts into one histogram.
    const sentimentCounts = {};
    let sentimentTotal = 0;
    for (const t of allTesters) {
        for (const [label, count] of Object.entries(t.sentimentCounts)) {
            sentimentCounts[label] = (sentimentCounts[label] || 0) + count;
            sentimentTotal += count;
        }
    }
    const sentiment = sentimentTotal > 0 ? { counts: sentimentCounts, total: sentimentTotal } : null;
    const testers = allTesters.map((t) => ({
        alias: t.alias || null,
        name: t.name,
        status: t.status,
        sentiment: t.summarySentiment,
        // Optional fields are omitted entirely (not emitted as null) when empty.
        ...(t.summaryComment && { comment: t.summaryComment }),
        ...(t.errorMessage && { error_message: t.errorMessage }),
    }));
    return {
        study: {
            alias: studyAlias,
            name: study.name || null,
            modality: study.modality || null,
        },
        tester_count: allTesters.length,
        completed_count: completedCount,
        failed_count: failedCount,
        sentiment,
        testers,
    };
}
/**
 * `study results --transcript <tester_id>` projection.
 *
 * Flattens a tester's interactions into an ordered list of turns. For each
 * interaction (sorted by `timestamp`, falling back to and tie-breaking on
 * `created_at`) it emits:
 *   - a `bot` turn when `bot_reply` is an object; a reply carrying `failure`
 *     surfaces `failure` with `text: null` and is excluded from
 *     `unique_bot_replies`;
 *   - a `tester` turn when the actor is "ai" or "human"; actions without any
 *     text surface `text: null`, with intent on `action_type` / `option_label`.
 *
 * The original note on this builder states the shape mirrors the MCP
 * `get_chat_transcript` projection.
 *
 * @param tester Raw tester payload (id, interactions, tester_profile,
 *   tester_summary, instance_name).
 * @returns Transcript envelope with `transcript`, `unique_bot_replies`,
 *   `tester_summary` and identity fields.
 */
export function buildChatTranscript(tester) {
    // Three-way comparison for plain strings.
    const compareText = (left, right) => (left < right ? -1 : left > right ? 1 : 0);
    // Primary sort key: timestamp, else created_at, else "".
    const primaryKey = (entry) => String(entry.timestamp ?? entry.created_at ?? "");
    // Tie-breaker: created_at, else "".
    const secondaryKey = (entry) => String(entry.created_at ?? "");
    // First candidate that is a non-empty string, or null when none qualifies.
    const firstNonEmptyString = (candidates) => {
        for (const candidate of candidates) {
            if (typeof candidate === "string" && candidate.length > 0) {
                return candidate;
            }
        }
        return null;
    };
    // First candidate that is a number, or 0 when none qualifies.
    const firstNumberOrZero = (candidates) => {
        for (const candidate of candidates) {
            if (typeof candidate === "number") {
                return candidate;
            }
        }
        return 0;
    };

    const testerId = String(tester.id || "");
    const rawInteractions = Array.isArray(tester.interactions) ? tester.interactions : [];
    // Copy before sorting so the caller's array is left untouched; consumers
    // then receive turns already in order.
    const ordered = Array.from(rawInteractions).sort((a, b) => {
        const byPrimary = compareText(primaryKey(a), primaryKey(b));
        if (byPrimary !== 0) {
            return byPrimary;
        }
        return compareText(secondaryKey(a), secondaryKey(b));
    });

    const turns = [];
    const distinctReplies = new Set();
    for (const ix of ordered) {
        // Backend shape per the original notes: `ix.actions` is a list of
        // {action_type, data: {...}}, and the tester's message lives under
        // `action.data`, not on the action itself. A singular `ix.action` is
        // used when the list is empty or absent.
        const actionList = Array.isArray(ix.actions) ? ix.actions : [];
        const firstAction = actionList.length > 0 ? actionList[0] : ix.action;
        const data = firstAction?.data;

        let actionType = null;
        if (firstAction?.action_type) {
            actionType = String(firstAction.action_type);
        }
        else if (data?.type) {
            actionType = String(data.type);
        }

        // Turn index is read from action.data first, then the interaction's
        // own `turn_index` / `turn`, defaulting to 0.
        const turnIndex = firstNumberOrZero([data?.turn_index, ix.turn_index, ix.turn]);

        const reply = ix.bot_reply;
        if (reply && typeof reply === "object") {
            const failure = reply.failure;
            let replyText = null;
            if (!failure && reply.text != null) {
                replyText = String(reply.text);
            }
            turns.push({
                role: "bot",
                text: replyText,
                turn_index: turnIndex,
                failure: failure ?? null,
            });
            // Failed or empty replies never count as a distinct bot reply
            // (replyText stays null whenever `failure` is set).
            if (replyText !== null && replyText.length > 0) {
                distinctReplies.add(replyText);
            }
        }

        const actor = String(ix.actor ?? ix.interaction_type ?? "");
        const isTesterTurn = actor === "ai" || actor === "human";
        if (!isTesterTurn) {
            continue;
        }
        // Text priority: data.text, data.wire_text, data.said_instead, then
        // the legacy top-level action.text.
        const testerText = firstNonEmptyString([
            data?.text,
            data?.wire_text,
            data?.said_instead,
            firstAction?.text,
        ]);
        // Option label priority: data.option_label, action.option_label,
        // action.element_label.
        const optionLabel = firstNonEmptyString([
            data?.option_label,
            firstAction?.option_label,
            firstAction?.element_label,
        ]);
        const sentimentLabel = ix.sentiment?.label;
        turns.push({
            role: "tester",
            text: testerText,
            turn_index: turnIndex,
            action_type: actionType,
            option_label: optionLabel,
            sentiment: sentimentLabel ? String(sentimentLabel) : null,
        });
    }

    const testerSummary = tester.tester_summary;
    let summaryBlock = null;
    if (testerSummary) {
        summaryBlock = {
            comment: testerSummary.comment ?? null,
            sentiment: testerSummary.sentiment ?? null,
        };
    }
    return {
        tester_id: testerId || null,
        tester_alias: testerId ? deterministicAlias(ALIAS_PREFIX.tester, testerId) : null,
        instance_name: tester.instance_name ?? null,
        modality: "chat",
        transcript: turns,
        unique_bot_replies: distinctReplies.size,
        tester_summary: summaryBlock,
        name: tester.tester_profile?.name ?? null,
    };
}
/**
 * `study tester --summary` projection.
 *
 * Returns only the headline for one tester: identity (alias, name,
 * iteration_id, status), the number of interactions, the summary sentiment
 * label and comment. `error_message` / `error_kind` are attached only when
 * the tester payload carries a truthy value for them. The interaction
 * timeline itself is not included.
 *
 * @param tester Raw tester payload.
 * @returns The headline object described above.
 */
export function buildTesterSummary(tester) {
    const testerId = String(tester.id || "");
    const testerSummary = tester.tester_summary;
    const interactionList = Array.isArray(tester.interactions) ? tester.interactions : [];
    const headline = {
        tester: {
            alias: testerId ? deterministicAlias(ALIAS_PREFIX.tester, testerId) : null,
            // Profile name wins; the instance name is the fallback.
            name: tester.tester_profile?.name ?? tester.instance_name ?? null,
            iteration_id: tester.iteration_id ?? null,
            status: tester.status ?? null,
        },
        interaction_count: interactionList.length,
        sentiment: testerSummary?.sentiment?.label ?? null,
        comment: testerSummary?.comment ?? null,
    };
    // Error fields are optional: emitted as strings, in this order, only when set.
    for (const field of ["error_message", "error_kind"]) {
        if (tester[field]) {
            headline[field] = String(tester[field]);
        }
    }
    return headline;
}
891
1190
  function collectTesters(study) {
892
1191
  const iterations = Array.isArray(study.iterations) ? study.iterations : [];
893
1192
  const testers = [];
@@ -909,14 +1208,20 @@ function collectTesters(study) {
909
1208
  }
910
1209
  }
911
1210
  const answers = Array.isArray(t.interview_answers) ? t.interview_answers : [];
1211
+ const summary = t.tester_summary;
1212
+ const summarySentimentObj = summary?.sentiment;
1213
+ const id = String(t.id || "");
912
1214
  testers.push({
913
- id: String(t.id || ""),
1215
+ id,
914
1216
  name: String(profile?.name || t.instance_name || "Unknown"),
1217
+ alias: id ? deterministicAlias(ALIAS_PREFIX.tester, id) : "",
915
1218
  iterationLabel: iterLabel,
916
1219
  status: String(t.status || "-"),
917
1220
  errorMessage: t.error_message ? String(t.error_message) : null,
918
1221
  interactionCount: interactions.length,
919
1222
  sentimentCounts,
1223
+ summarySentiment: summarySentimentObj?.label ? String(summarySentimentObj.label) : null,
1224
+ summaryComment: summary?.comment ? String(summary.comment) : null,
920
1225
  interviewAnswers: answers.map((a) => ({
921
1226
  questionId: String(a.question_id || ""),
922
1227
  answer: a.answer,
@@ -1111,6 +1416,23 @@ export function formatSimulationPoll(results, json, isMedia = false) {
1111
1416
  String(r.interaction_count ?? "0"),
1112
1417
  ];
1113
1418
  }));
1419
+ // Pattern A (cli half): list per-tester error_message under the table so
1420
+ // agents see why a simulation failed without re-fetching every tester.
1421
+ // Truncate to 200 chars; full text is available via --json or
1422
+ // `ish study tester get <id>`.
1423
+ const failedRows = results.filter((r) => {
1424
+ const status = String(r.status || "").toLowerCase();
1425
+ return (status === "failed" || status === "errored") && r.error_message;
1426
+ });
1427
+ if (failedRows.length > 0) {
1428
+ console.log("\nFailed simulations:");
1429
+ for (const r of failedRows) {
1430
+ const id = String(r.id || r.tester_id || "");
1431
+ const alias = aliasMap.get(id) || id;
1432
+ const name = String(r.tester_name || "Unknown");
1433
+ console.log(` ${alias} (${name}): ${truncate(String(r.error_message), 200)}`);
1434
+ }
1435
+ }
1114
1436
  }
1115
1437
  // --- Ask formatting ---
1116
1438
  function variantLetter(index) {
@@ -1324,6 +1646,9 @@ function computeVariantStats(round) {
1324
1646
  }
1325
1647
  return stats;
1326
1648
  }
1649
+ const ERROR_RATE_REFUSE_THRESHOLD = 0.5;
1650
+ const N_HIGH_CONFIDENCE_FLOOR = 10;
1651
+ const N_MEDIUM_CONFIDENCE_FLOOR = 3;
1327
1652
  // When tester_profile and tester_profile_snapshot share all overlapping fields
1328
1653
  // (the common case — snapshot only diverges if the profile was edited after
1329
1654
  // dispatch), drop the redundant content from the snapshot and keep only the
@@ -1366,6 +1691,12 @@ function dedupeTesterSnapshot(tester) {
1366
1691
  }
1367
1692
  // Shape per-variant stats into a machine-readable aggregates object so agents
1368
1693
  // running A/B tests can read the verdict without parsing prose.
1694
+ //
1695
+ // P6: the winner block carries `n` (completed-response sample size) and a
1696
+ // `confidence` summary derived from completion ratio + tied-ness. When the
1697
+ // error rate is high enough that calling a winner would be misleading, the
1698
+ // winner block is REPLACED by a `{ refused: true, reason }` envelope — agents
1699
+ // should run `ish ask retry` first and re-read.
1369
1700
  function buildAggregates(round, stats) {
1370
1701
  if (stats.length === 0)
1371
1702
  return undefined;
@@ -1374,6 +1705,19 @@ function buildAggregates(round, stats) {
1374
1705
  if (!wantsPick && !wantsRatings)
1375
1706
  return undefined;
1376
1707
  const out = {};
1708
+ // Use the same source of truth as the denormalized counts so winner.n is
1709
+ // consistent with `responses_complete` reported on the round envelope.
1710
+ const responses = Array.isArray(round.responses) ? round.responses : [];
1711
+ let completedTotal = 0;
1712
+ let erroredTotal = 0;
1713
+ for (const r of responses) {
1714
+ const status = r.status;
1715
+ if (status === "completed")
1716
+ completedTotal++;
1717
+ else if (status === "errored")
1718
+ erroredTotal++;
1719
+ }
1720
+ const dispatchedTotal = completedTotal + erroredTotal;
1377
1721
  if (wantsPick) {
1378
1722
  const picks = {};
1379
1723
  let topCount = -1;
@@ -1391,8 +1735,65 @@ function buildAggregates(round, stats) {
1391
1735
  }
1392
1736
  }
1393
1737
  out.picks = picks;
1738
+ // Pattern H: pick_buckets keyed by variant id (the value `add_ask_round
1739
+ // --subset-variant` accepts) → tester_ids that picked it. Pre-seed with
1740
+ // every declared variant id so unpicked variants surface as empty
1741
+ // arrays. Only completed responses with a resolvable variant_pick_id
1742
+ // contribute; an errored response carrying a stale variant_pick_id
1743
+ // would otherwise drag a tester into a drill-in audience whose pick
1744
+ // we can't trust.
1745
+ const variants = Array.isArray(round.variants) ? round.variants : [];
1746
+ const variantIdSet = new Set();
1747
+ const pickBuckets = {};
1748
+ for (const v of variants) {
1749
+ const vid = v.id;
1750
+ if (typeof vid === "string" && vid.length > 0) {
1751
+ variantIdSet.add(vid);
1752
+ pickBuckets[vid] = [];
1753
+ }
1754
+ }
1755
+ if (variantIdSet.size > 0) {
1756
+ for (const r of responses) {
1757
+ const resp = r;
1758
+ if (resp.status !== "completed")
1759
+ continue;
1760
+ const vpid = resp.variant_pick_id;
1761
+ const tid = resp.tester_id;
1762
+ if (typeof vpid === "string"
1763
+ && variantIdSet.has(vpid)
1764
+ && typeof tid === "string"
1765
+ && tid.length > 0) {
1766
+ pickBuckets[vpid].push(tid);
1767
+ }
1768
+ }
1769
+ out.pick_buckets = pickBuckets;
1770
+ }
1394
1771
  if (topCount > 0) {
1395
- out.winner = { letter: topLetter, count: topCount, tied };
1772
+ // Refuse the winner when more than half of dispatched responses errored.
1773
+ // Calling A or B with a 4/5 failure rate would mislead the agent into
1774
+ // treating one tester's pick as a verdict.
1775
+ if (dispatchedTotal > 0
1776
+ && erroredTotal / dispatchedTotal > ERROR_RATE_REFUSE_THRESHOLD) {
1777
+ out.winner = {
1778
+ refused: true,
1779
+ reason: "error_rate_too_high",
1780
+ errored: erroredTotal,
1781
+ total: dispatchedTotal,
1782
+ };
1783
+ }
1784
+ else {
1785
+ out.winner = {
1786
+ label: topLetter,
1787
+ count: topCount,
1788
+ tied,
1789
+ n: completedTotal,
1790
+ confidence: deriveWinnerConfidence({
1791
+ n: completedTotal,
1792
+ errored: erroredTotal,
1793
+ tied,
1794
+ }),
1795
+ };
1796
+ }
1396
1797
  }
1397
1798
  }
1398
1799
  if (wantsRatings) {
@@ -1410,6 +1811,30 @@ function buildAggregates(round, stats) {
1410
1811
  }
1411
1812
  return out;
1412
1813
  }
/**
 * Map sample size, tied-ness and error count to a coarse confidence label.
 *
 * The weakest applicable rule decides:
 *   - "low":    n below N_MEDIUM_CONFIDENCE_FLOOR, or the result is tied, or
 *               at least one response errored
 *   - "medium": otherwise, n below N_HIGH_CONFIDENCE_FLOOR
 *   - "high":   otherwise
 *
 * @param args `{ n, errored, tied }` — completed-response count, errored
 *   response count, and whether the top pick is tied.
 * @returns "low" | "medium" | "high".
 */
export function deriveWinnerConfidence(args) {
    const { n, errored, tied } = args;
    // Any single weakness (tiny sample, tie, visible failures) caps the label at "low".
    const hasWeakness = n < N_MEDIUM_CONFIDENCE_FLOOR || tied || errored > 0;
    if (hasWeakness) {
        return "low";
    }
    return n < N_HIGH_CONFIDENCE_FLOOR ? "medium" : "high";
}
1413
1838
  function buildCrossRoundSummary(rounds) {
1414
1839
  if (rounds.length < 2)
1415
1840
  return undefined;
@@ -1555,11 +1980,20 @@ export function formatAskResults(ask, json, roundFilter) {
1555
1980
  }
1556
1981
  const headers = ["ROUND", "WINNER", ...Array.from(letters).sort()];
1557
1982
  const rows = crossRound.rounds.map((entry) => {
1558
- const winnerCell = entry.winner
1559
- ? entry.winner.tied
1560
- ? `${entry.winner.letter} (tied)`
1561
- : entry.winner.letter
1562
- : "-";
1983
+ let winnerCell;
1984
+ if (!entry.winner) {
1985
+ winnerCell = "-";
1986
+ }
1987
+ else if ("refused" in entry.winner) {
1988
+ // Too many failures to call a winner this round; agent should retry.
1989
+ winnerCell = `(refused: ${entry.winner.errored}/${entry.winner.total} errored)`;
1990
+ }
1991
+ else {
1992
+ const base = entry.winner.tied
1993
+ ? `${entry.winner.label} (tied)`
1994
+ : entry.winner.label;
1995
+ winnerCell = `${base} · ${entry.winner.confidence}`;
1996
+ }
1563
1997
  return [
1564
1998
  `R${entry.round_number}`,
1565
1999
  winnerCell,
package/dist/lib/paths.d.ts CHANGED
@@ -12,3 +12,4 @@ export declare function binDir(): string;
12
12
  export declare function browsersDir(): string;
13
13
  export declare function simulationsDir(): string;
14
14
  export declare function cloudflaredBin(): string;
15
+ export declare function connectLockPath(): string;
package/dist/lib/paths.js CHANGED
@@ -34,3 +34,6 @@ export function cloudflaredBin() {
34
34
  const exe = process.platform === "win32" ? "cloudflared.exe" : "cloudflared";
35
35
  return path.join(binDir(), exe);
36
36
  }
/**
 * Path of the `connect.lock` file, located directly inside `rootDir()`.
 *
 * @returns The joined path string.
 */
export function connectLockPath() {
    const lockFileName = "connect.lock";
    return path.join(rootDir(), lockFileName);
}
package/dist/lib/skill-content.d.ts CHANGED
@@ -29,3 +29,21 @@ export interface SkillTargetSpec {
29
29
  consumers: string[];
30
30
  }
31
31
  export declare const SKILL_TARGETS: SkillTargetSpec[];
32
+ /**
33
+ * Walks from `startDir` upward (inclusive of the home directory, capped at
34
+ * the filesystem root) looking for an installed ish skill at any of
35
+ * SKILL_TARGETS. Returns the first hit, identified by the presence of a
36
+ * SKILL.md file. Used by `ish status` to nudge agents toward `ish init`
37
+ * when the project doesn't have the skill installed yet.
38
+ */
39
+ export declare function findInstalledSkill(startDir: string, fs: {
40
+ existsSync: (p: string) => boolean;
41
+ }, path: {
42
+ join: (...p: string[]) => string;
43
+ dirname: (p: string) => string;
44
+ resolve: (p: string) => string;
45
+ }, homeDir: string): {
46
+ target: SkillTargetSpec;
47
+ root: string;
48
+ skillMdPath: string;
49
+ } | null;