@ishlabs/cli 0.22.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -44,6 +44,8 @@ interface ParticipantStatusRow {
44
44
  participant_name: string;
45
45
  interaction_count: number;
46
46
  error_message?: string;
47
+ error_kind?: string;
48
+ age_seconds?: number;
47
49
  }
48
50
  export declare function attachStudyRunCommands(study: Command): void;
49
51
  export {};
@@ -108,6 +108,26 @@ const POLL_INTERVAL_MS = 5_000;
108
108
  // transparently reverts to POLL_INTERVAL_MS.
109
109
  const SSE_BACKSTOP_INTERVAL_MS = 30_000;
110
110
  const TERMINAL_STATUSES = new Set(["completed", "errored", "failed", "cancelled", "canceled"]);
111
+ // If any running participant has been alive longer than this on the
112
+ // server, the wait-timeout message picks up an explicit "likely stuck"
113
+ // hint. Sized just above the worker's in-process stale-heartbeat
114
+ // threshold (600s) so the suggestion matches the backend reaper's
115
+ // verdict (see app/services/jobs/cleanup_stale_participants.py).
116
+ const LIKELY_STUCK_AGE_SECONDS = 900;
117
+ function buildWaitTimeoutMessage(opts) {
118
+ const base = `Timed out after ${opts.timeoutSeconds}s waiting for simulations. ` +
119
+ `${opts.done}/${opts.total} done. ${opts.resumeHint}`;
120
+ const likelyStuck = opts.rows.some((r) => typeof r.age_seconds === "number" &&
121
+ r.age_seconds >= LIKELY_STUCK_AGE_SECONDS &&
122
+ !TERMINAL_STATUSES.has(r.status));
123
+ if (!likelyStuck)
124
+ return base;
125
+ return (base +
126
+ " At least one participant has been running >15 min (see " +
127
+ "`progress.rows[].age_seconds`); the worker likely died. The " +
128
+ "backend reaper will mark it FAILED(stale_worker) within ~15 min — " +
129
+ "don't keep polling.");
130
+ }
111
131
  function flattenParticipantStatuses(participants, opts = {}) {
112
132
  const rows = [];
113
133
  for (const t of participants ?? []) {
@@ -128,6 +148,8 @@ function flattenParticipantStatuses(participants, opts = {}) {
128
148
  participant_name: t.person?.name || "Unknown",
129
149
  interaction_count: Array.isArray(t.interactions) ? t.interactions.length : 0,
130
150
  ...(errorMessage && { error_message: String(errorMessage) }),
151
+ ...(t.error_kind && { error_kind: t.error_kind }),
152
+ ...(typeof t.age_seconds === "number" && { age_seconds: t.age_seconds }),
131
153
  });
132
154
  }
133
155
  return rows;
@@ -171,8 +193,13 @@ async function pollStudyUntilDone(client, opts) {
171
193
  return { rows, isMedia };
172
194
  }
173
195
  if (Date.now() - start > opts.timeoutMs) {
174
- throw new WaitTimeoutError(`Timed out after ${Math.round(opts.timeoutMs / 1000)}s waiting for simulations. ` +
175
- `${done}/${total} done. Run \`ish study poll --study ${opts.studyId}\` to check status.`, {
196
+ throw new WaitTimeoutError(buildWaitTimeoutMessage({
197
+ timeoutSeconds: Math.round(opts.timeoutMs / 1000),
198
+ done,
199
+ total,
200
+ rows,
201
+ resumeHint: `Run \`ish study poll --study ${opts.studyId}\` to check status.`,
202
+ }), {
176
203
  study_id: opts.studyId,
177
204
  ...(opts.iterationId && { iteration_id: opts.iterationId }),
178
205
  timeout_seconds: Math.round(opts.timeoutMs / 1000),
@@ -1128,20 +1155,32 @@ Examples:
1128
1155
  // M8 + M9 (per-participant wait): structured wait_timeout with the
1129
1156
  // current status as `progress.rows[0]` so `study wait <id>`
1130
1157
  // always emits machine-readable final state.
1131
- throw new WaitTimeoutError(`Timed out after ${Math.round(timeoutMs / 1000)}s waiting for participant ${participantId}. Last status: ${status}.`, {
1158
+ const ageSeconds = typeof data.age_seconds === "number"
1159
+ ? data.age_seconds
1160
+ : undefined;
1161
+ const rows = [
1162
+ {
1163
+ id: resolvedParticipant,
1164
+ status,
1165
+ participant_name: String(data.participant_name ?? "Unknown"),
1166
+ interaction_count: 0,
1167
+ ...(data.error_kind && { error_kind: String(data.error_kind) }),
1168
+ ...(typeof ageSeconds === "number" && { age_seconds: ageSeconds }),
1169
+ },
1170
+ ];
1171
+ throw new WaitTimeoutError(buildWaitTimeoutMessage({
1172
+ timeoutSeconds: Math.round(timeoutMs / 1000),
1173
+ done: 0,
1174
+ total: 1,
1175
+ rows,
1176
+ resumeHint: `Last status: ${status}.`,
1177
+ }), {
1132
1178
  study_id: resolvedParticipant,
1133
1179
  timeout_seconds: Math.round(timeoutMs / 1000),
1134
1180
  done: 0,
1135
1181
  total: 1,
1136
1182
  pending: 1,
1137
- rows: [
1138
- {
1139
- id: resolvedParticipant,
1140
- status,
1141
- participant_name: String(data.participant_name ?? "Unknown"),
1142
- interaction_count: 0,
1143
- },
1144
- ],
1183
+ rows,
1145
1184
  });
1146
1185
  }
1147
1186
  await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
@@ -1352,20 +1391,32 @@ See \`ish docs get-page concepts/extending-a-simulation\` for the full mental mo
1352
1391
  return;
1353
1392
  }
1354
1393
  if (Date.now() - start > timeoutMs) {
1355
- throw new WaitTimeoutError(`Timed out after ${Math.round(timeoutMs / 1000)}s waiting for participant ${newAlias}. Last status: ${s}.`, {
1394
+ const ageSeconds = typeof status.age_seconds === "number"
1395
+ ? status.age_seconds
1396
+ : undefined;
1397
+ const rows = [
1398
+ {
1399
+ id: newParticipantId,
1400
+ status: s,
1401
+ participant_name: String(status.participant_name ?? "Unknown"),
1402
+ interaction_count: typeof status.interaction_count === "number" ? status.interaction_count : 0,
1403
+ ...(status.error_kind && { error_kind: String(status.error_kind) }),
1404
+ ...(typeof ageSeconds === "number" && { age_seconds: ageSeconds }),
1405
+ },
1406
+ ];
1407
+ throw new WaitTimeoutError(buildWaitTimeoutMessage({
1408
+ timeoutSeconds: Math.round(timeoutMs / 1000),
1409
+ done: 0,
1410
+ total: 1,
1411
+ rows,
1412
+ resumeHint: `Last status: ${s}.`,
1413
+ }), {
1356
1414
  study_id: newParticipantId,
1357
1415
  timeout_seconds: Math.round(timeoutMs / 1000),
1358
1416
  done: 0,
1359
1417
  total: 1,
1360
1418
  pending: 1,
1361
- rows: [
1362
- {
1363
- id: newParticipantId,
1364
- status: s,
1365
- participant_name: String(status.participant_name ?? "Unknown"),
1366
- interaction_count: typeof status.interaction_count === "number" ? status.interaction_count : 0,
1367
- },
1368
- ],
1419
+ rows,
1369
1420
  });
1370
1421
  }
1371
1422
  await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
package/dist/lib/docs.js CHANGED
@@ -1711,6 +1711,23 @@ removed); \`extend\` then spawns a fresh participant branched from the
1711
1711
  cancelled participant's last interaction. See
1712
1712
  \`concepts/extending-a-simulation\` for the full mental model.
1713
1713
 
1714
+ ## Stuck runs are auto-failed (no manual intervention)
1715
+
1716
+ If a worker dies mid-run (instance preemption, OOM, infra restart), the
1717
+ backend reaper transitions the participant to
1718
+ \`status: failed, error_kind: stale_worker\` within ~15 min — you don't
1719
+ need to \`cancel\` it. The status payload returned by
1720
+ \`/simulation/status/{participant_id}\` (and surfaced on \`study wait\`,
1721
+ \`study run --wait\`, \`study poll\`) includes \`age_seconds\` so agents
1722
+ can tell "just slow" from "the worker is gone." Once \`age_seconds\`
1723
+ exceeds ~900s for a non-terminal participant the wait-timeout envelope
1724
+ explicitly flags it as likely stuck — stop polling and let the reaper
1725
+ finish the row.
1726
+
1727
+ \`error_kind: self_timeout\` is the same idea written by the worker
1728
+ itself when it self-detects passing its 25-min ceiling; \`stale_worker\`
1729
+ is the reaper's verdict when the row simply stopped reporting.
1730
+
1714
1731
  ## Related
1715
1732
 
1716
1733
  - \`reference/json-mode\` — output modes (display vs capture vs chain).
@@ -1744,7 +1761,10 @@ mid-run?" scenario without restarting from scratch.
1744
1761
  When extend is **not** the right verb:
1745
1762
 
1746
1763
  - Source participant is still RUNNING. \`cancel\` it first, then extend.
1747
- Extend refuses non-terminal sources server-side.
1764
+ Extend refuses non-terminal sources server-side. **Exception:** a
1765
+ stale-heartbeat RUNNING row (worker died mid-run) is reaped to
1766
+ \`failed, error_kind: stale_worker\` automatically within ~15 min — no
1767
+ manual \`cancel\` needed; just wait for the reaper, then extend.
1748
1768
  - You want a fresh cohort with new people flags. Use \`study run\`
1749
1769
  with \`--person\` / \`--sample\` / \`--all\` instead — extend is a
1750
1770
  per-participant resume, not a batch op.
@@ -2223,7 +2243,30 @@ The CLI guarantees these contracts so agents can chain safely:
2223
2243
  envelope carries \`progress: {study_id, iteration_id?,
2224
2244
  timeout_seconds, done, total, pending, rows[]}\` so the agent
2225
2245
  can resume by polling rather than re-dispatching. Same shape on
2226
- \`study wait\` (single-participant rows[] has length 1).
2246
+ \`study wait\` (single-participant rows[] has length 1). Each row
2247
+ in \`progress.rows[]\` carries \`age_seconds\` (server-computed
2248
+ liveness from \`started_at\`) plus \`error_kind\` when populated;
2249
+ when any non-terminal row's \`age_seconds\` exceeds ~900s the
2250
+ envelope's \`error\` message explicitly flags "the worker likely
2251
+ died" — don't keep polling, the backend reaper will mark it
2252
+ \`failed, error_kind=stale_worker\` within ~15 min.
2253
+ - **Participant \`error_kind\` enumeration.** Failed participants
2254
+ carry a classified \`error_kind\` so agents branch without parsing
2255
+ prose. Lifecycle/infra kinds: \`stale_worker\` (worker died mid-run,
2256
+ reaper transitioned the row), \`self_timeout\` (worker self-aborted
2257
+ past its 25-min runtime ceiling). Modality kinds:
2258
+ \`first_impression_llm_failed\`, \`interview_llm_failed\`,
2259
+ \`variant_preparation_failed\` (ask responses). CLI-side kinds:
2260
+ \`ConfirmationRequired\` (destructive op in \`--json\` mode without
2261
+ \`--yes\`), \`TunnelInactive\`, \`BotAuthError\`, \`BotShapeError\`,
2262
+ \`BotInvalidResponseError\`. The full set is open — branch on the
2263
+ ones you handle and treat the rest as "unknown failure, surface to
2264
+ user."
2265
+ - **Per-participant status payload (\`/simulation/status/{id}\`)** carries
2266
+ \`{job_id, status, create_time, completion_time?, error?, error_kind?,
2267
+ started_at?, last_heartbeat_at?, age_seconds?}\`. \`age_seconds\` is
2268
+ server-computed so clock skew between caller and backend doesn't
2269
+ matter; treat absent fields as "older backend, info unavailable."
2227
2270
  - **\`study run\` accepts \`--dispatch-timeout <s>\`** (default 120)
2228
2271
  for the per-POST participants/batch + simulation/start budget. On
2229
2272
  timeout (or any dispatch failure), the error envelope includes
@@ -218,6 +218,7 @@ When in doubt: side-by-side comparison usually beats in-place edits. Ids are che
218
218
  - **Chatbot endpoint response-shape mismatch**: \`chat_endpoint_test\` succeeds shallowly if the bot responds at all, but a wrong response path (e.g. bot returns \`{ data: { reply } }\` instead of \`{ reply }\`) produces empty transcripts on the actual run. Inspect one full test response before dispatching participants.
219
219
  - **Chatbot auth drift**: tokens/sessions baked into \`--from-curl\` expire. If transcripts come back as identical short error strings, re-run \`chat_endpoint_test\` and refresh the curl spec.
220
220
  - **401 surfaces as fake blocker**: an unauthenticated endpoint produces "participant got stuck on auth screen" — looks like a UX blocker but is config. Always confirm endpoint auth before reading transcripts as user-research data.
221
+ - **Don't poll a stuck run forever**: a participant whose worker died will sit in \`status: running\` until the backend reaper transitions it to \`failed, error_kind: stale_worker\` (~15 min). The per-participant status payload exposes \`age_seconds\` (server-computed from \`started_at\`); once it's above ~900s on a non-terminal row, the run is almost certainly stuck. The CLI's \`wait_timeout\` envelope explicitly flags this case in its \`error\` message — when you see "the worker likely died," stop polling and surface the failure rather than retrying. \`error_kind: self_timeout\` is the same idea but written by the worker itself when it self-aborts past its 25-min ceiling.
221
222
  - **No per-page/per-timestamp scoping for media**: there's no "evaluate just slide 14" or "react to seconds 0-30" API. State the focus explicitly in the \`assignment\` text, or pre-stitch the artifact (e.g. replace one slide locally, upload as a new iteration).
222
223
  - **\`study get --json\` participants live at the top level**, not nested under \`iterations[*].participants\`. The backend split made \`/studies/{id}\` lite (metadata + iteration shells, no participant graph) and added \`/studies/{id}/participants\`; the CLI joins them so \`study get --json\` carries a flat \`participants[]\` with \`iteration_id\` on each row. Read \`.participants[]\`, not \`.iterations[].participants[]\`.
223
224
  - **All destructive deletes require \`--yes\` in non-TTY mode**: \`ish workspace delete\`, \`study delete\`, \`ask delete\`, \`person delete\`, \`source delete\`, \`chat endpoint delete\`. In \`--json\` mode (or any piped/non-TTY invocation), omitting \`--yes\` refuses with \`error_kind: "ConfirmationRequired"\` + an \`example\` field showing the same command with \`--yes\` appended. \`workspace delete\` is the highest-blast-radius: it removes ALL nested studies, asks, people, secrets, configs, sources, and chat endpoints — the prompt names them explicitly.
@@ -38,6 +38,9 @@ export interface StudyParticipant extends Participant {
38
38
  conversation_id?: string | null;
39
39
  error_message?: string | null;
40
40
  error_kind?: string | null;
41
+ started_at?: string | null;
42
+ last_heartbeat_at?: string | null;
43
+ age_seconds?: number | null;
41
44
  [k: string]: unknown;
42
45
  }
43
46
  export declare function fetchStudyParticipants(client: ApiClient, studyId: string, opts?: {
@@ -357,6 +357,10 @@ export interface SimulationStatus {
357
357
  create_time?: string;
358
358
  completion_time?: string;
359
359
  error?: string;
360
+ error_kind?: string | null;
361
+ started_at?: string | null;
362
+ last_heartbeat_at?: string | null;
363
+ age_seconds?: number | null;
360
364
  }
361
365
  export interface SimulationCancelResponse {
362
366
  job_id: string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ishlabs/cli",
3
- "version": "0.22.0",
3
+ "version": "0.23.0",
4
4
  "description": "The command-line interface for ish",
5
5
  "type": "module",
6
6
  "bin": {