@ishlabs/cli 0.22.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -108,6 +108,26 @@ const POLL_INTERVAL_MS = 5_000;
|
|
|
108
108
|
// transparently reverts to POLL_INTERVAL_MS.
|
|
109
109
|
const SSE_BACKSTOP_INTERVAL_MS = 30_000;
|
|
110
110
|
const TERMINAL_STATUSES = new Set(["completed", "errored", "failed", "cancelled", "canceled"]);
// If any running participant has been alive longer than this on the
// server, the wait-timeout message picks up an explicit "likely stuck"
// hint. Sized just above the worker's in-process stale-heartbeat
// threshold (600s) so the suggestion matches the backend reaper's
// verdict (see app/services/jobs/cleanup_stale_participants.py).
const LIKELY_STUCK_AGE_SECONDS = 900;
/**
 * Build the human-readable message carried by a wait-timeout error.
 *
 * The base message reports the timeout, the done/total counts and the
 * caller-supplied resume hint. When at least one row has a numeric
 * `age_seconds` at or above LIKELY_STUCK_AGE_SECONDS and a status that is
 * not in TERMINAL_STATUSES, a "likely stuck" hint is appended.
 *
 * @param opts.timeoutSeconds Timeout that elapsed, in seconds.
 * @param opts.done Number of finished participants.
 * @param opts.total Total number of participants waited on.
 * @param opts.rows Participant rows (`status`, optional `age_seconds`);
 *   a missing/nullish value is treated as "no rows".
 * @param opts.resumeHint Trailing sentence appended to the base message.
 * @returns The message string.
 */
function buildWaitTimeoutMessage(opts) {
    const base = `Timed out after ${opts.timeoutSeconds}s waiting for simulations. ` +
        `${opts.done}/${opts.total} done. ${opts.resumeHint}`;
    // Tolerate a missing rows array: this runs while constructing an error,
    // so a TypeError here would mask the real timeout.
    const rows = opts.rows ?? [];
    const likelyStuck = rows.some((r) => typeof r.age_seconds === "number" &&
        r.age_seconds >= LIKELY_STUCK_AGE_SECONDS &&
        !TERMINAL_STATUSES.has(r.status));
    if (!likelyStuck)
        return base;
    // Derive the advertised duration from the threshold so the text cannot
    // drift from the constant (900s -> "15").
    const stuckMinutes = Math.round(LIKELY_STUCK_AGE_SECONDS / 60);
    return (base +
        ` At least one participant has been running >${stuckMinutes} min (see ` +
        "`progress.rows[].age_seconds`); the worker likely died. The " +
        "backend reaper will mark it FAILED(stale_worker) within ~15 min — " +
        "don't keep polling.");
}
|
|
111
131
|
function flattenParticipantStatuses(participants, opts = {}) {
|
|
112
132
|
const rows = [];
|
|
113
133
|
for (const t of participants ?? []) {
|
|
@@ -128,6 +148,8 @@ function flattenParticipantStatuses(participants, opts = {}) {
|
|
|
128
148
|
participant_name: t.person?.name || "Unknown",
|
|
129
149
|
interaction_count: Array.isArray(t.interactions) ? t.interactions.length : 0,
|
|
130
150
|
...(errorMessage && { error_message: String(errorMessage) }),
|
|
151
|
+
...(t.error_kind && { error_kind: t.error_kind }),
|
|
152
|
+
...(typeof t.age_seconds === "number" && { age_seconds: t.age_seconds }),
|
|
131
153
|
});
|
|
132
154
|
}
|
|
133
155
|
return rows;
|
|
@@ -171,8 +193,13 @@ async function pollStudyUntilDone(client, opts) {
|
|
|
171
193
|
return { rows, isMedia };
|
|
172
194
|
}
|
|
173
195
|
if (Date.now() - start > opts.timeoutMs) {
|
|
174
|
-
throw new WaitTimeoutError(
|
|
175
|
-
|
|
196
|
+
throw new WaitTimeoutError(buildWaitTimeoutMessage({
|
|
197
|
+
timeoutSeconds: Math.round(opts.timeoutMs / 1000),
|
|
198
|
+
done,
|
|
199
|
+
total,
|
|
200
|
+
rows,
|
|
201
|
+
resumeHint: `Run \`ish study poll --study ${opts.studyId}\` to check status.`,
|
|
202
|
+
}), {
|
|
176
203
|
study_id: opts.studyId,
|
|
177
204
|
...(opts.iterationId && { iteration_id: opts.iterationId }),
|
|
178
205
|
timeout_seconds: Math.round(opts.timeoutMs / 1000),
|
|
@@ -1128,20 +1155,32 @@ Examples:
|
|
|
1128
1155
|
// M8 + M9 (per-participant wait): structured wait_timeout with the
|
|
1129
1156
|
// current status as `progress.rows[0]` so `study wait <id>`
|
|
1130
1157
|
// always emits machine-readable final state.
|
|
1131
|
-
|
|
1158
|
+
const ageSeconds = typeof data.age_seconds === "number"
|
|
1159
|
+
? data.age_seconds
|
|
1160
|
+
: undefined;
|
|
1161
|
+
const rows = [
|
|
1162
|
+
{
|
|
1163
|
+
id: resolvedParticipant,
|
|
1164
|
+
status,
|
|
1165
|
+
participant_name: String(data.participant_name ?? "Unknown"),
|
|
1166
|
+
interaction_count: 0,
|
|
1167
|
+
...(data.error_kind && { error_kind: String(data.error_kind) }),
|
|
1168
|
+
...(typeof ageSeconds === "number" && { age_seconds: ageSeconds }),
|
|
1169
|
+
},
|
|
1170
|
+
];
|
|
1171
|
+
throw new WaitTimeoutError(buildWaitTimeoutMessage({
|
|
1172
|
+
timeoutSeconds: Math.round(timeoutMs / 1000),
|
|
1173
|
+
done: 0,
|
|
1174
|
+
total: 1,
|
|
1175
|
+
rows,
|
|
1176
|
+
resumeHint: `Last status: ${status}.`,
|
|
1177
|
+
}), {
|
|
1132
1178
|
study_id: resolvedParticipant,
|
|
1133
1179
|
timeout_seconds: Math.round(timeoutMs / 1000),
|
|
1134
1180
|
done: 0,
|
|
1135
1181
|
total: 1,
|
|
1136
1182
|
pending: 1,
|
|
1137
|
-
rows
|
|
1138
|
-
{
|
|
1139
|
-
id: resolvedParticipant,
|
|
1140
|
-
status,
|
|
1141
|
-
participant_name: String(data.participant_name ?? "Unknown"),
|
|
1142
|
-
interaction_count: 0,
|
|
1143
|
-
},
|
|
1144
|
-
],
|
|
1183
|
+
rows,
|
|
1145
1184
|
});
|
|
1146
1185
|
}
|
|
1147
1186
|
await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
|
|
@@ -1352,20 +1391,32 @@ See \`ish docs get-page concepts/extending-a-simulation\` for the full mental mo
|
|
|
1352
1391
|
return;
|
|
1353
1392
|
}
|
|
1354
1393
|
if (Date.now() - start > timeoutMs) {
|
|
1355
|
-
|
|
1394
|
+
const ageSeconds = typeof status.age_seconds === "number"
|
|
1395
|
+
? status.age_seconds
|
|
1396
|
+
: undefined;
|
|
1397
|
+
const rows = [
|
|
1398
|
+
{
|
|
1399
|
+
id: newParticipantId,
|
|
1400
|
+
status: s,
|
|
1401
|
+
participant_name: String(status.participant_name ?? "Unknown"),
|
|
1402
|
+
interaction_count: typeof status.interaction_count === "number" ? status.interaction_count : 0,
|
|
1403
|
+
...(status.error_kind && { error_kind: String(status.error_kind) }),
|
|
1404
|
+
...(typeof ageSeconds === "number" && { age_seconds: ageSeconds }),
|
|
1405
|
+
},
|
|
1406
|
+
];
|
|
1407
|
+
throw new WaitTimeoutError(buildWaitTimeoutMessage({
|
|
1408
|
+
timeoutSeconds: Math.round(timeoutMs / 1000),
|
|
1409
|
+
done: 0,
|
|
1410
|
+
total: 1,
|
|
1411
|
+
rows,
|
|
1412
|
+
resumeHint: `Last status: ${s}.`,
|
|
1413
|
+
}), {
|
|
1356
1414
|
study_id: newParticipantId,
|
|
1357
1415
|
timeout_seconds: Math.round(timeoutMs / 1000),
|
|
1358
1416
|
done: 0,
|
|
1359
1417
|
total: 1,
|
|
1360
1418
|
pending: 1,
|
|
1361
|
-
rows
|
|
1362
|
-
{
|
|
1363
|
-
id: newParticipantId,
|
|
1364
|
-
status: s,
|
|
1365
|
-
participant_name: String(status.participant_name ?? "Unknown"),
|
|
1366
|
-
interaction_count: typeof status.interaction_count === "number" ? status.interaction_count : 0,
|
|
1367
|
-
},
|
|
1368
|
-
],
|
|
1419
|
+
rows,
|
|
1369
1420
|
});
|
|
1370
1421
|
}
|
|
1371
1422
|
await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
|
package/dist/lib/docs.js
CHANGED
|
@@ -1711,6 +1711,23 @@ removed); \`extend\` then spawns a fresh participant branched from the
|
|
|
1711
1711
|
cancelled participant's last interaction. See
|
|
1712
1712
|
\`concepts/extending-a-simulation\` for the full mental model.
|
|
1713
1713
|
|
|
1714
|
+
## Stuck runs are auto-failed (no manual intervention)
|
|
1715
|
+
|
|
1716
|
+
If a worker dies mid-run (instance preemption, OOM, infra restart), the
|
|
1717
|
+
backend reaper transitions the participant to
|
|
1718
|
+
\`status: failed, error_kind: stale_worker\` within ~15 min — you don't
|
|
1719
|
+
need to \`cancel\` it. The status payload returned by
|
|
1720
|
+
\`/simulation/status/{participant_id}\` (and surfaced on \`study wait\`,
|
|
1721
|
+
\`study run --wait\`, \`study poll\`) includes \`age_seconds\` so agents
|
|
1722
|
+
can tell "just slow" from "the worker is gone." Once \`age_seconds\`
|
|
1723
|
+
exceeds ~900s for a non-terminal participant the wait-timeout envelope
|
|
1724
|
+
explicitly flags it as likely stuck — stop polling and let the reaper
|
|
1725
|
+
finish the row.
|
|
1726
|
+
|
|
1727
|
+
\`error_kind: self_timeout\` is the same idea written by the worker
|
|
1728
|
+
itself when it self-detects passing its 25-min ceiling; \`stale_worker\`
|
|
1729
|
+
is the reaper's verdict when the row simply stopped reporting.
|
|
1730
|
+
|
|
1714
1731
|
## Related
|
|
1715
1732
|
|
|
1716
1733
|
- \`reference/json-mode\` — output modes (display vs capture vs chain).
|
|
@@ -1744,7 +1761,10 @@ mid-run?" scenario without restarting from scratch.
|
|
|
1744
1761
|
When extend is **not** the right verb:
|
|
1745
1762
|
|
|
1746
1763
|
- Source participant is still RUNNING. \`cancel\` it first, then extend.
|
|
1747
|
-
Extend refuses non-terminal sources server-side.
|
|
1764
|
+
Extend refuses non-terminal sources server-side. **Exception:** a
|
|
1765
|
+
stale-heartbeat RUNNING row (worker died mid-run) is reaped to
|
|
1766
|
+
\`failed, error_kind: stale_worker\` automatically within ~15 min — no
|
|
1767
|
+
manual \`cancel\` needed; just wait for the reaper, then extend.
|
|
1748
1768
|
- You want a fresh cohort with new people flags. Use \`study run\`
|
|
1749
1769
|
with \`--person\` / \`--sample\` / \`--all\` instead — extend is a
|
|
1750
1770
|
per-participant resume, not a batch op.
|
|
@@ -2223,7 +2243,30 @@ The CLI guarantees these contracts so agents can chain safely:
|
|
|
2223
2243
|
envelope carries \`progress: {study_id, iteration_id?,
|
|
2224
2244
|
timeout_seconds, done, total, pending, rows[]}\` so the agent
|
|
2225
2245
|
can resume by polling rather than re-dispatching. Same shape on
|
|
2226
|
-
\`study wait\` (single-participant rows[] has length 1).
|
|
2246
|
+
\`study wait\` (single-participant rows[] has length 1). Each row
|
|
2247
|
+
in \`progress.rows[]\` carries \`age_seconds\` (server-computed
|
|
2248
|
+
elapsed time since \`started_at\`) plus \`error_kind\` when populated;
|
|
2249
|
+
when any non-terminal row's \`age_seconds\` exceeds ~900s the
|
|
2250
|
+
envelope's \`error\` message explicitly flags "the worker likely
|
|
2251
|
+
died" — don't keep polling, the backend reaper will mark it
|
|
2252
|
+
\`failed, error_kind=stale_worker\` within ~15 min.
|
|
2253
|
+
- **Participant \`error_kind\` enumeration.** Failed participants
|
|
2254
|
+
carry a classified \`error_kind\` so agents branch without parsing
|
|
2255
|
+
prose. Lifecycle/infra kinds: \`stale_worker\` (worker died mid-run,
|
|
2256
|
+
reaper transitioned the row), \`self_timeout\` (worker self-aborted
|
|
2257
|
+
past its 25-min runtime ceiling). Modality kinds:
|
|
2258
|
+
\`first_impression_llm_failed\`, \`interview_llm_failed\`,
|
|
2259
|
+
\`variant_preparation_failed\` (ask responses). CLI-side kinds:
|
|
2260
|
+
\`ConfirmationRequired\` (destructive op in \`--json\` mode without
|
|
2261
|
+
\`--yes\`), \`TunnelInactive\`, \`BotAuthError\`, \`BotShapeError\`,
|
|
2262
|
+
\`BotInvalidResponseError\`. The full set is open — branch on the
|
|
2263
|
+
ones you handle and treat the rest as "unknown failure, surface to
|
|
2264
|
+
user."
|
|
2265
|
+
- **Per-participant status payload (\`/simulation/status/{id}\`)** carries
|
|
2266
|
+
\`{job_id, status, create_time, completion_time?, error?, error_kind?,
|
|
2267
|
+
started_at?, last_heartbeat_at?, age_seconds?}\`. \`age_seconds\` is
|
|
2268
|
+
server-computed so clock skew between caller and backend doesn't
|
|
2269
|
+
matter; treat absent fields as "older backend, info unavailable."
|
|
2227
2270
|
- **\`study run\` accepts \`--dispatch-timeout <s>\`** (default 120)
|
|
2228
2271
|
for the per-POST participants/batch + simulation/start budget. On
|
|
2229
2272
|
timeout (or any dispatch failure), the error envelope includes
|
|
@@ -218,6 +218,7 @@ When in doubt: side-by-side comparison usually beats in-place edits. Ids are che
|
|
|
218
218
|
- **Chatbot endpoint response-shape mismatch**: \`chat_endpoint_test\` succeeds shallowly if the bot responds at all, but a wrong response path (e.g. bot returns \`{ data: { reply } }\` instead of \`{ reply }\`) produces empty transcripts on the actual run. Inspect one full test response before dispatching participants.
|
|
219
219
|
- **Chatbot auth drift**: tokens/sessions baked into \`--from-curl\` expire. If transcripts come back as identical short error strings, re-run \`chat_endpoint_test\` and refresh the curl spec.
|
|
220
220
|
- **401 surfaces as fake blocker**: an unauthenticated endpoint produces "participant got stuck on auth screen" — looks like a UX blocker but is config. Always confirm endpoint auth before reading transcripts as user-research data.
|
|
221
|
+
- **Don't poll a stuck run forever**: a participant whose worker died will sit in \`status: running\` until the backend reaper transitions it to \`failed, error_kind: stale_worker\` (~15 min). The per-participant status payload exposes \`age_seconds\` (server-computed from \`started_at\`); once it's above ~900s on a non-terminal row, the run is almost certainly stuck. The CLI's \`wait_timeout\` envelope explicitly flags this case in its \`error\` message — when you see "the worker likely died," stop polling and surface the failure rather than retrying. \`error_kind: self_timeout\` is the same idea but written by the worker itself when it self-aborts past its 25-min ceiling.
|
|
221
222
|
- **No per-page/per-timestamp scoping for media**: there's no "evaluate just slide 14" or "react to seconds 0-30" API. State the focus explicitly in the \`assignment\` text, or pre-stitch the artifact (e.g. replace one slide locally, upload as a new iteration).
|
|
222
223
|
- **\`study get --json\` participants live at the top level**, not nested under \`iterations[*].participants\`. The backend split made \`/studies/{id}\` lite (metadata + iteration shells, no participant graph) and added \`/studies/{id}/participants\`; the CLI joins them so \`study get --json\` carries a flat \`participants[]\` with \`iteration_id\` on each row. Read \`.participants[]\`, not \`.iterations[].participants[]\`.
|
|
223
224
|
- **All destructive deletes require \`--yes\` in non-TTY mode**: \`ish workspace delete\`, \`study delete\`, \`ask delete\`, \`person delete\`, \`source delete\`, \`chat endpoint delete\`. In \`--json\` mode (or any piped/non-TTY invocation), omitting \`--yes\` refuses with \`error_kind: "ConfirmationRequired"\` + an \`example\` field showing the same command with \`--yes\` appended. \`workspace delete\` is the highest-blast-radius: it removes ALL nested studies, asks, people, secrets, configs, sources, and chat endpoints — the prompt names them explicitly.
|
|
@@ -38,6 +38,9 @@ export interface StudyParticipant extends Participant {
|
|
|
38
38
|
conversation_id?: string | null;
|
|
39
39
|
error_message?: string | null;
|
|
40
40
|
error_kind?: string | null;
|
|
41
|
+
started_at?: string | null;
|
|
42
|
+
last_heartbeat_at?: string | null;
|
|
43
|
+
age_seconds?: number | null;
|
|
41
44
|
[k: string]: unknown;
|
|
42
45
|
}
|
|
43
46
|
export declare function fetchStudyParticipants(client: ApiClient, studyId: string, opts?: {
|
package/dist/lib/types.d.ts
CHANGED
|
@@ -357,6 +357,10 @@ export interface SimulationStatus {
|
|
|
357
357
|
create_time?: string;
|
|
358
358
|
completion_time?: string;
|
|
359
359
|
error?: string;
|
|
360
|
+
error_kind?: string | null;
|
|
361
|
+
started_at?: string | null;
|
|
362
|
+
last_heartbeat_at?: string | null;
|
|
363
|
+
age_seconds?: number | null;
|
|
360
364
|
}
|
|
361
365
|
export interface SimulationCancelResponse {
|
|
362
366
|
job_id: string;
|