npm - @ishlabs/cli - Versions diffs - 0.22.0 → 0.23.0 - Mend

@ishlabs/cli 0.22.0 → 0.23.0

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (7) hide show

package/dist/commands/study-run.d.ts +2 -0
package/dist/commands/study-run.js +71 -20
package/dist/lib/docs.js +45 -2
package/dist/lib/skill-content.js +1 -0
package/dist/lib/study-participants.d.ts +3 -0
package/dist/lib/types.d.ts +4 -0
package/package.json +1 -1

package/dist/commands/study-run.d.ts CHANGED Viewed

@@ -44,6 +44,8 @@ interface ParticipantStatusRow {
     participant_name: string;
     interaction_count: number;
     error_message?: string;
+    error_kind?: string;
+    age_seconds?: number;
 }
 export declare function attachStudyRunCommands(study: Command): void;
 export {};

package/dist/commands/study-run.js CHANGED Viewed

@@ -108,6 +108,26 @@ const POLL_INTERVAL_MS = 5_000;
 // transparently reverts to POLL_INTERVAL_MS.
 const SSE_BACKSTOP_INTERVAL_MS = 30_000;
 const TERMINAL_STATUSES = new Set(["completed", "errored", "failed", "cancelled", "canceled"]);
+// If any running participant has been alive longer than this on the
+// server, the wait-timeout message picks up an explicit "likely stuck"
+// hint. Sized just above the worker's in-process stale-heartbeat
+// threshold (600s) so the suggestion matches the backend reaper's
+// verdict (see app/services/jobs/cleanup_stale_participants.py).
+const LIKELY_STUCK_AGE_SECONDS = 900;
+function buildWaitTimeoutMessage(opts) {
+    const base = `Timed out after ${opts.timeoutSeconds}s waiting for simulations. ` +
+        `${opts.done}/${opts.total} done. ${opts.resumeHint}`;
+    const likelyStuck = opts.rows.some((r) => typeof r.age_seconds === "number" &&
+        r.age_seconds >= LIKELY_STUCK_AGE_SECONDS &&
+        !TERMINAL_STATUSES.has(r.status));
+    if (!likelyStuck)
+        return base;
+    return (base +
+        " At least one participant has been running >15 min (see " +
+        "`progress.rows[].age_seconds`); the worker likely died. The " +
+        "backend reaper will mark it FAILED(stale_worker) within ~15 min — " +
+        "don't keep polling.");
+}
 function flattenParticipantStatuses(participants, opts = {}) {
     const rows = [];
     for (const t of participants ?? []) {
@@ -128,6 +148,8 @@ function flattenParticipantStatuses(participants, opts = {}) {
             participant_name: t.person?.name || "Unknown",
             interaction_count: Array.isArray(t.interactions) ? t.interactions.length : 0,
             ...(errorMessage && { error_message: String(errorMessage) }),
+            ...(t.error_kind && { error_kind: t.error_kind }),
+            ...(typeof t.age_seconds === "number" && { age_seconds: t.age_seconds }),
         });
     }
     return rows;
@@ -171,8 +193,13 @@ async function pollStudyUntilDone(client, opts) {
                 return { rows, isMedia };
             }
             if (Date.now() - start > opts.timeoutMs) {
-                throw new WaitTimeoutError(`Timed out after ${Math.round(opts.timeoutMs / 1000)}s waiting for simulations. ` +
-                    `${done}/${total} done. Run \`ish study poll --study ${opts.studyId}\` to check status.`, {
+                throw new WaitTimeoutError(buildWaitTimeoutMessage({
+                    timeoutSeconds: Math.round(opts.timeoutMs / 1000),
+                    done,
+                    total,
+                    rows,
+                    resumeHint: `Run \`ish study poll --study ${opts.studyId}\` to check status.`,
+                }), {
                     study_id: opts.studyId,
                     ...(opts.iterationId && { iteration_id: opts.iterationId }),
                     timeout_seconds: Math.round(opts.timeoutMs / 1000),
@@ -1128,20 +1155,32 @@ Examples:
                         // M8 + M9 (per-participant wait): structured wait_timeout with the
                         // current status as `progress.rows[0]` so `study wait <id>`
                         // always emits machine-readable final state.
-                        throw new WaitTimeoutError(`Timed out after ${Math.round(timeoutMs / 1000)}s waiting for participant ${participantId}. Last status: ${status}.`, {
+                        const ageSeconds = typeof data.age_seconds === "number"
+                            ? data.age_seconds
+                            : undefined;
+                        const rows = [
+                            {
+                                id: resolvedParticipant,
+                                status,
+                                participant_name: String(data.participant_name ?? "Unknown"),
+                                interaction_count: 0,
+                                ...(data.error_kind && { error_kind: String(data.error_kind) }),
+                                ...(typeof ageSeconds === "number" && { age_seconds: ageSeconds }),
+                            },
+                        ];
+                        throw new WaitTimeoutError(buildWaitTimeoutMessage({
+                            timeoutSeconds: Math.round(timeoutMs / 1000),
+                            done: 0,
+                            total: 1,
+                            rows,
+                            resumeHint: `Last status: ${status}.`,
+                        }), {
                             study_id: resolvedParticipant,
                             timeout_seconds: Math.round(timeoutMs / 1000),
                             done: 0,
                             total: 1,
                             pending: 1,
-                            rows: [
-                                {
-                                    id: resolvedParticipant,
-                                    status,
-                                    participant_name: String(data.participant_name ?? "Unknown"),
-                                    interaction_count: 0,
-                                },
-                            ],
+                            rows,
                         });
                     }
                     await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
@@ -1352,20 +1391,32 @@ See \`ish docs get-page concepts/extending-a-simulation\` for the full mental mo
                     return;
                 }
                 if (Date.now() - start > timeoutMs) {
-                    throw new WaitTimeoutError(`Timed out after ${Math.round(timeoutMs / 1000)}s waiting for participant ${newAlias}. Last status: ${s}.`, {
+                    const ageSeconds = typeof status.age_seconds === "number"
+                        ? status.age_seconds
+                        : undefined;
+                    const rows = [
+                        {
+                            id: newParticipantId,
+                            status: s,
+                            participant_name: String(status.participant_name ?? "Unknown"),
+                            interaction_count: typeof status.interaction_count === "number" ? status.interaction_count : 0,
+                            ...(status.error_kind && { error_kind: String(status.error_kind) }),
+                            ...(typeof ageSeconds === "number" && { age_seconds: ageSeconds }),
+                        },
+                    ];
+                    throw new WaitTimeoutError(buildWaitTimeoutMessage({
+                        timeoutSeconds: Math.round(timeoutMs / 1000),
+                        done: 0,
+                        total: 1,
+                        rows,
+                        resumeHint: `Last status: ${s}.`,
+                    }), {
                         study_id: newParticipantId,
                         timeout_seconds: Math.round(timeoutMs / 1000),
                         done: 0,
                         total: 1,
                         pending: 1,
-                        rows: [
-                            {
-                                id: newParticipantId,
-                                status: s,
-                                participant_name: String(status.participant_name ?? "Unknown"),
-                                interaction_count: typeof status.interaction_count === "number" ? status.interaction_count : 0,
-                            },
-                        ],
+                        rows,
                     });
                 }
                 await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));

package/dist/lib/docs.js CHANGED Viewed

@@ -1711,6 +1711,23 @@ removed); \`extend\` then spawns a fresh participant branched from the
 cancelled participant's last interaction. See
 \`concepts/extending-a-simulation\` for the full mental model.
+## Stuck runs are auto-failed (no manual intervention)
+If a worker dies mid-run (instance preemption, OOM, infra restart), the
+backend reaper transitions the participant to
+\`status: failed, error_kind: stale_worker\` within ~15 min — you don't
+need to \`cancel\` it. The status payload returned by
+\`/simulation/status/{participant_id}\` (and surfaced on \`study wait\`,
+\`study run --wait\`, \`study poll\`) includes \`age_seconds\` so agents
+can tell "just slow" from "the worker is gone." Once \`age_seconds\`
+exceeds ~900s for a non-terminal participant the wait-timeout envelope
+explicitly flags it as likely stuck — stop polling and let the reaper
+finish the row.
+\`error_kind: self_timeout\` is the same idea written by the worker
+itself when it self-detects passing its 25-min ceiling; \`stale_worker\`
+is the reaper's verdict when the row simply stopped reporting.
 ## Related
 - \`reference/json-mode\` — output modes (display vs capture vs chain).
@@ -1744,7 +1761,10 @@ mid-run?" scenario without restarting from scratch.
 When extend is **not** the right verb:
 - Source participant is still RUNNING. \`cancel\` it first, then extend.
-  Extend refuses non-terminal sources server-side.
+  Extend refuses non-terminal sources server-side. **Exception:** a
+  stale-heartbeat RUNNING row (worker died mid-run) is reaped to
+  \`failed, error_kind: stale_worker\` automatically within ~15 min — no
+  manual \`cancel\` needed; just wait for the reaper, then extend.
 - You want a fresh cohort with new people flags. Use \`study run\`
   with \`--person\` / \`--sample\` / \`--all\` instead — extend is a
   per-participant resume, not a batch op.
@@ -2223,7 +2243,30 @@ The CLI guarantees these contracts so agents can chain safely:
   envelope carries \`progress: {study_id, iteration_id?,
   timeout_seconds, done, total, pending, rows[]}\` so the agent
   can resume by polling rather than re-dispatching. Same shape on
-  \`study wait\` (single-participant rows[] has length 1).
+  \`study wait\` (single-participant rows[] has length 1). Each row
+  in \`progress.rows[]\` carries \`age_seconds\` (server-computed
+  liveness from \`started_at\`) plus \`error_kind\` when populated;
+  when any non-terminal row's \`age_seconds\` exceeds ~900s the
+  envelope's \`error\` message explicitly flags "the worker likely
+  died" — don't keep polling, the backend reaper will mark it
+  \`failed, error_kind=stale_worker\` within ~15 min.
+- **Participant \`error_kind\` enumeration.** Failed participants
+  carry a classified \`error_kind\` so agents branch without parsing
+  prose. Lifecycle/infra kinds: \`stale_worker\` (worker died mid-run,
+  reaper transitioned the row), \`self_timeout\` (worker self-aborted
+  past its 25-min runtime ceiling). Modality kinds:
+  \`first_impression_llm_failed\`, \`interview_llm_failed\`,
+  \`variant_preparation_failed\` (ask responses). CLI-side kinds:
+  \`ConfirmationRequired\` (destructive op in \`--json\` mode without
+  \`--yes\`), \`TunnelInactive\`, \`BotAuthError\`, \`BotShapeError\`,
+  \`BotInvalidResponseError\`. The full set is open — branch on the
+  ones you handle and treat the rest as "unknown failure, surface to
+  user."
+- **Per-participant status payload (\`/simulation/status/{id}\`)** carries
+  \`{job_id, status, create_time, completion_time?, error?, error_kind?,
+  started_at?, last_heartbeat_at?, age_seconds?}\`. \`age_seconds\` is
+  server-computed so clock skew between caller and backend doesn't
+  matter; treat absent fields as "older backend, info unavailable."
 - **\`study run\` accepts \`--dispatch-timeout <s>\`** (default 120)
   for the per-POST participants/batch + simulation/start budget. On
   timeout (or any dispatch failure), the error envelope includes

package/dist/lib/skill-content.js CHANGED Viewed

@@ -218,6 +218,7 @@ When in doubt: side-by-side comparison usually beats in-place edits. Ids are che
 - **Chatbot endpoint response-shape mismatch**: \`chat_endpoint_test\` succeeds shallowly if the bot responds at all, but a wrong response path (e.g. bot returns \`{ data: { reply } }\` instead of \`{ reply }\`) produces empty transcripts on the actual run. Inspect one full test response before dispatching participants.
 - **Chatbot auth drift**: tokens/sessions baked into \`--from-curl\` expire. If transcripts come back as identical short error strings, re-run \`chat_endpoint_test\` and refresh the curl spec.
 - **401 surfaces as fake blocker**: an unauthenticated endpoint produces "participant got stuck on auth screen" — looks like a UX blocker but is config. Always confirm endpoint auth before reading transcripts as user-research data.
+- **Don't poll a stuck run forever**: a participant whose worker died will sit in \`status: running\` until the backend reaper transitions it to \`failed, error_kind: stale_worker\` (~15 min). The per-participant status payload exposes \`age_seconds\` (server-computed from \`started_at\`); once it's above ~900s on a non-terminal row, the run is almost certainly stuck. The CLI's \`wait_timeout\` envelope explicitly flags this case in its \`error\` message — when you see "the worker likely died," stop polling and surface the failure rather than retrying. \`error_kind: self_timeout\` is the same idea but written by the worker itself when it self-aborts past its 25-min ceiling.
 - **No per-page/per-timestamp scoping for media**: there's no "evaluate just slide 14" or "react to seconds 0-30" API. State the focus explicitly in the \`assignment\` text, or pre-stitch the artifact (e.g. replace one slide locally, upload as a new iteration).
 - **\`study get --json\` participants live at the top level**, not nested under \`iterations[*].participants\`. The backend split made \`/studies/{id}\` lite (metadata + iteration shells, no participant graph) and added \`/studies/{id}/participants\`; the CLI joins them so \`study get --json\` carries a flat \`participants[]\` with \`iteration_id\` on each row. Read \`.participants[]\`, not \`.iterations[].participants[]\`.
 - **All destructive deletes require \`--yes\` in non-TTY mode**: \`ish workspace delete\`, \`study delete\`, \`ask delete\`, \`person delete\`, \`source delete\`, \`chat endpoint delete\`. In \`--json\` mode (or any piped/non-TTY invocation), omitting \`--yes\` refuses with \`error_kind: "ConfirmationRequired"\` + an \`example\` field showing the same command with \`--yes\` appended. \`workspace delete\` is the highest-blast-radius: it removes ALL nested studies, asks, people, secrets, configs, sources, and chat endpoints — the prompt names them explicitly.

package/dist/lib/study-participants.d.ts CHANGED Viewed

@@ -38,6 +38,9 @@ export interface StudyParticipant extends Participant {
     conversation_id?: string | null;
     error_message?: string | null;
     error_kind?: string | null;
+    started_at?: string | null;
+    last_heartbeat_at?: string | null;
+    age_seconds?: number | null;
     [k: string]: unknown;
 }
 export declare function fetchStudyParticipants(client: ApiClient, studyId: string, opts?: {

package/dist/lib/types.d.ts CHANGED Viewed

@@ -357,6 +357,10 @@ export interface SimulationStatus {
     create_time?: string;
     completion_time?: string;
     error?: string;
+    error_kind?: string | null;
+    started_at?: string | null;
+    last_heartbeat_at?: string | null;
+    age_seconds?: number | null;
 }
 export interface SimulationCancelResponse {
     job_id: string;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ishlabs/cli",
-  "version": "0.22.0",
+  "version": "0.23.0",
   "description": "The command-line interface for ish",
   "type": "module",
   "bin": {