@ishlabs/cli 0.22.0 → 0.23.1

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -44,6 +44,8 @@ interface ParticipantStatusRow {
44
44
  participant_name: string;
45
45
  interaction_count: number;
46
46
  error_message?: string;
47
+ error_kind?: string;
48
+ age_seconds?: number;
47
49
  }
48
50
  export declare function attachStudyRunCommands(study: Command): void;
49
51
  export {};
@@ -108,6 +108,26 @@ const POLL_INTERVAL_MS = 5_000;
108
108
  // transparently reverts to POLL_INTERVAL_MS.
109
109
  const SSE_BACKSTOP_INTERVAL_MS = 30_000;
110
110
  const TERMINAL_STATUSES = new Set(["completed", "errored", "failed", "cancelled", "canceled"]);
111
+ // If any running participant has been alive longer than this on the
112
+ // server, the wait-timeout message picks up an explicit "likely stuck"
113
+ // hint. Sized just above the worker's in-process stale-heartbeat
114
+ // threshold (600s) so the suggestion matches the backend reaper's
115
+ // verdict (see app/services/jobs/cleanup_stale_participants.py).
116
+ const LIKELY_STUCK_AGE_SECONDS = 900;
117
+ function buildWaitTimeoutMessage(opts) {
118
+ const base = `Timed out after ${opts.timeoutSeconds}s waiting for simulations. ` +
119
+ `${opts.done}/${opts.total} done. ${opts.resumeHint}`;
120
+ const likelyStuck = opts.rows.some((r) => typeof r.age_seconds === "number" &&
121
+ r.age_seconds >= LIKELY_STUCK_AGE_SECONDS &&
122
+ !TERMINAL_STATUSES.has(r.status));
123
+ if (!likelyStuck)
124
+ return base;
125
+ return (base +
126
+ " At least one participant has been running >15 min (see " +
127
+ "`progress.rows[].age_seconds`); the worker likely died. The " +
128
+ "backend reaper will mark it FAILED(stale_worker) within ~15 min — " +
129
+ "don't keep polling.");
130
+ }
111
131
  function flattenParticipantStatuses(participants, opts = {}) {
112
132
  const rows = [];
113
133
  for (const t of participants ?? []) {
@@ -128,6 +148,8 @@ function flattenParticipantStatuses(participants, opts = {}) {
128
148
  participant_name: t.person?.name || "Unknown",
129
149
  interaction_count: Array.isArray(t.interactions) ? t.interactions.length : 0,
130
150
  ...(errorMessage && { error_message: String(errorMessage) }),
151
+ ...(t.error_kind && { error_kind: t.error_kind }),
152
+ ...(typeof t.age_seconds === "number" && { age_seconds: t.age_seconds }),
131
153
  });
132
154
  }
133
155
  return rows;
@@ -171,8 +193,13 @@ async function pollStudyUntilDone(client, opts) {
171
193
  return { rows, isMedia };
172
194
  }
173
195
  if (Date.now() - start > opts.timeoutMs) {
174
- throw new WaitTimeoutError(`Timed out after ${Math.round(opts.timeoutMs / 1000)}s waiting for simulations. ` +
175
- `${done}/${total} done. Run \`ish study poll --study ${opts.studyId}\` to check status.`, {
196
+ throw new WaitTimeoutError(buildWaitTimeoutMessage({
197
+ timeoutSeconds: Math.round(opts.timeoutMs / 1000),
198
+ done,
199
+ total,
200
+ rows,
201
+ resumeHint: `Run \`ish study poll --study ${opts.studyId}\` to check status.`,
202
+ }), {
176
203
  study_id: opts.studyId,
177
204
  ...(opts.iterationId && { iteration_id: opts.iterationId }),
178
205
  timeout_seconds: Math.round(opts.timeoutMs / 1000),
@@ -1128,20 +1155,32 @@ Examples:
1128
1155
  // M8 + M9 (per-participant wait): structured wait_timeout with the
1129
1156
  // current status as `progress.rows[0]` so `study wait <id>`
1130
1157
  // always emits machine-readable final state.
1131
- throw new WaitTimeoutError(`Timed out after ${Math.round(timeoutMs / 1000)}s waiting for participant ${participantId}. Last status: ${status}.`, {
1158
+ const ageSeconds = typeof data.age_seconds === "number"
1159
+ ? data.age_seconds
1160
+ : undefined;
1161
+ const rows = [
1162
+ {
1163
+ id: resolvedParticipant,
1164
+ status,
1165
+ participant_name: String(data.participant_name ?? "Unknown"),
1166
+ interaction_count: 0,
1167
+ ...(data.error_kind && { error_kind: String(data.error_kind) }),
1168
+ ...(typeof ageSeconds === "number" && { age_seconds: ageSeconds }),
1169
+ },
1170
+ ];
1171
+ throw new WaitTimeoutError(buildWaitTimeoutMessage({
1172
+ timeoutSeconds: Math.round(timeoutMs / 1000),
1173
+ done: 0,
1174
+ total: 1,
1175
+ rows,
1176
+ resumeHint: `Last status: ${status}.`,
1177
+ }), {
1132
1178
  study_id: resolvedParticipant,
1133
1179
  timeout_seconds: Math.round(timeoutMs / 1000),
1134
1180
  done: 0,
1135
1181
  total: 1,
1136
1182
  pending: 1,
1137
- rows: [
1138
- {
1139
- id: resolvedParticipant,
1140
- status,
1141
- participant_name: String(data.participant_name ?? "Unknown"),
1142
- interaction_count: 0,
1143
- },
1144
- ],
1183
+ rows,
1145
1184
  });
1146
1185
  }
1147
1186
  await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
@@ -1352,20 +1391,32 @@ See \`ish docs get-page concepts/extending-a-simulation\` for the full mental mo
1352
1391
  return;
1353
1392
  }
1354
1393
  if (Date.now() - start > timeoutMs) {
1355
- throw new WaitTimeoutError(`Timed out after ${Math.round(timeoutMs / 1000)}s waiting for participant ${newAlias}. Last status: ${s}.`, {
1394
+ const ageSeconds = typeof status.age_seconds === "number"
1395
+ ? status.age_seconds
1396
+ : undefined;
1397
+ const rows = [
1398
+ {
1399
+ id: newParticipantId,
1400
+ status: s,
1401
+ participant_name: String(status.participant_name ?? "Unknown"),
1402
+ interaction_count: typeof status.interaction_count === "number" ? status.interaction_count : 0,
1403
+ ...(status.error_kind && { error_kind: String(status.error_kind) }),
1404
+ ...(typeof ageSeconds === "number" && { age_seconds: ageSeconds }),
1405
+ },
1406
+ ];
1407
+ throw new WaitTimeoutError(buildWaitTimeoutMessage({
1408
+ timeoutSeconds: Math.round(timeoutMs / 1000),
1409
+ done: 0,
1410
+ total: 1,
1411
+ rows,
1412
+ resumeHint: `Last status: ${s}.`,
1413
+ }), {
1356
1414
  study_id: newParticipantId,
1357
1415
  timeout_seconds: Math.round(timeoutMs / 1000),
1358
1416
  done: 0,
1359
1417
  total: 1,
1360
1418
  pending: 1,
1361
- rows: [
1362
- {
1363
- id: newParticipantId,
1364
- status: s,
1365
- participant_name: String(status.participant_name ?? "Unknown"),
1366
- interaction_count: typeof status.interaction_count === "number" ? status.interaction_count : 0,
1367
- },
1368
- ],
1419
+ rows,
1369
1420
  });
1370
1421
  }
1371
1422
  await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
@@ -175,6 +175,8 @@ Examples:
175
175
  Usage counters:
176
176
  studies_used / studies_max — current study count vs the user's plan cap
177
177
  people_used / people_max — workspace-private participant profile count vs cap
178
+ concurrent_participants_max — max in-flight participants per dispatch
179
+ workspace_members_max — max workspace members (seats)
178
180
 
179
181
  Caps fall back to null when the user's plan grants unlimited (math.inf). The
180
182
  account tier is read from /account/me; limit tables come from /billing/limits.
@@ -196,7 +198,11 @@ Examples:
196
198
  if (usage.tier)
197
199
  console.log(`Plan: ${usage.tier}`);
198
200
  console.log(`Studies: ${usage.studies_used} / ${renderCap(usage.studies_max)}`);
199
- console.log(`Custom people: ${usage.people_used} / ${renderCap(usage.people_max)}`);
201
+ console.log(`Custom people: ${usage.people_used} / ${renderCap(usage.people_max)}`);
202
+ if (usage.concurrent_participants_max !== null)
203
+ console.log(`Max concurrent: ${renderCap(usage.concurrent_participants_max)}`);
204
+ if (usage.workspace_members_max !== null)
205
+ console.log(`Max members: ${renderCap(usage.workspace_members_max)}`);
200
206
  });
201
207
  });
202
208
  workspace
@@ -276,19 +282,18 @@ async function collectWorkspaceUsage(client, workspaceId) {
276
282
  ]);
277
283
  const tier = typeof account.credits?.tier === "string" ? account.credits.tier : null;
278
284
  const tierTable = tier ? limits.tiers?.[tier] ?? null : null;
279
- const studiesMax = tierTable && "maxStudiesPerProduct" in tierTable ? tierTable.maxStudiesPerProduct : null;
280
- const peopleMax = tierTable && "maxCustomPersons" in tierTable
281
- ? tierTable.maxCustomPersons
282
- : null;
285
+ const lookupLimit = (key) => tierTable && key in tierTable ? tierTable[key] : null;
283
286
  return {
284
287
  id: workspaceId,
285
288
  name: product?.name ?? null,
286
289
  base_url: product?.base_url ?? null,
287
290
  tier,
288
291
  studies_used: Array.isArray(studies) ? studies.length : 0,
289
- studies_max: studiesMax,
292
+ studies_max: lookupLimit("maxStudiesPerProduct"),
290
293
  people_used: typeof participants.total === "number" ? participants.total : 0,
291
- people_max: peopleMax,
294
+ people_max: lookupLimit("maxCustomPersons"),
295
+ concurrent_participants_max: lookupLimit("maxConcurrentParticipants"),
296
+ workspace_members_max: lookupLimit("maxWorkspaceMembers"),
292
297
  };
293
298
  }
294
299
  // ---------------------------------------------------------------------------
package/dist/lib/docs.js CHANGED
@@ -113,15 +113,17 @@ ish workspace info --json
113
113
  {
114
114
  "studies_used": 2,
115
115
  "studies_max": 3,
116
- "participants_used": 0,
117
- "participants_max": 3,
116
+ "people_used": 0,
117
+ "people_max": 3,
118
+ "concurrent_participants_max": 3,
119
+ "workspace_members_max": 1,
118
120
  "tier": "free"
119
121
  }
120
122
  \`\`\`
121
123
 
122
124
  A \`null\` value on a \`*_max\` field means "unlimited" (paid tiers).
123
125
  Branch on \`studies_used >= studies_max\` before \`study create\`,
124
- likewise for \`participants_used\` before \`study run --sample\`.
126
+ likewise for \`people_used\` before \`study run --sample\`.
125
127
 
126
128
  ## Cold start — \`workspace_create\` is not safe to call blind
127
129
 
@@ -1711,6 +1713,23 @@ removed); \`extend\` then spawns a fresh participant branched from the
1711
1713
  cancelled participant's last interaction. See
1712
1714
  \`concepts/extending-a-simulation\` for the full mental model.
1713
1715
 
1716
+ ## Stuck runs are auto-failed (no manual intervention)
1717
+
1718
+ If a worker dies mid-run (instance preemption, OOM, infra restart), the
1719
+ backend reaper transitions the participant to
1720
+ \`status: failed, error_kind: stale_worker\` within ~15 min — you don't
1721
+ need to \`cancel\` it. The status payload returned by
1722
+ \`/simulation/status/{participant_id}\` (and surfaced on \`study wait\`,
1723
+ \`study run --wait\`, \`study poll\`) includes \`age_seconds\` so agents
1724
+ can tell "just slow" from "the worker is gone." Once \`age_seconds\`
1725
+ exceeds ~900s for a non-terminal participant the wait-timeout envelope
1726
+ explicitly flags it as likely stuck — stop polling and let the reaper
1727
+ finish the row.
1728
+
1729
+ \`error_kind: self_timeout\` is the same idea written by the worker
1730
+ itself when it self-detects passing its 25-min ceiling; \`stale_worker\`
1731
+ is the reaper's verdict when the row simply stopped reporting.
1732
+
1714
1733
  ## Related
1715
1734
 
1716
1735
  - \`reference/json-mode\` — output modes (display vs capture vs chain).
@@ -1744,7 +1763,10 @@ mid-run?" scenario without restarting from scratch.
1744
1763
  When extend is **not** the right verb:
1745
1764
 
1746
1765
  - Source participant is still RUNNING. \`cancel\` it first, then extend.
1747
- Extend refuses non-terminal sources server-side.
1766
+ Extend refuses non-terminal sources server-side. **Exception:** a
1767
+ stale-heartbeat RUNNING row (worker died mid-run) is reaped to
1768
+ \`failed, error_kind: stale_worker\` automatically within ~15 min — no
1769
+ manual \`cancel\` needed; just wait for the reaper, then extend.
1748
1770
  - You want a fresh cohort with new people flags. Use \`study run\`
1749
1771
  with \`--person\` / \`--sample\` / \`--all\` instead — extend is a
1750
1772
  per-participant resume, not a batch op.
@@ -2223,7 +2245,30 @@ The CLI guarantees these contracts so agents can chain safely:
2223
2245
  envelope carries \`progress: {study_id, iteration_id?,
2224
2246
  timeout_seconds, done, total, pending, rows[]}\` so the agent
2225
2247
  can resume by polling rather than re-dispatching. Same shape on
2226
- \`study wait\` (single-participant rows[] has length 1).
2248
+ \`study wait\` (single-participant rows[] has length 1). Each row
2249
+ in \`progress.rows[]\` carries \`age_seconds\` (server-computed
2250
+ liveness from \`started_at\`) plus \`error_kind\` when populated;
2251
+ when any non-terminal row's \`age_seconds\` exceeds ~900s the
2252
+ envelope's \`error\` message explicitly flags "the worker likely
2253
+ died" — don't keep polling, the backend reaper will mark it
2254
+ \`failed, error_kind=stale_worker\` within ~15 min.
2255
+ - **Participant \`error_kind\` enumeration.** Failed participants
2256
+ carry a classified \`error_kind\` so agents branch without parsing
2257
+ prose. Lifecycle/infra kinds: \`stale_worker\` (worker died mid-run,
2258
+ reaper transitioned the row), \`self_timeout\` (worker self-aborted
2259
+ past its 25-min runtime ceiling). Modality kinds:
2260
+ \`first_impression_llm_failed\`, \`interview_llm_failed\`,
2261
+ \`variant_preparation_failed\` (ask responses). CLI-side kinds:
2262
+ \`ConfirmationRequired\` (destructive op in \`--json\` mode without
2263
+ \`--yes\`), \`TunnelInactive\`, \`BotAuthError\`, \`BotShapeError\`,
2264
+ \`BotInvalidResponseError\`. The full set is open — branch on the
2265
+ ones you handle and treat the rest as "unknown failure, surface to
2266
+ user."
2267
+ - **Per-participant status payload (\`/simulation/status/{id}\`)** carries
2268
+ \`{job_id, status, create_time, completion_time?, error?, error_kind?,
2269
+ started_at?, last_heartbeat_at?, age_seconds?}\`. \`age_seconds\` is
2270
+ server-computed so clock skew between caller and backend doesn't
2271
+ matter; treat absent fields as "older backend, info unavailable."
2227
2272
  - **\`study run\` accepts \`--dispatch-timeout <s>\`** (default 120)
2228
2273
  for the per-POST participants/batch + simulation/start budget. On
2229
2274
  timeout (or any dispatch failure), the error envelope includes
@@ -2984,16 +3029,21 @@ The \`formula\` key is stable: agents can branch on it (\`media_per_participant\
2984
3029
 
2985
3030
  ## Tier allotments
2986
3031
 
2987
- | Tier | Monthly credits | Notes |
2988
- |-------------|---------------------------|--------------------------------|
2989
- | FREE | 200 (one-time signup) | Never refilled |
2990
- | STARTER | 1,000 / month | Monthly reset |
2991
- | PRO | 3,000 / month | Monthly reset |
2992
- | ENTERPRISE | unlimited | Custom contract |
3032
+ Paid tiers use **dynamic credit budgets** — the user selects a credit
3033
+ bucket at subscription time. The table shows the range of available
3034
+ buckets per tier:
3035
+
3036
+ | Tier | Monthly credits | Notes |
3037
+ |-------------|---------------------------|--------------------------------------|
3038
+ | FREE | 200 (one-time signup) | Never refilled |
3039
+ | STARTER | 200 – 2,500 / month | User selects bucket (solo, 1 seat) |
3040
+ | PRO | 500 – 10,000 / month | User selects bucket (team, 10 seats) |
3041
+ | ENTERPRISE | unlimited | Custom contract |
2993
3042
 
2994
3043
  The CLI does not enforce these — the backend does. The CLI's job is to
2995
3044
  *preview*, so an agent doesn't dispatch a 5,000-credit run on a
2996
- 200-credit account.
3045
+ 200-credit account. The actual credit budget for a given account depends
3046
+ on which bucket the user chose — query \`workspace info\` for headroom.
2997
3047
 
2998
3048
  ## Insufficient-credit rejection shape
2999
3049
 
@@ -3073,12 +3123,18 @@ request time, for any client, is the backend's \`TIER_LIMITS\` dict in
3073
3123
  | \`maxProducts\` | 1 | 1 | ∞ | ∞ | ∞ |
3074
3124
  | \`maxStudiesPerProduct\` | 3 | ∞ | ∞ | ∞ | ∞ |
3075
3125
  | \`maxIterationsPerStudy\` | 2 | ∞ | ∞ | ∞ | ∞ |
3076
- | \`maxCustomPersons\` | 3 | 10 | 10 | ∞ | ∞ |
3126
+ | \`maxCustomPersons\` | 3 | 10 | 10 | ∞ | ∞ |
3127
+ | \`maxConcurrentParticipants\` | 3 | 3 | 10 | 50 | ∞ |
3128
+ | \`maxWorkspaceMembers\` | 1 | 1 | 1 | 10 | ∞ |
3077
3129
 
3078
3130
  Commands that may hit a limit: \`ish workspace create\`,
3079
3131
  \`ish study create\`, \`ish study generate\`, \`ish iteration create\`,
3080
3132
  \`ish person create\`, \`ish person generate\`.
3081
3133
 
3134
+ \`maxConcurrentParticipants\` gates how many participants can be in-flight
3135
+ at once per dispatch. \`maxWorkspaceMembers\` gates workspace membership
3136
+ (seats). Both are enforced server-side.
3137
+
3082
3138
  ## What you see when a limit is hit
3083
3139
 
3084
3140
  Human output (stderr):
@@ -218,6 +218,7 @@ When in doubt: side-by-side comparison usually beats in-place edits. Ids are che
218
218
  - **Chatbot endpoint response-shape mismatch**: \`chat_endpoint_test\` succeeds shallowly if the bot responds at all, but a wrong response path (e.g. bot returns \`{ data: { reply } }\` instead of \`{ reply }\`) produces empty transcripts on the actual run. Inspect one full test response before dispatching participants.
219
219
  - **Chatbot auth drift**: tokens/sessions baked into \`--from-curl\` expire. If transcripts come back as identical short error strings, re-run \`chat_endpoint_test\` and refresh the curl spec.
220
220
  - **401 surfaces as fake blocker**: an unauthenticated endpoint produces "participant got stuck on auth screen" — looks like a UX blocker but is config. Always confirm endpoint auth before reading transcripts as user-research data.
221
+ - **Don't poll a stuck run forever**: a participant whose worker died will sit in \`status: running\` until the backend reaper transitions it to \`failed, error_kind: stale_worker\` (~15 min). The per-participant status payload exposes \`age_seconds\` (server-computed from \`started_at\`); once it's above ~900s on a non-terminal row, the run is almost certainly stuck. The CLI's \`wait_timeout\` envelope explicitly flags this case in its \`error\` message — when you see "the worker likely died," stop polling and surface the failure rather than retrying. \`error_kind: self_timeout\` is the same idea but written by the worker itself when it self-aborts past its 25-min ceiling.
221
222
  - **No per-page/per-timestamp scoping for media**: there's no "evaluate just slide 14" or "react to seconds 0-30" API. State the focus explicitly in the \`assignment\` text, or pre-stitch the artifact (e.g. replace one slide locally, upload as a new iteration).
222
223
  - **\`study get --json\` participants live at the top level**, not nested under \`iterations[*].participants\`. The backend split made \`/studies/{id}\` lite (metadata + iteration shells, no participant graph) and added \`/studies/{id}/participants\`; the CLI joins them so \`study get --json\` carries a flat \`participants[]\` with \`iteration_id\` on each row. Read \`.participants[]\`, not \`.iterations[].participants[]\`.
223
224
  - **All destructive deletes require \`--yes\` in non-TTY mode**: \`ish workspace delete\`, \`study delete\`, \`ask delete\`, \`person delete\`, \`source delete\`, \`chat endpoint delete\`. In \`--json\` mode (or any piped/non-TTY invocation), omitting \`--yes\` refuses with \`error_kind: "ConfirmationRequired"\` + an \`example\` field showing the same command with \`--yes\` appended. \`workspace delete\` is the highest-blast-radius: it removes ALL nested studies, asks, people, secrets, configs, sources, and chat endpoints — the prompt names them explicitly.
@@ -584,7 +585,7 @@ also in \`study poll --json\`. Branch on it instead of treating
584
585
  \`interaction_count: 0\` as a generic failure.
585
586
 
586
587
  Pre-flight tip: \`ish workspace info\` exposes
587
- \`{studies_used, studies_max, participants_used, participants_max, tier}\` so
588
+ \`{studies_used, studies_max, people_used, people_max, concurrent_participants_max, workspace_members_max, tier}\` so
588
589
  you can branch on plan caps before \`study create\` returns
589
590
  \`error_code: usage_limit_reached\`.
590
591
 
@@ -38,6 +38,9 @@ export interface StudyParticipant extends Participant {
38
38
  conversation_id?: string | null;
39
39
  error_message?: string | null;
40
40
  error_kind?: string | null;
41
+ started_at?: string | null;
42
+ last_heartbeat_at?: string | null;
43
+ age_seconds?: number | null;
41
44
  [k: string]: unknown;
42
45
  }
43
46
  export declare function fetchStudyParticipants(client: ApiClient, studyId: string, opts?: {
@@ -357,6 +357,10 @@ export interface SimulationStatus {
357
357
  create_time?: string;
358
358
  completion_time?: string;
359
359
  error?: string;
360
+ error_kind?: string | null;
361
+ started_at?: string | null;
362
+ last_heartbeat_at?: string | null;
363
+ age_seconds?: number | null;
360
364
  }
361
365
  export interface SimulationCancelResponse {
362
366
  job_id: string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ishlabs/cli",
3
- "version": "0.22.0",
3
+ "version": "0.23.1",
4
4
  "description": "The command-line interface for ish",
5
5
  "type": "module",
6
6
  "bin": {