@ishlabs/cli 0.13.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/iteration.js +219 -22
- package/dist/commands/profile.js +75 -9
- package/dist/commands/source.js +6 -4
- package/dist/commands/study-run.js +382 -34
- package/dist/commands/study.js +170 -9
- package/dist/commands/workspace.js +35 -2
- package/dist/lib/accessibility-profile.d.ts +12 -0
- package/dist/lib/accessibility-profile.js +136 -0
- package/dist/lib/ask-questions.js +9 -0
- package/dist/lib/billing.d.ts +55 -0
- package/dist/lib/billing.js +77 -0
- package/dist/lib/docs.js +1106 -36
- package/dist/lib/enums.d.ts +54 -0
- package/dist/lib/enums.js +100 -0
- package/dist/lib/local-sim/actions.d.ts +2 -1
- package/dist/lib/local-sim/actions.js +88 -13
- package/dist/lib/local-sim/loop.js +49 -19
- package/dist/lib/local-sim/tabs.d.ts +27 -0
- package/dist/lib/local-sim/tabs.js +157 -0
- package/dist/lib/local-sim/types.d.ts +15 -0
- package/dist/lib/modality.d.ts +70 -1
- package/dist/lib/modality.js +323 -17
- package/dist/lib/output.js +61 -4
- package/dist/lib/skill-content.js +382 -19
- package/dist/lib/types.d.ts +6 -1
- package/package.json +1 -1
|
@@ -11,15 +11,33 @@ import * as readline from "node:readline/promises";
|
|
|
11
11
|
import { withClient, getWebUrl, terminalLink, resolveWorkspace, resolveStudy, parseWaitTimeout, resolveAudienceProfileIds, addAudienceFilterFlags, hasAudienceFlags, } from "../lib/command-helpers.js";
|
|
12
12
|
import { resolveId, tagAlias, ALIAS_PREFIX } from "../lib/alias-store.js";
|
|
13
13
|
import { output, formatSimulationPoll } from "../lib/output.js";
|
|
14
|
-
import { isMediaModality, isChatModality, iterationHasContent, describeRequiredContentFlag, } from "../lib/modality.js";
|
|
14
|
+
import { isMediaModality, isChatModality, iterationHasContent, describeRequiredContentFlag, readChatMode, readTesterPairConfig, summarizeRoleCriteria, } from "../lib/modality.js";
|
|
15
15
|
import { runLocalSimulations } from "../lib/local-sim/loop.js";
|
|
16
16
|
import { ensureBrowser } from "../lib/local-sim/install.js";
|
|
17
|
+
import { estimateChatPair, estimateChatSolo, estimateMediaRun } from "../lib/billing.js";
|
|
17
18
|
/**
 * Parses the `--max-interactions` flag value into a positive integer.
 *
 * The entire value must be a base-10 whole number. Inputs such as "30x",
 * "1e3" or "2.5" are rejected instead of being silently truncated to their
 * leading digits, so a typo can never turn into an unexpectedly small cap.
 *
 * @param {string|number} value - Raw flag value as received from the CLI.
 * @returns {number} The parsed interaction cap (always >= 1).
 * @throws {Error} When the value is not a whole number or is below 1.
 */
function parseMaxInteractions(value) {
    const text = String(value).trim();
    // Validate the full string first: parseInt alone stops at the first
    // non-digit and would accept trailing garbage.
    const parsed = /^\+?\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
    if (!Number.isSafeInteger(parsed) || parsed < 1) {
        throw new Error(`Invalid --max-interactions value: ${value}`);
    }
    return parsed;
}
|
|
24
|
+
/**
 * Default cap the CLI sends when neither `--max-interactions` nor the
 * iteration carries its own value. Picked to match the frontend's
 * conservative interactive launchers and to prevent runaway spend when an
 * iteration runs against a broken or non-responsive surface — without a
 * cap, a stuck tester can rack up hundreds of steps before the SDK gives
 * up.
 */
const DEFAULT_MAX_INTERACTIONS = 20;
/**
 * Resolves the interaction cap to use for a run.
 *
 * Precedence: explicit flag value > iteration's stored `max_interactions`
 * > `DEFAULT_MAX_INTERACTIONS`.
 *
 * @param {string|undefined} optsValue - Raw `--max-interactions` flag value,
 *   if the user passed one. Parsed (and validated) via `parseMaxInteractions`.
 * @param {object|null|undefined} iterationDetails - Iteration details object;
 *   only its `max_interactions` property is read.
 * @returns {number} A whole-number cap >= 1.
 * @throws {Error} When the flag value is present but invalid.
 */
function resolveMaxInteractions(optsValue, iterationDetails) {
    if (optsValue) {
        return parseMaxInteractions(optsValue);
    }
    const stored = iterationDetails?.max_interactions;
    // `typeof NaN === "number"`, and 0 / negative / fractional values are not
    // usable caps either. Ignore them so a malformed stored value cannot
    // disable the spend guard; fall through to the default instead.
    if (Number.isInteger(stored) && stored >= 1) {
        return stored;
    }
    return DEFAULT_MAX_INTERACTIONS;
}
|
|
23
41
|
function parseSlowMo(value) {
|
|
24
42
|
const n = parseInt(value, 10);
|
|
25
43
|
if (isNaN(n) || n < 0)
|
|
@@ -161,7 +179,7 @@ export function attachStudyRunCommands(study) {
|
|
|
161
179
|
allFlagDescription: "Use every AI profile matching the filters (workspace-wide if no filters set)",
|
|
162
180
|
})
|
|
163
181
|
.option("--config <id>", "Simulation config ID (required for media unless every profile has one)")
|
|
164
|
-
.option("--max-interactions <n>",
|
|
182
|
+
.option("--max-interactions <n>", `Max interactions per tester (interactive / media only). Precedence: flag > iteration's stored value > CLI default (${DEFAULT_MAX_INTERACTIONS}).`)
|
|
165
183
|
.option("--max-turns <n>", "Max conversation turns per tester (chat studies only)")
|
|
166
184
|
.option("--early-termination", "Allow chat agent to end the conversation early when goals are met (chat studies only)")
|
|
167
185
|
.option("--language <lang>", "Language code (e.g. en, sv)")
|
|
@@ -207,6 +225,10 @@ Examples:
|
|
|
207
225
|
# Override the simulation config (e.g. for a media study):
|
|
208
226
|
$ ish study run --config c-c3c
|
|
209
227
|
|
|
228
|
+
# Cap interactions per tester (default 20 — pass higher to allow deeper
|
|
229
|
+
# exploration, lower to cap spend on a known-broken surface):
|
|
230
|
+
$ ish study run --max-interactions 30
|
|
231
|
+
|
|
210
232
|
# Block until all simulations finish (or timeout):
|
|
211
233
|
$ ish study run --wait
|
|
212
234
|
$ ish study run --wait --timeout 600
|
|
@@ -262,6 +284,10 @@ Examples:
|
|
|
262
284
|
const modality = study.modality || "interactive";
|
|
263
285
|
const isMedia = isMediaModality(modality);
|
|
264
286
|
const isChat = isChatModality(modality);
|
|
287
|
+
// Pair-mode (tester_pair) is read off the iteration once we've
|
|
288
|
+
// resolved it below; set defaults here so the value is in scope.
|
|
289
|
+
let chatMode = "external_chatbot";
|
|
290
|
+
let isPair = false;
|
|
265
291
|
if (!study.assignments || study.assignments.length === 0) {
|
|
266
292
|
throw new Error("Study has no assignments. Add tasks with --assignments when creating the study, or use `ish study generate`.");
|
|
267
293
|
}
|
|
@@ -288,24 +314,57 @@ Examples:
|
|
|
288
314
|
// auto-creates an empty iteration A; agents who don't pass
|
|
289
315
|
// --iteration silently dispatch against it. Detect and refuse with
|
|
290
316
|
// a clear suggestion rather than masking the problem.
|
|
317
|
+
if (isChat) {
|
|
318
|
+
chatMode = readChatMode(iteration.details);
|
|
319
|
+
isPair = chatMode === "tester_pair";
|
|
320
|
+
}
|
|
291
321
|
if (!iterationHasContent(iteration.details, modality)) {
|
|
292
|
-
const flagHint = describeRequiredContentFlag(modality);
|
|
322
|
+
const flagHint = describeRequiredContentFlag(modality, isPair ? "tester_pair" : undefined);
|
|
293
323
|
const iterAlias = tagAlias(ALIAS_PREFIX.iteration, iterationId);
|
|
294
|
-
throw new Error(`Iteration "${iterationLabel}" (${iterAlias}) has no ${isMedia ? "content" : "URL"} configured yet. ` +
|
|
295
|
-
`Add ${isMedia ? "content" : "a URL"} with ` +
|
|
324
|
+
throw new Error(`Iteration "${iterationLabel}" (${iterAlias}) has no ${isMedia ? "content" : isPair ? "audiences/scenarios" : isChat ? "endpoint" : "URL"} configured yet. ` +
|
|
325
|
+
`Add ${isMedia ? "content" : isPair ? "the pair-mode payload" : isChat ? "an endpoint" : "a URL"} with ` +
|
|
296
326
|
`\`ish iteration create --study ${resolvedStudy} ${flagHint}\` ` +
|
|
297
327
|
`(or update the existing iteration via \`ish iteration update ${iterAlias} --details-json '{...}'\`), then retry.`);
|
|
298
328
|
}
|
|
299
329
|
const detailsView = readIterationDetails(iteration.details);
|
|
330
|
+
const pairConfig = isPair ? readTesterPairConfig(iteration.details) : undefined;
|
|
300
331
|
// Step 2: Resolve audience.
|
|
301
332
|
// - If any audience flag is set (--profile / --sample / --all / filter flags),
|
|
302
333
|
// resolve a fresh ID list from the workspace pool via the shared helper.
|
|
303
334
|
// - Otherwise reuse the iteration's existing testers.
|
|
335
|
+
// - For chat tester_pair iterations, audiences live inside the
|
|
336
|
+
// iteration's mode_details and are authoritative; run-time
|
|
337
|
+
// overrides are refused.
|
|
304
338
|
const profileNames = new Map();
|
|
305
339
|
const profileIds = [];
|
|
306
340
|
const existingTesters = [];
|
|
307
341
|
const audienceSet = hasAudienceFlags(opts);
|
|
308
|
-
if (
|
|
342
|
+
if (isPair) {
|
|
343
|
+
if (audienceSet) {
|
|
344
|
+
throw new Error("tester_pair chat iterations carry their own audiences inside mode_details; run-time audience overrides (--profile / --sample / --all / --country / --gender / --min-age / --max-age / --search / --visibility) are not supported. " +
|
|
345
|
+
"To change the audiences, update the iteration via `ish iteration update <id> --details-json '{...}'`.");
|
|
346
|
+
}
|
|
347
|
+
if (!pairConfig) {
|
|
348
|
+
throw new Error("Pair-mode iteration is missing mode_details; cannot dispatch.");
|
|
349
|
+
}
|
|
350
|
+
// Surface a flat profileIds[] (a then b) so downstream
|
|
351
|
+
// bookkeeping (config resolution, output) still has something to
|
|
352
|
+
// chew on. The pair-batch tester-provisioning POST below uses
|
|
353
|
+
// the split lists, not this flat one.
|
|
354
|
+
for (const pid of pairConfig.audience_a) {
|
|
355
|
+
if (!profileNames.has(pid)) {
|
|
356
|
+
profileNames.set(pid, "");
|
|
357
|
+
profileIds.push(pid);
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
for (const pid of pairConfig.audience_b) {
|
|
361
|
+
if (!profileNames.has(pid)) {
|
|
362
|
+
profileNames.set(pid, "");
|
|
363
|
+
profileIds.push(pid);
|
|
364
|
+
}
|
|
365
|
+
}
|
|
366
|
+
}
|
|
367
|
+
else if (audienceSet) {
|
|
309
368
|
const resolved = await resolveAudienceProfileIds(client, resolvedWorkspace, opts, { requireSimulatable: false, allFlagName: "--all" });
|
|
310
369
|
profileIds.push(...resolved);
|
|
311
370
|
}
|
|
@@ -322,16 +381,28 @@ Examples:
|
|
|
322
381
|
}
|
|
323
382
|
}
|
|
324
383
|
}
|
|
325
|
-
|
|
326
|
-
|
|
384
|
+
// Pair iterations never reuse the iteration's existing tester roster;
|
|
385
|
+
// they reuse prior conversations or seed fresh ones via pair-batch.
|
|
386
|
+
const reuseExistingTesters = !isPair && !audienceSet && existingTesters.length > 0;
|
|
387
|
+
// Pair iterations with criteria-only audiences will have empty
|
|
388
|
+
// profileIds at this stage if the backend deferred resolution past
|
|
389
|
+
// iteration create. That's a valid state — skip the
|
|
390
|
+
// "no audience flags" guard for them and let dispatch surface any
|
|
391
|
+
// backend-side resolution errors (e.g. pool too small).
|
|
392
|
+
const pairCriteriaOnly = isPair && !!pairConfig && profileIds.length === 0
|
|
393
|
+
&& (!!pairConfig.role_criteria_a || !!pairConfig.role_criteria_b);
|
|
394
|
+
if (profileIds.length === 0 && !pairCriteriaOnly) {
|
|
327
395
|
throw new Error(`Iteration "${iterationLabel}" has no testers and no audience flags were given. ` +
|
|
328
396
|
"Pass --profile <ids>, or filter flags (--country, --gender, --min-age, --max-age, --search, --visibility) with --sample <N> or --all.");
|
|
329
397
|
}
|
|
330
398
|
// Step 3: Resolve simulation config (per-profile fallback for
|
|
331
|
-
// media + chat, both of which require a config_id
|
|
399
|
+
// media + chat external_chatbot, both of which require a config_id
|
|
400
|
+
// per batch item). Pair-mode chat dispatch is per-conversation,
|
|
401
|
+
// not per-tester; the backend resolves configs via the tester rows
|
|
402
|
+
// it creates on /testers/pair-batch, so the CLI doesn't pre-fetch.
|
|
332
403
|
const resolvedConfigOverride = opts.config ? resolveId(opts.config) : undefined;
|
|
333
404
|
const profileConfigMap = new Map();
|
|
334
|
-
if ((isMedia || isChat) && !resolvedConfigOverride) {
|
|
405
|
+
if ((isMedia || (isChat && !isPair)) && !resolvedConfigOverride) {
|
|
335
406
|
for (const pid of profileIds) {
|
|
336
407
|
const profile = await client.get(`/tester-profiles/${pid}`);
|
|
337
408
|
if (profile.simulation_config_id) {
|
|
@@ -352,9 +423,63 @@ Examples:
|
|
|
352
423
|
log(` Modality: ${modality}`);
|
|
353
424
|
if (study.content_type)
|
|
354
425
|
log(` Content type: ${study.content_type}`);
|
|
355
|
-
if (
|
|
356
|
-
|
|
357
|
-
|
|
426
|
+
if (isPair && pairConfig) {
|
|
427
|
+
log(` Chat mode: tester_pair`);
|
|
428
|
+
// Audience description per side: prefer explicit count when
|
|
429
|
+
// present; otherwise show the criteria filter that the backend
|
|
430
|
+
// will resolve into a pool.
|
|
431
|
+
// Renders one side of the pair for the confirmation summary: an explicit
// profile count when the list is non-empty, otherwise the criteria summary
// (or a dash when there is nothing to summarize).
const describeSide = (audLen, crit) => {
    if (audLen > 0) {
        const criteriaNote = crit ? ` (criteria validates list)` : "";
        return `${audLen} profile(s)${criteriaNote}`;
    }
    const criteriaSummary = summarizeRoleCriteria(crit);
    if (!criteriaSummary) {
        return "—";
    }
    return `criteria (${criteriaSummary}) — pool resolved server-side`;
};
|
|
437
|
+
log(` Audience A: ${describeSide(pairConfig.audience_a.length, pairConfig.role_criteria_a)}`);
|
|
438
|
+
log(` Audience B: ${describeSide(pairConfig.audience_b.length, pairConfig.role_criteria_b)}`);
|
|
439
|
+
const explicitConvs = Math.min(pairConfig.audience_a.length, pairConfig.audience_b.length);
|
|
440
|
+
const criteriaResolved = !!pairConfig.role_criteria_a || !!pairConfig.role_criteria_b;
|
|
441
|
+
if (explicitConvs > 0 && !criteriaResolved) {
|
|
442
|
+
log(` Conversations: ${explicitConvs} (1:1 by index)`);
|
|
443
|
+
}
|
|
444
|
+
else {
|
|
445
|
+
log(` Conversations: resolved server-side from criteria`);
|
|
446
|
+
}
|
|
447
|
+
// Scale preview: rough LLM-call estimate so the user knows
|
|
448
|
+
// what they're committing to before --yes lands. Formula
|
|
449
|
+
// matches the backend's billing pre-flight
|
|
450
|
+
// (chat_credit_cost(turns) * 2 * conv_count, where the *2
|
|
451
|
+
// accounts for one LLM call per side per turn). Doesn't
|
|
452
|
+
// claim exact credit cost — just shape + magnitude.
|
|
453
|
+
const turnsEstimate = opts.maxTurns
|
|
454
|
+
? parseInt(opts.maxTurns, 10)
|
|
455
|
+
: (typeof iteration.details?.max_turns === "number"
|
|
456
|
+
? iteration.details.max_turns
|
|
457
|
+
: 14);
|
|
458
|
+
if (explicitConvs > 0 && !criteriaResolved && Number.isFinite(turnsEstimate)) {
|
|
459
|
+
const est = estimateChatPair({ conversationCount: explicitConvs, maxTurns: turnsEstimate });
|
|
460
|
+
log(` Scale: ${explicitConvs} conv × ${turnsEstimate} turns × 2 sides ≈ ${explicitConvs * turnsEstimate * 2} LLM calls (upper bound — early-termination may shorten)`);
|
|
461
|
+
log(` Credits (est): ≈ ${est.upper_bound} credit(s) upper bound — see \`ish docs get-page reference/credits\``);
|
|
462
|
+
}
|
|
463
|
+
else if (criteriaResolved) {
|
|
464
|
+
log(` Scale: ~N conv × ${turnsEstimate} turns × 2 sides — N resolved server-side`);
|
|
465
|
+
log(` Credits (est): N × max(1, round(${turnsEstimate}/10)) × 2 — N resolved server-side`);
|
|
466
|
+
}
|
|
467
|
+
log(` Initiator: side ${pairConfig.initiator_side}`);
|
|
468
|
+
const scenAPreview = pairConfig.scenario_a.replace(/\s+/g, " ").trim().slice(0, 60);
|
|
469
|
+
const scenBPreview = pairConfig.scenario_b.replace(/\s+/g, " ").trim().slice(0, 60);
|
|
470
|
+
log(` Scenario A: ${scenAPreview}${pairConfig.scenario_a.length > 60 ? "…" : ""}`);
|
|
471
|
+
log(` Scenario B: ${scenBPreview}${pairConfig.scenario_b.length > 60 ? "…" : ""}`);
|
|
472
|
+
if (opts.maxTurns)
|
|
473
|
+
log(` Max turns: ${opts.maxTurns}`);
|
|
474
|
+
if (opts.earlyTermination)
|
|
475
|
+
log(` Early term: enabled`);
|
|
476
|
+
}
|
|
477
|
+
else if (isChat) {
|
|
478
|
+
const md = iteration.details?.mode_details;
|
|
479
|
+
const epId = (typeof md?.chatbot_endpoint_id === "string" && md.chatbot_endpoint_id)
|
|
480
|
+
|| (typeof iteration.details?.chatbot_endpoint_id === "string"
|
|
481
|
+
? iteration.details.chatbot_endpoint_id
|
|
482
|
+
: undefined);
|
|
358
483
|
if (epId)
|
|
359
484
|
log(` Endpoint: ${epId}`);
|
|
360
485
|
if (opts.maxTurns)
|
|
@@ -375,10 +500,40 @@ Examples:
|
|
|
375
500
|
log(` Config: ${resolvedConfigOverride}`);
|
|
376
501
|
if (opts.language)
|
|
377
502
|
log(` Language: ${opts.language}`);
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
const
|
|
381
|
-
|
|
503
|
+
if (!isPair) {
|
|
504
|
+
log(` Profiles (${profileIds.length}):`);
|
|
505
|
+
for (const pid of profileIds) {
|
|
506
|
+
const name = profileNames.get(pid);
|
|
507
|
+
log(` - ${name ? `${name} (${pid})` : pid}`);
|
|
508
|
+
}
|
|
509
|
+
const testerCount = profileIds.length;
|
|
510
|
+
if (testerCount > 0) {
|
|
511
|
+
if (isChat) {
|
|
512
|
+
const turnsForChat = opts.maxTurns
|
|
513
|
+
? parseInt(opts.maxTurns, 10)
|
|
514
|
+
: (typeof iteration.details?.max_turns === "number"
|
|
515
|
+
? iteration.details.max_turns
|
|
516
|
+
: 14);
|
|
517
|
+
if (Number.isFinite(turnsForChat)) {
|
|
518
|
+
const est = estimateChatSolo({ testerCount, maxTurns: turnsForChat });
|
|
519
|
+
log(` Credits (est): ≈ ${est.upper_bound} credit(s) upper bound — ${est.breakdown}`);
|
|
520
|
+
}
|
|
521
|
+
}
|
|
522
|
+
else {
|
|
523
|
+
const stepsForMedia = resolveMaxInteractions(opts.maxInteractions, iteration.details);
|
|
524
|
+
const source = opts.maxInteractions
|
|
525
|
+
? "from --max-interactions"
|
|
526
|
+
: typeof iteration.details?.max_interactions === "number"
|
|
527
|
+
? "from iteration"
|
|
528
|
+
: `CLI default — pass --max-interactions to override`;
|
|
529
|
+
log(` Max steps: ${stepsForMedia} (${source})`);
|
|
530
|
+
if (Number.isFinite(stepsForMedia)) {
|
|
531
|
+
const est = estimateMediaRun({ testerCount, maxInteractions: stepsForMedia });
|
|
532
|
+
log(` Credits (est): ≈ ${est.upper_bound} credit(s) upper bound — ${est.breakdown}`);
|
|
533
|
+
}
|
|
534
|
+
}
|
|
535
|
+
log(` See \`ish docs get-page reference/credits\` for formula.`);
|
|
536
|
+
}
|
|
382
537
|
}
|
|
383
538
|
log("");
|
|
384
539
|
const rl = readline.createInterface({ input: process.stdin, output: process.stderr });
|
|
@@ -395,7 +550,83 @@ Examples:
|
|
|
395
550
|
}
|
|
396
551
|
// Step 5: Either reuse the iteration's testers or batch-create new ones
|
|
397
552
|
let createdTesters;
|
|
398
|
-
|
|
553
|
+
// Pair-mode bookkeeping: the dispatch endpoint takes
|
|
554
|
+
// `conversation_ids`, not tester ids. We populate this list either
|
|
555
|
+
// by reusing the iteration's existing Conversation rows or by
|
|
556
|
+
// calling pair-batch.
|
|
557
|
+
let pairConversationIds = [];
|
|
558
|
+
if (isPair && pairConfig) {
|
|
559
|
+
// Pair-mode flow mirrors the MCP (`ish-mcp` `_run_pair_mode`):
|
|
560
|
+
// 1. If the iteration already carries `conversations[]` from a
|
|
561
|
+
// prior dispatch, reuse them — skip pair-batch entirely.
|
|
562
|
+
// 2. Otherwise call pair-batch with the resolved
|
|
563
|
+
// audience UUID lists. Criteria-only iterations should
|
|
564
|
+
// already have audiences materialised at iteration-create
|
|
565
|
+
// time; if they're still empty here, the backend's
|
|
566
|
+
// `PairAudienceResolutionError` is the authoritative
|
|
567
|
+
// failure mode — refuse before hitting pair-batch.
|
|
568
|
+
//
|
|
569
|
+
// Wire shapes per backend `app/api/iterations/routers`:
|
|
570
|
+
// POST /iterations/{id}/testers/pair-batch
|
|
571
|
+
// body : { side_a: UUID[1..20], side_b: UUID[1..20] (equal len),
|
|
572
|
+
// language?: str }
|
|
573
|
+
// reply : { conversations: [{ conversation_id, pair_index,
|
|
574
|
+
// tester_a_id, tester_b_id }] }
|
|
575
|
+
const existingConvs = iteration.conversations ?? [];
|
|
576
|
+
const reusable = [];
|
|
577
|
+
for (const c of existingConvs) {
|
|
578
|
+
const cid = c.conversation_id || c.id;
|
|
579
|
+
if (cid && c.tester_a_id && c.tester_b_id) {
|
|
580
|
+
reusable.push({ conversation_id: cid, tester_a_id: c.tester_a_id, tester_b_id: c.tester_b_id });
|
|
581
|
+
}
|
|
582
|
+
}
|
|
583
|
+
let pairRows;
|
|
584
|
+
if (reusable.length > 0) {
|
|
585
|
+
pairRows = reusable;
|
|
586
|
+
log(`Reusing ${reusable.length} existing conversation${reusable.length > 1 ? "s" : ""} on iteration "${iterationLabel}"`);
|
|
587
|
+
}
|
|
588
|
+
else {
|
|
589
|
+
if (pairConfig.audience_a.length === 0 || pairConfig.audience_b.length === 0) {
|
|
590
|
+
throw new Error("Pair-mode iteration has empty audience_a / audience_b and no conversations yet. " +
|
|
591
|
+
"If this iteration was created with --role-criteria-a/-b, the backend should have " +
|
|
592
|
+
"resolved a profile pool at create time — try `ish iteration get <id>` to fetch a " +
|
|
593
|
+
"fresh shape, or recreate with explicit --profile-a/-b.");
|
|
594
|
+
}
|
|
595
|
+
log(`Provisioning ${pairConfig.audience_a.length} pair conversation${pairConfig.audience_a.length > 1 ? "s" : ""}...`);
|
|
596
|
+
const pairBatchResult = await client.post(`/iterations/${iterationId}/testers/pair-batch`, {
|
|
597
|
+
side_a: pairConfig.audience_a,
|
|
598
|
+
side_b: pairConfig.audience_b,
|
|
599
|
+
...(opts.language && { language: opts.language }),
|
|
600
|
+
}, { timeout: dispatchTimeoutMs });
|
|
601
|
+
pairRows = (pairBatchResult.conversations ?? []).map((c) => ({
|
|
602
|
+
conversation_id: c.conversation_id,
|
|
603
|
+
tester_a_id: c.tester_a_id,
|
|
604
|
+
tester_b_id: c.tester_b_id,
|
|
605
|
+
}));
|
|
606
|
+
if (pairRows.length === 0) {
|
|
607
|
+
throw new Error("Pair-batch returned no conversations. The backend response did not include any conversation IDs.");
|
|
608
|
+
}
|
|
609
|
+
log(`Created ${pairRows.length * 2} testers (${pairRows.length} conversation${pairRows.length > 1 ? "s" : ""})`);
|
|
610
|
+
}
|
|
611
|
+
pairConversationIds = pairRows.map((r) => r.conversation_id);
|
|
612
|
+
// Flatten both sides' tester IDs for downstream bookkeeping:
|
|
613
|
+
// error-tagging (`seeded_but_not_dispatched_ids`), poll filtering,
|
|
614
|
+
// and JSON output. Names aren't returned by pair-batch; agents
|
|
615
|
+
// who care can correlate via `ish iteration get <id>`.
|
|
616
|
+
createdTesters = [];
|
|
617
|
+
for (let i = 0; i < pairRows.length; i++) {
|
|
618
|
+
const row = pairRows[i];
|
|
619
|
+
createdTesters.push({
|
|
620
|
+
id: row.tester_a_id,
|
|
621
|
+
tester_profile: { name: `pair ${i} side A` },
|
|
622
|
+
});
|
|
623
|
+
createdTesters.push({
|
|
624
|
+
id: row.tester_b_id,
|
|
625
|
+
tester_profile: { name: `pair ${i} side B` },
|
|
626
|
+
});
|
|
627
|
+
}
|
|
628
|
+
}
|
|
629
|
+
else if (reuseExistingTesters && existingTesters.length > 0) {
|
|
399
630
|
createdTesters = existingTesters;
|
|
400
631
|
log(`Reusing ${createdTesters.length} existing tester${createdTesters.length > 1 ? "s" : ""} from iteration "${iterationLabel}"`);
|
|
401
632
|
}
|
|
@@ -430,7 +661,7 @@ Examples:
|
|
|
430
661
|
url: detailsView.url,
|
|
431
662
|
screenFormat: detailsView.screenFormat,
|
|
432
663
|
locale: detailsView.locale,
|
|
433
|
-
maxInteractions: opts.maxInteractions
|
|
664
|
+
maxInteractions: resolveMaxInteractions(opts.maxInteractions, iteration.details),
|
|
434
665
|
headed: !!opts.headed,
|
|
435
666
|
slowMo: opts.slowMo ? parseSlowMo(opts.slowMo) : undefined,
|
|
436
667
|
devtools: opts.devtools,
|
|
@@ -479,23 +710,66 @@ Examples:
|
|
|
479
710
|
}
|
|
480
711
|
};
|
|
481
712
|
if (isChat) {
|
|
482
|
-
const chatBatchItems = createdTesters.map((t, i) => ({
|
|
483
|
-
study_id: resolvedStudy,
|
|
484
|
-
tester_id: t.id,
|
|
485
|
-
config_id: resolvedConfigOverride || profileConfigMap.get(profileIds[i]),
|
|
486
|
-
...(opts.language && { language: opts.language }),
|
|
487
|
-
}));
|
|
488
713
|
const maxTurns = opts.maxTurns ? parseInt(opts.maxTurns, 10) : undefined;
|
|
489
714
|
if (opts.maxTurns !== undefined && (Number.isNaN(maxTurns) || maxTurns < 1)) {
|
|
490
715
|
throw new Error(`Invalid --max-turns value: ${opts.maxTurns}`);
|
|
491
716
|
}
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
717
|
+
if (isPair) {
|
|
718
|
+
if (!pairConfig || pairConversationIds.length === 0) {
|
|
719
|
+
throw new Error("Pair-mode dispatch reached without provisioned conversations — internal invariant violation.");
|
|
720
|
+
}
|
|
721
|
+
// Pair-mode dispatch (backend
|
|
722
|
+
// `app/api/simulation/routers/chat.py`):
|
|
723
|
+
// POST /simulation/chat/pair/start/batch
|
|
724
|
+
// body : { product_id, study_id,
|
|
725
|
+
// conversation_ids: UUID[1..20],
|
|
726
|
+
// config_id, # singular per batch
|
|
727
|
+
// max_turns?, language?, config_overrides? }
|
|
728
|
+
// One Cloud Task per conversation_id. Billing is
|
|
729
|
+
// chat_credit_cost(max_turns) * 2 * len(conversation_ids).
|
|
730
|
+
let pairConfigId = resolvedConfigOverride;
|
|
731
|
+
if (!pairConfigId) {
|
|
732
|
+
// Fall back to the first audience_a profile's
|
|
733
|
+
// simulation_config_id. Pair dispatch takes a single config
|
|
734
|
+
// for the whole batch, so we don't need the per-profile map
|
|
735
|
+
// the external_chatbot path builds.
|
|
736
|
+
const fallbackProfileId = pairConfig.audience_a[0];
|
|
737
|
+
if (!fallbackProfileId) {
|
|
738
|
+
throw new Error("Pair-mode dispatch requires --config <id>: the iteration has no audience profile to draw a default config_id from.");
|
|
739
|
+
}
|
|
740
|
+
const fallbackProfile = await client.get(`/tester-profiles/${fallbackProfileId}`);
|
|
741
|
+
if (!fallbackProfile.simulation_config_id) {
|
|
742
|
+
throw new Error(`Pair-mode dispatch requires a config_id. Profile ${fallbackProfileId} has no simulation config assigned and --config was not passed.\n` +
|
|
743
|
+
"Use --config <id> to specify one, or assign a config to the profile.\n" +
|
|
744
|
+
"List configs with: ish config list");
|
|
745
|
+
}
|
|
746
|
+
pairConfigId = fallbackProfile.simulation_config_id;
|
|
747
|
+
}
|
|
748
|
+
const simResult = await dispatchAttempt(() => client.post("/simulation/chat/pair/start/batch", {
|
|
749
|
+
product_id: resolvedWorkspace,
|
|
750
|
+
study_id: resolvedStudy,
|
|
751
|
+
conversation_ids: pairConversationIds,
|
|
752
|
+
config_id: pairConfigId,
|
|
753
|
+
...(maxTurns !== undefined && { max_turns: maxTurns }),
|
|
754
|
+
...(opts.language && { language: opts.language }),
|
|
755
|
+
}, { timeout: dispatchTimeoutMs }));
|
|
756
|
+
simResults = simResult.results;
|
|
757
|
+
}
|
|
758
|
+
else {
|
|
759
|
+
const chatBatchItems = createdTesters.map((t, i) => ({
|
|
760
|
+
study_id: resolvedStudy,
|
|
761
|
+
tester_id: t.id,
|
|
762
|
+
config_id: resolvedConfigOverride || profileConfigMap.get(profileIds[i]),
|
|
763
|
+
...(opts.language && { language: opts.language }),
|
|
764
|
+
}));
|
|
765
|
+
const simResult = await dispatchAttempt(() => client.post("/simulation/chat/start/batch", {
|
|
766
|
+
product_id: resolvedWorkspace,
|
|
767
|
+
simulations: chatBatchItems,
|
|
768
|
+
...(maxTurns !== undefined && { max_turns: maxTurns }),
|
|
769
|
+
...(opts.earlyTermination && { early_termination: true }),
|
|
770
|
+
}, { timeout: dispatchTimeoutMs }));
|
|
771
|
+
simResults = simResult.results;
|
|
772
|
+
}
|
|
499
773
|
}
|
|
500
774
|
else if (isMedia) {
|
|
501
775
|
const mediaBatchItems = createdTesters.map((t, i) => ({
|
|
@@ -507,7 +781,7 @@ Examples:
|
|
|
507
781
|
const simResult = await dispatchAttempt(() => client.post("/simulation/media/start/batch", {
|
|
508
782
|
product_id: resolvedWorkspace,
|
|
509
783
|
simulations: mediaBatchItems,
|
|
510
|
-
|
|
784
|
+
max_interactions: resolveMaxInteractions(opts.maxInteractions, iteration.details),
|
|
511
785
|
}, { timeout: dispatchTimeoutMs }));
|
|
512
786
|
simResults = simResult.results;
|
|
513
787
|
}
|
|
@@ -525,10 +799,78 @@ Examples:
|
|
|
525
799
|
platform: detailsView.platform || "browser",
|
|
526
800
|
...(detailsView.url && { url: detailsView.url }),
|
|
527
801
|
screen_format: detailsView.screenFormat || "desktop",
|
|
528
|
-
|
|
802
|
+
max_interactions: resolveMaxInteractions(opts.maxInteractions, iteration.details),
|
|
529
803
|
}, { timeout: dispatchTimeoutMs }));
|
|
530
804
|
simResults = simResult.results;
|
|
531
805
|
}
|
|
806
|
+
// Pair-mode preview block: surface the audience sizes + scenario
|
|
807
|
+
// previews + initiator in the JSON envelope so agents can verify
|
|
808
|
+
// what they just dispatched without needing a follow-up
|
|
809
|
+
// `iteration get`. Mirrors the human confirmation block (which is
|
|
810
|
+
// skipped under -y or --json).
|
|
811
|
+
const pairPreviewTurns = opts.maxTurns
|
|
812
|
+
? parseInt(opts.maxTurns, 10)
|
|
813
|
+
: (typeof iteration.details?.max_turns === "number"
|
|
814
|
+
? iteration.details.max_turns
|
|
815
|
+
: 14);
|
|
816
|
+
const pairPreview = isPair && pairConfig ? {
|
|
817
|
+
mode: "tester_pair",
|
|
818
|
+
audience_a_size: pairConfig.audience_a.length,
|
|
819
|
+
audience_b_size: pairConfig.audience_b.length,
|
|
820
|
+
// Post-dispatch we know the actual conversation count from the
|
|
821
|
+
// pair-batch (or reuse) result. This is the authoritative number
|
|
822
|
+
// — better than guessing from audience length, which may diverge
|
|
823
|
+
// when the backend trims to the smaller side.
|
|
824
|
+
conversation_count: pairConversationIds.length,
|
|
825
|
+
conversation_ids: pairConversationIds,
|
|
826
|
+
// Scale preview: matches the backend's billing-preflight
|
|
827
|
+
// formula (chat_credit_cost(turns) * 2 * conv_count). Upper
|
|
828
|
+
// bound — early-termination may shorten actual turns. The CLI
|
|
829
|
+
// doesn't claim exact credit cost; just call magnitude.
|
|
830
|
+
max_turns: Number.isFinite(pairPreviewTurns) ? pairPreviewTurns : null,
|
|
831
|
+
llm_calls_upper_bound: Number.isFinite(pairPreviewTurns)
|
|
832
|
+
? pairConversationIds.length * pairPreviewTurns * 2
|
|
833
|
+
: null,
|
|
834
|
+
// Credit cost upper bound — mirrors backend's chat_credit_cost × 2 × conv.
|
|
835
|
+
// Don't claim exactness; surface formula key so agents can branch
|
|
836
|
+
// on shape. Live rates will move to `GET /billing/rates` later.
|
|
837
|
+
credit_estimate: Number.isFinite(pairPreviewTurns)
|
|
838
|
+
? estimateChatPair({
|
|
839
|
+
conversationCount: pairConversationIds.length,
|
|
840
|
+
maxTurns: pairPreviewTurns,
|
|
841
|
+
})
|
|
842
|
+
: null,
|
|
843
|
+
initiator_side: pairConfig.initiator_side,
|
|
844
|
+
scenario_a_preview: pairConfig.scenario_a.replace(/\s+/g, " ").trim().slice(0, 200),
|
|
845
|
+
scenario_b_preview: pairConfig.scenario_b.replace(/\s+/g, " ").trim().slice(0, 200),
|
|
846
|
+
...(pairConfig.role_criteria_a && { role_criteria_a: pairConfig.role_criteria_a }),
|
|
847
|
+
...(pairConfig.role_criteria_b && { role_criteria_b: pairConfig.role_criteria_b }),
|
|
848
|
+
} : undefined;
|
|
849
|
+
// Non-pair credit estimate — surfaced as a top-level field in the
|
|
850
|
+
// JSON envelope alongside `pair_preview.credit_estimate`. Mirrors
|
|
851
|
+
// backend formulas (`media_credit_cost` / `chat_credit_cost`).
|
|
852
|
+
// null when we can't estimate (criteria-only audience, etc.).
|
|
853
|
+
const nonPairCreditEstimate = (() => {
    // Pair runs carry their estimate inside the pair preview instead.
    if (isPair) {
        return null;
    }
    const testerCount = createdTesters.length || profileIds.length;
    if (testerCount <= 0) {
        return null;
    }
    if (!isChat) {
        // Step-capped runs: estimate from the resolved interaction cap.
        const steps = resolveMaxInteractions(opts.maxInteractions, iteration.details);
        return Number.isFinite(steps)
            ? estimateMediaRun({ testerCount, maxInteractions: steps })
            : null;
    }
    // Chat runs: flag value wins, then the iteration's stored turns, then 14.
    let turns;
    if (opts.maxTurns) {
        turns = parseInt(opts.maxTurns, 10);
    }
    else if (typeof iteration.details?.max_turns === "number") {
        turns = iteration.details.max_turns;
    }
    else {
        turns = 14;
    }
    return Number.isFinite(turns)
        ? estimateChatSolo({ testerCount, maxTurns: turns })
        : null;
})();
|
|
532
874
|
if (!opts.wait) {
|
|
533
875
|
if (globals.json) {
|
|
534
876
|
const testersOut = createdTesters.map((t) => ({
|
|
@@ -541,6 +883,9 @@ Examples:
|
|
|
541
883
|
testers: testersOut,
|
|
542
884
|
tester_ids: testersOut.map((t) => t.id),
|
|
543
885
|
tester_aliases: testersOut.map((t) => t.alias),
|
|
886
|
+
url: getWebUrl(globals, `/${resolvedWorkspace}/${resolvedStudy}/timeline`),
|
|
887
|
+
...(pairPreview && { pair_preview: pairPreview }),
|
|
888
|
+
...(nonPairCreditEstimate && { credit_estimate: nonPairCreditEstimate }),
|
|
544
889
|
simulations: dedupeSimulations(simResults),
|
|
545
890
|
}, true);
|
|
546
891
|
}
|
|
@@ -577,6 +922,9 @@ Examples:
|
|
|
577
922
|
testers: testersOut,
|
|
578
923
|
tester_ids: testersOut.map((t) => t.id),
|
|
579
924
|
tester_aliases: testersOut.map((t) => t.alias),
|
|
925
|
+
url: getWebUrl(globals, `/${resolvedWorkspace}/${resolvedStudy}/timeline`),
|
|
926
|
+
...(pairPreview && { pair_preview: pairPreview }),
|
|
927
|
+
...(nonPairCreditEstimate && { credit_estimate: nonPairCreditEstimate }),
|
|
580
928
|
simulations: dedupeSimulations(simResults),
|
|
581
929
|
results: rows,
|
|
582
930
|
}, true);
|