npm - @ishlabs/cli - Versions diffs - 0.13.0 → 0.14.1 - Mend

@ishlabs/cli 0.13.0 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

package/dist/commands/iteration.js +219 -22
package/dist/commands/profile.js +75 -9
package/dist/commands/source.js +6 -4
package/dist/commands/study-run.js +382 -34
package/dist/commands/study.js +170 -9
package/dist/commands/workspace.js +35 -2
package/dist/lib/accessibility-profile.d.ts +12 -0
package/dist/lib/accessibility-profile.js +136 -0
package/dist/lib/ask-questions.js +9 -0
package/dist/lib/billing.d.ts +55 -0
package/dist/lib/billing.js +77 -0
package/dist/lib/docs.js +1106 -36
package/dist/lib/enums.d.ts +54 -0
package/dist/lib/enums.js +100 -0
package/dist/lib/local-sim/actions.d.ts +2 -1
package/dist/lib/local-sim/actions.js +88 -13
package/dist/lib/local-sim/loop.js +49 -19
package/dist/lib/local-sim/tabs.d.ts +27 -0
package/dist/lib/local-sim/tabs.js +157 -0
package/dist/lib/local-sim/types.d.ts +15 -0
package/dist/lib/modality.d.ts +70 -1
package/dist/lib/modality.js +323 -17
package/dist/lib/output.js +61 -4
package/dist/lib/skill-content.js +382 -19
package/dist/lib/types.d.ts +6 -1
package/package.json +1 -1

package/dist/lib/docs.js CHANGED Viewed

@@ -123,8 +123,43 @@ A \`null\` value on a \`*_max\` field means "unlimited" (paid tiers).
 Branch on \`studies_used >= studies_max\` before \`study create\`,
 likewise for \`testers_used\` before \`study run --sample\`.
+## Cold start — \`workspace_create\` is not safe to call blind
+On a saturated account, calling \`workspace_create\` (or
+\`ish workspace create\`) without first inspecting state returns
+\`error_code: usage_limit_reached\` immediately. A first-time agent
+that was told "create a fresh workspace, then run a study" will trip
+the cap on the very first call. **Always inspect existing workspaces
+first** — \`ish workspace list\` / \`workspace_get\` returns per-row
+metadata so you can pick a reuse target rather than blindly creating.
+Each row in the list response carries:
+- \`last_activity_at\` — most recent run, iteration, ask, or write on
+  this workspace. Pick the most recently active workspace if you want
+  one the user is likely already thinking about.
+- \`child_counts\` — \`{ studies, asks, tester_profiles }\`. Zero across
+  the board = a quiet/empty workspace, safe to reuse without
+  cluttering anyone's view.
+- \`has_headroom\` — \`true\` if the workspace is below
+  \`maxStudiesPerProduct\`, \`maxIterationsPerStudy\`, and
+  \`maxCustomTesterProfiles\` for the caller's tier. Branch on this
+  before \`study create\` / \`profile generate\` — \`false\` here will be
+  \`usage_limit_reached\` on the next call.
+For the idempotent create-or-reuse-by-name path, use
+\`ish workspace create --name <name> --ensure\`: returns the existing
+workspace owned by the caller if one with that name exists, otherwise
+creates a fresh one. Safe to call from a cold-start script without
+first scraping the list.
+The full saturated-account walkthrough (with branch logic + a worked
+transcript) lives at \`guides/cold-start\`.
 ## Related
+- \`guides/cold-start\` — saturated-account first-step playbook
+  (\`workspace_get\` → inspect headroom → reuse or \`--ensure\`).
 - \`concepts/secret\` — per-workspace secrets used in chatbot endpoint
   headers via \`{{secret:KEY}}\` placeholders.
 - \`reference/billing-limits\` — \`maxProducts\` cap on workspace creation.
@@ -134,7 +169,8 @@ const CONCEPT_STUDY = `# concept: study
 A **study** is the persistent research artifact. It defines:
 - \`modality\`: \`interactive\` (the tester drives a real browser), one of
   \`text | video | audio | image | document\` (media reaction studies),
-  or \`chat\` (multi-turn probe against an external chatbot endpoint).
+  or \`chat\` (multi-turn conversation — either with an external chatbot
+  endpoint or between two AI personas via tester_pair mode).
 - \`content_type\` (media studies only): \`email | social_post | ad | …\` —
   controls the framing the tester is given.
 - \`assignments\`: the tasks the tester performs. See \`concepts/assignment\`.
@@ -168,7 +204,7 @@ test artifact and don't need to A/B iterations:
 | \`video\`       | \`--content-url <url>\`                              |
 | \`audio\`       | \`--content-url <url>\`                              |
 | \`document\`    | \`--content-url <url>\`                              |
-| \`chat\`        | \`--endpoint <id>\` or \`--endpoint-config <file>\`  |
+| \`chat\`        | \`--endpoint <id>\` or \`--endpoint-config <file>\` (external_chatbot mode), or \`--chat-mode tester_pair --audience-a/-b --scenario-a/-b\` (two-AI rehearsal) |
 \`\`\`
 # Text — single email artifact:
@@ -261,14 +297,22 @@ pick was wrong.
 - \`concepts/questionnaire\` — question types and timing.
 - \`concepts/run-verbs\` — when to use \`study run\` vs \`ask run\`.
 - \`reference/billing-limits\` — \`maxStudiesPerProduct\` cap on study creation.
+- \`reference/credits\` — per-run credit cost & how to preview before dispatch.
 `;
 const CONCEPT_ITERATION = `# concept: iteration
 An **iteration** is one configured run of a study. It carries the
 volatile bits — the URL (interactive), the media (video/text/etc.), or
-the chatbot endpoint (chat) — while the study carries the persistent
+the chat payload (chat) — while the study carries the persistent
 shape (assignments, questionnaire, modality).
+For chat modality, the iteration's \`details.mode_details\` discriminator
+selects between **external_chatbot** (testers probe a customer chatbot
+endpoint) and **tester_pair** (two AI tester audiences converse with
+each other, one Conversation per pair index). Wire-shape examples and
+pair-mode rules live under the "## Chat modality" section below; the
+full chat-author workflow is at \`guides/chat\`.
 - Alias prefix: \`i-\`
 - A study has 1..N iterations. \`ish study run\` defaults to the latest.
 - Local files passed to \`--content-url\`, \`--image-urls\`, etc. are
@@ -311,9 +355,15 @@ ish iteration create --image-urls "./a.png,./b.png"
 # Document (PDF):
 ish iteration create --content-url ./report.pdf
-# Chat — probe a saved chatbot endpoint:
+# Chat (external_chatbot) — probe a saved chatbot endpoint:
 ish iteration create --chat-endpoint-id ce-... --max-turns 10 --early-termination
+# Chat (tester_pair) — rehearse a conversation between two AI audiences:
+ish iteration create --chat-mode tester_pair \\
+  --audience-a tp-a1,tp-a2 --audience-b tp-b1,tp-b2 \\
+  --scenario-a "You're a senior sales rep pitching ish." \\
+  --scenario-b "You're a skeptical CTO evaluating ish."
 # Inspect:
 ish iteration list --study s-b2c
 ish iteration get i-d4e
@@ -401,22 +451,279 @@ paragraph-by-paragraph reactions to a long caption. Use the
 ## Chat modality
-Chat iterations probe an external chatbot endpoint by having a tester
-hold a multi-turn conversation against it. Two ways to wire the
-endpoint:
+Chat iterations hold a multi-turn conversation. The conversation can
+take one of two shapes, picked by the \`mode_details.mode\` discriminator
+on the iteration:
+- **\`external_chatbot\`** — a tester talks to a customer chatbot
+  endpoint (the original chat behaviour). The endpoint config or saved
+  chatbot-endpoint reference lives at
+  \`details.mode_details.endpoint\` / \`details.mode_details.chatbot_endpoint_id\`.
+- **\`tester_pair\`** — two AI tester profiles talk to each other.
+  audience_a and audience_b pair 1:1 by index when counts match (N
+  pairs → N conversations); a side of exactly 1 broadcasts across the
+  other side (so 1 × N → N conversations all sharing the lone profile).
+  Each side carries its own scenario + goal; the other side does not
+  see it (the **asymmetry contract**). Useful for rehearsing a pitch, a
+  difficult conversation, a sales call, or any two-role scenario before
+  it happens.
+Wire-shape:
-\`\`\`
-# Reference a saved endpoint row (recommended — reproducible):
-ish iteration create --chat-endpoint-id ce-...
+\`\`\`jsonc
+// external_chatbot
+{
+  "type": "chat",
+  "mode_details": {
+    "mode": "external_chatbot",
+    "endpoint": { "url": "https://...", "headers": {} },
+    "chatbot_endpoint_id": "ep-uuid"
+  },
+  "max_turns": 14,
+  "early_termination": true
+}
-# Inline endpoint config (one-off):
-ish iteration create --chat-endpoint-json '{"url":"https://...","headers":{...}}'
+// tester_pair (with explicit audiences)
+{
+  "type": "chat",
+  "mode_details": {
+    "mode": "tester_pair",
+    "audience_a": ["tp-uuid-1", "tp-uuid-2"],
+    "audience_b": ["tp-uuid-3", "tp-uuid-4"],
+    "scenario_a": "You are a senior sales rep pitching ish.",
+    "scenario_b": "You are a skeptical CTO evaluating ish.",
+    "initiator_side": "a"
+  },
+  "max_turns": 14,
+  "early_termination": true
+}
+// tester_pair (with role criteria — backend resolves the pool)
+{
+  "type": "chat",
+  "mode_details": {
+    "mode": "tester_pair",
+    "audience_a": [],
+    "audience_b": [],
+    "role_criteria_a": {
+      "occupation": ["founder", "ceo"],
+      "min_age": 28, "max_age": 55,
+      "country": ["US", "SE"]
+    },
+    "role_criteria_b": { "occupation": ["investor", "vc"] },
+    "scenario_a": "...",
+    "scenario_b": "...",
+    "initiator_side": "a"
+  },
+  "max_turns": 14,
+  "early_termination": true
+}
 \`\`\`
-Tunables:
-- \`--max-turns N\` — cap the conversation length (default 12, max 50).
+## Audience selection (tester_pair)
+Each side of a pair needs **either** an explicit audience list **or** a
+role-criteria filter (or both). Supported combinations:
+| Side A input | Side B input | Behaviour |
+| --- | --- | --- |
+| \`--audience-a\` (UUIDs) | \`--audience-b\` (UUIDs) | Explicit pairing. Equal counts zip 1:1 by index; a side of exactly 1 broadcasts to the other. |
+| \`--role-criteria-a\` (JSON) | \`--role-criteria-b\` (JSON) | Backend resolves matching pool from each side's criteria and persists the IDs back to the iteration. |
+| Either flag pair | Either flag pair | Mixed (e.g. explicit A + criteria B). Backend handles each side independently. |
+| Both flags on one side | (any) | Criteria validates the explicit list; mismatch blocks run with a clear error. |
+**Persona-first principle**: the tester's persona is sacred — never
+altered by the scenario. Criteria filter the *eligible pool* upstream
+so that by the time a tester reaches the LLM prompt, their persona is
+already plausible for the role. The prompt construction itself does
+not change between explicit-audience and criteria-driven flows.
+\`RoleCriteria\` keys (all optional):
+- \`occupation: string[]\` (job titles, case-insensitive match)
+- \`min_age: int\`, \`max_age: int\`
+- \`gender: string[]\` (e.g. \`["female", "male"]\`)
+- \`country: string[]\` (ISO-3166-alpha-2 codes)
+- \`education_level_in: string[]\` (less_than_secondary, secondary, some_post_secondary, vocational_or_associate, bachelor, graduate)
+- \`household_in: string[]\` (single, couple_no_kids, couple_with_kids, single_parent, shared_housing, adult_with_parents, multi_generational). MECE: a couple raising children is \`couple_with_kids\`, not \`couple_no_kids\`; \`single\` means lives alone with no partner, roommates, parents, or children sharing the household.
+- \`locale_type_in: string[]\` (urban, suburban, small_town, rural)
+- \`income_level_in: string[]\` (lower, lower_middle, middle, upper_middle, upper, prefer_not_to_say)
+- \`employment_status_in: string[]\` (employed_full_time, employed_part_time, self_employed, unemployed_seeking, student, homemaker, retired, unable_to_work, other). Primary daytime activity wins: a student who works part-time is \`student\`; a retiree who freelances is \`retired\`.
+- \`requires_captions: bool\`, \`uses_screen_reader: bool\`, \`prefers_reduced_motion: bool\`, \`prefers_high_contrast: bool\`, \`has_any_accessibility_need: bool\` (coarse boolean filters over \`accessibility_profile\`)
+If the resolved pool is smaller than the requested conversation count
+for a side, \`ish study run\` exits 2 with the backend's error envelope
+intact. No silent fallback. Broaden the criteria, generate more
+profiles, or pass an explicit \`--audience-*\` list to recover.
+## Pair-mode flag names (CLI ↔ MCP alignment)
+CLI flags on \`ish study create\` / \`ish iteration create\` use the
+same nouns the MCP \`study_iterate.chat_pair\` payload uses, so an
+agent doesn't pay a translation tax when switching surfaces:
+| CLI flag                  | MCP field                  | What it carries                                     |
+|---------------------------|----------------------------|-----------------------------------------------------|
+| \`--audience-a\` / \`-b\`   | \`audience_a\` / \`audience_b\` | Explicit tester profile IDs (UUIDs or aliases) for that side. |
+| \`--role-criteria-a\` / \`-b\` | \`role_criteria_a\` / \`role_criteria_b\` | JSON filter (occupation, country, …) the backend resolves into a pool. |
+| \`--scenario-a\` / \`-b\`    | \`scenario_a\` / \`scenario_b\` | The system-prompt-shaped role text injected into one side's prompt only (asymmetry contract). |
+| \`--initiator-side\`        | \`initiator_side\`            | Which side speaks first (\`a\` default).             |
+| \`--max-turns\`             | \`max_turns\`                 | Conversation cap (default 14).                      |
+| \`--early-termination\`     | \`early_termination\`         | Allow the worker to end early when parties signal.  |
+The pre-2026-05 \`--profile-a\` / \`--profile-b\` CLI flags were
+renamed to \`--audience-a\` / \`--audience-b\` to match the MCP and
+the wire shape (\`mode_details.audience_a\` /
+\`mode_details.audience_b\`). Same intent, same accepted inputs
+(comma-separated UUIDs or aliases, repeatable). \`--role-criteria-a\`
+/ \`--role-criteria-b\` were already aligned with MCP and did not
+change.
+CLI authoring:
+\`\`\`
+# external_chatbot — reference a saved endpoint (recommended):
+ish iteration create --endpoint ep-abc --max-turns 10 --early-termination
+# external_chatbot — inline endpoint config:
+ish iteration create --endpoint-config ./bot.json
+# external_chatbot — legacy escape-hatch flags still work:
+ish iteration create --chat-endpoint-id ep-abc --max-turns 10
+ish iteration create --chat-endpoint-json '{"url":"https://..."}'
+# tester_pair — two AI audiences, asymmetric per-side scenarios:
+ish iteration create --chat-mode tester_pair \\
+  --audience-a tp-a1,tp-a2 --audience-b tp-b1,tp-b2 \\
+  --scenario-a @./sales_rep.md --scenario-b @./skeptical_cto.md \\
+  --max-turns 14
+# tester_pair — criteria-driven audience (persona-first filtering):
+ish iteration create --chat-mode tester_pair \\
+  --role-criteria-a '{"occupation":["founder","ceo"],"min_age":28}' \\
+  --role-criteria-b @./criteria_investor.json \\
+  --scenario-a @./sales_rep.md --scenario-b @./skeptical_cto.md \\
+  --max-turns 14
+\`\`\`
+Tunables (both modes):
+- \`--max-turns N\` — cap the conversation length (default 12 for
+  external_chatbot, 14 for tester_pair; persona drift starts ~20 turns
+  so cap accordingly).
 - \`--early-termination\` — let the worker end the session early when
-  the tester signals the conversation is over.
+  the parties signal the conversation is over.
+Pair-mode rules:
+- Each side needs **either** \`--audience-*\` (explicit IDs) **or**
+  \`--role-criteria-*\` (filter the backend resolves). The two can also
+  be combined — criteria then acts as validation on the explicit list.
+- When both sides use explicit \`--audience-a\` / \`--audience-b\`, they
+  must be the same length (≥ 1) unless one side has exactly one
+  profile (see **1×N broadcast** below). Same profile on both sides is
+  allowed (self-talk rehearsal). When either side defers to criteria,
+  the length match is enforced server-side after pool resolution.
+- **1×N broadcast**: pass exactly one profile on one side and N
+  profiles on the other to rehearse the fixed side against N
+  variations. The CLI auto-broadcasts the singleton to match length
+  N. Example: \`--audience-a tp-rep --audience-b tp-cto1,tp-cto2,tp-cto3\`
+  produces 3 conversations, all sharing tp-rep on side A. The CLI
+  prints a stderr notice so you know broadcasting kicked in.
+- Both \`--scenario-a\` and \`--scenario-b\` are required and asymmetric.
+- \`--initiator-side\` defaults to \`a\` (side A speaks first).
+- \`--chat-mode\` accepts both \`tester_pair\` and \`tester-pair\`
+  (hyphenated variants are normalised). Same normalisation applies to
+  \`--screen-format\` (\`mobile_portrait\` ↔ \`mobile-portrait\`),
+  \`--kind\` on \`source upload\` (\`text_file\` ↔ \`text-file\`), and the
+  \`type\` field in \`--questionnaire\` / \`--questions\` manifests
+  (\`single-choice\` ↔ \`single_choice\`).
+- Audiences are pinned to the iteration. \`ish study run\` refuses
+  run-time audience overrides (\`--profile\` / \`--sample\` / \`--all\` /
+  filters) on a pair iteration — change the audiences via
+  \`ish iteration update <id> --details-json '{...}'\` instead.
+- \`--max-turns\` / \`--early-termination\` on \`ish study run\` override
+  the iteration's saved values for that single dispatch (they are not
+  persisted back to the iteration).
+- One Conversation row is created per pair index, server-side. The
+  per-conversation summary (\`end_reason\`, \`dominant_dynamic\`) lands on
+  the iteration response under \`conversations[]\`. Inspect via
+  \`ish iteration get <id>\`.
+## Writing a good scenario
+Thin scenarios produce thin rehearsals. Both \`scenario_a\` and
+\`scenario_b\` are injected into their own side's prompt as
+role-playing context — the partner does **not** see the other side's
+scenario or goal. Treat each scenario as a system prompt for one
+character in a play. Cover five things:
+1. **Role / identity** — who is this person?
+2. **Voice** — how do they speak? Formal, casual, technical, blunt?
+3. **What they know** — the context they came in with.
+4. **What they don't know** — the asymmetry that makes the rehearsal
+   interesting.
+5. **Goal** — what counts as success for *them*.
+Example (\`scenario_a\` — the sales rep):
+\`\`\`
+You are Maya, a senior account executive at ish — three years of
+experience selling research tooling to product orgs. You speak in
+clear, plain sentences, push back when you disagree, and quantify
+claims when you can. You know this is a 30-minute discovery call;
+you've read the prospect's LinkedIn and that's it. You do NOT know
+the prospect's current tooling, budget, or internal politics — your
+job is to find out by listening and asking. Success = end the call
+with a clear next step (a pilot, a follow-up demo, or a "no, here's
+why"). A polite "we'll get back to you" is not success.
+\`\`\`
+Example (\`scenario_b\` — the buyer):
+\`\`\`
+You are Devon, the CTO at a 60-person Series B SaaS company. You
+distrust new vendors by default — your team has been burned by
+"AI for research" tools twice in the past 18 months. You speak in
+short, sceptical sentences and interrupt vendor pitches with
+specifics: pricing, integrations, where the data lives. You know
+your team currently runs unmoderated tests via UserTesting and
+Pendo; the budget for new tooling is tight (€8k/year max). You do
+NOT know how ish prices, what it integrates with, or whether it
+handles your stack (Mixpanel + Heap + Linear). Success = leave the
+call with either a concrete proof point that addresses your top
+risk, OR a clean way to decline without burning the relationship.
+\`\`\`
+Read those back to back: the personas are asymmetric (different
+goals, different knowledge), grounded (specific tools, specific
+numbers), and constrained (each has a stake). That's the difference
+between a rehearsal that produces signal and one that produces
+generic dialogue. Keep each scenario under ~250 words — past that,
+persona drift starts to dominate.
+### Don't put demographics in the scenario
+A scenario describes **voice, knowledge, and goal** for one role —
+*not* the demographics of who plays it. Demographic constraints
+("you are a 35-year-old Swedish founder") belong in
+\`--role-criteria-a\` / \`--role-criteria-b\` instead. The tester's
+persona stays sacred; criteria filter the eligible pool upstream so
+the persona is already plausible for the role by the time the LLM
+sees the prompt. Mixing demographics into the scenario text
+short-circuits the persona-first principle and produces incoherent
+characters (a retired farmer suddenly "pitching a Series A").
+Paired with the Maya / Devon scenarios above, the criteria might
+look like:
+\`\`\`
+# --role-criteria-a (the sales rep filter):
+{"occupation":["sales","account executive"],"min_age":28,"max_age":50}
+# --role-criteria-b (the skeptical CTO filter):
+{"occupation":["cto","vp engineering","head of engineering"],
+ "country":["US","SE"],"education_level_in":["bachelor","graduate"]}
+\`\`\`
+Scenarios describe the role; criteria pick who plays it.
 ## No more auto-empty iteration A
@@ -444,6 +751,7 @@ Treat this as actionable, not transient — re-running won't change anything.
 - \`concepts/run-verbs\` — how \`ish study run\` selects the iteration.
 - \`concepts/audience\` — how testers are picked for a run.
 - \`reference/billing-limits\` — \`maxIterationsPerStudy\` cap on iteration creation.
+- \`reference/credits\` — per-iteration-run credit cost & preview shape (\`pair_preview.credit_estimate\` for tester-pair, top-level \`credit_estimate\` otherwise).
 `;
 const CONCEPT_ASSIGNMENT = `# concept: assignment
@@ -527,6 +835,11 @@ ish study create … --questionnaire ./questionnaire.json
 \`questionnaire.json\` is an array of question objects in the shape above.
 The same shape is accepted by \`ish ask add-questions … --questions …\`.
+The \`type\` field is hyphenated for the multi-word values (\`single-choice\`,
+\`multiple-choice\`). The CLI normalises the underscored variants
+(\`single_choice\`, \`multiple_choice\`) back to the canonical hyphenated form,
+so either works in your manifest.
 ## Related
 - \`concepts/ask\` — asks have per-round questions, similar shape.
@@ -700,6 +1013,33 @@ copy can safely append questions without losing prior round results.
 See \`reference/json-mode\` for the full shape.
+## Response-shape ergonomics
+A few non-obvious shape rules on the MCP / ask endpoints that save
+round-trips when you know them up front:
+- **\`cross_round_summary\` requires \`wants_pick=true\` on every
+  round.** \`ask results\` / \`ask_get\` only compute the top-level
+  \`cross_round_summary\` when *every* round in the ask was dispatched
+  with \`wants_pick=true\` — picks across rounds are the only
+  comparable signal. When even one round is a free-text drill
+  question (\`wants_pick=false\`), the field is omitted and the
+  response carries a \`cross_round_summary_reason\` string explaining
+  which round(s) lacked \`wants_pick\` (e.g.
+  \`"omitted: rounds 2, 3 lack wants_pick=true"\`). Branch on the
+  reason, don't poll for the field to appear.
+- **\`audience_get\` omits \`accessibility_profile\` by default.** The
+  field is ~1KB per row; on a 50-profile page it overflows
+  agent-tool result budgets. Pass
+  \`include_accessibility_profile=true\` to include it. Mirrors the
+  existing \`include_bio=false\` default — same opt-in pattern.
+- **\`ask_testers\` uses \`dispatch_into_round\`, not \`round\`.** The
+  parameter name was renamed from the ambiguous \`round\` (which read
+  as "start from round N") to the verbatim \`dispatch_into_round\`
+  ("add these new testers into round N"). Behavior is unchanged —
+  it appends testers to the named round on an existing ask, it does
+  not roll back or restart any prior round.
 ## Variant syntax
 \`--variant <type>:<value>[::label=<label>]\`
@@ -714,6 +1054,7 @@ See \`reference/json-mode\` for the full shape.
 - \`concepts/round\` — what a round is and how it executes.
 - \`concepts/audience\` — how testers are chosen at ask creation.
 - \`concepts/run-verbs\` — \`ish ask run\` vs \`ish study run\`.
+- \`reference/credits\` — ask rounds bill 1 credit per successful response.
 `;
 const CONCEPT_ROUND = `# concept: round
@@ -816,6 +1157,58 @@ Expected JSON: \`{ "name": "...", "type": "ai", "gender": "female",
   Re-generating the same name/country/occupation/age yields the
   same DOB.
+## Structured profile fields
+Five universal enums + a versioned accessibility JSONB live on every
+TesterProfile. Values are snake_case and match
+\`https://ishlabs.io/spec/profile-enums.v1.json\` byte-for-byte.
+- \`education_level\`: \`less_than_secondary\`, \`secondary\`,
+  \`some_post_secondary\`, \`vocational_or_associate\`, \`bachelor\`, \`graduate\`
+- \`household\` (MECE): \`single\`, \`couple_no_kids\`, \`couple_with_kids\`,
+  \`single_parent\`, \`shared_housing\`, \`adult_with_parents\`,
+  \`multi_generational\`. A couple raising children is \`couple_with_kids\`,
+  not \`couple_no_kids\`. \`single\` means lives alone (no partner,
+  roommates, parents, or children sharing the household).
+- \`locale_type\`: \`urban\`, \`suburban\`, \`small_town\`, \`rural\`
+- \`income_level\`: \`lower\`, \`lower_middle\`, \`middle\`, \`upper_middle\`,
+  \`upper\`, \`prefer_not_to_say\`
+- \`employment_status\`: \`employed_full_time\`, \`employed_part_time\`,
+  \`self_employed\`, \`unemployed_seeking\`, \`student\`, \`homemaker\`,
+  \`retired\`, \`unable_to_work\`, \`other\`. Pick the primary daytime
+  activity: a student who works part-time is \`student\`; a retiree who
+  freelances is \`retired\`.
+- \`accessibility_profile\`: JSONB v1.0 with optional \`visual\`,
+  \`auditory\`, \`motor\`, \`cognitive\`, \`data\` groups, plus
+  \`assistive_tech: string[]\` and \`notes\`. Empty \`{}\` means "no
+  accessibility configuration declared". Schema:
+  \`https://ishlabs.io/spec/accessibility-profile-schema.v1.json\`.
+Set them on \`ish profile update\`:
+\`\`\`
+ish profile update tp-1b9 \\
+    --education-level bachelor \\
+    --household couple_with_kids \\
+    --locale-type suburban \\
+    --income-level middle \\
+    --employment-status employed_full_time
+# accessibility_profile accepts inline JSON or a path:
+ish profile update tp-1b9 --accessibility-profile '{
+  "version": "1.0",
+  "visual": {"uses_screen_reader": true, "text_size": "large"},
+  "cognitive": {"reduce_motion": true},
+  "assistive_tech": ["VoiceOver"]
+}'
+ish profile update tp-1b9 --accessibility-profile ./a11y.json
+\`\`\`
+The legacy \`--tech-savviness\` flag was removed in
+\`profile-schema-v2\`; passing it now produces commander's standard
+"unknown option" error.
 ## Related
 - \`concepts/source\` — the inputs to \`profile generate\`.
@@ -829,7 +1222,7 @@ audio file, image, or PDF that an LLM reads to ground generated profiles
 in real customer evidence.
 - Alias prefix: \`tps-\`
-- Source kinds: \`text_file | audio | image\` (auto-detected from extension).
+- Source kinds: \`text_file | audio | image\` (auto-detected from extension; \`text-file\` is accepted as a hyphen variant).
 - Audio supports speaker diarization via \`--diarize\`.
 ## Two ways to use a source
@@ -894,6 +1287,52 @@ Error: No simulatable AI tester profiles in workspace w-b32 match:
 The suggestion is best-effort — it never replaces the original error,
 just augments it.
+## Audience-build behaviors to know before dispatch
+Two adjacent footguns surface most often on first-time audience
+construction. Both are documented here because they cost a round-trip
+to discover by experiment.
+### \`occupation\` is a loose substring match
+\`audience_build\` (and the \`--search\` flag) treats \`occupation\` as
+a **loose, case-insensitive substring filter**, not a whole-token /
+taxonomy match. \`occupation=["manager"]\` will match hotel managers,
+retail store managers, bank branch managers — anything containing
+the literal string "manager". Three patterns that recover the
+specificity you usually want:
+- **Whole-token alternation**: \`occupation=["engineering manager",
+  "software engineering manager", "vp engineering", "tech lead"]\` —
+  exhaustive enumeration of the role surface beats one short token.
+- **Pair with other filters**: \`occupation=["manager"]\` +
+  \`min_age=28\` + \`country=["US","SE"]\` narrows even a loose substring
+  meaningfully.
+- **Preview before dispatch**: \`audience_build\` returns a
+  \`match_preview\` summary on the response — a 1-line histogram of
+  matched occupations (e.g. \`"matched 17 — software developer (12),
+  DevOps engineer (3), other (2)"\`). Read it before
+  \`ask_run\` / \`study_run\` to confirm the substring is matching what
+  you intended; iterate on the filter cheaply if not.
+### The public profile pool skews non-tech / non-Western
+The default public tester-profile pool was built from a broad
+demographic sample — so a substring like \`"software engineering
+manager"\` may return only a handful of matches, while \`"hotel
+manager"\` or \`"retail associate"\` return many. Two adaptations:
+- **Don't assume Silicon Valley defaults.** A criteria-driven audience
+  that works on a private testing pool may resolve to a much smaller
+  count in the public pool. Read the \`match_preview\` (or count) on
+  every \`audience_build\` before dispatching a run that depends on
+  reaching N matches.
+- **Seed your own pool when you need a specific archetype.** If the
+  public pool is genuinely thin for your role, generate the audience
+  yourself via \`ish profile generate --description "..."\` — that
+  produces profiles plausible for the role you described, regardless
+  of public-pool composition. See \`concepts/profile\`.
 ## Defaults
 - \`ish study run\` with no audience flags → reuses the iteration's
@@ -1347,6 +1786,15 @@ The CLI guarantees these contracts so agents can chain safely:
   is collapsed to one batch entry per study (M13) with nested
   \`tester_ids[]\`, \`tester_aliases[]\`, \`job_ids[]\`, and \`count\` —
   an N-sample dispatch is a single row, not N near-duplicate rows.
+- **\`study\` JSON includes a \`url\` field.** \`study create\`,
+  \`study generate\`, \`study get\`, \`study list\` (per item), and
+  \`study run\` each return a top-level \`url\` pointing to the study
+  in the web app — \`/<workspace>/<study>/overview\` on the read /
+  write paths, \`/<workspace>/<study>/timeline\` on \`study run\`.
+  Print it to the user instead of composing the host + path yourself.
+  The base host follows the active backend: \`https://app.ishlabs.io\`
+  on production, \`http://localhost:3000\` under \`--dev\`. Override
+  with the \`ISH_APP_URL\` env var for staging or self-hosted UIs.
 - **\`study results --json\` includes per-answer sentiment** (M10).
   Every \`interview_answers[].answers[]\` row carries \`sentiment\`
   (the tester's session-level label from \`tester_summary.sentiment\`),
@@ -1357,16 +1805,30 @@ The CLI guarantees these contracts so agents can chain safely:
   error_message}. Drops \`interview_answers\` and per-interaction
   breakdowns. Cheapest "did this run land?" shape.
 - **\`study results --transcript <tester_id>\`** is the chat-modality
-  projection. Returns \`{tester_id, tester_alias, transcript: [...],
-  unique_bot_replies, tester_summary}\`. Each transcript entry is
-  \`{role, text, turn_index, ...}\` — bot turns add \`failure\`
-  (set when the dispatch crashed); tester turns add \`action_type\`,
-  \`option_label\`, and \`sentiment\`. \`text\` is null on tester
-  turns whose action carries no text (\`select_option\`,
-  \`ignore_offered\`); read intent from \`action_type\` +
-  \`option_label\`. Same shape as the MCP \`get_chat_transcript\`
-  tool. \`unique_bot_replies = 1\` on a multi-turn run is the M2 loop
-  signature.
+  projection — **external_chatbot mode only in v1**. Returns
+  \`{tester_id, tester_alias, transcript: [...], unique_bot_replies,
+  tester_summary}\`. Each transcript entry is \`{role, text, turn_index,
+  ...}\` — bot turns add \`failure\` (set when the dispatch crashed);
+  tester turns add \`action_type\`, \`option_label\`, and \`sentiment\`.
+  \`text\` is null on tester turns whose action carries no text
+  (\`select_option\`, \`ignore_offered\`); read intent from
+  \`action_type\` + \`option_label\`. Same shape as the MCP
+  \`get_chat_transcript\` tool. \`unique_bot_replies = 1\` on a
+  multi-turn run is the M2 loop signature.
+  **For tester_pair conversations**, the bot/tester role pair doesn't
+  apply (both speakers are testers). Inspect pair transcripts via the
+  iteration response instead:
+  \`\`\`bash
+  ish iteration get <iter-id> --json | jq '.conversations[]'
+  # → one object per conversation: { id, pair_index, started_at, ended_at, end_reason, summary, ... }
+  \`\`\`
+  Per-side tester summaries still land on each tester row
+  (\`ish study tester <id> --json\`); the conversation-level summary
+  (\`end_reason\`, \`dominant_dynamic\`, \`who_steered\`) lands on
+  \`iteration.conversations[]\`.
 - **\`study tester --summary\`** drops the action timeline and
   returns just \`{tester, interaction_count, sentiment, comment,
   error_message?, error_kind?}\`.
@@ -1438,9 +1900,24 @@ The CLI guarantees these contracts so agents can chain safely:
   phase-2 LLM calls instead of 2N. Pass \`--redispatch-all\` for the
   legacy reset behavior when you want fresh first impressions.
 - **\`ask results --json\` includes \`cross_round_summary\` for 2+
-  rounds.** Top-level field with per-round picks/winner snapshots and
-  a \`picks_delta\` (R1 → last round). Replaces hand-rolled diffing of
-  two \`ask results\` calls.
+  rounds — when every round used \`wants_pick=true\`.** Top-level
+  field with per-round picks/winner snapshots and a \`picks_delta\`
+  (R1 → last round). Replaces hand-rolled diffing of two
+  \`ask results\` calls. When **any** round was dispatched with
+  \`wants_pick=false\` (typical for free-text follow-up rounds), the
+  summary is omitted and \`cross_round_summary_reason\` carries the
+  explanation (e.g. \`"omitted: rounds 2, 3 lack wants_pick=true"\`).
+  Branch on the reason field, don't poll for the summary.
+- **\`audience_get\` omits \`accessibility_profile\` by default.** The
+  block is ~1KB per row; including it on a 50-row page overflows
+  agent tool result budgets. Pass
+  \`include_accessibility_profile=true\` to opt in. Mirrors the
+  existing \`include_bio=false\` opt-in.
+- **\`ask_testers\` parameter is \`dispatch_into_round\`, not
+  \`round\`.** Reads verbatim — "dispatch these new testers into round
+  N". The old name (\`round\`) read as "start from round N", which
+  was wrong: the call never restarts prior rounds, it only appends
+  testers to the named round. Behavior unchanged across the rename.
 - **No more auto-empty iteration A.** \`study create\` and
   \`study generate\` no longer produce a placeholder iteration A. The
   first explicit \`ish iteration create\` becomes label A.
@@ -1739,6 +2216,168 @@ of scope: \`workspace\`, \`config\`, \`docs\`, \`init\`, \`login\`,
   including \`--get workspace.alias\` to capture the active workspace
   without piping \`ish status --json\` through \`jq\`.
 `;
+const REFERENCE_CREDITS = `# reference: credits & cost preview
+Every billable run (study, ask, insight) costs **credits**. The CLI
+surfaces a cost upper bound *before* you dispatch so you can budget. The
+backend is the authoritative source — its rejection envelope on
+\`insufficient_credits\` carries the live required/available pair.
+## How costs are shaped
+The formula has the same shape across modalities — \`max(1, round(N / 10))\`
+per principal — but the inputs differ. **Treat the rates below as the
+current calibration**; they will evolve as we differentiate per-modality
+compute cost. Agents should:
+- For prospective cost preview: read \`credit_estimate\` from \`study run\`'s
+  JSON envelope (top-level for solo/media runs; under \`pair_preview\` for
+  tester-pair chat).
+- For hard budget checks: catch the backend's \`insufficient_credits\`
+  rejection (HTTP 402; envelope shape below) and react to
+  \`required\` / \`available\`.
+| Surface             | Per-principal cost              | Total formula                                    | Example                              |
+|---------------------|---------------------------------|--------------------------------------------------|--------------------------------------|
+| Interactive (URL)   | \`max(1, round(steps/10))\`       | \`testers × per-tester\`                           | 10 testers × 30 steps → 30 credits   |
+| Text/image/video/audio/document | same                | same                                             | 5 testers × 20 steps → 10 credits    |
+| Chat (external chatbot, solo) | \`max(1, round(turns/10))\` | \`testers × per-tester\`                           | 5 testers × 12 turns → 5 credits     |
+| Chat (tester pair)  | \`max(1, round(turns/10))\` / side | \`conv × per-side × 2\`                            | 3 conv × 14 turns → 6 credits        |
+| Ask round           | 1 / successful response         | \`successful_testers\`                             | 50 responses → 50 credits            |
+| Study insights      | first free, then **10 flat**    | n/a                                              | 2nd analysis → 10 credits            |
+All numbers are **upper bounds**. Early termination, refusals, or
+backend audience trimming can reduce actual charge.
+## Capping interactive/media spend (\`--max-interactions\`)
+\`ish study run\` always sends \`max_interactions\` to the backend for
+interactive and media runs. Precedence: \`--max-interactions <n>\` flag >
+the iteration's stored \`details.max_interactions\` > **CLI default
+of 20**. The default exists to prevent runaway spend when a tester
+gets stuck on a broken or non-responsive surface — without a cap, one
+stuck tester can rack up 100+ steps before the SDK gives up. Pass
+\`--max-interactions\` to override (e.g. \`--max-interactions 50\` for
+deeper exploration, \`--max-interactions 5\` for a cheap smoke test).
+The confirmation block shows the resolved value and where it came
+from (flag / iteration / CLI default). The JSON envelope's
+\`credit_estimate.breakdown\` reflects the dispatched value.
+## Where the CLI surfaces it
+**Human output — \`study run\` confirmation block:**
+\`\`\`
+Run settings:
+  ...
+  Scale:          3 conv × 14 turns × 2 sides ≈ 84 LLM calls (upper bound — early-termination may shorten)
+  Credits (est):  ≈ 6 credit(s) upper bound — see \`ish docs get-page reference/credits\`
+\`\`\`
+**JSON envelope — \`study run --json\`:**
+Pair chat — under \`pair_preview\`:
+\`\`\`json
+{
+  "pair_preview": {
+    "conversation_count": 3,
+    "max_turns": 14,
+    "llm_calls_upper_bound": 84,
+    "credit_estimate": {
+      "upper_bound": 6,
+      "formula": "chat_pair",
+      "breakdown": "3 conv × max(1, round(14 turns / 10)) × 2 sides = 3 × 1 × 2 = 6",
+      "unit": "credits"
+    }
+  }
+}
+\`\`\`
+Solo media/interactive/chat — top-level \`credit_estimate\`:
+\`\`\`json
+{
+  "iteration_id": "…",
+  "credit_estimate": {
+    "upper_bound": 30,
+    "formula": "media_per_tester",
+    "breakdown": "10 tester(s) × max(1, round(30 steps / 10)) = 10 × 3 = 30",
+    "unit": "credits"
+  }
+}
+\`\`\`
+The \`formula\` key is stable: agents can branch on it (\`media_per_tester\`,
+\`chat_solo\`, \`chat_pair\`, \`ask_per_response\`).
+## Tier allotments
+| Tier        | Credit allotment          | Notes                          |
+|-------------|---------------------------|--------------------------------|
+| FREE        | 200 (one-time signup)     | Never refilled                 |
+| STARTER     | 1,000 / month             | Monthly reset                  |
+| PRO         | 3,000 / month             | Monthly reset                  |
+| ENTERPRISE  | unlimited                 | Custom contract                |
+The CLI does not enforce these — the backend does. The CLI's job is to
+*preview*, so an agent doesn't dispatch a 5,000-credit run on a
+200-credit account.
+## Insufficient-credit rejection shape
+When you try to dispatch beyond what's available, the backend returns
+HTTP 402. The CLI surfaces it as a structured error envelope:
+\`\`\`json
+{
+  "error": "Insufficient credits.",
+  "error_code": "insufficient_credits",
+  "status": 402,
+  "retryable": false,
+  "required": 30,
+  "available": 8,
+  "upgrade_url": "https://app.ishlabs.io/billing"
+}
+\`\`\`
+Exit code \`1\` (non-retryable). Don't poll — the user has to upgrade or
+free credits before re-dispatch.
+## Agent recipe
+1. Build/draft the run (\`study create\`, \`iteration create\`).
+2. Call \`study run\` *without* \`--dispatch\` to read the
+   \`credit_estimate\` upper bound from JSON. (Or \`--dry-run\` where
+   supported — see modality concept pages.)
+3. If \`upper_bound\` fits your budget, re-call with \`--dispatch\`.
+4. If you hit \`error_code: insufficient_credits\`, surface
+   \`required\` / \`available\` / \`upgrade_url\` to the human.
+## Caveats
+- The CLI's preview uses the **same formula** the backend bills with,
+  but does **not** make a network preflight call — it's pure math
+  client-side. If the backend formula changes mid-version, the preview
+  will drift until the CLI is updated. The \`insufficient_credits\`
+  rejection envelope is always authoritative.
+- Pair-chat \`credit_estimate\` is \`null\` if \`max_turns\` isn't a finite
+  number (e.g. the iteration doesn't specify one and there's no
+  \`--max-turns\` flag).
+- Audience criteria that resolve server-side won't have a precise
+  estimate at preview time — the CLI prints the shape (\`N × … × 2\`)
+  instead of a number.
+## Related
+- \`reference/billing-limits\` — per-tier *entity* caps (max
+  workspaces/studies/iterations/profiles), separate from credit budget.
+- \`reference/json-mode\` — full error envelope shape and exit codes.
+- \`concepts/study\`, \`concepts/iteration\`, \`concepts/ask\` —
+  per-modality run shapes.
+- \`guides/chat\` — worked example of a pair-chat run including
+  \`pair_preview.credit_estimate\`.
+`;
 const REFERENCE_BILLING_LIMITS = `# reference: billing tier limits
 Some create operations are gated by your account's billing tier. The
@@ -1812,6 +2451,9 @@ upgrade or delete an existing resource to free up headroom.
 ## Related
+- \`reference/credits\` — per-run credit cost & preview (separate from
+  these entity caps; this page is about *how many things you can have*,
+  that page is about *how much each run costs*).
 - \`concepts/workspace\` — \`maxProducts\` is per-account.
 - \`concepts/study\`     — \`maxStudiesPerProduct\` gates study creation.
 - \`concepts/iteration\` — \`maxIterationsPerStudy\` gates iteration creation.
@@ -1820,6 +2462,51 @@ upgrade or delete an existing resource to free up headroom.
 `;
 const GUIDE_CHAT = `# guide: chat-modality studies
+Chat-modality studies cover two distinct shapes:
+- **external_chatbot** — testers probe a customer chatbot endpoint
+  (sections 1-3 below: configure → smoke test → run).
+- **tester_pair** — two AI personas converse with each other for
+  rehearsal scenarios. Pitch rehearsals, difficult-conversation
+  prep, founder-vs-investor archetypes. See section 7a/7b and the
+  TL;DR below.
+## TL;DR — rehearse a pitch in one shot
+For "rehearse my pitch against 3 different skeptical CTOs" (the
+canonical 1×N variations shape), this is the whole flow. Inline
+scenarios — no extra files needed:
+\`\`\`bash
+# Capture aliases for the rep (1) and CTOs (3) via subshell:
+REP=$(ish profile generate \\
+  --description "Senior B2B SaaS account executive; concise, technical" \\
+  --count 1 --json | jq -r '.items[0].alias')
+CTOS=$(ish profile generate \\
+  --description "Skeptical CTO at Series B SaaS; distrusts AI vendors" \\
+  --count 3 --json | jq -r '[.items[].alias] | join(",")')
+# One-shot study + iteration A (1×N broadcast does the rest):
+ish study create --modality chat --chat-mode tester_pair \\
+  --name "Pitch rehearsal" \\
+  --audience-a "$REP" --audience-b "$CTOS" \\
+  --scenario-a "You are pitching <your product>. Be concise, push back on vague objections. Goal: land a pilot or a clear next step." \\
+  --scenario-b "You are a skeptical CTO. Probe for technical depth, distrust marketing-speak, refuse to commit without evidence. Goal: leave with either a concrete proof point or a graceful 'no'." \\
+  --assignment "Pitch:Land a pilot" --max-turns 14
+# Run all 3 conversations:
+ish study run -y --wait
+# Compare side-by-side:
+ish iteration get <iter-id> --json \\
+  | jq '.conversations[] | {pair_index, end_reason, dynamic: .summary.dominant_dynamic}'
+\`\`\`
+Section 7b below has the longer version with scenario-writing
+guidance, criteria-driven audiences, and the broadcast rule.
+---
 Goal: from a customer chatbot endpoint to a finished chat-modality
 study with parsed transcripts, end to end via the CLI. The flow has
 three phases: configure the endpoint, smoke test it, run a study.
@@ -2113,13 +2800,20 @@ cat ./bot-config.json | ish study create \\
 Optional \`--max-turns <n>\` (default 12) caps the chat per tester.
-Audience size is set at run time. Use \`--sample <N>\` to pick N
-random simulatable profiles, or \`--all\` for the full pool.
-\`--profile <id>\` is also supported for explicit selection:
+Audience size is set at run time for **external_chatbot** chat
+studies. Use \`--sample <N>\` to pick N random simulatable profiles,
+or \`--all\` for the full pool. \`--profile <id>\` is also supported
+for explicit selection:
 \`\`\`
 ish study run stu-xyz --sample 5 --wait
 \`\`\`
+> **Pair-mode is different.** \`--sample\` / \`--profile\` / demographic
+> filters on \`study run\` are **refused** for tester_pair iterations
+> — pair audiences live on the iteration itself. Set them at
+> iteration-create time via \`--audience-a/-b\` (with 1×N broadcast)
+> or \`--role-criteria-a/-b\`. See the tester_pair section below.
 Pull raw interactions:
 \`\`\`
 ish study results stu-xyz --json | jq '.interactions'
@@ -2141,6 +2835,171 @@ ish iteration create --study stu-xyz --endpoint-config ./bot.json
 Same flag set as \`study create\`'s chat shortcut.
+## tester_pair: rehearse a conversation between two AI personas
+\`Modality.CHAT\` also supports a **tester_pair** mode where two AI
+tester profiles converse with each other — useful for rehearsing a
+sales pitch, a difficult conversation, a fundraising chat, or any
+two-role scenario. Each side has its own scenario + goal text; the
+other side does NOT see it (the asymmetry contract). Audiences are
+1:1 paired by index (audience_a[i] talks to audience_b[i]).
+One-shot study + iteration:
+\`\`\`
+ish study create \\
+    --modality chat --chat-mode tester_pair \\
+    --name "Pitch rehearsal" \\
+    --audience-a tp-sales-1,tp-sales-2 \\
+    --audience-b tp-cto-skeptic-1,tp-cto-skeptic-2 \\
+    --scenario-a @./sales_rep.md \\
+    --scenario-b @./skeptical_cto.md \\
+    --assignment "Pitch:Try to win the meeting"
+\`\`\`
+Or add a pair iteration to an existing chat study:
+\`\`\`
+ish iteration create --study stu-xyz --chat-mode tester_pair \\
+    --audience-a tp-a1,tp-a2 --audience-b tp-b1,tp-b2 \\
+    --scenario-a "..." --scenario-b "..." \\
+    --max-turns 14
+\`\`\`
+### Rehearsing against N variations of one side (1×N)
+The most common rehearsal shape: fix one side (your role) and vary
+the other (the audience you're rehearsing against). E.g. "pitch this
+once and see how it lands against 3 different skeptical CTOs."
+Step 1 — produce N distinct profiles for the varying side:
+\`\`\`bash
+# Generate 3 skeptical-CTO profiles (or any archetype):
+ish profile generate \\
+    --description "Skeptical CTO at a Series B SaaS startup; distrusts AI vendors" \\
+    --count 3 --json | jq -r '.items[].alias'
+# → tp-cto1, tp-cto2, tp-cto3
+\`\`\`
+If you already have profiles you want to reuse, list them:
+\`\`\`bash
+ish profile list --search "cto" --json | jq -r '.items[].alias'
+\`\`\`
+Step 2 — author the two scenarios as separate files (\`sales_rep.md\`
+and \`skeptical_cto.md\`). **Each scenario is a system prompt for one
+role — the other side never sees it.** Cover voice, what they know,
+what they don't know, and what counts as success for them. Don't
+cram demographic constraints into the text; that's what
+\`--role-criteria-\*\` is for. See the **"Writing a good scenario"**
+section below for the Maya/Devon worked example and the 5-point
+template.
+Step 3 — create the iteration with **one profile** on the fixed
+side and **N profiles** on the varying side. The CLI auto-broadcasts
+the singleton to match length N (and prints a stderr notice like
+\`Broadcasting --audience-a (1 profile) to length 3 to match --audience-b\`
+when it does, so you can see it happen):
+\`\`\`bash
+ish study create \\
+    --modality chat --chat-mode tester_pair \\
+    --name "Pitch rehearsal — 3 CTO variants" \\
+    --audience-a tp-rep \\
+    --audience-b tp-cto1,tp-cto2,tp-cto3 \\
+    --scenario-a @./sales_rep.md \\
+    --scenario-b @./skeptical_cto.md \\
+    --assignment "Pitch:Land a pilot or a clear next step"
+# Result: 3 conversations, all using tp-rep on side A, one each
+# of tp-cto1/2/3 on side B. Same scenario for the CTOs (they share
+# the role description) but different underlying personas, so the
+# conversations diverge in tone and pressure points.
+\`\`\`
+Run it (\`--yes\` to skip the confirmation prompt):
+\`\`\`bash
+ish study run -y --wait
+\`\`\`
+Inspect the per-conversation summaries side-by-side:
+\`\`\`bash
+ish iteration get <iter-id> --json \\
+    | jq '.conversations[] | {pair_index, end_reason, dominant_dynamic: .summary.dominant_dynamic}'
+\`\`\`
+**When to use criteria instead**: if you don't care about specific
+profile IDs and just want "any 3 CTOs the backend can find", pass
+\`--role-criteria-b '{"occupation":["cto"]}'\` (alone or with a single
+\`--audience-a tp-rep\`). The backend resolves the matching pool at
+iteration-create time. Caveat: the resolved pool may collapse onto
+similar personas — for guaranteed distinctness, generate explicit
+profiles first.
+### Criteria-driven audience (persona-first filtering)
+When you don't want to hand-pick UUIDs, pass a **role-criteria
+filter** per side. The backend resolves it into an eligible pool of
+tester profiles and pairs them 1:1. The persona itself is never
+altered — criteria filter the pool upstream so the persona is
+already plausible for the role:
+\`\`\`
+ish study create \\
+    --modality chat --chat-mode tester_pair \\
+    --name "Pitch rehearsal" \\
+    --role-criteria-a '{"occupation":["sales","account executive"],"min_age":28}' \\
+    --role-criteria-b '{"occupation":["cto","vp engineering"],"country":["US","SE"]}' \\
+    --scenario-a @./sales_rep.md --scenario-b @./skeptical_cto.md \\
+    --assignment "Pitch:Try to land a pilot"
+\`\`\`
+Keys (all optional): \`occupation\`, \`min_age\`, \`max_age\`,
+\`gender\`, \`country\`, \`education_level_in\`, \`household_in\`,
+\`locale_type_in\`, \`income_level_in\`, \`employment_status_in\`,
+\`requires_captions\`, \`uses_screen_reader\`, \`prefers_reduced_motion\`,
+\`prefers_high_contrast\`, \`has_any_accessibility_need\`. The five \`*_in\`
+arrays accept snake_case spec values; the five accessibility filters are
+booleans. Combine \`--audience-*\` and \`--role-criteria-*\` on the same side
+to make criteria validate an explicit list (mismatch blocks the run).
+MECE notes for the list filters:
+- \`household_in\`: \`couple_with_kids\` covers couples raising children;
+  \`couple_no_kids\` is strictly child-free. \`single\` means lives alone
+  (no partner, no roommates, no parents, no children in the household).
+- \`employment_status_in\`: pick the tester's primary daytime activity.
+  A student who works 15 hrs/week is \`student\`; a retiree who freelances
+  is \`retired\`.
+If the resolved pool is too small, \`ish study run\` exits 2 with the
+backend's error message intact — no silent fallback. Broaden the
+criteria or generate more matching profiles via
+\`ish profile generate --description "..."\`.
+Dispatch is per-Conversation (one task per pair index). Run-time
+audience overrides (\`--profile\`, \`--sample\`, \`--all\`, demographic
+filters) are refused on pair iterations — the iteration's audiences
+are authoritative. To change them, update the iteration:
+\`\`\`
+ish study run --study stu-xyz --iteration i-pair -y
+ish iteration update i-pair --details-json '{...}'   # change audiences
+\`\`\`
+Inspect:
+\`\`\`
+ish iteration get i-pair --json | jq '.details.mode_details.mode, .conversations[]'
+\`\`\`
+Per-Conversation summaries (\`end_reason\`, \`dominant_dynamic\`,
+\`who_steered\`) land on \`iteration.conversations[]\`. Per-tester
+summaries land on \`tester.summary\` as before.
 ## Active-endpoint convention
 \`ish chat endpoint use <id>\` writes the endpoint to
@@ -2171,12 +3030,211 @@ Mirrors \`workspace use\` / \`study use\` / \`ask use\`.
 ## Related
-- \`concepts/iteration\` — chat iteration shape (\`details.endpoint\`,
-  \`details.chatbot_endpoint_id\`, \`details.max_turns\`).
+- \`concepts/iteration\` — chat iteration shape
+  (\`details.mode_details\` discriminator, \`mode_details.endpoint\` /
+  \`mode_details.chatbot_endpoint_id\` for external_chatbot,
+  \`mode_details.audience_a/_b\` + \`scenario_a/_b\` for tester_pair,
+  \`details.max_turns\`).
 - \`concepts/study\` — modality + assignments + iteration nesting.
 - \`reference/json-mode\` — JSON output, error envelope, exit codes.
 - \`guides/first-study\` — the same pattern for an interactive
   modality study.
+- \`guides/cold-start\` — the saturated-account first-step playbook
+  if \`workspace_create\` returns \`usage_limit_reached\`.
+`;
+const GUIDE_COLD_START = `# guide: cold start on a saturated account
+The naive cold-start instruction — "create a fresh workspace, then run
+a study" — fails immediately on any account that has accumulated state.
+\`workspace_create\` (CLI: \`ish workspace create\`) returns
+\`error_code: usage_limit_reached\` once the caller hits
+\`maxProducts\` for their tier (1 on FREE). On a saturated dogfood
+account this is the first call an agent burns. This guide is the
+recovery path: inspect existing state, pick a reuse target, or call
+the idempotent create-or-reuse-by-name path.
+## The shape of the failure
+\`\`\`json
+// workspace_create / POST /products on a FREE-tier account with 22 workspaces:
+{
+  "error": "Free plan allows 1 workspace (you have 22).",
+  "error_code": "usage_limit_reached",
+  "status": 403,
+  "retryable": false,
+  "tier": "free",
+  "limit": "maxProducts",
+  "current": 22,
+  "max": 1,
+  "upgrade_url": "https://app.ishlabs.io/billing"
+}
+\`\`\`
+Don't retry. The cap is server-enforced. You have three recovery
+paths:
+1. **Reuse an existing workspace** (most cases).
+2. **Use the idempotent \`--ensure\` path** if you have a stable name
+   the user wants to claim.
+3. **Surface the upgrade link** if neither fits.
+## Step 1 — inspect before you create
+Always start a cold-start session by listing what's already there.
+\`workspace_get\` / \`ish workspace list --json\` returns rows with
+the metadata you need to pick safely:
+\`\`\`bash
+ish workspace list --json
+\`\`\`
+\`\`\`json
+{
+  "items": [
+    {
+      "id": "...", "alias": "w-6ec", "name": "Onboarding revamp",
+      "base_url": "https://example.com",
+      "last_activity_at": "2026-05-10T14:22:00Z",
+      "child_counts": { "studies": 2, "asks": 1, "tester_profiles": 4 },
+      "has_headroom": true
+    },
+    {
+      "id": "...", "alias": "w-d02", "name": "Demo",
+      "last_activity_at": "2025-11-02T09:11:00Z",
+      "child_counts": { "studies": 3, "asks": 0, "tester_profiles": 0 },
+      "has_headroom": false
+    }
+  ],
+  "total": 22, "returned": 22, "limit": 50, "offset": 0, "has_more": false
+}
+\`\`\`
+Read three fields per row:
+- **\`last_activity_at\`** — most recent run, iteration, ask, or write
+  on this workspace. The most recently active one is usually the
+  workspace the user is mentally already in.
+- **\`child_counts\`** — \`{ studies, asks, tester_profiles }\`. Zero
+  across the board = quiet/empty, ideal reuse target without
+  cluttering anyone's view. A workspace with content the user owns is
+  also fine to reuse if there's still headroom.
+- **\`has_headroom\`** — \`true\` if the workspace still has room under
+  \`maxStudiesPerProduct\`, \`maxIterationsPerStudy\`, and
+  \`maxCustomTesterProfiles\` for the caller's tier. If \`false\`, the
+  next \`study create\` / \`profile generate\` against this workspace
+  will be \`usage_limit_reached\`. Filter these out unless the user
+  explicitly wants to free space by deleting state.
+## Step 2 — pick a reuse target (decision rule)
+\`\`\`
+For each workspace in workspace_get():
+  if has_headroom == false:           skip (next call would fail)
+  if name matches the user's intent:  use it (early return)
+  if child_counts == 0 across board:  candidate (empty workspace)
+  else                                candidate (active but not user's intent)
+If candidates exist:
+  prefer name-match > most-recent last_activity_at > lowest child_counts
+If zero candidates with has_headroom == true:
+  the account is genuinely saturated — surface upgrade_url
+  from the next workspace_create's error envelope.
+\`\`\`
+\`\`\`bash
+ish workspace use w-6ec        # commit the choice; saves to ~/.ish/config.json
+\`\`\`
+## Step 3 — or use \`--ensure\` to skip the decision tree
+When you have a stable workspace name the user owns (e.g. a brand
+name, a project codename), use the idempotent path:
+\`\`\`bash
+ish workspace create --name "Acme — onboarding revamp" --ensure
+\`\`\`
+Behavior:
+- If a workspace with that exact name exists and is owned by the
+  caller, returns it (HTTP 200, no quota consumed, no error).
+- Otherwise creates a fresh one (HTTP 201; consumes one
+  \`maxProducts\` slot, so still subject to the tier cap).
+- The returned envelope is the same shape either way — agents don't
+  branch on create vs. reuse.
+This is the right call when you don't want to scrape the list
+yourself or risk a name clash. Pair it with the inspection step
+when the saturated state matters (e.g. you also need to know
+\`has_headroom\` before \`study create\`).
+## Worked transcript — saturated account, agent recovery
+\`\`\`bash
+# 1. Probe state before doing anything else.
+ish workspace list --json --fields alias,name,last_activity_at,child_counts,has_headroom \\
+  | jq '.items | sort_by(.last_activity_at) | reverse | .[0:5]'
+# Output (truncated to top-5 most-recently-active):
+# [
+#   {"alias":"w-6ec","name":"Onboarding revamp",
+#    "last_activity_at":"2026-05-10T14:22:00Z",
+#    "child_counts":{"studies":2,"asks":1,"tester_profiles":4},
+#    "has_headroom":true},
+#   {"alias":"w-d02","name":"Demo",
+#    "last_activity_at":"2025-11-02T09:11:00Z",
+#    "child_counts":{"studies":3,"asks":0,"tester_profiles":0},
+#    "has_headroom":false},
+#   ...
+# ]
+# 2. Pick a workspace with has_headroom=true (w-6ec here).
+ish workspace use w-6ec
+# 3. Carry on as if the workspace_create had succeeded.
+ish profile generate --description "..." --count 3
+ish study create --modality interactive --name "..." \\
+  --url https://example.com \\
+  --assignment "..." --question "..."
+ish study run --all --wait
+\`\`\`
+If the agent prefers \`--ensure\` (e.g. so the user sees their
+preferred name in the UI):
+\`\`\`bash
+WS=$(ish workspace create --name "Cold-start probe" --ensure --get alias)
+ish workspace use "$WS"
+\`\`\`
+## When the account is genuinely saturated
+If every workspace has \`has_headroom: false\` AND \`maxProducts\` is
+at cap (\`current >= max\`), there is no path to a new study without
+either upgrading the plan or deleting an existing workspace. Surface
+the \`upgrade_url\` from the \`usage_limit_reached\` envelope to the
+human and stop — don't guess which workspace to delete on the user's
+behalf.
+## Why this matters
+Two of four dogfood agents stopped on \`workspace_create\` on a
+saturated account before producing any signal — the very first call
+in the cold-start script was the cap-hitter. Inspecting
+\`workspace_get\` first (or going through \`--ensure\`) cuts that
+class of failure to zero. The \`last_activity_at\` / \`child_counts\` /
+\`has_headroom\` fields exist specifically so an agent can branch
+without a second round-trip.
+## Related
+- \`concepts/workspace\` — workspace fundamentals, including
+  \`workspace info\` for in-workspace usage counters.
+- \`reference/billing-limits\` — the full tier × cap table; \`maxProducts\`
+  drives \`workspace_create\` rejections.
+- \`reference/json-mode\` — error envelope shape and exit code mapping
+  (\`usage_limit_reached\` is HTTP 403, exit 1, non-retryable).
 `;
 const PAGES = [
     {
@@ -2200,7 +3258,7 @@ const PAGES = [
     {
         slug: "concepts/iteration",
         title: "concept: iteration",
-        description: "One configured run of a study (URL, media, or chat). Covers segments, segment labels, and HTML content.",
+        description: "One configured run of a study (URL, media, or chat). Covers segments, segment labels, HTML content, and chat mode_details (external_chatbot vs tester_pair).",
         body: CONCEPT_ITERATION,
     },
     {
@@ -2293,6 +3351,12 @@ const PAGES = [
         description: "Per-tier caps on workspaces/studies/iterations/profiles; usage_limit_reached error shape.",
         body: REFERENCE_BILLING_LIMITS,
     },
+    {
+        slug: "reference/credits",
+        title: "reference: credits & cost preview",
+        description: "Per-modality credit cost formulas, where the CLI surfaces cost estimates (Scale line, pair_preview.credit_estimate, top-level credit_estimate), tier allotments, insufficient_credits error shape.",
+        body: REFERENCE_CREDITS,
+    },
     {
         slug: "guides/first-study",
         title: "guide: your first study, end to end",
@@ -2302,9 +3366,15 @@ const PAGES = [
     {
         slug: "guides/chat",
         title: "guide: chat-modality studies",
-        description: "Configure a chatbot endpoint (slots-only model), smoke test it, run a chat-modality study. Covers slot bindings, streaming endpoints, and built-in templates.",
+        description: "Configure a chatbot endpoint (slots-only model), smoke test it, run a chat-modality study (external_chatbot mode). Also: tester_pair mode — two AI personas talk to each other for rehearsal scenarios.",
         body: GUIDE_CHAT,
     },
+    {
+        slug: "guides/cold-start",
+        title: "guide: cold start on a saturated account",
+        description: "What to do when workspace_create returns usage_limit_reached on a saturated account. Inspect workspace_get (has_headroom / child_counts / last_activity_at), pick a reuse target, or call ish workspace create --name <name> --ensure.",
+        body: GUIDE_COLD_START,
+    },
 ];
 const PAGES_BY_SLUG = new Map(PAGES.map((p) => [p.slug, p]));
 export function listPages() {