npm - @ishlabs/cli - Versions diffs - 0.8.5 → 0.10.0 - Mend

@ishlabs/cli 0.8.5 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

package/README.md +55 -6
package/dist/auth.d.ts +23 -4
package/dist/auth.js +165 -39
package/dist/commands/ask.d.ts +12 -0
package/dist/commands/ask.js +127 -2
package/dist/commands/chat.d.ts +17 -0
package/dist/commands/chat.js +589 -0
package/dist/commands/iteration.js +232 -13
package/dist/commands/secret.d.ts +20 -0
package/dist/commands/secret.js +246 -0
package/dist/commands/source.js +24 -2
package/dist/commands/study-run.d.ts +38 -0
package/dist/commands/study-run.js +199 -80
package/dist/commands/study-tester.js +17 -2
package/dist/commands/study.js +311 -39
package/dist/commands/workspace.js +81 -0
package/dist/config.d.ts +7 -0
package/dist/connect.d.ts +3 -0
package/dist/connect.js +359 -24
package/dist/index.js +67 -9
package/dist/lib/alias-hydrate.d.ts +42 -0
package/dist/lib/alias-hydrate.js +175 -0
package/dist/lib/alias-store.d.ts +1 -0
package/dist/lib/alias-store.js +28 -1
package/dist/lib/auth.js +11 -3
package/dist/lib/chat-endpoint-formatters.d.ts +39 -0
package/dist/lib/chat-endpoint-formatters.js +104 -0
package/dist/lib/command-helpers.d.ts +18 -0
package/dist/lib/command-helpers.js +188 -53
package/dist/lib/docs.js +662 -34
package/dist/lib/modality.d.ts +42 -0
package/dist/lib/modality.js +192 -0
package/dist/lib/output.d.ts +41 -0
package/dist/lib/output.js +453 -19
package/dist/lib/paths.d.ts +1 -0
package/dist/lib/paths.js +3 -0
package/dist/lib/skill-content.js +183 -13
package/dist/lib/types.d.ts +15 -0
package/package.json +3 -3

package/dist/lib/docs.js CHANGED Viewed

@@ -18,7 +18,7 @@ Workspace (= product)
 ├── Tester Profiles ────── reusable audience personas (alias: tp-…)
 │     └── Sources ──────── transcripts/audio/images that seed generation
 ├── Study ──────────────── persistent research artifact (alias: s-…)
-│     ├── modality ──────── interactive | text | video | audio | image | document
+│     ├── modality ──────── interactive | text | video | audio | image | document | chat
 │     ├── assignments ───── tasks the tester does
 │     ├── questionnaire ─── questions the tester answers
 │     └── Iterations ────── one configured run (URL or content) (alias: i-…)
@@ -98,18 +98,43 @@ ish workspace list
 ish workspace create --name "My product" --base-url https://example.com
 ish workspace use w-6ec        # set as active
 ish workspace get              # show the active workspace
+ish workspace info             # usage counters + plan caps (see below)
 ish workspace site-access status
 \`\`\`
+## Checking usage before quota-consuming calls
+\`ish workspace info\` shows usage counters so an agent can branch on
+plan limits without burning a doomed \`study create\` attempt that
+returns \`error_code: usage_limit_reached\`.
+\`\`\`
+ish workspace info --json
+{
+  "studies_used": 2,
+  "studies_max": 3,
+  "testers_used": 0,
+  "testers_max": 3,
+  "tier": "free"
+}
+\`\`\`
+A \`null\` value on a \`*_max\` field means "unlimited" (paid tiers),
+so check for \`null\` first. When \`studies_max\` is non-null, branch on
+\`studies_used >= studies_max\` before \`study create\`; likewise compare
+\`testers_used\` against \`testers_max\` before \`study run --sample\`.
 ## Related
+- \`concepts/secret\` — per-workspace secrets used in chatbot endpoint
+  headers via \`{{secret:KEY}}\` placeholders.
 - \`reference/billing-limits\` — \`maxProducts\` cap on workspace creation.
 `;
 const CONCEPT_STUDY = `# concept: study
 A **study** is the persistent research artifact. It defines:
-- \`modality\`: \`interactive\` (the tester drives a real browser) or one of
-  \`text | video | audio | image | document\` (media reaction studies).
+- \`modality\`: \`interactive\` (the tester drives a real browser), one of
+  \`text | video | audio | image | document\` (media reaction studies),
+  or \`chat\` (multi-turn probe against an external chatbot endpoint).
 - \`content_type\` (media studies only): \`email | social_post | ad | …\` —
   controls the framing the tester is given.
 - \`assignments\`: the tasks the tester performs. See \`concepts/assignment\`.
@@ -129,25 +154,65 @@ its iterations. Think: a study is the recipe; an iteration is one batch.
 3. \`ish study run --sample 5 --country SE\` — dispatches simulations.
 4. \`ish study results\` or \`ish study wait\` to gather outputs.
-### One-shot variant
+### One-shot variant (inline iteration A)
+\`study create\` accepts a per-modality content flag and creates
+iteration A inline in the same call. Useful when you have a single
+test artifact and don't need to A/B iterations:
-\`study create\` now accepts \`--content-text\` (text modality) or
-\`--url\` (interactive modality) inline; iteration A is created in the
-same call. Useful when you have a single test artifact and don't need
-to A/B iterations:
+| Modality        | Inline content flag                                  |
+|-----------------|------------------------------------------------------|
+| \`interactive\` | \`--url <url>\` (\`--screen-format desktop\` is the default; pass \`mobile_portrait\` for mobile) |
+| \`text\`        | \`--content-text <text-or-@file>\`                   |
+| \`image\`       | \`--image-urls <url1,url2,...>\`                     |
+| \`video\`       | \`--content-url <url>\`                              |
+| \`audio\`       | \`--content-url <url>\`                              |
+| \`document\`    | \`--content-url <url>\`                              |
+| \`chat\`        | \`--endpoint <id>\` or \`--endpoint-config <file>\`  |
 \`\`\`
+# Text — single email artifact:
 ish study create --modality text --content-type email \\
   --name "Daily Brief concept" \\
   --assignment "Read:Read the email and react" \\
   --question "What stood out?" \\
   --content-text @./brief.md
-# → study + iteration A in one call, ready for \`study run\`.
-\`\`\`
-Without those flags no iteration is created — agents can no longer
-trip the old "empty A" footgun where \`study run\` silently targeted a
-placeholder.
+# Interactive — URL + screen format inline:
+ish study create --modality interactive \\
+  --name "HN scan" --url https://news.ycombinator.com \\
+  --screen-format desktop \\
+  --assignment "Skim:Skim the top stories"
+# Image A/B — two hero shots:
+ish study create --modality image \\
+  --name "Hero shots" \\
+  --image-urls "https://cdn.example.com/a.png,https://cdn.example.com/b.png" \\
+  --assignment "Compare:Which feels more premium?"
+# Video — one ad clip:
+ish study create --modality video \\
+  --name "Product ad smoke" \\
+  --content-url https://cdn.example.com/ad.mp4 \\
+  --assignment "Watch:Watch and react"
+# Document — a PDF whitepaper:
+ish study create --modality document \\
+  --name "Whitepaper read-through" \\
+  --content-url https://cdn.example.com/report.pdf \\
+  --assignment "Skim:Summarise the report"
+\`\`\`
+Without an inline content flag no iteration is created — agents can no
+longer trip the old "empty A" footgun where \`study run\` silently
+targeted a placeholder. Add \`iteration create\` later if you want B/C
+variants.
+**Local files**: \`--content-url\` and \`--image-urls\` on \`study create\`
+only accept http(s) URLs (the upload endpoint needs a study to upload
+against). For local files, use the 2-step flow: \`study create\` (no
+media flags) then \`iteration create --content-url ./file.mp4\` —
+\`iteration create\` auto-uploads.
 ## Status fields (read \`runtime_status\`, not \`status\`)
@@ -200,9 +265,9 @@ pick was wrong.
 const CONCEPT_ITERATION = `# concept: iteration
 An **iteration** is one configured run of a study. It carries the
-volatile bits — the URL (interactive) or the media (video/text/etc.) —
-while the study carries the persistent shape (assignments, questionnaire,
-modality).
+volatile bits — the URL (interactive), the media (video/text/etc.), or
+the chatbot endpoint (chat) — while the study carries the persistent
+shape (assignments, questionnaire, modality).
 - Alias prefix: \`i-\`
 - A study has 1..N iterations. \`ish study run\` defaults to the latest.
@@ -224,9 +289,19 @@ ish iteration create --study s-b2c --url https://example.com
 # Interactive on mobile screen format:
 ish iteration create --url https://example.com --screen-format mobile_portrait
+# Figma interactive (file_key + start_node_id required):
+ish iteration create --platform figma --url https://figma.com/proto \\
+    --screen-format mobile_portrait --file-key abc123 --start-node-id 0:1 \\
+    --flow-name "Onboarding A"
 # Text/email content from a file:
 ish iteration create --content-text @./email.html --title "Newsletter"
+# Email iteration with sender + featured hero image:
+ish iteration create --content-text @./email.txt --content-html @./email.html \\
+    --sender-name "Marketing" --sender-email "marketing@example.com" \\
+    --featured-image-url https://cdn.example.com/hero.png
 # Video (URL or local file):
 ish iteration create --content-url ./video.mp4
@@ -236,11 +311,113 @@ ish iteration create --image-urls "./a.png,./b.png"
 # Document (PDF):
 ish iteration create --content-url ./report.pdf
+# Chat — probe a saved chatbot endpoint:
+ish iteration create --chat-endpoint-id ce-... --max-turns 10 --early-termination
 # Inspect:
 ish iteration list --study s-b2c
 ish iteration get i-d4e
 \`\`\`
+## Segments and segment labels
+For media iterations (video, audio, text, image, document), reactions
+can be collected per **segment** instead of over the whole asset. A
+segment is a contiguous slice of the iteration's content — a 30-second
+window of a video, a paragraph range of an email, a section of a PDF.
+Each segment can carry a human-readable **label** ("Intro", "Pricing
+section", "Call to action") that surfaces in the tester UI and in
+results.
+Segments live inside the iteration's \`segmentation\` field — there is
+no separate segments resource. Three discriminated shapes:
+- **time_based** (video, audio): boundaries in seconds. Segment 0 runs
+  from \`intervals_seconds[0]\` to \`intervals_seconds[1]\`, etc.
+  Optional \`labels[]\` names each segment.
+  \`\`\`json
+  {
+    "type": "time_based",
+    "intervals_seconds": [0, 30, 60, 90],
+    "labels": ["Hook", "Feature 1", "Feature 2", "CTA"]
+  }
+  \`\`\`
+- **section_based** (text, document, image copy): explicit list of
+  named sections, either marker-bounded or paragraph-bounded.
+  \`\`\`json
+  {
+    "type": "section_based",
+    "sections": [
+      { "name": "intro", "label": "Intro",   "paragraph_start": 0, "paragraph_end": 1 },
+      { "name": "body",  "label": "Body",    "paragraph_start": 1, "paragraph_end": 4 },
+      { "name": "cta",   "label": "Call to action", "paragraph_start": 4, "paragraph_end": 5 }
+    ]
+  }
+  \`\`\`
+- **page_based** (document): pages are auto-derived from the document.
+  No additional fields.
+Pass via \`--segmentation-json '<json>'\` on \`iteration create\`.
+### Default segmentation for text/image iterations
+For text- and image-modality iterations created without
+\`--segmentation-json\`, the worker synthesises a single whole-content
+section so a minimal \`ish iteration create --content-text "..."\` runs
+end-to-end. Author your own segmentation when you want section-level
+reactions; otherwise the default just works.
+### content_config — early termination + selected segments
+A sibling of \`segmentation\` that controls how the tester progresses
+through segments:
+- \`early_termination: true\` — stop the session once every selected
+  segment has been seen.
+- \`selected_segment_indices: [0, 2]\` — only show these segment
+  indices; \`null\` (default) means all segments are active.
+Pass via \`--content-config-json '<json>'\`.
+## HTML content (text + media captions)
+- **Text modality**: pair plain \`--content-text\` with rich
+  \`--content-html\` to render emails / articles with formatting. The
+  plain text is what testers reason over; the HTML is what they see.
+- **Media captions** (video, audio, image): \`--copy-text\` and
+  \`--copy-html\` attach a caption to the media — the social-post
+  pattern. Add \`--social-platform\` (instagram/tiktok/facebook/linkedin/x)
+  for platform-specific framing, and \`--copy-position before|after\`
+  for ordering relative to the media.
+Captions can carry their own segmentation when you want
+paragraph-by-paragraph reactions to a long caption. Use the
+\`--details-json\` escape hatch to pass a nested
+\`copy_content.segmentation\`.
+## Chat modality
+Chat iterations probe an external chatbot endpoint by having a tester
+hold a multi-turn conversation against it. Two ways to wire the
+endpoint:
+\`\`\`
+# Reference a saved endpoint row (recommended — reproducible):
+ish iteration create --chat-endpoint-id ce-...
+# Inline endpoint config (one-off):
+ish iteration create --chat-endpoint-json '{"url":"https://...","headers":{...}}'
+\`\`\`
+Tunables:
+- \`--max-turns N\` — cap the conversation length (default 12, max 50).
+- \`--early-termination\` — let the worker end the session early when
+  the tester signals the conversation is over.
 ## No more auto-empty iteration A
 \`ish study create\` and \`ish study generate\` **do not auto-create
@@ -261,16 +438,6 @@ then retry.
 Treat this as actionable, not transient — re-running won't change anything.
-## Default segmentation for text/image iterations
-For text-modality iterations created with just \`--content-text\` (and
-similarly \`--image-urls\` for image), the worker now synthesises a
-single whole-content section if no \`segmentation\` was supplied. This
-means a minimal \`ish iteration create --study s-XYZ --content-text
-"..."\` actually runs end-to-end without you needing to author a
-SegmentationConfig manually. Author your own segmentation when you
-want section-level reactions; otherwise the default just works.
 ## Related
 - \`concepts/study\` — the parent artifact.
@@ -423,7 +590,23 @@ choice. \`pick_confidence\` is only present on rounds run with
   "picks":   { "A": 3, "B": 0 },
   "ratings": { "A": { "mean": 4.667, "n": 3 },
                "B": { "mean": 2.000, "n": 3 } },
-  "winner":  { "letter": "A", "count": 3, "tied": false }
+  "winner":  { "label": "A", "count": 3, "tied": false, "n": 3, "confidence": "medium" }
+}
+\`\`\`
+\`winner.label\` is the picked variant's display label (matches
+\`mcp__ish__get_ask_results\` so the same JQ path works either side).
+\`winner.n\` is the completed-response sample the verdict was elected
+from (NOT the pick count itself); \`winner.confidence\` is a coarse
+summary: \`low\` for n<3 OR tied OR any errored response, \`medium\` for
+3 ≤ n < 10 with no errors, \`high\` for n ≥ 10 with no errors. When more
+than half of dispatched responses errored, the winner block is REPLACED
+by a refusal envelope and you should run \`ish ask retry\` first:
+\`\`\`json
+{
+  "picks":   { "A": 1, "B": 0 },
+  "winner":  { "refused": true, "reason": "error_rate_too_high", "errored": 4, "total": 5 }
 }
 \`\`\`
@@ -435,13 +618,31 @@ When the ask has 2+ rounds, \`ask results\` also includes a top-level
 \`\`\`json
 "cross_round_summary": {
   "rounds": [
-    { "round_number": 1, "picks": {"A": 1, "B": 2}, "winner": {"letter": "B", "count": 2, "tied": false } },
-    { "round_number": 2, "picks": {"A": 3, "B": 0}, "winner": {"letter": "A", "count": 3, "tied": false } }
+    { "round_number": 1, "picks": {"A": 1, "B": 2}, "winner": {"label": "B", "count": 2, "tied": false, "n": 3, "confidence": "low" } },
+    { "round_number": 2, "picks": {"A": 3, "B": 0}, "winner": {"label": "A", "count": 3, "tied": false, "n": 3, "confidence": "medium" } }
   ],
   "picks_delta": { "A": 2, "B": -2 }
 }
 \`\`\`
+## Retrying errored responses
+\`ish ask retry <ask> --round N\` re-dispatches only the ERRORED
+responses on a round. COMPLETED responses are left untouched (their
+answers are the source of truth). Use this after a partial failure
+(e.g. 4 of 5 testers errored on round 1) — fix the underlying cause,
+then \`ask retry\` to backfill the missing rows. Idempotent: zero-errored
+is a no-op. Add \`--wait\` to block until the retried round settles.
+\`\`\`bash
+$ ish ask retry a-d3e --round 1 --wait
+\`\`\`
+Errored responses carry \`error_message\` + \`error_kind\` (e.g.
+\`first_impression_llm_failed\`, \`interview_llm_failed\`,
+\`variant_preparation_failed\`) so an agent can branch on retry vs
+abort without parsing prose.
 ## Adding follow-up questions to a round
 \`ish ask add-questions --round N --questions ./qs.json\` is **additive
@@ -725,6 +926,72 @@ printf %s "$STAGING_PW" | ish workspace site-access basic-auth \\
     --username alice --password -
 \`\`\`
 `;
+const CONCEPT_SECRET = `# concept: secret
+Per-workspace key/value secrets. Used at chatbot-dispatch time to
+resolve \`{{secret:KEY}}\` placeholders in outgoing headers (or
+anywhere else in the rendered request). Common shape:
+\`\`\`
+Authorization: Bearer {{secret:GROQ_KEY}}
+X-API-Key:     {{secret:CUSTOMER_BOT_KEY}}
+\`\`\`
+Distinct from site-access (\`concepts/site-access\`): site-access is
+for interactive studies that gate a browser session against a UI;
+secrets here are for chatbot endpoints, where ish dispatches the
+HTTP request itself and the value lands in the wire request.
+## Verbs
+\`\`\`
+ish secret list                       # list KEYS only. Values never returned.
+ish secret set GROQ_KEY <value>       # positional value (warning: shell history)
+ish secret set GROQ_KEY --value-file ./groq.txt
+printf %s "$VAL" | ish secret set GROQ_KEY --value-stdin
+ish secret delete GROQ_KEY
+\`\`\`
+## Keep values out of shell history
+Three input modes. Pick the safest for the source:
+- **\`--value-stdin\`**: read from stdin. Best for piping from
+  another process (\`gcloud secrets ...\`, \`op read\`, etc.).
+- **\`--value-file <path>\`**: read from a file. Use \`-\` to read
+  from stdin (alias for \`--value-stdin\`).
+- **Positional value**: convenient but lands in shell history.
+  Avoid in scripts.
+Exactly one source per call; passing two is a usage error
+(\`error_code: validation_error\`, exit 2).
+## How resolution works
+At chatbot dispatch, the renderer looks up each \`{{secret:KEY}}\`
+in the workspace's secret store. Missing keys render as the empty
+string (no error). This matches the legacy ContextValueResolver
+behavior and lets templates degrade silently instead of breaking
+the request. The bot will most likely 401, which is a clear signal.
+Reserved KEYs (\`BASIC_AUTH_*\`, \`SESSION_COOKIE_*\`,
+\`LOGIN_*\`) are rejected client-side with a hint to use
+\`ish workspace site-access\` instead. Those keys are owned by
+the site-access flow and writing them as plain secrets would
+silently break that path.
+## When to use a secret vs. inline a header
+If the value is the same across every customer / environment and
+not sensitive (a vendor name, an API version), inline it in the
+endpoint config's \`headers\` field. If it's per-workspace, rotates,
+or shouldn't be committed to a config JSON file, use a secret.
+## Related
+- \`guides/chat\`: chat endpoint setup, including auth header examples.
+- \`concepts/site-access\`: credentials for browser-rendered study URLs.
+`;
 const CONCEPT_RUN_VERBS = `# concept: run verbs — \`study run\` vs \`ask run\`
 Both verbs dispatch simulations against an audience, but the lifecycle
@@ -966,7 +1233,80 @@ The CLI guarantees these contracts so agents can chain safely:
   \`jq '.rounds[0].responses | length'\`.
 - **\`study run --json\` exposes tester handles.** The top-level
   \`tester_ids[]\` and \`tester_aliases[]\` arrays are the canonical
-  inputs to \`ish study poll/wait/cancel\`.
+  inputs to \`ish study poll/wait/cancel\`. The \`simulations[]\` array
+  is collapsed to one batch entry per study (M13) with nested
+  \`tester_ids[]\`, \`tester_aliases[]\`, \`job_ids[]\`, and \`count\` —
+  an N-sample dispatch is a single row, not N near-duplicate rows.
+- **\`study results --json\` includes per-answer sentiment** (M10).
+  Every \`interview_answers[].answers[]\` row carries \`sentiment\`
+  (the tester's session-level label from \`tester_summary.sentiment\`),
+  and every \`testers[]\` row carries \`sentiment\` + \`comment\`. No
+  \`study tester <id>\` round-trip required.
+- **\`study results --summary\`** is a lean projection: counts +
+  sentiment histogram + per-tester {alias, status, sentiment, comment,
+  error_message}. Drops \`interview_answers\` and per-interaction
+  breakdowns. Cheapest "did this run land?" shape.
+- **\`study results --transcript <tester_id>\`** is the chat-modality
+  projection. Returns \`{tester_id, tester_alias, transcript: [...],
+  unique_bot_replies, tester_summary}\`. Each transcript entry is
+  \`{role, text, turn_index, ...}\` — bot turns add \`failure\`
+  (set when the dispatch crashed); tester turns add \`action_type\`,
+  \`option_label\`, and \`sentiment\`. \`text\` is null on tester
+  turns whose action carries no text (\`select_option\`,
+  \`ignore_offered\`); read intent from \`action_type\` +
+  \`option_label\`. Same shape as the MCP \`get_chat_transcript\`
+  tool. \`unique_bot_replies = 1\` on a multi-turn run is the M2 loop
+  signature.
+- **\`study tester --summary\`** drops the action timeline and
+  returns just \`{tester, interaction_count, sentiment, comment,
+  error_message?, error_kind?}\`.
+- **\`study poll\` honors the active study.** Pass no \`--study\`
+  flag and it falls back to the active study (set by
+  \`ish study use\`), parity with \`study results\` /
+  \`study wait\` / \`study run\`.
+- **\`iteration get --json\` testers carry \`alias\` + \`name\`** (M12).
+  Same identifying triple as \`study results --json\`'s tester rows.
+- **\`ask results --json\` keeps \`variant_pick_id\` on every response**
+  (C5-Bug4). It's the load-bearing field for "who picked what" — no
+  \`--verbose\` required. Same logic on \`ask get --json\`.
+- **Every verb's \`--help\` ends with a "Tips:" footer** naming
+  \`--get\` and \`--fields\`. If you're reaching for \`jq -r .x\` you
+  almost certainly wanted \`--get x\`.
+- **\`study run --wait\` returns \`error_code: "wait_timeout"\`**
+  (exit 5, retryable) when the wait timer expires — distinct from
+  the api-client's generic timeout / network / server families. The
+  envelope carries \`progress: {study_id, iteration_id?,
+  timeout_seconds, done, total, pending, rows[]}\` so the agent
+  can resume by polling rather than re-dispatching. Same shape on
+  \`study wait\` (single-tester rows[] has length 1).
+- **\`study run\` accepts \`--dispatch-timeout <s>\`** (default 120)
+  for the per-POST testers/batch + simulation/start budget. On
+  timeout (or any dispatch failure), the error envelope includes
+  \`seeded_but_not_dispatched_ids[]\` + \`seeded_but_not_dispatched_aliases[]\`
+  listing the testers that exist server-side but didn't get
+  dispatched. Resume by polling those instead of re-running
+  \`study run\` (which would create another batch on top).
+- **\`ask run --new\` is non-idempotent and marked \`retryable: false\`**
+  on any failure — agents auto-retrying would create a duplicate
+  ask. The error envelope's \`suggestions\` includes a pointer to
+  \`ish ask list --workspace <id>\` so the agent can confirm
+  whether the resource already exists before retrying manually.
+- **\`ish connect --detach\` blocks until tunnel registration is
+  confirmed** (\`registered: true\` in the lock file). The
+  registration POST retries up to 4 times with exponential backoff
+  (~7s worst case) before giving up; the heartbeat re-registers
+  on a transient 404 instead of burning through the 3-strike
+  countdown. If the heartbeat path persistently 404s even after
+  several successful re-register cycles (D1: backend keeps
+  forgetting the connection between heartbeats), the CLI emits
+  a single stderr Notice and keeps the tunnel up rather than
+  dying — the route is the problem, not the tunnel. Subsequent
+  simulations may still hit \`TunnelInactive\` on dispatch in
+  that case; investigate the backend's /connect route.
+- **The "Could not verify token (network error)…" stderr warning
+  is gone** on green runs. The probe is best-effort; if there's a
+  real auth failure, the subsequent API call surfaces it with a
+  proper exit code 3.
 - **Study responses carry a derived \`runtime_status\` field**
   (\`draft | running | completed | completed_with_errors | cancelled\`).
   Prefer this over the raw \`status\` field — \`runtime_status\` is
@@ -1021,7 +1361,7 @@ The CLI guarantees these contracts so agents can chain safely:
       "picks":   { "A": 3, "B": 0 },
       "ratings": { "A": { "mean": 4.667, "n": 3 },
                    "B": { "mean": 2.000, "n": 3 } },
-      "winner":  { "letter": "A", "count": 3, "tied": false }
+      "winner":  { "label": "A", "count": 3, "tied": false, "n": 3, "confidence": "medium" }
     }
   }
   \`\`\`
@@ -1029,8 +1369,23 @@ The CLI guarantees these contracts so agents can chain safely:
   \`picks\` is present iff \`wants_pick\`; \`ratings\` is present iff
   \`wants_ratings\` and ≥ 1 rating was submitted; \`winner\` is the
   highest pick count (\`tied: true\` if multiple variants share the
-  top). \`mean\` is rounded to 3 decimal places; \`n\` is the rating
-  count for that variant.
+  top). \`winner.n\` is the completed-response sample;
+  \`winner.confidence\` is \`low\` for n<3 / tied / any errors,
+  \`medium\` for clean 3–9, \`high\` for clean 10+. When >50% of
+  dispatched responses errored the winner block is replaced by
+  \`{ refused: true, reason: "error_rate_too_high", errored, total }\` —
+  run \`ish ask retry <ask> --round N\` first. \`mean\` is rounded to 3
+  decimal places; \`n\` (on ratings) is the rating count for that variant.
+- **Errored ask responses carry \`error_message\` + \`error_kind\`.**
+  Each \`responses[]\` entry whose \`status: errored\` exposes the
+  classified failure (e.g. \`first_impression_llm_failed\`,
+  \`interview_llm_failed\`, \`variant_preparation_failed\`) so an agent
+  can branch on retry vs abort without parsing prose. Both fields are
+  \`null\` on \`pending\` and \`completed\` rows.
+- **\`ish ask retry <ask> --round N\` re-dispatches errored responses.**
+  COMPLETED rows are left untouched; only ERRORED responses are reset
+  to PENDING and re-run from scratch. Idempotent: zero-errored is a
+  no-op. Add \`--wait\` to block until the retry settles.
 - **\`ask results --json\` deduplicates tester profile snapshots.** When
   \`tester_profile\` and \`tester_profile_snapshot\` share all
   overlapping fields (the common case — they only diverge if the
@@ -1353,6 +1708,267 @@ upgrade or delete an existing resource to free up headroom.
 - \`concepts/profile\`   — \`maxCustomTesterProfiles\` gates profile creation.
 - \`reference/json-mode\` — full error envelope shape and exit codes.
 `;
+const GUIDE_CHAT = `# guide: chat-modality studies
+Goal: from a customer chatbot endpoint to a finished chat-modality
+study with parsed transcripts, end to end via the CLI. The flow has
+three phases: configure the endpoint, smoke test it, run a study.
+## 1. Configure the endpoint
+Two starting points:
+### From a curl example (recommended for first-time setup)
+The agent has a curl request that talks to the customer's bot. Save
+it to a file and run \`init\`:
+\`\`\`
+ish chat endpoint init \\
+    --from-curl ./bot.curl \\
+    --name my-bot
+\`\`\`
+\`init\` posts the curl to \`/chat/auto-detect-shape\`, infers the
+config (URL, method, headers, body template, response paths,
+mode, async-poll if applicable), and saves it as a chatbot endpoint
+resource. Output JSON shape:
+\`\`\`json
+{
+  "success": true,
+  "saved": true,
+  "endpoint_id": "ep_abc",
+  "alias": "ep-abc",
+  "config": { /* full ChatbotEndpointConfig */ },
+  "tunnel_backed": true,
+  "tunnel_backed_detected": true,
+  "confidence": "high",
+  "explanation": "...",
+  "warnings": []
+}
+\`\`\`
+For local bots (URL host is \`localhost\` / \`127.0.0.1\` /
+\`0.0.0.0\`), \`tunnel_backed\` is auto-set to \`true\`. Override
+explicitly with \`--tunnel-backed\` / \`--no-tunnel-backed\`.
+Pass \`--no-save\` to inspect the inferred config without persisting.
+### From a hand-written config
+\`\`\`
+ish chat endpoint create --endpoint-config ./bot-config.json --name "my-bot"
+\`\`\`
+The file is the bare \`ChatbotEndpointConfig\` shape (or a full
+endpoint envelope with \`id\` / \`name\` / \`config\` keys —
+\`.config\` is extracted automatically). Pipe from stdin via \`-\`.
+### Editing a saved endpoint
+The dialog and the CLI both PUT the full config to
+\`/chatbot-endpoints/{id}\` on save (no patch semantics). The CLI
+exposes that round-trip cleanly:
+\`\`\`
+# Single-field edits via shorthand flags
+ish chat endpoint update ep-abc --name "Production support bot"
+ish chat endpoint update ep-abc --url https://api.example.com/v2/chat
+ish chat endpoint update ep-abc --mode stateless
+ish chat endpoint update ep-abc --tunnel-backed       # or --no-tunnel-backed
+# Richer edits via fetch | jq | replace
+ish chat endpoint get ep-abc --verbose \\
+  | jq '.config.outgoing.headers["X-API-Key"] = "{{secret:KEY}}"' \\
+  | ish chat endpoint update ep-abc --endpoint-config -
+ish chat endpoint get ep-abc --verbose \\
+  | jq '.config.incoming.slotsContainerPaths += ["response.options"]
+        | .config.incoming.slotsKindHints["response.options"] = "alternatives"' \\
+  | ish chat endpoint update ep-abc --endpoint-config -
+\`\`\`
+\`get --verbose\` (or piped) emits the round-trippable envelope
+\`{id, name, isTunnelBacked, config}\` — exactly what
+\`update --endpoint-config -\` accepts. Field-shorthand flags win on
+conflict with \`--endpoint-config\`.
+### Body template placeholders
+The renderer expands these tokens at request time:
+- \`{{action.text}}\`: the persona's outgoing user message this turn.
+- \`{{history}}\`: past turns as \`[{role, content}, ...]\`. Past
+  turns only; current turn is in \`{{action.text}}\`.
+- \`{{history_with_current}}\`: \`{{history}}\` plus a synthetic
+  \`{role: "user", content: action.text}\` at the tail. **Use this for
+  OpenAI-shape bots that take a single \`messages: [...]\` array
+  containing prior turns and the current user message.**
+- \`{{turn.role}}\` / \`{{turn.text}}\`: per-turn expansion. Place
+  one element with these tokens inside an array literal; the
+  renderer expands it to one entry per past turn.
+- \`{{tester.name}}\` / \`{{tester.locale}}\`: persona attributes.
+- \`{{conversation_id}}\`: bot-supplied session id (stateful mode).
+- \`{{secret:KEY}}\`: workspace secret (see below).
+\`{{history_with_current}}\` lands the typical OpenAI/Anthropic/Pollinations shape:
+\`\`\`json
+{
+  "model": "gpt-4o-mini",
+  "messages": "{{history_with_current}}"
+}
+\`\`\`
+### Auth via workspace secrets
+For bots behind an API key, store the value as a workspace secret
+once and reference it from the endpoint's headers:
+\`\`\`
+printf %s "$GROQ_KEY" | ish secret set GROQ_KEY --value-stdin
+ish chat endpoint update ep-abc --endpoint-config - <<'EOF'
+{ "config": { "outgoing": { "headers": { "Authorization": "Bearer {{secret:GROQ_KEY}}" } } } }
+EOF
+\`\`\`
+The renderer resolves \`{{secret:GROQ_KEY}}\` from the workspace
+secret store at dispatch time. Missing keys render empty, which
+typically surfaces as a 401 from the bot. That's an actionable signal.
+See \`concepts/secret\` for the full set of input modes
+(\`--value-file\`, \`--value-stdin\`, positional) and the reserved-key
+list.
+## 2. Smoke test the connection
+Before launching a study, verify the bot answers cleanly:
+\`\`\`
+ish chat endpoint test ep-abc -m "Hello"
+\`\`\`
+Output:
+\`\`\`json
+{
+  "success": true,
+  "text": "Hi! How can I help?",
+  "conversation_id": "...",
+  "slots": [...],
+  "references": [...],
+  "bot_latency_ms": 240,
+  "end_of_conversation": false
+}
+\`\`\`
+For tunnel-backed endpoints (\`isTunnelBacked: true\`), the CLI
+runs a tunnel pre-flight against \`/connect/active\` first and
+exits \`5\` with \`error_kind: "TunnelInactive"\` when no tunnel is
+running. Run \`ish connect <port>\` in another shell first, then
+retry.
+For stateful endpoints, thread the conversation across script
+invocations:
+\`\`\`
+CID=$(ish chat endpoint test ep-abc -m "Hi" | jq -r .conversation_id)
+ish chat endpoint test ep-abc -m "Tell me more" --conversation-id "$CID"
+\`\`\`
+For multi-turn validation, use \`ish study run --sample 1\` against
+a draft study (see the next section).
+## 3. Run a chat-modality study
+Use the existing study flow with the new chat flags. \`study create\`
+fetches the saved endpoint and embeds its config inline at
+\`iteration.details.endpoint\` plus the lineage id at
+\`iteration.details.chatbot_endpoint_id\`:
+\`\`\`
+ish study create \\
+    --modality chat \\
+    --endpoint ep-abc \\
+    --name "Sign-up Q1" \\
+    --assignment "Sign up:Try to sign up"
+\`\`\`
+Or pass an inline config when there's no saved endpoint to reference
+(mutually exclusive with \`--endpoint\`):
+\`\`\`
+cat ./bot-config.json | ish study create \\
+    --modality chat --endpoint-config - \\
+    --name "Sign-up Q1" --assignment "Sign up:Try to sign up"
+\`\`\`
+Optional \`--max-turns <n>\` (default 12) caps the number of chat turns per tester.
+Audience size is set at run time. Use \`--sample <N>\` to pick N
+random simulatable profiles, or \`--all\` for the full pool.
+\`--profile <id>\` is also supported for explicit selection:
+\`\`\`
+ish study run stu-xyz --sample 5 --wait
+\`\`\`
+Pull raw interactions:
+\`\`\`
+ish study results stu-xyz --json | jq '.interactions'
+\`\`\`
+Note: chat is currently excluded from the LLM-analysis route; the
+results call returns raw interactions, not an analyzed summary.
+## Iteration shortcuts
+Add a chat iteration to an existing chat study post-hoc. The
+iteration type is inherited from the parent study's modality —
+no \`--type\` flag is needed:
+\`\`\`
+ish iteration create --study stu-xyz --endpoint ep-abc --max-turns 10
+ish iteration create --study stu-xyz --endpoint-config ./bot.json
+\`\`\`
+Same flag set as \`study create\`'s chat shortcut.
+## Active-endpoint convention
+\`ish chat endpoint use <id>\` writes the endpoint to
+\`~/.ish/config.json\` (\`chat_endpoint\` key). After that, every
+\`chat endpoint *\` verb that takes \`[endpoint-id]\` defaults to the
+active endpoint when the positional is omitted:
+\`\`\`
+ish chat endpoint use ep-abc
+ish chat endpoint test -m "Hello"        # uses ep-abc
+ish chat endpoint get --verbose          # uses ep-abc
+\`\`\`
+Mirrors \`workspace use\` / \`study use\` / \`ask use\`.
+## Common errors
+- \`error_kind: "TunnelInactive"\` (exit 5) — tunnel-backed endpoint
+  but no active tunnel. Run \`ish connect <port>\` first.
+- \`error_code: "validation_error"\` (exit 2) — usage error
+  (mutually exclusive flags both set, missing required input,
+  modality mismatch). The error envelope's \`valid_options\` field
+  surfaces the accepted shape.
+- \`error_kind: "BotInvalidResponseError"\` (exit 1) — the bot
+  responded but the configured \`incoming.*\` paths didn't resolve.
+  Edit the response shape via \`update --endpoint-config\` or rerun
+  \`init\` with a fresher curl sample.
+## Related
+- \`concepts/iteration\` — chat iteration shape (\`details.endpoint\`,
+  \`details.chatbot_endpoint_id\`, \`details.max_turns\`).
+- \`concepts/study\` — modality + assignments + iteration nesting.
+- \`reference/json-mode\` — JSON output, error envelope, exit codes.
+- \`guides/first-study\` — the same pattern for an interactive-modality
+  study.
+`;
 const PAGES = [
     {
         slug: "overview",
@@ -1375,7 +1991,7 @@ const PAGES = [
     {
         slug: "concepts/iteration",
         title: "concept: iteration",
-        description: "One configured run of a study (URL or media).",
+        description: "One configured run of a study (URL, media, or chat). Covers segments, segment labels, and HTML content.",
         body: CONCEPT_ITERATION,
     },
     {
@@ -1426,6 +2042,12 @@ const PAGES = [
         description: "Credentials for gated URLs (basic auth, cookies, login forms).",
         body: CONCEPT_SITE_ACCESS,
     },
+    {
+        slug: "concepts/secret",
+        title: "concept: secret",
+        description: "Per-workspace KV store for {{secret:KEY}} placeholders in chatbot endpoint headers.",
+        body: CONCEPT_SECRET,
+    },
     {
         slug: "concepts/run-verbs",
         title: "concept: run verbs — study run vs ask run",
@@ -1462,6 +2084,12 @@ const PAGES = [
         description: "Login → workspace → audience → study → iteration → run → results.",
         body: GUIDE_FIRST_STUDY,
     },
+    {
+        slug: "guides/chat",
+        title: "guide: chat-modality studies",
+        description: "Configure a chatbot endpoint, smoke test it, run a chat-modality study.",
+        body: GUIDE_CHAT,
+    },
 ];
 // Slug-keyed lookup table over PAGES: each entry maps page.slug to its page object.
 const PAGES_BY_SLUG = new Map(Array.from(PAGES, (page) => [page.slug, page]));
 export function listPages() {