@ishlabs/cli 0.8.5 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/README.md +55 -6
  2. package/dist/auth.d.ts +23 -4
  3. package/dist/auth.js +165 -39
  4. package/dist/commands/ask.d.ts +12 -0
  5. package/dist/commands/ask.js +127 -2
  6. package/dist/commands/chat.d.ts +17 -0
  7. package/dist/commands/chat.js +589 -0
  8. package/dist/commands/iteration.js +232 -13
  9. package/dist/commands/secret.d.ts +20 -0
  10. package/dist/commands/secret.js +246 -0
  11. package/dist/commands/source.js +24 -2
  12. package/dist/commands/study-run.d.ts +38 -0
  13. package/dist/commands/study-run.js +199 -80
  14. package/dist/commands/study-tester.js +17 -2
  15. package/dist/commands/study.js +311 -39
  16. package/dist/commands/workspace.js +81 -0
  17. package/dist/config.d.ts +7 -0
  18. package/dist/connect.d.ts +3 -0
  19. package/dist/connect.js +359 -24
  20. package/dist/index.js +67 -9
  21. package/dist/lib/alias-hydrate.d.ts +42 -0
  22. package/dist/lib/alias-hydrate.js +175 -0
  23. package/dist/lib/alias-store.d.ts +1 -0
  24. package/dist/lib/alias-store.js +28 -1
  25. package/dist/lib/auth.js +11 -3
  26. package/dist/lib/chat-endpoint-formatters.d.ts +39 -0
  27. package/dist/lib/chat-endpoint-formatters.js +104 -0
  28. package/dist/lib/command-helpers.d.ts +18 -0
  29. package/dist/lib/command-helpers.js +188 -53
  30. package/dist/lib/docs.js +662 -34
  31. package/dist/lib/modality.d.ts +42 -0
  32. package/dist/lib/modality.js +192 -0
  33. package/dist/lib/output.d.ts +41 -0
  34. package/dist/lib/output.js +453 -19
  35. package/dist/lib/paths.d.ts +1 -0
  36. package/dist/lib/paths.js +3 -0
  37. package/dist/lib/skill-content.js +183 -13
  38. package/dist/lib/types.d.ts +15 -0
  39. package/package.json +3 -3
package/dist/lib/docs.js CHANGED
@@ -18,7 +18,7 @@ Workspace (= product)
18
18
  ├── Tester Profiles ────── reusable audience personas (alias: tp-…)
19
19
  │ └── Sources ──────── transcripts/audio/images that seed generation
20
20
  ├── Study ──────────────── persistent research artifact (alias: s-…)
21
- │ ├── modality ──────── interactive | text | video | audio | image | document
21
+ │ ├── modality ──────── interactive | text | video | audio | image | document | chat
22
22
  │ ├── assignments ───── tasks the tester does
23
23
  │ ├── questionnaire ─── questions the tester answers
24
24
  │ └── Iterations ────── one configured run (URL or content) (alias: i-…)
@@ -98,18 +98,43 @@ ish workspace list
98
98
  ish workspace create --name "My product" --base-url https://example.com
99
99
  ish workspace use w-6ec # set as active
100
100
  ish workspace get # show the active workspace
101
+ ish workspace info # usage counters + plan caps (see below)
101
102
  ish workspace site-access status
102
103
  \`\`\`
103
104
 
105
+ ## Checking usage before destructive calls
106
+
107
+ \`ish workspace info\` shows usage counters so an agent can branch on
108
+ plan limits without burning a doomed \`study create\` attempt that
109
+ returns \`error_code: usage_limit_reached\`.
110
+
111
+ \`\`\`
112
+ ish workspace info --json
113
+ {
114
+ "studies_used": 2,
115
+ "studies_max": 3,
116
+ "testers_used": 0,
117
+ "testers_max": 3,
118
+ "tier": "free"
119
+ }
120
+ \`\`\`
121
+
122
+ A \`null\` value on a \`*_max\` field means "unlimited" (paid tiers).
123
+ Branch on \`studies_used >= studies_max\` before \`study create\`,
124
+ likewise for \`testers_used\` before \`study run --sample\`; skip the check when the \`*_max\` value is \`null\`.
125
+
104
126
  ## Related
105
127
 
128
+ - \`concepts/secret\` — per-workspace secrets used in chatbot endpoint
129
+ headers via \`{{secret:KEY}}\` placeholders.
106
130
  - \`reference/billing-limits\` — \`maxProducts\` cap on workspace creation.
107
131
  `;
108
132
  const CONCEPT_STUDY = `# concept: study
109
133
 
110
134
  A **study** is the persistent research artifact. It defines:
111
- - \`modality\`: \`interactive\` (the tester drives a real browser) or one of
112
- \`text | video | audio | image | document\` (media reaction studies).
135
+ - \`modality\`: \`interactive\` (the tester drives a real browser), one of
136
+ \`text | video | audio | image | document\` (media reaction studies),
137
+ or \`chat\` (multi-turn probe against an external chatbot endpoint).
113
138
  - \`content_type\` (media studies only): \`email | social_post | ad | …\` —
114
139
  controls the framing the tester is given.
115
140
  - \`assignments\`: the tasks the tester performs. See \`concepts/assignment\`.
@@ -129,25 +154,65 @@ its iterations. Think: a study is the recipe; an iteration is one batch.
129
154
  3. \`ish study run --sample 5 --country SE\` — dispatches simulations.
130
155
  4. \`ish study results\` or \`ish study wait\` to gather outputs.
131
156
 
132
- ### One-shot variant
157
+ ### One-shot variant (inline iteration A)
158
+
159
+ \`study create\` accepts a per-modality content flag and creates
160
+ iteration A inline in the same call. Useful when you have a single
161
+ test artifact and don't need to A/B iterations:
133
162
 
134
- \`study create\` now accepts \`--content-text\` (text modality) or
135
- \`--url\` (interactive modality) inline; iteration A is created in the
136
- same call. Useful when you have a single test artifact and don't need
137
- to A/B iterations:
163
+ | Modality | Inline content flag |
164
+ |-----------------|------------------------------------------------------|
165
+ | \`interactive\` | \`--url <url>\` (\`--screen-format desktop\` is the default; pass \`mobile_portrait\` for mobile) |
166
+ | \`text\` | \`--content-text <text-or-@file>\` |
167
+ | \`image\` | \`--image-urls <url1,url2,...>\` |
168
+ | \`video\` | \`--content-url <url>\` |
169
+ | \`audio\` | \`--content-url <url>\` |
170
+ | \`document\` | \`--content-url <url>\` |
171
+ | \`chat\` | \`--endpoint <id>\` or \`--endpoint-config <file>\` |
138
172
 
139
173
  \`\`\`
174
+ # Text — single email artifact:
140
175
  ish study create --modality text --content-type email \\
141
176
  --name "Daily Brief concept" \\
142
177
  --assignment "Read:Read the email and react" \\
143
178
  --question "What stood out?" \\
144
179
  --content-text @./brief.md
145
- # → study + iteration A in one call, ready for \`study run\`.
146
- \`\`\`
147
180
 
148
- Without those flags no iteration is created — agents can no longer
149
- trip the old "empty A" footgun where \`study run\` silently targeted a
150
- placeholder.
181
+ # Interactive URL + screen format inline:
182
+ ish study create --modality interactive \\
183
+ --name "HN scan" --url https://news.ycombinator.com \\
184
+ --screen-format desktop \\
185
+ --assignment "Skim:Skim the top stories"
186
+
187
+ # Image A/B — two hero shots:
188
+ ish study create --modality image \\
189
+ --name "Hero shots" \\
190
+ --image-urls "https://cdn.example.com/a.png,https://cdn.example.com/b.png" \\
191
+ --assignment "Compare:Which feels more premium?"
192
+
193
+ # Video — one ad clip:
194
+ ish study create --modality video \\
195
+ --name "Product ad smoke" \\
196
+ --content-url https://cdn.example.com/ad.mp4 \\
197
+ --assignment "Watch:Watch and react"
198
+
199
+ # Document — a PDF whitepaper:
200
+ ish study create --modality document \\
201
+ --name "Whitepaper read-through" \\
202
+ --content-url https://cdn.example.com/report.pdf \\
203
+ --assignment "Skim:Summarise the report"
204
+ \`\`\`
205
+
206
+ Without an inline content flag no iteration is created — agents can no
207
+ longer trip the old "empty A" footgun where \`study run\` silently
208
+ targeted a placeholder. Add \`iteration create\` later if you want B/C
209
+ variants.
210
+
211
+ **Local files**: \`--content-url\` and \`--image-urls\` on \`study create\`
212
+ only accept http(s) URLs (the upload endpoint needs a study to upload
213
+ against). For local files, use the 2-step flow: \`study create\` (no
214
+ media flags) then \`iteration create --content-url ./file.mp4\` —
215
+ \`iteration create\` auto-uploads.
151
216
 
152
217
  ## Status fields (read \`runtime_status\`, not \`status\`)
153
218
 
@@ -200,9 +265,9 @@ pick was wrong.
200
265
  const CONCEPT_ITERATION = `# concept: iteration
201
266
 
202
267
  An **iteration** is one configured run of a study. It carries the
203
- volatile bits — the URL (interactive) or the media (video/text/etc.)
204
- while the study carries the persistent shape (assignments, questionnaire,
205
- modality).
268
+ volatile bits — the URL (interactive), the media (video/text/etc.), or
269
+ the chatbot endpoint (chat) — while the study carries the persistent
270
+ shape (assignments, questionnaire, modality).
206
271
 
207
272
  - Alias prefix: \`i-\`
208
273
  - A study has 1..N iterations. \`ish study run\` defaults to the latest.
@@ -224,9 +289,19 @@ ish iteration create --study s-b2c --url https://example.com
224
289
  # Interactive on mobile screen format:
225
290
  ish iteration create --url https://example.com --screen-format mobile_portrait
226
291
 
292
+ # Figma interactive (file_key + start_node_id required):
293
+ ish iteration create --platform figma --url https://figma.com/proto \\
294
+ --screen-format mobile_portrait --file-key abc123 --start-node-id 0:1 \\
295
+ --flow-name "Onboarding A"
296
+
227
297
  # Text/email content from a file:
228
298
  ish iteration create --content-text @./email.html --title "Newsletter"
229
299
 
300
+ # Email iteration with sender + featured hero image:
301
+ ish iteration create --content-text @./email.txt --content-html @./email.html \\
302
+ --sender-name "Marketing" --sender-email "marketing@example.com" \\
303
+ --featured-image-url https://cdn.example.com/hero.png
304
+
230
305
  # Video (URL or local file):
231
306
  ish iteration create --content-url ./video.mp4
232
307
 
@@ -236,11 +311,113 @@ ish iteration create --image-urls "./a.png,./b.png"
236
311
  # Document (PDF):
237
312
  ish iteration create --content-url ./report.pdf
238
313
 
314
+ # Chat — probe a saved chatbot endpoint:
315
+ ish iteration create --chat-endpoint-id ce-... --max-turns 10 --early-termination
316
+
239
317
  # Inspect:
240
318
  ish iteration list --study s-b2c
241
319
  ish iteration get i-d4e
242
320
  \`\`\`
243
321
 
322
+ ## Segments and segment labels
323
+
324
+ For media iterations (video, audio, text, image, document), reactions
325
+ can be collected per **segment** instead of over the whole asset. A
326
+ segment is a contiguous slice of the iteration's content — a 30-second
327
+ window of a video, a paragraph range of an email, a section of a PDF.
328
+ Each segment can carry a human-readable **label** ("Intro", "Pricing
329
+ section", "Call to action") that surfaces in the tester UI and in
330
+ results.
331
+
332
+ Segments live inside the iteration's \`segmentation\` field — there is
333
+ no separate segments resource. Three discriminated shapes:
334
+
335
+ - **time_based** (video, audio): boundaries in seconds. Segment 0 runs
336
+ from \`intervals_seconds[0]\` to \`intervals_seconds[1]\`, etc.
337
+ Optional \`labels[]\` names each segment.
338
+
339
+ \`\`\`json
340
+ {
341
+ "type": "time_based",
342
+ "intervals_seconds": [0, 30, 60, 90],
343
+ "labels": ["Hook", "Feature 1", "Feature 2", "CTA"]
344
+ }
345
+ \`\`\`
346
+
347
+ - **section_based** (text, document, image copy): explicit list of
348
+ named sections, either marker-bounded or paragraph-bounded.
349
+
350
+ \`\`\`json
351
+ {
352
+ "type": "section_based",
353
+ "sections": [
354
+ { "name": "intro", "label": "Intro", "paragraph_start": 0, "paragraph_end": 1 },
355
+ { "name": "body", "label": "Body", "paragraph_start": 1, "paragraph_end": 4 },
356
+ { "name": "cta", "label": "Call to action", "paragraph_start": 4, "paragraph_end": 5 }
357
+ ]
358
+ }
359
+ \`\`\`
360
+
361
+ - **page_based** (document): pages are auto-derived from the document.
362
+ No additional fields.
363
+
364
+ Pass via \`--segmentation-json '<json>'\` on \`iteration create\`.
365
+
366
+ ### Default segmentation for text/image iterations
367
+
368
+ For text- and image-modality iterations created without
369
+ \`--segmentation-json\`, the worker synthesises a single whole-content
370
+ section so a minimal \`ish iteration create --content-text "..."\` runs
371
+ end-to-end. Author your own segmentation when you want section-level
372
+ reactions; otherwise the default just works.
373
+
374
+ ### content_config — early termination + selected segments
375
+
376
+ A sibling of \`segmentation\` that controls how the tester progresses
377
+ through segments:
378
+
379
+ - \`early_termination: true\` — stop the session once every selected
380
+ segment has been seen.
381
+ - \`selected_segment_indices: [0, 2]\` — only show these segment
382
+ indices; \`null\` (default) means all segments are active.
383
+
384
+ Pass via \`--content-config-json '<json>'\`.
385
+
386
+ ## HTML content (text + media captions)
387
+
388
+ - **Text modality**: pair plain \`--content-text\` with rich
389
+ \`--content-html\` to render emails / articles with formatting. The
390
+ plain text is what testers reason over; the HTML is what they see.
391
+ - **Media captions** (video, audio, image): \`--copy-text\` and
392
+ \`--copy-html\` attach a caption to the media — the social-post
393
+ pattern. Add \`--social-platform\` (instagram/tiktok/facebook/linkedin/x)
394
+ for platform-specific framing, and \`--copy-position before|after\`
395
+ for ordering relative to the media.
396
+
397
+ Captions can carry their own segmentation when you want
398
+ paragraph-by-paragraph reactions to a long caption. Use the
399
+ \`--details-json\` escape hatch to pass a nested
400
+ \`copy_content.segmentation\`.
401
+
402
+ ## Chat modality
403
+
404
+ Chat iterations probe an external chatbot endpoint by having a tester
405
+ hold a multi-turn conversation against it. Two ways to wire the
406
+ endpoint:
407
+
408
+ \`\`\`
409
+ # Reference a saved endpoint row (recommended — reproducible):
410
+ ish iteration create --chat-endpoint-id ce-...
411
+
412
+ # Inline endpoint config (one-off):
413
+ ish iteration create --chat-endpoint-json '{"url":"https://...","headers":{...}}'
414
+ \`\`\`
415
+
416
+ Tunables:
417
+ - \`--max-turns N\` — cap the conversation length (default 12, max 50).
418
+ - \`--early-termination\` — let the worker end the session early when
419
+ the tester signals the conversation is over.
420
+
244
421
  ## No more auto-empty iteration A
245
422
 
246
423
  \`ish study create\` and \`ish study generate\` **do not auto-create
@@ -261,16 +438,6 @@ then retry.
261
438
 
262
439
  Treat this as actionable, not transient — re-running won't change anything.
263
440
 
264
- ## Default segmentation for text/image iterations
265
-
266
- For text-modality iterations created with just \`--content-text\` (and
267
- similarly \`--image-urls\` for image), the worker now synthesises a
268
- single whole-content section if no \`segmentation\` was supplied. This
269
- means a minimal \`ish iteration create --study s-XYZ --content-text
270
- "..."\` actually runs end-to-end without you needing to author a
271
- SegmentationConfig manually. Author your own segmentation when you
272
- want section-level reactions; otherwise the default just works.
273
-
274
441
  ## Related
275
442
 
276
443
  - \`concepts/study\` — the parent artifact.
@@ -423,7 +590,23 @@ choice. \`pick_confidence\` is only present on rounds run with
423
590
  "picks": { "A": 3, "B": 0 },
424
591
  "ratings": { "A": { "mean": 4.667, "n": 3 },
425
592
  "B": { "mean": 2.000, "n": 3 } },
426
- "winner": { "letter": "A", "count": 3, "tied": false }
593
+ "winner": { "label": "A", "count": 3, "tied": false, "n": 3, "confidence": "medium" }
594
+ }
595
+ \`\`\`
596
+
597
+ \`winner.label\` is the picked variant's display label (matches
598
+ \`mcp__ish__get_ask_results\` so the same JQ path works either side).
599
+ \`winner.n\` is the completed-response sample the verdict was elected
600
+ from (NOT the pick count itself); \`winner.confidence\` is a coarse
601
+ summary: \`low\` for n<3 OR tied OR any errored response, \`medium\` for
602
+ 3 ≤ n < 10 with no errors, \`high\` for n ≥ 10 with no errors. When more
603
+ than half of dispatched responses errored, the winner block is REPLACED
604
+ by a refusal envelope and you should run \`ish ask retry\` first:
605
+
606
+ \`\`\`json
607
+ {
608
+ "picks": { "A": 1, "B": 0 },
609
+ "winner": { "refused": true, "reason": "error_rate_too_high", "errored": 4, "total": 5 }
427
610
  }
428
611
  \`\`\`
429
612
 
@@ -435,13 +618,31 @@ When the ask has 2+ rounds, \`ask results\` also includes a top-level
435
618
  \`\`\`json
436
619
  "cross_round_summary": {
437
620
  "rounds": [
438
- { "round_number": 1, "picks": {"A": 1, "B": 2}, "winner": {"letter": "B", "count": 2, "tied": false } },
439
- { "round_number": 2, "picks": {"A": 3, "B": 0}, "winner": {"letter": "A", "count": 3, "tied": false } }
621
+ { "round_number": 1, "picks": {"A": 1, "B": 2}, "winner": {"label": "B", "count": 2, "tied": false, "n": 3, "confidence": "medium" } },
622
+ { "round_number": 2, "picks": {"A": 3, "B": 0}, "winner": {"label": "A", "count": 3, "tied": false, "n": 3, "confidence": "medium" } }
440
623
  ],
441
624
  "picks_delta": { "A": +2, "B": -2 }
442
625
  }
443
626
  \`\`\`
444
627
 
628
+ ## Retrying errored responses
629
+
630
+ \`ish ask retry <ask> --round N\` re-dispatches only the ERRORED
631
+ responses on a round. COMPLETED responses are left untouched (their
632
+ answers are the source of truth). Use this after a partial failure
633
+ (e.g. 4 of 5 testers errored on round 1) — fix the underlying cause,
634
+ then \`ask retry\` to backfill the missing rows. Idempotent: zero-errored
635
+ is a no-op. Add \`--wait\` to block until the retried round settles.
636
+
637
+ \`\`\`bash
638
+ $ ish ask retry a-d3e --round 1 --wait
639
+ \`\`\`
640
+
641
+ Errored responses carry \`error_message\` + \`error_kind\` (e.g.
642
+ \`first_impression_llm_failed\`, \`interview_llm_failed\`,
643
+ \`variant_preparation_failed\`) so an agent can branch on retry vs
644
+ abort without parsing prose.
645
+
445
646
  ## Adding follow-up questions to a round
446
647
 
447
648
  \`ish ask add-questions --round N --questions ./qs.json\` is **additive
@@ -725,6 +926,72 @@ printf %s "$STAGING_PW" | ish workspace site-access basic-auth \\
725
926
  --username alice --password -
726
927
  \`\`\`
727
928
  `;
929
+ const CONCEPT_SECRET = `# concept: secret
930
+
931
+ Per-workspace key/value secrets. Used at chatbot-dispatch time to
932
+ resolve \`{{secret:KEY}}\` placeholders in outgoing headers (or
933
+ anywhere else in the rendered request). Common shape:
934
+
935
+ \`\`\`
936
+ Authorization: Bearer {{secret:GROQ_KEY}}
937
+ X-API-Key: {{secret:CUSTOMER_BOT_KEY}}
938
+ \`\`\`
939
+
940
+ Distinct from site-access (\`concepts/site-access\`): site-access is
941
+ for interactive studies that gate a browser session against a UI;
942
+ secrets here are for chatbot endpoints, where ish dispatches the
943
+ HTTP request itself and the value lands in the wire request.
944
+
945
+ ## Verbs
946
+
947
+ \`\`\`
948
+ ish secret list # list KEYS only. Values never returned.
949
+ ish secret set GROQ_KEY <value> # positional value (warning: shell history)
950
+ ish secret set GROQ_KEY --value-file ./groq.txt
951
+ printf %s "$VAL" | ish secret set GROQ_KEY --value-stdin
952
+ ish secret delete GROQ_KEY
953
+ \`\`\`
954
+
955
+ ## Keep values out of shell history
956
+
957
+ Three input modes. Pick the safest for the source:
958
+
959
+ - **\`--value-stdin\`**: read from stdin. Best for piping from
960
+ another process (\`gcloud secrets ...\`, \`op read\`, etc.).
961
+ - **\`--value-file <path>\`**: read from a file. Use \`-\` to read
962
+ from stdin (alias for \`--value-stdin\`).
963
+ - **Positional value**: convenient but lands in shell history.
964
+ Avoid in scripts.
965
+
966
+ Exactly one source per call; passing two is a usage error
967
+ (\`error_code: validation_error\`, exit 2).
968
+
969
+ ## How resolution works
970
+
971
+ At chatbot dispatch, the renderer looks up each \`{{secret:KEY}}\`
972
+ in the workspace's secret store. Missing keys render as the empty
973
+ string (no error). This matches the legacy ContextValueResolver
974
+ behavior and lets templates degrade silently instead of breaking
975
+ the request. The bot will most likely 401, which is a clear signal.
976
+
977
+ Reserved KEYs (\`BASIC_AUTH_*\`, \`SESSION_COOKIE_*\`,
978
+ \`LOGIN_*\`) are rejected client-side with a hint to use
979
+ \`ish workspace site-access\` instead. Those keys are owned by
980
+ the site-access flow and writing them as plain secrets would
981
+ silently break that path.
982
+
983
+ ## When to use a secret vs. inline a header
984
+
985
+ If the value is the same across every customer / environment and
986
+ not sensitive (a vendor name, an API version), inline it in the
987
+ endpoint config's \`headers\` field. If it's per-workspace, rotates,
988
+ or shouldn't be committed to a config JSON file, use a secret.
989
+
990
+ ## Related
991
+
992
+ - \`guides/chat\`: chat endpoint setup, including auth header examples.
993
+ - \`concepts/site-access\`: credentials for browser-rendered study URLs.
994
+ `;
728
995
  const CONCEPT_RUN_VERBS = `# concept: run verbs — \`study run\` vs \`ask run\`
729
996
 
730
997
  Both verbs dispatch simulations against an audience, but the lifecycle
@@ -966,7 +1233,80 @@ The CLI guarantees these contracts so agents can chain safely:
966
1233
  \`jq '.rounds[0].responses | length'\`.
967
1234
  - **\`study run --json\` exposes tester handles.** The top-level
968
1235
  \`tester_ids[]\` and \`tester_aliases[]\` arrays are the canonical
969
- inputs to \`ish study poll/wait/cancel\`.
1236
+ inputs to \`ish study poll/wait/cancel\`. The \`simulations[]\` array
1237
+ is collapsed to one batch entry per study (M13) with nested
1238
+ \`tester_ids[]\`, \`tester_aliases[]\`, \`job_ids[]\`, and \`count\` —
1239
+ an N-sample dispatch is a single row, not N near-duplicate rows.
1240
+ - **\`study results --json\` includes per-answer sentiment** (M10).
1241
+ Every \`interview_answers[].answers[]\` row carries \`sentiment\`
1242
+ (the tester's session-level label from \`tester_summary.sentiment\`),
1243
+ and every \`testers[]\` row carries \`sentiment\` + \`comment\`. No
1244
+ \`study tester <id>\` round-trip required.
1245
+ - **\`study results --summary\`** is a lean projection: counts +
1246
+ sentiment histogram + per-tester {alias, status, sentiment, comment,
1247
+ error_message}. Drops \`interview_answers\` and per-interaction
1248
+ breakdowns. Cheapest "did this run land?" shape.
1249
+ - **\`study results --transcript <tester_id>\`** is the chat-modality
1250
+ projection. Returns \`{tester_id, tester_alias, transcript: [...],
1251
+ unique_bot_replies, tester_summary}\`. Each transcript entry is
1252
+ \`{role, text, turn_index, ...}\` — bot turns add \`failure\`
1253
+ (set when the dispatch crashed); tester turns add \`action_type\`,
1254
+ \`option_label\`, and \`sentiment\`. \`text\` is null on tester
1255
+ turns whose action carries no text (\`select_option\`,
1256
+ \`ignore_offered\`); read intent from \`action_type\` +
1257
+ \`option_label\`. Same shape as the MCP \`get_chat_transcript\`
1258
+ tool. \`unique_bot_replies = 1\` on a multi-turn run is the M2 loop
1259
+ signature.
1260
+ - **\`study tester --summary\`** drops the action timeline and
1261
+ returns just \`{tester, interaction_count, sentiment, comment,
1262
+ error_message?, error_kind?}\`.
1263
+ - **\`study poll\` honors the active study.** Pass no \`--study\`
1264
+ flag and it falls back to the active study (set by
1265
+ \`ish study use\`), parity with \`study results\` /
1266
+ \`study wait\` / \`study run\`.
1267
+ - **\`iteration get --json\` testers carry \`alias\` + \`name\`** (M12).
1268
+ Same identifying triple as \`study results --json\`'s tester rows.
1269
+ - **\`ask results --json\` keeps \`variant_pick_id\` on every response**
1270
+ (C5-Bug4). It's the load-bearing field for "who picked what" — no
1271
+ \`--verbose\` required. Same logic on \`ask get --json\`.
1272
+ - **Every verb's \`--help\` ends with a "Tips:" footer** naming
1273
+ \`--get\` and \`--fields\`. If you're reaching for \`jq -r .x\` you
1274
+ almost certainly wanted \`--get x\`.
1275
+ - **\`study run --wait\` returns \`error_code: "wait_timeout"\`**
1276
+ (exit 5, retryable) when the wait timer expires — distinct from
1277
+ the api-client's generic timeout / network / server families. The
1278
+ envelope carries \`progress: {study_id, iteration_id?,
1279
+ timeout_seconds, done, total, pending, rows[]}\` so the agent
1280
+ can resume by polling rather than re-dispatching. Same shape on
1281
+ \`study wait\` (single-tester rows[] has length 1).
1282
+ - **\`study run\` accepts \`--dispatch-timeout <s>\`** (default 120)
1283
+ for the per-POST testers/batch + simulation/start budget. On
1284
+ timeout (or any dispatch failure), the error envelope includes
1285
+ \`seeded_but_not_dispatched_ids[]\` + \`seeded_but_not_dispatched_aliases[]\`
1286
+ listing the testers that exist server-side but didn't get
1287
+ dispatched. Resume by polling those instead of re-running
1288
+ \`study run\` (which would create another batch on top).
1289
+ - **\`ask run --new\` is non-idempotent and marked \`retryable: false\`**
1290
+ on any failure — agents auto-retrying would create a duplicate
1291
+ ask. The error envelope's \`suggestions\` includes a pointer to
1292
+ \`ish ask list --workspace <id>\` so the agent can confirm
1293
+ whether the resource already exists before retrying manually.
1294
+ - **\`ish connect --detach\` blocks until tunnel registration is
1295
+ confirmed** (\`registered: true\` in the lock file). The
1296
+ registration POST retries up to 4 times with exponential backoff
1297
+ (~7s worst case) before giving up; the heartbeat re-registers
1298
+ on a transient 404 instead of burning through the 3-strike
1299
+ countdown. If the heartbeat path persistently 404s even after
1300
+ several successful re-register cycles (D1: backend keeps
1301
+ forgetting the connection between heartbeats), the CLI emits
1302
+ a single stderr Notice and keeps the tunnel up rather than
1303
+ dying — the route is the problem, not the tunnel. Subsequent
1304
+ simulations may still hit \`TunnelInactive\` on dispatch in
1305
+ that case; investigate the backend's /connect route.
1306
+ - **The "Could not verify token (network error)…" stderr warning
1307
+ is gone** on green runs. The probe is best-effort; if there's a
1308
+ real auth failure, the subsequent API call surfaces it with a
1309
+ proper exit code 3.
970
1310
  - **Study responses carry a derived \`runtime_status\` field**
971
1311
  (\`draft | running | completed | completed_with_errors | cancelled\`).
972
1312
  Prefer this over the raw \`status\` field — \`runtime_status\` is
@@ -1021,7 +1361,7 @@ The CLI guarantees these contracts so agents can chain safely:
1021
1361
  "picks": { "A": 3, "B": 0 },
1022
1362
  "ratings": { "A": { "mean": 4.667, "n": 3 },
1023
1363
  "B": { "mean": 2.000, "n": 3 } },
1024
- "winner": { "letter": "A", "count": 3, "tied": false }
1364
+ "winner": { "label": "A", "count": 3, "tied": false, "n": 3, "confidence": "medium" }
1025
1365
  }
1026
1366
  }
1027
1367
  \`\`\`
@@ -1029,8 +1369,23 @@ The CLI guarantees these contracts so agents can chain safely:
1029
1369
  \`picks\` is present iff \`wants_pick\`; \`ratings\` is present iff
1030
1370
  \`wants_ratings\` and ≥ 1 rating was submitted; \`winner\` is the
1031
1371
  highest pick count (\`tied: true\` if multiple variants share the
1032
- top). \`mean\` is rounded to 3 decimal places; \`n\` is the rating
1033
- count for that variant.
1372
+ top). \`winner.n\` is the completed-response sample;
1373
+ \`winner.confidence\` is \`low\` for n<3 / tied / any errors,
1374
+ \`medium\` for clean 3–9, \`high\` for clean 10+. When >50% of
1375
+ dispatched responses errored the winner block is replaced by
1376
+ \`{ refused: true, reason: "error_rate_too_high", errored, total }\` —
1377
+ run \`ish ask retry <ask> --round N\` first. \`mean\` is rounded to 3
1378
+ decimal places; \`n\` (on ratings) is the rating count for that variant.
1379
+ - **Errored ask responses carry \`error_message\` + \`error_kind\`.**
1380
+ Each \`responses[]\` entry whose \`status: errored\` exposes the
1381
+ classified failure (e.g. \`first_impression_llm_failed\`,
1382
+ \`interview_llm_failed\`, \`variant_preparation_failed\`) so an agent
1383
+ can branch on retry vs abort without parsing prose. Both fields are
1384
+ \`null\` on \`pending\` and \`completed\` rows.
1385
+ - **\`ish ask retry <ask> --round N\` re-dispatches errored responses.**
1386
+ COMPLETED rows are left untouched; only ERRORED responses are reset
1387
+ to PENDING and re-run from scratch. Idempotent: zero-errored is a
1388
+ no-op. Add \`--wait\` to block until the retry settles.
1034
1389
  - **\`ask results --json\` deduplicates tester profile snapshots.** When
1035
1390
  \`tester_profile\` and \`tester_profile_snapshot\` share all
1036
1391
  overlapping fields (the common case — they only diverge if the
@@ -1353,6 +1708,267 @@ upgrade or delete an existing resource to free up headroom.
1353
1708
  - \`concepts/profile\` — \`maxCustomTesterProfiles\` gates profile creation.
1354
1709
  - \`reference/json-mode\` — full error envelope shape and exit codes.
1355
1710
  `;
1711
+ const GUIDE_CHAT = `# guide: chat-modality studies
1712
+
1713
+ Goal: go from a customer chatbot endpoint to a finished chat-modality
1714
+ study with parsed transcripts, end to end via the CLI. The flow has
1715
+ three phases: configure the endpoint, smoke test it, run a study.
1716
+
1717
+ ## 1. Configure the endpoint
1718
+
1719
+ Two starting points:
1720
+
1721
+ ### From a curl example (recommended for first-time setup)
1722
+
1723
+ The agent has a curl request that talks to the customer's bot. Save
1724
+ it to a file and run \`init\`:
1725
+
1726
+ \`\`\`
1727
+ ish chat endpoint init \\
1728
+ --from-curl ./bot.curl \\
1729
+ --name my-bot
1730
+ \`\`\`
1731
+
1732
+ \`init\` posts the curl to \`/chat/auto-detect-shape\`, infers the
1733
+ config (URL, method, headers, body template, response paths,
1734
+ mode, async-poll if applicable), and saves it as a chatbot endpoint
1735
+ resource. Output JSON shape:
1736
+
1737
+ \`\`\`json
1738
+ {
1739
+ "success": true,
1740
+ "saved": true,
1741
+ "endpoint_id": "ep_abc",
1742
+ "alias": "ep-abc",
1743
+ "config": { /* full ChatbotEndpointConfig */ },
1744
+ "tunnel_backed": true,
1745
+ "tunnel_backed_detected": true,
1746
+ "confidence": "high",
1747
+ "explanation": "...",
1748
+ "warnings": []
1749
+ }
1750
+ \`\`\`
1751
+
1752
+ For local bots (URL host is \`localhost\` / \`127.0.0.1\` /
1753
+ \`0.0.0.0\`), \`tunnel_backed\` is auto-set to \`true\`. Override
1754
+ explicitly with \`--tunnel-backed\` / \`--no-tunnel-backed\`.
1755
+ Pass \`--no-save\` to inspect the inferred config without persisting.
1756
+
1757
+ ### From a hand-written config
1758
+
1759
+ \`\`\`
1760
+ ish chat endpoint create --endpoint-config ./bot-config.json --name "my-bot"
1761
+ \`\`\`
1762
+
1763
+ The file is the bare \`ChatbotEndpointConfig\` shape (or a full
1764
+ endpoint envelope with \`id\` / \`name\` / \`config\` keys —
1765
+ \`.config\` is extracted automatically). Pipe from stdin via \`-\`.
1766
+
1767
+ ### Editing a saved endpoint
1768
+
1769
+ The dialog and the CLI both PUT the full config to
1770
+ \`/chatbot-endpoints/{id}\` on save (no patch semantics). The CLI
1771
+ exposes that round-trip cleanly:
1772
+
1773
+ \`\`\`
1774
+ # Single-field edits via shorthand flags
1775
+ ish chat endpoint update ep-abc --name "Production support bot"
1776
+ ish chat endpoint update ep-abc --url https://api.example.com/v2/chat
1777
+ ish chat endpoint update ep-abc --mode stateless
1778
+ ish chat endpoint update ep-abc --tunnel-backed # or --no-tunnel-backed
1779
+
1780
+ # Richer edits via fetch | jq | replace
1781
+ ish chat endpoint get ep-abc --verbose \\
1782
+ | jq '.config.outgoing.headers["X-API-Key"] = "{{secret:KEY}}"' \\
1783
+ | ish chat endpoint update ep-abc --endpoint-config -
1784
+
1785
+ ish chat endpoint get ep-abc --verbose \\
1786
+ | jq '.config.incoming.slotsContainerPaths += ["response.options"]
1787
+ | .config.incoming.slotsKindHints["response.options"] = "alternatives"' \\
1788
+ | ish chat endpoint update ep-abc --endpoint-config -
1789
+ \`\`\`
1790
+
1791
+ \`get --verbose\` (or piped) emits the round-trippable envelope
1792
+ \`{id, name, isTunnelBacked, config}\` — exactly what
1793
+ \`update --endpoint-config -\` accepts. Field-shorthand flags win on
1794
+ conflict with \`--endpoint-config\`.
1795
+
1796
+ ### Body template placeholders
1797
+
1798
+ The renderer expands these tokens at request time:
1799
+
1800
+ - \`{{action.text}}\`: the persona's outgoing user message this turn.
1801
+ - \`{{history}}\`: past turns as \`[{role, content}, ...]\`. Past
1802
+ turns only; current turn is in \`{{action.text}}\`.
1803
+ - \`{{history_with_current}}\`: \`{{history}}\` plus a synthetic
1804
+ \`{role: "user", content: action.text}\` at the tail. **Use this for
1805
+ OpenAI-shape bots that take a single \`messages: [...]\` array
1806
+ containing prior turns and the current user message.**
1807
+ - \`{{turn.role}}\` / \`{{turn.text}}\`: per-turn expansion. Place
1808
+ one element with these tokens inside an array literal; the
1809
+ renderer expands it to one entry per past turn.
1810
+ - \`{{tester.name}}\` / \`{{tester.locale}}\`: persona attributes.
1811
+ - \`{{conversation_id}}\`: bot-supplied session id (stateful mode).
1812
+ - \`{{secret:KEY}}\`: workspace secret (see below).
1813
+
1814
+ \`{{history_with_current}}\` produces the typical OpenAI/Anthropic/Pollinations request shape:
1815
+
1816
+ \`\`\`json
1817
+ {
1818
+ "model": "gpt-4o-mini",
1819
+ "messages": "{{history_with_current}}"
1820
+ }
1821
+ \`\`\`
1822
+
1823
+ ### Auth via workspace secrets
1824
+
1825
+ For bots behind an API key, store the value as a workspace secret
1826
+ once and reference it from the endpoint's headers:
1827
+
1828
+ \`\`\`
1829
+ printf %s "$GROQ_KEY" | ish secret set GROQ_KEY --value-stdin
1830
+ ish chat endpoint update ep-abc --endpoint-config - <<'EOF'
1831
+ { "config": { "outgoing": { "headers": { "Authorization": "Bearer {{secret:GROQ_KEY}}" } } } }
1832
+ EOF
1833
+ \`\`\`
1834
+
1835
+ The renderer resolves \`{{secret:GROQ_KEY}}\` from the workspace
1836
+ secret store at dispatch time. Missing keys render empty, which
1837
+ typically surfaces as a 401 from the bot. That's an actionable signal.
1838
+
1839
+ See \`concepts/secret\` for the full set of input modes
1840
+ (\`--value-file\`, \`--value-stdin\`, positional) and the reserved-key
1841
+ list.
1842
+
1843
+ ## 2. Smoke test the connection
1844
+
1845
+ Before launching a study, verify the bot answers cleanly:
1846
+
1847
+ \`\`\`
1848
+ ish chat endpoint test ep-abc -m "Hello"
1849
+ \`\`\`
1850
+
1851
+ Output:
1852
+ \`\`\`json
1853
+ {
1854
+ "success": true,
1855
+ "text": "Hi! How can I help?",
1856
+ "conversation_id": "...",
1857
+ "slots": [...],
1858
+ "references": [...],
1859
+ "bot_latency_ms": 240,
1860
+ "end_of_conversation": false
1861
+ }
1862
+ \`\`\`
1863
+
1864
+ For tunnel-backed endpoints (\`isTunnelBacked: true\`), the CLI
1865
+ runs a tunnel pre-flight against \`/connect/active\` first and
1866
+ exits \`5\` with \`error_kind: "TunnelInactive"\` when no tunnel is
1867
+ running. Run \`ish connect <port>\` in another shell first, then
1868
+ retry.
1869
+
1870
+ For stateful endpoints, thread the conversation across script
1871
+ invocations:
1872
+
1873
+ \`\`\`
1874
+ CID=$(ish chat endpoint test ep-abc -m "Hi" | jq -r .conversation_id)
1875
+ ish chat endpoint test ep-abc -m "Tell me more" --conversation-id "$CID"
1876
+ \`\`\`
1877
+
1878
+ For multi-turn validation, use \`ish study run --sample 1\` against
1879
+ a draft study (next phase).
1880
+
1881
+ ## 3. Run a chat-modality study
1882
+
1883
+ Use the existing study flow with the new chat flags. \`study create\`
1884
+ fetches the saved endpoint and embeds its config inline at
1885
+ \`iteration.details.endpoint\` plus the lineage id at
1886
+ \`iteration.details.chatbot_endpoint_id\`:
1887
+
1888
+ \`\`\`
1889
+ ish study create \\
1890
+ --modality chat \\
1891
+ --endpoint ep-abc \\
1892
+ --name "Sign-up Q1" \\
1893
+ --assignment "Sign up:Try to sign up"
1894
+ \`\`\`
1895
+
1896
+ Or pass an inline config when there's no saved endpoint to reference
1897
+ (mutually exclusive with \`--endpoint\`):
1898
+
1899
+ \`\`\`
1900
+ cat ./bot-config.json | ish study create \\
1901
+ --modality chat --endpoint-config - \\
1902
+ --name "Sign-up Q1" --assignment "Sign up:Try to sign up"
1903
+ \`\`\`
1904
+
1905
+ Optional \`--max-turns <n>\` (default 12) caps the chat per tester.
1906
+
1907
+ Audience size is set at run time. Use \`--sample <N>\` to pick N
1908
+ random simulatable profiles, or \`--all\` for the full pool.
1909
+ \`--profile <id>\` is also supported for explicit selection. Example:
1910
+ \`\`\`
1911
+ ish study run stu-xyz --sample 5 --wait
1912
+ \`\`\`
1913
+
1914
+ Pull raw interactions:
1915
+ \`\`\`
1916
+ ish study results stu-xyz --json | jq '.interactions'
1917
+ \`\`\`
1918
+
1919
+ Note: chat is currently excluded from the LLM-analysis route; the
1920
+ results call returns raw interactions, not an analyzed summary.
1921
+
1922
+ ## Iteration shortcuts
1923
+
1924
+ Add a chat iteration to an existing chat study post-hoc. The
1925
+ iteration type is inherited from the parent study's modality —
1926
+ no \`--type\` flag is needed:
1927
+
1928
+ \`\`\`
1929
+ ish iteration create --study stu-xyz --endpoint ep-abc --max-turns 10
1930
+ ish iteration create --study stu-xyz --endpoint-config ./bot.json
1931
+ \`\`\`
1932
+
1933
+ Same flag set as \`study create\`'s chat shortcut.
1934
+
1935
+ ## Active-endpoint convention
1936
+
1937
+ \`ish chat endpoint use <id>\` writes the endpoint to
1938
+ \`~/.ish/config.json\` (\`chat_endpoint\` key). After that, every
1939
+ \`chat endpoint *\` verb that takes \`[endpoint-id]\` defaults to the
1940
+ active endpoint when the positional is omitted:
1941
+
1942
+ \`\`\`
1943
+ ish chat endpoint use ep-abc
1944
+ ish chat endpoint test -m "Hello" # uses ep-abc
1945
+ ish chat endpoint get --verbose # uses ep-abc
1946
+ \`\`\`
1947
+
1948
+ Mirrors \`workspace use\` / \`study use\` / \`ask use\`.
1949
+
1950
+ ## Common errors
1951
+
1952
+ - \`error_kind: "TunnelInactive"\` (exit 5) — tunnel-backed endpoint
1953
+ but no active tunnel. Run \`ish connect <port>\` first.
1954
+ - \`error_code: "validation_error"\` (exit 2) — usage error
1955
+ (mutually exclusive flags both set, missing required input,
1956
+ modality mismatch). The error envelope's \`valid_options\` field
1957
+ surfaces the accepted shape.
1958
+ - \`error_kind: "BotInvalidResponseError"\` (exit 1) — the bot
1959
+ responded but the configured \`incoming.*\` paths didn't resolve.
1960
+ Edit the response shape via \`update --endpoint-config\` or rerun
1961
+ \`init\` with a fresher curl sample.
1962
+
1963
+ ## Related
1964
+
1965
+ - \`concepts/iteration\` — chat iteration shape (\`details.endpoint\`,
1966
+ \`details.chatbot_endpoint_id\`, \`details.max_turns\`).
1967
+ - \`concepts/study\` — modality + assignments + iteration nesting.
1968
+ - \`reference/json-mode\` — JSON output, error envelope, exit codes.
1969
+ - \`guides/first-study\` — the same pattern for an
1970
+ interactive-modality study.
1971
+ `;
1356
1972
  const PAGES = [
1357
1973
  {
1358
1974
  slug: "overview",
@@ -1375,7 +1991,7 @@ const PAGES = [
1375
1991
  {
1376
1992
  slug: "concepts/iteration",
1377
1993
  title: "concept: iteration",
1378
- description: "One configured run of a study (URL or media).",
1994
+ description: "One configured run of a study (URL, media, or chat). Covers segments, segment labels, and HTML content.",
1379
1995
  body: CONCEPT_ITERATION,
1380
1996
  },
1381
1997
  {
@@ -1426,6 +2042,12 @@ const PAGES = [
1426
2042
  description: "Credentials for gated URLs (basic auth, cookies, login forms).",
1427
2043
  body: CONCEPT_SITE_ACCESS,
1428
2044
  },
2045
+ {
2046
+ slug: "concepts/secret",
2047
+ title: "concept: secret",
2048
+ description: "Per-workspace KV store for {{secret:KEY}} placeholders in chatbot endpoint headers.",
2049
+ body: CONCEPT_SECRET,
2050
+ },
1429
2051
  {
1430
2052
  slug: "concepts/run-verbs",
1431
2053
  title: "concept: run verbs — study run vs ask run",
@@ -1462,6 +2084,12 @@ const PAGES = [
1462
2084
  description: "Login → workspace → audience → study → iteration → run → results.",
1463
2085
  body: GUIDE_FIRST_STUDY,
1464
2086
  },
2087
+ {
2088
+ slug: "guides/chat",
2089
+ title: "guide: chat-modality studies",
2090
+ description: "Configure a chatbot endpoint, smoke test it, run a chat-modality study.",
2091
+ body: GUIDE_CHAT,
2092
+ },
1465
2093
  ];
1466
2094
  const PAGES_BY_SLUG = new Map(PAGES.map((p) => [p.slug, p]));
1467
2095
  export function listPages() {