@ishlabs/cli 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/README.md +54 -5
  2. package/dist/commands/ask.d.ts +12 -0
  3. package/dist/commands/ask.js +127 -2
  4. package/dist/commands/chat.d.ts +17 -0
  5. package/dist/commands/chat.js +655 -0
  6. package/dist/commands/iteration.js +134 -14
  7. package/dist/commands/secret.d.ts +20 -0
  8. package/dist/commands/secret.js +246 -0
  9. package/dist/commands/study-run.d.ts +38 -0
  10. package/dist/commands/study-run.js +199 -80
  11. package/dist/commands/study-tester.js +17 -2
  12. package/dist/commands/study.js +309 -37
  13. package/dist/commands/workspace.js +81 -0
  14. package/dist/config.d.ts +3 -0
  15. package/dist/connect.d.ts +3 -0
  16. package/dist/connect.js +346 -22
  17. package/dist/index.js +64 -6
  18. package/dist/lib/alias-hydrate.d.ts +42 -0
  19. package/dist/lib/alias-hydrate.js +175 -0
  20. package/dist/lib/alias-store.d.ts +1 -0
  21. package/dist/lib/alias-store.js +28 -1
  22. package/dist/lib/auth.js +4 -2
  23. package/dist/lib/chat-endpoint-formatters.d.ts +74 -0
  24. package/dist/lib/chat-endpoint-formatters.js +154 -0
  25. package/dist/lib/chat-endpoint-templates.d.ts +35 -0
  26. package/dist/lib/chat-endpoint-templates.js +210 -0
  27. package/dist/lib/command-helpers.d.ts +18 -0
  28. package/dist/lib/command-helpers.js +105 -3
  29. package/dist/lib/docs.js +641 -17
  30. package/dist/lib/modality.d.ts +42 -0
  31. package/dist/lib/modality.js +192 -0
  32. package/dist/lib/output.d.ts +41 -0
  33. package/dist/lib/output.js +453 -19
  34. package/dist/lib/paths.d.ts +1 -0
  35. package/dist/lib/paths.js +3 -0
  36. package/dist/lib/skill-content.d.ts +18 -0
  37. package/dist/lib/skill-content.js +223 -12
  38. package/dist/lib/types.d.ts +15 -0
  39. package/package.json +2 -2
package/dist/lib/docs.js CHANGED
@@ -98,11 +98,35 @@ ish workspace list
98
98
  ish workspace create --name "My product" --base-url https://example.com
99
99
  ish workspace use w-6ec # set as active
100
100
  ish workspace get # show the active workspace
101
+ ish workspace info # usage counters + plan caps (see below)
101
102
  ish workspace site-access status
102
103
  \`\`\`
103
104
 
105
+ ## Checking usage before quota-consuming calls
106
+
107
+ \`ish workspace info\` shows usage counters so an agent can branch on
108
+ plan limits without burning a doomed \`study create\` attempt that
109
+ returns \`error_code: usage_limit_reached\`.
110
+
111
+ \`\`\`
112
+ ish workspace info --json
113
+ {
114
+ "studies_used": 2,
115
+ "studies_max": 3,
116
+ "testers_used": 0,
117
+ "testers_max": 3,
118
+ "tier": "free"
119
+ }
120
+ \`\`\`
121
+
122
+ A \`null\` value on a \`*_max\` field means "unlimited" (paid tiers).
123
+ Null-check the cap first, then branch on \`studies_used >= studies_max\` before \`study create\`,
124
+ likewise for \`testers_used\` before \`study run --sample\`.
125
+
104
126
  ## Related
105
127
 
128
+ - \`concepts/secret\` — per-workspace secrets used in chatbot endpoint
129
+ headers via \`{{secret:KEY}}\` placeholders.
106
130
  - \`reference/billing-limits\` — \`maxProducts\` cap on workspace creation.
107
131
  `;
108
132
  const CONCEPT_STUDY = `# concept: study
@@ -130,25 +154,65 @@ its iterations. Think: a study is the recipe; an iteration is one batch.
130
154
  3. \`ish study run --sample 5 --country SE\` — dispatches simulations.
131
155
  4. \`ish study results\` or \`ish study wait\` to gather outputs.
132
156
 
133
- ### One-shot variant
157
+ ### One-shot variant (inline iteration A)
158
+
159
+ \`study create\` accepts a per-modality content flag and creates
160
+ iteration A inline in the same call. Useful when you have a single
161
+ test artifact and don't need to A/B iterations:
134
162
 
135
- \`study create\` now accepts \`--content-text\` (text modality) or
136
- \`--url\` (interactive modality) inline; iteration A is created in the
137
- same call. Useful when you have a single test artifact and don't need
138
- to A/B iterations:
163
+ | Modality | Inline content flag |
164
+ |-----------------|------------------------------------------------------|
165
+ | \`interactive\` | \`--url <url>\` (\`--screen-format desktop\` is the default; pass \`mobile_portrait\` for mobile) |
166
+ | \`text\` | \`--content-text <text-or-@file>\` |
167
+ | \`image\` | \`--image-urls <url1,url2,...>\` |
168
+ | \`video\` | \`--content-url <url>\` |
169
+ | \`audio\` | \`--content-url <url>\` |
170
+ | \`document\` | \`--content-url <url>\` |
171
+ | \`chat\` | \`--endpoint <id>\` or \`--endpoint-config <file>\` |
139
172
 
140
173
  \`\`\`
174
+ # Text — single email artifact:
141
175
  ish study create --modality text --content-type email \\
142
176
  --name "Daily Brief concept" \\
143
177
  --assignment "Read:Read the email and react" \\
144
178
  --question "What stood out?" \\
145
179
  --content-text @./brief.md
146
- # → study + iteration A in one call, ready for \`study run\`.
147
- \`\`\`
148
180
 
149
- Without those flags no iteration is created — agents can no longer
150
- trip the old "empty A" footgun where \`study run\` silently targeted a
151
- placeholder.
181
+ # Interactive URL + screen format inline:
182
+ ish study create --modality interactive \\
183
+ --name "HN scan" --url https://news.ycombinator.com \\
184
+ --screen-format desktop \\
185
+ --assignment "Skim:Skim the top stories"
186
+
187
+ # Image A/B — two hero shots:
188
+ ish study create --modality image \\
189
+ --name "Hero shots" \\
190
+ --image-urls "https://cdn.example.com/a.png,https://cdn.example.com/b.png" \\
191
+ --assignment "Compare:Which feels more premium?"
192
+
193
+ # Video — one ad clip:
194
+ ish study create --modality video \\
195
+ --name "Product ad smoke" \\
196
+ --content-url https://cdn.example.com/ad.mp4 \\
197
+ --assignment "Watch:Watch and react"
198
+
199
+ # Document — a PDF whitepaper:
200
+ ish study create --modality document \\
201
+ --name "Whitepaper read-through" \\
202
+ --content-url https://cdn.example.com/report.pdf \\
203
+ --assignment "Skim:Summarise the report"
204
+ \`\`\`
205
+
206
+ Without an inline content flag no iteration is created — agents can no
207
+ longer trip the old "empty A" footgun where \`study run\` silently
208
+ targeted a placeholder. Add \`iteration create\` later if you want B/C
209
+ variants.
210
+
211
+ **Local files**: \`--content-url\` and \`--image-urls\` on \`study create\`
212
+ only accept http(s) URLs (the upload endpoint needs a study to upload
213
+ against). For local files, use the 2-step flow: \`study create\` (no
214
+ media flags) then \`iteration create --content-url ./file.mp4\` —
215
+ \`iteration create\` auto-uploads.
152
216
 
153
217
  ## Status fields (read \`runtime_status\`, not \`status\`)
154
218
 
@@ -526,7 +590,23 @@ choice. \`pick_confidence\` is only present on rounds run with
526
590
  "picks": { "A": 3, "B": 0 },
527
591
  "ratings": { "A": { "mean": 4.667, "n": 3 },
528
592
  "B": { "mean": 2.000, "n": 3 } },
529
- "winner": { "letter": "A", "count": 3, "tied": false }
593
+ "winner": { "label": "A", "count": 3, "tied": false, "n": 3, "confidence": "medium" }
594
+ }
595
+ \`\`\`
596
+
597
+ \`winner.label\` is the picked variant's display label (matches
598
+ \`mcp__ish__get_ask_results\`, so the same jq path works on either side).
599
+ \`winner.n\` is the number of completed responses the verdict was derived
600
+ from (NOT the pick count itself); \`winner.confidence\` is a coarse
601
+ summary: \`low\` for n<3 OR tied OR any errored response, \`medium\` for
602
+ 3 ≤ n < 10 with no errors, \`high\` for n ≥ 10 with no errors. When more
603
+ than half of dispatched responses errored, the winner block is REPLACED
604
+ by a refusal envelope and you should run \`ish ask retry\` first:
605
+
606
+ \`\`\`json
607
+ {
608
+ "picks": { "A": 1, "B": 0 },
609
+ "winner": { "refused": true, "reason": "error_rate_too_high", "errored": 4, "total": 5 }
530
610
  }
531
611
  \`\`\`
532
612
 
@@ -538,13 +618,31 @@ When the ask has 2+ rounds, \`ask results\` also includes a top-level
538
618
  \`\`\`json
539
619
  "cross_round_summary": {
540
620
  "rounds": [
541
- { "round_number": 1, "picks": {"A": 1, "B": 2}, "winner": {"letter": "B", "count": 2, "tied": false } },
542
- { "round_number": 2, "picks": {"A": 3, "B": 0}, "winner": {"letter": "A", "count": 3, "tied": false } }
621
+ { "round_number": 1, "picks": {"A": 1, "B": 2}, "winner": {"label": "B", "count": 2, "tied": false, "n": 3, "confidence": "medium" } },
622
+ { "round_number": 2, "picks": {"A": 3, "B": 0}, "winner": {"label": "A", "count": 3, "tied": false, "n": 3, "confidence": "medium" } }
543
623
  ],
544
624
  "picks_delta": { "A": 2, "B": -2 }
545
625
  }
546
626
  \`\`\`
547
627
 
628
+ ## Retrying errored responses
629
+
630
+ \`ish ask retry <ask> --round N\` re-dispatches only the ERRORED
631
+ responses on a round. COMPLETED responses are left untouched (their
632
+ answers are the source of truth). Use this after a partial failure
633
+ (e.g. 4 of 5 testers errored on round 1) — fix the underlying cause,
634
+ then \`ask retry\` to backfill the missing rows. Idempotent: a round with no errored responses
635
+ is a no-op. Add \`--wait\` to block until the retried round settles.
636
+
637
+ \`\`\`bash
638
+ $ ish ask retry a-d3e --round 1 --wait
639
+ \`\`\`
640
+
641
+ Errored responses carry \`error_message\` + \`error_kind\` (e.g.
642
+ \`first_impression_llm_failed\`, \`interview_llm_failed\`,
643
+ \`variant_preparation_failed\`) so an agent can branch on retry vs
644
+ abort without parsing prose.
645
+
548
646
  ## Adding follow-up questions to a round
549
647
 
550
648
  \`ish ask add-questions --round N --questions ./qs.json\` is **additive
@@ -828,6 +926,72 @@ printf %s "$STAGING_PW" | ish workspace site-access basic-auth \\
828
926
  --username alice --password -
829
927
  \`\`\`
830
928
  `;
929
+ const CONCEPT_SECRET = `# concept: secret
930
+
931
+ Per-workspace key/value secrets. Used at chatbot-dispatch time to
932
+ resolve \`{{secret:KEY}}\` placeholders in outgoing headers (or
933
+ anywhere else in the rendered request). Common shape:
934
+
935
+ \`\`\`
936
+ Authorization: Bearer {{secret:GROQ_KEY}}
937
+ X-API-Key: {{secret:CUSTOMER_BOT_KEY}}
938
+ \`\`\`
939
+
940
+ Distinct from site-access (\`concepts/site-access\`): site-access is
941
+ for interactive studies that gate a browser session against a UI;
942
+ secrets here are for chatbot endpoints, where ish dispatches the
943
+ HTTP request itself and the value lands in the wire request.
944
+
945
+ ## Verbs
946
+
947
+ \`\`\`
948
+ ish secret list # list KEYS only. Values never returned.
949
+ ish secret set GROQ_KEY <value> # positional value (warning: shell history)
950
+ ish secret set GROQ_KEY --value-file ./groq.txt
951
+ printf %s "$VAL" | ish secret set GROQ_KEY --value-stdin
952
+ ish secret delete GROQ_KEY
953
+ \`\`\`
954
+
955
+ ## Keep values out of shell history
956
+
957
+ Three input modes. Pick the safest for the source:
958
+
959
+ - **\`--value-stdin\`**: read from stdin. Best for piping from
960
+ another process (\`gcloud secrets ...\`, \`op read\`, etc.).
961
+ - **\`--value-file <path>\`**: read from a file. Use \`-\` to read
962
+ from stdin (alias for \`--value-stdin\`).
963
+ - **Positional value**: convenient but lands in shell history.
964
+ Avoid in scripts.
965
+
966
+ Exactly one source per call; passing two is a usage error
967
+ (\`error_code: validation_error\`, exit 2).
968
+
969
+ ## How resolution works
970
+
971
+ At chatbot dispatch, the renderer looks up each \`{{secret:KEY}}\`
972
+ in the workspace's secret store. Missing keys render as the empty
973
+ string (no error). This matches the legacy ContextValueResolver
974
+ behavior and lets templates degrade silently instead of breaking
975
+ the request. The bot will most likely 401, which is a clear signal.
976
+
977
+ Reserved KEYs (\`BASIC_AUTH_*\`, \`SESSION_COOKIE_*\`,
978
+ \`LOGIN_*\`) are rejected client-side with a hint to use
979
+ \`ish workspace site-access\` instead. Those keys are owned by
980
+ the site-access flow and writing them as plain secrets would
981
+ silently break that path.
982
+
983
+ ## When to use a secret vs. inline a header
984
+
985
+ If the value is the same across every customer / environment and
986
+ not sensitive (a vendor name, an API version), inline it in the
987
+ endpoint config's \`headers\` field. If it's per-workspace, rotates,
988
+ or shouldn't be committed to a config JSON file, use a secret.
989
+
990
+ ## Related
991
+
992
+ - \`guides/chat\`: chat endpoint setup, including auth header examples.
993
+ - \`concepts/site-access\`: credentials for browser-rendered study URLs.
994
+ `;
831
995
  const CONCEPT_RUN_VERBS = `# concept: run verbs — \`study run\` vs \`ask run\`
832
996
 
833
997
  Both verbs dispatch simulations against an audience, but the lifecycle
@@ -1069,7 +1233,80 @@ The CLI guarantees these contracts so agents can chain safely:
1069
1233
  \`jq '.rounds[0].responses | length'\`.
1070
1234
  - **\`study run --json\` exposes tester handles.** The top-level
1071
1235
  \`tester_ids[]\` and \`tester_aliases[]\` arrays are the canonical
1072
- inputs to \`ish study poll/wait/cancel\`.
1236
+ inputs to \`ish study poll/wait/cancel\`. The \`simulations[]\` array
1237
+ is collapsed to one batch entry per study (M13) with nested
1238
+ \`tester_ids[]\`, \`tester_aliases[]\`, \`job_ids[]\`, and \`count\` —
1239
+ an N-sample dispatch is a single row, not N near-duplicate rows.
1240
+ - **\`study results --json\` includes per-answer sentiment** (M10).
1241
+ Every \`interview_answers[].answers[]\` row carries \`sentiment\`
1242
+ (the tester's session-level label from \`tester_summary.sentiment\`),
1243
+ and every \`testers[]\` row carries \`sentiment\` + \`comment\`. No
1244
+ \`study tester <id>\` round-trip required.
1245
+ - **\`study results --summary\`** is a lean projection: counts +
1246
+ sentiment histogram + per-tester {alias, status, sentiment, comment,
1247
+ error_message}. Drops \`interview_answers\` and per-interaction
1248
+ breakdowns. Cheapest "did this run land?" shape.
1249
+ - **\`study results --transcript <tester_id>\`** is the chat-modality
1250
+ projection. Returns \`{tester_id, tester_alias, transcript: [...],
1251
+ unique_bot_replies, tester_summary}\`. Each transcript entry is
1252
+ \`{role, text, turn_index, ...}\` — bot turns add \`failure\`
1253
+ (set when the dispatch crashed); tester turns add \`action_type\`,
1254
+ \`option_label\`, and \`sentiment\`. \`text\` is null on tester
1255
+ turns whose action carries no text (\`select_option\`,
1256
+ \`ignore_offered\`); read intent from \`action_type\` +
1257
+ \`option_label\`. Same shape as the MCP \`get_chat_transcript\`
1258
+ tool. \`unique_bot_replies = 1\` on a multi-turn run is the M2 loop
1259
+ signature.
1260
+ - **\`study tester --summary\`** drops the action timeline and
1261
+ returns just \`{tester, interaction_count, sentiment, comment,
1262
+ error_message?, error_kind?}\`.
1263
+ - **\`study poll\` honors the active study.** Pass no \`--study\`
1264
+ flag and it falls back to the active study (set by
1265
+ \`ish study use\`), parity with \`study results\` /
1266
+ \`study wait\` / \`study run\`.
1267
+ - **\`iteration get --json\` testers carry \`alias\` + \`name\`** (M12).
1268
+ Same identifying triple as \`study results --json\`'s tester rows.
1269
+ - **\`ask results --json\` keeps \`variant_pick_id\` on every response**
1270
+ (C5-Bug4). It's the load-bearing field for "who picked what" — no
1271
+ \`--verbose\` required. Same logic on \`ask get --json\`.
1272
+ - **Every verb's \`--help\` ends with a "Tips:" footer** naming
1273
+ \`--get\` and \`--fields\`. If you're reaching for \`jq -r .x\` you
1274
+ almost certainly wanted \`--get x\`.
1275
+ - **\`study run --wait\` returns \`error_code: "wait_timeout"\`**
1276
+ (exit 5, retryable) when the wait timer expires — distinct from
1277
+ the api-client's generic timeout / network / server families. The
1278
+ envelope carries \`progress: {study_id, iteration_id?,
1279
+ timeout_seconds, done, total, pending, rows[]}\` so the agent
1280
+ can resume by polling rather than re-dispatching. Same shape on
1281
+ \`study wait\` (single-tester rows[] has length 1).
1282
+ - **\`study run\` accepts \`--dispatch-timeout <s>\`** (default 120)
1283
+ for the per-POST testers/batch + simulation/start budget. On
1284
+ timeout (or any dispatch failure), the error envelope includes
1285
+ \`seeded_but_not_dispatched_ids[]\` + \`seeded_but_not_dispatched_aliases[]\`
1286
+ listing the testers that exist server-side but didn't get
1287
+ dispatched. Resume by polling those instead of re-running
1288
+ \`study run\` (which would create another batch on top).
1289
+ - **\`ask run --new\` is non-idempotent and marked \`retryable: false\`**
1290
+ on any failure — agents auto-retrying would create a duplicate
1291
+ ask. The error envelope's \`suggestions\` includes a pointer to
1292
+ \`ish ask list --workspace <id>\` so the agent can confirm
1293
+ whether the resource already exists before retrying manually.
1294
+ - **\`ish connect --detach\` blocks until tunnel registration is
1295
+ confirmed** (\`registered: true\` in the lock file). The
1296
+ registration POST retries up to 4 times with exponential backoff
1297
+ (~7s worst case) before giving up; the heartbeat re-registers
1298
+ on a transient 404 instead of burning through the 3-strike
1299
+ countdown. If the heartbeat path persistently 404s even after
1300
+ several successful re-register cycles (D1: backend keeps
1301
+ forgetting the connection between heartbeats), the CLI emits
1302
+ a single stderr Notice and keeps the tunnel up rather than
1303
+ dying — the route is the problem, not the tunnel. Subsequent
1304
+ simulations may still hit \`TunnelInactive\` on dispatch in
1305
+ that case; investigate the backend's /connect route.
1306
+ - **The "Could not verify token (network error)…" stderr warning
1307
+ is gone** on green runs. The probe is best-effort; if there's a
1308
+ real auth failure, the subsequent API call surfaces it with a
1309
+ proper exit code 3.
1073
1310
  - **Study responses carry a derived \`runtime_status\` field**
1074
1311
  (\`draft | running | completed | completed_with_errors | cancelled\`).
1075
1312
  Prefer this over the raw \`status\` field — \`runtime_status\` is
@@ -1124,7 +1361,7 @@ The CLI guarantees these contracts so agents can chain safely:
1124
1361
  "picks": { "A": 3, "B": 0 },
1125
1362
  "ratings": { "A": { "mean": 4.667, "n": 3 },
1126
1363
  "B": { "mean": 2.000, "n": 3 } },
1127
- "winner": { "letter": "A", "count": 3, "tied": false }
1364
+ "winner": { "label": "A", "count": 3, "tied": false, "n": 3, "confidence": "medium" }
1128
1365
  }
1129
1366
  }
1130
1367
  \`\`\`
@@ -1132,8 +1369,23 @@ The CLI guarantees these contracts so agents can chain safely:
1132
1369
  \`picks\` is present iff \`wants_pick\`; \`ratings\` is present iff
1133
1370
  \`wants_ratings\` and ≥ 1 rating was submitted; \`winner\` is the
1134
1371
  highest pick count (\`tied: true\` if multiple variants share the
1135
- top). \`mean\` is rounded to 3 decimal places; \`n\` is the rating
1136
- count for that variant.
1372
+ top). \`winner.n\` is the completed-response sample;
1373
+ \`winner.confidence\` is \`low\` for n<3 / tied / any errors,
1374
+ \`medium\` for clean 3–9, \`high\` for clean 10+. When >50% of
1375
+ dispatched responses errored the winner block is replaced by
1376
+ \`{ refused: true, reason: "error_rate_too_high", errored, total }\` —
1377
+ run \`ish ask retry <ask> --round N\` first. \`mean\` is rounded to 3
1378
+ decimal places; \`n\` (on ratings) is the rating count for that variant.
1379
+ - **Errored ask responses carry \`error_message\` + \`error_kind\`.**
1380
+ Each \`responses[]\` entry whose \`status: errored\` exposes the
1381
+ classified failure (e.g. \`first_impression_llm_failed\`,
1382
+ \`interview_llm_failed\`, \`variant_preparation_failed\`) so an agent
1383
+ can branch on retry vs abort without parsing prose. Both fields are
1384
+ \`null\` on \`pending\` and \`completed\` rows.
1385
+ - **\`ish ask retry <ask> --round N\` re-dispatches errored responses.**
1386
+ COMPLETED rows are left untouched; only ERRORED responses are reset
1387
+ to PENDING and re-run from scratch. Idempotent: zero-errored is a
1388
+ no-op. Add \`--wait\` to block until the retry settles.
1137
1389
  - **\`ask results --json\` deduplicates tester profile snapshots.** When
1138
1390
  \`tester_profile\` and \`tester_profile_snapshot\` share all
1139
1391
  overlapping fields (the common case — they only diverge if the
@@ -1456,6 +1708,366 @@ upgrade or delete an existing resource to free up headroom.
1456
1708
  - \`concepts/profile\` — \`maxCustomTesterProfiles\` gates profile creation.
1457
1709
  - \`reference/json-mode\` — full error envelope shape and exit codes.
1458
1710
  `;
1711
+ const GUIDE_CHAT = `# guide: chat-modality studies
1712
+
1713
+ Goal: from a customer chatbot endpoint to a finished chat-modality
1714
+ study with parsed transcripts, end to end via the CLI. The flow has
1715
+ three phases: configure the endpoint, smoke test it, run a study.
1716
+
1717
+ ## 1. Configure the endpoint
1718
+
1719
+ Two starting points:
1720
+
1721
+ ### From a curl example (recommended for first-time setup)
1722
+
1723
+ The agent has a curl request that talks to the customer's bot. Save
1724
+ it to a file and run \`init\`:
1725
+
1726
+ \`\`\`
1727
+ ish chat endpoint init \\
1728
+ --from-curl ./bot.curl \\
1729
+ --name my-bot
1730
+ \`\`\`
1731
+
1732
+ \`init\` posts the curl to \`/chat/auto-detect-shape\`, infers the
1733
+ config (URL, method, headers, body template, response paths,
1734
+ mode, async-poll if applicable), and saves it as a chatbot endpoint
1735
+ resource. Output JSON shape:
1736
+
1737
+ \`\`\`json
1738
+ {
1739
+ "success": true,
1740
+ "saved": true,
1741
+ "endpoint_id": "ep_abc",
1742
+ "alias": "ep-abc",
1743
+ "config": { /* full ChatbotEndpointConfig */ },
1744
+ "tunnel_backed": true,
1745
+ "tunnel_backed_detected": true,
1746
+ "confidence": "high",
1747
+ "explanation": "...",
1748
+ "warnings": []
1749
+ }
1750
+ \`\`\`
1751
+
1752
+ For local bots (URL host is \`localhost\` / \`127.0.0.1\` /
1753
+ \`0.0.0.0\`), \`tunnel_backed\` is auto-set to \`true\`. Override
1754
+ explicitly with \`--tunnel-backed\` / \`--no-tunnel-backed\`.
1755
+ Pass \`--no-save\` to inspect the inferred config without persisting.
1756
+
1757
+ ### From a hand-written config
1758
+
1759
+ \`\`\`
1760
+ ish chat endpoint create --endpoint-config ./bot-config.json --name "my-bot"
1761
+ \`\`\`
1762
+
1763
+ The file is the bare \`ChatbotEndpointConfig\` shape (or a full
1764
+ endpoint envelope with \`id\` / \`name\` / \`config\` keys —
1765
+ \`.config\` is extracted automatically). Pipe from stdin via \`-\`.
1766
+
1767
+ ### Editing a saved endpoint
1768
+
1769
+ The dialog and the CLI both PUT the full config to
1770
+ \`/chatbot-endpoints/{id}\` on save (no patch semantics). The CLI
1771
+ exposes that round-trip cleanly:
1772
+
1773
+ \`\`\`
1774
+ # Single-field edits via shorthand flags
1775
+ ish chat endpoint update ep-abc --name "Production support bot"
1776
+ ish chat endpoint update ep-abc --url https://api.example.com/v2/chat
1777
+ ish chat endpoint update ep-abc --mode stateless
1778
+ ish chat endpoint update ep-abc --tunnel-backed # or --no-tunnel-backed
1779
+
1780
+ # Richer edits via fetch | jq | replace
1781
+ ish chat endpoint get ep-abc --verbose \\
1782
+ | jq '.config.outgoing.headers["X-API-Key"] = "{{secret:KEY}}"' \\
1783
+ | ish chat endpoint update ep-abc --endpoint-config -
1784
+
1785
+ ish chat endpoint get ep-abc --verbose \\
1786
+ | jq '.config.incoming.slots += [{"containerPath": "response.options", "kind": "alternatives"}]' \\
1787
+ | ish chat endpoint update ep-abc --endpoint-config -
1788
+ \`\`\`
1789
+
1790
+ \`get --verbose\` (or piped) emits the round-trippable envelope
1791
+ \`{id, name, isTunnelBacked, config}\` — exactly what
1792
+ \`update --endpoint-config -\` accepts. Field-shorthand flags win on
1793
+ conflict with \`--endpoint-config\`.
1794
+
1795
+ ### Body template placeholders
1796
+
1797
+ The renderer expands these tokens at request time:
1798
+
1799
+ - \`{{action.text}}\`: the persona's outgoing user message this turn.
1800
+ - \`{{history}}\`: past turns as \`[{role, content}, ...]\`. Past
1801
+ turns only; current turn is in \`{{action.text}}\`.
1802
+ - \`{{history_with_current}}\`: \`{{history}}\` plus a synthetic
1803
+ \`{role: "user", content: action.text}\` at the tail. **Use this for
1804
+ OpenAI-shape bots that take a single \`messages: [...]\` array
1805
+ containing prior turns and the current user message.**
1806
+ - \`{{turn.role}}\` / \`{{turn.text}}\`: per-turn expansion. Place
1807
+ one element with these tokens inside an array literal; the
1808
+ renderer expands it to one entry per past turn.
1809
+ - \`{{tester.name}}\` / \`{{tester.locale}}\`: persona attributes.
1810
+ - \`{{conversation_id}}\`: bot-supplied session id (stateful mode).
1811
+ - \`{{secret:KEY}}\`: workspace secret (see below).
1812
+
1813
+ \`{{history_with_current}}\` produces the typical OpenAI/Anthropic/Pollinations shape:
1814
+
1815
+ \`\`\`json
1816
+ {
1817
+ "model": "gpt-4o-mini",
1818
+ "messages": "{{history_with_current}}"
1819
+ }
1820
+ \`\`\`
1821
+
1822
+ ### Slot bindings (interactive containers)
1823
+
1824
+ The bot's response shape is described by two typed lists on
1825
+ \`incoming\`:
1826
+
1827
+ - \`incoming.slots[]\` — INTERACTIVE containers the persona must
1828
+ respond to. Each entry is
1829
+ \`{containerPath, kind?, labelPath?, idPath?}\`.
1830
+ - \`incoming.references[]\` — PASSIVE containers (citations,
1831
+ followups, file artifacts, related links). Each entry is
1832
+ \`{containerPath, labelPath?, urlPath?}\`.
1833
+
1834
+ \`kind\` is one of \`alternatives\` (pick from a list), \`form\`
1835
+ (fill named fields), or \`text\` (free text). Leaving \`kind\` unset
1836
+ (the default) means "auto-classify per-turn from the live shape" —
1837
+ ish inspects the container value at parse time and dispatches on the
1838
+ shape it sees. Use that whenever the bot returns different slot
1839
+ kinds across turns.
1840
+
1841
+ \`labelPath\` / \`idPath\` (alternatives) and \`labelPath\` /
1842
+ \`urlPath\` (references) are optional sub-paths within each item.
1843
+ When omitted, ish falls back to \`label\` / \`text\` / \`title\` for
1844
+ labels, \`id\` / \`value\` / \`payload\` for ids, and
1845
+ \`url\` / \`uri\` / \`href\` for urls.
1846
+
1847
+ The legacy per-affordance fields (\`optionsPath\`,
1848
+ \`formRequestPath\`, \`cardsPath\`, \`artifactsPath\`,
1849
+ \`suggestedFollowupsPath\`) are gone. Anything interactive is a
1850
+ slot tagged with \`kind\`; anything passive is a reference. New
1851
+ affordance shapes ship as a new \`kind\` value, no schema migration.
1852
+
1853
+ \`auto-detect-shape\` (the engine behind \`init\`) populates these
1854
+ lists from the response stub via a shape-based classifier:
1855
+ \`list[{label, id?}]\` becomes \`alternatives\`,
1856
+ \`{fields: [...]}\` becomes \`form\`,
1857
+ \`list[{label, url}]\` becomes a reference, and
1858
+ \`list[str]\` becomes a string-list reference. The classifier never
1859
+ emits \`kind=text\` — that's a hand-set tag for free-text follow-ups.
1860
+
1861
+ ### Streaming endpoints
1862
+
1863
+ When a bot speaks Server-Sent Events (Accept: text/event-stream,
1864
+ \`-N\` / \`--no-buffer\` curl flags, or \`"stream": true\` in the
1865
+ body), \`init\` flips \`transport\` to \`"streaming"\` and seeds a
1866
+ \`streaming\` block:
1867
+
1868
+ \`\`\`json
1869
+ {
1870
+ "transport": "streaming",
1871
+ "streaming": {
1872
+ "eventFormat": "openai",
1873
+ "deltaPath": null,
1874
+ "terminalEvent": null,
1875
+ "maxWaitSeconds": 120
1876
+ }
1877
+ }
1878
+ \`\`\`
1879
+
1880
+ \`eventFormat\` is one of:
1881
+
1882
+ - \`"openai"\` — chunks shaped like
1883
+ \`{choices: [{delta: {content: "..."}}]}\`; ends on \`[DONE]\` or
1884
+ \`finish_reason != null\`. Matches OpenAI / Groq / vLLM / LiteLLM /
1885
+ OpenRouter / Chainlit.
1886
+ - \`"anthropic"\` — \`event: content_block_delta\` chunks carrying
1887
+ \`{delta: {type: "text_delta", text: "..."}}\`; ends on
1888
+ \`event: message_stop\`.
1889
+ - \`"raw"\` — body-text concatenation; no JSON decoding; closes on
1890
+ connection drop.
1891
+
1892
+ Override the format-specific defaults via \`deltaPath\` (e.g. an
1893
+ OpenAI-compatible proxy that nests delta under
1894
+ \`message.delta.content\`) and \`terminalEvent\`. \`maxWaitSeconds\`
1895
+ caps the streaming-loop deadline.
1896
+
1897
+ ### From a template
1898
+
1899
+ For well-known providers, skip auto-detect and start from a
1900
+ hand-curated config:
1901
+
1902
+ \`\`\`
1903
+ ish chat endpoint init --template openai --name "OpenAI"
1904
+ ish chat endpoint init --template anthropic --no-save | jq '.config'
1905
+ ish chat endpoint init --template voiceflow --name "Sales bot"
1906
+ ish chat endpoint init --template dialogflow-cx --name "Support"
1907
+ ish chat endpoint init --template botframework --name "Concierge"
1908
+ \`\`\`
1909
+
1910
+ Templates use \`{{secret:NAME}}\` placeholders for auth tokens; set
1911
+ the matching workspace secrets before testing:
1912
+
1913
+ \`\`\`
1914
+ printf %s "$OPENAI_API_KEY" | ish secret set OPENAI_API_KEY --value-stdin
1915
+ \`\`\`
1916
+
1917
+ Available templates: openai, anthropic, voiceflow, dialogflow-cx,
1918
+ botframework. Each is derived from the provider's public docs and
1919
+ is intentionally minimal — agents typically tighten the message
1920
+ path / model / slot bindings after one round-trip with the real bot.
1921
+
1922
+ ### Auth via workspace secrets
1923
+
1924
+ For bots behind an API key, store the value as a workspace secret
1925
+ once and reference it from the endpoint's headers:
1926
+
1927
+ \`\`\`
1928
+ printf %s "$GROQ_KEY" | ish secret set GROQ_KEY --value-stdin
1929
+ ish chat endpoint get ep-abc --verbose \\
1930
+ | jq '.config.outgoing.headers["Authorization"] = "Bearer {{secret:GROQ_KEY}}"' \\
1931
+ | ish chat endpoint update ep-abc --endpoint-config -
1932
+ \`\`\`
1933
+
1934
+ The renderer resolves \`{{secret:GROQ_KEY}}\` from the workspace
1935
+ secret store at dispatch time. Missing keys render empty, which
1936
+ typically surfaces as a 401 from the bot. That's an actionable signal.
1937
+
1938
+ See \`concepts/secret\` for the full set of input modes
1939
+ (\`--value-file\`, \`--value-stdin\`, positional) and the reserved-key
1940
+ list.
1941
+
1942
+ ## 2. Smoke test the connection
1943
+
1944
+ Before launching a study, verify the bot answers cleanly:
1945
+
1946
+ \`\`\`
1947
+ ish chat endpoint test ep-abc -m "Hello"
1948
+ \`\`\`
1949
+
1950
+ Output:
1951
+ \`\`\`json
1952
+ {
1953
+ "success": true,
1954
+ "text": "Hi! How can I help?",
1955
+ "conversation_id": "...",
1956
+ "slots": [...],
1957
+ "references": [...],
1958
+ "bot_latency_ms": 240,
1959
+ "end_of_conversation": false
1960
+ }
1961
+ \`\`\`
1962
+
1963
+ For tunnel-backed endpoints (\`isTunnelBacked: true\`), the CLI
1964
+ runs a tunnel pre-flight against \`/connect/active\` first and
1965
+ exits \`5\` with \`error_kind: "TunnelInactive"\` when no tunnel is
1966
+ running. Run \`ish connect <port>\` in another shell first, then
1967
+ retry.
1968
+
1969
+ For stateful endpoints, thread the conversation across script
1970
+ invocations:
1971
+
1972
+ \`\`\`
1973
+ CID=$(ish chat endpoint test ep-abc -m "Hi" | jq -r .conversation_id)
1974
+ ish chat endpoint test ep-abc -m "Tell me more" --conversation-id "$CID"
1975
+ \`\`\`
1976
+
1977
+ For multi-turn validation, use \`ish study run --sample 1\` against
1978
+ a draft study (see section 3 below).
1979
+
1980
+ ## 3. Run a chat-modality study
1981
+
1982
+ Use the existing study flow with the new chat flags. \`study create\`
1983
+ fetches the saved endpoint and embeds its config inline at
1984
+ \`iteration.details.endpoint\` plus the lineage id at
1985
+ \`iteration.details.chatbot_endpoint_id\`:
1986
+
1987
+ \`\`\`
1988
+ ish study create \\
1989
+ --modality chat \\
1990
+ --endpoint ep-abc \\
1991
+ --name "Sign-up Q1" \\
1992
+ --assignment "Sign up:Try to sign up"
1993
+ \`\`\`
1994
+
1995
+ Or pass an inline config when there's no saved endpoint to reference
1996
+ (mutually exclusive with \`--endpoint\`):
1997
+
1998
+ \`\`\`
1999
+ cat ./bot-config.json | ish study create \\
2000
+ --modality chat --endpoint-config - \\
2001
+ --name "Sign-up Q1" --assignment "Sign up:Try to sign up"
2002
+ \`\`\`
2003
+
2004
+ Optional \`--max-turns <n>\` (default 12) caps the chat per tester.
2005
+
2006
+ Audience size is set at run time. Use \`--sample <N>\` to pick N
2007
+ random simulatable profiles, or \`--all\` for the full pool.
2008
+ \`--profile <id>\` is also supported for explicit selection. Example:
2009
+ \`\`\`
2010
+ ish study run stu-xyz --sample 5 --wait
2011
+ \`\`\`
2012
+
2013
+ Pull raw interactions:
2014
+ \`\`\`
2015
+ ish study results stu-xyz --json | jq '.interactions'
2016
+ \`\`\`
2017
+
2018
+ Note: chat is currently excluded from the LLM-analysis route; the
2019
+ results call returns raw interactions, not an analyzed summary.
2020
+
2021
+ ## Iteration shortcuts
2022
+
2023
+ Add a chat iteration to an existing chat study post-hoc. The
2024
+ iteration type is inherited from the parent study's modality —
2025
+ no \`--type\` flag is needed:
2026
+
2027
+ \`\`\`
2028
+ ish iteration create --study stu-xyz --endpoint ep-abc --max-turns 10
2029
+ ish iteration create --study stu-xyz --endpoint-config ./bot.json
2030
+ \`\`\`
2031
+
2032
+ Same flag set as \`study create\`'s chat shortcut.
2033
+
2034
+ ## Active-endpoint convention
2035
+
2036
+ \`ish chat endpoint use <id>\` writes the endpoint to
2037
+ \`~/.ish/config.json\` (\`chat_endpoint\` key). After that, every
2038
+ \`chat endpoint *\` verb that takes \`[endpoint-id]\` defaults to the
2039
+ active endpoint when the positional is omitted:
2040
+
2041
+ \`\`\`
2042
+ ish chat endpoint use ep-abc
2043
+ ish chat endpoint test -m "Hello" # uses ep-abc
2044
+ ish chat endpoint get --verbose # uses ep-abc
2045
+ \`\`\`
2046
+
2047
+ Mirrors \`workspace use\` / \`study use\` / \`ask use\`.
2048
+
2049
+ ## Common errors
2050
+
2051
+ - \`error_kind: "TunnelInactive"\` (exit 5) — tunnel-backed endpoint
2052
+ but no active tunnel. Run \`ish connect <port>\` first.
2053
+ - \`error_code: "validation_error"\` (exit 2) — usage error
2054
+ (mutually exclusive flags both set, missing required input,
2055
+ modality mismatch). The error envelope's \`valid_options\` field
2056
+ surfaces the accepted shape.
2057
+ - \`error_kind: "BotInvalidResponseError"\` (exit 1) — the bot
2058
+ responded but the configured \`incoming.*\` paths didn't resolve.
2059
+ Edit the response shape via \`update --endpoint-config\` or rerun
2060
+ \`init\` with a fresher curl sample.
2061
+
2062
+ ## Related
2063
+
2064
+ - \`concepts/iteration\` — chat iteration shape (\`details.endpoint\`,
2065
+ \`details.chatbot_endpoint_id\`, \`details.max_turns\`).
2066
+ - \`concepts/study\` — modality + assignments + iteration nesting.
2067
+ - \`reference/json-mode\` — JSON output, error envelope, exit codes.
2068
+ - \`guides/first-study\` — the same pattern for an
2069
+ interactive-modality study.
2070
+ `;
1459
2071
  const PAGES = [
1460
2072
  {
1461
2073
  slug: "overview",
@@ -1529,6 +2141,12 @@ const PAGES = [
1529
2141
  description: "Credentials for gated URLs (basic auth, cookies, login forms).",
1530
2142
  body: CONCEPT_SITE_ACCESS,
1531
2143
  },
2144
+ {
2145
+ slug: "concepts/secret",
2146
+ title: "concept: secret",
2147
+ description: "Per-workspace KV store for {{secret:KEY}} placeholders in chatbot endpoint headers.",
2148
+ body: CONCEPT_SECRET,
2149
+ },
1532
2150
  {
1533
2151
  slug: "concepts/run-verbs",
1534
2152
  title: "concept: run verbs — study run vs ask run",
@@ -1565,6 +2183,12 @@ const PAGES = [
1565
2183
  description: "Login → workspace → audience → study → iteration → run → results.",
1566
2184
  body: GUIDE_FIRST_STUDY,
1567
2185
  },
2186
+ {
2187
+ slug: "guides/chat",
2188
+ title: "guide: chat-modality studies",
2189
+ description: "Configure a chatbot endpoint (slots-only model), smoke test it, run a chat-modality study. Covers slot bindings, streaming endpoints, and built-in templates.",
2190
+ body: GUIDE_CHAT,
2191
+ },
1568
2192
  ];
1569
2193
  const PAGES_BY_SLUG = new Map(PAGES.map((p) => [p.slug, p]));
1570
2194
  export function listPages() {