@ishlabs/cli 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -112,11 +112,28 @@ ish study list
112
112
  ish iteration list --study s-b2c
113
113
  ish ask list
114
114
 
115
- # Define / configure
116
- ish study create --name "..." --modality interactive --assignment "..." --question "..."
117
- ish iteration create --url https://example.com
115
+ # Define / configure (one-shot — iteration A inline)
116
+ ish study create --modality interactive --name "..." --url https://example.com \
117
+ --assignment "..." --question "..."
118
+ ish study create --modality image --name "..." \
119
+ --image-urls "https://cdn.example.com/a.png,https://cdn.example.com/b.png" \
120
+ --assignment "Compare:Which feels more premium?"
121
+ ish study create --modality video --name "..." \
122
+ --content-url https://cdn.example.com/ad.mp4 --assignment "Watch:..."
123
+
124
+ # Or 2-step (when you want to A/B iterations later, or upload local files)
125
+ ish study create --name "..." --modality interactive --assignment "..."
126
+ ish iteration create --url https://example.com # auto-uploads local files
127
+
118
128
  ish profile generate --description "..." --count 5
119
129
 
130
+ # Chat modality (talk to a customer chatbot). Audience size lives on
131
+ # study run; study create defines the persistent shape only.
132
+ ish chat endpoint init --from-curl ./bot.curl --name my-bot
133
+ ish chat endpoint test my-bot -m "Hello"
134
+ ish study create --modality chat --endpoint my-bot --assignment "Sign up:Try to sign up"
135
+ # (then) ish study run --sample 5 --wait
136
+
120
137
  # Run
121
138
  ish study run --sample 5 --country SE --wait
122
139
  ish ask run --new --name "..." --prompt "..." --variant text:"A" --variant text:"B" --sample 30 --wants-pick --wait
@@ -223,6 +240,22 @@ implies \`--quiet\` so the bare value is the only thing on stdout.
223
240
  Top-level field with per-round picks/winner snapshots and
224
241
  \`picks_delta\` (R1 → last). Don't diff two \`ask results\` calls by
225
242
  hand.
243
+ - **\`ask retry <ask> --round N\` re-dispatches errored responses.**
244
+ Use after a partial failure (e.g. 4 of 5 testers errored on round
245
+ 1). Only ERRORED rows are reset to PENDING and re-run; COMPLETED
246
+ rows are left untouched. Idempotent: zero-errored is a no-op. Add
247
+ \`--wait\` to block.
248
+ - **Errored ask responses carry \`error_message\` + \`error_kind\`.**
249
+ Each \`responses[]\` entry whose \`status: errored\` exposes the
250
+ classified failure (e.g. \`first_impression_llm_failed\`,
251
+ \`interview_llm_failed\`, \`variant_preparation_failed\`). Branch on
252
+ \`error_kind\` to decide retry vs abort.
253
+ - **\`winner\` carries \`n\` and \`confidence\`.** \`n\` is the completed
254
+ sample the verdict was elected from; \`confidence\` is \`low\` /
255
+ \`medium\` / \`high\` based on completion ratio + tied-ness. When
256
+ errored responses exceed 50%, the winner block is REPLACED by
257
+ \`{ refused: true, reason: "error_rate_too_high", errored, total }\`
258
+ — run \`ask retry\` first.
226
259
  - **\`--workspace\` works at the program root AND every subcommand.**
227
260
  \`ish --workspace w-6ec study list\` and \`ish study list --workspace
228
261
  w-6ec\` are equivalent; if both are passed, the subcommand-level
@@ -348,14 +381,17 @@ ish profile generate \\
348
381
  --description "Tech-savvy millennials in the US who use mobile banking" \\
349
382
  --count 3
350
383
 
351
- # 4. Define the study
384
+ # 4. Define the study + iteration A in one call (one-shot path).
385
+ # The same shape works for image (--image-urls), video / audio /
386
+ # document (--content-url <url>), and chat (--endpoint <id>).
352
387
  ish study create --name "Onboarding UX" --modality interactive \\
388
+ --url https://example.com --screen-format desktop \\
353
389
  --assignment "Sign up:Complete the signup flow" \\
354
390
  --question "How easy was it?"
355
391
  ish study use s-…
356
392
 
357
- # 5. Configure an iteration with the URL under test
358
- ish iteration create --url https://example.com
393
+ # 5. (Optional) add a B variant later instead of inline:
394
+ # ish iteration create --url https://example.com/v2
359
395
 
360
396
  # 6. Run, blocking until done
361
397
  ish study run --all --wait
@@ -379,7 +415,7 @@ ish ask run --new --name "hero shots" \\
379
415
  # Read the verdict directly — no comment-parsing required:
380
416
  ish ask results --json | jq '.rounds[0].aggregates'
381
417
  # → { "picks": { "A": 22, "B": 8 },
382
- # "winner": { "letter": "A", "count": 22, "tied": false } }
418
+ # "winner": { "label": "A", "count": 22, "tied": false, "n": 30, "confidence": "high" } }
383
419
  \`\`\`
384
420
 
385
421
  For \`--wants-pick\` / \`--wants-ratings\` rounds, \`ask results --json\`
@@ -480,7 +516,87 @@ URL=$(jq -r 'select(.status=="connected") | .tunnel_url' /tmp/ish-tunnel.log | h
480
516
  ish iteration create --url "$URL"
481
517
  \`\`\`
482
518
 
483
- ## 7. Display-vs-capture: a script that does both
519
+ ## 7. Chat-modality study (drive a chatbot endpoint)
520
+
521
+ Goal: configure a customer chatbot endpoint, smoke test it, and run
522
+ a chat-modality study end to end. The CLI talks to the endpoint
523
+ through whatever transport it's configured for (sync / async-poll);
524
+ local bots reach ish via \`ish connect\`.
525
+
526
+ \`\`\`bash
527
+ # 1. Author the endpoint from a curl example (or a ChatbotEndpointConfig file).
528
+ # Localhost URLs auto-flag is_tunnel_backed=true.
529
+ ID=$(ish chat endpoint init --from-curl ./bot.curl --name my-bot \\
530
+ | jq -r .endpoint_id)
531
+
532
+ # 2. Smoke test (single turn). Tunnel-backed endpoints need an active
533
+ # \`ish connect <port>\` first; otherwise this exits 5 with
534
+ # error_kind="TunnelInactive".
535
+ ish chat endpoint test "$ID" -m "Hello"
536
+ # → { "success": true, "text": "Hi! How can I help?", "conversation_id": "...",
537
+ # "slots": [...], "bot_latency_ms": 240 }
538
+
539
+ # 3. (Optional) iterate on the config — full-replace via stdin or
540
+ # one-liner shorthand. Mirrors the editor dialog's PUT contract.
541
+ ish chat endpoint update "$ID" --name "Production support bot"
542
+ ish chat endpoint get "$ID" --verbose \\
543
+ | jq '.config.incoming.slotsContainerPaths += ["response.options"]' \\
544
+ | ish chat endpoint update "$ID" --endpoint-config -
545
+
546
+ # 4. Run a chat-modality study referencing the endpoint. Audience size
547
+ # is set on study run, not study create (--sample, --all, --profile).
548
+ STUDY=$(ish study create --modality chat --endpoint "$ID" \\
549
+ --name "Sign-up Q1" --assignment "Sign up:Try to sign up" \\
550
+ | jq -r .id)
551
+ ish study run --study "$STUDY" --sample 5 --wait
552
+ ish study results "$STUDY" --json | jq '.testers'
553
+ \`\`\`
554
+
555
+ For stateful bots, thread \`conversation_id\` across single-turn
556
+ test invocations:
557
+
558
+ \`\`\`bash
559
+ CID=$(ish chat endpoint test my-bot -m "Hi" | jq -r .conversation_id)
560
+ ish chat endpoint test my-bot -m "Tell me more" --conversation-id "$CID"
561
+ \`\`\`
562
+
563
+ For OpenAI-shape bots that take a single \`messages: [...]\` array
564
+ of prior turns plus the current user message, use the
565
+ \`{{history_with_current}}\` placeholder in the body template
566
+ (\`{ "messages": "{{history_with_current}}" }\`). Auto-detect emits
567
+ this automatically when it sees an OpenAI-shape sample.
568
+
569
+ For bots behind an API key, store the key as a workspace secret
570
+ once and reference it from headers:
571
+
572
+ \`\`\`bash
573
+ printf %s "$GROQ_KEY" | ish secret set GROQ_KEY --value-stdin
574
+ ish chat endpoint update "$ID" --endpoint-config - <<'EOF'
575
+ { "config": { "outgoing": { "headers": { "Authorization": "Bearer {{secret:GROQ_KEY}}" } } } }
576
+ EOF
577
+ \`\`\`
578
+
579
+ Endpoint editing: \`get --verbose\` emits a round-trippable
580
+ \`{id, name, isTunnelBacked, config}\` envelope that pipes directly
581
+ into \`update --endpoint-config -\`. Field-shorthand flags
582
+ (\`--name\`, \`--url\`, \`--method\`, \`--mode\`,
583
+ \`--tunnel-backed\` / \`--no-tunnel-backed\`) cover one-liner edits
584
+ without round-tripping.
585
+
586
+ Failed chat workers surface their error in
587
+ \`study results --json\` under \`testers[].error_message\` and
588
+ also in \`study poll --json\`. Branch on it instead of treating
589
+ \`interaction_count: 0\` as a generic failure.
590
+
591
+ Pre-flight tip: \`ish workspace info\` exposes
592
+ \`{studies_used, studies_max, testers_used, testers_max, tier}\` so
593
+ you can branch on plan caps before \`study create\` returns
594
+ \`error_code: usage_limit_reached\`.
595
+
596
+ The full reference is at \`ish docs get-page guides/chat\`;
597
+ secrets are documented at \`ish docs get-page concepts/secret\`.
598
+
599
+ ## 8. Display-vs-capture: a script that does both
484
600
 
485
601
  Goal: drive an A/B in a script, capture aliases without \`jq\`, and
486
602
  still show the human a readable result table at the end.
@@ -496,8 +612,8 @@ ASK=$(ish ask create --new --name "tagline AB" \\
496
612
  # Wait silently — exit code is what matters here.
497
613
  ish ask wait "$ASK" --timeout 600 --quiet
498
614
 
499
- # Capture the winner letter for downstream branching:
500
- WINNER=$(ish ask results "$ASK" --get rounds.aggregates.winner.letter)
615
+ # Capture the winner label for downstream branching:
616
+ WINNER=$(ish ask results "$ASK" --get rounds.aggregates.winner.label)
501
617
  echo "Winning variant: $WINNER"
502
618
 
503
619
  # Display mode — show the user the full results table even though
@@ -514,7 +630,33 @@ If you find yourself reaching for \`jq -r .x\`, you wanted \`--get x\`.
514
630
  - Capture aliases from JSON: \`ITER=$(ish iteration create --url … --json | jq -r .alias)\`
515
631
  - After \`ish study run --json\`, the testers you just dispatched are at
516
632
  \`.tester_aliases[]\` (and \`.tester_ids[]\` for UUIDs). Pass these to
517
- \`ish study poll/wait/cancel <tester_id>\`.
633
+ \`ish study poll/wait/cancel <tester_id>\`. The \`simulations[]\` array
634
+ is collapsed to one batch entry per study with nested
635
+ \`tester_ids[]\` / \`tester_aliases[]\` / \`job_ids[]\` so an N-sample
636
+ batch is a single row, not N near-duplicate rows.
637
+ - \`ish study poll\` honors the active study set by \`ish study use\` —
638
+ pass no \`--study\` flag and it polls the active study (parity with
639
+ \`study results\` / \`study wait\` / \`study run\`).
640
+ - \`ish study results --json\` includes per-answer \`sentiment\` (the
641
+ tester's session-level sentiment label) on every \`interview_answers[]
642
+ .answers[]\` row, plus \`sentiment\` + \`comment\` on every
643
+ \`testers[]\` row. No need to fetch \`study tester <id>\` per row.
644
+ - \`ish study results --summary --json\` drops the interview_answers
645
+ payload and gives you counts + sentiment + per-tester
646
+ {alias, status, sentiment, comment}. The cheapest "did this run land?"
647
+ shape.
648
+ - \`ish study results --transcript <tester_id> --json\` is the
649
+ chat-modality projection: a flat \`transcript[]\` of {role, text,
650
+ turn_index, action_type?, option_label?, sentiment?, failure?} with a
651
+ \`unique_bot_replies\` count (1 on a multi-turn run = the M2 loop
652
+ signature). Same shape as the MCP \`get_chat_transcript\` tool.
653
+ - \`ish study tester <id> --summary --json\` drops the action timeline
654
+ and returns just {tester, sentiment, comment, error_message}.
655
+ - \`ish ask results --json\` keeps \`variant_pick_id\` on every
656
+ response without needing \`--verbose\` — it's the load-bearing field
657
+ for "who picked what". Same logic on \`ask get\`.
658
+ - \`ish iteration get --json\` testers carry \`alias\` + \`name\` (M12
659
+ parity with \`study results --json\`).
518
660
  - Use \`--fields\` to keep JSON tight: \`ish study list --fields alias,name,status\`
519
661
  - Always pass \`--wait\` (or \`ish study wait\`) before reading
520
662
  \`ish study results\` — without it you may read partial data.
@@ -528,6 +670,27 @@ If you find yourself reaching for \`jq -r .x\`, you wanted \`--get x\`.
528
670
  the JSON body to construct a recovery message. \`profile generate\` /
529
671
  \`study generate\` refuse the entire batch when the post-generation
530
672
  count would exceed the cap; re-issue with a smaller \`--count\`.
673
+ - Every verb's \`--help\` ends with a "Tips:" footer naming \`--get\`
674
+ and \`--fields\`. If you're reaching for \`jq -r .x\` you almost
675
+ certainly wanted \`--get x\`.
676
+ - \`ish study run --wait\` returns \`error_code: "wait_timeout"\`
677
+ on wait expiry (exit 5, retryable) — distinct from network /
678
+ server timeouts. The envelope carries \`progress\` so you can
679
+ resume by polling the listed testers instead of re-dispatching.
680
+ Same envelope on \`ish study wait\` and per-tester \`study wait\`.
681
+ - \`ish study run\` accepts \`--dispatch-timeout <s>\` (default 120)
682
+ for the per-POST budget. On dispatch failure the error envelope
683
+ includes \`seeded_but_not_dispatched_ids[]\` /
684
+ \`seeded_but_not_dispatched_aliases[]\` — testers exist
685
+ server-side; resume by polling them, don't re-run \`study run\`.
686
+ - \`ish ask run --new\` is non-idempotent and marked
687
+ \`retryable: false\` on any failure. If it does fail, run
688
+ \`ish ask list --workspace <id>\` first to check whether the
689
+ ask was created server-side before retrying manually.
690
+ - \`ish connect --detach\` blocks until backend registration is
691
+ confirmed. The orphan-tunnel-on-startup-404 bug is fixed.
692
+ - The \`Warning: Could not verify token (network error). Proceeding
693
+ anyway.\` stderr line is gone on green runs.
531
694
 
532
695
  ## Common reshaping → use the CLI, not jq/python
533
696
 
@@ -543,6 +706,11 @@ If you find yourself reaching for \`jq -r .x\`, you wanted \`--get x\`.
543
706
  | Count responses on a round | \`--json \\| jq '.rounds[0].responses \\| length'\` | \`ish ask get a-… --fields alias,rounds,responses_complete,responses_total\` |
544
707
  | Pick the A/B winner | \`--json \\| jq '.rounds[0].responses…'\` | \`ish ask results a-… --json\` then read \`.rounds[].aggregates.winner\` |
545
708
  | List of testers from \`study run\` | \`--json \\| jq '.testers[].id'\` | \`--get tester_aliases\` (or \`tester_ids\` for UUIDs) |
709
+ | Per-answer sentiment | \`--json \\| jq '...'\` per tester | \`ish study results <id> --json\` (sentiment is on every answer row) |
710
+ | "Did this run land?" headline | \`study results --json\` + jq filtering | \`ish study results <id> --summary --json\` |
711
+ | Chat transcript for one tester | \`study tester --json\` + jq | \`ish study results <id> --transcript <tester_id> --json\` |
712
+ | Tester headline only (no action timeline) | \`study tester --json\` + jq | \`ish study tester <id> --summary --json\` |
713
+ | Variant pick id on an ask response | \`ask results --json --verbose\` | \`ish ask results a-… --json\` (variant_pick_id is preserved) |
546
714
 
547
715
  The bias here is intentional: \`ish\` ships shapes designed for agent
548
716
  consumption. If you find yourself reaching for \`jq\` or \`python\` to
@@ -564,13 +732,15 @@ ish <command> --help
564
732
 
565
733
  | Group | Purpose | Concept page |
566
734
  |-------------|-------------------------------------------------|-----------------------------|
567
- | \`workspace\` | Top-level container (= product) | concepts/workspace |
735
+ | \`workspace\` | Top-level container (= product). \`info\` shows usage caps. | concepts/workspace |
568
736
  | \`study\` | Persistent research artifact | concepts/study |
569
737
  | \`iteration\` | One configured run of a study (URL or media) | concepts/iteration |
570
738
  | \`ask\` | Lightweight reaction artifact | concepts/ask |
571
739
  | \`profile\` | Tester profiles + audience generation | concepts/profile |
572
740
  | \`source\` | Upload sources for profile generation | concepts/source |
573
741
  | \`config\` | Simulation configs (model, timing, retries) | (run \`ish config --help\`) |
742
+ | \`chat\` | Chat endpoint CRUD + smoke test (chat modality) | guides/chat |
743
+ | \`secret\` | Per-workspace secrets (\`{{secret:KEY}}\` resolver) | concepts/secret |
574
744
  | \`docs\` | Offline docs for agents | (run \`ish docs --help\`) |
575
745
  | \`init\` | Drop this skill into a Claude Code / Codex / | (run \`ish init --help\`) |
576
746
  | | Cursor / Cline / Roo project | |
@@ -265,12 +265,27 @@ export interface InterviewAnswer {
265
265
  answer: unknown;
266
266
  rationale?: string;
267
267
  }
268
+ /**
269
+ * Pattern B — drill-in subset for a follow-up ask round.
270
+ *
271
+ * Filters the new round's audience to the testers who picked
272
+ * `picked_variant_id` on the 1-indexed prior `round`. Mirrors the
273
+ * backend's `AudienceSubset` model. Only valid on follow-up rounds —
274
+ * round 1 has no prior round to filter against. The backend rejects
275
+ * unresolvable subsets with a 422 carrying
276
+ * `error_kind: "audience_subset_invalid"`.
277
+ */
278
+ export interface AudienceSubset {
279
+ round: number;
280
+ picked_variant_id: string;
281
+ }
268
282
  export interface AskRoundInput {
269
283
  prompt: string;
270
284
  variants?: AskVariantInput[];
271
285
  wants_pick?: boolean;
272
286
  wants_ratings?: boolean;
273
287
  questions?: InterviewQuestion[];
288
+ audience_subset?: AudienceSubset;
274
289
  }
275
290
  export interface AskCreateInput {
276
291
  name: string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ishlabs/cli",
3
- "version": "0.9.0",
3
+ "version": "0.10.0",
4
4
  "description": "The command-line interface for ish",
5
5
  "type": "module",
6
6
  "bin": {