@ishlabs/cli 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +54 -5
- package/dist/commands/ask.d.ts +12 -0
- package/dist/commands/ask.js +127 -2
- package/dist/commands/chat.d.ts +17 -0
- package/dist/commands/chat.js +589 -0
- package/dist/commands/iteration.js +134 -14
- package/dist/commands/secret.d.ts +20 -0
- package/dist/commands/secret.js +246 -0
- package/dist/commands/study-run.d.ts +38 -0
- package/dist/commands/study-run.js +199 -80
- package/dist/commands/study-tester.js +17 -2
- package/dist/commands/study.js +309 -37
- package/dist/commands/workspace.js +81 -0
- package/dist/config.d.ts +3 -0
- package/dist/connect.d.ts +3 -0
- package/dist/connect.js +346 -22
- package/dist/index.js +64 -6
- package/dist/lib/alias-hydrate.d.ts +42 -0
- package/dist/lib/alias-hydrate.js +175 -0
- package/dist/lib/alias-store.d.ts +1 -0
- package/dist/lib/alias-store.js +28 -1
- package/dist/lib/auth.js +4 -2
- package/dist/lib/chat-endpoint-formatters.d.ts +39 -0
- package/dist/lib/chat-endpoint-formatters.js +104 -0
- package/dist/lib/command-helpers.d.ts +18 -0
- package/dist/lib/command-helpers.js +105 -3
- package/dist/lib/docs.js +542 -17
- package/dist/lib/modality.d.ts +42 -0
- package/dist/lib/modality.js +192 -0
- package/dist/lib/output.d.ts +41 -0
- package/dist/lib/output.js +453 -19
- package/dist/lib/paths.d.ts +1 -0
- package/dist/lib/paths.js +3 -0
- package/dist/lib/skill-content.js +182 -12
- package/dist/lib/types.d.ts +15 -0
- package/package.json +1 -1
package/dist/lib/output.js
CHANGED
|
@@ -113,6 +113,21 @@ function renderBare(value) {
|
|
|
113
113
|
const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
|
|
114
114
|
const TIMESTAMP_KEYS = new Set(["created_at", "updated_at"]);
|
|
115
115
|
const PAGINATION_KEYS = new Set(["items", "total", "returned", "limit", "offset", "has_more"]);
|
|
116
|
+
// Keys whose UUID value must survive the lean-JSON pass even though it's the
|
|
117
|
+
// same shape leanJson strips elsewhere. These are load-bearing for agent
|
|
118
|
+
// follow-up calls and were forcing agents to pass `--verbose` (C5-Bug4).
|
|
119
|
+
const UUID_KEYS_TO_KEEP = new Set([
|
|
120
|
+
// ask: which variant the tester picked — the load-bearing field for "who picked what".
|
|
121
|
+
"variant_pick_id",
|
|
122
|
+
]);
|
|
123
|
+
// Keys whose value must pass through leanJson untouched (no UUID stripping,
|
|
124
|
+
// no empty-array drop, no nested recursion). The entire shape is contract:
|
|
125
|
+
// every variant id key and every tester id in its array is load-bearing,
|
|
126
|
+
// and unpicked variants must surface as `[]` rather than disappear.
|
|
127
|
+
const LEAN_PASSTHROUGH_KEYS = new Set([
|
|
128
|
+
// Pattern H: variant_id → [tester_id, ...] for drill-in audience discovery.
|
|
129
|
+
"pick_buckets",
|
|
130
|
+
]);
|
|
116
131
|
/**
|
|
117
132
|
* Strip UUID-valued fields, null/undefined values, and timestamps.
|
|
118
133
|
* Preserves alias, name, label, status, and other meaningful fields.
|
|
@@ -143,6 +158,19 @@ function leanJson(data, keepIds = false) {
|
|
|
143
158
|
result[key] = value;
|
|
144
159
|
continue;
|
|
145
160
|
}
|
|
161
|
+
// Allowlisted UUID-valued keys (load-bearing for agent follow-up calls).
|
|
162
|
+
if (UUID_KEYS_TO_KEEP.has(key)) {
|
|
163
|
+
if (value !== null && value !== undefined)
|
|
164
|
+
result[key] = value;
|
|
165
|
+
continue;
|
|
166
|
+
}
|
|
167
|
+
// Pass-through keys (e.g. pick_buckets). The entire nested shape is
|
|
168
|
+
// contract — copy verbatim with no UUID-stripping or empty-array drop.
|
|
169
|
+
if (LEAN_PASSTHROUGH_KEYS.has(key)) {
|
|
170
|
+
if (value !== null && value !== undefined)
|
|
171
|
+
result[key] = value;
|
|
172
|
+
continue;
|
|
173
|
+
}
|
|
146
174
|
// Strip null/undefined
|
|
147
175
|
if (value === null || value === undefined)
|
|
148
176
|
continue;
|
|
@@ -405,10 +433,10 @@ function suggestionsForError(err) {
|
|
|
405
433
|
if (/no auth token|run "ish login"|session expired/i.test(err.message)) {
|
|
406
434
|
return ["Run `ish login` to authenticate"];
|
|
407
435
|
}
|
|
408
|
-
if (/no workspace set
|
|
436
|
+
if (/no (active )?workspace( set)?/i.test(err.message)) {
|
|
409
437
|
return ["Run `ish workspace list` then `ish workspace use <alias>`"];
|
|
410
438
|
}
|
|
411
|
-
if (/no study set
|
|
439
|
+
if (/no (active )?study( set)?/i.test(err.message)) {
|
|
412
440
|
return ["Run `ish study list` then `ish study use <alias>`"];
|
|
413
441
|
}
|
|
414
442
|
if (/invalid id/i.test(err.message)) {
|
|
@@ -437,10 +465,31 @@ export function outputError(err, json) {
|
|
|
437
465
|
if (Array.isArray(body.suggestions))
|
|
438
466
|
bodySuggestions = body.suggestions;
|
|
439
467
|
}
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
468
|
+
// Call sites can also tag suggestions onto the error instance (e.g.
|
|
469
|
+
// `ask run --new` overrides `retryable: false` and adds an "inspect
|
|
470
|
+
// ask list before retrying" hint). Merge those alongside body /
|
|
471
|
+
// suggestionsForError so agents see the full picture.
|
|
472
|
+
const taggedSuggestions = err.suggestions;
|
|
473
|
+
const taggedSuggestionList = Array.isArray(taggedSuggestions)
|
|
474
|
+
? taggedSuggestions.map(String)
|
|
475
|
+
: [];
|
|
476
|
+
const mergedSuggestions = Array.from(new Set([
|
|
477
|
+
...(bodySuggestions ? bodySuggestions.map(String) : []),
|
|
478
|
+
...taggedSuggestionList,
|
|
479
|
+
...suggestions,
|
|
480
|
+
]));
|
|
443
481
|
const limitDetail = err.error_code === "usage_limit_reached" ? structuredDetail(err) : undefined;
|
|
482
|
+
// B7 / Pattern G: dispatch-attempt failures tag the seeded testers
|
|
483
|
+
// onto the thrown ApiError so the agent can resume without
|
|
484
|
+
// re-seeding (which would create duplicates). Surface alongside
|
|
485
|
+
// the error envelope so machine-readable consumers see them.
|
|
486
|
+
const tagged = err;
|
|
487
|
+
const seededIds = Array.isArray(tagged.seeded_but_not_dispatched_ids)
|
|
488
|
+
? tagged.seeded_but_not_dispatched_ids
|
|
489
|
+
: undefined;
|
|
490
|
+
const seededAliases = Array.isArray(tagged.seeded_but_not_dispatched_aliases)
|
|
491
|
+
? tagged.seeded_but_not_dispatched_aliases
|
|
492
|
+
: undefined;
|
|
444
493
|
if (json) {
|
|
445
494
|
console.error(JSON.stringify({
|
|
446
495
|
error: err.message,
|
|
@@ -454,6 +503,8 @@ export function outputError(err, json) {
|
|
|
454
503
|
max: limitDetail.max,
|
|
455
504
|
upgrade_url: limitDetail.upgrade_url,
|
|
456
505
|
}),
|
|
506
|
+
...(seededIds && { seeded_but_not_dispatched_ids: seededIds }),
|
|
507
|
+
...(seededAliases && { seeded_but_not_dispatched_aliases: seededAliases }),
|
|
457
508
|
...(bodyErrors !== undefined && { errors: bodyErrors }),
|
|
458
509
|
...(mergedSuggestions.length > 0 && { suggestions: mergedSuggestions }),
|
|
459
510
|
}));
|
|
@@ -499,17 +550,56 @@ export function outputError(err, json) {
|
|
|
499
550
|
}
|
|
500
551
|
}
|
|
501
552
|
else if (err instanceof Error) {
|
|
553
|
+
// Some errors carry a structured `error_kind` (e.g. TunnelInactive from
|
|
554
|
+
// `chat endpoint test`, ConfirmationRequired from destructive --json
|
|
555
|
+
// gating) — surface it in the JSON envelope so agents branch on it
|
|
556
|
+
// without parsing prose. `example` rides along for self-correction
|
|
557
|
+
// hints (e.g. "<the same command with --yes appended>").
|
|
558
|
+
//
|
|
559
|
+
// Errors that pre-declare their own `error_code` + `retryable`
|
|
560
|
+
// (e.g. WaitTimeoutError from study run/wait timer expiry) bypass
|
|
561
|
+
// the generic `client_error` mapping so an agent can branch on
|
|
562
|
+
// `wait_timeout` vs `client_error` without parsing prose. The
|
|
563
|
+
// optional `progress` field rides along verbatim — agents need to
|
|
564
|
+
// know how far the wait got (M8 + M9).
|
|
565
|
+
const tagged = err;
|
|
566
|
+
const errorKind = typeof tagged.error_kind === "string" ? tagged.error_kind : undefined;
|
|
567
|
+
const example = typeof tagged.example === "string" ? tagged.example : undefined;
|
|
568
|
+
const errorCode = typeof tagged.error_code === "string" ? tagged.error_code : "client_error";
|
|
569
|
+
const retryable = typeof tagged.retryable === "boolean" ? tagged.retryable : false;
|
|
570
|
+
const progress = tagged.progress ?? undefined;
|
|
571
|
+
const seededIds = Array.isArray(tagged.seeded_but_not_dispatched_ids)
|
|
572
|
+
? tagged.seeded_but_not_dispatched_ids
|
|
573
|
+
: undefined;
|
|
574
|
+
const seededAliases = Array.isArray(tagged.seeded_but_not_dispatched_aliases)
|
|
575
|
+
? tagged.seeded_but_not_dispatched_aliases
|
|
576
|
+
: undefined;
|
|
577
|
+
// Errors can pre-attach `suggestions: string[]` (e.g.
|
|
578
|
+
// `noActiveContextError` for Pattern A — "no active study; pass --study
|
|
579
|
+
// or run study use"). Merge alongside the code-based suggestions so the
|
|
580
|
+
// envelope carries both without dropping either.
|
|
581
|
+
const taggedSuggestions = Array.isArray(tagged.suggestions)
|
|
582
|
+
? tagged.suggestions.filter((s) => typeof s === "string")
|
|
583
|
+
: [];
|
|
584
|
+
const mergedSuggestions = [...new Set([...suggestions, ...taggedSuggestions])];
|
|
502
585
|
if (json) {
|
|
503
586
|
console.error(JSON.stringify({
|
|
504
587
|
error: err.message,
|
|
505
|
-
error_code:
|
|
506
|
-
retryable
|
|
507
|
-
...(
|
|
588
|
+
error_code: errorCode,
|
|
589
|
+
retryable,
|
|
590
|
+
...(errorKind && { error_kind: errorKind }),
|
|
591
|
+
...(example && { example }),
|
|
592
|
+
...(progress !== undefined && { progress }),
|
|
593
|
+
...(seededIds && { seeded_but_not_dispatched_ids: seededIds }),
|
|
594
|
+
...(seededAliases && { seeded_but_not_dispatched_aliases: seededAliases }),
|
|
595
|
+
...(mergedSuggestions.length > 0 && { suggestions: mergedSuggestions }),
|
|
508
596
|
}));
|
|
509
597
|
}
|
|
510
598
|
else {
|
|
511
599
|
console.error(`Error: ${err.message}`);
|
|
512
|
-
|
|
600
|
+
if (example)
|
|
601
|
+
console.error(` → Example: ${example}`);
|
|
602
|
+
for (const s of mergedSuggestions)
|
|
513
603
|
console.error(` → ${s}`);
|
|
514
604
|
}
|
|
515
605
|
}
|
|
@@ -764,7 +854,9 @@ function buildStudyResultsEnvelope(study) {
|
|
|
764
854
|
total: sentimentTotal,
|
|
765
855
|
}
|
|
766
856
|
: null;
|
|
767
|
-
// Group interview answers by question for easy parsing.
|
|
857
|
+
// Group interview answers by question for easy parsing. Each answer row
|
|
858
|
+
// includes the tester's session-level `sentiment` (M10) so agents can read
|
|
859
|
+
// sentiment per answer without round-tripping `study tester <id>` per row.
|
|
768
860
|
const questions = Array.isArray(study.interview_questions) ? study.interview_questions : [];
|
|
769
861
|
const interviewAnswers = questions.map((q) => {
|
|
770
862
|
const qObj = q;
|
|
@@ -773,10 +865,11 @@ function buildStudyResultsEnvelope(study) {
|
|
|
773
865
|
const a = t.interviewAnswers.find((x) => x.questionId === qObj.id);
|
|
774
866
|
if (a) {
|
|
775
867
|
answers.push({
|
|
776
|
-
tester_alias: t.
|
|
868
|
+
tester_alias: t.alias || null,
|
|
777
869
|
tester_name: t.name,
|
|
778
870
|
iteration: t.iterationLabel,
|
|
779
871
|
answer: a.answer,
|
|
872
|
+
sentiment: t.summarySentiment,
|
|
780
873
|
});
|
|
781
874
|
}
|
|
782
875
|
}
|
|
@@ -794,11 +887,13 @@ function buildStudyResultsEnvelope(study) {
|
|
|
794
887
|
// agents can act on a failed run without re-fetching every tester.
|
|
795
888
|
const failedCount = allTesters.filter((t) => t.status.toLowerCase() === "failed").length;
|
|
796
889
|
const testerRows = allTesters.map((t) => ({
|
|
797
|
-
alias: t.
|
|
890
|
+
alias: t.alias || null,
|
|
798
891
|
name: t.name,
|
|
799
892
|
iteration: t.iterationLabel,
|
|
800
893
|
status: t.status,
|
|
801
894
|
interaction_count: t.interactionCount,
|
|
895
|
+
sentiment: t.summarySentiment,
|
|
896
|
+
...(t.summaryComment && { comment: t.summaryComment }),
|
|
802
897
|
...(t.errorMessage && { error_message: t.errorMessage }),
|
|
803
898
|
}));
|
|
804
899
|
return {
|
|
@@ -888,6 +983,210 @@ export function formatStudyResults(study, json) {
|
|
|
888
983
|
console.log("\nRun `ish tester get <id> --json` for full interaction details.");
|
|
889
984
|
}
|
|
890
985
|
}
|
|
986
|
+
/**
 * `study results --summary` projection. Drops interview_answers + per-tester
 * interaction breakdowns; keeps headline counters, the aggregated sentiment
 * histogram, and one compact row per tester.
 *
 * Status matching is case-insensitive for BOTH counters, so a status such as
 * "Completed" or "FAILED" lands in the right bucket. `completed_count`
 * accepts either "completed" or "complete".
 *
 * @param {object} study - Study payload. It is handed to `collectTesters`,
 *   and its `id`, `name` and `modality` fields are read directly.
 * @returns {object} `{ study: { alias, name, modality }, tester_count,
 *   completed_count, failed_count, sentiment, testers }`. `sentiment` is
 *   `null` when no sentiment was counted at all; otherwise
 *   `{ counts, total }`.
 */
export function buildStudyResultsSummary(study) {
    const allTesters = collectTesters(study);
    const studyAlias = study.id
        ? deterministicAlias(ALIAS_PREFIX.study, String(study.id))
        : null;
    // Normalise casing once per tester so the two counters cannot disagree
    // about how a status is spelled.
    let completedCount = 0;
    let failedCount = 0;
    for (const t of allTesters) {
        const status = t.status.toLowerCase();
        if (status === "completed" || status === "complete") {
            completedCount++;
        }
        else if (status === "failed") {
            failedCount++;
        }
    }
    // Fold every tester's per-label sentiment counts into one histogram.
    const sentimentCounts = {};
    let sentimentTotal = 0;
    for (const t of allTesters) {
        for (const [label, count] of Object.entries(t.sentimentCounts)) {
            sentimentCounts[label] = (sentimentCounts[label] || 0) + count;
            sentimentTotal += count;
        }
    }
    const sentiment = sentimentTotal > 0 ? { counts: sentimentCounts, total: sentimentTotal } : null;
    const testers = allTesters.map((t) => ({
        // An empty alias is reported as null rather than "".
        alias: t.alias || null,
        name: t.name,
        status: t.status,
        sentiment: t.summarySentiment,
        // Optional fields are omitted entirely when empty.
        ...(t.summaryComment && { comment: t.summaryComment }),
        ...(t.errorMessage && { error_message: t.errorMessage }),
    }));
    return {
        study: {
            alias: studyAlias,
            name: study.name || null,
            modality: study.modality || null,
        },
        tester_count: allTesters.length,
        completed_count: completedCount,
        failed_count: failedCount,
        sentiment,
        testers,
    };
}
|
|
1029
|
+
/**
 * Order two sort keys. When both keys are numbers they are compared
 * numerically; otherwise both are compared as strings, with null/undefined
 * treated as "". Plain string comparison alone would place 1000 before 200.
 */
function compareTranscriptSortKeys(a, b) {
    const bothNumeric = typeof a === "number" && typeof b === "number";
    const left = bothNumeric ? a : String(a ?? "");
    const right = bothNumeric ? b : String(b ?? "");
    return left < right ? -1 : left > right ? 1 : 0;
}
/** Return the first candidate that is a non-empty string, or null if none is. */
function firstNonEmptyString(...candidates) {
    for (const candidate of candidates) {
        if (typeof candidate === "string" && candidate.length > 0)
            return candidate;
    }
    return null;
}
/**
 * `study results --transcript <tester_id>` projection: flattens a tester's
 * interactions into an ordered list of bot / tester turns.
 *
 * For each interaction (sorted by `timestamp`, falling back to `created_at`,
 * with `created_at` as the tie-breaker):
 *   - if `bot_reply` is an object, a `role: "bot"` turn is emitted. A reply
 *     with a `failure` block surfaces `failure` plus `text: null` and does
 *     not count toward `unique_bot_replies`;
 *   - if `actor` (or legacy `interaction_type`) is "ai" or "human", a
 *     `role: "tester"` turn is emitted. Its `text` is the first non-empty
 *     string among `action.data.text`, `action.data.wire_text`,
 *     `action.data.said_instead` and the legacy top-level `action.text`;
 *     it is `null` only when none of those is present.
 * Any other actor produces no tester turn. Entries in `interactions` that
 * are not objects (e.g. `null`) are skipped.
 *
 * @param {object} tester - Tester payload; reads `id`, `tester_profile`,
 *   `tester_summary`, `instance_name` and `interactions`.
 * @returns {object} `{ tester_id, tester_alias, instance_name, modality,
 *   transcript, unique_bot_replies, tester_summary, name }`.
 */
export function buildChatTranscript(tester) {
    const id = String(tester.id || "");
    const alias = id ? deterministicAlias(ALIAS_PREFIX.tester, id) : null;
    const profile = tester.tester_profile;
    const rawInteractions = Array.isArray(tester.interactions) ? tester.interactions : [];
    // `filter` already yields a fresh array, so sorting it leaves the
    // caller's `interactions` untouched.
    const sorted = rawInteractions
        .filter((entry) => entry !== null && typeof entry === "object")
        .sort((a, b) => {
            const primary = compareTranscriptSortKeys(a.timestamp ?? a.created_at, b.timestamp ?? b.created_at);
            return primary !== 0
                ? primary
                : compareTranscriptSortKeys(a.created_at, b.created_at);
        });
    const transcript = [];
    const uniqueBotReplies = new Set();
    for (const ix of sorted) {
        // The tester's message lives under `actions[0].data`; a singular
        // `ix.action` is accepted as a fallback when `actions` is empty.
        const actions = Array.isArray(ix.actions) ? ix.actions : [];
        const action0 = actions.length > 0 ? actions[0] : ix.action;
        const actionData = action0?.data;
        const actionType = action0?.action_type
            ? String(action0.action_type)
            : (actionData?.type ? String(actionData.type) : null);
        // Turn index: action.data first, then the interaction itself, else 0.
        const turnIndex = typeof actionData?.turn_index === "number"
            ? actionData.turn_index
            : typeof ix.turn_index === "number"
                ? ix.turn_index
                : typeof ix.turn === "number" ? ix.turn : 0;
        const botReply = ix.bot_reply;
        if (botReply && typeof botReply === "object") {
            const failure = botReply.failure;
            const text = failure ? null : (botReply.text != null ? String(botReply.text) : null);
            transcript.push({
                role: "bot",
                text,
                turn_index: turnIndex,
                failure: failure ?? null,
            });
            // Only successful, non-empty replies count as distinct replies.
            if (!failure && text != null && text.length > 0) {
                uniqueBotReplies.add(text);
            }
        }
        const actor = String(ix.actor ?? ix.interaction_type ?? "");
        if (actor === "ai" || actor === "human") {
            const sentimentObj = ix.sentiment;
            transcript.push({
                role: "tester",
                text: firstNonEmptyString(actionData?.text, actionData?.wire_text, actionData?.said_instead, action0?.text),
                turn_index: turnIndex,
                action_type: actionType,
                option_label: firstNonEmptyString(actionData?.option_label, action0?.option_label, action0?.element_label),
                sentiment: sentimentObj?.label ? String(sentimentObj.label) : null,
            });
        }
    }
    const summary = tester.tester_summary;
    return {
        tester_id: id || null,
        tester_alias: alias,
        instance_name: tester.instance_name ?? null,
        modality: "chat",
        transcript,
        unique_bot_replies: uniqueBotReplies.size,
        tester_summary: summary
            ? {
                comment: summary.comment ?? null,
                sentiment: summary.sentiment ?? null,
            }
            : null,
        name: profile?.name ?? null,
    };
}
|
|
1160
|
+
/**
 * `study tester --summary` projection: the headline for a single tester
 * without the action timeline.
 *
 * @param {object} tester - Tester payload; reads `id`, `tester_profile`,
 *   `tester_summary`, `interactions`, `instance_name`, `iteration_id`,
 *   `status`, `error_message` and `error_kind`.
 * @returns {object} `{ tester: { alias, name, iteration_id, status },
 *   interaction_count, sentiment, comment }`, plus `error_message` and
 *   `error_kind` (stringified) only when the tester carries truthy values
 *   for them.
 */
export function buildTesterSummary(tester) {
    const testerId = String(tester.id || "");
    const persona = tester.tester_profile;
    const recap = tester.tester_summary;
    // A missing or non-array `interactions` counts as zero interactions.
    const interactionCount = Array.isArray(tester.interactions)
        ? tester.interactions.length
        : 0;
    const headline = {
        // No id means no alias can be derived.
        alias: testerId ? deterministicAlias(ALIAS_PREFIX.tester, testerId) : null,
        name: persona?.name ?? tester.instance_name ?? null,
        iteration_id: tester.iteration_id ?? null,
        status: tester.status ?? null,
    };
    return {
        tester: headline,
        interaction_count: interactionCount,
        sentiment: recap?.sentiment?.label ?? null,
        comment: recap?.comment ?? null,
        // Error fields are appended only when present, in this order.
        ...(tester.error_message ? { error_message: String(tester.error_message) } : {}),
        ...(tester.error_kind ? { error_kind: String(tester.error_kind) } : {}),
    };
}
|
|
891
1190
|
function collectTesters(study) {
|
|
892
1191
|
const iterations = Array.isArray(study.iterations) ? study.iterations : [];
|
|
893
1192
|
const testers = [];
|
|
@@ -909,14 +1208,20 @@ function collectTesters(study) {
|
|
|
909
1208
|
}
|
|
910
1209
|
}
|
|
911
1210
|
const answers = Array.isArray(t.interview_answers) ? t.interview_answers : [];
|
|
1211
|
+
const summary = t.tester_summary;
|
|
1212
|
+
const summarySentimentObj = summary?.sentiment;
|
|
1213
|
+
const id = String(t.id || "");
|
|
912
1214
|
testers.push({
|
|
913
|
-
id
|
|
1215
|
+
id,
|
|
914
1216
|
name: String(profile?.name || t.instance_name || "Unknown"),
|
|
1217
|
+
alias: id ? deterministicAlias(ALIAS_PREFIX.tester, id) : "",
|
|
915
1218
|
iterationLabel: iterLabel,
|
|
916
1219
|
status: String(t.status || "-"),
|
|
917
1220
|
errorMessage: t.error_message ? String(t.error_message) : null,
|
|
918
1221
|
interactionCount: interactions.length,
|
|
919
1222
|
sentimentCounts,
|
|
1223
|
+
summarySentiment: summarySentimentObj?.label ? String(summarySentimentObj.label) : null,
|
|
1224
|
+
summaryComment: summary?.comment ? String(summary.comment) : null,
|
|
920
1225
|
interviewAnswers: answers.map((a) => ({
|
|
921
1226
|
questionId: String(a.question_id || ""),
|
|
922
1227
|
answer: a.answer,
|
|
@@ -1111,6 +1416,23 @@ export function formatSimulationPoll(results, json, isMedia = false) {
|
|
|
1111
1416
|
String(r.interaction_count ?? "0"),
|
|
1112
1417
|
];
|
|
1113
1418
|
}));
|
|
1419
|
+
// Pattern A (cli half): list per-tester error_message under the table so
|
|
1420
|
+
// agents see why a simulation failed without re-fetching every tester.
|
|
1421
|
+
// Truncate to 200 chars; full text is available via --json or
|
|
1422
|
+
// `ish study tester get <id>`.
|
|
1423
|
+
const failedRows = results.filter((r) => {
|
|
1424
|
+
const status = String(r.status || "").toLowerCase();
|
|
1425
|
+
return (status === "failed" || status === "errored") && r.error_message;
|
|
1426
|
+
});
|
|
1427
|
+
if (failedRows.length > 0) {
|
|
1428
|
+
console.log("\nFailed simulations:");
|
|
1429
|
+
for (const r of failedRows) {
|
|
1430
|
+
const id = String(r.id || r.tester_id || "");
|
|
1431
|
+
const alias = aliasMap.get(id) || id;
|
|
1432
|
+
const name = String(r.tester_name || "Unknown");
|
|
1433
|
+
console.log(` ${alias} (${name}): ${truncate(String(r.error_message), 200)}`);
|
|
1434
|
+
}
|
|
1435
|
+
}
|
|
1114
1436
|
}
|
|
1115
1437
|
// --- Ask formatting ---
|
|
1116
1438
|
function variantLetter(index) {
|
|
@@ -1324,6 +1646,9 @@ function computeVariantStats(round) {
|
|
|
1324
1646
|
}
|
|
1325
1647
|
return stats;
|
|
1326
1648
|
}
|
|
1649
|
+
const ERROR_RATE_REFUSE_THRESHOLD = 0.5;
|
|
1650
|
+
const N_HIGH_CONFIDENCE_FLOOR = 10;
|
|
1651
|
+
const N_MEDIUM_CONFIDENCE_FLOOR = 3;
|
|
1327
1652
|
// When tester_profile and tester_profile_snapshot share all overlapping fields
|
|
1328
1653
|
// (the common case — snapshot only diverges if the profile was edited after
|
|
1329
1654
|
// dispatch), drop the redundant content from the snapshot and keep only the
|
|
@@ -1366,6 +1691,12 @@ function dedupeTesterSnapshot(tester) {
|
|
|
1366
1691
|
}
|
|
1367
1692
|
// Shape per-variant stats into a machine-readable aggregates object so agents
|
|
1368
1693
|
// running A/B tests can read the verdict without parsing prose.
|
|
1694
|
+
//
|
|
1695
|
+
// P6: the winner block carries `n` (completed-response sample size) and a
|
|
1696
|
+
// `confidence` summary derived from sample size, errored count, and tied-ness. When the
|
|
1697
|
+
// error rate is high enough that calling a winner would be misleading, the
|
|
1698
|
+
// winner block is REPLACED by a `{ refused: true, reason }` envelope — agents
|
|
1699
|
+
// should run `ish ask retry` first and re-read.
|
|
1369
1700
|
function buildAggregates(round, stats) {
|
|
1370
1701
|
if (stats.length === 0)
|
|
1371
1702
|
return undefined;
|
|
@@ -1374,6 +1705,19 @@ function buildAggregates(round, stats) {
|
|
|
1374
1705
|
if (!wantsPick && !wantsRatings)
|
|
1375
1706
|
return undefined;
|
|
1376
1707
|
const out = {};
|
|
1708
|
+
// Use the same source of truth as the denormalized counts so winner.n is
|
|
1709
|
+
// consistent with `responses_complete` reported on the round envelope.
|
|
1710
|
+
const responses = Array.isArray(round.responses) ? round.responses : [];
|
|
1711
|
+
let completedTotal = 0;
|
|
1712
|
+
let erroredTotal = 0;
|
|
1713
|
+
for (const r of responses) {
|
|
1714
|
+
const status = r.status;
|
|
1715
|
+
if (status === "completed")
|
|
1716
|
+
completedTotal++;
|
|
1717
|
+
else if (status === "errored")
|
|
1718
|
+
erroredTotal++;
|
|
1719
|
+
}
|
|
1720
|
+
const dispatchedTotal = completedTotal + erroredTotal;
|
|
1377
1721
|
if (wantsPick) {
|
|
1378
1722
|
const picks = {};
|
|
1379
1723
|
let topCount = -1;
|
|
@@ -1391,8 +1735,65 @@ function buildAggregates(round, stats) {
|
|
|
1391
1735
|
}
|
|
1392
1736
|
}
|
|
1393
1737
|
out.picks = picks;
|
|
1738
|
+
// Pattern H: pick_buckets keyed by variant id (the value `add_ask_round
|
|
1739
|
+
// --subset-variant` accepts) → tester_ids that picked it. Pre-seed with
|
|
1740
|
+
// every declared variant id so unpicked variants surface as empty
|
|
1741
|
+
// arrays. Only completed responses with a resolvable variant_pick_id
|
|
1742
|
+
// contribute; an errored response carrying a stale variant_pick_id
|
|
1743
|
+
// would otherwise drag a tester into a drill-in audience whose pick
|
|
1744
|
+
// we can't trust.
|
|
1745
|
+
const variants = Array.isArray(round.variants) ? round.variants : [];
|
|
1746
|
+
const variantIdSet = new Set();
|
|
1747
|
+
const pickBuckets = {};
|
|
1748
|
+
for (const v of variants) {
|
|
1749
|
+
const vid = v.id;
|
|
1750
|
+
if (typeof vid === "string" && vid.length > 0) {
|
|
1751
|
+
variantIdSet.add(vid);
|
|
1752
|
+
pickBuckets[vid] = [];
|
|
1753
|
+
}
|
|
1754
|
+
}
|
|
1755
|
+
if (variantIdSet.size > 0) {
|
|
1756
|
+
for (const r of responses) {
|
|
1757
|
+
const resp = r;
|
|
1758
|
+
if (resp.status !== "completed")
|
|
1759
|
+
continue;
|
|
1760
|
+
const vpid = resp.variant_pick_id;
|
|
1761
|
+
const tid = resp.tester_id;
|
|
1762
|
+
if (typeof vpid === "string"
|
|
1763
|
+
&& variantIdSet.has(vpid)
|
|
1764
|
+
&& typeof tid === "string"
|
|
1765
|
+
&& tid.length > 0) {
|
|
1766
|
+
pickBuckets[vpid].push(tid);
|
|
1767
|
+
}
|
|
1768
|
+
}
|
|
1769
|
+
out.pick_buckets = pickBuckets;
|
|
1770
|
+
}
|
|
1394
1771
|
if (topCount > 0) {
|
|
1395
|
-
|
|
1772
|
+
// Refuse the winner when more than half of dispatched responses errored.
|
|
1773
|
+
// Calling A or B with a 4/5 failure rate would mislead the agent into
|
|
1774
|
+
// treating one tester's pick as a verdict.
|
|
1775
|
+
if (dispatchedTotal > 0
|
|
1776
|
+
&& erroredTotal / dispatchedTotal > ERROR_RATE_REFUSE_THRESHOLD) {
|
|
1777
|
+
out.winner = {
|
|
1778
|
+
refused: true,
|
|
1779
|
+
reason: "error_rate_too_high",
|
|
1780
|
+
errored: erroredTotal,
|
|
1781
|
+
total: dispatchedTotal,
|
|
1782
|
+
};
|
|
1783
|
+
}
|
|
1784
|
+
else {
|
|
1785
|
+
out.winner = {
|
|
1786
|
+
label: topLetter,
|
|
1787
|
+
count: topCount,
|
|
1788
|
+
tied,
|
|
1789
|
+
n: completedTotal,
|
|
1790
|
+
confidence: deriveWinnerConfidence({
|
|
1791
|
+
n: completedTotal,
|
|
1792
|
+
errored: erroredTotal,
|
|
1793
|
+
tied,
|
|
1794
|
+
}),
|
|
1795
|
+
};
|
|
1796
|
+
}
|
|
1396
1797
|
}
|
|
1397
1798
|
}
|
|
1398
1799
|
if (wantsRatings) {
|
|
@@ -1410,6 +1811,30 @@ function buildAggregates(round, stats) {
|
|
|
1410
1811
|
}
|
|
1411
1812
|
return out;
|
|
1412
1813
|
}
|
|
1814
|
+
/**
 * Map a winner's sample size, error count and tied-ness to a coarse
 * confidence label.
 *
 * Any one of the following forces "low":
 *   - fewer than N_MEDIUM_CONFIDENCE_FLOOR completed responses,
 *   - a tie,
 *   - at least one errored response.
 * Otherwise the label is "medium" below N_HIGH_CONFIDENCE_FLOOR completed
 * responses and "high" at or above it.
 *
 * @param {{ n: number, errored: number, tied: boolean }} args - `n` is the
 *   completed-response count, `errored` the errored-response count.
 * @returns {"low" | "medium" | "high"}
 */
export function deriveWinnerConfidence(args) {
    const sampleSize = args.n;
    const disqualified = sampleSize < N_MEDIUM_CONFIDENCE_FLOOR
        || args.tied
        || args.errored > 0;
    if (disqualified) {
        return "low";
    }
    return sampleSize < N_HIGH_CONFIDENCE_FLOOR ? "medium" : "high";
}
|
|
1413
1838
|
function buildCrossRoundSummary(rounds) {
|
|
1414
1839
|
if (rounds.length < 2)
|
|
1415
1840
|
return undefined;
|
|
@@ -1555,11 +1980,20 @@ export function formatAskResults(ask, json, roundFilter) {
|
|
|
1555
1980
|
}
|
|
1556
1981
|
const headers = ["ROUND", "WINNER", ...Array.from(letters).sort()];
|
|
1557
1982
|
const rows = crossRound.rounds.map((entry) => {
|
|
1558
|
-
|
|
1559
|
-
|
|
1560
|
-
|
|
1561
|
-
|
|
1562
|
-
|
|
1983
|
+
let winnerCell;
|
|
1984
|
+
if (!entry.winner) {
|
|
1985
|
+
winnerCell = "-";
|
|
1986
|
+
}
|
|
1987
|
+
else if ("refused" in entry.winner) {
|
|
1988
|
+
// Too many failures to call a winner this round; agent should retry.
|
|
1989
|
+
winnerCell = `(refused: ${entry.winner.errored}/${entry.winner.total} errored)`;
|
|
1990
|
+
}
|
|
1991
|
+
else {
|
|
1992
|
+
const base = entry.winner.tied
|
|
1993
|
+
? `${entry.winner.label} (tied)`
|
|
1994
|
+
: entry.winner.label;
|
|
1995
|
+
winnerCell = `${base} · ${entry.winner.confidence}`;
|
|
1996
|
+
}
|
|
1563
1997
|
return [
|
|
1564
1998
|
`R${entry.round_number}`,
|
|
1565
1999
|
winnerCell,
|
package/dist/lib/paths.d.ts
CHANGED
package/dist/lib/paths.js
CHANGED