@kweaver-ai/kweaver-sdk 0.8.1 → 0.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214) hide show
  1. package/README.md +40 -52
  2. package/README.zh.md +41 -46
  3. package/dist/agent-providers/index.d.ts +7 -0
  4. package/dist/agent-providers/index.js +5 -0
  5. package/dist/agent-providers/prompt-template.d.ts +62 -0
  6. package/dist/agent-providers/prompt-template.js +105 -0
  7. package/dist/agent-providers/prompts/rubric-judge-v1.prompt.md +51 -0
  8. package/dist/agent-providers/prompts/within-trace-synthesizer-v1.prompt.md +60 -0
  9. package/dist/agent-providers/providers/claude-code-subprocess.d.ts +74 -0
  10. package/dist/agent-providers/providers/claude-code-subprocess.js +259 -0
  11. package/dist/agent-providers/providers/stub.d.ts +47 -0
  12. package/dist/agent-providers/providers/stub.js +77 -0
  13. package/dist/agent-providers/registry.d.ts +45 -0
  14. package/dist/agent-providers/registry.js +77 -0
  15. package/dist/agent-providers/types.d.ts +91 -0
  16. package/dist/agent-providers/types.js +25 -0
  17. package/dist/api/agent-chat.js +8 -6
  18. package/dist/api/context-loader.d.ts +1 -0
  19. package/dist/api/resources.d.ts +94 -0
  20. package/dist/api/resources.js +166 -0
  21. package/dist/api/semantic-search.d.ts +5 -0
  22. package/dist/api/semantic-search.js +5 -0
  23. package/dist/api/skills.d.ts +75 -2
  24. package/dist/api/skills.js +108 -12
  25. package/dist/api/trace.d.ts +5 -0
  26. package/dist/api/trace.js +4 -0
  27. package/dist/cli.js +109 -15
  28. package/dist/client.d.ts +3 -3
  29. package/dist/client.js +5 -5
  30. package/dist/commands/agent/mode.d.ts +6 -0
  31. package/dist/commands/agent/mode.js +75 -0
  32. package/dist/commands/agent-members.js +27 -11
  33. package/dist/commands/agent.js +469 -286
  34. package/dist/commands/auth.js +184 -71
  35. package/dist/commands/bkn-metric.js +37 -16
  36. package/dist/commands/bkn-ops.js +164 -86
  37. package/dist/commands/bkn-query.js +99 -31
  38. package/dist/commands/bkn-schema.d.ts +3 -3
  39. package/dist/commands/bkn-schema.js +127 -86
  40. package/dist/commands/bkn.js +153 -114
  41. package/dist/commands/call.js +23 -13
  42. package/dist/commands/config.js +22 -12
  43. package/dist/commands/context-loader.js +625 -49
  44. package/dist/commands/dataflow.js +14 -6
  45. package/dist/commands/ds.js +52 -30
  46. package/dist/commands/explore.js +18 -15
  47. package/dist/commands/model.js +53 -42
  48. package/dist/commands/resource.d.ts +1 -0
  49. package/dist/commands/{dataview.js → resource.js} +62 -84
  50. package/dist/commands/skill.d.ts +21 -1
  51. package/dist/commands/skill.js +567 -43
  52. package/dist/commands/token.js +11 -0
  53. package/dist/commands/tool.js +46 -29
  54. package/dist/commands/toolbox.js +31 -15
  55. package/dist/commands/trace.d.ts +26 -1
  56. package/dist/commands/trace.js +515 -15
  57. package/dist/commands/vega.js +466 -250
  58. package/dist/help/format.d.ts +65 -0
  59. package/dist/help/format.js +141 -0
  60. package/dist/index.d.ts +5 -5
  61. package/dist/index.js +3 -3
  62. package/dist/resources/bkn.d.ts +5 -0
  63. package/dist/resources/bkn.js +5 -0
  64. package/dist/resources/{dataviews.d.ts → resources.d.ts} +10 -11
  65. package/dist/resources/{dataviews.js → resources.js} +12 -13
  66. package/dist/resources/skills.d.ts +17 -1
  67. package/dist/resources/skills.js +32 -1
  68. package/dist/trace-ai/diagnose/agent-binding.d.ts +67 -0
  69. package/dist/trace-ai/diagnose/agent-binding.js +257 -0
  70. package/dist/trace-ai/diagnose/builtin-rules/tool-retry-intent-mismatch.yaml +68 -0
  71. package/dist/trace-ai/diagnose/index.d.ts +32 -0
  72. package/dist/trace-ai/diagnose/index.js +246 -0
  73. package/dist/trace-ai/diagnose/output-schema-converter.d.ts +24 -0
  74. package/dist/trace-ai/diagnose/output-schema-converter.js +81 -0
  75. package/dist/trace-ai/diagnose/query-extractor.d.ts +14 -0
  76. package/dist/trace-ai/diagnose/query-extractor.js +45 -0
  77. package/dist/trace-ai/diagnose/report-assembler.d.ts +31 -0
  78. package/dist/{trace-core → trace-ai}/diagnose/report-assembler.js +19 -9
  79. package/dist/trace-ai/diagnose/report-markdown.d.ts +18 -0
  80. package/dist/trace-ai/diagnose/report-markdown.js +192 -0
  81. package/dist/{trace-core → trace-ai}/diagnose/rule-loader.js +42 -8
  82. package/dist/{trace-core → trace-ai}/diagnose/schemas.d.ts +77 -2
  83. package/dist/trace-ai/diagnose/schemas.js +154 -0
  84. package/dist/trace-ai/diagnose/signal-probe.d.ts +17 -0
  85. package/dist/trace-ai/diagnose/signal-probe.js +39 -0
  86. package/dist/trace-ai/diagnose/synthesizer-agent.d.ts +40 -0
  87. package/dist/trace-ai/diagnose/synthesizer-agent.js +158 -0
  88. package/dist/{trace-core → trace-ai}/diagnose/trace-shaper.js +1 -0
  89. package/dist/{trace-core → trace-ai}/diagnose/types.d.ts +55 -6
  90. package/dist/trace-ai/eval-set/assertion-evaluator.d.ts +29 -0
  91. package/dist/trace-ai/eval-set/assertion-evaluator.js +100 -0
  92. package/dist/trace-ai/eval-set/builder.d.ts +36 -0
  93. package/dist/trace-ai/eval-set/builder.js +126 -0
  94. package/dist/trace-ai/eval-set/index.d.ts +15 -0
  95. package/dist/trace-ai/eval-set/index.js +10 -0
  96. package/dist/trace-ai/eval-set/output-writer.d.ts +27 -0
  97. package/dist/trace-ai/eval-set/output-writer.js +126 -0
  98. package/dist/trace-ai/eval-set/query-picker.d.ts +37 -0
  99. package/dist/trace-ai/eval-set/query-picker.js +147 -0
  100. package/dist/trace-ai/eval-set/redactor.d.ts +42 -0
  101. package/dist/trace-ai/eval-set/redactor.js +133 -0
  102. package/dist/trace-ai/eval-set/rubric-templates/answer-match-reference.prompt.md +19 -0
  103. package/dist/trace-ai/eval-set/schemas.d.ts +136 -0
  104. package/dist/trace-ai/eval-set/schemas.js +130 -0
  105. package/dist/trace-ai/eval-set/semantic-match-provider.d.ts +33 -0
  106. package/dist/trace-ai/eval-set/semantic-match-provider.js +51 -0
  107. package/dist/trace-ai/eval-set/test-runner.d.ts +34 -0
  108. package/dist/trace-ai/eval-set/test-runner.js +153 -0
  109. package/dist/trace-ai/eval-set/types.d.ts +46 -0
  110. package/dist/trace-ai/eval-set/types.js +8 -0
  111. package/dist/trace-ai/exp/bundle-writer.d.ts +10 -0
  112. package/dist/trace-ai/exp/bundle-writer.js +54 -0
  113. package/dist/trace-ai/exp/claude-binary.d.ts +5 -0
  114. package/dist/trace-ai/exp/claude-binary.js +30 -0
  115. package/dist/trace-ai/exp/coordinator.d.ts +45 -0
  116. package/dist/trace-ai/exp/coordinator.js +203 -0
  117. package/dist/trace-ai/exp/eval-runner.d.ts +14 -0
  118. package/dist/trace-ai/exp/eval-runner.js +47 -0
  119. package/dist/trace-ai/exp/exp-store/abort-signal.d.ts +3 -0
  120. package/dist/trace-ai/exp/exp-store/abort-signal.js +27 -0
  121. package/dist/trace-ai/exp/exp-store/candidate-lineage-yaml.d.ts +4 -0
  122. package/dist/trace-ai/exp/exp-store/candidate-lineage-yaml.js +37 -0
  123. package/dist/trace-ai/exp/exp-store/events-jsonl.d.ts +17 -0
  124. package/dist/trace-ai/exp/exp-store/events-jsonl.js +60 -0
  125. package/dist/trace-ai/exp/exp-store/exp-registry.d.ts +6 -0
  126. package/dist/trace-ai/exp/exp-store/exp-registry.js +41 -0
  127. package/dist/trace-ai/exp/exp-store/index.d.ts +46 -0
  128. package/dist/trace-ai/exp/exp-store/index.js +59 -0
  129. package/dist/trace-ai/exp/exp-store/lock.d.ts +3 -0
  130. package/dist/trace-ai/exp/exp-store/lock.js +73 -0
  131. package/dist/trace-ai/exp/exp-store/mission-md.d.ts +3 -0
  132. package/dist/trace-ai/exp/exp-store/mission-md.js +37 -0
  133. package/dist/trace-ai/exp/exp-store/readme-template.d.ts +5 -0
  134. package/dist/trace-ai/exp/exp-store/readme-template.js +25 -0
  135. package/dist/trace-ai/exp/exp-store/round-yaml.d.ts +3 -0
  136. package/dist/trace-ai/exp/exp-store/round-yaml.js +33 -0
  137. package/dist/trace-ai/exp/index.d.ts +8 -0
  138. package/dist/trace-ai/exp/index.js +238 -0
  139. package/dist/trace-ai/exp/info.d.ts +35 -0
  140. package/dist/trace-ai/exp/info.js +120 -0
  141. package/dist/trace-ai/exp/patch/agent-config.d.ts +1 -0
  142. package/dist/trace-ai/exp/patch/agent-config.js +26 -0
  143. package/dist/trace-ai/exp/patch/index.d.ts +2 -0
  144. package/dist/trace-ai/exp/patch/index.js +13 -0
  145. package/dist/trace-ai/exp/patch/skill.d.ts +1 -0
  146. package/dist/trace-ai/exp/patch/skill.js +24 -0
  147. package/dist/trace-ai/exp/providers/synthesizer-client.d.ts +14 -0
  148. package/dist/trace-ai/exp/providers/synthesizer-client.js +39 -0
  149. package/dist/trace-ai/exp/providers/triage-client.d.ts +19 -0
  150. package/dist/trace-ai/exp/providers/triage-client.js +51 -0
  151. package/dist/trace-ai/exp/schemas.d.ts +147 -0
  152. package/dist/trace-ai/exp/schemas.js +50 -0
  153. package/dist/trace-ai/exp/scoring.d.ts +2 -0
  154. package/dist/trace-ai/exp/scoring.js +46 -0
  155. package/dist/trace-ai/scan/aggregator.d.ts +20 -0
  156. package/dist/trace-ai/scan/aggregator.js +26 -0
  157. package/dist/trace-ai/scan/artifacts/paths.d.ts +12 -0
  158. package/dist/trace-ai/scan/artifacts/paths.js +18 -0
  159. package/dist/trace-ai/scan/artifacts/writer.d.ts +67 -0
  160. package/dist/trace-ai/scan/artifacts/writer.js +96 -0
  161. package/dist/trace-ai/scan/batched-rubric.d.ts +55 -0
  162. package/dist/trace-ai/scan/batched-rubric.js +159 -0
  163. package/dist/trace-ai/scan/cross-trace-synthesizer.d.ts +24 -0
  164. package/dist/trace-ai/scan/cross-trace-synthesizer.js +93 -0
  165. package/dist/trace-ai/scan/index.d.ts +31 -0
  166. package/dist/trace-ai/scan/index.js +390 -0
  167. package/dist/trace-ai/scan/prompts/builtin/cross-trace-synthesizer-v1.prompt.md +44 -0
  168. package/dist/trace-ai/scan/prompts/builtin/rubric-judge-batch-v1.prompt.md +44 -0
  169. package/dist/trace-ai/scan/runner.d.ts +25 -0
  170. package/dist/trace-ai/scan/runner.js +42 -0
  171. package/dist/trace-ai/scan/sampler.d.ts +18 -0
  172. package/dist/trace-ai/scan/sampler.js +81 -0
  173. package/dist/trace-ai/scan/scan-summary-markdown.d.ts +2 -0
  174. package/dist/trace-ai/scan/scan-summary-markdown.js +71 -0
  175. package/dist/trace-ai/scan/scan-summary-schema.d.ts +73 -0
  176. package/dist/trace-ai/scan/scan-summary-schema.js +61 -0
  177. package/dist/trace-ai/scan/single-agent-validator.d.ts +23 -0
  178. package/dist/trace-ai/scan/single-agent-validator.js +42 -0
  179. package/dist/trace-ai/scan/traces-list-parser.d.ts +15 -0
  180. package/dist/trace-ai/scan/traces-list-parser.js +46 -0
  181. package/package.json +2 -2
  182. package/dist/api/dataviews.d.ts +0 -117
  183. package/dist/api/dataviews.js +0 -265
  184. package/dist/commands/dataview.d.ts +0 -8
  185. package/dist/trace-core/diagnose/index.d.ts +0 -9
  186. package/dist/trace-core/diagnose/index.js +0 -104
  187. package/dist/trace-core/diagnose/report-assembler.d.ts +0 -12
  188. package/dist/trace-core/diagnose/schemas.js +0 -94
  189. package/dist/trace-core/diagnose/signal-probe.d.ts +0 -5
  190. package/dist/trace-core/diagnose/signal-probe.js +0 -21
  191. /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/excessive-tool-calls-per-turn.d.ts +0 -0
  192. /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/excessive-tool-calls-per-turn.js +0 -0
  193. /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/excessive-tool-calls-per-turn.yaml +0 -0
  194. /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/llm-response-truncated-no-continue.d.ts +0 -0
  195. /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/llm-response-truncated-no-continue.js +0 -0
  196. /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/llm-response-truncated-no-continue.yaml +0 -0
  197. /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/register.d.ts +0 -0
  198. /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/register.js +0 -0
  199. /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/retrieval-empty-no-fallback.d.ts +0 -0
  200. /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/retrieval-empty-no-fallback.js +0 -0
  201. /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/retrieval-empty-no-fallback.yaml +0 -0
  202. /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-error-swallowed.d.ts +0 -0
  203. /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-error-swallowed.js +0 -0
  204. /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-error-swallowed.yaml +0 -0
  205. /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-loop-no-state-change.d.ts +0 -0
  206. /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-loop-no-state-change.js +0 -0
  207. /package/dist/{trace-core → trace-ai}/diagnose/builtin-rules/tool-loop-no-state-change.yaml +0 -0
  208. /package/dist/{trace-core → trace-ai}/diagnose/predicate-registry.d.ts +0 -0
  209. /package/dist/{trace-core → trace-ai}/diagnose/predicate-registry.js +0 -0
  210. /package/dist/{trace-core → trace-ai}/diagnose/rule-loader.d.ts +0 -0
  211. /package/dist/{trace-core → trace-ai}/diagnose/synthesizer-template.d.ts +0 -0
  212. /package/dist/{trace-core → trace-ai}/diagnose/synthesizer-template.js +0 -0
  213. /package/dist/{trace-core → trace-ai}/diagnose/trace-shaper.d.ts +0 -0
  214. /package/dist/{trace-core → trace-ai}/diagnose/types.js +0 -0
@@ -0,0 +1,153 @@
1
+ import fs from "node:fs/promises";
2
+ import path from "node:path";
3
+ import yaml from "js-yaml";
4
+ import { evaluateAssertion } from "./assertion-evaluator.js";
5
+ import { EvalSetIndexSchema, EvalSetShardSchema, TestReportSchema } from "./schemas.js";
6
+ // ── eval-set loader ───────────────────────────────────────────────────────────
7
/**
 * Reads `index.yaml` from an eval-set directory and gathers the cases of
 * every shard it lists, in index order.
 *
 * @param evalSetDir directory holding `index.yaml` and the shard files it references
 * @returns one flat array with the cases of all shards
 */
async function loadEvalCases(evalSetDir) {
    // Shared helper: resolve a path relative to the eval-set dir and YAML-decode it.
    const readYaml = async (relativePath) => yaml.load(await fs.readFile(path.join(evalSetDir, relativePath), "utf8"));
    const { shards } = EvalSetIndexSchema.parse(await readYaml("index.yaml"));
    const collected = [];
    // Shards are read one after another so the case order follows the index.
    for (const shardEntry of shards) {
        const shard = EvalSetShardSchema.parse(await readYaml(shardEntry.path));
        for (const evalCase of shard.cases) {
            collected.push(evalCase);
        }
    }
    return collected;
}
18
+ // ── case runner ───────────────────────────────────────────────────────────────
19
+ async function runCase(evalCase, agentInfo, deps) {
20
+ const startMs = Date.now();
21
+ let conversationId = null;
22
+ let traceId = null;
23
+ let spans = [];
24
+ let answerText = "";
25
+ let stage = "chat";
26
+ try {
27
+ const chatResult = await deps.sendChat({
28
+ agentInfo,
29
+ query: evalCase.input.user_message,
30
+ });
31
+ answerText = chatResult.text;
32
+ conversationId = chatResult.conversationId ?? null;
33
+ if (conversationId) {
34
+ stage = "trace";
35
+ const traceResult = await deps.fetchTrace(conversationId);
36
+ spans = traceResult.spans;
37
+ traceId = spans[0]?.traceId ?? null;
38
+ }
39
+ }
40
+ catch (e) {
41
+ const durationMs = Date.now() - startMs;
42
+ return {
43
+ query_id: evalCase.query_id,
44
+ status: "error",
45
+ conversation_id: conversationId,
46
+ trace_id: traceId,
47
+ duration_ms: durationMs,
48
+ assertion_results: [],
49
+ error_message: e instanceof Error ? e.message : String(e),
50
+ error_code: stage === "trace" ? "trace-fetch-failed" : "chat-failed",
51
+ };
52
+ }
53
+ const durationMs = Date.now() - startMs;
54
+ const assertionResults = [];
55
+ for (const assertion of evalCase.assertions ?? []) {
56
+ try {
57
+ const result = await evaluateAssertion(assertion, {
58
+ answer: answerText,
59
+ spans,
60
+ reference: evalCase.reference,
61
+ durationMs,
62
+ question: evalCase.input.user_message,
63
+ semanticMatchProvider: deps.semanticMatchProvider,
64
+ });
65
+ assertionResults.push({ assertion, verdict: result.verdict, actual: result.actual });
66
+ }
67
+ catch (e) {
68
+ assertionResults.push({ assertion, verdict: "skip", actual: `assertion-eval-error: ${e.message}` });
69
+ }
70
+ }
71
+ // A case may pass schema with reference-only (no assertions), but without
72
+ // assertions there is no pass/fail signal — mark skip so it does not
73
+ // silently inflate the pass count.
74
+ if (assertionResults.length === 0) {
75
+ return {
76
+ query_id: evalCase.query_id,
77
+ status: "skip",
78
+ conversation_id: conversationId,
79
+ trace_id: traceId,
80
+ duration_ms: durationMs,
81
+ assertion_results: assertionResults,
82
+ failure_reason: "no assertions configured; case has reference but no judge (e.g. semantic_match) wired",
83
+ };
84
+ }
85
+ const hasFail = assertionResults.some((r) => r.verdict === "fail");
86
+ const allSkip = assertionResults.every((r) => r.verdict === "skip");
87
+ const status = hasFail ? "fail" : allSkip ? "skip" : "pass";
88
+ return {
89
+ query_id: evalCase.query_id,
90
+ status: status,
91
+ conversation_id: conversationId,
92
+ trace_id: traceId,
93
+ duration_ms: durationMs,
94
+ assertion_results: assertionResults,
95
+ };
96
+ }
97
+ // ── main runner ───────────────────────────────────────────────────────────────
98
/**
 * Runs every case of an eval set against a candidate agent and writes
 * `<outDir>/report.yaml`.
 *
 * @param opts.evalSetDir            directory containing `index.yaml` and its shards
 * @param opts.candidateAgentId      agent id passed to `deps.fetchAgent`
 * @param opts.candidateAgentVersion agent version passed to `deps.fetchAgent`
 * @param opts.outDir                directory (created if missing) that receives `report.yaml`
 * @param opts.maxParallel           upper bound on concurrently running cases; defaults to 4,
 *                                   values below 1 are clamped to 1, NaN falls back to 4
 * @param opts.deps                  injected I/O (`fetchAgent`, plus whatever `runCase` needs)
 * @returns resolves once the report has been written
 */
export async function run(opts) {
    const { evalSetDir, candidateAgentId, candidateAgentVersion, outDir, deps } = opts;
    // A NaN limit (e.g. an unparsable CLI flag) would make Array.from build zero
    // workers below, leave every result slot undefined and crash the summary
    // loop — fall back to the default instead.
    const requestedParallel = Number(opts.maxParallel ?? 4);
    const maxParallel = Number.isNaN(requestedParallel) ? 4 : requestedParallel;
    const [cases, agentInfo] = await Promise.all([
        loadEvalCases(evalSetDir),
        deps.fetchAgent(candidateAgentId, candidateAgentVersion),
    ]);
    // Fetch eval_set_id from index
    const indexRaw = await fs.readFile(path.join(evalSetDir, "index.yaml"), "utf8");
    const index = EvalSetIndexSchema.parse(yaml.load(indexRaw));
    const ranAt = new Date().toISOString();
    const overallStart = Date.now();
    // Results are stored by case index so report order matches eval-set order
    // regardless of which worker finishes first.
    const caseResults = new Array(cases.length);
    let nextIdx = 0;
    const workerCount = Math.max(1, Math.min(maxParallel, cases.length));
    // Simple worker pool: each worker claims the next unclaimed index until none remain.
    const workers = Array.from({ length: workerCount }, async () => {
        while (true) {
            const idx = nextIdx++;
            if (idx >= cases.length)
                return;
            caseResults[idx] = await runCase(cases[idx], agentInfo, deps);
        }
    });
    await Promise.all(workers);
    const overallDurationMs = Date.now() - overallStart;
    // Build summary
    const counts = { total: caseResults.length, pass: 0, fail: 0, error: 0, skip: 0 };
    const byType = {};
    for (const cr of caseResults) {
        counts[cr.status]++;
        for (const ar of cr.assertion_results) {
            const t = ar.assertion["type"];
            if (!byType[t])
                byType[t] = { pass: 0, fail: 0 };
            // Only pass/fail are tallied per type; skipped assertions are not counted here.
            if (ar.verdict === "pass")
                byType[t].pass++;
            else if (ar.verdict === "fail")
                byType[t].fail++;
        }
    }
    const report = TestReportSchema.parse({
        schema_version: "trace-test-report/v1",
        meta: {
            eval_set_dir: evalSetDir,
            eval_set_id: index.eval_set_id,
            candidate: { agent_id: agentInfo.id, agent_version: agentInfo.version },
            // NOTE(review): placeholder value — confirm whether the real CLI version should be wired in.
            cli_version: "0.0.0",
            ran_at: ranAt,
            duration_ms: overallDurationMs,
        },
        summary: { ...counts, by_assertion_type: byType },
        cases: caseResults,
    });
    await fs.mkdir(outDir, { recursive: true });
    await fs.writeFile(path.join(outDir, "report.yaml"), yaml.dump(report, { lineWidth: 120 }), "utf8");
}
@@ -0,0 +1,46 @@
1
+ /**
2
+ * Internal types for the M5 eval-set module (PR-A).
3
+ *
4
+ * These mirror the zod schemas in `./schemas.ts` but are kept independent so
5
+ * non-validating code paths (builder / picker / redactor / output-writer) can
6
+ * import the types without paying the zod parse overhead at module load.
7
+ */
8
/** Input handed to the agent for one eval case. */
export interface EvalCaseInput {
    /** User message; the test runner sends it as the chat query. */
    user_message: string;
}
/** Reference material for a case; the test runner forwards it to assertion evaluation. */
export interface EvalReference {
    /** Reference answer text. */
    answer: string;
}
/** Names of the assertion kinds a case may declare. */
export type AssertionType = "contains" | "not_contains" | "regex" | "tool_call_count" | "tool_call_order" | "semantic_match" | "latency_ms";
/** One assertion: a `type` discriminator plus type-specific, untyped parameters. */
export interface EvalAssertion {
    type: AssertionType;
    /** Additional assertion parameters; their shape is not constrained here. */
    [key: string]: unknown;
}
/** A single eval case as stored in a shard. */
export interface EvalCase {
    /** Identifier copied into the case's result in the test report. */
    query_id: string;
    input: EvalCaseInput;
    reference?: EvalReference;
    /** When absent or empty the test runner reports the case as "skip". */
    assertions?: EvalAssertion[];
    tags?: string[];
}
/** Entry of the eval-set index pointing at one shard file. */
export interface EvalSetIndexShard {
    /** Shard file path; the test runner joins it onto the eval-set directory. */
    path: string;
    role?: "seed" | "regression" | "holdout";
}
/** Shape of an eval set's `index.yaml`. */
export interface EvalSetIndex {
    schema_version: "trace-eval-set-index/v1";
    /** Copied into the test report's `meta.eval_set_id`. */
    eval_set_id: string;
    shards: EvalSetIndexShard[];
}
/** Summary of an eval-set build — presumably returned by the builder; confirm against `builder.ts`. */
export interface BuildResult {
    cases_written: number;
    cases_skipped: number;
    conflicts: string[];
    shard_paths: string[];
    /** Where the applied redaction rules came from. */
    redaction_rules_source: "cli-flag" | "repo" | "builtin";
}
/** A named pattern/replacement pair — presumably consumed by the redactor; confirm against `redactor.ts`. */
export interface RedactionRule {
    name: string;
    pattern: RegExp;
    replace: string;
}
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Internal types for the M5 eval-set module (PR-A).
3
+ *
4
+ * These mirror the zod schemas in `./schemas.ts` but are kept independent so
5
+ * non-validating code paths (builder / picker / redactor / output-writer) can
6
+ * import the types without paying the zod parse overhead at module load.
7
+ */
8
+ export {};
@@ -0,0 +1,10 @@
1
+ import type { LineageEntry, RoundData } from "./schemas.js";
2
/** Inputs for {@link writeBundles}. */
interface WriteBundlesOpts {
    /** Experiment directory; output files are written under `<expDir>/outputs`. */
    expDir: string;
    /** Stamped as `experiment_id` into the bundle, manifest and provenance files. */
    experimentId: string;
    /** Candidate lineage; the last entry with status "scored" (else the last entry) is published. */
    lineage: LineageEntry[];
    /** All rounds; the last one feeds the manifest predictions. */
    rounds: RoundData[];
    /** Recorded as `provenance.created_by` in the bundle. */
    createdBy: string;
}
/** Writes `bundle.yaml`, `manifest.yaml` and `provenance.yaml` for an experiment. */
export declare function writeBundles(opts: WriteBundlesOpts): Promise<void>;
10
+ export {};
@@ -0,0 +1,54 @@
1
+ // src/trace-ai/exp/bundle-writer.ts
2
+ import fs from "node:fs/promises";
3
+ import path from "node:path";
4
+ import crypto from "node:crypto";
5
+ import yaml from "js-yaml";
6
/**
 * Writes the publishable outputs of an experiment to `<expDir>/outputs`:
 * `bundle.yaml`, `manifest.yaml` and `provenance.yaml`.
 *
 * @param opts.expDir       experiment directory
 * @param opts.experimentId id stamped into all three files
 * @param opts.lineage      candidate lineage entries
 * @param opts.rounds       recorded rounds; the last one drives the manifest predictions
 * @param opts.createdBy    recorded as the bundle's `provenance.created_by`
 * @returns resolves once all three files are written
 */
export async function writeBundles(opts) {
    const { expDir, experimentId, lineage, rounds, createdBy } = opts;
    // "Best" is the most recent scored entry, falling back to the most recent
    // entry of any status; no score comparison happens here.
    const bestEntry = lineage.filter(e => e.status === "scored").at(-1) ?? lineage.at(-1);
    const bestVersion = bestEntry?.version ?? 0;
    const bundleId = `bundle_${crypto.randomBytes(4).toString("hex")}`;
    const now = new Date().toISOString();
    const bundle = {
        schema_version: "trace-bundle/v1",
        experiment_id: experimentId,
        bundle_id: bundleId,
        best_trial_version: bestVersion,
        resources: {
            agent_config: bestEntry?.next_change ?? {},
            skills: [],
        },
        provenance: {
            created_by: createdBy,
            created_at: now,
            // Queries without a trace id are dropped rather than emitted as "".
            evidence_traces: rounds.flatMap(r => (r.per_query_results ?? []).map(q => q.raw_trace_id ?? "").filter(Boolean)),
            round_refs: rounds.map(r => `.trace-state/rounds/round-${r.round}.yaml`),
        },
    };
    const lastRoundResults = rounds.at(-1)?.per_query_results ?? [];
    const manifest = {
        schema_version: "trace-manifest/v1",
        experiment_id: experimentId,
        trial_version: bestVersion,
        predictions: {
            // `every` is vacuously true on an empty array, so require at least one
            // assertion; otherwise a query that was never judged would be
            // reported as a fix with "all assertions passed".
            fixes: lastRoundResults
                .filter(q => q.assertion_results.length > 0 && q.assertion_results.every(a => a.verdict === "pass"))
                .map(q => ({ query_id: q.query_id, reason: "all assertions passed" })),
            risks: lastRoundResults
                .filter(q => q.assertion_results.some(a => a.verdict === "fail"))
                .map(q => ({ query_id: q.query_id, reason: "assertions failed" })),
        },
    };
    const provenance = {
        experiment_id: experimentId,
        generated_at: now,
        rounds_count: rounds.length,
        lineage_count: lineage.length,
        // Rounds without a triage conclusion are reported as "pending".
        round_verdicts: rounds.map(r => ({ round: r.round, verdict: r.triage_conclusion?.verdict ?? "pending" })),
    };
    const outDir = path.join(expDir, "outputs");
    await fs.mkdir(outDir, { recursive: true });
    // lineWidth -1 disables js-yaml line folding so long values stay on one line.
    await fs.writeFile(path.join(outDir, "bundle.yaml"), yaml.dump(bundle, { lineWidth: -1 }));
    await fs.writeFile(path.join(outDir, "manifest.yaml"), yaml.dump(manifest, { lineWidth: -1 }));
    await fs.writeFile(path.join(outDir, "provenance.yaml"), yaml.dump(provenance, { lineWidth: -1 }));
}
@@ -0,0 +1,5 @@
1
/**
 * Resolves the path to the claude CLI binary.
 * Priority: CLAUDE_BIN env → `which claude` → known install locations → bare "claude".
 *
 * @returns the first candidate found, or the bare command name "claude" when
 *          none of the lookups succeeds
 */
export declare function resolveClaudeBinary(): string;
@@ -0,0 +1,30 @@
1
+ import { execSync } from "node:child_process";
2
+ import os from "node:os";
3
/**
 * Resolves the path to the claude CLI binary.
 * Priority: CLAUDE_BIN env → `which claude` → known install locations → bare "claude".
 *
 * @returns the first candidate found, or the bare command name "claude" when
 *          none of the lookups succeeds (the caller then relies on PATH lookup)
 */
export function resolveClaudeBinary() {
    const fromEnv = process.env["CLAUDE_BIN"];
    if (fromEnv)
        return fromEnv;
    try {
        const resolved = execSync("which claude", {
            encoding: "utf8",
            timeout: 3000,
            // Drop stderr: by default execSync forwards the child's stderr to the
            // parent, so a missing binary would print "which: no claude in ..." to
            // the user's terminal.
            stdio: ["ignore", "pipe", "ignore"],
        }).trim();
        // Accept only a single absolute path. This still rejects alias expansions
        // like "claude: aliased to ..." but no longer rejects install locations
        // whose path contains a space.
        if (resolved.startsWith("/") && !resolved.includes("\n"))
            return resolved;
    }
    catch { /* fall through */ }
    const home = os.homedir();
    for (const candidate of [
        `${home}/.local/bin/claude`,
        "/opt/homebrew/bin/claude",
        "/usr/local/bin/claude",
    ]) {
        try {
            // Pass the path through the environment instead of interpolating it into
            // the shell command, so quotes or `$` in the home directory cannot break
            // or alter the command.
            execSync('test -x "$KWEAVER_CLAUDE_CANDIDATE"', {
                timeout: 1000,
                stdio: "ignore",
                env: { ...process.env, KWEAVER_CLAUDE_CANDIDATE: candidate },
            });
            return candidate;
        }
        catch { /* try next */ }
    }
    return "claude";
}
@@ -0,0 +1,45 @@
1
+ import type { Mission, NextChange, QueryResult, RoundData } from "./schemas.js";
2
/** Produces the next suggested change after a round has been triaged. */
export interface SynthesizerClient {
    /**
     * Called by the coordinator only when triage returned "continue" and the
     * round limit has not been reached; the result is written back through the
     * experiment store as the suggested change.
     */
    generate(input: {
        mission: Mission;
        /** Candidate config after the current round's patch was applied. */
        candidateConfig: Record<string, unknown>;
        prevRound?: RoundData;
        prevRounds: RoundData[];
        /** Memory token returned by the triage step of the current round. */
        crossRoundMemoryRef?: string;
    }): Promise<NextChange>;
}
/** Diagnoses a scored round and decides how the experiment proceeds. */
export interface TriageClient {
    /**
     * @returns the triage conclusion plus `new_memory_token`, which the
     *          coordinator stores as the round's `cross_round_memory_ref`
     */
    triage(input: {
        currentRound: RoundData;
        prevRounds: RoundData[];
        candidateConfig: Record<string, unknown>;
        /** `cross_round_memory_ref` of the most recent previous round, if any. */
        crossRoundMemoryRef?: string;
    }): Promise<RoundData["triage_conclusion"] & {
        new_memory_token: string;
    }>;
}
/** Construction options for {@link ExperimentCoordinator}. */
export interface CoordinatorOpts {
    /** Experiment directory; candidate and eval-set paths are resolved against it. */
    expDir: string;
    synthesizer: SynthesizerClient;
    triage: TriageClient;
    /** Evaluates one candidate against the mission's eval sets for a given round. */
    runEval: (opts: {
        evalSetPaths: string[];
        candidatePath: string;
        expDir: string;
        round: number;
    }) => Promise<{
        queryResults: QueryResult[];
    }>;
    /** Id stamped onto published bundles; `run()` falls back to a timestamp-based id when omitted. */
    experimentId?: string;
}
/** Drives one experiment round: generate → execute → score → triage → decide/publish. */
export declare class ExperimentCoordinator {
    private opts;
    private store;
    private heartbeatTimer?;
    constructor(opts: CoordinatorOpts);
    /** Starts (or retries) a round; throws when the experiment is in a terminal, non-aborted state. */
    run(): Promise<void>;
    /** Continues with the next round; throws unless the experiment is in the "Deciding" state. */
    resume(): Promise<void>;
    private runLoop;
    private checkAbort;
    private withRetry;
}
@@ -0,0 +1,203 @@
1
+ // src/trace-ai/exp/coordinator.ts
2
+ import path from "node:path";
3
+ import fs from "node:fs/promises";
4
+ import yaml from "js-yaml";
5
+ import { ExpStore } from "./exp-store/index.js";
6
+ import { applyPatch } from "./patch/index.js";
7
+ import { computeScores } from "./scoring.js";
8
+ import { writeBundles } from "./bundle-writer.js";
9
/**
 * Drives a single experiment round through its states:
 * Generating → Executing → Scoring → Triaging → Deciding (→ Publishing → Published).
 *
 * Each call to `run()` / `resume()` executes at most one round and then either
 * publishes the outputs or pauses in "Deciding" for the user to review.
 * State is persisted through `ExpStore`; the coordinator holds the experiment
 * lock and refreshes a heartbeat while a round is in progress.
 */
export class ExperimentCoordinator {
    opts;
    store;
    heartbeatTimer;
    /**
     * @param opts coordinator options (`expDir`, `synthesizer`, `triage`, `runEval`, optional `experimentId`)
     */
    constructor(opts) {
        this.opts = opts;
        this.store = new ExpStore(opts.expDir);
    }
    /**
     * Starts the next round, or retries the round that failed last time.
     *
     * @throws Error when the experiment is already in a terminal state other than an aborted one
     */
    async run() {
        const replayed = await this.store.replayState();
        if (replayed.isTerminal && !replayed.currentState.includes("Aborted")) {
            throw new Error(`Experiment is in terminal state ${replayed.currentState}. Use --new-run to start fresh.`);
        }
        const mission = await this.store.readMission();
        const expId = this.opts.experimentId ?? `exp_${Date.now()}`;
        if (replayed.currentRound === 0) {
            await this.store.initDir(mission);
        }
        await this.store.acquireLock();
        this.startHeartbeat();
        // If previous run failed mid-round, retry that round (startRound = currentRound - 1)
        const startRound = replayed.lastFailure && replayed.currentRound > 0
            ? replayed.currentRound - 1
            : replayed.currentRound;
        try {
            await this.runLoop(mission, startRound, expId);
        }
        finally {
            clearInterval(this.heartbeatTimer);
            await this.store.releaseLock();
        }
    }
    /**
     * Continues a paused experiment with its next round.
     *
     * @throws Error when the experiment is not in the "Deciding" state
     */
    async resume() {
        const replayed = await this.store.replayState();
        if (replayed.currentState !== "Deciding") {
            throw new Error(`Cannot resume: experiment is in state ${replayed.currentState}, not Deciding. Only Deciding state supports resume.`);
        }
        await this.store.acquireLock();
        this.startHeartbeat();
        try {
            const mission = await this.store.readMission();
            // Honor the caller-supplied id exactly like run() does; otherwise a
            // resumed experiment would publish bundles stamped "exp_<round>".
            const expId = this.opts.experimentId ?? `exp_${replayed.currentRound}`;
            await this.runLoop(mission, replayed.currentRound, expId);
        }
        finally {
            clearInterval(this.heartbeatTimer);
            await this.store.releaseLock();
        }
    }
    /**
     * Executes round `startRound + 1`.
     *
     * Returns early (without throwing) when the user aborted, when a retried
     * step ultimately failed (`withRetry` has already logged `step_failed`),
     * or when a hard guardrail was violated.
     *
     * @param mission    parsed mission; must carry `next_change`
     * @param startRound number of the last completed round
     * @param expId      experiment id stamped onto published bundles
     * @throws Error when the mission has no `next_change`
     */
    async runLoop(mission, startRound, expId) {
        const round = startRound + 1;
        const maxRounds = mission.max_rounds ?? Infinity;
        if (await this.checkAbort(round))
            return;
        // === Generating (Apply Phase) ===
        await this.store.appendEvent({ type: "state_transition", from: "Deciding", to: "Generating", round });
        const nextChange = mission.next_change;
        if (!nextChange)
            throw new Error("mission.md has no next_change — add one or let Synthesizer suggest");
        // Snapshot taken before this round is written, so it holds earlier rounds only.
        const prevRounds = await this.store.readAllRounds();
        // Load current candidate and apply patch
        const currentCandidatePath = path.join(this.opts.expDir, mission.current_candidate.path);
        const currentCandidate = yaml.load(await fs.readFile(currentCandidatePath, "utf8"));
        const patched = applyPatch(currentCandidate, nextChange);
        patched["candidate_version"] = `v${round}`;
        const newCandidatePath = path.join(this.opts.expDir, "candidates", `candidate-v${round}.yaml`);
        await fs.writeFile(newCandidatePath, yaml.dump(patched, { lineWidth: -1 }));
        await this.store.appendLineage({
            version: round,
            candidate_path: `candidates/candidate-v${round}.yaml`,
            next_change: nextChange,
            status: "running",
        });
        if (await this.checkAbort(round))
            return;
        // === Executing ===
        await this.store.appendEvent({ type: "state_transition", from: "Generating", to: "Executing", round });
        const evalSetPaths = mission.eval_sets.map(e => path.join(this.opts.expDir, e.path));
        let queryResults;
        try {
            const result = await this.withRetry(() => this.opts.runEval({ evalSetPaths, candidatePath: newCandidatePath, expDir: this.opts.expDir, round }), "Executing");
            queryResults = result.queryResults;
        }
        catch {
            return; // step_failed already written by withRetry
        }
        if (await this.checkAbort(round))
            return;
        // === Scoring ===
        await this.store.appendEvent({ type: "state_transition", from: "Executing", to: "Scoring", round });
        const guardrails = mission.guardrails ?? [];
        const scores = computeScores(queryResults, guardrails);
        if (scores.guardrail_hard_fail) {
            // Hard guardrail violation: record it and pause in Deciding without triage.
            await this.store.updateLineage(round, { status: "guardrail_failed" });
            await this.store.writeRound(round, { round, trial_version: round, guardrail_failed: true, scores });
            await this.store.appendEvent({ type: "state_transition", from: "Scoring", to: "Deciding", round });
            process.stdout.write(`\nRound ${round}: Guardrail hard gate violated. Fix the candidate and run exp resume.\n`);
            return;
        }
        await this.store.updateLineage(round, { status: "scored" });
        await this.store.writeRound(round, { round, trial_version: round, scores, per_query_results: queryResults });
        if (await this.checkAbort(round))
            return;
        // === Triaging ===
        await this.store.appendEvent({ type: "state_transition", from: "Scoring", to: "Triaging", round });
        const currentRoundData = (await this.store.readAllRounds()).find(r => r.round === round) ?? { round, trial_version: round };
        const prevMemory = prevRounds.at(-1)?.triage_conclusion?.cross_round_memory_ref;
        let triageResult;
        try {
            triageResult = await this.withRetry(() => this.opts.triage.triage({
                currentRound: currentRoundData,
                prevRounds,
                candidateConfig: patched,
                crossRoundMemoryRef: prevMemory,
            }), "Triaging");
        }
        catch {
            return;
        }
        await this.store.writeRound(round, {
            triage_conclusion: {
                diagnoses: triageResult.diagnoses,
                hints: triageResult.hints,
                verdict: triageResult.verdict,
                cross_round_memory_ref: triageResult.new_memory_token,
            },
        });
        await this.store.appendEvent({ type: "round_completed", round, verdict: triageResult.verdict });
        // Generate next suggestion if continuing
        if (triageResult.verdict === "continue" && round < maxRounds) {
            const updatedMission = await this.store.readMission();
            try {
                // A synthesizer failure is logged under the "Triaging" state label.
                const suggestion = await this.withRetry(() => this.opts.synthesizer.generate({
                    mission: updatedMission,
                    candidateConfig: patched,
                    prevRound: currentRoundData,
                    prevRounds,
                    crossRoundMemoryRef: triageResult.new_memory_token,
                }), "Triaging");
                await this.store.writeSuggestedChange(suggestion);
            }
            catch {
                return;
            }
        }
        // === Deciding ===
        await this.store.appendEvent({ type: "state_transition", from: "Triaging", to: "Deciding", round });
        if (triageResult.verdict === "publish" || round >= maxRounds) {
            // Publish immediately
            await this.store.appendEvent({ type: "state_transition", from: "Deciding", to: "Publishing", round });
            const allRounds = await this.store.readAllRounds();
            const allLineage = await this.store.readLineage();
            await writeBundles({ expDir: this.opts.expDir, experimentId: expId, lineage: allLineage, rounds: allRounds, createdBy: process.env["USER"] ?? "unknown" });
            await this.store.appendEvent({ type: "state_transition", from: "Publishing", to: "Published", round });
            process.stdout.write(`\nExperiment complete. Outputs written to ${path.join(this.opts.expDir, "outputs")}\n`);
        }
        else {
            // Pause at Deciding — lock released by run()/resume() finally block
            process.stdout.write(`\nRound ${round} complete.\n`);
            process.stdout.write(`Scores: outcome=${scores.outcome.toFixed(2)}, trajectory=${scores.trajectory.toFixed(2)}\n`);
            process.stdout.write(`Triage: ${triageResult.diagnoses.join("; ")}\n`);
            process.stdout.write(`Next suggestion written to mission.md. Review and run exp resume to continue.\n`);
        }
    }
    /**
     * Checks for a user abort request. When one is pending, stops the heartbeat,
     * logs an "aborted" event and releases the lock.
     *
     * @param round round number recorded in the abort event
     * @returns true when the caller must stop processing
     */
    async checkAbort(round) {
        if (await this.store.isAborted()) {
            clearInterval(this.heartbeatTimer);
            await this.store.appendEvent({ type: "aborted", round, reason: "user_abort" });
            // NOTE(review): run()/resume() release the lock again in their finally
            // block — confirm releaseLock tolerates a second call.
            await this.store.releaseLock();
            return true;
        }
        return false;
    }
    /**
     * Runs `fn` up to three times, waiting 1s and then 2s between attempts.
     * After the last failure a `step_failed` event is appended and the last
     * error is rethrown.
     *
     * @param fn    async operation to attempt
     * @param state state label recorded in the `step_failed` event
     * @returns whatever `fn` resolves to
     */
    async withRetry(fn, state) {
        let lastErr;
        for (let attempt = 0; attempt < 3; attempt++) {
            try {
                return await fn();
            }
            catch (err) {
                lastErr = err;
                if (attempt < 2) {
                    // Exponential backoff: 1000ms, then 2000ms.
                    await new Promise(r => setTimeout(r, 1000 * 2 ** attempt));
                }
            }
        }
        await this.store.appendEvent({
            type: "step_failed",
            state: state,
            error: String(lastErr),
            retryable: true,
        });
        throw lastErr;
    }
    /**
     * Starts the 10-second heartbeat that keeps the experiment lock fresh.
     * A failed heartbeat write is reported on stderr instead of surfacing as an
     * unhandled promise rejection, which would terminate the process mid-round.
     */
    startHeartbeat() {
        this.heartbeatTimer = setInterval(() => {
            void (async () => {
                try {
                    await this.store.updateHeartbeat();
                }
                catch (err) {
                    process.stderr.write(`heartbeat update failed: ${String(err)}\n`);
                }
            })();
        }, 10_000);
    }
}
@@ -0,0 +1,14 @@
1
+ import type { QueryResult } from "./schemas.js";
2
+ import type { RunnerDeps } from "../eval-set/test-runner.js";
3
/** Options accepted by runEval. */
+ export interface EvalRunnerOpts {
4
/** Eval-set directories; runEval evaluates them one after another. */
+ evalSetPaths: string[];
5
/** Path of the candidate YAML file; runEval reads `agent_id` and `candidate_version` from it. */
+ candidatePath: string;
6
/** Experiment directory; eval output is written under `<expDir>/.trace-state/rounds/round-<round>-eval`. */
+ expDir: string;
7
/** Round number, used to name the per-round eval output directory. */
+ round: number;
8
/** Dependencies forwarded unchanged to the eval-set test runner. */
+ deps: RunnerDeps;
9
/** Parallelism forwarded to the eval-set test runner; runEval substitutes 4 when omitted. */
+ maxParallel?: number;
10
+ }
11
/** Value resolved by runEval. */
+ export interface EvalRunResult {
12
/** One entry per report case, accumulated across all eval sets in the order they were run. */
+ queryResults: QueryResult[];
13
+ }
14
/**
 * Runs every eval set in `opts.evalSetPaths` against the candidate described by
 * `opts.candidatePath`, reads each run's `report.yaml`, and resolves with the
 * report cases converted to query results.
 */
+ export declare function runEval(opts: EvalRunnerOpts): Promise<EvalRunResult>;
@@ -0,0 +1,47 @@
1
+ // src/trace-ai/exp/eval-runner.ts
2
+ import path from "node:path";
3
+ import yaml from "js-yaml";
4
+ import fs from "node:fs/promises";
5
+ import { run as evalSetRun } from "../eval-set/test-runner.js";
6
+ export async function runEval(opts) {
7
+ const candidateRaw = yaml.load(await fs.readFile(opts.candidatePath, "utf8"));
8
+ const agentId = candidateRaw["agent_id"] ?? "candidate";
9
+ const agentVersion = candidateRaw["candidate_version"];
10
+ const roundEvalBase = path.join(opts.expDir, ".trace-state", "rounds", `round-${opts.round}-eval`);
11
+ // Run eval for each eval-set (sequentially for MVP-C single-path)
12
+ const allResults = [];
13
+ for (const evalSetDir of opts.evalSetPaths) {
14
+ // Each eval-set gets its own subdir so outputs from multiple sets don't overwrite each other
15
+ const outDir = path.join(roundEvalBase, path.basename(evalSetDir));
16
+ await fs.mkdir(outDir, { recursive: true });
17
+ await evalSetRun({
18
+ evalSetDir,
19
+ candidateAgentId: agentId,
20
+ candidateAgentVersion: agentVersion,
21
+ outDir,
22
+ maxParallel: opts.maxParallel ?? 4,
23
+ deps: opts.deps,
24
+ });
25
+ // Read report and convert to QueryResult[]
26
+ const reportPath = path.join(outDir, "report.yaml");
27
+ const report = yaml.load(await fs.readFile(reportPath, "utf8"));
28
+ for (const c of report.cases) {
29
+ allResults.push({
30
+ query_id: c.query_id,
31
+ assertion_results: c.assertion_results.map(ar => ({
32
+ type: ar.assertion.type,
33
+ verdict: ar.verdict,
34
+ reason: typeof ar.actual === "string" ? ar.actual : undefined,
35
+ })),
36
+ trajectory_summary: {
37
+ tool_call_sequence: [], // populated from trace if available
38
+ retry_count: 0,
39
+ latency_ms: c.duration_ms ?? 0,
40
+ error_codes: [],
41
+ },
42
+ raw_trace_id: c.trace_id ?? undefined,
43
+ });
44
+ }
45
+ }
46
+ return { queryResults: allResults };
47
+ }
@@ -0,0 +1,3 @@
1
/** Resolves to true when `<expDir>/.trace-state/abort.signal` is accessible, false otherwise. */
+ export declare function isAborted(expDir: string): Promise<boolean>;
2
/** Writes the current time (ISO 8601) to `<expDir>/.trace-state/abort.signal`. */
+ export declare function writeAbortSignal(expDir: string): Promise<void>;
3
/** Deletes `<expDir>/.trace-state/abort.signal`; a file that is already missing is not an error. */
+ export declare function clearAbortSignal(expDir: string): Promise<void>;
@@ -0,0 +1,27 @@
1
+ // src/trace-ai/exp/exp-store/abort-signal.ts
2
+ import fs from "node:fs/promises";
3
+ import path from "node:path";
4
/**
 * Location of the abort marker file for an experiment directory:
 * `<expDir>/.trace-state/abort.signal`.
 */
function signalPath(expDir) {
    const stateDir = path.join(expDir, ".trace-state");
    return path.join(stateDir, "abort.signal");
}
7
+ export async function isAborted(expDir) {
8
+ try {
9
+ await fs.access(signalPath(expDir));
10
+ return true;
11
+ }
12
+ catch {
13
+ return false;
14
+ }
15
+ }
16
+ export async function writeAbortSignal(expDir) {
17
+ await fs.writeFile(signalPath(expDir), new Date().toISOString(), "utf8");
18
+ }
19
+ export async function clearAbortSignal(expDir) {
20
+ try {
21
+ await fs.unlink(signalPath(expDir));
22
+ }
23
+ catch (err) {
24
+ if (err.code !== "ENOENT")
25
+ throw err;
26
+ }
27
+ }
@@ -0,0 +1,4 @@
1
+ import type { LineageEntry } from "../schemas.js";
2
/**
 * Appends a lineage entry for the experiment at `expDir`.
 * The caller supplies the entry without `appended_at`.
 * NOTE(review): presumably the implementation stamps `appended_at` itself — implementation not visible here, confirm.
 */
+ export declare function appendLineage(expDir: string, entry: Omit<LineageEntry, "appended_at">): Promise<void>;
3
/**
 * Applies a partial update (`patch`) to the lineage of the experiment at `expDir`.
 * NOTE(review): presumably targets the entry whose version equals `version` — implementation not visible here, confirm.
 */
+ export declare function updateLineage(expDir: string, version: number, patch: Partial<LineageEntry>): Promise<void>;
4
/**
 * Resolves to the lineage entries of the experiment at `expDir`.
 * NOTE(review): ordering and on-disk format are not visible here — confirm against the implementation.
 */
+ export declare function readLineage(expDir: string): Promise<LineageEntry[]>;