@kweaver-ai/kweaver-sdk 0.7.4 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190) hide show
  1. package/README.md +39 -5
  2. package/README.zh.md +37 -5
  3. package/dist/agent-providers/index.d.ts +7 -0
  4. package/dist/agent-providers/index.js +5 -0
  5. package/dist/agent-providers/prompt-template.d.ts +62 -0
  6. package/dist/agent-providers/prompt-template.js +105 -0
  7. package/dist/agent-providers/prompts/rubric-judge-v1.prompt.md +51 -0
  8. package/dist/agent-providers/prompts/within-trace-synthesizer-v1.prompt.md +60 -0
  9. package/dist/agent-providers/providers/claude-code-subprocess.d.ts +74 -0
  10. package/dist/agent-providers/providers/claude-code-subprocess.js +259 -0
  11. package/dist/agent-providers/providers/stub.d.ts +47 -0
  12. package/dist/agent-providers/providers/stub.js +77 -0
  13. package/dist/agent-providers/registry.d.ts +45 -0
  14. package/dist/agent-providers/registry.js +77 -0
  15. package/dist/agent-providers/types.d.ts +91 -0
  16. package/dist/agent-providers/types.js +25 -0
  17. package/dist/api/agent-chat.js +8 -6
  18. package/dist/api/agent-observability.d.ts +51 -0
  19. package/dist/api/agent-observability.js +108 -0
  20. package/dist/api/context-loader.d.ts +1 -0
  21. package/dist/api/conversations.d.ts +4 -8
  22. package/dist/api/conversations.js +16 -58
  23. package/dist/api/datasources.d.ts +2 -20
  24. package/dist/api/datasources.js +7 -123
  25. package/dist/api/semantic-search.d.ts +5 -0
  26. package/dist/api/semantic-search.js +5 -0
  27. package/dist/api/skills.d.ts +75 -2
  28. package/dist/api/skills.js +108 -12
  29. package/dist/api/trace.d.ts +49 -0
  30. package/dist/api/trace.js +85 -0
  31. package/dist/api/vega.d.ts +53 -0
  32. package/dist/api/vega.js +144 -0
  33. package/dist/cli.js +12 -5
  34. package/dist/commands/agent/mode.d.ts +6 -0
  35. package/dist/commands/agent/mode.js +75 -0
  36. package/dist/commands/agent.js +101 -29
  37. package/dist/commands/bkn-ops.js +12 -6
  38. package/dist/commands/bkn-utils.d.ts +9 -0
  39. package/dist/commands/bkn-utils.js +17 -0
  40. package/dist/commands/context-loader.js +608 -38
  41. package/dist/commands/ds.js +7 -2
  42. package/dist/commands/skill.d.ts +21 -1
  43. package/dist/commands/skill.js +389 -1
  44. package/dist/commands/trace.d.ts +39 -0
  45. package/dist/commands/trace.js +668 -0
  46. package/dist/index.d.ts +2 -2
  47. package/dist/index.js +1 -1
  48. package/dist/resources/bkn.d.ts +5 -0
  49. package/dist/resources/bkn.js +5 -0
  50. package/dist/resources/datasources.js +2 -1
  51. package/dist/resources/skills.d.ts +17 -1
  52. package/dist/resources/skills.js +32 -1
  53. package/dist/trace-ai/diagnose/agent-binding.d.ts +67 -0
  54. package/dist/trace-ai/diagnose/agent-binding.js +257 -0
  55. package/dist/trace-ai/diagnose/builtin-rules/excessive-tool-calls-per-turn.d.ts +2 -0
  56. package/dist/trace-ai/diagnose/builtin-rules/excessive-tool-calls-per-turn.js +15 -0
  57. package/dist/trace-ai/diagnose/builtin-rules/excessive-tool-calls-per-turn.yaml +16 -0
  58. package/dist/trace-ai/diagnose/builtin-rules/llm-response-truncated-no-continue.d.ts +2 -0
  59. package/dist/trace-ai/diagnose/builtin-rules/llm-response-truncated-no-continue.js +44 -0
  60. package/dist/trace-ai/diagnose/builtin-rules/llm-response-truncated-no-continue.yaml +15 -0
  61. package/dist/trace-ai/diagnose/builtin-rules/register.d.ts +1 -0
  62. package/dist/trace-ai/diagnose/builtin-rules/register.js +11 -0
  63. package/dist/trace-ai/diagnose/builtin-rules/retrieval-empty-no-fallback.d.ts +2 -0
  64. package/dist/trace-ai/diagnose/builtin-rules/retrieval-empty-no-fallback.js +29 -0
  65. package/dist/trace-ai/diagnose/builtin-rules/retrieval-empty-no-fallback.yaml +15 -0
  66. package/dist/trace-ai/diagnose/builtin-rules/tool-error-swallowed.d.ts +2 -0
  67. package/dist/trace-ai/diagnose/builtin-rules/tool-error-swallowed.js +45 -0
  68. package/dist/trace-ai/diagnose/builtin-rules/tool-error-swallowed.yaml +15 -0
  69. package/dist/trace-ai/diagnose/builtin-rules/tool-loop-no-state-change.d.ts +2 -0
  70. package/dist/trace-ai/diagnose/builtin-rules/tool-loop-no-state-change.js +38 -0
  71. package/dist/trace-ai/diagnose/builtin-rules/tool-loop-no-state-change.yaml +16 -0
  72. package/dist/trace-ai/diagnose/builtin-rules/tool-retry-intent-mismatch.yaml +68 -0
  73. package/dist/trace-ai/diagnose/index.d.ts +32 -0
  74. package/dist/trace-ai/diagnose/index.js +246 -0
  75. package/dist/trace-ai/diagnose/output-schema-converter.d.ts +24 -0
  76. package/dist/trace-ai/diagnose/output-schema-converter.js +81 -0
  77. package/dist/trace-ai/diagnose/predicate-registry.d.ts +7 -0
  78. package/dist/trace-ai/diagnose/predicate-registry.js +30 -0
  79. package/dist/trace-ai/diagnose/query-extractor.d.ts +14 -0
  80. package/dist/trace-ai/diagnose/query-extractor.js +45 -0
  81. package/dist/trace-ai/diagnose/report-assembler.d.ts +31 -0
  82. package/dist/trace-ai/diagnose/report-assembler.js +100 -0
  83. package/dist/trace-ai/diagnose/report-markdown.d.ts +18 -0
  84. package/dist/trace-ai/diagnose/report-markdown.js +192 -0
  85. package/dist/trace-ai/diagnose/rule-loader.d.ts +11 -0
  86. package/dist/trace-ai/diagnose/rule-loader.js +120 -0
  87. package/dist/trace-ai/diagnose/schemas.d.ts +184 -0
  88. package/dist/trace-ai/diagnose/schemas.js +154 -0
  89. package/dist/trace-ai/diagnose/signal-probe.d.ts +17 -0
  90. package/dist/trace-ai/diagnose/signal-probe.js +39 -0
  91. package/dist/trace-ai/diagnose/synthesizer-agent.d.ts +40 -0
  92. package/dist/trace-ai/diagnose/synthesizer-agent.js +158 -0
  93. package/dist/trace-ai/diagnose/synthesizer-template.d.ts +2 -0
  94. package/dist/trace-ai/diagnose/synthesizer-template.js +49 -0
  95. package/dist/trace-ai/diagnose/trace-shaper.d.ts +3 -0
  96. package/dist/trace-ai/diagnose/trace-shaper.js +73 -0
  97. package/dist/trace-ai/diagnose/types.d.ts +173 -0
  98. package/dist/trace-ai/diagnose/types.js +1 -0
  99. package/dist/trace-ai/eval-set/assertion-evaluator.d.ts +29 -0
  100. package/dist/trace-ai/eval-set/assertion-evaluator.js +100 -0
  101. package/dist/trace-ai/eval-set/builder.d.ts +36 -0
  102. package/dist/trace-ai/eval-set/builder.js +126 -0
  103. package/dist/trace-ai/eval-set/index.d.ts +15 -0
  104. package/dist/trace-ai/eval-set/index.js +10 -0
  105. package/dist/trace-ai/eval-set/output-writer.d.ts +27 -0
  106. package/dist/trace-ai/eval-set/output-writer.js +126 -0
  107. package/dist/trace-ai/eval-set/query-picker.d.ts +37 -0
  108. package/dist/trace-ai/eval-set/query-picker.js +147 -0
  109. package/dist/trace-ai/eval-set/redactor.d.ts +42 -0
  110. package/dist/trace-ai/eval-set/redactor.js +133 -0
  111. package/dist/trace-ai/eval-set/rubric-templates/answer-match-reference.prompt.md +19 -0
  112. package/dist/trace-ai/eval-set/schemas.d.ts +136 -0
  113. package/dist/trace-ai/eval-set/schemas.js +130 -0
  114. package/dist/trace-ai/eval-set/semantic-match-provider.d.ts +33 -0
  115. package/dist/trace-ai/eval-set/semantic-match-provider.js +51 -0
  116. package/dist/trace-ai/eval-set/test-runner.d.ts +34 -0
  117. package/dist/trace-ai/eval-set/test-runner.js +153 -0
  118. package/dist/trace-ai/eval-set/types.d.ts +46 -0
  119. package/dist/trace-ai/eval-set/types.js +8 -0
  120. package/dist/trace-ai/exp/bundle-writer.d.ts +10 -0
  121. package/dist/trace-ai/exp/bundle-writer.js +54 -0
  122. package/dist/trace-ai/exp/claude-binary.d.ts +5 -0
  123. package/dist/trace-ai/exp/claude-binary.js +30 -0
  124. package/dist/trace-ai/exp/coordinator.d.ts +45 -0
  125. package/dist/trace-ai/exp/coordinator.js +203 -0
  126. package/dist/trace-ai/exp/eval-runner.d.ts +14 -0
  127. package/dist/trace-ai/exp/eval-runner.js +47 -0
  128. package/dist/trace-ai/exp/exp-store/abort-signal.d.ts +3 -0
  129. package/dist/trace-ai/exp/exp-store/abort-signal.js +27 -0
  130. package/dist/trace-ai/exp/exp-store/candidate-lineage-yaml.d.ts +4 -0
  131. package/dist/trace-ai/exp/exp-store/candidate-lineage-yaml.js +37 -0
  132. package/dist/trace-ai/exp/exp-store/events-jsonl.d.ts +17 -0
  133. package/dist/trace-ai/exp/exp-store/events-jsonl.js +60 -0
  134. package/dist/trace-ai/exp/exp-store/exp-registry.d.ts +6 -0
  135. package/dist/trace-ai/exp/exp-store/exp-registry.js +41 -0
  136. package/dist/trace-ai/exp/exp-store/index.d.ts +46 -0
  137. package/dist/trace-ai/exp/exp-store/index.js +59 -0
  138. package/dist/trace-ai/exp/exp-store/lock.d.ts +3 -0
  139. package/dist/trace-ai/exp/exp-store/lock.js +73 -0
  140. package/dist/trace-ai/exp/exp-store/mission-md.d.ts +3 -0
  141. package/dist/trace-ai/exp/exp-store/mission-md.js +37 -0
  142. package/dist/trace-ai/exp/exp-store/readme-template.d.ts +5 -0
  143. package/dist/trace-ai/exp/exp-store/readme-template.js +25 -0
  144. package/dist/trace-ai/exp/exp-store/round-yaml.d.ts +3 -0
  145. package/dist/trace-ai/exp/exp-store/round-yaml.js +33 -0
  146. package/dist/trace-ai/exp/index.d.ts +8 -0
  147. package/dist/trace-ai/exp/index.js +238 -0
  148. package/dist/trace-ai/exp/info.d.ts +35 -0
  149. package/dist/trace-ai/exp/info.js +120 -0
  150. package/dist/trace-ai/exp/patch/agent-config.d.ts +1 -0
  151. package/dist/trace-ai/exp/patch/agent-config.js +26 -0
  152. package/dist/trace-ai/exp/patch/index.d.ts +2 -0
  153. package/dist/trace-ai/exp/patch/index.js +13 -0
  154. package/dist/trace-ai/exp/patch/skill.d.ts +1 -0
  155. package/dist/trace-ai/exp/patch/skill.js +24 -0
  156. package/dist/trace-ai/exp/providers/synthesizer-client.d.ts +14 -0
  157. package/dist/trace-ai/exp/providers/synthesizer-client.js +39 -0
  158. package/dist/trace-ai/exp/providers/triage-client.d.ts +19 -0
  159. package/dist/trace-ai/exp/providers/triage-client.js +51 -0
  160. package/dist/trace-ai/exp/schemas.d.ts +147 -0
  161. package/dist/trace-ai/exp/schemas.js +50 -0
  162. package/dist/trace-ai/exp/scoring.d.ts +2 -0
  163. package/dist/trace-ai/exp/scoring.js +46 -0
  164. package/dist/trace-ai/scan/aggregator.d.ts +20 -0
  165. package/dist/trace-ai/scan/aggregator.js +26 -0
  166. package/dist/trace-ai/scan/artifacts/paths.d.ts +12 -0
  167. package/dist/trace-ai/scan/artifacts/paths.js +18 -0
  168. package/dist/trace-ai/scan/artifacts/writer.d.ts +67 -0
  169. package/dist/trace-ai/scan/artifacts/writer.js +96 -0
  170. package/dist/trace-ai/scan/batched-rubric.d.ts +55 -0
  171. package/dist/trace-ai/scan/batched-rubric.js +159 -0
  172. package/dist/trace-ai/scan/cross-trace-synthesizer.d.ts +24 -0
  173. package/dist/trace-ai/scan/cross-trace-synthesizer.js +93 -0
  174. package/dist/trace-ai/scan/index.d.ts +31 -0
  175. package/dist/trace-ai/scan/index.js +390 -0
  176. package/dist/trace-ai/scan/prompts/builtin/cross-trace-synthesizer-v1.prompt.md +44 -0
  177. package/dist/trace-ai/scan/prompts/builtin/rubric-judge-batch-v1.prompt.md +44 -0
  178. package/dist/trace-ai/scan/runner.d.ts +25 -0
  179. package/dist/trace-ai/scan/runner.js +42 -0
  180. package/dist/trace-ai/scan/sampler.d.ts +18 -0
  181. package/dist/trace-ai/scan/sampler.js +81 -0
  182. package/dist/trace-ai/scan/scan-summary-markdown.d.ts +2 -0
  183. package/dist/trace-ai/scan/scan-summary-markdown.js +71 -0
  184. package/dist/trace-ai/scan/scan-summary-schema.d.ts +73 -0
  185. package/dist/trace-ai/scan/scan-summary-schema.js +61 -0
  186. package/dist/trace-ai/scan/single-agent-validator.d.ts +23 -0
  187. package/dist/trace-ai/scan/single-agent-validator.js +42 -0
  188. package/dist/trace-ai/scan/traces-list-parser.d.ts +15 -0
  189. package/dist/trace-ai/scan/traces-list-parser.js +46 -0
  190. package/package.json +14 -4
@@ -0,0 +1,51 @@
1
+ // src/trace-ai/exp/providers/triage-client.ts
2
+ import { z } from "zod";
3
+ import { defaultRegistry } from "../../../agent-providers/registry.js";
4
// Fields the triage agent must return: root-cause notes, next-step hints,
// a continue/publish verdict, and a short memory string for the next round.
const triageOutputShape = {
    diagnoses: z.array(z.string()),
    hints: z.array(z.string()),
    verdict: z.enum(["continue", "publish"]),
    new_memory_token: z.string(),
};
const TriageOutputSchema = z.object(triageOutputShape);
10
/**
 * Triage client backed by the `claude-code` agent provider.
 */
export class ClaudeCodeTriageClient {
    /**
     * Build a triage prompt from the current round and prior rounds, send it
     * to the `claude-code` provider, and return the provider's parsed output.
     *
     * @param input Object carrying `currentRound`, `prevRounds`, and an
     *   optional `crossRoundMemoryRef` string from an earlier triage.
     * @returns The `output` field of the provider response, requested with
     *   `TriageOutputSchema` as the output schema.
     * @throws Error when the registry cannot resolve a `claude-code` provider.
     */
    async triage(input) {
        const provider = defaultRegistry.resolve({ preferred: "claude-code" });
        if (!provider) {
            throw new Error("claude-code provider not available");
        }
        const round = input.currentRound;
        const twoDecimals = (value) => value.toFixed(2);
        let scoresSummary = "no scores";
        if (round.scores) {
            const { outcome, trajectory, guardrail } = round.scores;
            scoresSummary = `outcome=${twoDecimals(outcome)}, trajectory=${twoDecimals(trajectory)}, guardrail=${twoDecimals(guardrail)}`;
        }
        const queryResults = round.per_query_results ?? [];
        // One line per query that has at least one failed assertion.
        const failedLines = [];
        for (const query of queryResults) {
            const failedTypes = query.assertion_results
                .filter(assertion => assertion.verdict === "fail")
                .map(assertion => assertion.type);
            if (failedTypes.length > 0) {
                failedLines.push(`${query.query_id}: ${failedTypes.join(", ")}`);
            }
        }
        const failedQueries = failedLines.join("\n");
        // Only queries with more than one retry are reported as trajectory issues.
        const trajectoryIssues = queryResults
            .filter(query => query.trajectory_summary.retry_count > 1)
            .map(query => `${query.query_id}: ${query.trajectory_summary.retry_count} retries`)
            .join("\n");
        const history = input.prevRounds
            .map(prev => `Round ${prev.round}: outcome=${prev.scores?.outcome.toFixed(2) ?? "?"}, verdict=${prev.triage_conclusion?.verdict ?? "?"}`)
            .join("\n");
        const memoryContext = input.crossRoundMemoryRef
            ? `CONTEXT FROM PREVIOUS TRIAGE: ${input.crossRoundMemoryRef}`
            : "";
        // candidateConfig is available for future prompt enrichment; omitted here to keep the prompt focused on scores.
        const prompt = `You are an agent evaluation triager. Analyze the current round results and recommend next steps.

ROUND ${round.round} SCORES: ${scoresSummary}

FAILED QUERIES:
${failedQueries || "None"}

TRAJECTORY ISSUES:
${trajectoryIssues || "None"}

PREVIOUS ROUND HISTORY:
${history || "None"}

${memoryContext}

Respond with JSON:
- "diagnoses": list of root cause observations
- "hints": list of specific suggestions for next change
- "verdict": "continue" if more rounds needed, "publish" if this candidate is good enough
- "new_memory_token": brief summary of key findings to carry forward (1-2 sentences)`;
        const request = {
            prompt,
            outputSchema: TriageOutputSchema,
            correlationId: `triage-${Date.now()}`,
        };
        const { output } = await provider.invoke(request);
        return output;
    }
}
@@ -0,0 +1,147 @@
1
+ import { z } from "zod";
2
/** Zod object schema for a proposed change: string fields `target`, `hypothesis`, and `patch`. */
+ export declare const NextChangeSchema: z.ZodObject<{
3
+ target: z.ZodString;
4
+ hypothesis: z.ZodString;
5
+ patch: z.ZodString;
6
+ }, z.core.$strip>;
7
/** Zod object schema for one guardrail: a `name`, a `kind` of "hard" or "soft", and a `rule` string. */
+ declare const GuardrailSchema: z.ZodObject<{
8
+ name: z.ZodString;
9
+ kind: z.ZodEnum<{
10
+ hard: "hard";
11
+ soft: "soft";
12
+ }>;
13
+ rule: z.ZodString;
14
+ }, z.core.$strip>;
15
/**
 * Zod object schema for a mission document tagged `trace-mission/v1`.
 * Required: `goal`, `eval_sets` (each with a `path` and a seed/regression/holdout
 * `role`), and `current_candidate.path`. Optional: `max_rounds`, `provider`,
 * `next_change`, and `guardrails`.
 */
+ export declare const MissionSchema: z.ZodObject<{
16
+ schema_version: z.ZodLiteral<"trace-mission/v1">;
17
+ goal: z.ZodString;
18
+ max_rounds: z.ZodOptional<z.ZodNumber>;
19
+ provider: z.ZodOptional<z.ZodString>;
20
+ eval_sets: z.ZodArray<z.ZodObject<{
21
+ path: z.ZodString;
22
+ role: z.ZodEnum<{
23
+ seed: "seed";
24
+ regression: "regression";
25
+ holdout: "holdout";
26
+ }>;
27
+ }, z.core.$strip>>;
28
+ current_candidate: z.ZodObject<{
29
+ path: z.ZodString;
30
+ }, z.core.$strip>;
31
+ next_change: z.ZodOptional<z.ZodObject<{
32
+ target: z.ZodString;
33
+ hypothesis: z.ZodString;
34
+ patch: z.ZodString;
35
+ }, z.core.$strip>>;
36
+ guardrails: z.ZodOptional<z.ZodArray<z.ZodObject<{
37
+ name: z.ZodString;
38
+ kind: z.ZodEnum<{
39
+ hard: "hard";
40
+ soft: "soft";
41
+ }>;
42
+ rule: z.ZodString;
43
+ }, z.core.$strip>>>;
44
+ }, z.core.$strip>;
45
/** Static type inferred from `MissionSchema`. */
+ export type Mission = z.infer<typeof MissionSchema>;
46
/** Static type inferred from `NextChangeSchema`. */
+ export type NextChange = z.infer<typeof NextChangeSchema>;
47
// GuardrailSchema is declared without `export` above and re-exported here.
+ export { GuardrailSchema };
48
/** Static type inferred from `GuardrailSchema`. */
+ export type Guardrail = z.infer<typeof GuardrailSchema>;
49
/**
 * Zod object schema for a bundle document tagged `trace-bundle/v1`: experiment
 * and bundle ids, the best trial version, the bundled `resources`
 * (`agent_config` record and `skills` records), and `provenance` details.
 */
+ export declare const BundleSchema: z.ZodObject<{
50
+ schema_version: z.ZodLiteral<"trace-bundle/v1">;
51
+ experiment_id: z.ZodString;
52
+ bundle_id: z.ZodString;
53
+ best_trial_version: z.ZodNumber;
54
+ resources: z.ZodObject<{
55
+ agent_config: z.ZodRecord<z.ZodString, z.ZodUnknown>;
56
+ skills: z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
57
+ }, z.core.$strip>;
58
+ provenance: z.ZodObject<{
59
+ created_by: z.ZodString;
60
+ created_at: z.ZodString;
61
+ evidence_traces: z.ZodArray<z.ZodString>;
62
+ round_refs: z.ZodArray<z.ZodString>;
63
+ }, z.core.$strip>;
64
+ }, z.core.$strip>;
65
/** Static type inferred from `BundleSchema`. */
+ export type Bundle = z.infer<typeof BundleSchema>;
66
/**
 * Zod object schema for a manifest document tagged `trace-manifest/v1`: the
 * experiment id, a trial version, and `predictions` split into `fixes` and
 * `risks`, each a list of `{ query_id, reason }` entries.
 */
+ export declare const ManifestSchema: z.ZodObject<{
67
+ schema_version: z.ZodLiteral<"trace-manifest/v1">;
68
+ experiment_id: z.ZodString;
69
+ trial_version: z.ZodNumber;
70
+ predictions: z.ZodObject<{
71
+ fixes: z.ZodArray<z.ZodObject<{
72
+ query_id: z.ZodString;
73
+ reason: z.ZodString;
74
+ }, z.core.$strip>>;
75
+ risks: z.ZodArray<z.ZodObject<{
76
+ query_id: z.ZodString;
77
+ reason: z.ZodString;
78
+ }, z.core.$strip>>;
79
+ }, z.core.$strip>;
80
+ }, z.core.$strip>;
81
/** Static type inferred from `ManifestSchema`. */
+ export type Manifest = z.infer<typeof ManifestSchema>;
82
/** Names of the experiment state-machine states; used by `ExpEvent` transitions and failures. */
+ export type ExpFsmState = "Init" | "Generating" | "Executing" | "Scoring" | "Triaging" | "Deciding" | "Publishing" | "Published" | "Aborted";
83
/**
 * Experiment event, discriminated by `type`. Every variant carries a `ts` string.
 * - "state_transition": moved `from` one state `to` another during `round`.
 * - "round_completed": `round` finished with a continue/publish `verdict`.
 * - "step_failed": the step in `state` failed with `error`; `retryable` flags whether it can be retried.
 * - "aborted": the experiment was aborted in `round` for `reason`.
 * NOTE(review): `ts` is presumably an ISO timestamp — the format is not visible here; confirm against the writer.
 */
+ export type ExpEvent = {
84
+ ts: string;
85
+ type: "state_transition";
86
+ from: ExpFsmState;
87
+ to: ExpFsmState;
88
+ round: number;
89
+ } | {
90
+ ts: string;
91
+ type: "round_completed";
92
+ round: number;
93
+ verdict: "continue" | "publish";
94
+ } | {
95
+ ts: string;
96
+ type: "step_failed";
97
+ state: ExpFsmState;
98
+ error: string;
99
+ retryable: boolean;
100
+ } | {
101
+ ts: string;
102
+ type: "aborted";
103
+ round: number;
104
+ reason: string;
105
+ };
106
/**
 * One candidate-lineage record: the candidate `version`, where it lives
 * (`candidate_path`), the `next_change` associated with it, its current
 * `status`, and when the entry was appended.
 */
+ export interface LineageEntry {
107
+ version: number;
108
+ candidate_path: string;
109
+ next_change: NextChange;
110
+ status: "running" | "scored" | "guardrail_failed";
111
+ appended_at: string;
112
+ }
113
/**
 * Scores returned by `computeScores`: one number each for the outcome,
 * trajectory, and guardrail axes, plus a flag set when a hard guardrail failed.
 */
+ export interface ThreeAxisScores {
114
+ outcome: number;
115
+ trajectory: number;
116
+ guardrail: number;
117
+ guardrail_hard_fail: boolean;
118
+ }
119
/**
 * Result of evaluating one query: the verdict of each assertion and a summary
 * of the run's trajectory (tool calls, retries, latency, error codes).
 * `raw_trace_id` is optional.
 */
+ export interface QueryResult {
120
+ query_id: string;
121
+ assertion_results: Array<{
122
+ type: string;
123
+ verdict: "pass" | "fail" | "skip";
124
+ reason?: string;
125
+ }>;
126
+ trajectory_summary: {
127
+ tool_call_sequence: string[];
128
+ retry_count: number;
129
+ latency_ms: number;
130
+ error_codes: string[];
131
+ };
132
+ raw_trace_id?: string;
133
+ }
134
/**
 * Data recorded for one experiment round. Only `round` and `trial_version`
 * are required; scores, per-query results, and the triage conclusion are
 * optional.
 */
+ export interface RoundData {
135
+ round: number;
136
+ trial_version: number;
137
+ scores?: ThreeAxisScores;
138
+ per_query_results?: QueryResult[];
139
+ trajectory_summaries?: QueryResult["trajectory_summary"][];
140
+ guardrail_failed?: boolean;
141
+ triage_conclusion?: {
142
+ diagnoses: string[];
143
+ hints: string[];
144
+ verdict: "continue" | "publish";
145
+ cross_round_memory_ref?: string;
146
+ };
147
+ }
@@ -0,0 +1,50 @@
1
+ import { z } from "zod";
2
// A proposed change: `target` and `hypothesis` must be non-empty; `patch` may be empty.
const nextChangeShape = {
    target: z.string().min(1),
    hypothesis: z.string().min(1),
    patch: z.string(),
};
export const NextChangeSchema = z.object(nextChangeShape);
7
// A guardrail is a named rule that is either a "hard" or a "soft" constraint.
const guardrailShape = {
    name: z.string(),
    kind: z.enum(["hard", "soft"]),
    rule: z.string(),
};
const GuardrailSchema = z.object(guardrailShape);
12
// One evaluation-set reference: a non-empty path plus the role it plays.
const evalSetEntrySchema = z.object({
    path: z.string().min(1),
    role: z.enum(["seed", "regression", "holdout"]),
});
/**
 * Mission document (`trace-mission/v1`): a non-empty goal, at least one
 * evaluation set, and the current candidate path. `max_rounds` (positive
 * integer), `provider`, `next_change`, and `guardrails` are optional.
 */
export const MissionSchema = z.object({
    schema_version: z.literal("trace-mission/v1"),
    goal: z.string().min(1),
    max_rounds: z.number().int().positive().optional(),
    provider: z.string().optional(),
    eval_sets: z.array(evalSetEntrySchema).min(1),
    current_candidate: z.object({ path: z.string() }),
    next_change: NextChangeSchema.optional(),
    guardrails: z.array(GuardrailSchema).optional(),
});
25
+ export { GuardrailSchema };
26
// Resources captured in a bundle: the agent config and a list of skill records.
const bundleResourcesSchema = z.object({
    agent_config: z.record(z.string(), z.unknown()),
    skills: z.array(z.record(z.string(), z.unknown())),
});
// Who created the bundle, when, and which traces and rounds back it.
const bundleProvenanceSchema = z.object({
    created_by: z.string(),
    created_at: z.string(),
    evidence_traces: z.array(z.string()),
    round_refs: z.array(z.string()),
});
/**
 * Bundle document (`trace-bundle/v1`): non-empty experiment and bundle ids,
 * a non-negative integer `best_trial_version`, plus resources and provenance.
 */
export const BundleSchema = z.object({
    schema_version: z.literal("trace-bundle/v1"),
    experiment_id: z.string().min(1),
    bundle_id: z.string().min(1),
    best_trial_version: z.number().int().nonnegative(),
    resources: bundleResourcesSchema,
    provenance: bundleProvenanceSchema,
});
42
// Builds a fresh list schema of `{ query_id, reason }` entries; used for both fixes and risks.
const predictionList = () => z.array(z.object({ query_id: z.string(), reason: z.string() }));
/**
 * Manifest document (`trace-manifest/v1`): a non-empty experiment id, a
 * non-negative integer `trial_version`, and predicted fixes and risks.
 */
export const ManifestSchema = z.object({
    schema_version: z.literal("trace-manifest/v1"),
    experiment_id: z.string().min(1),
    trial_version: z.number().int().nonnegative(),
    predictions: z.object({
        fixes: predictionList(),
        risks: predictionList(),
    }),
});
@@ -0,0 +1,2 @@
1
+ import type { QueryResult, ThreeAxisScores, Guardrail } from "./schemas.js";
2
/**
 * Compute outcome, trajectory, and guardrail scores for a round.
 * @param results Per-query results to score.
 * @param guardrails Guardrails to check against the results.
 * @returns The three axis scores plus the hard-guardrail failure flag.
 */
+ export declare function computeScores(results: QueryResult[], guardrails: Guardrail[]): ThreeAxisScores;
@@ -0,0 +1,46 @@
1
+ export function computeScores(results, guardrails) {
2
+ if (results.length === 0) {
3
+ return { outcome: 0, trajectory: 0, guardrail: 1, guardrail_hard_fail: false };
4
+ }
5
+ // Outcome: fraction of assertions that passed
6
+ let totalAssertions = 0;
7
+ let passedAssertions = 0;
8
+ for (const r of results) {
9
+ for (const a of r.assertion_results) {
10
+ if (a.verdict === "skip")
11
+ continue;
12
+ totalAssertions++;
13
+ if (a.verdict === "pass")
14
+ passedAssertions++;
15
+ }
16
+ }
17
+ const outcome = totalAssertions === 0 ? 1 : passedAssertions / totalAssertions;
18
+ const RETRY_PENALTY_PER_RETRY = 0.15;
19
+ const MAX_RETRY_PENALTY = 0.6;
20
+ const ERROR_CODE_PENALTY = 0.3;
21
+ // Trajectory: penalize retries and errors
22
+ let trajectorySum = 0;
23
+ for (const r of results) {
24
+ const { retry_count, error_codes } = r.trajectory_summary;
25
+ const retryPenalty = Math.min(retry_count * RETRY_PENALTY_PER_RETRY, MAX_RETRY_PENALTY);
26
+ const errorPenalty = error_codes.length > 0 ? ERROR_CODE_PENALTY : 0;
27
+ trajectorySum += Math.max(0, 1 - retryPenalty - errorPenalty);
28
+ }
29
+ const trajectory = trajectorySum / results.length;
30
+ // MVP-C stub: hard guardrails fire when any result has error_codes present,
31
+ // regardless of the specific rule text. Soft guardrails do not affect the guardrail score yet.
32
+ // Guardrail: check hard gates (any error_codes in results triggers hard gate if guardrail with kind="hard")
33
+ let guardrail_hard_fail = false;
34
+ let guardrail = 1;
35
+ for (const g of guardrails) {
36
+ if (g.kind === "hard") {
37
+ const violated = results.some(r => r.trajectory_summary.error_codes.length > 0);
38
+ if (violated) {
39
+ guardrail_hard_fail = true;
40
+ guardrail = 0;
41
+ break;
42
+ }
43
+ }
44
+ }
45
+ return { outcome, trajectory, guardrail, guardrail_hard_fail };
46
+ }
@@ -0,0 +1,20 @@
1
+ import type { Report } from "../diagnose/types.js";
2
/**
 * How often one rule fired across all reports: the total `count` of findings
 * for `rule_id`, broken down by finding severity.
 */
+ export interface RuleFrequencyItem {
3
+ rule_id: string;
4
+ count: number;
5
+ severity_breakdown: {
6
+ high: number;
7
+ medium: number;
8
+ low: number;
9
+ };
10
+ }
11
/** Result of `aggregate`: the per-rule frequency table. */
+ export interface AggregatesBlock {
12
+ rule_frequency: RuleFrequencyItem[];
13
+ }
14
+ /**
15
+ * Deterministic aggregation over a list of per-trace reports.
16
+ * - rule_frequency: counts each rule_id across all findings; severity_breakdown
17
+ * gives high/medium/low counts. Sorted by count descending, then rule_id
18
+ * ascending for stable ordering.
 *
 * @param reports Per-trace reports whose `findings` are tallied.
 * @returns An `AggregatesBlock` holding the sorted `rule_frequency` list.
19
+ */
20
+ export declare function aggregate(reports: Report[]): AggregatesBlock;
@@ -0,0 +1,26 @@
1
/**
 * Tally findings across per-trace reports into a per-rule frequency table.
 *
 * Each finding adds one to its rule's `count` and one to the matching
 * `severity_breakdown` bucket (high / medium / low). The table is ordered by
 * `count` descending, with ties broken by `rule_id` ascending.
 *
 * @param reports Reports, each exposing a `findings` list of `{ ruleId, severity }`.
 * @returns `{ rule_frequency }` with one entry per distinct rule id.
 */
export function aggregate(reports) {
    const tally = new Map();
    for (const report of reports) {
        for (const finding of report.findings) {
            const ruleId = finding.ruleId;
            if (!tally.has(ruleId)) {
                tally.set(ruleId, {
                    rule_id: ruleId,
                    count: 0,
                    severity_breakdown: { high: 0, medium: 0, low: 0 },
                });
            }
            const entry = tally.get(ruleId);
            entry.count += 1;
            entry.severity_breakdown[finding.severity] += 1;
        }
    }
    const rule_frequency = Array.from(tally.values());
    // Most frequent rules first; ties fall back to rule id order.
    rule_frequency.sort((left, right) => {
        const byCount = right.count - left.count;
        return byCount !== 0 ? byCount : left.rule_id.localeCompare(right.rule_id);
    });
    return { rule_frequency };
}
@@ -0,0 +1,12 @@
1
/** Input to `resolveArtifactsBase`: the run mode and the caller's `--out` value. */
+ export interface ResolveArtifactsBaseInput {
2
+ /** 'batch' → `<out>/artifacts/`; 'single' → `<stem>.artifacts/` next to the report. */
3
+ mode: "batch" | "single";
4
+ /** Batch: directory path (`--out=<dir>`). Single: file path (`--out=<file.yaml>`). */
5
+ out: string;
6
+ }
7
+ /**
8
+ * Resolve the artifacts base directory given the caller's `--out` value and
9
+ * mode. Strips known extensions in single-trace mode so `.yaml`, `.yml`, and
10
+ * `.md` all yield the same artifacts dir name.
 *
 * @param input The mode and `--out` value.
 * @returns The artifacts directory path; nothing is created on disk.
11
+ */
12
+ export declare function resolveArtifactsBase(input: ResolveArtifactsBaseInput): string;
@@ -0,0 +1,18 @@
1
+ import path from "node:path";
2
/**
 * Resolve the artifacts base directory given the caller's `--out` value and
 * mode.
 *
 * - batch: `<out>/artifacts`. Trailing separators on `out` are normalized by
 *   `path.join`, so a filesystem-root `out` stays absolute instead of
 *   collapsing to the relative path `artifacts`.
 * - single: `<dirname>/<stem>.artifacts`, where the stem is the file name
 *   with a trailing `.yaml`, `.yml`, or `.md` (case-insensitive) removed, so
 *   all three extensions yield the same artifacts dir name.
 *
 * @param input `{ mode, out }` — see `ResolveArtifactsBaseInput`.
 * @returns The artifacts directory path; nothing is created on disk.
 */
export function resolveArtifactsBase(input) {
    if (input.mode === "batch") {
        // path.join collapses redundant trailing separators itself. Stripping
        // them by hand would turn "/" into "" and make the result relative.
        return path.join(input.out, "artifacts");
    }
    // single-trace: <dirname>/<stem>.artifacts/
    const dir = path.dirname(input.out);
    const stem = path.basename(input.out).replace(/\.(yaml|yml|md)$/i, "");
    return path.join(dir, `${stem}.artifacts`);
}
@@ -0,0 +1,67 @@
1
/**
 * Metadata describing one run; `ArtifactWriter.writeRunMetadata` serializes it
 * to `run-metadata.json`. Groups the CLI arguments, the rule-load summary, the
 * single-agent validation result, per-stage timing, LLM call counts, and a
 * cost estimate.
 * NOTE(review): the `*_ms` fields are presumably milliseconds and the cost
 * fields US dollars, as their names suggest — confirm against the producer.
 */
+ export interface RunMetadata {
2
+ cli_args: Record<string, unknown>;
3
+ agent_id: string;
4
+ rule_load_summary: {
5
+ rules_applied: string[];
6
+ rules_skipped_at_load: string[];
7
+ rules_dir: string;
8
+ };
9
+ single_agent_validation: {
10
+ checked_conv_ids: number;
11
+ agent_id_resolved: string;
12
+ };
13
+ timing: {
14
+ stage_1_ms: number;
15
+ stage_2_ms: number;
16
+ stage_3_ms: number;
17
+ stage_4_ms: number;
18
+ total_ms: number;
19
+ };
20
+ llm_calls: {
21
+ stage_2_chunks: number;
22
+ stage_3: number;
23
+ stage_4: number;
24
+ total: number;
25
+ };
26
+ cost_estimate_usd: {
27
+ stage_2: number;
28
+ stage_4: number;
29
+ total: number;
30
+ model_price_table_version: string;
31
+ };
32
+ }
33
/** Constructor options for `ArtifactWriter`. */
+ export interface ArtifactWriterOpts {
34
+ /** Base directory; everything else is relative to this. */
35
+ base: string;
36
+ /** When false, all write methods are no-ops. */
37
+ enabled: boolean;
38
+ }
39
+ /**
40
+ * Persists each Stage's LLM I/O to disk so users can trace why a diagnosis
41
+ * came out the way it did. Used by both single-trace (PR-B `diagnose()`) and
42
+ * batch (`runBatch()`); only the directory base differs.
43
+ *
44
+ * Layout (under `base`):
45
+ * run-metadata.json
46
+ * stage-2-rubric/<rule_id>/{work-queue.json, chunk-NNN.{prompt.md, response.json, parse-errors.json}}
47
+ * stage-3-synth/{prompt.md, response.json} ← single-trace only
48
+ * stage-4-cross-trace-synth/{aggregates.json, samples.json, prompt.md, response.json, parse-errors.json} ← batch only
 *
 * Every write method returns without touching disk when the writer is
 * disabled. The two parse-error writers also skip empty error lists.
49
+ */
50
+ export declare class ArtifactWriter {
51
+ private base;
52
+ private enabled;
53
+ constructor(opts: ArtifactWriterOpts);
54
// Creates `<base>/<rel>` recursively and returns the joined path.
+ private ensureDir;
55
// Formats a chunk index as `chunk-NNN`, zero-padded to three digits.
+ private chunkSlug;
56
// Writes the conversation ids as JSON to `stage-2-rubric/<ruleId>/work-queue.json`.
+ writeStageTwoWorkQueue(ruleId: string, convIds: string[]): Promise<void>;
57
// Writes the prompt text to `stage-2-rubric/<ruleId>/chunk-NNN.prompt.md`.
+ writeStageTwoPrompt(ruleId: string, chunkIdx: number, prompt: string): Promise<void>;
58
// Writes the response as JSON to `stage-2-rubric/<ruleId>/chunk-NNN.response.json`.
+ writeStageTwoResponse(ruleId: string, chunkIdx: number, response: unknown): Promise<void>;
59
// Writes the errors as JSON to `stage-2-rubric/<ruleId>/chunk-NNN.parse-errors.json`; skipped when `errors` is empty.
+ writeStageTwoParseErrors(ruleId: string, chunkIdx: number, errors: unknown[]): Promise<void>;
60
// Writes the prompt text to `stage-3-synth/prompt.md`.
+ writeStageThreeSynthPrompt(prompt: string): Promise<void>;
61
// Writes the response as JSON to `stage-3-synth/response.json`.
+ writeStageThreeSynthResponse(response: unknown): Promise<void>;
62
// Writes `aggregates.json` and then `samples.json` under `stage-4-cross-trace-synth/`.
+ writeStageFourInputs(aggregates: unknown, samples: unknown): Promise<void>;
63
// Writes the prompt text to `stage-4-cross-trace-synth/prompt.md`.
+ writeStageFourPrompt(prompt: string): Promise<void>;
64
// Writes the response as JSON to `stage-4-cross-trace-synth/response.json`.
+ writeStageFourResponse(response: unknown): Promise<void>;
65
// Writes the errors as JSON to `stage-4-cross-trace-synth/parse-errors.json`; skipped when `errors` is empty.
+ writeStageFourParseErrors(errors: unknown[]): Promise<void>;
66
// Creates `base` if needed and writes the metadata as JSON to `run-metadata.json`.
+ writeRunMetadata(meta: RunMetadata): Promise<void>;
67
+ }
@@ -0,0 +1,96 @@
1
+ import fs from "node:fs/promises";
2
+ import path from "node:path";
3
// Directory names (relative to the writer's base) for each stage's artifacts.
const STAGE_TWO_DIR = "stage-2-rubric";
const STAGE_THREE_DIR = "stage-3-synth";
const STAGE_FOUR_DIR = "stage-4-cross-trace-synth";
// Every JSON artifact is written with 2-space indentation.
const toPrettyJson = (value) => JSON.stringify(value, null, 2);
/**
 * Writes the LLM inputs and outputs of each stage to disk, so a diagnosis can
 * be traced back to the prompts and responses that produced it. Shared by the
 * single-trace (PR-B `diagnose()`) and batch (`runBatch()`) flows, which differ
 * only in the base directory they pass in.
 *
 * Files produced under `base`:
 *   run-metadata.json
 *   stage-2-rubric/<rule_id>/{work-queue.json, chunk-NNN.{prompt.md, response.json, parse-errors.json}}
 *   stage-3-synth/{prompt.md, response.json}          ← single-trace only
 *   stage-4-cross-trace-synth/{aggregates.json, samples.json, prompt.md, response.json, parse-errors.json}   ← batch only
 *
 * When constructed with `enabled: false`, every write method is a no-op.
 */
export class ArtifactWriter {
    base;
    enabled;
    constructor(opts) {
        const { base, enabled } = opts;
        this.base = base;
        this.enabled = enabled;
    }
    /** Create `<base>/<rel>` (recursively) and return the joined path. */
    async ensureDir(rel) {
        const target = path.join(this.base, rel);
        await fs.mkdir(target, { recursive: true });
        return target;
    }
    /** File-name stem for a chunk: `chunk-` plus the index zero-padded to 3 digits. */
    chunkSlug(idx) {
        const padded = String(idx).padStart(3, "0");
        return `chunk-${padded}`;
    }
    /** Stage 2: record the conversation ids queued for a rule. */
    async writeStageTwoWorkQueue(ruleId, convIds) {
        if (this.enabled) {
            const ruleDir = await this.ensureDir(path.join(STAGE_TWO_DIR, ruleId));
            const file = path.join(ruleDir, "work-queue.json");
            await fs.writeFile(file, toPrettyJson(convIds), "utf8");
        }
    }
    /** Stage 2: record the prompt sent for one chunk of a rule. */
    async writeStageTwoPrompt(ruleId, chunkIdx, prompt) {
        if (this.enabled) {
            const ruleDir = await this.ensureDir(path.join(STAGE_TWO_DIR, ruleId));
            const file = path.join(ruleDir, `${this.chunkSlug(chunkIdx)}.prompt.md`);
            await fs.writeFile(file, prompt, "utf8");
        }
    }
    /** Stage 2: record the response received for one chunk of a rule. */
    async writeStageTwoResponse(ruleId, chunkIdx, response) {
        if (this.enabled) {
            const ruleDir = await this.ensureDir(path.join(STAGE_TWO_DIR, ruleId));
            const file = path.join(ruleDir, `${this.chunkSlug(chunkIdx)}.response.json`);
            await fs.writeFile(file, toPrettyJson(response), "utf8");
        }
    }
    /** Stage 2: record parse errors for one chunk; nothing is written for an empty list. */
    async writeStageTwoParseErrors(ruleId, chunkIdx, errors) {
        if (this.enabled && errors.length !== 0) {
            const ruleDir = await this.ensureDir(path.join(STAGE_TWO_DIR, ruleId));
            const file = path.join(ruleDir, `${this.chunkSlug(chunkIdx)}.parse-errors.json`);
            await fs.writeFile(file, toPrettyJson(errors), "utf8");
        }
    }
    /** Stage 3: record the synthesizer prompt. */
    async writeStageThreeSynthPrompt(prompt) {
        if (this.enabled) {
            const stageDir = await this.ensureDir(STAGE_THREE_DIR);
            await fs.writeFile(path.join(stageDir, "prompt.md"), prompt, "utf8");
        }
    }
    /** Stage 3: record the synthesizer response. */
    async writeStageThreeSynthResponse(response) {
        if (this.enabled) {
            const stageDir = await this.ensureDir(STAGE_THREE_DIR);
            await fs.writeFile(path.join(stageDir, "response.json"), toPrettyJson(response), "utf8");
        }
    }
    /** Stage 4: record the aggregates and samples fed to the cross-trace synthesizer. */
    async writeStageFourInputs(aggregates, samples) {
        if (this.enabled) {
            const stageDir = await this.ensureDir(STAGE_FOUR_DIR);
            // Written one after the other, aggregates first.
            const outputs = [
                ["aggregates.json", aggregates],
                ["samples.json", samples],
            ];
            for (const [fileName, payload] of outputs) {
                await fs.writeFile(path.join(stageDir, fileName), toPrettyJson(payload), "utf8");
            }
        }
    }
    /** Stage 4: record the cross-trace synthesizer prompt. */
    async writeStageFourPrompt(prompt) {
        if (this.enabled) {
            const stageDir = await this.ensureDir(STAGE_FOUR_DIR);
            await fs.writeFile(path.join(stageDir, "prompt.md"), prompt, "utf8");
        }
    }
    /** Stage 4: record the cross-trace synthesizer response. */
    async writeStageFourResponse(response) {
        if (this.enabled) {
            const stageDir = await this.ensureDir(STAGE_FOUR_DIR);
            await fs.writeFile(path.join(stageDir, "response.json"), toPrettyJson(response), "utf8");
        }
    }
    /** Stage 4: record parse errors; nothing is written for an empty list. */
    async writeStageFourParseErrors(errors) {
        if (this.enabled && errors.length !== 0) {
            const stageDir = await this.ensureDir(STAGE_FOUR_DIR);
            await fs.writeFile(path.join(stageDir, "parse-errors.json"), toPrettyJson(errors), "utf8");
        }
    }
    /** Record run-level metadata at the top of the artifacts directory. */
    async writeRunMetadata(meta) {
        if (this.enabled) {
            await fs.mkdir(this.base, { recursive: true });
            const file = path.join(this.base, "run-metadata.json");
            await fs.writeFile(file, toPrettyJson(meta), "utf8");
        }
    }
}
@@ -0,0 +1,55 @@
1
+ import { z } from "zod";
2
+ import type { AgentProvider } from "../../agent-providers/types.js";
3
+ import { PromptTemplateRegistry, type AgentOutputLang } from "../../agent-providers/prompt-template.js";
4
+ import { ArtifactWriter } from "./artifacts/writer.js";
5
/** One trace submitted to the batched rubric evaluator. */
+ export interface BatchTraceItem {
6
+ traceId: string;
7
+ /** Real span_ids present in this trace; used to validate `first_violating_step_id`. */
8
+ spans: string[];
9
+ /** Inputs resolved per the rule's `inputs` schema. */
10
+ inputs: Record<string, unknown>;
11
+ }
12
/**
 * The rubric rule being evaluated: its id, the question put to the judge, the
 * output schema in both Zod and raw-record form, and a prompt-template reference.
 * NOTE(review): `promptTemplateRef` is presumably looked up in the
 * `PromptTemplateRegistry` passed via `RunBatchedRubricOpts` — confirm in the implementation.
 */
+ export interface BatchedRubricRule {
13
+ ruleId: string;
14
+ judgeQuestion: string;
15
+ outputSchema: z.ZodTypeAny;
16
+ outputSchemaRaw: Record<string, unknown>;
17
+ promptTemplateRef: string;
18
+ }
19
/**
 * A verdict for one trace: the assigned `category`, the `reasoning` text, a
 * `severity`, the first violating step id, and the ids of spans cited as evidence.
 */
+ export interface BatchedRubricVerdict {
20
+ traceId: string;
21
+ category: string;
22
+ reasoning: string;
23
+ severity: "low" | "medium" | "high";
24
+ firstViolatingStepId: string;
25
+ evidenceSpanIds: string[];
26
+ }
27
/** A trace that produced no verdict, with the reason it was skipped. */
+ export interface BatchedRubricSkipped {
28
+ traceId: string;
29
+ reason: string;
30
+ }
31
/** Result of `runBatchedRubric`: the verdicts plus the traces that were skipped. */
+ export interface BatchedRubricResult {
32
+ verdicts: BatchedRubricVerdict[];
33
+ skipped: BatchedRubricSkipped[];
34
+ }
35
/**
 * Options for `runBatchedRubric`. `lang`, `artifacts`, and `timeoutMs` are
 * optional; everything else, including `chunkSize`, is required.
 * NOTE(review): `timeoutMs` presumably bounds each provider call — confirm in the implementation.
 */
+ export interface RunBatchedRubricOpts {
36
+ rule: BatchedRubricRule;
37
+ traces: BatchTraceItem[];
38
+ agentId: string;
39
+ provider: AgentProvider;
40
+ promptRegistry: PromptTemplateRegistry;
41
+ chunkSize: number;
42
+ lang?: AgentOutputLang;
43
+ artifacts?: ArtifactWriter;
44
+ timeoutMs?: number;
45
+ }
46
+ /**
47
+ * Stage-2 batched rubric evaluator. Splits flagged traces into chunks of K
48
+ * (default 10), one LLM call per chunk, then validates each per-trace verdict
49
+ * against the rule's output schema PLUS two ground-truth checks:
50
+ * - trace_id must echo back one of this chunk's input trace_ids
51
+ * - first_violating_step_id must be a real span_id in THAT trace's spans
52
+ * Failures isolate to the affected trace; chunk-wide LLM failures skip the
53
+ * whole chunk with agent-error:<kind>.
 *
 * NOTE(review): `chunkSize` is a required option in `RunBatchedRubricOpts`, so
 * the "default 10" above is presumably applied by the caller — confirm.
 *
 * @param opts The rule, traces, provider, and chunking options.
 * @returns The accepted verdicts and the skipped traces.
54
+ */
55
+ export declare function runBatchedRubric(opts: RunBatchedRubricOpts): Promise<BatchedRubricResult>;