@tangle-network/agent-eval 0.23.1 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +145 -0
- package/README.md +212 -79
- package/dist/baseline-4R5deP0N.d.ts +108 -0
- package/dist/benchmarks/index.d.ts +3 -2
- package/dist/benchmarks/index.js +1 -1
- package/dist/builder-eval/index.d.ts +249 -0
- package/dist/builder-eval/index.js +391 -0
- package/dist/builder-eval/index.js.map +1 -0
- package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
- package/dist/chunk-2A5XJB43.js.map +1 -0
- package/dist/chunk-47X6LRCE.js +76 -0
- package/dist/chunk-47X6LRCE.js.map +1 -0
- package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
- package/dist/chunk-4F5DQN55.js.map +1 -0
- package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
- package/dist/chunk-4S4BM3QQ.js.map +1 -0
- package/dist/chunk-5BKGXME7.js +65 -0
- package/dist/chunk-5BKGXME7.js.map +1 -0
- package/dist/{chunk-6KQG5HAH.js → chunk-5LBB5B3Z.js} +376 -72
- package/dist/chunk-5LBB5B3Z.js.map +1 -0
- package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
- package/dist/chunk-6QDKWHLS.js.map +1 -0
- package/dist/{chunk-VQQSPGSM.js → chunk-EDUKQ5AM.js} +247 -189
- package/dist/chunk-EDUKQ5AM.js.map +1 -0
- package/dist/chunk-I4MBDTY5.js +272 -0
- package/dist/chunk-I4MBDTY5.js.map +1 -0
- package/dist/chunk-JLZQWFV3.js +618 -0
- package/dist/chunk-JLZQWFV3.js.map +1 -0
- package/dist/chunk-K2TPS5LB.js +569 -0
- package/dist/chunk-K2TPS5LB.js.map +1 -0
- package/dist/chunk-KKHDIONI.js +414 -0
- package/dist/chunk-KKHDIONI.js.map +1 -0
- package/dist/chunk-KMPRBJK4.js +74 -0
- package/dist/chunk-KMPRBJK4.js.map +1 -0
- package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
- package/dist/chunk-KTGTIOFD.js.map +1 -0
- package/dist/chunk-LSH4MMOZ.js +838 -0
- package/dist/chunk-LSH4MMOZ.js.map +1 -0
- package/dist/chunk-NG236HPC.js +57 -0
- package/dist/chunk-NG236HPC.js.map +1 -0
- package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
- package/dist/chunk-NLMNWKVM.js.map +1 -0
- package/dist/chunk-NU65VQ7M.js +99 -0
- package/dist/chunk-NU65VQ7M.js.map +1 -0
- package/dist/chunk-OWLAAMME.js +250 -0
- package/dist/chunk-OWLAAMME.js.map +1 -0
- package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
- package/dist/chunk-PC4UYEBM.js.map +1 -0
- package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
- package/dist/chunk-RAF443UI.js.map +1 -0
- package/dist/chunk-RZTMDUO7.js +49 -0
- package/dist/chunk-RZTMDUO7.js.map +1 -0
- package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
- package/dist/chunk-SESZDQPX.js.map +1 -0
- package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
- package/dist/chunk-TVVP3ZZQ.js.map +1 -0
- package/dist/chunk-WWYCWKUM.js +196 -0
- package/dist/chunk-WWYCWKUM.js.map +1 -0
- package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
- package/dist/chunk-YRZ4M5GS.js.map +1 -0
- package/dist/chunk-ZN274SWR.js +613 -0
- package/dist/chunk-ZN274SWR.js.map +1 -0
- package/dist/cli.js +10 -6
- package/dist/cli.js.map +1 -1
- package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
- package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
- package/dist/control.d.ts +8 -6
- package/dist/control.js +10 -7
- package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
- package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
- package/dist/errors-BZ9sTdz7.d.ts +70 -0
- package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
- package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
- package/dist/governance/index.d.ts +5 -0
- package/dist/governance/index.js +18 -0
- package/dist/governance/index.js.map +1 -0
- package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
- package/dist/index-Oj9fAPPN.d.ts +270 -0
- package/dist/index.d.ts +2018 -3003
- package/dist/index.js +7443 -9102
- package/dist/index.js.map +1 -1
- package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
- package/dist/knowledge/index.d.ts +102 -0
- package/dist/knowledge/index.js +18 -0
- package/dist/knowledge/index.js.map +1 -0
- package/dist/meta-eval/index.d.ts +99 -0
- package/dist/meta-eval/index.js +324 -0
- package/dist/meta-eval/index.js.map +1 -0
- package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
- package/dist/openapi.json +491 -1
- package/dist/optimization.d.ts +11 -8
- package/dist/optimization.js +11 -9
- package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
- package/dist/pipelines/index.d.ts +172 -0
- package/dist/pipelines/index.js +345 -0
- package/dist/pipelines/index.js.map +1 -0
- package/dist/prm/index.d.ts +99 -0
- package/dist/prm/index.js +222 -0
- package/dist/prm/index.js.map +1 -0
- package/dist/query-DODUYdPg.d.ts +30 -0
- package/dist/release-report-BNgMdqPF.d.ts +292 -0
- package/dist/replay-BL96gCEP.d.ts +226 -0
- package/dist/reporting.d.ts +10 -295
- package/dist/reporting.js +10 -6
- package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-BPT8x_NT.d.ts} +148 -146
- package/dist/rl.d.ts +1762 -8
- package/dist/rl.js +2035 -58
- package/dist/rl.js.map +1 -1
- package/dist/rubric-D5tjHNJQ.d.ts +72 -0
- package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
- package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
- package/dist/sequential-Dgz1n51-.d.ts +139 -0
- package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
- package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-C7VPYEj2.d.ts} +3 -76
- package/dist/telemetry/file.js +4 -1
- package/dist/telemetry/file.js.map +1 -1
- package/dist/telemetry/index.js +57 -57
- package/dist/telemetry/index.js.map +1 -1
- package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
- package/dist/traces.d.ts +142 -387
- package/dist/traces.js +1302 -40
- package/dist/traces.js.map +1 -1
- package/dist/trajectory-CnoBo-JY.d.ts +32 -0
- package/dist/wire/index.d.ts +369 -25
- package/dist/wire/index.js +22 -3
- package/package.json +44 -18
- package/dist/chunk-42I2QC2L.js.map +0 -1
- package/dist/chunk-5IIQKMD5.js.map +0 -1
- package/dist/chunk-6KQG5HAH.js.map +0 -1
- package/dist/chunk-6M774GY6.js.map +0 -1
- package/dist/chunk-7EAUOUQS.js.map +0 -1
- package/dist/chunk-AXHNWLIX.js.map +0 -1
- package/dist/chunk-EXGR4XEM.js.map +0 -1
- package/dist/chunk-IOXMGMHQ.js.map +0 -1
- package/dist/chunk-KAO3Q65R.js.map +0 -1
- package/dist/chunk-LZKIOBG2.js +0 -2026
- package/dist/chunk-LZKIOBG2.js.map +0 -1
- package/dist/chunk-QBW3YBTR.js.map +0 -1
- package/dist/chunk-QUKKGHTZ.js.map +0 -1
- package/dist/chunk-SQQLHODJ.js.map +0 -1
- package/dist/chunk-V5QSWN7L.js +0 -1310
- package/dist/chunk-V5QSWN7L.js.map +0 -1
- package/dist/chunk-VQQSPGSM.js.map +0 -1
- package/dist/chunk-XPHOZPOM.js +0 -1947
- package/dist/chunk-XPHOZPOM.js.map +0 -1
- package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
- package/dist/index-ekBXweiQ.d.ts +0 -1894
- package/dist/sequential-DgU2mFsE.d.ts +0 -304
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-D5tjHNJQ.js';
|
|
2
|
+
export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-D5tjHNJQ.js';
|
|
3
|
+
import { S as Span, T as TraceStore } from '../store-Db2Bv8Cf.js';
|
|
4
|
+
import '../trajectory-CnoBo-JY.js';
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Export PRM-graded traces as training data for downstream reward-model
|
|
8
|
+
* fine-tuning. Canonical format is NDJSON of
|
|
9
|
+
* `{ trajectory_text, step_index, rubric, score }` so a small model can
|
|
10
|
+
* learn to predict step rewards from step context.
|
|
11
|
+
*
|
|
12
|
+
* The framework doesn't train the model — we emit the data; callers
|
|
13
|
+
* plug it into their preferred trainer (TRL, Unsloth, custom).
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
interface PrmTrainingSample {
|
|
17
|
+
runId: string;
|
|
18
|
+
spanId: string;
|
|
19
|
+
rubricId: string;
|
|
20
|
+
score: number;
|
|
21
|
+
/** Serialized step context — step + surrounding conversation. */
|
|
22
|
+
context: {
|
|
23
|
+
priorTurns: Array<{
|
|
24
|
+
role: string;
|
|
25
|
+
content: string;
|
|
26
|
+
}>;
|
|
27
|
+
step: {
|
|
28
|
+
kind: Span['kind'];
|
|
29
|
+
text: string;
|
|
30
|
+
};
|
|
31
|
+
};
|
|
32
|
+
/** Optional evidence + rationale for auditability. */
|
|
33
|
+
rationale?: string;
|
|
34
|
+
evidence?: string;
|
|
35
|
+
}
|
|
36
|
+
declare function exportTrainingData(store: TraceStore, graded: PrmGradedTrace[], options?: {
|
|
37
|
+
contextWindow?: number;
|
|
38
|
+
}): Promise<PrmTrainingSample[]>;
|
|
39
|
+
/** NDJSON serialization — write to file or stream directly to a trainer. */
|
|
40
|
+
declare function toNdjson(samples: PrmTrainingSample[]): string;
|
|
41
|
+
|
|
42
|
+
/**
|
|
43
|
+
* Built-in reference rubrics. Consumers combine these with domain
|
|
44
|
+
* rubrics. All are deterministic, rule-based — cheap to run + easy
|
|
45
|
+
* to unit-test. LLM-based rubrics are trivially authored by
|
|
46
|
+
* following the StepRubric contract.
|
|
47
|
+
*/
|
|
48
|
+
|
|
49
|
+
/** Penalize very short or very long assistant outputs. */
|
|
50
|
+
declare function outputLengthRubric(args?: {
|
|
51
|
+
minChars?: number;
|
|
52
|
+
maxChars?: number;
|
|
53
|
+
weight?: number;
|
|
54
|
+
}): StepRubric;
|
|
55
|
+
/** Reward tool calls that did not fail (status is not 'error') and returned an informative result. */
|
|
56
|
+
declare function toolSuccessRubric(args?: {
|
|
57
|
+
weight?: number;
|
|
58
|
+
}): StepRubric;
|
|
59
|
+
/** Penalize tool calls that duplicate a prior call with identical args. */
|
|
60
|
+
declare function toolNonRedundantRubric(args?: {
|
|
61
|
+
weight?: number;
|
|
62
|
+
}): StepRubric;
|
|
63
|
+
/** Penalize LLM outputs that contain common refusal markers when a refusal
|
|
64
|
+
* is NOT expected (caller inverts weight for scenarios where refusal IS expected). */
|
|
65
|
+
declare function nonRefusalRubric(args?: {
|
|
66
|
+
markers?: RegExp[];
|
|
67
|
+
weight?: number;
|
|
68
|
+
}): StepRubric;
|
|
69
|
+
/** Reward outputs that invoke the next-step tool the trajectory actually uses
|
|
70
|
+
* (i.e. the LLM span's output mentions X by name and X is the first tool span later in the trajectory). */
|
|
71
|
+
declare function toolIntentAlignmentRubric(args?: {
|
|
72
|
+
weight?: number;
|
|
73
|
+
}): StepRubric;
|
|
74
|
+
|
|
75
|
+
/**
|
|
76
|
+
* Inference-time PRM scoring — pick the best of N candidate trajectories
|
|
77
|
+
* using a trained reward model (or a rule-based PRM as a proxy).
|
|
78
|
+
*
|
|
79
|
+
* The canonical Best-of-N pattern: generate N completions, score each
|
|
80
|
+
* with a PRM, pick the winner. Here the scoring loop is framework-agnostic
|
|
81
|
+
* — supply a TraceStore + PrmGrader + N run IDs → get ranking + winner.
|
|
82
|
+
*/
|
|
83
|
+
|
|
84
|
+
interface BestOfNResult {
|
|
85
|
+
winner: PrmGradedTrace;
|
|
86
|
+
ranked: PrmGradedTrace[];
|
|
87
|
+
/** Standard deviation of aggregate scores — small = candidates were homogeneous. */
|
|
88
|
+
stdDev: number;
|
|
89
|
+
}
|
|
90
|
+
declare function prmBestOfN(store: TraceStore, grader: PrmGrader, runIds: string[]): Promise<BestOfNResult>;
|
|
91
|
+
/**
|
|
92
|
+
* Equal-weight vote across multiple graders — use when you want a PRM ensemble
|
|
93
|
+
* (e.g. rule-based + LLM-based + trained model). Each grader produces its
|
|
94
|
+
* own ranking; we aggregate via rank-sum (Borda count) so no single grader
|
|
95
|
+
* dominates via a different score scale.
|
|
96
|
+
*/
|
|
97
|
+
declare function prmEnsembleBestOfN(store: TraceStore, graders: PrmGrader[], runIds: string[]): Promise<BestOfNResult>;
|
|
98
|
+
|
|
99
|
+
export { type BestOfNResult, PrmGradedTrace, PrmGrader, type PrmTrainingSample, StepRubric, exportTrainingData, nonRefusalRubric, outputLengthRubric, prmBestOfN, prmEnsembleBestOfN, toNdjson, toolIntentAlignmentRubric, toolNonRedundantRubric, toolSuccessRubric };
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
import {
|
|
2
|
+
exportTrainingData,
|
|
3
|
+
toNdjson
|
|
4
|
+
} from "../chunk-KMPRBJK4.js";
|
|
5
|
+
import {
|
|
6
|
+
buildTrajectory
|
|
7
|
+
} from "../chunk-RZTMDUO7.js";
|
|
8
|
+
import "../chunk-5BKGXME7.js";
|
|
9
|
+
import {
|
|
10
|
+
TraceEmitter
|
|
11
|
+
} from "../chunk-TVVP3ZZQ.js";
|
|
12
|
+
import "../chunk-PZ5AY32C.js";
|
|
13
|
+
|
|
14
|
+
// src/prm/builtin-rubrics.ts
|
|
15
|
+
/**
 * Build a rubric that scores LLM spans by the character length of their output.
 *
 * Scoring:
 *  - empty output            -> 0
 *  - shorter than `minChars` -> linear ramp `len / minChars`
 *  - longer than `maxChars`  -> linear decay `1 - (len - maxChars) / maxChars`, floored at 0
 *  - otherwise               -> 1
 *
 * @param args Optional settings: `minChars` (default 20), `maxChars` (default 8000),
 *   `weight` (default 0.5).
 * @returns A rubric object with id "output-length" restricted to spans of kind "llm".
 */
function outputLengthRubric(args = {}) {
  const lowerBound = args.minChars ?? 20;
  const upperBound = args.maxChars ?? 8e3;
  const weight = args.weight ?? 0.5;

  // Pure scoring function: maps an output length to a verdict.
  const verdictFor = (size) => {
    let score = 1;
    let rationale = `${size} chars in bounds`;
    if (size === 0) {
      score = 0;
      rationale = "empty output";
    } else if (size < lowerBound) {
      score = Math.max(0, size / lowerBound);
      rationale = `below min (${size} < ${lowerBound})`;
    } else if (size > upperBound) {
      score = Math.max(0, 1 - (size - upperBound) / upperBound);
      rationale = `above max (${size} > ${upperBound})`;
    }
    return { score, rationale };
  };

  return {
    id: "output-length",
    kinds: ["llm"],
    weight,
    async grade(ctx) {
      // A missing output is treated the same as an empty string.
      const text = ctx.step.span.output ?? "";
      return verdictFor(text.length);
    }
  };
}
|
|
37
|
+
/**
 * Build a rubric that scores tool spans by whether the call failed and how
 * substantial its result is.
 *
 * Scoring:
 *  - `status === "error"`              -> 0
 *  - `result` is null / undefined       -> 0.3
 *  - result text shorter than 4 chars   -> 0.5
 *  - otherwise                          -> 1
 *
 * Any status other than "error" is treated as a non-failure.
 *
 * @param args Optional settings: `weight` (default 1).
 * @returns A rubric object with id "tool-success" restricted to spans of kind "tool".
 */
function toolSuccessRubric(args = {}) {
  // Render a result as text without ever throwing. JSON.stringify returns
  // undefined for functions/symbols and throws on circular structures and
  // BigInt values; a grading pass should not crash on such results.
  const resultToText = (value) => {
    if (typeof value === "string") return value;
    try {
      const json = JSON.stringify(value);
      if (typeof json === "string") return json;
    } catch {
      // fall through to the plain string conversion below
    }
    try {
      return String(value);
    } catch {
      // e.g. objects without a usable toString (null-prototype objects)
      return Object.prototype.toString.call(value);
    }
  };

  return {
    id: "tool-success",
    kinds: ["tool"],
    weight: args.weight ?? 1,
    async grade({ step }) {
      const tool = step.span;
      if (tool.status === "error")
        return { score: 0, rationale: `error: ${tool.error ?? "unknown"}` };
      const r = tool.result;
      if (r === null || r === void 0) return { score: 0.3, rationale: "empty result" };
      const asText = resultToText(r);
      if (asText.length < 4) return { score: 0.5, rationale: "tiny result" };
      return { score: 1, rationale: `${tool.toolName} ok` };
    }
  };
}
|
|
54
|
+
/**
 * Build a rubric that penalizes a tool call when earlier steps already made
 * the same call (same `toolName` and structurally equal `args`).
 *
 * Scoring: 1 for a novel call, otherwise `1 - 0.5 * duplicates`, floored at 0
 * (one earlier duplicate -> 0.5, two or more -> 0).
 *
 * @param args Optional settings: `weight` (default 0.5).
 * @returns A rubric object with id "tool-non-redundant" restricted to spans of kind "tool".
 */
function toolNonRedundantRubric(args = {}) {
  const weight = args.weight ?? 0.5;
  return {
    id: "tool-non-redundant",
    kinds: ["tool"],
    weight,
    async grade({ step, prior }) {
      const tool = step.span;
      // The current call's serialized args do not change across prior steps:
      // compute them once, and only if a prior call with the same tool name
      // actually exists.
      let currentArgsKey;
      let currentArgsReady = false;
      const priorMatches = prior.filter((p) => {
        if (p.span.kind !== "tool") return false;
        const pt = p.span;
        if (pt.toolName !== tool.toolName) return false;
        if (!currentArgsReady) {
          currentArgsKey = stableStringify(tool.args);
          currentArgsReady = true;
        }
        return stableStringify(pt.args) === currentArgsKey;
      });
      if (priorMatches.length === 0) return { score: 1, rationale: "novel call" };
      return {
        score: Math.max(0, 1 - priorMatches.length * 0.5),
        rationale: `${priorMatches.length} duplicate(s)`
      };
    }
  };
}
|
|
75
|
+
/**
 * Build a rubric that scores an LLM span 0 when its output matches any
 * refusal marker and 1 otherwise.
 *
 * @param args Optional settings: `markers` (regular expressions that signal a
 *   refusal; defaults to two built-in patterns) and `weight` (default 1).
 * @returns A rubric object with id "non-refusal" restricted to spans of kind "llm".
 */
function nonRefusalRubric(args = {}) {
  const weight = args.weight ?? 1;
  const markers = args.markers ?? [
    /\bi\s+(?:can(?:not|'t)|won't|will\s+not)\b/i,
    /\b(?:as\s+an?\s+)?ai\b.*?\b(?:can't|cannot)\b/i
  ];
  return {
    id: "non-refusal",
    kinds: ["llm"],
    weight,
    async grade({ step }) {
      const llm = step.span;
      const out = llm.output ?? "";
      const refused = markers.some((re) => {
        // Global/sticky regexes keep `lastIndex` between test() calls, so the
        // same text would match only on alternating calls. Always scan from
        // the start so grading is stateless.
        if (re.global || re.sticky) re.lastIndex = 0;
        return re.test(out);
      });
      return refused ? { score: 0, rationale: "refusal marker present" } : { score: 1, rationale: "no refusal" };
    }
  };
}
|
|
93
|
+
/**
 * Build a rubric that checks whether an LLM span's output mentions, by name,
 * the first tool span that appears later in the trajectory.
 *
 * Scoring: 1 when the tool name occurs (case-insensitively, as a substring)
 * in the output, 0.5 when it does not. Resolves to `null` (skip) when no
 * later tool span exists or when that span carries no usable tool name.
 *
 * NOTE(review): the lookup takes the first tool span anywhere in `next`, not
 * only one directly after this step — confirm that is the intended pairing.
 *
 * @param args Optional settings: `weight` (default 0.5).
 * @returns A rubric object with id "tool-intent-alignment" restricted to spans of kind "llm".
 */
function toolIntentAlignmentRubric(args = {}) {
  return {
    id: "tool-intent-alignment",
    kinds: ["llm"],
    weight: args.weight ?? 0.5,
    async grade({ step, next }) {
      const llm = step.span;
      const nextTool = next.find((s) => s.span.kind === "tool");
      if (!nextTool) return null;
      const toolName = nextTool.span.toolName;
      // A missing name would throw on toLowerCase(), and an empty name would
      // match every output via includes(""); neither can be graded.
      if (typeof toolName !== "string" || toolName.length === 0) return null;
      const out = (llm.output ?? "").toLowerCase();
      const mentioned = out.includes(toolName.toLowerCase());
      return mentioned ? { score: 1, rationale: `mentioned "${toolName}" before calling it` } : { score: 0.5, rationale: `called "${toolName}" without announcing it` };
    }
  };
}
|
|
109
|
+
/**
 * Serialize a value to a JSON-like string with object keys sorted, so two
 * structurally equal values produce the same text regardless of key order.
 *
 * Beyond plain recursion this:
 *  - honors `toJSON()` the way `JSON.stringify` does, so values such as `Date`
 *    keep their content instead of all collapsing to `{}`;
 *  - replaces a reference back to an enclosing object with `"[Circular]"`
 *    instead of recursing until the stack overflows.
 *
 * @param value Any value.
 * @returns The serialized text. Like `JSON.stringify`, the result is
 *   `undefined` (not a string) for a top-level `undefined`, function or symbol.
 */
function stableStringify(value) {
  // Objects currently being serialized on the path from the root to the node.
  const ancestors = [];

  function render(node) {
    if (node === null || typeof node !== "object") return JSON.stringify(node);
    if (typeof node.toJSON === "function") {
      const json = node.toJSON();
      // Guard against a toJSON() that returns the object itself.
      if (json !== node) return render(json);
    }
    if (ancestors.includes(node)) return '"[Circular]"';
    ancestors.push(node);
    try {
      if (Array.isArray(node)) return `[${node.map((item) => render(item)).join(",")}]`;
      const keys = Object.keys(node).sort();
      return `{${keys.map((k) => `${JSON.stringify(k)}:${render(node[k])}`).join(",")}}`;
    } finally {
      ancestors.pop();
    }
  }

  return render(value);
}
|
|
115
|
+
|
|
116
|
+
// src/prm/inference.ts
|
|
117
|
+
/**
 * Grade every candidate run with one grader and pick the highest-scoring one.
 *
 * @param store Trace store handed through to `grader.grade`.
 * @param grader Object exposing `grade(store, runId)` that resolves to a graded
 *   trace with an `aggregateScore`.
 * @param runIds Candidate run ids; must not be empty.
 * @returns `{ winner, ranked, stdDev }` — `ranked` is sorted by descending
 *   `aggregateScore`, `winner` is its first entry, and `stdDev` is the
 *   population standard deviation of the aggregate scores.
 * @throws Error when `runIds` is empty.
 */
async function prmBestOfN(store, grader, runIds) {
  if (runIds.length === 0) {
    throw new Error("prmBestOfN: at least 1 candidate required");
  }

  // All candidates are graded concurrently.
  const candidates = await Promise.all(runIds.map((runId) => grader.grade(store, runId)));
  const count = candidates.length;

  let total = 0;
  for (const candidate of candidates) total += candidate.aggregateScore;
  const average = total / count;

  let squaredDeviation = 0;
  for (const candidate of candidates) squaredDeviation += (candidate.aggregateScore - average) ** 2;

  // Sort a copy so the grading order of `candidates` is left untouched.
  const ranked = candidates.slice().sort((left, right) => right.aggregateScore - left.aggregateScore);

  return { winner: ranked[0], ranked, stdDev: Math.sqrt(squaredDeviation / count) };
}
|
|
125
|
+
/**
 * Pick the best candidate run by combining the rankings of several graders.
 *
 * Each grader ranks all candidates by its own `aggregateScore`; a candidate
 * earns `N - rank` points per grader (Borda count, every grader counts
 * equally), so graders with different score scales cannot dominate.
 *
 * The returned traces are the FIRST grader's graded traces, reordered by total
 * Borda points; `stdDev` is the population standard deviation of that first
 * grader's aggregate scores.
 *
 * @param store Trace store handed through to each grader's `grade`.
 * @param graders Graders exposing `grade(store, runId)`; must not be empty.
 * @param runIds Candidate run ids; must not be empty.
 * @returns `{ winner, ranked, stdDev }`.
 * @throws Error when `graders` or `runIds` is empty.
 */
async function prmEnsembleBestOfN(store, graders, runIds) {
  if (graders.length === 0) throw new Error("prmEnsembleBestOfN: at least 1 grader");
  // Without candidates there is no winner and the mean is 0/0; fail loudly,
  // matching prmBestOfN, instead of returning { winner: undefined, stdDev: NaN }.
  if (runIds.length === 0) throw new Error("prmEnsembleBestOfN: at least 1 candidate required");
  const perGrader = await Promise.all(
    graders.map(async (g) => {
      const graded = await Promise.all(runIds.map((id) => g.grade(store, id)));
      return graded.sort((a, b) => b.aggregateScore - a.aggregateScore);
    })
  );
  // Borda: sum of (N - rank) across graders, keyed by run id.
  const bordaScores = /* @__PURE__ */ new Map();
  for (const ranking of perGrader) {
    ranking.forEach((g, rank) => {
      bordaScores.set(g.runId, (bordaScores.get(g.runId) ?? 0) + (ranking.length - rank));
    });
  }
  // Use the first grader's traces as the representative objects to return.
  const canonical = perGrader[0];
  const byRun = new Map(canonical.map((g) => [g.runId, g]));
  const ranked = [...byRun.values()].sort(
    (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
  );
  const mean = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
  const variance = ranked.reduce((a, g) => a + (g.aggregateScore - mean) ** 2, 0) / ranked.length;
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance) };
}
|
|
148
|
+
|
|
149
|
+
// src/prm/rubric.ts
|
|
150
|
+
/**
 * Applies a list of step rubrics to every step of a run's trajectory and
 * aggregates the verdicts into a weighted mean.
 */
var PrmGrader = class {
  /** The rubrics applied to each step, in the order given to the constructor. */
  rubrics;
  /**
   * @param rubrics Non-empty array of rubrics (`{ id, kinds?, weight?, grade }`).
   * @throws Error when `rubrics` is empty.
   */
  constructor(rubrics) {
    this.rubrics = rubrics;
    if (rubrics.length === 0) throw new Error("PrmGrader: at least 1 rubric required");
  }
  /**
   * Grade every eligible span in a run. Emits a JudgeVerdict span for each
   * (rubric × span) verdict so the result is visible to downstream pipelines
   * (judgeAgreementView, etc.) — PRM is just "a judge that runs per span."
   *
   * @param store Trace store used to build the trajectory and record verdicts.
   * @param runId Run to grade.
   * @returns `{ runId, steps, aggregateScore, gradedCount, ungradedCount }`.
   *   `aggregateScore` is the weight-weighted mean of all verdict scores (0 when
   *   the total weight is 0). `gradedCount` is the number of verdicts, i.e.
   *   (rubric × span) pairs, while `ungradedCount` is the number of steps for
   *   which no rubric produced a verdict — the two do not sum to the step count.
   */
  async grade(store, runId) {
    const trajectory = await buildTrajectory(store, runId);
    const emitter = new TraceEmitter(store, { runId });
    const steps = [];
    let ungraded = 0;
    for (let i = 0; i < trajectory.steps.length; i++) {
      const step = trajectory.steps[i];
      const ctx = {
        trajectory,
        step,
        prior: trajectory.steps.slice(0, i),
        next: trajectory.steps.slice(i + 1)
      };
      let gradedThis = false;
      for (const rubric of this.rubrics) {
        // Rubrics without a `kinds` filter apply to every span kind.
        if (rubric.kinds && !rubric.kinds.includes(step.span.kind)) continue;
        const verdict = await rubric.grade(ctx);
        // `null` is the documented "skip" signal; a rubric that falls off the
        // end of its function resolves to `undefined`, which is treated the
        // same way instead of crashing on `verdict.score`.
        if (verdict == null) continue;
        const weight = rubric.weight ?? 1;
        steps.push({
          spanId: step.span.spanId,
          rubricId: rubric.id,
          score: verdict.score,
          weight,
          rationale: verdict.rationale,
          evidence: verdict.evidence
        });
        gradedThis = true;
        // Persist the verdict as a judge record so query pipelines can see it.
        await emitter.recordJudge({
          judgeId: `prm:${rubric.id}`,
          targetSpanId: step.span.spanId,
          dimension: "step_quality",
          score: verdict.score,
          rationale: verdict.rationale,
          evidence: verdict.evidence,
          name: `prm:${rubric.id}`
        });
      }
      if (!gradedThis) ungraded++;
    }
    const totalWeight = steps.reduce((a, s) => a + s.weight, 0);
    const aggregateScore = totalWeight === 0 ? 0 : steps.reduce((a, s) => a + s.score * s.weight, 0) / totalWeight;
    return { runId, steps, aggregateScore, gradedCount: steps.length, ungradedCount: ungraded };
  }
};
|
|
206
|
+
/**
 * Tell whether a judge span was emitted by the PRM grader, i.e. whether its
 * `judgeId` carries the "prm:" prefix.
 *
 * Returns `false` (rather than throwing) for values without a string
 * `judgeId`, so it can be used directly as a filter over mixed span lists.
 *
 * @param verdict Span-like object to inspect.
 * @returns `true` only when `verdict.judgeId` is a string starting with "prm:".
 */
function isPrmVerdict(verdict) {
  const judgeId = verdict?.judgeId;
  return typeof judgeId === "string" && judgeId.startsWith("prm:");
}
|
|
209
|
+
export {
|
|
210
|
+
PrmGrader,
|
|
211
|
+
exportTrainingData,
|
|
212
|
+
isPrmVerdict,
|
|
213
|
+
nonRefusalRubric,
|
|
214
|
+
outputLengthRubric,
|
|
215
|
+
prmBestOfN,
|
|
216
|
+
prmEnsembleBestOfN,
|
|
217
|
+
toNdjson,
|
|
218
|
+
toolIntentAlignmentRubric,
|
|
219
|
+
toolNonRedundantRubric,
|
|
220
|
+
toolSuccessRubric
|
|
221
|
+
};
|
|
222
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/prm/builtin-rubrics.ts","../../src/prm/inference.ts","../../src/prm/rubric.ts"],"sourcesContent":["/**\n * Built-in reference rubrics. Consumers combine these with domain\n * rubrics. All are deterministic, rule-based — cheap to run + easy\n * to unit-test. LLM-based rubrics are trivially authored by\n * following the StepRubric contract.\n */\n\nimport type { LlmSpan, ToolSpan } from '../trace/schema'\nimport type { StepRubric } from './rubric'\n\n/** Penalize very short or very long assistant outputs. */\nexport function outputLengthRubric(\n args: { minChars?: number; maxChars?: number; weight?: number } = {},\n): StepRubric {\n const min = args.minChars ?? 20\n const max = args.maxChars ?? 8000\n return {\n id: 'output-length',\n kinds: ['llm'],\n weight: args.weight ?? 0.5,\n async grade({ step }) {\n const llm = step.span as LlmSpan\n const len = (llm.output ?? '').length\n if (len === 0) return { score: 0, rationale: 'empty output' }\n if (len < min)\n return { score: Math.max(0, len / min), rationale: `below min (${len} < ${min})` }\n if (len > max)\n return {\n score: Math.max(0, 1 - (len - max) / max),\n rationale: `above max (${len} > ${max})`,\n }\n return { score: 1, rationale: `${len} chars in bounds` }\n },\n }\n}\n\n/** Reward tool calls that succeeded (status='ok') with an informative result. */\nexport function toolSuccessRubric(args: { weight?: number } = {}): StepRubric {\n return {\n id: 'tool-success',\n kinds: ['tool'],\n weight: args.weight ?? 1,\n async grade({ step }) {\n const tool = step.span as ToolSpan\n if (tool.status === 'error')\n return { score: 0, rationale: `error: ${tool.error ?? 'unknown'}` }\n const r = tool.result\n if (r === null || r === undefined) return { score: 0.3, rationale: 'empty result' }\n const asText = typeof r === 'string' ? 
r : JSON.stringify(r)\n if (asText.length < 4) return { score: 0.5, rationale: 'tiny result' }\n return { score: 1, rationale: `${tool.toolName} ok` }\n },\n }\n}\n\n/** Penalize tool calls that duplicate a prior call with identical args. */\nexport function toolNonRedundantRubric(args: { weight?: number } = {}): StepRubric {\n const weight = args.weight ?? 0.5\n return {\n id: 'tool-non-redundant',\n kinds: ['tool'],\n weight,\n async grade({ step, prior }) {\n const tool = step.span as ToolSpan\n const priorMatches = prior.filter((p) => {\n if (p.span.kind !== 'tool') return false\n const pt = p.span as ToolSpan\n return (\n pt.toolName === tool.toolName && stableStringify(pt.args) === stableStringify(tool.args)\n )\n })\n if (priorMatches.length === 0) return { score: 1, rationale: 'novel call' }\n return {\n score: Math.max(0, 1 - priorMatches.length * 0.5),\n rationale: `${priorMatches.length} duplicate(s)`,\n }\n },\n }\n}\n\n/** Penalize LLM outputs that contain common refusal markers when a refusal\n * is NOT expected (caller inverts weight for scenarios where refusal IS expected). */\nexport function nonRefusalRubric(args: { markers?: RegExp[]; weight?: number } = {}): StepRubric {\n const weight = args.weight ?? 1\n const markers = args.markers ?? [\n /\\bi\\s+(?:can(?:not|'t)|won't|will\\s+not)\\b/i,\n /\\b(?:as\\s+an?\\s+)?ai\\b.*?\\b(?:can't|cannot)\\b/i,\n ]\n return {\n id: 'non-refusal',\n kinds: ['llm'],\n weight,\n async grade({ step }) {\n const llm = step.span as LlmSpan\n const out = llm.output ?? ''\n const refused = markers.some((re) => re.test(out))\n return refused\n ? { score: 0, rationale: 'refusal marker present' }\n : { score: 1, rationale: 'no refusal' }\n },\n }\n}\n\n/** Reward outputs that invoke the next-step tool the trajectory actually uses\n * (i.e. the LLM span announced \"I will call X\" and the following tool span IS X). 
*/\nexport function toolIntentAlignmentRubric(args: { weight?: number } = {}): StepRubric {\n return {\n id: 'tool-intent-alignment',\n kinds: ['llm'],\n weight: args.weight ?? 0.5,\n async grade({ step, next }) {\n const llm = step.span as LlmSpan\n const nextTool = next.find((s) => s.span.kind === 'tool')\n if (!nextTool) return null\n const toolName = (nextTool.span as ToolSpan).toolName\n const out = (llm.output ?? '').toLowerCase()\n const mentioned = out.includes(toolName.toLowerCase())\n return mentioned\n ? { score: 1, rationale: `mentioned \"${toolName}\" before calling it` }\n : { score: 0.5, rationale: `called \"${toolName}\" without announcing it` }\n },\n }\n}\n\nfunction stableStringify(value: unknown): string {\n if (value === null || typeof value !== 'object') return JSON.stringify(value)\n if (Array.isArray(value)) return `[${value.map(stableStringify).join(',')}]`\n const keys = Object.keys(value as Record<string, unknown>).sort()\n return `{${keys.map((k) => `${JSON.stringify(k)}:${stableStringify((value as Record<string, unknown>)[k])}`).join(',')}}`\n}\n","/**\n * Inference-time PRM scoring — pick the best of N candidate trajectories\n * using a trained reward model (or a rule-based PRM as a proxy).\n *\n * The canonical Best-of-N pattern: generate N completions, score each\n * with a PRM, pick the winner. Here the scoring loop is framework-agnostic\n * — supply a TraceStore + PrmGrader + N run IDs → get ranking + winner.\n */\n\nimport type { TraceStore } from '../trace/store'\nimport type { PrmGradedTrace, PrmGrader } from './rubric'\n\nexport interface BestOfNResult {\n winner: PrmGradedTrace\n ranked: PrmGradedTrace[]\n /** Standard deviation of aggregate scores — small = candidates were homogenous. 
*/\n stdDev: number\n}\n\nexport async function prmBestOfN(\n store: TraceStore,\n grader: PrmGrader,\n runIds: string[],\n): Promise<BestOfNResult> {\n if (runIds.length === 0) throw new Error('prmBestOfN: at least 1 candidate required')\n const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)))\n const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore)\n const mean = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length\n const variance = graded.reduce((a, g) => a + (g.aggregateScore - mean) ** 2, 0) / graded.length\n return { winner: ranked[0]!, ranked, stdDev: Math.sqrt(variance) }\n}\n\n/**\n * Weighted vote across multiple graders — use when you want a PRM ensemble\n * (e.g. rule-based + LLM-based + trained model). Each grader produces its\n * own ranking; we aggregate via rank-sum (Borda count) so no single grader\n * dominates via a different score scale.\n */\nexport async function prmEnsembleBestOfN(\n store: TraceStore,\n graders: PrmGrader[],\n runIds: string[],\n): Promise<BestOfNResult> {\n if (graders.length === 0) throw new Error('prmEnsembleBestOfN: at least 1 grader')\n const perGrader = await Promise.all(\n graders.map(async (g) => {\n const graded = await Promise.all(runIds.map((id) => g.grade(store, id)))\n return graded.sort((a, b) => b.aggregateScore - a.aggregateScore)\n }),\n )\n // Borda: rank-sum across graders.\n const bordaScores = new Map<string, number>()\n for (const ranking of perGrader) {\n ranking.forEach((g, rank) => {\n bordaScores.set(g.runId, (bordaScores.get(g.runId) ?? 0) + (ranking.length - rank))\n })\n }\n // Return a synthesized ranking using the first grader's graded traces\n // ordered by Borda score. aggregateScore field kept for UX.\n const canonical = perGrader[0]!\n const byRun = new Map(canonical.map((g) => [g.runId, g]))\n const ranked = [...byRun.values()].sort(\n (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 
0),\n )\n const mean = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length\n const variance = ranked.reduce((a, g) => a + (g.aggregateScore - mean) ** 2, 0) / ranked.length\n return { winner: ranked[0]!, ranked, stdDev: Math.sqrt(variance) }\n}\n","/**\n * Process Reward Modeling — per-step rubric grading.\n *\n * A StepRubric inspects one span and returns a score + rationale.\n * PrmGrader applies an array of rubrics to every LLM span in a\n * trajectory (consumers can broaden to tool/retrieval spans via the\n * `kind` filter on each rubric).\n *\n * Why this matters: outcome-only eval (did the final artifact work?)\n * gives sparse reward — most agent turns are unattributable. PRMs\n * densify the signal so optimizers and RL fine-tuning can assign\n * credit per turn.\n */\n\nimport { TraceEmitter } from '../trace/emitter'\nimport type { JudgeSpan, Span } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\nimport { buildTrajectory, type Trajectory, type TrajectoryStep } from '../trajectory'\n\nexport interface StepContext {\n trajectory: Trajectory\n step: TrajectoryStep\n /** Steps preceding `step` in trajectory order. */\n prior: TrajectoryStep[]\n /** Steps following `step`. */\n next: TrajectoryStep[]\n}\n\nexport interface StepRubric {\n id: string\n /** Only grade spans of these kinds (default: all). */\n kinds?: Array<Span['kind']>\n /** Weight in the aggregate score. Default 1. */\n weight?: number\n /** Returns score in 0..1 + optional rationale/evidence. Return `null` to\n * skip grading (rubric doesn't apply to this step). */\n grade: (\n ctx: StepContext,\n ) => Promise<{ score: number; rationale?: string; evidence?: string } | null>\n}\n\nexport interface GradedStep {\n spanId: string\n rubricId: string\n score: number\n weight: number\n rationale?: string\n evidence?: string\n}\n\nexport interface PrmGradedTrace {\n runId: string\n steps: GradedStep[]\n /** Weighted mean of all graded steps; 0..1. 
*/\n aggregateScore: number\n /** Number of spans graded — useful for sanity-checking coverage. */\n gradedCount: number\n /** Number of spans in the trajectory that no rubric matched. */\n ungradedCount: number\n}\n\nexport class PrmGrader {\n constructor(private rubrics: StepRubric[]) {\n if (rubrics.length === 0) throw new Error('PrmGrader: at least 1 rubric required')\n }\n\n /**\n * Grade every eligible span in a run. Emits a JudgeVerdict span for each\n * (rubric × span) verdict so the result is visible to downstream pipelines\n * (judgeAgreementView, etc.) — PRM is just \"a judge that runs per span.\"\n */\n async grade(store: TraceStore, runId: string): Promise<PrmGradedTrace> {\n const trajectory = await buildTrajectory(store, runId)\n const emitter = new TraceEmitter(store, { runId })\n const steps: GradedStep[] = []\n let ungraded = 0\n for (let i = 0; i < trajectory.steps.length; i++) {\n const step = trajectory.steps[i]!\n const ctx: StepContext = {\n trajectory,\n step,\n prior: trajectory.steps.slice(0, i),\n next: trajectory.steps.slice(i + 1),\n }\n let gradedThis = false\n for (const rubric of this.rubrics) {\n if (rubric.kinds && !rubric.kinds.includes(step.span.kind)) continue\n const verdict = await rubric.grade(ctx)\n if (verdict === null) continue\n const weight = rubric.weight ?? 1\n steps.push({\n spanId: step.span.spanId,\n rubricId: rubric.id,\n score: verdict.score,\n weight,\n rationale: verdict.rationale,\n evidence: verdict.evidence,\n })\n gradedThis = true\n // Persist the verdict as a JudgeSpan so the query pipelines see it\n await emitter.recordJudge({\n judgeId: `prm:${rubric.id}`,\n targetSpanId: step.span.spanId,\n dimension: 'step_quality',\n score: verdict.score,\n rationale: verdict.rationale,\n evidence: verdict.evidence,\n name: `prm:${rubric.id}`,\n })\n }\n if (!gradedThis) ungraded++\n }\n\n const totalWeight = steps.reduce((a, s) => a + s.weight, 0)\n const aggregateScore =\n totalWeight === 0 ? 
0 : steps.reduce((a, s) => a + s.score * s.weight, 0) / totalWeight\n\n return { runId, steps, aggregateScore, gradedCount: steps.length, ungradedCount: ungraded }\n }\n}\n\n/** Helper: reads JudgeVerdict spans that PRM emitted so downstream pipelines\n * can distinguish PRM verdicts from human or top-level LLM judges. */\nexport function isPrmVerdict(verdict: JudgeSpan): boolean {\n return verdict.judgeId.startsWith('prm:')\n}\n"],"mappings":";;;;;;;;;;;;;;AAWO,SAAS,mBACd,OAAkE,CAAC,GACvD;AACZ,QAAM,MAAM,KAAK,YAAY;AAC7B,QAAM,MAAM,KAAK,YAAY;AAC7B,SAAO;AAAA,IACL,IAAI;AAAA,IACJ,OAAO,CAAC,KAAK;AAAA,IACb,QAAQ,KAAK,UAAU;AAAA,IACvB,MAAM,MAAM,EAAE,KAAK,GAAG;AACpB,YAAM,MAAM,KAAK;AACjB,YAAM,OAAO,IAAI,UAAU,IAAI;AAC/B,UAAI,QAAQ,EAAG,QAAO,EAAE,OAAO,GAAG,WAAW,eAAe;AAC5D,UAAI,MAAM;AACR,eAAO,EAAE,OAAO,KAAK,IAAI,GAAG,MAAM,GAAG,GAAG,WAAW,cAAc,GAAG,MAAM,GAAG,IAAI;AACnF,UAAI,MAAM;AACR,eAAO;AAAA,UACL,OAAO,KAAK,IAAI,GAAG,KAAK,MAAM,OAAO,GAAG;AAAA,UACxC,WAAW,cAAc,GAAG,MAAM,GAAG;AAAA,QACvC;AACF,aAAO,EAAE,OAAO,GAAG,WAAW,GAAG,GAAG,mBAAmB;AAAA,IACzD;AAAA,EACF;AACF;AAGO,SAAS,kBAAkB,OAA4B,CAAC,GAAe;AAC5E,SAAO;AAAA,IACL,IAAI;AAAA,IACJ,OAAO,CAAC,MAAM;AAAA,IACd,QAAQ,KAAK,UAAU;AAAA,IACvB,MAAM,MAAM,EAAE,KAAK,GAAG;AACpB,YAAM,OAAO,KAAK;AAClB,UAAI,KAAK,WAAW;AAClB,eAAO,EAAE,OAAO,GAAG,WAAW,UAAU,KAAK,SAAS,SAAS,GAAG;AACpE,YAAM,IAAI,KAAK;AACf,UAAI,MAAM,QAAQ,MAAM,OAAW,QAAO,EAAE,OAAO,KAAK,WAAW,eAAe;AAClF,YAAM,SAAS,OAAO,MAAM,WAAW,IAAI,KAAK,UAAU,CAAC;AAC3D,UAAI,OAAO,SAAS,EAAG,QAAO,EAAE,OAAO,KAAK,WAAW,cAAc;AACrE,aAAO,EAAE,OAAO,GAAG,WAAW,GAAG,KAAK,QAAQ,MAAM;AAAA,IACtD;AAAA,EACF;AACF;AAGO,SAAS,uBAAuB,OAA4B,CAAC,GAAe;AACjF,QAAM,SAAS,KAAK,UAAU;AAC9B,SAAO;AAAA,IACL,IAAI;AAAA,IACJ,OAAO,CAAC,MAAM;AAAA,IACd;AAAA,IACA,MAAM,MAAM,EAAE,MAAM,MAAM,GAAG;AAC3B,YAAM,OAAO,KAAK;AAClB,YAAM,eAAe,MAAM,OAAO,CAAC,MAAM;AACvC,YAAI,EAAE,KAAK,SAAS,OAAQ,QAAO;AACnC,cAAM,KAAK,EAAE;AACb,eACE,GAAG,aAAa,KAAK,YAAY,gBAAgB,GAAG,IAAI,MAAM,gBAAgB,KAAK,IAAI;AAAA,MAE3F,CAAC;AACD,UAAI,aAAa,WAAW,EAAG,QAAO,EAAE,OAAO,GAAG,WAAW,aAAa;AAC1E,aAAO;AAAA,QACL,OAAO
,KAAK,IAAI,GAAG,IAAI,aAAa,SAAS,GAAG;AAAA,QAChD,WAAW,GAAG,aAAa,MAAM;AAAA,MACnC;AAAA,IACF;AAAA,EACF;AACF;AAIO,SAAS,iBAAiB,OAAgD,CAAC,GAAe;AAC/F,QAAM,SAAS,KAAK,UAAU;AAC9B,QAAM,UAAU,KAAK,WAAW;AAAA,IAC9B;AAAA,IACA;AAAA,EACF;AACA,SAAO;AAAA,IACL,IAAI;AAAA,IACJ,OAAO,CAAC,KAAK;AAAA,IACb;AAAA,IACA,MAAM,MAAM,EAAE,KAAK,GAAG;AACpB,YAAM,MAAM,KAAK;AACjB,YAAM,MAAM,IAAI,UAAU;AAC1B,YAAM,UAAU,QAAQ,KAAK,CAAC,OAAO,GAAG,KAAK,GAAG,CAAC;AACjD,aAAO,UACH,EAAE,OAAO,GAAG,WAAW,yBAAyB,IAChD,EAAE,OAAO,GAAG,WAAW,aAAa;AAAA,IAC1C;AAAA,EACF;AACF;AAIO,SAAS,0BAA0B,OAA4B,CAAC,GAAe;AACpF,SAAO;AAAA,IACL,IAAI;AAAA,IACJ,OAAO,CAAC,KAAK;AAAA,IACb,QAAQ,KAAK,UAAU;AAAA,IACvB,MAAM,MAAM,EAAE,MAAM,KAAK,GAAG;AAC1B,YAAM,MAAM,KAAK;AACjB,YAAM,WAAW,KAAK,KAAK,CAAC,MAAM,EAAE,KAAK,SAAS,MAAM;AACxD,UAAI,CAAC,SAAU,QAAO;AACtB,YAAM,WAAY,SAAS,KAAkB;AAC7C,YAAM,OAAO,IAAI,UAAU,IAAI,YAAY;AAC3C,YAAM,YAAY,IAAI,SAAS,SAAS,YAAY,CAAC;AACrD,aAAO,YACH,EAAE,OAAO,GAAG,WAAW,cAAc,QAAQ,sBAAsB,IACnE,EAAE,OAAO,KAAK,WAAW,WAAW,QAAQ,0BAA0B;AAAA,IAC5E;AAAA,EACF;AACF;AAEA,SAAS,gBAAgB,OAAwB;AAC/C,MAAI,UAAU,QAAQ,OAAO,UAAU,SAAU,QAAO,KAAK,UAAU,KAAK;AAC5E,MAAI,MAAM,QAAQ,KAAK,EAAG,QAAO,IAAI,MAAM,IAAI,eAAe,EAAE,KAAK,GAAG,CAAC;AACzE,QAAM,OAAO,OAAO,KAAK,KAAgC,EAAE,KAAK;AAChE,SAAO,IAAI,KAAK,IAAI,CAAC,MAAM,GAAG,KAAK,UAAU,CAAC,CAAC,IAAI,gBAAiB,MAAkC,CAAC,CAAC,CAAC,EAAE,EAAE,KAAK,GAAG,CAAC;AACxH;;;AC9GA,eAAsB,WACpB,OACA,QACA,QACwB;AACxB,MAAI,OAAO,WAAW,EAAG,OAAM,IAAI,MAAM,2CAA2C;AACpF,QAAM,SAAS,MAAM,QAAQ,IAAI,OAAO,IAAI,CAAC,OAAO,OAAO,MAAM,OAAO,EAAE,CAAC,CAAC;AAC5E,QAAM,SAAS,CAAC,GAAG,MAAM,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,iBAAiB,EAAE,cAAc;AAC7E,QAAM,OAAO,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,gBAAgB,CAAC,IAAI,OAAO;AACvE,QAAM,WAAW,OAAO,OAAO,CAAC,GAAG,MAAM,KAAK,EAAE,iBAAiB,SAAS,GAAG,CAAC,IAAI,OAAO;AACzF,SAAO,EAAE,QAAQ,OAAO,CAAC,GAAI,QAAQ,QAAQ,KAAK,KAAK,QAAQ,EAAE;AACnE;AAQA,eAAsB,mBACpB,OACA,SACA,QACwB;AACxB,MAAI,QAAQ,WAAW,EAAG,OAAM,IAAI,MAAM,uCAAuC;AACjF,QAAM,YAAY,MAAM,QAAQ;AAAA,IAC9B,QAAQ,IAAI,OAAO,MAAM;AACvB,YAAM,SAAS,MAAM,QAAQ,IAAI,OAAO,IAAI,CAAC,OAAO,EAAE,MAAM,OAAO,EAAE,C
AAC,CAAC;AACvE,aAAO,OAAO,KAAK,CAAC,GAAG,MAAM,EAAE,iBAAiB,EAAE,cAAc;AAAA,IAClE,CAAC;AAAA,EACH;AAEA,QAAM,cAAc,oBAAI,IAAoB;AAC5C,aAAW,WAAW,WAAW;AAC/B,YAAQ,QAAQ,CAAC,GAAG,SAAS;AAC3B,kBAAY,IAAI,EAAE,QAAQ,YAAY,IAAI,EAAE,KAAK,KAAK,MAAM,QAAQ,SAAS,KAAK;AAAA,IACpF,CAAC;AAAA,EACH;AAGA,QAAM,YAAY,UAAU,CAAC;AAC7B,QAAM,QAAQ,IAAI,IAAI,UAAU,IAAI,CAAC,MAAM,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC;AACxD,QAAM,SAAS,CAAC,GAAG,MAAM,OAAO,CAAC,EAAE;AAAA,IACjC,CAAC,GAAG,OAAO,YAAY,IAAI,EAAE,KAAK,KAAK,MAAM,YAAY,IAAI,EAAE,KAAK,KAAK;AAAA,EAC3E;AACA,QAAM,OAAO,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,gBAAgB,CAAC,IAAI,OAAO;AACvE,QAAM,WAAW,OAAO,OAAO,CAAC,GAAG,MAAM,KAAK,EAAE,iBAAiB,SAAS,GAAG,CAAC,IAAI,OAAO;AACzF,SAAO,EAAE,QAAQ,OAAO,CAAC,GAAI,QAAQ,QAAQ,KAAK,KAAK,QAAQ,EAAE;AACnE;;;ACNO,IAAM,YAAN,MAAgB;AAAA,EACrB,YAAoB,SAAuB;AAAvB;AAClB,QAAI,QAAQ,WAAW,EAAG,OAAM,IAAI,MAAM,uCAAuC;AAAA,EACnF;AAAA,EAFoB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASpB,MAAM,MAAM,OAAmB,OAAwC;AACrE,UAAM,aAAa,MAAM,gBAAgB,OAAO,KAAK;AACrD,UAAM,UAAU,IAAI,aAAa,OAAO,EAAE,MAAM,CAAC;AACjD,UAAM,QAAsB,CAAC;AAC7B,QAAI,WAAW;AACf,aAAS,IAAI,GAAG,IAAI,WAAW,MAAM,QAAQ,KAAK;AAChD,YAAM,OAAO,WAAW,MAAM,CAAC;AAC/B,YAAM,MAAmB;AAAA,QACvB;AAAA,QACA;AAAA,QACA,OAAO,WAAW,MAAM,MAAM,GAAG,CAAC;AAAA,QAClC,MAAM,WAAW,MAAM,MAAM,IAAI,CAAC;AAAA,MACpC;AACA,UAAI,aAAa;AACjB,iBAAW,UAAU,KAAK,SAAS;AACjC,YAAI,OAAO,SAAS,CAAC,OAAO,MAAM,SAAS,KAAK,KAAK,IAAI,EAAG;AAC5D,cAAM,UAAU,MAAM,OAAO,MAAM,GAAG;AACtC,YAAI,YAAY,KAAM;AACtB,cAAM,SAAS,OAAO,UAAU;AAChC,cAAM,KAAK;AAAA,UACT,QAAQ,KAAK,KAAK;AAAA,UAClB,UAAU,OAAO;AAAA,UACjB,OAAO,QAAQ;AAAA,UACf;AAAA,UACA,WAAW,QAAQ;AAAA,UACnB,UAAU,QAAQ;AAAA,QACpB,CAAC;AACD,qBAAa;AAEb,cAAM,QAAQ,YAAY;AAAA,UACxB,SAAS,OAAO,OAAO,EAAE;AAAA,UACzB,cAAc,KAAK,KAAK;AAAA,UACxB,WAAW;AAAA,UACX,OAAO,QAAQ;AAAA,UACf,WAAW,QAAQ;AAAA,UACnB,UAAU,QAAQ;AAAA,UAClB,MAAM,OAAO,OAAO,EAAE;AAAA,QACxB,CAAC;AAAA,MACH;AACA,UAAI,CAAC,WAAY;AAAA,IACnB;AAEA,UAAM,cAAc,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,QAAQ,CAAC;AAC1D,UAAM,iBACJ,gBAAgB,IAAI,IAAI,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,QAAQ,EAAE,QAAQ,CAAC,IAAI;AAE9E,WAAO,EA
AE,OAAO,OAAO,gBAAgB,aAAa,MAAM,QAAQ,eAAe,SAAS;AAAA,EAC5F;AACF;AAIO,SAAS,aAAa,SAA6B;AACxD,SAAO,QAAQ,QAAQ,WAAW,MAAM;AAC1C;","names":[]}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import { L as LlmSpan, T as TraceStore, J as JudgeSpan, R as Run, F as FailureClass, a as ToolSpan } from './store-Db2Bv8Cf.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Typed query helpers over TraceStore.
|
|
5
|
+
*
|
|
6
|
+
* Not a full SQL engine — a minimal, composable set of operators that
|
|
7
|
+
* cover the canned-pipeline use cases. For ad-hoc analytics, persist to
|
|
8
|
+
* NDJSON and point DuckDB at it; the schema is stable so external SQL
|
|
9
|
+
* tooling works out of the box.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
/**
 * Query the trace store for the runs of one scenario.
 *
 * @param store - Trace store to query.
 * @param scenarioId - Identifier of the scenario to look up.
 * @returns Promise resolving to an array of `Run` records.
 *
 * NOTE(review): only the declaration is visible here — presumably the result
 * is every run whose scenario matches `scenarioId`; confirm filtering and
 * ordering against the implementation.
 */
declare function runsForScenario(store: TraceStore, scenarioId: string): Promise<Run[]>;
|
|
13
|
+
/**
 * Query the trace store for LLM spans.
 *
 * @param store - Trace store to query.
 * @param runId - Optional run identifier.
 * @returns Promise resolving to an array of `LlmSpan` records.
 *
 * NOTE(review): `runId` is optional — presumably omitting it returns LLM spans
 * across all runs; confirm against the implementation.
 */
declare function llmSpans(store: TraceStore, runId?: string): Promise<LlmSpan[]>;
|
|
14
|
+
/**
 * Query the trace store for tool spans.
 *
 * @param store - Trace store to query.
 * @param runId - Optional run identifier.
 * @param toolName - Optional tool name.
 * @returns Promise resolving to an array of `ToolSpan` records.
 *
 * NOTE(review): both filters are optional — presumably each one narrows the
 * result only when provided; confirm against the implementation.
 */
declare function toolSpans(store: TraceStore, runId?: string, toolName?: string): Promise<ToolSpan[]>;
|
|
15
|
+
/**
 * Query the trace store for judge spans.
 *
 * @param store - Trace store to query.
 * @param runId - Optional run identifier.
 * @returns Promise resolving to an array of `JudgeSpan` records.
 *
 * NOTE(review): `runId` is optional — presumably omitting it returns judge
 * spans across all runs; confirm against the implementation.
 */
declare function judgeSpans(store: TraceStore, runId?: string): Promise<JudgeSpan[]>;
|
|
16
|
+
/**
 * Group items by the key that the `key` selector returns for each one.
 *
 * The signature is generic over the item type, so it is not limited to spans.
 *
 * @param items - Items to group.
 * @param key - Selector producing the string or number group key for an item.
 * @returns Map from each key to an array of items.
 */
|
|
17
|
+
declare function groupBy<T, K extends string | number>(items: T[], key: (t: T) => K): Map<K, T[]>;
|
|
18
|
+
/**
 * Hash tool arguments to a string that is stable regardless of object key
 * order, for de-duplication.
 *
 * @param args - Tool arguments; typed `unknown`, so the signature accepts any value.
 * @returns The hash string.
 */
|
|
19
|
+
declare function argHash(args: unknown): string;
|
|
20
|
+
/**
 * Sum an array of LLM spans into aggregate token counts and cost.
 *
 * @param spans - LLM spans to aggregate.
 * @returns Totals for input, output and cached tokens, plus `costUsd`.
 */
|
|
21
|
+
declare function aggregateLlm(spans: LlmSpan[]): {
|
|
22
|
+
inputTokens: number;
|
|
23
|
+
outputTokens: number;
|
|
24
|
+
cachedTokens: number;
|
|
25
|
+
costUsd: number;
|
|
26
|
+
};
|
|
27
|
+
/**
 * Pick the outcome's failure class when present, else derive 'success' from run status.
 *
 * @param run - Run to classify.
 * @returns The `FailureClass` for `run`.
 *
 * NOTE(review): only the declaration is visible here — confirm how a run with
 * a non-success status but no failure class on its outcome is classified.
 */
|
|
28
|
+
declare function runFailureClass(run: Run): FailureClass;
|
|
29
|
+
|
|
30
|
+
export { aggregateLlm as a, argHash as b, runsForScenario as c, groupBy as g, judgeSpans as j, llmSpans as l, runFailureClass as r, toolSpans as t };
|