armature-agent-eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/golden.d.ts +7 -0
- package/dist/golden.d.ts.map +1 -0
- package/dist/golden.js +63 -0
- package/dist/golden.js.map +1 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/runner.d.ts +10 -0
- package/dist/runner.d.ts.map +1 -0
- package/dist/runner.js +32 -0
- package/dist/runner.js.map +1 -0
- package/dist/toolValidation.d.ts +7 -0
- package/dist/toolValidation.d.ts.map +1 -0
- package/dist/toolValidation.js +56 -0
- package/dist/toolValidation.js.map +1 -0
- package/dist/trajectory.d.ts +14 -0
- package/dist/trajectory.d.ts.map +1 -0
- package/dist/trajectory.js +80 -0
- package/dist/trajectory.js.map +1 -0
- package/dist/types.d.ts +81 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +23 -0
package/dist/golden.d.ts
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { AgentMessage } from '@armature/core';
|
|
2
|
+
import type { GoldenTranscriptOptions, GoldenTranscriptResult } from './types.js';
|
|
3
|
+
/**
|
|
4
|
+
* Compare a transcript against a golden reference transcript.
|
|
5
|
+
*/
|
|
6
|
+
export declare function compareGoldenTranscript(actual: AgentMessage[], expected: AgentMessage[], options?: GoldenTranscriptOptions): GoldenTranscriptResult;
|
|
7
|
+
//# sourceMappingURL=golden.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"golden.d.ts","sourceRoot":"","sources":["../src/golden.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAA;AACxD,OAAO,KAAK,EAAc,uBAAuB,EAAE,sBAAsB,EAAE,MAAM,YAAY,CAAA;AA6B7F;;GAEG;AACH,wBAAgB,uBAAuB,CACrC,MAAM,EAAE,YAAY,EAAE,EACtB,QAAQ,EAAE,YAAY,EAAE,EACxB,OAAO,GAAE,uBAA4B,GACpC,sBAAsB,CAmCxB"}
|
package/dist/golden.js
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { collectToolCalls } from '@open-agent-sdk/core';
|
|
2
|
+
/**
 * Reduce one transcript message to a comparable string of the form
 * `<role>:<content>`.
 *
 * When `options.toolNamesOnly` is set and `collectToolCalls` reports at least
 * one tool call for the message, the result is `<role>:tools:[n1,n2,...]` and
 * the message parts are not inspected. Otherwise every part is summarised
 * (text verbatim, `call:<tool name>`, `result:<tool call id>`, `reasoning`,
 * `attachment:<media type>`, or an empty string for any other part type) and
 * the summaries are joined with `|`.
 *
 * @param msg - Transcript message; `role` and, when present, `parts` are read.
 * @param options - Comparison options (`toolNamesOnly`, `ignoreWhitespace`).
 * @returns The fingerprint string.
 */
function messageToFingerprint(msg, options) {
    if (options.toolNamesOnly) {
        const toolCalls = collectToolCalls(msg);
        if (toolCalls.length > 0) {
            const toolNames = toolCalls.map(call => call.name);
            return `${msg.role}:tools:[${toolNames.join(',')}]`;
        }
    }

    // One short token per message part; unknown part types contribute ''.
    const describePart = part => {
        switch (part.type) {
            case 'text':
                return part.text;
            case 'tool_call':
                return `call:${part.toolCall.name}`;
            case 'tool_result':
                return `result:${part.toolResult.toolCallId}`;
            case 'reasoning':
                return `reasoning`;
            case 'attachment':
                return `attachment:${part.mediaType}`;
            default:
                return '';
        }
    };

    // A message without a `parts` property, or with a nullish one, has no parts.
    const messageParts = ('parts' in msg && msg.parts != null) ? msg.parts : [];
    let summary = messageParts.map(part => describePart(part)).join('|');
    if (options.ignoreWhitespace) {
        // Collapse whitespace runs to a single space and drop leading/trailing space.
        summary = summary.replace(/\s+/g, ' ').trim();
    }
    return `${msg.role}:${summary}`;
}
|
|
28
|
+
/**
 * Compare a transcript against a golden reference transcript.
 *
 * Messages whose role is listed in `options.ignoreRoles` are dropped from both
 * sides, every remaining message is reduced to a fingerprint string, and the
 * two fingerprint lists are compared position by position.
 *
 * @param actual - Transcript produced by the run under test.
 * @param expected - Golden reference transcript.
 * @param options - Comparison options; defaults to `{}`.
 * @returns An object with `passed` (true when there are no differences),
 *   `similarity` (matching positions divided by the length of the longer
 *   list, or 1 when both lists are empty) and the per-position `diffs`.
 */
export function compareGoldenTranscript(actual, expected, options = {}) {
    const skippedRoles = new Set(options.ignoreRoles ?? []);
    const fingerprintsOf = messages => messages
        .filter(message => !skippedRoles.has(message.role))
        .map(message => messageToFingerprint(message, options));

    const got = fingerprintsOf(actual);
    const want = fingerprintsOf(expected);
    const positions = Math.max(got.length, want.length);

    const diffs = [];
    for (let index = 0; index < positions; index++) {
        const actualFp = got[index];
        const expectedFp = want[index];
        if (actualFp === undefined) {
            // The actual transcript ended before the golden one.
            diffs.push({ index, type: 'missing', expected: expectedFp });
            continue;
        }
        if (expectedFp === undefined) {
            // The actual transcript has messages past the end of the golden one.
            diffs.push({ index, type: 'extra', actual: actualFp });
            continue;
        }
        if (actualFp !== expectedFp) {
            diffs.push({ index, type: 'mismatch', expected: expectedFp, actual: actualFp });
        }
    }

    // Every position is either a diff or a match.
    const matched = positions - diffs.length;
    const similarity = positions > 0 ? matched / positions : 1;
    return { passed: diffs.length === 0, similarity, diffs };
}
|
|
63
|
+
//# sourceMappingURL=golden.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"golden.js","sourceRoot":"","sources":["../src/golden.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAA;AAIvD,SAAS,oBAAoB,CAC3B,GAAiB,EACjB,OAAgC;IAEhC,IAAI,OAAO,CAAC,aAAa,EAAE,CAAC;QAC1B,MAAM,KAAK,GAAG,gBAAgB,CAAC,GAAG,CAAC,CAAA;QACnC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACrB,OAAO,GAAG,GAAG,CAAC,IAAI,WAAW,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAA;QAClE,CAAC;IACH,CAAC;IAED,MAAM,KAAK,GAAG,OAAO,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAA;IACrD,MAAM,KAAK,GAAG,KAAK;SAChB,GAAG,CAAC,CAAC,CAAC,EAAE;QACP,IAAI,CAAC,CAAC,IAAI,KAAK,MAAM;YAAE,OAAO,CAAC,CAAC,IAAI,CAAA;QACpC,IAAI,CAAC,CAAC,IAAI,KAAK,WAAW;YAAE,OAAO,QAAQ,CAAC,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAA;QAC5D,IAAI,CAAC,CAAC,IAAI,KAAK,aAAa;YAAE,OAAO,UAAU,CAAC,CAAC,UAAU,CAAC,UAAU,EAAE,CAAA;QACxE,IAAI,CAAC,CAAC,IAAI,KAAK,WAAW;YAAE,OAAO,WAAW,CAAA;QAC9C,IAAI,CAAC,CAAC,IAAI,KAAK,YAAY;YAAE,OAAO,cAAc,CAAC,CAAC,SAAS,EAAE,CAAA;QAC/D,OAAO,EAAE,CAAA;IACX,CAAC,CAAC;SACD,IAAI,CAAC,GAAG,CAAC,CAAA;IAEZ,MAAM,OAAO,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,KAAK,CAAA;IACpF,OAAO,GAAG,GAAG,CAAC,IAAI,IAAI,OAAO,EAAE,CAAA;AACjC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,uBAAuB,CACrC,MAAsB,EACtB,QAAwB,EACxB,UAAmC,EAAE;IAErC,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,WAAW,IAAI,EAAE,CAAC,CAAA;IAEtD,MAAM,cAAc,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,IAA0B,CAAC,CAAC,CAAA;IACzF,MAAM,gBAAgB,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,IAA0B,CAAC,CAAC,CAAA;IAE7F,MAAM,SAAS,GAAG,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,oBAAoB,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAA;IAC3E,MAAM,WAAW,GAAG,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,oBAAoB,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAA;IAE/E,MAAM,KAAK,GAAiB,EAAE,CAAA;IAC9B,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,EAAE,WAAW,CAAC,MAAM,CAAC,CAAA;IAC7D,IAAI,OAAO,GAAG,CAAC,C
AAA;IAEf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAChC,MAAM,CAAC,GAAG,SAAS,CAAC,CAAC,CAAC,CAAA;QACtB,MAAM,CAAC,GAAG,WAAW,CAAC,CAAC,CAAC,CAAA;QAExB,IAAI,CAAC,KAAK,SAAS,EAAE,CAAC;YACpB,KAAK,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC,CAAA;QACxD,CAAC;aAAM,IAAI,CAAC,KAAK,SAAS,EAAE,CAAC;YAC3B,KAAK,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC,CAAA;QACpD,CAAC;aAAM,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;YACnB,KAAK,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,UAAU,EAAE,QAAQ,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC,CAAA;QACpE,CAAC;aAAM,CAAC;YACN,OAAO,EAAE,CAAA;QACX,CAAC;IACH,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAA;IAEpD,OAAO;QACL,MAAM,EAAE,KAAK,CAAC,MAAM,KAAK,CAAC;QAC1B,UAAU;QACV,KAAK;KACN,CAAA;AACH,CAAC"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export { assertTrajectory, extractToolCallSequence } from './trajectory.js';
|
|
2
|
+
export { validateToolConstraints } from './toolValidation.js';
|
|
3
|
+
export { compareGoldenTranscript } from './golden.js';
|
|
4
|
+
export { runEval, runEvalSuite } from './runner.js';
|
|
5
|
+
export type { EvalCase, EvalResult, GoldenDiff, GoldenTranscriptOptions, GoldenTranscriptResult, ToolCallConstraint, ToolCallValidationResult, TrajectoryAssertion, TrajectoryResult, TrajectoryStep, } from './types.js';
|
|
6
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,uBAAuB,EAAE,MAAM,iBAAiB,CAAA;AAC3E,OAAO,EAAE,uBAAuB,EAAE,MAAM,qBAAqB,CAAA;AAC7D,OAAO,EAAE,uBAAuB,EAAE,MAAM,aAAa,CAAA;AACrD,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAA;AAEnD,YAAY,EACV,QAAQ,EACR,UAAU,EACV,UAAU,EACV,uBAAuB,EACvB,sBAAsB,EACtB,kBAAkB,EAClB,wBAAwB,EACxB,mBAAmB,EACnB,gBAAgB,EAChB,cAAc,GACf,MAAM,YAAY,CAAA"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
export { assertTrajectory, extractToolCallSequence } from './trajectory.js';
|
|
2
|
+
export { validateToolConstraints } from './toolValidation.js';
|
|
3
|
+
export { compareGoldenTranscript } from './golden.js';
|
|
4
|
+
export { runEval, runEvalSuite } from './runner.js';
|
|
5
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,uBAAuB,EAAE,MAAM,iBAAiB,CAAA;AAC3E,OAAO,EAAE,uBAAuB,EAAE,MAAM,qBAAqB,CAAA;AAC7D,OAAO,EAAE,uBAAuB,EAAE,MAAM,aAAa,CAAA;AACrD,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAA"}
|
package/dist/runner.d.ts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { EvalCase, EvalResult } from './types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Run a full eval case against a session transcript.
|
|
4
|
+
*/
|
|
5
|
+
export declare function runEval(evalCase: EvalCase): EvalResult;
|
|
6
|
+
/**
|
|
7
|
+
* Run multiple eval cases and return all results.
|
|
8
|
+
*/
|
|
9
|
+
export declare function runEvalSuite(cases: EvalCase[]): EvalResult[];
|
|
10
|
+
//# sourceMappingURL=runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../src/runner.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,UAAU,EAAE,MAAM,YAAY,CAAA;AAKtD;;GAEG;AACH,wBAAgB,OAAO,CAAC,QAAQ,EAAE,QAAQ,GAAG,UAAU,CA6BtD;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,UAAU,EAAE,CAE5D"}
|
package/dist/runner.js
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import { assertTrajectory } from './trajectory.js';
|
|
2
|
+
import { validateToolConstraints } from './toolValidation.js';
|
|
3
|
+
import { compareGoldenTranscript } from './golden.js';
|
|
4
|
+
/**
 * Run a full eval case against a session transcript.
 *
 * Runs every trajectory assertion, the tool-call constraints (when given) and
 * the golden-transcript comparison (when a golden transcript is given) against
 * `evalCase.transcript`. Checks that are not configured count as passing.
 *
 * @param evalCase - Case definition: `name`, `transcript` and the optional
 *   `trajectories`, `toolConstraints`, `goldenTranscript`, `goldenOptions`.
 * @returns The case name, the overall verdict and the individual results;
 *   `goldenResult` is `undefined` when no golden transcript was supplied.
 */
export function runEval(evalCase) {
    const trajectoryResults = [];
    for (const assertion of evalCase.trajectories ?? []) {
        trajectoryResults.push(assertTrajectory(evalCase.transcript, assertion));
    }

    let toolValidationResults = [];
    if (evalCase.toolConstraints) {
        toolValidationResults = validateToolConstraints(evalCase.transcript, evalCase.toolConstraints);
    }

    let goldenResult;
    if (evalCase.goldenTranscript) {
        goldenResult = compareGoldenTranscript(evalCase.transcript, evalCase.goldenTranscript, evalCase.goldenOptions);
    }

    // All three groups must pass; later groups are consulted only while the
    // verdict is still positive.
    let passed = trajectoryResults.every(result => result.passed);
    if (passed) {
        passed = toolValidationResults.every(result => result.passed);
    }
    if (passed) {
        passed = goldenResult?.passed ?? true;
    }

    return {
        name: evalCase.name,
        passed,
        trajectoryResults,
        toolValidationResults,
        goldenResult,
    };
}
|
|
26
|
+
/**
 * Run multiple eval cases and return all results.
 *
 * @param cases - Eval cases to run, one call to `runEval` per case.
 * @returns One result per case, in the same order as `cases`.
 */
export function runEvalSuite(cases) {
    const results = [];
    for (const evalCase of cases) {
        results.push(runEval(evalCase));
    }
    return results;
}
|
|
32
|
+
//# sourceMappingURL=runner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../src/runner.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAA;AAClD,OAAO,EAAE,uBAAuB,EAAE,MAAM,qBAAqB,CAAA;AAC7D,OAAO,EAAE,uBAAuB,EAAE,MAAM,aAAa,CAAA;AAErD;;GAEG;AACH,MAAM,UAAU,OAAO,CAAC,QAAkB;IACxC,MAAM,iBAAiB,GAAG,CAAC,QAAQ,CAAC,YAAY,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,EAAE,CACtE,gBAAgB,CAAC,QAAQ,CAAC,UAAU,EAAE,SAAS,CAAC,CACjD,CAAA;IAED,MAAM,qBAAqB,GAAG,QAAQ,CAAC,eAAe;QACpD,CAAC,CAAC,uBAAuB,CAAC,QAAQ,CAAC,UAAU,EAAE,QAAQ,CAAC,eAAe,CAAC;QACxE,CAAC,CAAC,EAAE,CAAA;IAEN,MAAM,YAAY,GAAG,QAAQ,CAAC,gBAAgB;QAC5C,CAAC,CAAC,uBAAuB,CACrB,QAAQ,CAAC,UAAU,EACnB,QAAQ,CAAC,gBAAgB,EACzB,QAAQ,CAAC,aAAa,CACvB;QACH,CAAC,CAAC,SAAS,CAAA;IAEb,MAAM,MAAM,GACV,iBAAiB,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;QACtC,qBAAqB,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;QAC1C,CAAC,YAAY,EAAE,MAAM,IAAI,IAAI,CAAC,CAAA;IAEhC,OAAO;QACL,IAAI,EAAE,QAAQ,CAAC,IAAI;QACnB,MAAM;QACN,iBAAiB;QACjB,qBAAqB;QACrB,YAAY;KACb,CAAA;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,KAAiB;IAC5C,OAAO,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,CAAA;AAC3B,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { AgentMessage } from '@armature/core';
|
|
2
|
+
import type { ToolCallConstraint, ToolCallValidationResult } from './types.js';
|
|
3
|
+
/**
|
|
4
|
+
* Validate tool-call constraints against a session transcript.
|
|
5
|
+
*/
|
|
6
|
+
export declare function validateToolConstraints(transcript: AgentMessage[], constraints: ToolCallConstraint[]): ToolCallValidationResult[];
|
|
7
|
+
//# sourceMappingURL=toolValidation.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"toolValidation.d.ts","sourceRoot":"","sources":["../src/toolValidation.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAA;AACxD,OAAO,KAAK,EAAE,kBAAkB,EAAE,wBAAwB,EAAE,MAAM,YAAY,CAAA;AAE9E;;GAEG;AACH,wBAAgB,uBAAuB,CACrC,UAAU,EAAE,YAAY,EAAE,EAC1B,WAAW,EAAE,kBAAkB,EAAE,GAChC,wBAAwB,EAAE,CAqE5B"}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { collectToolCalls } from '@open-agent-sdk/core';
|
|
2
|
+
/**
 * Structural equality for tool-input values.
 *
 * Primitives (and identical references) are compared with `===`; arrays and
 * plain objects are compared key by key, recursively. Cyclic values are not
 * supported.
 */
function toolInputValuesEqual(left, right) {
    if (left === right) {
        return true;
    }
    if (left === null || right === null || typeof left !== 'object' || typeof right !== 'object') {
        return false;
    }
    if (Array.isArray(left) !== Array.isArray(right)) {
        return false;
    }
    const leftKeys = Object.keys(left);
    if (leftKeys.length !== Object.keys(right).length) {
        return false;
    }
    return leftKeys.every(key => Object.prototype.hasOwnProperty.call(right, key) &&
        toolInputValuesEqual(left[key], right[key]));
}
/**
 * Validate tool-call constraints against a session transcript.
 *
 * All tool calls in the transcript are collected via `collectToolCalls`; each
 * constraint is then checked against the calls whose name equals its
 * `toolName`:
 *  - `forbidden`: any matching call is a violation.
 *  - `maxCalls`: more matching calls than `maxCalls` is a violation.
 *  - `requiredInput`: every matching call must carry each listed key with an
 *    equal value; a call whose input is not an object is a violation.
 *  - `forbiddenInput`: no matching call may carry any listed key with an equal
 *    value; calls whose input is not an object are skipped.
 *
 * Values are compared structurally, so object and array values in
 * `requiredInput` / `forbiddenInput` match calls that carry an equal (not
 * necessarily identical) object or array.
 *
 * @param transcript - Session transcript to inspect.
 * @param constraints - Constraints to evaluate.
 * @returns One result per constraint, in order, with the violation messages;
 *   `passed` is true when there are none. Call numbers in messages are
 *   zero-based indices among the calls to that tool.
 */
export function validateToolConstraints(transcript, constraints) {
    const allCalls = transcript.flatMap(msg => collectToolCalls(msg).map(tc => ({ name: tc.name, input: tc.input })));
    return constraints.map(constraint => {
        const violations = [];
        const matchingCalls = allCalls.filter(c => c.name === constraint.toolName);
        // Forbidden check
        if (constraint.forbidden && matchingCalls.length > 0) {
            violations.push(`Tool "${constraint.toolName}" was called ${matchingCalls.length} time(s) but is forbidden`);
        }
        // Max calls check
        if (constraint.maxCalls !== undefined &&
            matchingCalls.length > constraint.maxCalls) {
            violations.push(`Tool "${constraint.toolName}" was called ${matchingCalls.length} time(s), exceeds max ${constraint.maxCalls}`);
        }
        // Required input check
        if (constraint.requiredInput) {
            const requiredEntries = Object.entries(constraint.requiredInput);
            matchingCalls.forEach((call, i) => {
                const input = call.input;
                if (!input || typeof input !== 'object') {
                    violations.push(`Tool "${constraint.toolName}" call #${i} has non-object input`);
                    return;
                }
                for (const [key, value] of requiredEntries) {
                    // Structural comparison: `!==` would reject every object/array value.
                    if (!toolInputValuesEqual(input[key], value)) {
                        violations.push(`Tool "${constraint.toolName}" call #${i} missing required input ${key}=${JSON.stringify(value)}`);
                    }
                }
            });
        }
        // Forbidden input check (reported after all required-input violations)
        if (constraint.forbiddenInput) {
            const forbiddenEntries = Object.entries(constraint.forbiddenInput);
            matchingCalls.forEach((call, i) => {
                const input = call.input;
                if (!input || typeof input !== 'object') {
                    return;
                }
                for (const [key, value] of forbiddenEntries) {
                    // Structural comparison: `===` would never detect an object/array value.
                    if (toolInputValuesEqual(input[key], value)) {
                        violations.push(`Tool "${constraint.toolName}" call #${i} contains forbidden input ${key}=${JSON.stringify(value)}`);
                    }
                }
            });
        }
        return {
            toolName: constraint.toolName,
            passed: violations.length === 0,
            violations,
        };
    });
}
|
|
56
|
+
//# sourceMappingURL=toolValidation.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"toolValidation.js","sourceRoot":"","sources":["../src/toolValidation.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAA;AAIvD;;GAEG;AACH,MAAM,UAAU,uBAAuB,CACrC,UAA0B,EAC1B,WAAiC;IAEjC,MAAM,QAAQ,GAAG,UAAU,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CACxC,gBAAgB,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC,KAAK,EAAE,CAAC,CAAC,CACtE,CAAA;IAED,OAAO,WAAW,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE;QAClC,MAAM,UAAU,GAAa,EAAE,CAAA;QAC/B,MAAM,aAAa,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,UAAU,CAAC,QAAQ,CAAC,CAAA;QAE1E,kBAAkB;QAClB,IAAI,UAAU,CAAC,SAAS,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACrD,UAAU,CAAC,IAAI,CACb,SAAS,UAAU,CAAC,QAAQ,gBAAgB,aAAa,CAAC,MAAM,2BAA2B,CAC5F,CAAA;QACH,CAAC;QAED,kBAAkB;QAClB,IACE,UAAU,CAAC,QAAQ,KAAK,SAAS;YACjC,aAAa,CAAC,MAAM,GAAG,UAAU,CAAC,QAAQ,EAC1C,CAAC;YACD,UAAU,CAAC,IAAI,CACb,SAAS,UAAU,CAAC,QAAQ,gBAAgB,aAAa,CAAC,MAAM,yBAAyB,UAAU,CAAC,QAAQ,EAAE,CAC/G,CAAA;QACH,CAAC;QAED,uBAAuB;QACvB,IAAI,UAAU,CAAC,aAAa,EAAE,CAAC;YAC7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,aAAa,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC9C,MAAM,IAAI,GAAG,aAAa,CAAC,CAAC,CAAC,CAAA;gBAC7B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAuC,CAAA;gBAC1D,IAAI,CAAC,KAAK,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;oBACxC,UAAU,CAAC,IAAI,CACb,SAAS,UAAU,CAAC,QAAQ,WAAW,CAAC,uBAAuB,CAChE,CAAA;oBACD,SAAQ;gBACV,CAAC;gBACD,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,UAAU,CAAC,aAAa,CAAC,EAAE,CAAC;oBACpE,IAAI,KAAK,CAAC,GAAG,CAAC,KAAK,KAAK,EAAE,CAAC;wBACzB,UAAU,CAAC,IAAI,CACb,SAAS,UAAU,CAAC,QAAQ,WAAW,CAAC,2BAA2B,GAAG,IAAI,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,EAAE,CAClG,CAAA;oBACH,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAED,wBAAwB;QACxB,IAAI,UAAU,CAAC,cAAc,EAAE,CAAC;YAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,aAAa,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC9C,MAAM,IAAI,GAAG,aAAa,CAAC,CAAC,CAAC,CAAA;gBAC7B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAuC,CAAA;gBAC1D,IAAI,CAAC,KAAK,IAAI,OAAO,KAAK,KAAK,QAAQ;oBAAE,SAAQ;gBACjD,KAAK,MAAM,CAAC,GAAG,E
AAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,UAAU,CAAC,cAAc,CAAC,EAAE,CAAC;oBACrE,IAAI,KAAK,CAAC,GAAG,CAAC,KAAK,KAAK,EAAE,CAAC;wBACzB,UAAU,CAAC,IAAI,CACb,SAAS,UAAU,CAAC,QAAQ,WAAW,CAAC,6BAA6B,GAAG,IAAI,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,EAAE,CACpG,CAAA;oBACH,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO;YACL,QAAQ,EAAE,UAAU,CAAC,QAAQ;YAC7B,MAAM,EAAE,UAAU,CAAC,MAAM,KAAK,CAAC;YAC/B,UAAU;SACX,CAAA;IACH,CAAC,CAAC,CAAA;AACJ,CAAC"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { AgentMessage } from '@armature/core';
|
|
2
|
+
import type { TrajectoryAssertion, TrajectoryResult } from './types.js';
|
|
3
|
+
/**
|
|
4
|
+
* Extract the ordered list of tool calls (name and input) from a session transcript.
|
|
5
|
+
*/
|
|
6
|
+
export declare function extractToolCallSequence(transcript: AgentMessage[]): Array<{
|
|
7
|
+
name: string;
|
|
8
|
+
input: unknown;
|
|
9
|
+
}>;
|
|
10
|
+
/**
|
|
11
|
+
* Assert that a session transcript matches an expected trajectory.
|
|
12
|
+
*/
|
|
13
|
+
export declare function assertTrajectory(transcript: AgentMessage[], assertion: TrajectoryAssertion): TrajectoryResult;
|
|
14
|
+
//# sourceMappingURL=trajectory.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"trajectory.d.ts","sourceRoot":"","sources":["../src/trajectory.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAA;AACxD,OAAO,KAAK,EAAE,mBAAmB,EAAE,gBAAgB,EAAkB,MAAM,YAAY,CAAA;AAEvF;;GAEG;AACH,wBAAgB,uBAAuB,CACrC,UAAU,EAAE,YAAY,EAAE,GACzB,KAAK,CAAC;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,OAAO,CAAA;CAAE,CAAC,CAIzC;AAgBD;;GAEG;AACH,wBAAgB,gBAAgB,CAC9B,UAAU,EAAE,YAAY,EAAE,EAC1B,SAAS,EAAE,mBAAmB,GAC7B,gBAAgB,CAgElB"}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { collectToolCalls } from '@open-agent-sdk/core';
|
|
2
|
+
/**
 * Extract the ordered list of tool calls from a session transcript.
 *
 * Messages are visited in transcript order and, within a message, tool calls
 * are kept in the order `collectToolCalls` returns them.
 *
 * @param transcript - Session transcript to scan.
 * @returns One `{ name, input }` entry per tool call.
 */
export function extractToolCallSequence(transcript) {
    const sequence = [];
    for (const message of transcript) {
        for (const toolCall of collectToolCalls(message)) {
            sequence.push({ name: toolCall.name, input: toolCall.input });
        }
    }
    return sequence;
}
|
|
8
|
+
/**
 * Partial, recursive match of a tool input against a pattern.
 *
 * Every key of `pattern` must be satisfied by `actual`: a non-null object
 * value in the pattern is matched recursively against the corresponding
 * value, any other value must be strictly equal (`===`). Keys of `actual`
 * that the pattern does not mention are ignored.
 *
 * @param actual - Value to test; anything other than a non-null object never matches.
 * @param pattern - Object whose entries must all be satisfied.
 * @returns `true` when every pattern entry is satisfied.
 */
function inputMatches(actual, pattern) {
    const isObject = candidate => candidate !== null && typeof candidate === 'object';
    if (!isObject(actual)) {
        return false;
    }
    for (const [key, expected] of Object.entries(pattern)) {
        const found = actual[key];
        const satisfied = isObject(expected)
            ? inputMatches(found, expected)
            : found === expected;
        if (!satisfied) {
            return false;
        }
    }
    return true;
}
|
|
19
|
+
/**
 * Assert that a session transcript matches an expected trajectory.
 *
 * The transcript is flattened with `extractToolCallSequence`, then every step
 * of `assertion.steps` is matched against a call with the same tool name and,
 * when the step has `inputMatch`, an input accepted by `inputMatches`.
 *
 * - Ordered mode (the default; disabled only by `ordered: false`): each step
 *   takes the first acceptable call after the previously matched one. Calls in
 *   between are skipped, and an unmatched step leaves the position unchanged.
 * - Unordered mode: each step takes the first acceptable call that no earlier
 *   step has taken.
 *
 * Matching is greedy: a step never gives up a call in favour of a later step.
 *
 * @param transcript - Session transcript to check.
 * @param assertion - Named list of expected steps plus the `ordered` flag.
 * @returns `passed` is true when no required step (`required !== false`) is
 *   missing. `matchedSteps` counts every matched step, optional ones included,
 *   while `totalSteps` counts only required steps, so `matchedSteps` can exceed
 *   `totalSteps`. `extraCalls` lists the names of calls no step matched and
 *   does not affect `passed`.
 */
export function assertTrajectory(transcript, assertion) {
    const calls = extractToolCallSequence(transcript);
    const inOrder = assertion.ordered !== false;

    const consumed = new Set();
    const missingSteps = [];
    let matchedSteps = 0;
    // Ordered mode only: index of the first call a step may still take.
    let cursor = 0;

    const accepts = (call, step) => call.name === step.toolName &&
        (!step.inputMatch || inputMatches(call.input, step.inputMatch));

    for (const step of assertion.steps) {
        let hit = -1;
        // In ordered mode no index at or after `cursor` has been consumed, so
        // the `consumed` test only ever filters in unordered mode.
        for (let i = inOrder ? cursor : 0; i < calls.length; i++) {
            if (!consumed.has(i) && accepts(calls[i], step)) {
                hit = i;
                break;
            }
        }
        if (hit === -1) {
            if (step.required !== false) {
                missingSteps.push(step);
            }
            continue;
        }
        consumed.add(hit);
        matchedSteps += 1;
        if (inOrder) {
            cursor = hit + 1;
        }
    }

    const extraCalls = [];
    calls.forEach((call, i) => {
        if (!consumed.has(i)) {
            extraCalls.push(call.name);
        }
    });

    let totalSteps = 0;
    for (const step of assertion.steps) {
        if (step.required !== false) {
            totalSteps += 1;
        }
    }

    return {
        name: assertion.name,
        passed: missingSteps.length === 0,
        matchedSteps,
        totalSteps,
        missingSteps,
        extraCalls,
    };
}
|
|
80
|
+
//# sourceMappingURL=trajectory.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"trajectory.js","sourceRoot":"","sources":["../src/trajectory.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAA;AAIvD;;GAEG;AACH,MAAM,UAAU,uBAAuB,CACrC,UAA0B;IAE1B,OAAO,UAAU,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAC9B,gBAAgB,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC,KAAK,EAAE,CAAC,CAAC,CACtE,CAAA;AACH,CAAC;AAED,SAAS,YAAY,CACnB,MAAe,EACf,OAAgC;IAEhC,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,KAAK,IAAI;QAAE,OAAO,KAAK,CAAA;IAC/D,MAAM,GAAG,GAAG,MAAiC,CAAA;IAC7C,OAAO,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,GAAG,EAAE,KAAK,CAAC,EAAE,EAAE;QACpD,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;YAChD,OAAO,YAAY,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,KAAgC,CAAC,CAAA;QACjE,CAAC;QACD,OAAO,GAAG,CAAC,GAAG,CAAC,KAAK,KAAK,CAAA;IAC3B,CAAC,CAAC,CAAA;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,gBAAgB,CAC9B,UAA0B,EAC1B,SAA8B;IAE9B,MAAM,KAAK,GAAG,uBAAuB,CAAC,UAAU,CAAC,CAAA;IACjD,MAAM,OAAO,GAAG,SAAS,CAAC,OAAO,KAAK,KAAK,CAAA;IAE3C,MAAM,YAAY,GAAqB,EAAE,CAAA;IACzC,MAAM,cAAc,GAAG,IAAI,GAAG,EAAU,CAAA;IACxC,IAAI,YAAY,GAAG,CAAC,CAAA;IAEpB,IAAI,OAAO,EAAE,CAAC;QACZ,IAAI,OAAO,GAAG,CAAC,CAAA;QACf,KAAK,MAAM,IAAI,IAAI,SAAS,CAAC,KAAK,EAAE,CAAC;YACnC,IAAI,KAAK,GAAG,KAAK,CAAA;YACjB,MAAM,QAAQ,GAAG,OAAO,CAAA;YACxB,OAAO,OAAO,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;gBAC9B,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,CAAC,CAAA;gBAC3B,IACE,IAAI,CAAC,IAAI,KAAK,IAAI,CAAC,QAAQ;oBAC3B,CAAC,CAAC,IAAI,CAAC,UAAU,IAAI,YAAY,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC,EAC/D,CAAC;oBACD,cAAc,CAAC,GAAG,CAAC,OAAO,CAAC,CAAA;oBAC3B,YAAY,EAAE,CAAA;oBACd,OAAO,EAAE,CAAA;oBACT,KAAK,GAAG,IAAI,CAAA;oBACZ,MAAK;gBACP,CAAC;gBACD,OAAO,EAAE,CAAA;YACX,CAAC;YACD,IAAI,CAAC,KAAK,EAAE,CAAC;gBACX,qDAAqD;gBACrD,OAAO,GAAG,QAAQ,CAAA;gBAClB,IAAI,IAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,CAAC;oBAC5B,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;gBACzB,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;SAAM,CAAC;QACN,KAAK,MAAM,IAAI,IAAI,SAAS,CAAC,KAAK,EAAE,CAAC;YACnC,MAAM,GAAG,GAAG,KAAK,CAAC,SAAS,CACzB,CAAC,IA
AI,EAAE,CAAC,EAAE,EAAE,CACV,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC;gBACtB,IAAI,CAAC,IAAI,KAAK,IAAI,CAAC,QAAQ;gBAC3B,CAAC,CAAC,IAAI,CAAC,UAAU,IAAI,YAAY,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC,CAClE,CAAA;YACD,IAAI,GAAG,IAAI,CAAC,EAAE,CAAC;gBACb,cAAc,CAAC,GAAG,CAAC,GAAG,CAAC,CAAA;gBACvB,YAAY,EAAE,CAAA;YAChB,CAAC;iBAAM,IAAI,IAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,CAAC;gBACnC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;YACzB,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,UAAU,GAAG,KAAK;SACrB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;SACxC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;IAEnB,OAAO;QACL,IAAI,EAAE,SAAS,CAAC,IAAI;QACpB,MAAM,EAAE,YAAY,CAAC,MAAM,KAAK,CAAC;QACjC,YAAY;QACZ,UAAU,EAAE,SAAS,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,KAAK,KAAK,CAAC,CAAC,MAAM;QACpE,YAAY;QACZ,UAAU;KACX,CAAA;AACH,CAAC"}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import type { AgentMessage } from '@armature/core';
|
|
2
|
+
export interface TrajectoryStep {
|
|
3
|
+
/** Expected tool name (e.g., 'Read', 'Bash', 'Edit') */
|
|
4
|
+
toolName: string;
|
|
5
|
+
/** Optional: partial match on tool input */
|
|
6
|
+
inputMatch?: Record<string, unknown>;
|
|
7
|
+
/** Optional: whether the step is required (default: true) */
|
|
8
|
+
required?: boolean;
|
|
9
|
+
}
|
|
10
|
+
export interface TrajectoryAssertion {
|
|
11
|
+
/** Human-readable name for this assertion */
|
|
12
|
+
name: string;
|
|
13
|
+
/** Expected sequence of tool calls */
|
|
14
|
+
steps: TrajectoryStep[];
|
|
15
|
+
/** Whether steps must be matched in the listed order (default: true); other calls may still occur between matched steps */
|
|
16
|
+
ordered?: boolean;
|
|
17
|
+
}
|
|
18
|
+
export interface TrajectoryResult {
|
|
19
|
+
name: string;
|
|
20
|
+
passed: boolean;
|
|
21
|
+
matchedSteps: number;
|
|
22
|
+
totalSteps: number;
|
|
23
|
+
missingSteps: TrajectoryStep[];
|
|
24
|
+
extraCalls: string[];
|
|
25
|
+
}
|
|
26
|
+
export interface ToolCallConstraint {
|
|
27
|
+
/** Tool name to constrain */
|
|
28
|
+
toolName: string;
|
|
29
|
+
/** Maximum number of times this tool may be called */
|
|
30
|
+
maxCalls?: number;
|
|
31
|
+
/** Tool must never be called */
|
|
32
|
+
forbidden?: boolean;
|
|
33
|
+
/** Partial input that must appear in every call to this tool */
|
|
34
|
+
requiredInput?: Record<string, unknown>;
|
|
35
|
+
/** Key/value pairs that must never appear in a call's input; each pair is checked independently */
|
|
36
|
+
forbiddenInput?: Record<string, unknown>;
|
|
37
|
+
}
|
|
38
|
+
export interface ToolCallValidationResult {
|
|
39
|
+
toolName: string;
|
|
40
|
+
passed: boolean;
|
|
41
|
+
violations: string[];
|
|
42
|
+
}
|
|
43
|
+
export interface GoldenTranscriptOptions {
|
|
44
|
+
/** Ignore whitespace differences in text content */
|
|
45
|
+
ignoreWhitespace?: boolean;
|
|
46
|
+
/** For messages that contain tool calls, compare only the tool names in order; messages without tool calls are still compared by content */
|
|
47
|
+
toolNamesOnly?: boolean;
|
|
48
|
+
/** Ignore specific message roles */
|
|
49
|
+
ignoreRoles?: Array<'system' | 'event'>;
|
|
50
|
+
}
|
|
51
|
+
export interface GoldenTranscriptResult {
|
|
52
|
+
passed: boolean;
|
|
53
|
+
similarity: number;
|
|
54
|
+
diffs: GoldenDiff[];
|
|
55
|
+
}
|
|
56
|
+
export interface GoldenDiff {
|
|
57
|
+
index: number;
|
|
58
|
+
type: 'missing' | 'extra' | 'mismatch';
|
|
59
|
+
expected?: string;
|
|
60
|
+
actual?: string;
|
|
61
|
+
}
|
|
62
|
+
export interface EvalCase {
|
|
63
|
+
name: string;
|
|
64
|
+
/** The session transcript to evaluate */
|
|
65
|
+
transcript: AgentMessage[];
|
|
66
|
+
/** Trajectory assertions to run */
|
|
67
|
+
trajectories?: TrajectoryAssertion[];
|
|
68
|
+
/** Tool-call constraints to validate */
|
|
69
|
+
toolConstraints?: ToolCallConstraint[];
|
|
70
|
+
/** Golden transcript to compare against */
|
|
71
|
+
goldenTranscript?: AgentMessage[];
|
|
72
|
+
goldenOptions?: GoldenTranscriptOptions;
|
|
73
|
+
}
|
|
74
|
+
export interface EvalResult {
|
|
75
|
+
name: string;
|
|
76
|
+
passed: boolean;
|
|
77
|
+
trajectoryResults: TrajectoryResult[];
|
|
78
|
+
toolValidationResults: ToolCallValidationResult[];
|
|
79
|
+
goldenResult?: GoldenTranscriptResult;
|
|
80
|
+
}
|
|
81
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAA;AAMxD,MAAM,WAAW,cAAc;IAC7B,wDAAwD;IACxD,QAAQ,EAAE,MAAM,CAAA;IAChB,4CAA4C;IAC5C,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;IACpC,6DAA6D;IAC7D,QAAQ,CAAC,EAAE,OAAO,CAAA;CACnB;AAED,MAAM,WAAW,mBAAmB;IAClC,6CAA6C;IAC7C,IAAI,EAAE,MAAM,CAAA;IACZ,sCAAsC;IACtC,KAAK,EAAE,cAAc,EAAE,CAAA;IACvB,qDAAqD;IACrD,OAAO,CAAC,EAAE,OAAO,CAAA;CAClB;AAED,MAAM,WAAW,gBAAgB;IAC/B,IAAI,EAAE,MAAM,CAAA;IACZ,MAAM,EAAE,OAAO,CAAA;IACf,YAAY,EAAE,MAAM,CAAA;IACpB,UAAU,EAAE,MAAM,CAAA;IAClB,YAAY,EAAE,cAAc,EAAE,CAAA;IAC9B,UAAU,EAAE,MAAM,EAAE,CAAA;CACrB;AAMD,MAAM,WAAW,kBAAkB;IACjC,6BAA6B;IAC7B,QAAQ,EAAE,MAAM,CAAA;IAChB,sDAAsD;IACtD,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,gCAAgC;IAChC,SAAS,CAAC,EAAE,OAAO,CAAA;IACnB,gEAAgE;IAChE,aAAa,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;IACvC,2CAA2C;IAC3C,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;CACzC;AAED,MAAM,WAAW,wBAAwB;IACvC,QAAQ,EAAE,MAAM,CAAA;IAChB,MAAM,EAAE,OAAO,CAAA;IACf,UAAU,EAAE,MAAM,EAAE,CAAA;CACrB;AAMD,MAAM,WAAW,uBAAuB;IACtC,oDAAoD;IACpD,gBAAgB,CAAC,EAAE,OAAO,CAAA;IAC1B,+DAA+D;IAC/D,aAAa,CAAC,EAAE,OAAO,CAAA;IACvB,oCAAoC;IACpC,WAAW,CAAC,EAAE,KAAK,CAAC,QAAQ,GAAG,OAAO,CAAC,CAAA;CACxC;AAED,MAAM,WAAW,sBAAsB;IACrC,MAAM,EAAE,OAAO,CAAA;IACf,UAAU,EAAE,MAAM,CAAA;IAClB,KAAK,EAAE,UAAU,EAAE,CAAA;CACpB;AAED,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,MAAM,CAAA;IACb,IAAI,EAAE,SAAS,GAAG,OAAO,GAAG,UAAU,CAAA;IACtC,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,MAAM,CAAC,EAAE,MAAM,CAAA;CAChB;AAMD,MAAM,WAAW,QAAQ;IACvB,IAAI,EAAE,MAAM,CAAA;IACZ,yCAAyC;IACzC,UAAU,EAAE,YAAY,EAAE,CAAA;IAC1B,mCAAmC;IACnC,YAAY,CAAC,EAAE,mBAAmB,EAAE,CAAA;IACpC,wCAAwC;IACxC,eAAe,CAAC,EAAE,kBAAkB,EAAE,CAAA;IACtC,2CAA2C;IAC3C,gBAAgB,CAAC,EAAE,YAAY,EAAE,CAAA;IACjC,aAAa,CAAC,EAAE,uBAAuB,CAAA;CACxC;AAED,MAAM,WAAW,UAAU;IACzB,IAAI,EAAE,MAAM,CAAA;IACZ,MAAM,EAAE,OAAO,CAAA;IACf,iBAAiB,EAAE,gBAAgB,EAAE,CAAA;IACrC,qBAAqB,EAAE,wBAAwB,EAAE,CAAA;IACjD,YAAY,CAAC,EAAE,sBAAsB,CAAA;CACtC"}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":""}
|
package/package.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "armature-agent-eval",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"main": "./dist/index.js",
|
|
6
|
+
"types": "./dist/index.d.ts",
|
|
7
|
+
"exports": {
|
|
8
|
+
".": {
|
|
9
|
+
"types": "./dist/index.d.ts",
|
|
10
|
+
"import": "./dist/index.js",
|
|
11
|
+
"default": "./dist/index.js"
|
|
12
|
+
}
|
|
13
|
+
},
|
|
14
|
+
"dependencies": {
|
|
15
|
+
"armature-agent-core": "^0.1.0"
|
|
16
|
+
},
|
|
17
|
+
"scripts": {
|
|
18
|
+
"build": "tsc -p tsconfig.json"
|
|
19
|
+
},
|
|
20
|
+
"files": [
|
|
21
|
+
"dist"
|
|
22
|
+
]
|
|
23
|
+
}
|