@tangle-network/agent-eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,58 @@
1
+ # @tangle-network/agent-eval
2
+
3
+ Domain-agnostic evaluation framework for Tangle agent apps. Multi-turn scenario execution, multi-judge scoring, agent-driver meta-testing, convergence tracking. Every agent (tax, legal, film, gtm) imports this to get a reproducible quality harness.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ npm install @tangle-network/agent-eval
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```ts
14
+ import { BenchmarkRunner, ProductClient, defaultJudges } from '@tangle-network/agent-eval'
15
+
16
+ const client = new ProductClient({
17
+ baseUrl: 'https://my-agent.tangle.tools',
18
+ routes: {
19
+ signup: '/api/auth/sign-up/email',
20
+ chat: '/api/chat',
21
+ // ...
22
+ },
23
+ })
24
+
25
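+ const runner = new BenchmarkRunner(tc, { // `tc` is a TCloud instance from @tangle-network/tcloud — the constructor takes TCloud, not the ProductClient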
26
+ scenarios: myScenarios,
27
+ judges: defaultJudges('film production'),
28
+ systemPrompt: MY_SYSTEM_PROMPT,
29
+ })
30
+
31
+ const report = await runner.run()
32
+ ```
33
+
34
+ ## What's in the box
35
+
36
+ - **ProductClient** — configurable HTTP client (routes are config, not code)
37
+ - **ScenarioRegistry** — auto-discovery + filtering
38
+ - **executeScenario** — multi-turn executor with artifact collection
39
+ - **BenchmarkRunner** — orchestrates scenarios + judges + scoring
40
+ - **AgentDriver** — meta-agent that plays personas against a real product
41
+ - **MetricsCollector** — per-turn product state metrics
42
+ - **ConvergenceTracker** — completion% over turns
43
+ - **Reporter** — markdown + console output (`formatBenchmarkReport`, `formatDriverReport`, `printDriverSummary`)
44
+ - **Judges** — 4 built-in (domain expert, code execution, coherence, adversarial) + `createCustomJudge` factory
45
+
46
+ ## Tier
47
+
48
+ Marketplace tier of the [agent-builder](https://github.com/drewstone/tangle-agent-builder) three-tier architecture. Uses [`@tangle-network/tcloud`](https://github.com/tangle-network/tcloud) for judge LLM calls.
49
+
50
+ ## Related
51
+
52
+ - [`@tangle-network/agent-gateway`](https://github.com/tangle-network/agent-gateway) — the gateway that agents are published through
53
+ - [`@tangle-network/agent-client`](https://github.com/tangle-network/agent-client) — consumer SDK for those endpoints
54
+ - [`@tangle-network/tcloud`](https://github.com/tangle-network/tcloud) — platform SDK (used internally by judges)
55
+
56
+ ## License
57
+
58
+ MIT
@@ -0,0 +1,579 @@
1
+ import { TCloud } from '@tangle-network/tcloud';
2
+
3
+ interface Scenario { // Scenario shape accepted by executeScenario, BenchmarkRunnerConfig.scenarios, BenchmarkRunner.run and ScenarioRegistry.register; also what ScenarioRegistry.all/byCategory/byPersona/byId return.
4
+ id: string;
5
+ persona: string;
6
+ label: string;
7
+ thesis: string;
8
+ dimensions: string[];
9
+ turns: Turn[];
10
+ artifactChecks: ArtifactCheck[];
11
+ systemPromptAppend?: string; // Optional. Presumably appended to the configured system prompt — TODO confirm against the executor.
12
+ }
13
+ interface Turn { // Element type of Scenario.turns and ScenarioFile.turns.
14
+ user: string; // Presumably the user message for this turn (cf. TurnResult.userMessage) — TODO confirm.
15
+ expectedBehaviors: string[];
16
+ adversarial?: boolean;
17
+ feedbackType?: 'correction' | 'rejection' | 'vague' | 'contradictory' | 'escalation'; // Optional; closed set of five string literals.
18
+ }
19
+ interface ArtifactCheck { // Element type of Scenario.artifactChecks and ScenarioFile.artifactChecks; also the `check` field of ArtifactResult.
20
+ type: 'vault_file_exists' | 'vault_file_contains' | 'block_extracted' | 'code_valid' | 'generation_produced' | 'tool_created' | string; // NOTE(review): the trailing `| string` absorbs the literal members, so the compiler treats this as plain `string`; the literals only document the known kinds.
21
+ target: string;
22
+ contains?: string;
23
+ minCount?: number;
24
+ description: string;
25
+ }
26
+ interface JudgeConfig { // Exported as a type, but not referenced by any other declaration in this file.
27
+ model: string;
28
+ temperature: number;
29
+ rubric: JudgeRubric;
30
+ }
31
+ interface JudgeRubric { // Type of JudgeConfig.rubric.
32
+ name: string;
33
+ description: string;
34
+ dimensions: RubricDimension[];
35
+ }
36
+ interface RubricDimension { // Element type of JudgeRubric.dimensions.
37
+ name: string;
38
+ description: string;
39
+ anchor_low: string; // Note: snake_case, unlike the camelCase fields used elsewhere in this file.
40
+ anchor_high: string;
41
+ weight: number;
42
+ }
43
+ interface ScenarioResult { // Resolved value of executeScenario; element type of BenchmarkReport.results.
44
+ scenarioId: string;
45
+ persona: string;
46
+ turns: TurnResult[];
47
+ artifactResults: ArtifactResult[];
48
+ judgeScores: JudgeScore[];
49
+ judgeErrors: number; // Presumably the number of judge invocations that failed — TODO confirm.
50
+ overallScore: number;
51
+ totalDurationMs: number;
52
+ artifacts: CollectedArtifacts;
53
+ }
54
+ interface TurnResult { // Element type of ScenarioResult.turns and JudgeInput.turns.
55
+ turnIndex: number;
56
+ userMessage: string;
57
+ agentResponse: string;
58
+ durationMs: number;
59
+ blocksExtracted: { // Same { type, title } element shape as the `blocks` array returned by ProductClient.chat.
60
+ type: string;
61
+ title: string;
62
+ }[];
63
+ containsCode: boolean;
64
+ containsToolCall: boolean;
65
+ }
66
+ interface ArtifactResult { // Element type of ScenarioResult.artifactResults: one ArtifactCheck paired with its outcome.
67
+ check: ArtifactCheck;
68
+ passed: boolean;
69
+ detail?: string;
70
+ }
71
+ interface JudgeScore { // Element type a JudgeFn resolves to; stored in ScenarioResult.judgeScores; input and output of normalizeScores; interRaterReliability takes JudgeScore[][].
72
+ judgeName: string;
73
+ dimension: string;
74
+ score: number;
75
+ reasoning: string;
76
+ evidence?: string;
77
+ }
78
+ interface CollectedArtifacts { // Type of ScenarioResult.artifacts and JudgeInput.artifacts; second argument of ExecutorConfig.artifactChecker.
79
+ vaultFiles: {
80
+ path: string;
81
+ content: string;
82
+ }[];
83
+ blocksExtracted: { // Note: { type, fields } here, whereas TurnResult.blocksExtracted elements are { type, title }.
84
+ type: string;
85
+ fields: Record<string, string>;
86
+ }[];
87
+ codeBlocks: {
88
+ language: string;
89
+ code: string;
90
+ }[];
91
+ toolCalls: string[];
92
+ }
93
+ interface BenchmarkReport { // Resolved value of BenchmarkRunner.run; input of formatBenchmarkReport.
94
+ timestamp: string;
95
+ generation: number; // Required here; BenchmarkRunnerConfig.generation is optional — how a missing value is defaulted is not visible in this file.
96
+ promptVersion: string; // Required here; BenchmarkRunnerConfig.promptVersion is optional.
97
+ scenarioCount: number;
98
+ results: ScenarioResult[];
99
+ summary: {
100
+ overallAvg: number;
101
+ byPersona: Record<string, { // Keyed by string; presumably the persona value — TODO confirm.
102
+ avg: number;
103
+ passed: number; // Presumably counted against BenchmarkRunnerConfig.passThreshold — TODO confirm.
104
+ total: number;
105
+ }>;
106
+ byDimension: Record<string, {
107
+ avg: number;
108
+ scores: number[];
109
+ }>;
110
+ weakest: { // Same element shape as `strongest`.
111
+ scenario: string;
112
+ score: number;
113
+ reason: string;
114
+ }[];
115
+ strongest: {
116
+ scenario: string;
117
+ score: number;
118
+ reason: string;
119
+ }[];
120
+ };
121
+ }
122
+ interface RouteMap { // Type of ProductClientConfig.routes. Every named route is optional.
123
+ signup?: string;
124
+ login?: string;
125
+ workspaces?: string;
126
+ threads?: string;
127
+ chat?: string;
128
+ tasks?: string;
129
+ events?: string;
130
+ approvals?: string;
131
+ vault?: string;
132
+ generations?: string;
133
+ [key: string]: string | undefined; // Index signature: any additional string key is accepted, and any lookup may yield undefined.
134
+ }
135
+ interface ProductClientConfig { // Sole constructor argument of ProductClient.
136
+ baseUrl: string;
137
+ routes: RouteMap;
138
+ }
139
+ interface ScenarioFile { // Input of ScenarioRegistry.registerFiles. Compared with Scenario it adds category, isControl and rubric, and has no dimensions or systemPromptAppend field.
140
+ id: string;
141
+ category: string; // Presumably what ScenarioRegistry.byCategory / listCategories group on — TODO confirm.
142
+ persona: string;
143
+ label: string;
144
+ thesis: string;
145
+ isControl?: boolean;
146
+ rubric?: {
147
+ dimensions: { // Inline shape with name/description/weight only — no anchor_low/anchor_high as in RubricDimension.
148
+ name: string;
149
+ description: string;
150
+ weight: number;
151
+ }[];
152
+ };
153
+ turns: Turn[];
154
+ artifactChecks: ArtifactCheck[];
155
+ }
156
+ interface CompletionCriterion { // Element type of PersonaConfig.completionCriteria; constructor argument (as an array) of ConvergenceTracker.
157
+ name: string;
158
+ check: (state: DriverState) => boolean;
159
+ progress?: (state: DriverState) => number; // Optional numeric counterpart to `check`; expected range is not stated here (ConvergenceTracker reports criteriaStatus as boolean | number) — TODO confirm.
160
+ }
161
+ interface FeedbackPattern { // Element type of PersonaConfig.feedbackPatterns. How `trigger` is matched is not visible in this file.
162
+ trigger: string;
163
+ response: string;
164
+ }
165
+ interface PersonaConfig { // Argument of AgentDriver.run.
166
+ id: string;
167
+ role: string;
168
+ goal: string;
169
+ completionCriteria: CompletionCriterion[];
170
+ feedbackPatterns?: FeedbackPattern[];
171
+ maxTurns: number;
172
+ driverModel?: string; // AgentDriverConfig also declares driverModel; which one takes precedence is not visible here — TODO confirm.
173
+ }
174
+ interface DriverState { // Passed to CompletionCriterion.check/progress and ConvergenceTracker.record; resolved by MetricsCollector.getState; type of DriverResult.finalState.
175
+ tasks: number;
176
+ events: number;
177
+ proposals: {
178
+ pending: number;
179
+ approved: number;
180
+ rejected: number;
181
+ };
182
+ vaultFiles: string[]; // A string array here; TurnMetrics.vaultFiles is a number.
183
+ codeBlocks: number;
184
+ generations: number;
185
+ }
186
+ interface TurnMetrics { // Resolved value of MetricsCollector.collect; element type of MetricsCollector.getMetrics and DriverResult.metrics.
187
+ turn: number;
188
+ timestamp: string;
189
+ tasks: number;
190
+ events: number;
191
+ proposals: { // Same three counters as DriverState.proposals.
192
+ pending: number;
193
+ approved: number;
194
+ rejected: number;
195
+ };
196
+ vaultFiles: number; // A number here; DriverState.vaultFiles is a string[].
197
+ responseLatencyMs: number;
198
+ responseChars: number;
199
+ codeBlocksProduced: number;
200
+ blocksExtracted: number;
201
+ qualityScore?: number;
202
+ inputTokens: number;
203
+ outputTokens: number;
204
+ estimatedCostUsd: number; // Presumably the cost of this turn only — TODO confirm.
205
+ totalCostUsd: number; // Presumably the running total across turns — TODO confirm.
206
+ completionPercent: number;
207
+ }
208
+ interface DriverResult { // Resolved value of AgentDriver.run; formatDriverReport and printDriverSummary take an array of these.
209
+ personaId: string;
210
+ completed: boolean;
211
+ turnsToCompletion: number | null; // Nullable; presumably null when completion was never reached (cf. ConvergenceTracker.getTurnToCompletion) — TODO confirm.
212
+ totalTurns: number;
213
+ metrics: TurnMetrics[];
214
+ finalState: DriverState;
215
+ convergenceCurve: number[];
216
+ totalCostUsd: number;
217
+ finalQualityScore: number | null;
218
+ }
219
+ interface BenchmarkRunnerConfig { // Second constructor argument of BenchmarkRunner.
220
+ scenarios: Scenario[];
221
+ judges: JudgeFn[];
222
+ systemPrompt: string;
223
+ model?: string;
224
+ judgeModel?: string;
225
+ passThreshold?: number; // Default value and score scale are not visible in this file.
226
+ generation?: number;
227
+ promptVersion?: string;
228
+ }
229
+ interface JudgeInput { // Second argument of every JudgeFn.
230
+ scenario: Scenario;
231
+ turns: TurnResult[];
232
+ artifacts: CollectedArtifacts;
233
+ }
234
+ type JudgeFn = (tc: TCloud, input: JudgeInput) => Promise<JudgeScore[]>; // Signature shared by all judges: codeExecutionJudge, coherenceJudge and adversarialJudge are JudgeFn values; createDomainExpertJudge and createCustomJudge return one; defaultJudges returns JudgeFn[].
235
+
236
+ interface TestResult { // Resolved value of runE2EWorkflow.
237
+ name: string;
238
+ passed: boolean;
239
+ duration: number; // Unit is not stated here (other durations in this file carry an `Ms` suffix) — TODO confirm.
240
+ detail?: string;
241
+ checks: CheckResult[];
242
+ }
243
+ interface CheckResult { // Element type of TestResult.checks; the workflow callback given to runE2EWorkflow resolves to CheckResult[].
244
+ name: string;
245
+ passed: boolean;
246
+ expected: string;
247
+ actual: string;
248
+ }
249
+ interface EvalResult { // Exported as a type, but not referenced by any other declaration in this file.
250
+ scenario: string;
251
+ status: 'pass' | 'fail' | 'skip';
252
+ duration: number; // Unit is not stated here — TODO confirm.
253
+ detail?: string;
254
+ artifact?: string;
255
+ }
256
+
257
+ /**
258
+ * ProductClient — configurable HTTP client for exercising any agent's APIs.
259
+ *
260
+ * Routes are config, not hardcoded. Each agent provides its own RouteMap.
261
+ */
262
+ declare class ProductClient {
263
+ private baseUrl; // Private members are emitted without types in declaration output.
264
+ private routes;
265
+ private cookies; // NOTE(review): presumably session cookies carried between signup/login and later calls — confirm in the implementation.
266
+ constructor(config: ProductClientConfig);
267
+ private route;
268
+ signup(name: string, email: string, password: string): Promise<{
269
+ userId: string;
270
+ }>;
271
+ login(email: string, password: string): Promise<void>;
272
+ createWorkspace(name: string, type?: string): Promise<string>; // Resolves to a string; presumably the new workspace id used by the workspaceId parameters below — TODO confirm.
273
+ createThread(workspaceId: string): Promise<string>; // Resolves to a string; presumably the threadId expected by chat — TODO confirm.
274
+ chat(workspaceId: string, threadId: string, content: string, _opts?: { // NOTE(review): the leading underscore conventionally marks an unused parameter — confirm blockPatterns is actually honored.
275
+ blockPatterns?: RegExp[];
276
+ }): Promise<{
277
+ text: string;
278
+ blocks: { // Same { type, title } element shape as TurnResult.blocksExtracted.
279
+ type: string;
280
+ title: string;
281
+ }[];
282
+ }>;
283
+ getTasks(workspaceId: string): Promise<{
284
+ id: string;
285
+ title: string;
286
+ status: string;
287
+ priority: string;
288
+ }[]>;
289
+ getEvents(workspaceId: string): Promise<{
290
+ id: string;
291
+ title: string;
292
+ type: string;
293
+ }[]>;
294
+ getApprovals(workspaceId: string): Promise<{
295
+ id: string;
296
+ title: string;
297
+ status: string;
298
+ type: string;
299
+ }[]>;
300
+ getVaultTree(workspaceId: string): Promise<string[]>;
301
+ approveAction(workspaceId: string, id: string): Promise<void>;
302
+ rejectAction(workspaceId: string, id: string, reason: string): Promise<void>;
303
+ getGenerations(workspaceId: string): Promise<{
304
+ id: string;
305
+ type: string;
306
+ prompt: string;
307
+ }[]>;
308
+ /** Generic GET for custom routes */
309
+ get(path: string): Promise<Record<string, unknown>>;
310
+ /** Generic POST for custom routes */
311
+ post(path: string, body: Record<string, unknown>): Promise<Record<string, unknown>>;
312
+ /** Generic PATCH for custom routes */
313
+ patch(path: string, body: Record<string, unknown>): Promise<Record<string, unknown>>;
314
+ }
315
+ /**
316
+ * Run a full e2e workflow test against a live product.
317
+ *
318
+ * The `workflow` callback receives a ProductClient and returns CheckResults.
319
+ * This is the generic harness — each agent defines its own workflow steps.
320
+ */
321
+ declare function runE2EWorkflow(client: ProductClient, name: string, workflow: (client: ProductClient) => Promise<CheckResult[]>): Promise<TestResult>;
322
+
323
+ /**
324
+ * Create a domain expert judge with a configurable domain.
325
+ *
326
+ * The judge evaluates professional accuracy and depth.
327
+ */
328
+ declare function createDomainExpertJudge(domain: string): JudgeFn;
329
+ /**
330
+ * Code execution judge — evaluates whether code blocks are valid and runnable.
331
+ */
332
+ declare const codeExecutionJudge: JudgeFn;
333
+ /**
334
+ * Coherence judge — evaluates multi-turn consistency and progression.
335
+ */
336
+ declare const coherenceJudge: JudgeFn;
337
+ /**
338
+ * Adversarial judge — red-teams agent responses.
339
+ */
340
+ declare const adversarialJudge: JudgeFn;
341
+ /**
342
+ * Create a custom judge with a fully custom prompt.
343
+ */
344
+ declare function createCustomJudge(name: string, systemPrompt: string, opts?: {
345
+ model?: string;
346
+ temperature?: number;
347
+ maxTokens?: number;
348
+ }): JudgeFn;
349
+ /** Default judge set (domain must be provided for domain expert) */
350
+ declare function defaultJudges(domain: string): JudgeFn[];
351
+
352
+ interface ExecutorConfig { // Third argument of executeScenario.
353
+ /** System prompt for the agent under test */
354
+ systemPrompt: string;
355
+ /** Model to use for the agent */
356
+ model?: string;
357
+ /** Judges to run after execution */
358
+ judges: JudgeFn[];
359
+ /** Regex patterns for detecting tool/API calls in responses */
360
+ toolCallPatterns?: RegExp[];
361
+ /** Block delimiter pattern (default: :::type\n...\n:::) */
362
+ blockPattern?: RegExp;
363
+ /** Custom artifact checker for domain-specific checks */
364
+ artifactChecker?: (check: Scenario['artifactChecks'][0], artifacts: CollectedArtifacts) => { // `Scenario['artifactChecks'][0]` resolves to ArtifactCheck.
365
+ passed: boolean;
366
+ detail: string;
367
+ } | null; // NOTE(review): a null return presumably means "not handled here, use the built-in check" — confirm in the executor.
368
+ }
369
+ /**
370
+ * Execute a scenario against an LLM via tcloud.
371
+ *
372
+ * Runs multi-turn conversation, extracts artifacts, runs judges.
373
+ */
374
+ declare function executeScenario(tc: TCloud, scenario: Scenario, config: ExecutorConfig): Promise<ScenarioResult>;
375
+
376
+ /**
377
+ * BenchmarkRunner — orchestrates scenarios, executor, judges, and scoring.
378
+ *
379
+ * Domain-agnostic. Each agent provides its own scenarios, judges, and system prompt.
380
+ */
381
+ declare class BenchmarkRunner {
382
+ private tc;
383
+ private config;
384
+ constructor(tc: TCloud, config: BenchmarkRunnerConfig); // First argument is a TCloud instance (from @tangle-network/tcloud), not a ProductClient.
385
+ run(scenarios?: Scenario[]): Promise<BenchmarkReport>; // Optional argument; presumably overrides config.scenarios for this run when given — TODO confirm.
386
+ }
387
+
388
+ /** Per-1K token pricing for common models */
389
+ declare const MODEL_PRICING: Record<string, {
390
+ input: number;
391
+ output: number;
392
+ }>;
393
+ /** Estimate token count from string length (chars / 4 approximation) */
394
+ declare function estimateTokens(text: string): number;
395
+ /** Calculate cost in USD from token counts and model */
396
+ declare function estimateCost(inputTokens: number, outputTokens: number, model: string): number;
397
+ /**
398
+ * TokenCounter — accumulates token usage and cost across turns.
399
+ */
400
+ declare class TokenCounter {
401
+ private totalInput;
402
+ private totalOutput;
403
+ private totalCost;
404
+ private model;
405
+ constructor(model?: string);
406
+ /** Record tokens for a turn, returns per-turn cost */
407
+ record(inputTokens: number, outputTokens: number): number;
408
+ /** Estimate and record from raw text */
409
+ recordFromText(inputText: string, outputText: string): {
410
+ inputTokens: number;
411
+ outputTokens: number;
412
+ cost: number;
413
+ };
414
+ getTotalInput(): number;
415
+ getTotalOutput(): number;
416
+ getTotalCost(): number;
417
+ }
418
+ /**
419
+ * MetricsCollector — collects per-turn metrics from the product.
420
+ *
421
+ * After each turn, queries the product's APIs to measure state changes.
422
+ */
423
+ declare class MetricsCollector {
424
+ private client;
425
+ private workspaceId;
426
+ private metrics;
427
+ constructor(client: ProductClient, workspaceId: string);
428
+ /** Collect metrics after a turn completes. All eleven arguments are positional; the last four (qualityScore, inputTokens, outputTokens, estimatedCostUsd) are optional. */
429
+ collect(turn: number, responseLatencyMs: number, responseChars: number, codeBlocksProduced: number, blocksExtracted: number, completionCriteriaMet: number, completionCriteriaTotal: number, qualityScore?: number, inputTokens?: number, outputTokens?: number, estimatedCostUsd?: number): Promise<TurnMetrics>; // completionCriteriaMet/Total presumably yield TurnMetrics.completionPercent — TODO confirm.
430
+ /** Get current product state */
431
+ getState(): Promise<DriverState>;
432
+ /** Get all collected metrics */
433
+ getMetrics(): TurnMetrics[];
434
+ /** Get convergence curve (completion% over turns) */
435
+ getConvergenceCurve(): number[];
436
+ }
437
+
438
+ /**
439
+ * Normalize scores so all dimensions follow "higher = better".
440
+ * Inverted dimensions (hallucination, false_confidence, worst_failure)
441
+ * already use inverted scoring in the prompt (10 = no hallucination),
442
+ * but this function ensures consistency if raw scores leak through.
443
+ */
444
+ declare function normalizeScores(scores: JudgeScore[]): JudgeScore[];
445
+ /** Weighted mean — falls back to uniform weights when `weight` is omitted */
446
+ declare function weightedMean(scores: {
447
+ score: number;
448
+ weight?: number;
449
+ }[]): number;
450
+ /** Bootstrap confidence interval */
451
+ declare function confidenceInterval(scores: number[], confidence?: number): {
452
+ mean: number;
453
+ lower: number;
454
+ upper: number;
455
+ };
456
+ /**
457
+ * Inter-rater reliability — simplified Krippendorff's alpha.
458
+ *
459
+ * Each inner array is one judge's scores for all items.
460
+ * All arrays must have the same length (same items scored).
461
+ */
462
+ declare function interRaterReliability(judgeScores: JudgeScore[][]): number;
463
+ /**
464
+ * Mann-Whitney U test for comparing two independent groups.
465
+ * Returns U statistic and approximate p-value (normal approximation).
466
+ */
467
+ declare function mannWhitneyU(a: number[], b: number[]): {
468
+ u: number;
469
+ p: number;
470
+ };
471
+ /** Partial credit: returns 0-1 ratio of current toward target */
472
+ declare function partialCredit(current: number, target: number): number;
473
+
474
+ /**
475
+ * ConvergenceTracker — tracks completion percentage over turns.
476
+ *
477
+ * Produces convergence curves showing how quickly the agent reaches
478
+ * completion criteria.
479
+ */
480
+ declare class ConvergenceTracker {
481
+ private criteria;
482
+ private history;
483
+ constructor(criteria: CompletionCriterion[]);
484
+ /** Evaluate criteria against current state, record result */
485
+ record(turn: number, state: DriverState): {
486
+ completionPercent: number;
487
+ complete: boolean;
488
+ criteriaStatus: Record<string, boolean | number>;
489
+ };
490
+ /** Get convergence curve */
491
+ getCurve(): number[];
492
+ /** Get full history with per-criterion status */
493
+ getHistory(): {
494
+ turn: number;
495
+ completionPercent: number;
496
+ criteriaStatus: Record<string, boolean | number>;
497
+ }[];
498
+ /** Find the turn where completion first reached 100% (or null) */
499
+ getTurnToCompletion(): number | null;
500
+ }
501
+
502
+ /**
503
+ * ScenarioRegistry — manages scenario discovery and filtering.
504
+ *
505
+ * Each agent registers its scenarios. The registry handles conversion
506
+ * from ScenarioFile format to the framework's Scenario type.
507
+ */
508
+ declare class ScenarioRegistry {
509
+ private scenarios;
510
+ private scenarioFiles;
511
+ /** Register scenarios from ScenarioFile format */
512
+ registerFiles(files: ScenarioFile[]): void;
513
+ /** Register pre-built Scenario objects directly */
514
+ register(scenarios: Scenario[]): void;
515
+ /** Get all scenarios */
516
+ all(): Scenario[];
517
+ /** Get scenarios filtered by category */
518
+ byCategory(category: string): Scenario[];
519
+ /** List all categories with counts */
520
+ listCategories(): {
521
+ category: string;
522
+ count: number;
523
+ }[];
524
+ /** Get scenarios filtered by persona */
525
+ byPersona(persona: string): Scenario[];
526
+ /** Get a single scenario by ID */
527
+ byId(id: string): Scenario | undefined;
528
+ /** Count total scenarios */
529
+ get count(): number;
530
+ }
531
+
532
+ interface AgentDriverConfig { // Second constructor argument of AgentDriver.
533
+ client: ProductClient;
534
+ driverModel?: string; // PersonaConfig also declares driverModel; which one takes precedence is not visible here — TODO confirm.
535
+ /** System prompt context for the driver LLM to understand the product */
536
+ productContext?: string;
537
+ }
538
+ /**
539
+ * AgentDriver — meta-agent that plays a persona against the real product.
540
+ *
541
+ * Uses a driver LLM (Claude/GPT-4o) to decide what to say each turn.
542
+ * Not scripted — the driver gets the current product state and decides
543
+ * the next realistic user message.
544
+ */
545
+ declare class AgentDriver {
546
+ private tc;
547
+ private client;
548
+ private driverModel;
549
+ private productContext;
550
+ constructor(tc: TCloud, config: AgentDriverConfig);
551
+ /**
552
+ * Run a persona through the product.
553
+ *
554
+ * Returns metrics on how many turns to completion, cost curve,
555
+ * quality curve, and convergence curve.
556
+ */
557
+ run(persona: PersonaConfig): Promise<DriverResult>;
558
+ /** Use the driver LLM to decide what the "user" says next */
559
+ private decideNextMessage;
560
+ /** Handle pending approvals based on persona feedback patterns */
561
+ private handleApprovals;
562
+ /** Describe which completion criteria are met */
563
+ private describeCompletion;
564
+ }
565
+
566
+ /**
567
+ * Report generation utilities.
568
+ *
569
+ * Outputs convergence curves, cost curves, quality curves,
570
+ * and per-persona summaries in markdown format.
571
+ */
572
+ /** Generate a markdown report from benchmark results */
573
+ declare function formatBenchmarkReport(report: BenchmarkReport): string;
574
+ /** Generate a markdown report from agent driver results */
575
+ declare function formatDriverReport(results: DriverResult[]): string;
576
+ /** Print a compact summary to console */
577
+ declare function printDriverSummary(results: DriverResult[]): void;
578
+
579
+ export { AgentDriver, type AgentDriverConfig, type ArtifactCheck, type ArtifactResult, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type CheckResult, type CollectedArtifacts, type CompletionCriterion, ConvergenceTracker, type DriverResult, type DriverState, type EvalResult, type ExecutorConfig, type FeedbackPattern, type JudgeConfig, type JudgeFn, type JudgeInput, type JudgeRubric, type JudgeScore, MODEL_PRICING, MetricsCollector, type PersonaConfig, ProductClient, type ProductClientConfig, type RouteMap, type RubricDimension, type Scenario, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type TestResult, TokenCounter, type Turn, type TurnMetrics, type TurnResult, adversarialJudge, codeExecutionJudge, coherenceJudge, confidenceInterval, createCustomJudge, createDomainExpertJudge, defaultJudges, estimateCost, estimateTokens, executeScenario, formatBenchmarkReport, formatDriverReport, interRaterReliability, mannWhitneyU, normalizeScores, partialCredit, printDriverSummary, runE2EWorkflow, weightedMean };