@tangle-network/agent-eval 0.49.0 → 0.50.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/CHANGELOG.md +135 -0
  2. package/README.md +235 -331
  3. package/dist/adapters/http.d.ts +1 -1
  4. package/dist/adapters/langchain.d.ts +1 -1
  5. package/dist/adapters/otel.d.ts +8 -2
  6. package/dist/campaign/index.d.ts +3 -3
  7. package/dist/{chunk-PD3MH6WU.js → chunk-5KSDYBYH.js} +2 -2
  8. package/dist/{chunk-MNL6LXGQ.js → chunk-EGIPWXHL.js} +2 -98
  9. package/dist/chunk-EGIPWXHL.js.map +1 -0
  10. package/dist/{chunk-OYI6RZJK.js → chunk-FQK2CCIM.js} +1 -1
  11. package/dist/chunk-FQK2CCIM.js.map +1 -0
  12. package/dist/chunk-MAZ26DC7.js +99 -0
  13. package/dist/chunk-MAZ26DC7.js.map +1 -0
  14. package/dist/chunk-SHTXZ4O2.js +113 -0
  15. package/dist/chunk-SHTXZ4O2.js.map +1 -0
  16. package/dist/{chunk-KQ26DYTQ.js → chunk-UBQGWD3O.js} +2 -2
  17. package/dist/contract/index.d.ts +206 -9
  18. package/dist/contract/index.js +751 -3
  19. package/dist/contract/index.js.map +1 -1
  20. package/dist/governance/index.d.ts +1 -1
  21. package/dist/hosted/index.d.ts +8 -192
  22. package/dist/hosted/index.js +1 -1
  23. package/dist/index-BRxz6qov.d.ts +409 -0
  24. package/dist/index.d.ts +18 -462
  25. package/dist/index.js +14 -106
  26. package/dist/index.js.map +1 -1
  27. package/dist/meta-eval/index.d.ts +3 -3
  28. package/dist/openapi.json +1 -1
  29. package/dist/{outcome-store-BxJ3DQKJ.d.ts → outcome-store-D6KWmYvj.d.ts} +1 -1
  30. package/dist/registry-8KAs18kY.d.ts +457 -0
  31. package/dist/{release-report-DBB8lB1P.d.ts → release-report-DSu0DWy8.d.ts} +3 -296
  32. package/dist/reporting.d.ts +6 -4
  33. package/dist/reporting.js +6 -4
  34. package/dist/{researcher-CHMO56K0.d.ts → researcher-LZD0qHEa.d.ts} +1 -1
  35. package/dist/rl.d.ts +9 -8
  36. package/dist/rl.js +3 -2
  37. package/dist/rl.js.map +1 -1
  38. package/dist/{rubric-predictive-validity-CJ08tGwq.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
  39. package/dist/{run-improvement-loop-B-L8GgpW.d.ts → run-improvement-loop-BPMjNKMJ.d.ts} +2 -2
  40. package/dist/sequential-5iSVfzl2.d.ts +139 -0
  41. package/dist/store-CJbzDxZ2.d.ts +220 -0
  42. package/dist/{sequential-CbFH___X.d.ts → summary-report-B7gNRX-r.d.ts} +1 -139
  43. package/dist/traces.d.ts +3 -220
  44. package/dist/{types-8u72Gc76.d.ts → types-Dbj5gu8n.d.ts} +1 -1
  45. package/dist/types-DhqpAi_z.d.ts +296 -0
  46. package/docs/concepts.md +20 -0
  47. package/docs/customer-journeys.md +208 -0
  48. package/docs/insight-report.md +337 -0
  49. package/package.json +1 -1
  50. package/dist/chunk-MNL6LXGQ.js.map +0 -1
  51. package/dist/chunk-OYI6RZJK.js.map +0 -1
  52. /package/dist/{chunk-PD3MH6WU.js.map → chunk-5KSDYBYH.js.map} +0 -0
  53. /package/dist/{chunk-KQ26DYTQ.js.map → chunk-UBQGWD3O.js.map} +0 -0
package/dist/traces.d.ts CHANGED
@@ -8,6 +8,8 @@ import { T as TraceStore } from './store-Db2Bv8Cf.js';
8
8
  export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, L as LlmSpan, M as Message, e as RetrievalSpan, R as Run, h as RunFilter, m as RunLayer, c as RunOutcome, n as RunStatus, f as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, b as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
9
9
  export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-DODUYdPg.js';
10
10
  import { AxAIService, AxFunction } from '@ax-llm/ax';
11
+ import { T as TraceAnalysisStore, f as TraceAnalystFilters, a as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, b as SearchTraceResult, S as SearchSpanResult } from './store-CJbzDxZ2.js';
12
+ export { D as DEFAULT_TRACE_ANALYST_BUDGETS, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-CJbzDxZ2.js';
11
13
 
12
14
  /**
13
15
  * OpenTelemetry JSON export — maps TraceSchema v1 to OTLP/JSON so
@@ -189,225 +191,6 @@ declare function redactValue(value: unknown, rules?: RedactionRule[], report?: R
189
191
  report: RedactionReport;
190
192
  };
191
193
 
192
- /**
193
- * Shared types for the trace-analyst module.
194
- *
195
- * Wire format. The store interface speaks `OtlpSpanLike` rows — one JSONL
196
- * line per span, OTLP-shaped. We do NOT depend on a specific tracing
197
- * vendor at the type level. Adapter
198
- * layers map upstream shapes onto this interface.
199
- *
200
- * Design constraint. Every read operation that can return arbitrary
201
- * payload must carry a byte budget so the agent's tool result stays
202
- * bounded regardless of input trace size. Oversized responses
203
- * substitute a deterministic summary instead of bytes — see
204
- * `ViewTraceOversized`.
205
- */
206
- /** OTLP span kind (subset we actually use). */
207
- type TraceAnalystSpanKind = 'AGENT' | 'LLM' | 'TOOL' | 'CHAIN' | 'GUARDRAIL' | 'SPAN' | 'UNKNOWN';
208
- type TraceAnalystSpanStatus = 'OK' | 'ERROR' | 'UNSET';
209
- /** Subset of OTLP span fields the analyst exposes to the agent. The
210
- * store's job is to project upstream's full span shape down to this
211
- * view — the analyst never sees vendor extensions directly. */
212
- interface TraceAnalystSpan {
213
- trace_id: string;
214
- span_id: string;
215
- parent_span_id: string | null;
216
- name: string;
217
- kind: TraceAnalystSpanKind;
218
- start_time: string;
219
- end_time: string;
220
- duration_ms: number;
221
- status: TraceAnalystSpanStatus;
222
- status_message?: string;
223
- service_name: string | null;
224
- agent_name: string | null;
225
- model_name: string | null;
226
- tool_name: string | null;
227
- /** Raw JSON-serialisable attribute map. May contain large strings;
228
- * callers must respect the per-attribute byte cap. */
229
- attributes: Record<string, unknown>;
230
- }
231
- interface TraceAnalystTraceSummary {
232
- trace_id: string;
233
- service_name: string | null;
234
- agent_name: string | null;
235
- span_count: number;
236
- has_errors: boolean;
237
- start_time: string;
238
- end_time: string;
239
- duration_ms: number;
240
- raw_jsonl_bytes: number;
241
- models: string[];
242
- tools: string[];
243
- }
244
- interface TraceAnalystFilters {
245
- /** Restrict to traces that contain at least one error span. */
246
- has_errors?: boolean;
247
- /** Match if any span's `service.name` is in this list. */
248
- service_names?: string[];
249
- /** Match if any span's `agent.name` is in this list. */
250
- agent_names?: string[];
251
- /** Match if any LLM span's `llm.model_name` is in this list. */
252
- model_names?: string[];
253
- /** Match if any tool span's `tool.name` is in this list. */
254
- tool_names?: string[];
255
- /** ISO-8601 lower bound on the trace's earliest start time. */
256
- start_time_after?: string;
257
- /** ISO-8601 upper bound on the trace's earliest start time. */
258
- start_time_before?: string;
259
- /** Single regex applied to raw JSONL bytes for the trace. Opt-in;
260
- * expensive on large datasets. Use the indexed filters above first. */
261
- regex_pattern?: string;
262
- }
263
- interface DatasetOverview {
264
- total_traces: number;
265
- raw_jsonl_bytes: number;
266
- services: string[];
267
- agents: string[];
268
- models: string[];
269
- tool_names: string[];
270
- /** Up to 20 real trace ids the agent may pass to view/search tools. */
271
- sample_trace_ids: string[];
272
- errors: {
273
- trace_count: number;
274
- span_count: number;
275
- };
276
- time_range: {
277
- earliest: string;
278
- latest: string;
279
- } | null;
280
- }
281
- interface QueryTracesPage {
282
- traces: TraceAnalystTraceSummary[];
283
- total: number;
284
- has_more: boolean;
285
- }
286
- /** Full-trace view. When the response would exceed the per-call byte
287
- * budget, `oversized` is populated INSTEAD of `spans` so the agent
288
- * knows to switch to `searchTrace` / `viewSpans`. */
289
- interface ViewTraceResult {
290
- trace_id: string;
291
- spans?: TraceAnalystSpan[];
292
- oversized?: ViewTraceOversized;
293
- }
294
- interface ViewTraceOversized {
295
- span_count: number;
296
- /** Names with their counts, sorted desc. Capped at 20 entries. */
297
- top_span_names: Array<[string, number]>;
298
- /** Largest single span body (bytes after attribute-cap projection). */
299
- span_response_bytes_max: number;
300
- error_span_count: number;
301
- }
302
- interface ViewSpansResult {
303
- trace_id: string;
304
- spans: TraceAnalystSpan[];
305
- /** Requested span ids that were not found in the trace. */
306
- missing_span_ids: string[];
307
- /** Number of attribute fields truncated to fit the per-attribute cap. */
308
- truncated_attribute_count: number;
309
- }
310
- interface SpanMatchRecord {
311
- trace_id: string;
312
- span_id: string;
313
- span_name: string;
314
- span_kind: TraceAnalystSpanKind;
315
- /** JSON pointer-style path to the matched value, e.g.
316
- * `attributes."llm.input_messages"[2].content`. */
317
- attribute_path: string;
318
- matched_text: string;
319
- context_before: string;
320
- context_after: string;
321
- match_offset: number;
322
- }
323
- interface SearchTraceResult {
324
- trace_id: string;
325
- hits: SpanMatchRecord[];
326
- total_matches: number;
327
- has_more: boolean;
328
- }
329
- interface SearchSpanResult {
330
- trace_id: string;
331
- span_id: string;
332
- hits: SpanMatchRecord[];
333
- total_matches: number;
334
- has_more: boolean;
335
- }
336
- /** Tunable byte budgets for bounded RLM tool output. */
337
- interface TraceAnalystByteBudgets {
338
- /** Max bytes any single tool response may emit. Hard ceiling enforced
339
- * by the store; oversized → summary. Default 150_000. */
340
- perCallByteCeiling: number;
341
- /** Per-attribute string truncation cap on `viewTrace` (discovery scan).
342
- * Default 4096. */
343
- perAttributeViewBudget: number;
344
- /** Per-attribute string truncation cap on `viewSpans` (surgical reads).
345
- * Default 16384. */
346
- perAttributeSpanBudget: number;
347
- /** Per-attribute cap on a single match record's `matched_text` and
348
- * context window. Default 1024. */
349
- perMatchTextBudget: number;
350
- }
351
- declare const DEFAULT_TRACE_ANALYST_BUDGETS: TraceAnalystByteBudgets;
352
- /** Marker substituted in place of truncated string payloads. Callers
353
- * parsing tool output can detect it deterministically. */
354
- declare const TRACE_ANALYST_TRUNCATION_MARKER_PREFIX = "[trace-analyst truncated:";
355
-
356
- /**
357
- * `TraceAnalysisStore` — read-side interface the trace-analyst calls
358
- * through. Seven operations, all bounded:
359
- *
360
- * - `getOverview(filters?)` — dataset rollup + sample trace ids.
361
- * - `queryTraces(filters?, limit, offset)` — paginated summaries.
362
- * - `countTraces(filters?)` — cheap count without materialisation.
363
- * - `viewTrace(trace_id, perAttrCap)` — full span list, oversized → summary.
364
- * - `viewSpans(trace_id, span_ids, perAttrCap)` — surgical span fetch.
365
- * - `searchTrace(trace_id, regex, max_matches)` — bounded regex hits.
366
- * - `searchSpan(trace_id, span_id, regex, max_matches)` — single-span search.
367
- *
368
- * Multiple implementations ship in the core (`OtlpFileTraceStore`).
369
- * Downstream callers can supply their own — e.g. a DuckDB-backed
370
- * adapter or an in-memory adapter for tests — by implementing this
371
- * interface.
372
- *
373
- * Filters compose with AND semantics. Empty/undefined fields impose
374
- * no constraint. `regex_pattern` is the only opt-in raw-bytes scan —
375
- * implementations may skip it via `count`/`overview` when not set.
376
- */
377
-
378
- interface TraceAnalysisStore {
379
- getOverview(filters?: TraceAnalystFilters): Promise<DatasetOverview>;
380
- queryTraces(opts: {
381
- filters?: TraceAnalystFilters;
382
- limit: number;
383
- offset?: number;
384
- }): Promise<QueryTracesPage>;
385
- countTraces(filters?: TraceAnalystFilters): Promise<number>;
386
- viewTrace(opts: {
387
- trace_id: string;
388
- /** Override per-attribute byte cap. Defaults to discovery budget. */
389
- per_attribute_byte_cap?: number;
390
- }): Promise<ViewTraceResult>;
391
- viewSpans(opts: {
392
- trace_id: string;
393
- span_ids: readonly string[];
394
- /** Override per-attribute byte cap. Defaults to surgical budget. */
395
- per_attribute_byte_cap?: number;
396
- }): Promise<ViewSpansResult>;
397
- searchTrace(opts: {
398
- trace_id: string;
399
- regex_pattern: string;
400
- /** Hard cap on matches returned. Default 50. */
401
- max_matches?: number;
402
- }): Promise<SearchTraceResult>;
403
- searchSpan(opts: {
404
- trace_id: string;
405
- span_id: string;
406
- regex_pattern: string;
407
- max_matches?: number;
408
- }): Promise<SearchSpanResult>;
409
- }
410
-
411
194
  interface AnalyzeTracesInput {
412
195
  /** The user-facing question. Domain framing belongs here, not in the
413
196
  * actor description. */
@@ -887,4 +670,4 @@ declare function iterateRawCalls(sink: RawProviderSink, filter?: {
887
670
  spanId?: string;
888
671
  }): AsyncGenerator<ReplayCacheEntry>;
889
672
 
890
- export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, type ExportableSpan, OTEL_AGENT_EVAL_SCOPE, type OtelExportConfig, type OtelExporter, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, RunCompleteHook, RunCompleteHookContext, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
673
+ export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DatasetOverview, type ExportableSpan, OTEL_AGENT_EVAL_SCOPE, type OtelExportConfig, type OtelExporter, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, RunCompleteHook, RunCompleteHookContext, SearchSpanResult, SearchTraceResult, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAnalysisStore, TraceAnalystFilters, type TraceAnalystHookOptions, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, ViewSpansResult, ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
@@ -372,4 +372,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
372
372
  scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
373
373
  }
374
374
 
375
- export type { CampaignAggregates as C, DispatchContext as D, Gate as G, ImprovementDriver as I, JudgeConfig as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, CampaignArtifactWriter as a, CampaignCellResult as b, CampaignCostMeter as c, CampaignResult as d, CampaignTraceWriter as e, CodeSurface as f, DispatchFn as g, GateContext as h, GateDecision as i, GateResult as j, GenerationCandidate as k, GenerationRecord as l, JudgeDimension as m, JudgeScore as n, Mutator as o, SessionScript as p, LabeledScenarioWrite as q, LabeledScenarioSampleArgs as r, LabeledScenarioRecord as s, JudgeAggregate as t, LabeledScenarioSource as u, ScenarioAggregate as v };
375
+ export type { CodeSurface as C, DispatchFn as D, Gate as G, ImprovementDriver as I, JudgeScore as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, JudgeConfig as a, DispatchContext as b, LabeledScenarioWrite as c, LabeledScenarioSampleArgs as d, LabeledScenarioRecord as e, CampaignAggregates as f, CampaignArtifactWriter as g, CampaignCellResult as h, CampaignCostMeter as i, CampaignResult as j, CampaignTraceWriter as k, GateContext as l, GateDecision as m, GateResult as n, GenerationCandidate as o, GenerationRecord as p, JudgeAggregate as q, JudgeDimension as r, LabeledScenarioSource as s, Mutator as t, ScenarioAggregate as u, SessionScript as v };
@@ -0,0 +1,296 @@
1
+ import { TCloud } from '@tangle-network/tcloud';
2
+
3
+ interface Scenario {
4
+ id: string;
5
+ persona: string;
6
+ label: string;
7
+ thesis: string;
8
+ dimensions: string[];
9
+ turns: Turn[];
10
+ artifactChecks: ArtifactCheck[];
11
+ systemPromptAppend?: string;
12
+ }
13
+ interface Turn {
14
+ user: string;
15
+ expectedBehaviors: string[];
16
+ adversarial?: boolean;
17
+ feedbackType?: 'correction' | 'rejection' | 'vague' | 'contradictory' | 'escalation';
18
+ }
19
+ interface ArtifactCheck {
20
+ type: 'vault_file_exists' | 'vault_file_contains' | 'block_extracted' | 'code_valid' | 'generation_produced' | 'tool_created' | string;
21
+ target: string;
22
+ contains?: string;
23
+ minCount?: number;
24
+ description: string;
25
+ }
26
+ interface JudgeConfig {
27
+ model: string;
28
+ temperature: number;
29
+ rubric: JudgeRubric;
30
+ }
31
+ interface JudgeRubric {
32
+ name: string;
33
+ description: string;
34
+ dimensions: RubricDimension[];
35
+ }
36
+ interface RubricDimension {
37
+ name: string;
38
+ description: string;
39
+ anchor_low: string;
40
+ anchor_high: string;
41
+ weight: number;
42
+ }
43
+ interface ScenarioResult {
44
+ scenarioId: string;
45
+ persona: string;
46
+ turns: TurnResult[];
47
+ artifactResults: ArtifactResult[];
48
+ judgeScores: JudgeScore[];
49
+ judgeErrors: number;
50
+ overallScore: number;
51
+ totalDurationMs: number;
52
+ artifacts: CollectedArtifacts;
53
+ }
54
+ interface TurnResult {
55
+ turnIndex: number;
56
+ userMessage: string;
57
+ agentResponse: string;
58
+ durationMs: number;
59
+ blocksExtracted: {
60
+ type: string;
61
+ title: string;
62
+ }[];
63
+ containsCode: boolean;
64
+ containsToolCall: boolean;
65
+ }
66
+ interface ArtifactResult {
67
+ check: ArtifactCheck;
68
+ passed: boolean;
69
+ detail?: string;
70
+ }
71
+ interface JudgeScore {
72
+ judgeName: string;
73
+ dimension: string;
74
+ score: number;
75
+ reasoning: string;
76
+ evidence?: string;
77
+ }
78
+ interface CollectedArtifacts {
79
+ vaultFiles: {
80
+ path: string;
81
+ content: string;
82
+ }[];
83
+ blocksExtracted: {
84
+ type: string;
85
+ fields: Record<string, string>;
86
+ }[];
87
+ codeBlocks: {
88
+ language: string;
89
+ code: string;
90
+ }[];
91
+ toolCalls: string[];
92
+ }
93
+ interface BenchmarkReport {
94
+ timestamp: string;
95
+ generation: number;
96
+ promptVersion: string;
97
+ scenarioCount: number;
98
+ results: ScenarioResult[];
99
+ summary: {
100
+ overallAvg: number;
101
+ byPersona: Record<string, {
102
+ avg: number;
103
+ passed: number;
104
+ total: number;
105
+ }>;
106
+ byDimension: Record<string, {
107
+ avg: number;
108
+ scores: number[];
109
+ }>;
110
+ weakest: {
111
+ scenario: string;
112
+ score: number;
113
+ reason: string;
114
+ }[];
115
+ strongest: {
116
+ scenario: string;
117
+ score: number;
118
+ reason: string;
119
+ }[];
120
+ };
121
+ }
122
+ interface RouteMap {
123
+ signup?: string;
124
+ login?: string;
125
+ workspaces?: string;
126
+ threads?: string;
127
+ chat?: string;
128
+ tasks?: string;
129
+ events?: string;
130
+ approvals?: string;
131
+ vault?: string;
132
+ generations?: string;
133
+ [key: string]: string | undefined;
134
+ }
135
+ interface ProductClientConfig {
136
+ baseUrl: string;
137
+ routes: RouteMap;
138
+ }
139
+ interface ScenarioFile {
140
+ id: string;
141
+ category: string;
142
+ persona: string;
143
+ label: string;
144
+ thesis: string;
145
+ isControl?: boolean;
146
+ rubric?: {
147
+ dimensions: {
148
+ name: string;
149
+ description: string;
150
+ weight: number;
151
+ }[];
152
+ };
153
+ turns: Turn[];
154
+ artifactChecks: ArtifactCheck[];
155
+ }
156
+ interface CompletionCriterion {
157
+ name: string;
158
+ check: (state: DriverState) => boolean;
159
+ progress?: (state: DriverState) => number;
160
+ }
161
+ interface FeedbackPattern {
162
+ trigger: string;
163
+ response: string;
164
+ }
165
+ /**
166
+ * How hard the simulated user pushes back. The driver LLM scales its tone
167
+ * and follow-up aggression to this:
168
+ * cooperative — forgiving early adopter; accepts reasonable answers.
169
+ * demanding — experienced professional; rejects vague or hedged answers.
170
+ * relentless — senior partner reviewing for a client who will litigate;
171
+ * interrogates every claim, accepts nothing undefended.
172
+ */
173
+ type PersonaRigor = 'cooperative' | 'demanding' | 'relentless';
174
+ interface PersonaConfig {
175
+ id: string;
176
+ role: string;
177
+ goal: string;
178
+ completionCriteria: CompletionCriterion[];
179
+ feedbackPatterns?: FeedbackPattern[];
180
+ maxTurns: number;
181
+ driverModel?: string;
182
+ /** How adversarial the simulated user is. Defaults to 'demanding'. */
183
+ rigor?: PersonaRigor;
184
+ /**
185
+ * Domain expertise the simulated user holds — quoted into the driver
186
+ * prompt so it challenges the agent with authority instead of vague
187
+ * dissatisfaction. e.g. "a 15-year M&A partner who knows GAAP
188
+ * working-capital mechanics cold".
189
+ */
190
+ expertise?: string;
191
+ /**
192
+ * Substantive issues a senior professional in this role would
193
+ * interrogate — traps the scenario hides, claims that must be defended.
194
+ * The driver probes these without revealing them verbatim; the agent
195
+ * must surface them on its own.
196
+ */
197
+ pressurePoints?: string[];
198
+ /**
199
+ * Curveballs the driver may inject once the agent is coasting — changed
200
+ * facts, a hostile counterparty position, a new constraint. Forces the
201
+ * agent to re-derive rather than recite.
202
+ */
203
+ curveballs?: string[];
204
+ }
205
+ interface DriverState {
206
+ tasks: number;
207
+ events: number;
208
+ proposals: {
209
+ pending: number;
210
+ approved: number;
211
+ rejected: number;
212
+ };
213
+ vaultFiles: string[];
214
+ codeBlocks: number;
215
+ generations: number;
216
+ }
217
+ interface TurnMetrics {
218
+ turn: number;
219
+ timestamp: string;
220
+ tasks: number;
221
+ events: number;
222
+ proposals: {
223
+ pending: number;
224
+ approved: number;
225
+ rejected: number;
226
+ };
227
+ vaultFiles: number;
228
+ responseLatencyMs: number;
229
+ responseChars: number;
230
+ codeBlocksProduced: number;
231
+ blocksExtracted: number;
232
+ qualityScore?: number;
233
+ inputTokens: number;
234
+ outputTokens: number;
235
+ estimatedCostUsd: number;
236
+ totalCostUsd: number;
237
+ completionPercent: number;
238
+ }
239
+ interface DriverResult {
240
+ personaId: string;
241
+ /** True when the simulated user professionally signed off (driver said DONE). */
242
+ completed: boolean;
243
+ /** Turn at which the simulated user signed off, or null if it never did. */
244
+ turnsToCompletion: number | null;
245
+ /**
246
+ * Turn at which nominal completionCriteria were first all met, or null.
247
+ * Distinct from turnsToCompletion: criteria can be met while the
248
+ * simulated professional is still unsatisfied with the work's rigor.
249
+ */
250
+ criteriaMetAtTurn: number | null;
251
+ totalTurns: number;
252
+ metrics: TurnMetrics[];
253
+ finalState: DriverState;
254
+ convergenceCurve: number[];
255
+ totalCostUsd: number;
256
+ finalQualityScore: number | null;
257
+ }
258
+ interface BenchmarkRunnerConfig {
259
+ scenarios: Scenario[];
260
+ judges: JudgeFn[];
261
+ systemPrompt: string;
262
+ model?: string;
263
+ judgeModel?: string;
264
+ passThreshold?: number;
265
+ generation?: number;
266
+ promptVersion?: string;
267
+ }
268
+ interface JudgeInput {
269
+ scenario: Scenario;
270
+ turns: TurnResult[];
271
+ artifacts: CollectedArtifacts;
272
+ }
273
+ type JudgeFn = (tc: TCloud, input: JudgeInput) => Promise<JudgeScore[]>;
274
+
275
+ interface TestResult {
276
+ name: string;
277
+ passed: boolean;
278
+ duration: number;
279
+ detail?: string;
280
+ checks: CheckResult[];
281
+ }
282
+ interface CheckResult {
283
+ name: string;
284
+ passed: boolean;
285
+ expected: string;
286
+ actual: string;
287
+ }
288
+ interface EvalResult {
289
+ scenario: string;
290
+ status: 'pass' | 'fail' | 'skip';
291
+ duration: number;
292
+ detail?: string;
293
+ artifact?: string;
294
+ }
295
+
296
+ export type { ArtifactCheck as A, BenchmarkRunnerConfig as B, CheckResult as C, DriverResult as D, EvalResult as E, FeedbackPattern as F, JudgeInput as J, ProductClientConfig as P, RouteMap as R, Scenario as S, TestResult as T, JudgeScore as a, JudgeFn as b, BenchmarkReport as c, PersonaConfig as d, DriverState as e, CollectedArtifacts as f, ScenarioResult as g, TurnMetrics as h, ScenarioFile as i, CompletionCriterion as j, ArtifactResult as k, JudgeConfig as l, JudgeRubric as m, PersonaRigor as n, RubricDimension as o, Turn as p, TurnResult as q };
package/docs/concepts.md CHANGED
@@ -9,6 +9,26 @@ connected, or the answer lacks required sources. The package gives products a
9
9
  shared way to record runs, check outcomes, classify failures, compare variants,
10
10
  and make release decisions.
11
11
 
12
+ ## The three top-level functions
13
+
14
+ Everything funnels through `/contract`. Three entries, one shape coming back:
15
+
16
+ | Function | When to call it | What you give it | What you get back |
17
+ |---|---|---|---|
18
+ | **`selfImprove()`** | You have a closed loop — scenarios, judge, agent in hand, and you want the substrate to propose better candidates + gate them. | scenarios, agent, judge, baseline surface | `SelfImproveResult.insight: InsightReport` + ship/hold verdict + winner surface |
19
+ | **`analyzeRuns()`** | You have observed runs (production traces, an approve/reject corpus, a CSV gold set) and want the same rigor packet without invoking an agent. | `RunRecord[]` + optional flags | `InsightReport` |
20
+ | **Intake adapters** (`fromFeedbackTable`, `fromOtelSpans`) | Your data isn't already in `RunRecord` shape — it's in Obsidian, Sheets, an OTel collector, etc. | source-specific input | `RunRecord[]` ready to pipe into `analyzeRuns()` |
21
+
22
+ The three customer maturity stages — logs only → ratings → closed loop — map exactly to the three functions. See [`customer-journeys.md`](./customer-journeys.md) for the runnable walkthroughs.
23
+
24
+ The shape of the answer — `InsightReport` — is identical across all three paths. It covers a distributional summary, paired-bootstrap lift CI, judge stats, inter-rater agreement, cost-quality Pareto, failure clusters, contamination check, outcome correlation, release axes, and a ranked recommendations array. Each section is walked through in [`insight-report.md`](./insight-report.md).
25
+
26
+ ## The layering rule
27
+
28
+ `agent-eval` is the **substrate** at the bottom of the Tangle agent stack. `agent-runtime` and `agent-knowledge` depend on it; `agent-eval` MUST NOT import from either. Primitives that "feel like" they belong in a consumer but are actually substrate-shaped (validator verdicts, run records, scenarios, judge scores) live here. Primitives that genuinely require a running agent loop (`ValidationCtx` with iteration + signal + traceEmitter, sandbox `AgentRunSpec`) stay in `agent-runtime`.
29
+
30
+ The test: *does this concept make sense WITHOUT a running agent loop?* If yes, it's substrate. If no, it's runtime. The full rule is in [`/CLAUDE.md`](../CLAUDE.md#repo-layering--this-package-is-the-substrate).
31
+
12
32
  ## Main Objects
13
33
 
14
34
  | Thing | What it is | One-line example |