@tangle-network/agent-eval 0.27.2 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/CHANGELOG.md +87 -0
  2. package/dist/{baseline-4R5deP0N.d.ts → baseline-BwdCXUS8.d.ts} +1 -1
  3. package/dist/builder-eval/index.d.ts +3 -3
  4. package/dist/chunk-UW4NOOZI.js +1561 -0
  5. package/dist/chunk-UW4NOOZI.js.map +1 -0
  6. package/dist/{control-BT4qnXiS.d.ts → control-rJhEDdpy.d.ts} +4 -4
  7. package/dist/{control-runtime-BZ_lVLYW.d.ts → control-runtime-BRdQ0wrx.d.ts} +2 -2
  8. package/dist/control.d.ts +5 -5
  9. package/dist/{emitter-DP_cSSiw.d.ts → emitter-BqjeOvJh.d.ts} +1 -1
  10. package/dist/{failure-cluster-Cw65_5FY.d.ts → failure-cluster-D1NZKqYu.d.ts} +1 -1
  11. package/dist/{feedback-trajectory-D1aGKusy.d.ts → feedback-trajectory-j0nJFgC6.d.ts} +1 -1
  12. package/dist/governance/index.d.ts +2 -2
  13. package/dist/{index-BhLlu-qO.d.ts → index-Cgt3DKXr.d.ts} +1 -1
  14. package/dist/index.d.ts +1190 -335
  15. package/dist/index.js +1580 -489
  16. package/dist/index.js.map +1 -1
  17. package/dist/{integrity-DK2EBVZC.d.ts → integrity-BAxLGJ9I.d.ts} +2 -2
  18. package/dist/knowledge/index.d.ts +3 -3
  19. package/dist/meta-eval/index.d.ts +1 -1
  20. package/dist/{multi-layer-verifier-U-c8ge1k.d.ts → multi-layer-verifier-BNi4-8lR.d.ts} +1 -1
  21. package/dist/openapi.json +1 -1
  22. package/dist/optimization.d.ts +8 -8
  23. package/dist/pipelines/index.d.ts +6 -6
  24. package/dist/prm/index.d.ts +4 -4
  25. package/dist/{query-DODUYdPg.d.ts → query-BFDT0kX_.d.ts} +1 -1
  26. package/dist/{release-report-CCQqnK46.d.ts → release-report-PWhGlpfO.d.ts} +1 -1
  27. package/dist/replay-BX5Fm8en.d.ts +529 -0
  28. package/dist/reporting.d.ts +4 -4
  29. package/dist/{researcher-G81CWc0q.d.ts → researcher-ClDX3KZx.d.ts} +5 -5
  30. package/dist/rl.d.ts +8 -8
  31. package/dist/{rubric-D5tjHNJQ.d.ts → rubric-DgSqjqqj.d.ts} +2 -2
  32. package/dist/{store-Db2Bv8Cf.d.ts → store-BP5be6s7.d.ts} +1 -1
  33. package/dist/{summary-report-Dl4akLKX.d.ts → summary-report-jrSGb2xZ.d.ts} +1 -1
  34. package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BJ54PDan.d.ts} +2 -2
  35. package/dist/traces.d.ts +9 -311
  36. package/dist/traces.js +15 -986
  37. package/dist/traces.js.map +1 -1
  38. package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-BFmveYZt.d.ts} +1 -1
  39. package/dist/wire/index.d.ts +4 -4
  40. package/package.json +1 -1
  41. package/dist/chunk-4U4BKCXK.js +0 -569
  42. package/dist/chunk-4U4BKCXK.js.map +0 -1
  43. package/dist/replay-D7z0J43-.d.ts +0 -225
@@ -1,225 +0,0 @@
1
- import { T as TraceStore } from './store-Db2Bv8Cf.js';
2
- import { R as ReplayError } from './errors-BZ9sTdz7.js';
3
- import { R as RawProviderSink, c as RawProviderEvent } from './integrity-DK2EBVZC.js';
4
-
5
- /**
6
- * OpenTelemetry JSON export — maps TraceSchema v1 to OTLP/JSON so
7
- * traces render natively in Jaeger / Honeycomb / Langfuse / Grafana.
8
- *
9
- * Wire format only. We do NOT depend on the @opentelemetry SDK — that
10
- * would drag in polyfills incompatible with Workers/Edge. Consumers
11
- * push the JSON to their collector of choice via HTTP.
12
- *
13
- * Reference: OTLP 1.3.2 (ResourceSpans / ScopeSpans / Span).
14
- */
15
-
16
- declare const OTEL_AGENT_EVAL_SCOPE: {
17
- name: string;
18
- version: string;
19
- };
20
- interface OtlpSpan {
21
- traceId: string;
22
- spanId: string;
23
- parentSpanId?: string;
24
- name: string;
25
- kind: number;
26
- startTimeUnixNano: string;
27
- endTimeUnixNano: string;
28
- attributes: Array<{
29
- key: string;
30
- value: {
31
- stringValue?: string;
32
- intValue?: string;
33
- doubleValue?: number;
34
- boolValue?: boolean;
35
- };
36
- }>;
37
- events?: Array<{
38
- timeUnixNano: string;
39
- name: string;
40
- attributes?: OtlpSpan['attributes'];
41
- }>;
42
- status?: {
43
- code: number;
44
- message?: string;
45
- };
46
- }
47
- interface OtlpResourceSpans {
48
- resource: {
49
- attributes: OtlpSpan['attributes'];
50
- };
51
- scopeSpans: Array<{
52
- scope: typeof OTEL_AGENT_EVAL_SCOPE;
53
- spans: OtlpSpan[];
54
- }>;
55
- }
56
- interface OtlpExport {
57
- resourceSpans: OtlpResourceSpans[];
58
- }
59
- /** Export a single run's spans + events in OTLP/JSON. */
60
- declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
61
-
62
- /**
63
- * Redaction — remove PII / secrets from trace payloads before persist.
64
- *
65
- * Pre-persistence rules mean raw traces in storage are already scrubbed.
66
- * Unredacted variants (for debugging / post-mortems) live in a separate
67
- * storage layer with stricter access controls; this module only covers
68
- * the default scrub-then-persist path.
69
- *
70
- * Rules compose: pass an array of `RedactionRule`, each is applied in
71
- * order. Strings that match get replaced with a tagged sentinel so the
72
- * eval framework can count how many redactions happened per run
73
- * (surfaced via `redaction_applied` events).
74
- */
75
- interface RedactionRule {
76
- id: string;
77
- pattern: RegExp;
78
- /** Replacement — e.g. '[PII:email]'. Defaults to `[redacted:{id}]`. */
79
- replacement?: string;
80
- }
81
- interface RedactionReport {
82
- redactionCount: number;
83
- byRule: Record<string, number>;
84
- }
85
- /** OWASP / common-sense defaults — extend per-domain. */
86
- declare const DEFAULT_REDACTION_RULES: RedactionRule[];
87
- declare const REDACTION_VERSION = "1.0.0";
88
- /**
89
- * Redact a single string. Returns the new string and a per-rule count of
90
- * how many substitutions fired.
91
- */
92
- declare function redactString(input: string, rules?: RedactionRule[]): {
93
- output: string;
94
- report: RedactionReport;
95
- };
96
- /**
97
- * Walk a JSON-ish value applying `redactString` to every string leaf.
98
- * Arrays and plain objects are recursed; other types pass through
99
- * untouched. Circular references throw — traces should be tree-shaped.
100
- */
101
- declare function redactValue(value: unknown, rules?: RedactionRule[], report?: RedactionReport): {
102
- value: unknown;
103
- report: RedactionReport;
104
- };
105
-
106
- /**
107
- * Replay-from-raw-events — turn every captured campaign run into a
108
- * re-runnable artifact.
109
- *
110
- * `RawProviderSink` captures every provider HTTP envelope; `runEvalCampaign`
111
- * makes that capture the default. Together they make every past run a
112
- * complete fingerprint of what happened on the wire — enough to replay
113
- * the run without burning new LLM cost.
114
- *
115
- * Three use cases this primitive enables:
116
- *
117
- * 1. **Post-hoc judging** — apply a new judge / rubric / scoring callback
118
- * to last week's runs without re-calling any LLM. The cost of trying
119
- * a new rubric drops from "another full sweep" to a CPU-bound replay.
120
- * 2. **Determinism audits** — replay the same campaign and verify the
121
- * raw responses match byte-for-byte. Any drift is a non-determinism
122
- * bug (in the harness, the prompt builder, the sandbox, …).
123
- * 3. **Free judge calibration** — run two judges on identical responses
124
- * and measure inter-judge agreement without doubling LLM spend.
125
- *
126
- * The interface is deliberately fetch-shaped. Inject `createReplayFetch`
127
- * into `LlmClientOptions.fetch` and every `callLlm` transparently reads
128
- * from the cache instead of calling the network. No new code path through
129
- * the LLM client is needed; the cache hit is invisible to the runner.
130
- */
131
-
132
- declare class ReplayCacheMissError extends ReplayError {
133
- readonly url: string;
134
- readonly requestKey: string;
135
- constructor(url: string, requestKey: string, message?: string);
136
- }
137
- interface ReplayCacheEntry {
138
- request: RawProviderEvent;
139
- response: RawProviderEvent;
140
- }
141
- interface ReplayCacheStats {
142
- total: number;
143
- byProvider: Record<string, number>;
144
- byModel: Record<string, number>;
145
- /** Spans for which we have a request but no response (run aborted mid-call). */
146
- orphanRequests: number;
147
- }
148
- /**
149
- * In-memory deterministic cache of (request → response) keyed on a stable
150
- * hash of the request body. Built from a `RawProviderSink` containing
151
- * paired `request` and `response` events from a previous run.
152
- *
153
- * The cache is the source of truth for replay; `createReplayFetch` is a
154
- * thin wrapper that reads from it.
155
- */
156
- declare class ReplayCache {
157
- private byKey;
158
- private orphans;
159
- private byProvider;
160
- private byModel;
161
- /**
162
- * Build a cache from a sink's events. The sink must implement `list()`.
163
- * Filter by `runId` / `spanId` to scope to a specific replay.
164
- */
165
- static fromSink(sink: RawProviderSink, filter?: {
166
- runId?: string;
167
- spanId?: string;
168
- }): Promise<ReplayCache>;
169
- /** Build a cache from an in-memory event list. */
170
- static fromEvents(events: RawProviderEvent[]): Promise<ReplayCache>;
171
- /** Number of cacheable (request, response) pairs in the cache. */
172
- size(): number;
173
- stats(): ReplayCacheStats;
174
- /** Iterate every cached `(request, response)` pair in insertion order. */
175
- entries(): IterableIterator<ReplayCacheEntry>;
176
- /**
177
- * Look up a cached response by hashing the (model, messages, temperature,
178
- * maxTokens, response_format) shape. Returns `undefined` on miss; the
179
- * caller decides whether to throw, fall back to the network, or skip.
180
- */
181
- lookup(requestBody: unknown): Promise<ReplayCacheEntry | undefined>;
182
- }
183
- interface ReplayFetchOptions {
184
- /**
185
- * Behaviour on cache miss. Default `'throw'`. `'fallback'` calls the
186
- * `fallbackFetch` (typically `globalThis.fetch`) so a partial replay can
187
- * still complete; `'fail-closed'` returns a synthetic 599 response so the
188
- * call site sees a non-retriable failure.
189
- */
190
- onMiss?: 'throw' | 'fallback' | 'fail-closed';
191
- fallbackFetch?: typeof fetch;
192
- /** Optional callback fired once per replayed call (for telemetry / counters). */
193
- onHit?: (info: {
194
- url: string;
195
- provider: string;
196
- model: string;
197
- }) => void;
198
- /** Optional callback fired on cache miss before the `onMiss` policy applies. */
199
- onMissNotify?: (info: {
200
- url: string;
201
- requestBody: unknown;
202
- }) => void;
203
- }
204
- /**
205
- * Build a `fetch`-shaped function that serves cached responses out of a
206
- * `ReplayCache` for any URL ending in `/chat/completions`. Pass through
207
- * `LlmClientOptions.fetch` and `callLlm` becomes free.
208
- *
209
- * Non-`/chat/completions` URLs are passed straight to the fallback fetch
210
- * (default: `globalThis.fetch`). This matters because non-LLM HTTP work
211
- * (judge HTTP servers, sandbox callbacks) sometimes flows through the same
212
- * `fetch` and shouldn't be intercepted.
213
- */
214
- declare function createReplayFetch(cache: ReplayCache, opts?: ReplayFetchOptions): typeof fetch;
215
- /**
216
- * Convenience iterator over `(request, response)` pairs in a sink — for
217
- * post-hoc scoring that doesn't need a `fetch` shim. The judge or scorer
218
- * runs purely in-process over cached LLM outputs.
219
- */
220
- declare function iterateRawCalls(sink: RawProviderSink, filter?: {
221
- runId?: string;
222
- spanId?: string;
223
- }): AsyncGenerator<ReplayCacheEntry>;
224
-
225
- export { DEFAULT_REDACTION_RULES as D, OTEL_AGENT_EVAL_SCOPE as O, REDACTION_VERSION as R, type OtlpExport as a, type OtlpResourceSpans as b, type OtlpSpan as c, type RedactionReport as d, type RedactionRule as e, ReplayCache as f, type ReplayCacheEntry as g, ReplayCacheMissError as h, type ReplayCacheStats as i, type ReplayFetchOptions as j, createReplayFetch as k, exportRunAsOtlp as l, iterateRawCalls as m, redactValue as n, redactString as r };