@tangle-network/agent-eval 0.27.2 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/CHANGELOG.md +87 -0
  2. package/dist/{baseline-4R5deP0N.d.ts → baseline-BwdCXUS8.d.ts} +1 -1
  3. package/dist/builder-eval/index.d.ts +3 -3
  4. package/dist/chunk-UW4NOOZI.js +1561 -0
  5. package/dist/chunk-UW4NOOZI.js.map +1 -0
  6. package/dist/{control-BT4qnXiS.d.ts → control-rJhEDdpy.d.ts} +4 -4
  7. package/dist/{control-runtime-BZ_lVLYW.d.ts → control-runtime-BRdQ0wrx.d.ts} +2 -2
  8. package/dist/control.d.ts +5 -5
  9. package/dist/{emitter-DP_cSSiw.d.ts → emitter-BqjeOvJh.d.ts} +1 -1
  10. package/dist/{failure-cluster-Cw65_5FY.d.ts → failure-cluster-D1NZKqYu.d.ts} +1 -1
  11. package/dist/{feedback-trajectory-D1aGKusy.d.ts → feedback-trajectory-j0nJFgC6.d.ts} +1 -1
  12. package/dist/governance/index.d.ts +2 -2
  13. package/dist/{index-BhLlu-qO.d.ts → index-Cgt3DKXr.d.ts} +1 -1
  14. package/dist/index.d.ts +1190 -335
  15. package/dist/index.js +1580 -489
  16. package/dist/index.js.map +1 -1
  17. package/dist/{integrity-DK2EBVZC.d.ts → integrity-BAxLGJ9I.d.ts} +2 -2
  18. package/dist/knowledge/index.d.ts +3 -3
  19. package/dist/meta-eval/index.d.ts +1 -1
  20. package/dist/{multi-layer-verifier-U-c8ge1k.d.ts → multi-layer-verifier-BNi4-8lR.d.ts} +1 -1
  21. package/dist/openapi.json +1 -1
  22. package/dist/optimization.d.ts +8 -8
  23. package/dist/pipelines/index.d.ts +6 -6
  24. package/dist/prm/index.d.ts +4 -4
  25. package/dist/{query-DODUYdPg.d.ts → query-BFDT0kX_.d.ts} +1 -1
  26. package/dist/{release-report-CCQqnK46.d.ts → release-report-PWhGlpfO.d.ts} +1 -1
  27. package/dist/replay-BX5Fm8en.d.ts +529 -0
  28. package/dist/reporting.d.ts +4 -4
  29. package/dist/{researcher-G81CWc0q.d.ts → researcher-ClDX3KZx.d.ts} +5 -5
  30. package/dist/rl.d.ts +8 -8
  31. package/dist/{rubric-D5tjHNJQ.d.ts → rubric-DgSqjqqj.d.ts} +2 -2
  32. package/dist/{store-Db2Bv8Cf.d.ts → store-BP5be6s7.d.ts} +1 -1
  33. package/dist/{summary-report-Dl4akLKX.d.ts → summary-report-jrSGb2xZ.d.ts} +1 -1
  34. package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BJ54PDan.d.ts} +2 -2
  35. package/dist/traces.d.ts +9 -311
  36. package/dist/traces.js +15 -986
  37. package/dist/traces.js.map +1 -1
  38. package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-BFmveYZt.d.ts} +1 -1
  39. package/dist/wire/index.d.ts +4 -4
  40. package/package.json +1 -1
  41. package/dist/chunk-4U4BKCXK.js +0 -569
  42. package/dist/chunk-4U4BKCXK.js.map +0 -1
  43. package/dist/replay-D7z0J43-.d.ts +0 -225
package/CHANGELOG.md CHANGED
@@ -1,5 +1,92 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.29.0 — 2026-05-19
4
+
5
+ ### Analyst kinds + cross-run findings context
6
+
7
+ Builds on 0.28.0's analyst registry. Ships four trace-analyst **kinds**
8
+ that emit graded findings through native Ax structured output (no more
9
+ flat-defaulted bullet lists) and a cross-run findings context the
10
+ registry can inject into prompts so each kind sees what the prior run
11
+ already surfaced.
12
+
13
+ ### Added
14
+
15
+ - **`createTraceAnalystKind(spec, opts)`** (`src/analyst/kind-factory.ts`) —
16
+ turns a `TraceAnalystKindSpec` into a registry-ready
17
+ `Analyst<TraceAnalysisStore>`. Ax signature is
18
+ `'question:string -> findings:json[]'`; the Zod boundary in
19
+ `finding-signature.ts` rejects malformed rows instead of lifting them
20
+ with default severity. Supports `versionSuffix` for optimizer-fitted
21
+ prompts (MIPRO / GEPA / Bootstrap) and a per-row `postProcess` hook.
22
+ - **`RawAnalystFinding`** Zod schema + **`RAW_FINDING_SCHEMA_PROMPT`**
23
+ string embedded into kind actor prompts so the model and the parser
24
+ share one source of truth.
25
+ - **`TraceToolGroupName`** + **`buildTraceToolsForGroup`**
26
+ (`src/analyst/tool-groups.ts`) — five named tool subsets
27
+ (`all | discovery | discoveryAndRead | discoveryAndSearch | targeted`);
28
+ unknown group names throw.
29
+ - **Four shipping kinds** (`src/analyst/kinds/`):
30
+ - `FAILURE_MODE_KIND_SPEC` — clusters dataset failures into distinct
31
+ modes (maxDepth 3, parallel 4, all tools).
32
+ - `KNOWLEDGE_GAP_KIND_SPEC` — attributes missing/stale knowledge to
33
+ `agent-knowledge:wiki:*`, `websearch:outdated:*`, `tool-doc:*`,
34
+ `system-prompt:*`, `memory:*` (maxDepth 2, discoveryAndSearch).
35
+ - `KNOWLEDGE_POISONING_KIND_SPEC` — dual-verify analyst for
36
+ confident-but-wrong actions (maxDepth 2, all tools).
37
+ - `IMPROVEMENT_KIND_SPEC` — converts upstream failure / gap /
38
+ poisoning findings into concrete locus-named edits with leverage
39
+ grades (maxDepth 3, all tools).
40
+ - **`DEFAULT_TRACE_ANALYST_KINDS`** — the four specs in canonical run
41
+ order (failure-mode → gap → poisoning → improvement).
42
+ - **`priorFindings` on `AnalystContext`** — registry injects findings
43
+ from a prior `AnalystRunResult` into every analyst's context, so an
44
+ improvement-kind run can see the failure-mode findings the previous
45
+ pass surfaced. Kinds reference prior findings via
46
+ `evidence_uri: "finding://<id>"`.
47
+
48
+ ### Deprecated
49
+
50
+ - `createTraceAnalystAdapter` (`src/analyst/adapters.ts`) — the legacy
51
+ bullet-list lifter. Kept for one minor release while consumers migrate to
52
+ `createTraceAnalystKind`.
53
+
54
+ ## 0.28.0 — 2026-05-19
55
+
56
+ ### Analyst registry + findings envelope
57
+
58
+ A generic, model-agnostic orchestration layer over the existing
59
+ analyzers (`analyzeTraces`, `MultiLayerVerifier`, `RunCritic`,
60
+ `SemanticConceptJudge`, `JudgeFn`). One contract, one runner, one
61
+ persistence path. Reusable by VB operator bench, leaderboard submission
62
+ pipeline, and orchestrator on-completion reports with the same code.
63
+
64
+ ### Added
65
+
66
+ - **`Analyst<TInput>`** contract + **`AnalystFinding`** envelope with
67
+ sha-stable `finding_id` (`src/analyst/types.ts`).
68
+ - **`AnalystRegistry`** (`src/analyst/registry.ts`) — register/list/run
69
+ with input routing by `inputKind`, per-analyst isolation, equal-split
70
+ budget by default, per-analyst telemetry.
71
+ - **`AnalystHooks`** — `onBeforeAnalyze | onAfterAnalyze | onError |
72
+ onComplete`. Generic seam for telemetry, cost ingestion, rotation,
73
+ error → finding conversion.
74
+ - **`BudgetPolicy`** — `{ totalUsd, weights, allocate }`. Default
75
+ equal-split; weighted split or custom `allocate(args)` for precision.
76
+ - **`ChatClient`** abstraction (`src/analyst/chat-client.ts`) over
77
+ `router | sandbox-sdk | cli-bridge | direct-provider | mock` so
78
+ analyst code is transport-agnostic; `wrapLlmClient` races the call
79
+ against `ChatCallOpts.signal`.
80
+ - **`FindingsStore`** + **`diffFindings(prev, cur, { isMaterial })`**
81
+ (`src/analyst/findings-store.ts`) — locked JSONL persistence + cross-run
82
+ diff (appeared / disappeared / persisted / changed) with a pluggable
83
+ materiality predicate (`defaultIsMaterial` exported for layering).
84
+ - Five **adapter** factories (`src/analyst/adapters.ts`) that lift
85
+ existing primitives into the contract without re-implementing them:
86
+ `createTraceAnalystAdapter`, `createVerifierAdapter`,
87
+ `createRunCriticAdapter`, `createJudgeAdapter`,
88
+ `createSemanticConceptJudgeAdapter`.
89
+
3
90
  ## 0.27.2 — 2026-05-17
4
91
 
5
92
  ### Corpus-wide inter-rater agreement primitive
@@ -1,4 +1,4 @@
1
- import { T as TraceStore } from './store-Db2Bv8Cf.js';
1
+ import { T as TraceStore } from './store-BP5be6s7.js';
2
2
 
3
3
  /**
4
4
  * Tool-use metrics — derived purely from trace data.
@@ -1,6 +1,6 @@
1
- import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult, T as TestGradedScenario, b as TestGradedRunResult } from '../test-graded-scenario-B2kWEdh9.js';
2
- import { T as TraceEmitter } from '../emitter-DP_cSSiw.js';
3
- import { T as TraceStore, R as Run } from '../store-Db2Bv8Cf.js';
1
+ import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult, T as TestGradedScenario, b as TestGradedRunResult } from '../test-graded-scenario-BJ54PDan.js';
2
+ import { T as TraceEmitter } from '../emitter-BqjeOvJh.js';
3
+ import { T as TraceStore, R as Run } from '../store-BP5be6s7.js';
4
4
 
5
5
  /**
6
6
  * BuilderSession — ties a builder-of-builders workflow together.