@tangle-network/agent-eval 0.20.12 → 0.22.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (57)
  1. package/CHANGELOG.md +177 -0
  2. package/README.md +43 -1
  3. package/dist/{chunk-KWUAAIHR.js → chunk-4W4NCYM2.js} +182 -1
  4. package/dist/chunk-4W4NCYM2.js.map +1 -0
  5. package/dist/{chunk-PKCVBYTQ.js → chunk-5IIQKMD5.js} +38 -2
  6. package/dist/chunk-5IIQKMD5.js.map +1 -0
  7. package/dist/{chunk-HNJLMAJ2.js → chunk-6KQG5HAH.js} +2 -2
  8. package/dist/chunk-6M774GY6.js +53 -0
  9. package/dist/chunk-6M774GY6.js.map +1 -0
  10. package/dist/{chunk-MCMV7DUL.js → chunk-ARZ6BEV6.js} +2 -2
  11. package/dist/chunk-IOXMGMHQ.js +1226 -0
  12. package/dist/chunk-IOXMGMHQ.js.map +1 -0
  13. package/dist/{chunk-75MCTH7P.js → chunk-KAO3Q65R.js} +198 -3
  14. package/dist/chunk-KAO3Q65R.js.map +1 -0
  15. package/dist/chunk-QUKKGHTZ.js +121 -0
  16. package/dist/chunk-QUKKGHTZ.js.map +1 -0
  17. package/dist/chunk-SQQLHODJ.js +163 -0
  18. package/dist/chunk-SQQLHODJ.js.map +1 -0
  19. package/dist/{chunk-IKFVX537.js → chunk-UAND2LOT.js} +232 -211
  20. package/dist/chunk-UAND2LOT.js.map +1 -0
  21. package/dist/{chunk-HKYRWNHV.js → chunk-USHQBPMH.js} +283 -7
  22. package/dist/chunk-USHQBPMH.js.map +1 -0
  23. package/dist/cli.js +3 -2
  24. package/dist/cli.js.map +1 -1
  25. package/dist/{control-C8NKbF3w.d.ts → control-cxwMOAsy.d.ts} +3 -2
  26. package/dist/control.d.ts +4 -3
  27. package/dist/control.js +2 -2
  28. package/dist/emitter-B2XqDKFU.d.ts +121 -0
  29. package/dist/{feedback-trajectory-BGQ_ANCN.d.ts → feedback-trajectory-CB0A32o3.d.ts} +2 -1
  30. package/dist/index.d.ts +16 -302
  31. package/dist/index.js +70 -62
  32. package/dist/index.js.map +1 -1
  33. package/dist/integrity-K2oVlF57.d.ts +210 -0
  34. package/dist/openapi.json +1 -1
  35. package/dist/optimization-UVDNKaO6.d.ts +574 -0
  36. package/dist/optimization.d.ts +7 -144
  37. package/dist/optimization.js +9 -2
  38. package/dist/reporting-B82RSv9C.d.ts +593 -0
  39. package/dist/reporting.d.ts +5 -426
  40. package/dist/reporting.js +17 -6
  41. package/dist/{emitter-BYO2nSDA.d.ts → store-u47QaJ9G.d.ts} +1 -91
  42. package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
  43. package/dist/traces.d.ts +179 -3
  44. package/dist/traces.js +35 -4
  45. package/dist/wire/index.js +3 -2
  46. package/docs/research-report-methodology.md +170 -0
  47. package/docs/wire-protocol.md +1 -1
  48. package/package.json +11 -13
  49. package/dist/chunk-75MCTH7P.js.map +0 -1
  50. package/dist/chunk-HKYRWNHV.js.map +0 -1
  51. package/dist/chunk-IKFVX537.js.map +0 -1
  52. package/dist/chunk-KWUAAIHR.js.map +0 -1
  53. package/dist/chunk-ODFINDLQ.js +0 -413
  54. package/dist/chunk-ODFINDLQ.js.map +0 -1
  55. package/dist/chunk-PKCVBYTQ.js.map +0 -1
  56. /package/dist/{chunk-HNJLMAJ2.js.map → chunk-6KQG5HAH.js.map} +0 -0
  57. /package/dist/{chunk-MCMV7DUL.js.map → chunk-ARZ6BEV6.js.map} +0 -0
@@ -1,145 +1,8 @@
1
- import { G as GateDecision } from './multi-shot-optimization-Bvtz294B.js';
2
- export { A as ActionableSideInfo, b as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, e as GenerationReport, I as InMemoryTrialCache, h as MultiShotGateConfig, i as MultiShotGateResult, j as MultiShotMutateAdapter, k as MultiShotOptimizationConfig, l as MultiShotOptimizationResult, m as MultiShotRun, n as MultiShotRunInput, o as MultiShotRunner, p as MultiShotScore, q as MultiShotScorer, r as MultiShotSplit, s as MultiShotTrace, t as MultiShotTrialResult, u as MultiShotVariant, M as MutateAdapter, v as PromptEvolutionConfig, w as PromptEvolutionEvent, x as PromptEvolutionResult, R as ReflectionContext, y as ReflectionProposal, S as ScenarioAggregate, z as ScoreAdapter, T as TrialCache, a as TrialResult, B as TrialTrace, V as VariantAggregate, C as buildReflectionPrompt, J as defaultMultiShotObjectives, Q as parseReflectionResponse, U as runMultiShotOptimization, W as runPromptEvolution, Y as trialTraceFromMultiShotTrial } from './multi-shot-optimization-Bvtz294B.js';
3
- import { a as RunRecord } from './run-record-CX_jcAyr.js';
4
- export { n as FeedbackArtifactType, o as FeedbackAttempt, F as FeedbackLabel, p as FeedbackLabelKind, q as FeedbackLabelSource, r as FeedbackOptimizerRow, s as FeedbackOutcome, t as FeedbackReplayAdapter, u as FeedbackReplayResult, v as FeedbackSeverity, w as FeedbackSplitPolicy, x as FeedbackTask, b as FeedbackTrajectory, y as FeedbackTrajectoryFilter, a as FeedbackTrajectoryStore, z as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, A as ProposedSideEffect, D as assignFeedbackSplit, E as controlRunToFeedbackTrajectory, G as createFeedbackTrajectory, H as feedbackTrajectoriesToDatasetScenarios, J as feedbackTrajectoriesToOptimizerRows, K as feedbackTrajectoryToDatasetScenario, L as feedbackTrajectoryToOptimizerRow, N as parseFeedbackTrajectoriesJsonl, O as renderPreferenceMemoryMarkdown, Q as replayFeedbackTrajectories, R as replayFeedbackTrajectory, U as serializeFeedbackTrajectoriesJsonl, Y as summarizePreferenceMemory, Z as withAssignedFeedbackSplit } from './feedback-trajectory-BGQ_ANCN.js';
1
+ export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, y as runEvalCampaign } from './optimization-UVDNKaO6.js';
2
+ export { A as ActionableSideInfo, b as AsiSeverity, c as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, m as GenerationReport, I as InMemoryTrialCache, p as MultiShotGateConfig, q as MultiShotGateResult, r as MultiShotMutateAdapter, s as MultiShotOptimizationConfig, t as MultiShotOptimizationResult, u as MultiShotRun, v as MultiShotRunInput, w as MultiShotRunner, x as MultiShotScore, y as MultiShotScorer, z as MultiShotSplit, B as MultiShotTrace, C as MultiShotTrialResult, J as MultiShotVariant, M as MutateAdapter, N as PromptEvolutionConfig, Q as PromptEvolutionEvent, R as PromptEvolutionResult, U as ReflectionContext, W as ReflectionProposal, a1 as ScenarioAggregate, a2 as ScoreAdapter, T as TrialCache, a as TrialResult, a6 as TrialTrace, V as VariantAggregate, a7 as buildReflectionPrompt, aa as defaultMultiShotObjectives, ah as parseReflectionResponse, aj as runMultiShotOptimization, ak as runPromptEvolution, an as trialTraceFromMultiShotTrial } from './summary-report-D4p7RlDu.js';
3
+ export { n as FeedbackArtifactType, o as FeedbackAttempt, F as FeedbackLabel, p as FeedbackLabelKind, q as FeedbackLabelSource, r as FeedbackOptimizerRow, s as FeedbackOutcome, t as FeedbackReplayAdapter, u as FeedbackReplayResult, v as FeedbackSeverity, w as FeedbackSplitPolicy, x as FeedbackTask, b as FeedbackTrajectory, y as FeedbackTrajectoryFilter, a as FeedbackTrajectoryStore, z as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, A as ProposedSideEffect, D as assignFeedbackSplit, E as controlRunToFeedbackTrajectory, G as createFeedbackTrajectory, H as feedbackTrajectoriesToDatasetScenarios, J as feedbackTrajectoriesToOptimizerRows, K as feedbackTrajectoryToDatasetScenario, L as feedbackTrajectoryToOptimizerRow, N as parseFeedbackTrajectoriesJsonl, O as renderPreferenceMemoryMarkdown, Q as replayFeedbackTrajectories, R as replayFeedbackTrajectory, U as serializeFeedbackTrajectoriesJsonl, Y as summarizePreferenceMemory, Z as withAssignedFeedbackSplit } from './feedback-trajectory-CB0A32o3.js';
4
+ import './integrity-K2oVlF57.js';
5
+ import './store-u47QaJ9G.js';
6
+ import './emitter-B2XqDKFU.js';
7
+ import './run-record-CX_jcAyr.js';
5
8
  import './dataset-B9qvlm_o.js';
6
- import './emitter-BYO2nSDA.js';
7
-
8
- /**
9
- * Researcher interface — stable hook for an external autonomous-research
10
- * agent to drive the meta-loop.
11
- *
12
- * Implementations live downstream (typically in a private repo that
13
- * runs the actual LLM). This package ships only the contract + a
14
- * `NoopResearcher` so consumers can wire the surface without being
15
- * forced to implement every method up front.
16
- *
17
- * The four methods mirror the four stages of the paper "Two Loops,
18
- * Three Roles":
19
- *
20
- * inspectFailures — given the observed runs, what failure modes
21
- * are present? (data → diagnosis)
22
- * proposeChange — given diagnosed failure modes, what
23
- * structural changes should we try?
24
- * (diagnosis → plan delta)
25
- * applyChange — fold the proposed deltas into a concrete
26
- * experiment plan against an existing baseline.
27
- * (plan delta → executable plan)
28
- * evaluateChange — run the plan, return runs + the gate verdict.
29
- * (executable plan → verdict)
30
- *
31
- * Composition is the discipline: a Researcher implementation MUST
32
- * keep these four steps separate and inspectable. Conflating
33
- * "diagnose + propose + run" into a single LLM call defeats the
34
- * point of the framework — you can't audit which step lied.
35
- *
36
- * THIS INTERFACE IS STABLE. Breaking changes require a new module
37
- * (e.g. `Researcher2`) so existing implementations keep working.
38
- */
39
-
40
- /** A diagnosed failure mode with the run-IDs that exhibit it. */
41
- interface FailureMode {
42
- /** Short machine-readable code. Must be stable across runs of the
43
- * same researcher to enable longitudinal tracking. */
44
- code: string;
45
- /** Human-readable description for the paper / dashboard. */
46
- description: string;
47
- evidence: {
48
- /** Run IDs (from `RunRecord.runId`) where this failure mode was
49
- * observed. */
50
- runIds: string[];
51
- /** Number of run samples that informed the diagnosis. */
52
- samples: number;
53
- };
54
- }
55
- /** A single steering change the researcher wants to try. */
56
- interface SteeringChange {
57
- kind: 'reviewer_prompt' | 'skill_add' | 'skill_remove' | 'threshold' | 'budget';
58
- /** Implementation-specific payload. Researcher implementations
59
- * define the schema — keep this `unknown` here to avoid coupling
60
- * the public interface to any one researcher's internal model. */
61
- payload: unknown;
62
- /** Why the researcher proposed this change. Goes into the audit
63
- * trail next to the failure-mode evidence. */
64
- rationale: string;
65
- /** Optional self-reported expected delta on the headline metric. */
66
- expectedDelta?: number;
67
- }
68
- /** A single experiment plan, mapped onto the search/holdout splits. */
69
- interface ExperimentPlan {
70
- baselineCandidateId: string;
71
- proposedCandidateId: string;
72
- changes: SteeringChange[];
73
- /** USD ceiling for the entire experiment. The runner must stop
74
- * before exceeding this and report a partial result. */
75
- evaluationBudgetUsd: number;
76
- /** Item IDs (your dataset keys) for the search vs holdout splits. */
77
- splits: {
78
- search: string[];
79
- holdout: string[];
80
- };
81
- }
82
- /** Result of running a plan: every run, plus the gate verdict. */
83
- interface ExperimentResult {
84
- plan: ExperimentPlan;
85
- runs: RunRecord[];
86
- gateDecision: GateDecision;
87
- }
88
- /**
89
- * The researcher loop. Stable, four-step, inspectable.
90
- *
91
- * ┌──────────┐ inspectFailures ┌──────────┐ proposeChange ┌──────────┐
92
- * │ runs │ ─────────────────▶│ failures │ ──────────────▶│ changes │
93
- * └──────────┘ └──────────┘ └────┬─────┘
94
- * │
95
- * ▼
96
- * ┌────────────────┐ applyChange ┌────────┐
97
- * │ ExperimentPlan │ ◀────────────│ base │
98
- * └────────┬───────┘ └────────┘
99
- * │
100
- * evaluateChange ▼
101
- * ┌────────────────┐
102
- * │ ExperimentResult│
103
- * └────────────────┘
104
- */
105
- interface Researcher {
106
- inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
107
- proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
108
- applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
109
- evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
110
- }
111
- interface CallbackResearcherOptions {
112
- inspectFailures: Researcher['inspectFailures'];
113
- proposeChange: Researcher['proposeChange'];
114
- applyChange: Researcher['applyChange'];
115
- evaluateChange: Researcher['evaluateChange'];
116
- }
117
- /**
118
- * Minimal concrete researcher for tests, scripts, and small integrations.
119
- * Larger autonomous researchers can still implement `Researcher` directly.
120
- */
121
- declare class CallbackResearcher implements Researcher {
122
- private readonly callbacks;
123
- constructor(callbacks: CallbackResearcherOptions);
124
- inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
125
- proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
126
- applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
127
- evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
128
- }
129
- /**
130
- * No-op researcher — fails loud on every method. Use as a placeholder
131
- * in code paths that wire the interface but don't have an implementation
132
- * yet. Importantly, this does NOT silently succeed: a no-op researcher
133
- * that returned empty arrays would muffle the loop's signal that
134
- * nobody implemented the brain.
135
- */
136
- declare class NoopResearcher implements Researcher {
137
- private readonly hint;
138
- constructor(hint?: string);
139
- inspectFailures(_runs: RunRecord[]): Promise<FailureMode[]>;
140
- proposeChange(_failures: FailureMode[]): Promise<SteeringChange[]>;
141
- applyChange(_changes: SteeringChange[], _baseline: ExperimentPlan): Promise<ExperimentPlan>;
142
- evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
143
- }
144
-
145
- export { CallbackResearcher, type CallbackResearcherOptions, type ExperimentPlan, type ExperimentResult, type FailureMode, NoopResearcher, type Researcher, type SteeringChange };
@@ -19,15 +19,21 @@ import {
19
19
  renderPreferenceMemoryMarkdown,
20
20
  replayFeedbackTrajectories,
21
21
  replayFeedbackTrajectory,
22
+ runEvalCampaign,
22
23
  runMultiShotOptimization,
23
24
  runPromptEvolution,
24
25
  serializeFeedbackTrajectoriesJsonl,
25
26
  summarizePreferenceMemory,
26
27
  trialTraceFromMultiShotTrial,
27
28
  withAssignedFeedbackSplit
28
- } from "./chunk-HKYRWNHV.js";
29
+ } from "./chunk-USHQBPMH.js";
29
30
  import "./chunk-YUFXO3TU.js";
30
- import "./chunk-ODFINDLQ.js";
31
+ import "./chunk-IOXMGMHQ.js";
32
+ import "./chunk-QUKKGHTZ.js";
33
+ import "./chunk-5IIQKMD5.js";
34
+ import "./chunk-6M774GY6.js";
35
+ import "./chunk-KAO3Q65R.js";
36
+ import "./chunk-SQQLHODJ.js";
31
37
  import "./chunk-PZ5AY32C.js";
32
38
  export {
33
39
  CallbackResearcher,
@@ -50,6 +56,7 @@ export {
50
56
  renderPreferenceMemoryMarkdown,
51
57
  replayFeedbackTrajectories,
52
58
  replayFeedbackTrajectory,
59
+ runEvalCampaign,
53
60
  runMultiShotOptimization,
54
61
  runPromptEvolution,
55
62
  serializeFeedbackTrajectoriesJsonl,