@tangle-network/agent-eval 0.72.0 → 0.72.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/CHANGELOG.md +39 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/otel.d.ts +3 -2
  5. package/dist/agent-profile-DYRboYWu.d.ts +364 -0
  6. package/dist/analyst/index.d.ts +221 -0
  7. package/dist/analyst/index.js +371 -0
  8. package/dist/analyst/index.js.map +1 -0
  9. package/dist/analyst-t7zZS3TV.d.ts +88 -0
  10. package/dist/campaign/index.d.ts +518 -9
  11. package/dist/campaign/index.js +672 -22
  12. package/dist/campaign/index.js.map +1 -1
  13. package/dist/chunk-7W4SM7FD.js +1075 -0
  14. package/dist/chunk-7W4SM7FD.js.map +1 -0
  15. package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
  16. package/dist/chunk-JHA3ZGSO.js +1496 -0
  17. package/dist/chunk-JHA3ZGSO.js.map +1 -0
  18. package/dist/{chunk-4QJN7RDX.js → chunk-JYE3WOTE.js} +55 -7
  19. package/dist/{chunk-4QJN7RDX.js.map → chunk-JYE3WOTE.js.map} +1 -1
  20. package/dist/chunk-LB2UOI5F.js +412 -0
  21. package/dist/chunk-LB2UOI5F.js.map +1 -0
  22. package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
  23. package/dist/chunk-VUINJM5M.js.map +1 -0
  24. package/dist/chunk-WYIHD6EB.js +1044 -0
  25. package/dist/chunk-WYIHD6EB.js.map +1 -0
  26. package/dist/{chunk-UD6EF73X.js → chunk-XPILG2CA.js} +119 -2
  27. package/dist/chunk-XPILG2CA.js.map +1 -0
  28. package/dist/contract/index.d.ts +17 -13
  29. package/dist/contract/index.js +13 -7
  30. package/dist/contract/index.js.map +1 -1
  31. package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
  32. package/dist/control.d.ts +2 -2
  33. package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
  34. package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
  35. package/dist/hosted/index.d.ts +223 -2
  36. package/dist/index.d.ts +49 -1323
  37. package/dist/index.js +353 -2496
  38. package/dist/index.js.map +1 -1
  39. package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
  40. package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
  41. package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
  42. package/dist/openapi.json +1 -1
  43. package/dist/pareto-E-pembql.d.ts +81 -0
  44. package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
  45. package/dist/redact-B40YG2M_.d.ts +45 -0
  46. package/dist/registry-DuVYiTvw.d.ts +128 -0
  47. package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
  48. package/dist/rl.d.ts +4 -3
  49. package/dist/rl.js +4 -4
  50. package/dist/run-critic-BAIjX99r.d.ts +56 -0
  51. package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
  52. package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
  53. package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
  54. package/dist/traces.d.ts +371 -308
  55. package/dist/traces.js +43 -18
  56. package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
  57. package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
  58. package/dist/wire/index.d.ts +1 -1
  59. package/dist/workflow/index.d.ts +494 -0
  60. package/dist/workflow/index.js +2177 -0
  61. package/dist/workflow/index.js.map +1 -0
  62. package/docs/design/self-improvement-roadmap.md +106 -0
  63. package/package.json +36 -12
  64. package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
  65. package/dist/chunk-ODGETRTM.js.map +0 -1
  66. package/dist/chunk-SL55X4VN.js +0 -186
  67. package/dist/chunk-SL55X4VN.js.map +0 -1
  68. package/dist/chunk-UD6EF73X.js.map +0 -1
  69. /package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
package/dist/traces.js CHANGED
@@ -1,20 +1,9 @@
1
1
  import {
2
- DEFAULT_TRACE_ANALYST_BUDGETS,
3
2
  FileSystemTraceStore,
4
3
  InMemoryTraceStore,
5
4
  OTEL_AGENT_EVAL_SCOPE,
6
- OtlpFileTraceStore,
7
5
  ReplayCache,
8
6
  ReplayCacheMissError,
9
- SpanNotFoundError,
10
- TRACE_ANALYST_ACTOR_DESCRIPTION,
11
- TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
12
- TRACE_ANALYST_SUBAGENT_DESCRIPTION,
13
- TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
14
- TraceFileMissingError,
15
- TraceNotFoundError,
16
- analyzeTraces,
17
- buildTraceAnalystTools,
18
7
  buildTraceInsightContext,
19
8
  buildTraceInsightPrompt,
20
9
  captureFetchToRawSink,
@@ -29,12 +18,13 @@ import {
29
18
  inferDomainKeywords,
30
19
  iterateRawCalls,
31
20
  otelRunCompleteHook,
21
+ otlpToRunRecords,
22
+ otlpToTraceRunRecords,
32
23
  planTraceInsightQuestions,
33
24
  scoreTraceInsightReadiness,
34
25
  tokenizeDomainWords,
35
- traceAnalystFunctionGroup,
36
26
  traceAnalystOnRunComplete
37
- } from "./chunk-ODGETRTM.js";
27
+ } from "./chunk-JHA3ZGSO.js";
38
28
  import {
39
29
  DEFAULT_REDACTION_RULES,
40
30
  REDACTION_VERSION,
@@ -60,16 +50,34 @@ import {
60
50
  isSandboxSpan,
61
51
  isToolSpan
62
52
  } from "./chunk-5BKGXME7.js";
53
+ import {
54
+ DEFAULT_TRACE_ANALYST_BUDGETS,
55
+ OtlpFileTraceStore,
56
+ SpanNotFoundError,
57
+ TRACE_ANALYST_ACTOR_DESCRIPTION,
58
+ TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
59
+ TRACE_ANALYST_SUBAGENT_DESCRIPTION,
60
+ TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
61
+ TraceFileMissingError,
62
+ TraceNotFoundError,
63
+ analyzeTraces,
64
+ asNumber,
65
+ asString,
66
+ buildTraceAnalystTools,
67
+ extractOtlpAttributes,
68
+ firstNumberAttr,
69
+ firstStringAttr,
70
+ inferOtlpKind,
71
+ projectOtlpFlatLine,
72
+ readOtlpStatus,
73
+ stringField,
74
+ traceAnalystFunctionGroup
75
+ } from "./chunk-VUINJM5M.js";
63
76
  import {
64
77
  RunIntegrityError,
65
78
  assertRunCaptured,
66
79
  throwIfRunIncomplete
67
80
  } from "./chunk-SBCB6VZY.js";
68
- import {
69
- TraceEmitter,
70
- llmSpanFromProvider
71
- } from "./chunk-TVVP3ZZQ.js";
72
- import "./chunk-VSMTAMNK.js";
73
81
  import {
74
82
  FileSystemRawProviderSink,
75
83
  InMemoryRawProviderSink,
@@ -77,6 +85,12 @@ import {
77
85
  defaultProviderRedactor,
78
86
  providerFromBaseUrl
79
87
  } from "./chunk-PC4UYEBM.js";
88
+ import "./chunk-F3SRAAZO.js";
89
+ import {
90
+ TraceEmitter,
91
+ llmSpanFromProvider
92
+ } from "./chunk-TVVP3ZZQ.js";
93
+ import "./chunk-VSMTAMNK.js";
80
94
  import "./chunk-3BFEG2F6.js";
81
95
  import "./chunk-PZ5AY32C.js";
82
96
  export {
@@ -106,6 +120,8 @@ export {
106
120
  aggregateLlm,
107
121
  analyzeTraces,
108
122
  argHash,
123
+ asNumber,
124
+ asString,
109
125
  assertRunCaptured,
110
126
  buildTraceAnalystTools,
111
127
  buildTraceInsightContext,
@@ -119,9 +135,13 @@ export {
119
135
  describeTraceInsightScope,
120
136
  domainEvidencePattern,
121
137
  exportRunAsOtlp,
138
+ extractOtlpAttributes,
139
+ firstNumberAttr,
140
+ firstStringAttr,
122
141
  flattenOtlpExportToNdjson,
123
142
  groupBy,
124
143
  inferDomainKeywords,
144
+ inferOtlpKind,
125
145
  isJudgeSpan,
126
146
  isLlmSpan,
127
147
  isRetrievalSpan,
@@ -132,13 +152,18 @@ export {
132
152
  llmSpanFromProvider,
133
153
  llmSpans,
134
154
  otelRunCompleteHook,
155
+ otlpToRunRecords,
156
+ otlpToTraceRunRecords,
135
157
  planTraceInsightQuestions,
158
+ projectOtlpFlatLine,
136
159
  providerFromBaseUrl,
160
+ readOtlpStatus,
137
161
  redactString,
138
162
  redactValue,
139
163
  runFailureClass,
140
164
  runsForScenario,
141
165
  scoreTraceInsightReadiness,
166
+ stringField,
142
167
  throwIfRunIncomplete,
143
168
  tokenizeDomainWords,
144
169
  toolSpans,
@@ -489,4 +489,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
489
489
  scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
490
490
  }
491
491
 
492
- export { isProposedCandidate as A, labelTrustRank as B, type CampaignAggregates as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ParetoParent as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type CampaignArtifactWriter as c, type CampaignCellResult as d, type CampaignCostMeter as e, type CampaignResult as f, type CampaignTraceWriter as g, type CodeSurface as h, type GateContext as i, type GateDecision as j, type GateResult as k, type GenerationCandidate as l, type GenerationRecord as m, type JudgeDimension as n, type Mutator as o, type SessionScript as p, type LabeledScenarioWrite as q, type LabeledScenarioSampleArgs as r, type LabeledScenarioRecord as s, type LabelTrust as t, type LabeledScenarioSource as u, type CampaignTokenUsage as v, type JudgeAggregate as w, type ProposeContext as x, type ProposedCandidate as y, type ScenarioAggregate as z };
492
+ export { isProposedCandidate as A, labelTrustRank as B, type CampaignAggregates as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ParetoParent as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type CampaignArtifactWriter as c, type CampaignCellResult as d, type CampaignCostMeter as e, type CampaignResult as f, type CampaignTraceWriter as g, type CodeSurface as h, type GateContext as i, type GateDecision as j, type GateResult as k, type GenerationCandidate as l, type GenerationRecord as m, type JudgeDimension as n, type Mutator as o, type SessionScript as p, type ProposeContext as q, type LabeledScenarioWrite as r, type LabeledScenarioSampleArgs as s, type LabeledScenarioRecord as t, type LabelTrust as u, type LabeledScenarioSource as v, type CampaignTokenUsage as w, type JudgeAggregate as x, type ProposedCandidate as y, type ScenarioAggregate as z };
@@ -1,7 +1,7 @@
1
- import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-DbjLfz-K.js';
2
1
  import { R as RunRecord } from './run-record-BgTFzO2r.js';
3
- import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
2
+ import { T as TraceAnalysisStore } from './store-GmBE2pZZ.js';
4
3
  import { a as JudgeInput } from './types-Croy5h7V.js';
4
+ import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-DbjLfz-K.js';
5
5
 
6
6
  /**
7
7
  * ChatClient — the single LLM abstraction analysts call.
@@ -329,129 +329,4 @@ type AnalystRunEvent = {
329
329
  result: AnalystRunResult;
330
330
  };
331
331
 
332
- /**
333
- * AnalystRegistry — orchestrate N analysts against one run.
334
- *
335
- * Owns three responsibilities and only three:
336
- * 1. Registration — ids must be unique; bad registrations fail loudly
337
- * at register-time, not run-time.
338
- * 2. Routing — each analyst declares its `inputKind`; the registry
339
- * picks the matching field from AnalystRunInputs and skips the
340
- * analyst with a logged reason if it's missing.
341
- * 3. Isolation — one analyst's exception MUST NOT stop other analysts.
342
- * Failed analysts produce zero findings + a 'failed' summary row.
343
- *
344
- * Cross-cutting concerns (telemetry, error → finding conversion, cost
345
- * ingestion, storage rotation) live in `AnalystHooks`. Budget shaping
346
- * (equal split vs weighted vs custom) lives in `BudgetPolicy`. Both
347
- * have sensible defaults; consumers override only what they need.
348
- */
349
-
350
- interface AnalystHooks {
351
- /** Before analyze() — last chance to mutate ctx (e.g. inject tags, override budget). */
352
- onBeforeAnalyze?(args: {
353
- analyst: Analyst;
354
- ctx: AnalystContext;
355
- runId: string;
356
- }): void | Promise<void>;
357
- /** After every analyst (ok | failed | skipped). Use for telemetry, ingestion, rotation. */
358
- onAfterAnalyze?(args: {
359
- analyst: Analyst;
360
- summary: AnalystRunSummary;
361
- findings: AnalystFinding[];
362
- runId: string;
363
- }): void | Promise<void>;
364
- /**
365
- * On analyst exception. Hook MAY return findings to convert the
366
- * error into structured findings; the summary still reports 'failed'.
367
- * Return void to keep the default empty-findings behavior.
368
- */
369
- onError?(args: {
370
- analyst: Analyst;
371
- error: Error;
372
- runId: string;
373
- }): AnalystFinding[] | undefined | Promise<AnalystFinding[] | undefined>;
374
- /** Once after registry.run() completes. Use for final aggregation, persistence. */
375
- onComplete?(args: {
376
- result: AnalystRunResult;
377
- }): void | Promise<void>;
378
- }
379
- interface BudgetPolicy {
380
- /** Overall USD cap across the registry.run(). */
381
- totalUsd?: number;
382
- /** Per-analyst weight for the default allocator. Missing ids get weight 1. */
383
- weights?: Record<string, number>;
384
- /**
385
- * Custom allocator — receives the analyst, remaining/total budget, and
386
- * the count of analysts that will run. Returns the per-analyst budget
387
- * (or undefined to leave it uncapped). Overrides weights when set.
388
- */
389
- allocate?: (args: {
390
- analyst: Analyst;
391
- totalUsd: number | undefined;
392
- remainingUsd: number | undefined;
393
- runningCount: number;
394
- }) => number | undefined;
395
- }
396
- interface AnalystRegistryOptions {
397
- /** Shared chat client passed to every LLM analyst via AnalystContext. */
398
- chat?: ChatClient;
399
- /** Logger callback. Defaults to a no-op. */
400
- log?: (msg: string, fields?: Record<string, unknown>) => void;
401
- /** Hooks invoked around analyze() — observability + customization seam. */
402
- hooks?: AnalystHooks;
403
- /** Default budget when run() doesn't override. */
404
- defaultBudget?: BudgetPolicy;
405
- }
406
- interface RegistryRunOpts {
407
- /** Restrict to a subset of registered analysts by id. */
408
- only?: string[];
409
- /** Skip these analysts even if registered. Useful for cheap iteration. */
410
- skip?: string[];
411
- /** Budget policy — totalUsd + optional weights/allocator. Falls back to options.defaultBudget. */
412
- budget?: BudgetPolicy;
413
- /** Wall-clock cap. Analysts SHOULD honor `ctx.deadlineMs`. */
414
- timeoutMs?: number;
415
- /** Abort signal — forwarded into every analyst's context. */
416
- signal?: AbortSignal;
417
- /** Tags echoed into AnalystContext.tags — useful for tracking environment/version in findings. */
418
- tags?: Record<string, string>;
419
- /**
420
- * Prior-run findings made available as retrieval context to every
421
- * analyst via `ctx.priorFindings`. The registry forwards the slice
422
- * whose `analyst_id` matches each registered analyst so a kind sees
423
- * only its own history. Pass `{ '*': findings }` to broadcast to
424
- * every analyst (useful for cross-kind chaining where the improvement
425
- * analyst consumes upstream failure findings).
426
- */
427
- priorFindings?: ReadonlyArray<AnalystFinding> | Record<string, ReadonlyArray<AnalystFinding>>;
428
- }
429
- declare class AnalystRegistry {
430
- private readonly analysts;
431
- private readonly options;
432
- constructor(options?: AnalystRegistryOptions);
433
- register(analyst: Analyst): void;
434
- list(): ReadonlyArray<{
435
- id: string;
436
- description: string;
437
- version: string;
438
- cost: Analyst['cost'];
439
- }>;
440
- run(runId: string, inputs: AnalystRunInputs, runOpts?: RegistryRunOpts): Promise<AnalystRunResult>;
441
- /**
442
- * Streaming counterpart to `run()`. Emits `AnalystRunEvent` values
443
- * in real time — `run-started`, then per-analyst `skipped` /
444
- * `started` / `completed`, then a terminal `run-completed` whose
445
- * payload is the full `AnalystRunResult`. UIs use this to render
446
- * progress; persistence consumers use `run()` and read the result.
447
- *
448
- * Hooks (`onBeforeAnalyze` / `onAfterAnalyze` / `onError` /
449
- * `onComplete`) fire as before — streaming is additive, not a hook
450
- * replacement.
451
- */
452
- runStream(runId: string, inputs: AnalystRunInputs, runOpts?: RegistryRunOpts): AsyncGenerator<AnalystRunEvent, void, void>;
453
- private selectAnalysts;
454
- private routeInput;
455
- }
456
-
457
- export { AnalystRegistry as A, type BudgetPolicy as B, type ChatRequest as C, type DirectProviderTransportOpts as D, type EvidenceRef as E, type MockTransportOpts as M, type RegistryRunOpts as R, type SandboxSdkTransportOpts as S, type Analyst as a, type AnalystSeverity as b, type AnalystFinding as c, type AnalystCost as d, type AnalystContext as e, type CreateChatClientOpts as f, type AnalystHooks as g, type AnalystInputKind as h, type AnalystRegistryOptions as i, type AnalystRequirements as j, type AnalystRunEvent as k, type AnalystRunInputs as l, type AnalystRunResult as m, type AnalystRunSummary as n, type ChatCallOpts as o, type ChatClient as p, type ChatResponse as q, type ChatTransport as r, type CliBridgeTransportOpts as s, type RouterTransportOpts as t, computeFindingId as u, createChatClient as v, makeFinding as w };
332
+ export { type Analyst as A, type ChatClient as C, type DirectProviderTransportOpts as D, type EvidenceRef as E, type MockTransportOpts as M, type RouterTransportOpts as R, type SandboxSdkTransportOpts as S, type AnalystContext as a, type AnalystRunSummary as b, type AnalystFinding as c, type AnalystRunResult as d, type AnalystRunInputs as e, type AnalystRunEvent as f, type AnalystCost as g, type AnalystSeverity as h, type AnalystInputKind as i, type AnalystRequirements as j, type ChatCallOpts as k, type ChatRequest as l, type ChatResponse as m, type ChatTransport as n, type CliBridgeTransportOpts as o, type CreateChatClientOpts as p, computeFindingId as q, createChatClient as r, makeFinding as s };
@@ -1,4 +1,4 @@
1
- import { F as FeedbackTrajectoryStore } from '../feedback-trajectory-8hKC5EOb.js';
1
+ import { F as FeedbackTrajectoryStore } from '../feedback-trajectory-B3rErRsh.js';
2
2
  import { T as TraceStore } from '../store-CKUAgsJz.js';
3
3
  import { z } from 'zod';
4
4
  import { OpenAPIObject } from 'openapi3-ts/oas31';