@tangle-network/agent-eval 0.49.0 → 0.50.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +135 -0
- package/README.md +235 -331
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +8 -2
- package/dist/campaign/index.d.ts +3 -3
- package/dist/{chunk-PD3MH6WU.js → chunk-5KSDYBYH.js} +2 -2
- package/dist/{chunk-MNL6LXGQ.js → chunk-EGIPWXHL.js} +2 -98
- package/dist/chunk-EGIPWXHL.js.map +1 -0
- package/dist/{chunk-OYI6RZJK.js → chunk-FQK2CCIM.js} +1 -1
- package/dist/chunk-FQK2CCIM.js.map +1 -0
- package/dist/chunk-MAZ26DC7.js +99 -0
- package/dist/chunk-MAZ26DC7.js.map +1 -0
- package/dist/chunk-SHTXZ4O2.js +113 -0
- package/dist/chunk-SHTXZ4O2.js.map +1 -0
- package/dist/{chunk-KQ26DYTQ.js → chunk-UBQGWD3O.js} +2 -2
- package/dist/contract/index.d.ts +206 -9
- package/dist/contract/index.js +751 -3
- package/dist/contract/index.js.map +1 -1
- package/dist/governance/index.d.ts +1 -1
- package/dist/hosted/index.d.ts +8 -192
- package/dist/hosted/index.js +1 -1
- package/dist/index-BRxz6qov.d.ts +409 -0
- package/dist/index.d.ts +18 -462
- package/dist/index.js +14 -106
- package/dist/index.js.map +1 -1
- package/dist/meta-eval/index.d.ts +3 -3
- package/dist/openapi.json +1 -1
- package/dist/{outcome-store-BxJ3DQKJ.d.ts → outcome-store-D6KWmYvj.d.ts} +1 -1
- package/dist/registry-8KAs18kY.d.ts +457 -0
- package/dist/{release-report-DBB8lB1P.d.ts → release-report-DSu0DWy8.d.ts} +3 -296
- package/dist/reporting.d.ts +6 -4
- package/dist/reporting.js +6 -4
- package/dist/{researcher-CHMO56K0.d.ts → researcher-LZD0qHEa.d.ts} +1 -1
- package/dist/rl.d.ts +9 -8
- package/dist/rl.js +3 -2
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-predictive-validity-CJ08tGwq.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
- package/dist/{run-improvement-loop-B-L8GgpW.d.ts → run-improvement-loop-BPMjNKMJ.d.ts} +2 -2
- package/dist/sequential-5iSVfzl2.d.ts +139 -0
- package/dist/store-CJbzDxZ2.d.ts +220 -0
- package/dist/{sequential-CbFH___X.d.ts → summary-report-B7gNRX-r.d.ts} +1 -139
- package/dist/traces.d.ts +3 -220
- package/dist/{types-8u72Gc76.d.ts → types-Dbj5gu8n.d.ts} +1 -1
- package/dist/types-DhqpAi_z.d.ts +296 -0
- package/docs/concepts.md +20 -0
- package/docs/customer-journeys.md +208 -0
- package/docs/insight-report.md +337 -0
- package/package.json +1 -1
- package/dist/chunk-MNL6LXGQ.js.map +0 -1
- package/dist/chunk-OYI6RZJK.js.map +0 -1
- /package/dist/{chunk-PD3MH6WU.js.map → chunk-5KSDYBYH.js.map} +0 -0
- /package/dist/{chunk-KQ26DYTQ.js.map → chunk-UBQGWD3O.js.map} +0 -0
package/dist/index.d.ts
CHANGED
|
@@ -2,16 +2,20 @@ export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunT
|
|
|
2
2
|
import { R as RunRecord } from './run-record-BGY6bHRh.js';
|
|
3
3
|
export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, a as RunSplitTag, b as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-BGY6bHRh.js';
|
|
4
4
|
import { AxAIService, AxFunction } from '@ax-llm/ax';
|
|
5
|
-
import { d as Severity, M as MultiLayerVerifier, e as VerifyOptions, L as Layer, f as LayerResult, g as VerifyContext } from './researcher-CHMO56K0.js';
|
|
6
|
-
export { C as CallbackResearcher, h as CallbackResearcherOptions, i as CampaignFactoryParams, j as CampaignIntegrityPolicy, k as CampaignRunContext, l as CampaignRunOutcome, m as CampaignRunner, n as CampaignScenario, o as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, p as FailedRun, F as FailureMode, q as Finding, s as LayerStatus, N as NoopResearcher, R as Researcher, S as SteeringChange, V as VerificationReport, t as gradeSemanticStatus, r as runEvalCampaign } from './researcher-CHMO56K0.js';
|
|
5
|
+
import { d as Severity, M as MultiLayerVerifier, e as VerifyOptions, L as Layer, f as LayerResult, g as VerifyContext } from './researcher-LZD0qHEa.js';
|
|
6
|
+
export { C as CallbackResearcher, h as CallbackResearcherOptions, i as CampaignFactoryParams, j as CampaignIntegrityPolicy, k as CampaignRunContext, l as CampaignRunOutcome, m as CampaignRunner, n as CampaignScenario, o as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, p as FailedRun, F as FailureMode, q as Finding, s as LayerStatus, N as NoopResearcher, R as Researcher, S as SteeringChange, V as VerificationReport, t as gradeSemanticStatus, r as runEvalCampaign } from './researcher-LZD0qHEa.js';
|
|
7
7
|
import { R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, T as TraceStore, g as BudgetSpec, h as RunFilter, L as LlmSpan } from './store-Db2Bv8Cf.js';
|
|
8
8
|
export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, M as Message, e as RetrievalSpan, m as RunLayer, n as RunStatus, f as SandboxSpan, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
|
|
9
|
-
import { L as LlmClientOptions
|
|
10
|
-
export { d as LlmCallError, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-BXVRUZyX.js';
|
|
11
|
-
import {
|
|
12
|
-
export { AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES,
|
|
13
|
-
import {
|
|
14
|
-
export {
|
|
9
|
+
import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
|
|
10
|
+
export { d as LlmCallError, b as LlmCallRequest, c as LlmCallResult, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-BXVRUZyX.js';
|
|
11
|
+
import { AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInput, AnalyzeTracesResult } from './traces.js';
|
|
12
|
+
export { AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, ExportableSpan, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAnalystHookOptions, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
|
|
13
|
+
import { T as TraceAnalysisStore } from './store-CJbzDxZ2.js';
|
|
14
|
+
export { D as DEFAULT_TRACE_ANALYST_BUDGETS, a as DatasetOverview, Q as QueryTracesPage, S as SearchSpanResult, b as SearchTraceResult, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, f as TraceAnalystFilters, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, V as ViewSpansResult, k as ViewTraceOversized, l as ViewTraceResult } from './store-CJbzDxZ2.js';
|
|
15
|
+
import { b as JudgeFn, J as JudgeInput, B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-DhqpAi_z.js';
|
|
16
|
+
export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, m as JudgeRubric, a as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-DhqpAi_z.js';
|
|
17
|
+
import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext } from './registry-8KAs18kY.js';
|
|
18
|
+
export { f as AnalystHooks, g as AnalystInputKind, A as AnalystRegistry, h as AnalystRegistryOptions, i as AnalystRequirements, j as AnalystRunEvent, k as AnalystRunInputs, l as AnalystRunResult, m as AnalystRunSummary, B as BudgetPolicy, C as ChatCallOpts, n as ChatClient, o as ChatRequest, p as ChatResponse, q as ChatTransport, r as CliBridgeTransportOpts, s as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-8KAs18kY.js';
|
|
15
19
|
import { TCloud } from '@tangle-network/tcloud';
|
|
16
20
|
import { z } from 'zod';
|
|
17
21
|
export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
|
|
@@ -20,6 +24,8 @@ export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError,
|
|
|
20
24
|
import { a as FeedbackLabel, F as FeedbackTrajectoryStore, b as FeedbackTrajectory } from './feedback-trajectory-BSxqEpu7.js';
|
|
21
25
|
export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind, f as FeedbackLabelSource, g as FeedbackOptimizerRow, h as FeedbackOutcome, i as FeedbackReplayAdapter, j as FeedbackReplayResult, k as FeedbackSeverity, l as FeedbackSplitPolicy, m as FeedbackTask, n as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-BSxqEpu7.js';
|
|
22
26
|
export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
|
|
27
|
+
import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-DSu0DWy8.js';
|
|
28
|
+
export { A as ActionableSideInfo, s as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, C as CorpusAgreementOptions, t as CorpusAgreementPerDimension, u as CorpusAgreementReport, v as CorpusScoreRecord, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, x as bonferroni, n as bootstrapCi, y as cohensD, z as confidenceInterval, D as corpusInterRaterAgreement, E as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, F as interRaterReliability, p as judgeReplayGate, G as mannWhitneyU, H as normalizeScores, q as pairedBootstrap, I as pairedMde, K as pairedTTest, L as partialCredit, r as renderReleaseReport, M as requiredSampleSize, N as weightedMean, w as wilcoxonSignedRank } from './release-report-DSu0DWy8.js';
|
|
23
29
|
import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-B2kWEdh9.js';
|
|
24
30
|
export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-B2kWEdh9.js';
|
|
25
31
|
import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
|
|
@@ -40,8 +46,9 @@ export { D as DEFAULT_RED_TEAM_CORPUS, R as RedTeamCase, a as RedTeamCategory, b
|
|
|
40
46
|
import { a as PrmGrader } from './rubric-D5tjHNJQ.js';
|
|
41
47
|
export { EuRiskClass, GovernanceContext, GovernanceFinding, GovernanceReport, UseCaseSignals, classifyEuAiRisk, euAiActReport, nistAiRmfReport, renderMarkdown, soc2Report, summarize } from './governance/index.js';
|
|
42
48
|
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-0pu_fBwZ.js';
|
|
43
|
-
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions,
|
|
44
|
-
|
|
49
|
+
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, m as GateDecision, n as GateEvidence, H as HeldOutGate, o as HeldOutGateConfig, q as HeldOutGateRejectionCode, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-B7gNRX-r.js';
|
|
50
|
+
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
51
|
+
import './outcome-store-D6KWmYvj.js';
|
|
45
52
|
|
|
46
53
|
interface RunScore {
|
|
47
54
|
success: number;
|
|
@@ -229,332 +236,6 @@ declare function runSemanticConceptJudge(input: SemanticConceptJudgeInput, optio
|
|
|
229
236
|
*/
|
|
230
237
|
declare function createSemanticConceptJudge(options?: SemanticConceptJudgeOptions): (input: SemanticConceptJudgeInput) => Promise<SemanticConceptJudgeResult>;
|
|
231
238
|
|
|
232
|
-
/**
|
|
233
|
-
* ChatClient — the single LLM abstraction analysts call.
|
|
234
|
-
*
|
|
235
|
-
* agent-eval already ships an `LlmClient` (OpenAI-compatible, retry,
|
|
236
|
-
* graceful JSON-schema degrade) and judges that talk to `TCloud`. Two
|
|
237
|
-
* mixed patterns force every analyst author to pick a transport, which
|
|
238
|
-
* couples analyst code to runtime concerns (cli-bridge vs router vs
|
|
239
|
-
* sandbox-sdk) it shouldn't know about.
|
|
240
|
-
*
|
|
241
|
-
* `ChatClient` is one interface every analyst takes via `AnalystContext.chat`.
|
|
242
|
-
* The operator decides at the registry boundary which transport binds
|
|
243
|
-
* to it. Analyst code stays transport-agnostic; swapping production
|
|
244
|
-
* (sandbox-sdk) for local dev (cli-bridge) or tests (mock) is a one-
|
|
245
|
-
* line factory call.
|
|
246
|
-
*
|
|
247
|
-
* Designed to coexist: existing `LlmClient` callers and existing
|
|
248
|
-
* `TCloud`-based judges keep working untouched. New analyst code uses
|
|
249
|
-
* `ChatClient`. When old call sites migrate, they pick up budgeting,
|
|
250
|
-
* cancellation, and unified telemetry for free.
|
|
251
|
-
*/
|
|
252
|
-
|
|
253
|
-
/**
|
|
254
|
-
* Unified chat interface. Mirrors LlmCallRequest/Result so the OpenAI-
|
|
255
|
-
* compatible mental model stays. Two methods: a one-shot `chat()` and
|
|
256
|
-
* a `streamChat()` for future agentic loops (not yet exposed).
|
|
257
|
-
*/
|
|
258
|
-
interface ChatClient {
|
|
259
|
-
/** Display name of the bound transport — included in telemetry. */
|
|
260
|
-
readonly transport: ChatTransport;
|
|
261
|
-
/** Default model when caller omits — operators bind this per environment. */
|
|
262
|
-
readonly defaultModel?: string;
|
|
263
|
-
chat(req: ChatRequest, opts?: ChatCallOpts): Promise<ChatResponse>;
|
|
264
|
-
}
|
|
265
|
-
type ChatTransport = 'router' | 'sandbox-sdk' | 'cli-bridge' | 'direct-provider' | 'mock';
|
|
266
|
-
interface ChatRequest extends Omit<LlmCallRequest, 'model'> {
|
|
267
|
-
/** Optional — falls back to ChatClient.defaultModel. */
|
|
268
|
-
model?: string;
|
|
269
|
-
}
|
|
270
|
-
type ChatResponse = LlmCallResult;
|
|
271
|
-
interface ChatCallOpts {
|
|
272
|
-
/** Cancel the in-flight request. */
|
|
273
|
-
signal?: AbortSignal;
|
|
274
|
-
/** Hard USD ceiling for this single call (informational; the underlying transport may not enforce). */
|
|
275
|
-
maxCostUsd?: number;
|
|
276
|
-
/** Correlation tag carried into request headers when the transport allows. */
|
|
277
|
-
correlationId?: string;
|
|
278
|
-
}
|
|
279
|
-
type CreateChatClientOpts = RouterTransportOpts | CliBridgeTransportOpts | DirectProviderTransportOpts | SandboxSdkTransportOpts | MockTransportOpts;
|
|
280
|
-
interface BaseTransportOpts {
|
|
281
|
-
defaultModel?: string;
|
|
282
|
-
}
|
|
283
|
-
interface RouterTransportOpts extends BaseTransportOpts {
|
|
284
|
-
transport: 'router';
|
|
285
|
-
baseUrl?: string;
|
|
286
|
-
apiKey: string;
|
|
287
|
-
}
|
|
288
|
-
interface CliBridgeTransportOpts extends BaseTransportOpts {
|
|
289
|
-
transport: 'cli-bridge';
|
|
290
|
-
baseUrl?: string;
|
|
291
|
-
bearer?: string;
|
|
292
|
-
}
|
|
293
|
-
interface DirectProviderTransportOpts extends BaseTransportOpts {
|
|
294
|
-
transport: 'direct-provider';
|
|
295
|
-
baseUrl: string;
|
|
296
|
-
apiKey: string;
|
|
297
|
-
}
|
|
298
|
-
/**
|
|
299
|
-
* Sandbox-SDK transport. Provided as a thin pass-through: the caller
|
|
300
|
-
* supplies a callable that mimics LlmClient.chat() against an already-
|
|
301
|
-
* configured Sandbox handle. We don't import the SDK here to keep
|
|
302
|
-
* agent-eval dep-free of @tangle-network/sandbox.
|
|
303
|
-
*/
|
|
304
|
-
interface SandboxSdkTransportOpts extends BaseTransportOpts {
|
|
305
|
-
transport: 'sandbox-sdk';
|
|
306
|
-
chat: (req: ChatRequest, opts?: ChatCallOpts) => Promise<ChatResponse>;
|
|
307
|
-
}
|
|
308
|
-
/**
|
|
309
|
-
* Mock transport for tests. The handler receives the request and returns
|
|
310
|
-
* whatever the test wants. No retries, no JSON-schema degrade.
|
|
311
|
-
*/
|
|
312
|
-
interface MockTransportOpts extends BaseTransportOpts {
|
|
313
|
-
transport: 'mock';
|
|
314
|
-
handler: (req: ChatRequest, opts?: ChatCallOpts) => Promise<ChatResponse>;
|
|
315
|
-
}
|
|
316
|
-
/**
|
|
317
|
-
* Build a ChatClient bound to a specific transport. The returned client
|
|
318
|
-
* is safe to share across analysts in a single registry run.
|
|
319
|
-
*/
|
|
320
|
-
declare function createChatClient(opts: CreateChatClientOpts): ChatClient;
|
|
321
|
-
|
|
322
|
-
/**
|
|
323
|
-
* Analyst contract — the missing orchestration layer over agent-eval's
|
|
324
|
-
* existing analyzers (analyzeTraces, MultiLayerVerifier, RunCritic,
|
|
325
|
-
* SemanticConceptJudge, JudgeFn, ...).
|
|
326
|
-
*
|
|
327
|
-
* Each existing primitive returns its own output shape. The Analyst
|
|
328
|
-
* contract is the single envelope every primitive lifts into, so a
|
|
329
|
-
* registry can run N analysts against a run and a single renderer can
|
|
330
|
-
* compose findings without knowing which analyzer produced them.
|
|
331
|
-
*
|
|
332
|
-
* The contract is intentionally domain-agnostic: nothing here knows
|
|
333
|
-
* about code, voice, RAG, or any particular agent stack. Analysts
|
|
334
|
-
* declare what INPUT KIND they need (a trace store, an artifact dir,
|
|
335
|
-
* a RunRecord, a JudgeInput, or `custom`), and the registry routes
|
|
336
|
-
* the matching input from `AnalystRunInputs`.
|
|
337
|
-
*/
|
|
338
|
-
|
|
339
|
-
/**
|
|
340
|
-
* Unified envelope every analyst emits. Schema-versioned so renderers
|
|
341
|
-
* and time-series diffs survive future field additions.
|
|
342
|
-
*/
|
|
343
|
-
interface AnalystFinding {
|
|
344
|
-
schema_version: '1.0.0';
|
|
345
|
-
/**
|
|
346
|
-
* Stable hash over identity-defining fields (analyst_id + canonical
|
|
347
|
-
* claim + area + optional subject). Two findings from two runs that
|
|
348
|
-
* "are the same finding" share this id — that's what `diffFindings`
|
|
349
|
-
* uses to compute appeared/disappeared sets across runs.
|
|
350
|
-
*/
|
|
351
|
-
finding_id: string;
|
|
352
|
-
analyst_id: string;
|
|
353
|
-
produced_at: string;
|
|
354
|
-
severity: AnalystSeverity;
|
|
355
|
-
/**
|
|
356
|
-
* Coarse classification. Renderers group by this. Free-form so
|
|
357
|
-
* domain-specific analysts can introduce categories without a
|
|
358
|
-
* schema change ('agent-reasoning', 'verification', 'cost',
|
|
359
|
-
* 'tool-use', 'safety', 'latency', 'data-quality', ...).
|
|
360
|
-
*/
|
|
361
|
-
area: string;
|
|
362
|
-
claim: string;
|
|
363
|
-
rationale?: string;
|
|
364
|
-
evidence_refs: EvidenceRef[];
|
|
365
|
-
recommended_action?: string;
|
|
366
|
-
validation_plan?: string;
|
|
367
|
-
/** 0..1 — the analyst's own confidence. Not calibrated across analysts. */
|
|
368
|
-
confidence: number;
|
|
369
|
-
/**
|
|
370
|
-
* Optional subject the finding is about — leaf id, agent id, request
|
|
371
|
-
* id. Included in finding_id when present so per-subject findings
|
|
372
|
-
* diff cleanly across runs.
|
|
373
|
-
*/
|
|
374
|
-
subject?: string;
|
|
375
|
-
/** Analyst-private extras; renderers ignore unless they know the analyst. */
|
|
376
|
-
metadata?: Record<string, unknown>;
|
|
377
|
-
}
|
|
378
|
-
type AnalystSeverity = 'critical' | 'high' | 'medium' | 'low' | 'info';
|
|
379
|
-
interface EvidenceRef {
|
|
380
|
-
/**
|
|
381
|
-
* Where the evidence lives. `span` and `event` refer to OTLP trace
|
|
382
|
-
* elements; `artifact` to a file inside the run's artifact tree;
|
|
383
|
-
* `finding` to another AnalystFinding (cross-analyst chaining);
|
|
384
|
-
* `metric` to a named scalar reading the renderer knows how to read.
|
|
385
|
-
*/
|
|
386
|
-
kind: 'span' | 'event' | 'artifact' | 'finding' | 'metric';
|
|
387
|
-
uri: string;
|
|
388
|
-
excerpt?: string;
|
|
389
|
-
}
|
|
390
|
-
/**
|
|
391
|
-
* The discriminator the registry uses to pass the right input.
|
|
392
|
-
* `custom` is the escape hatch — analysts that need something else
|
|
393
|
-
* (e.g. an embedding cache, a partner SDK handle) read it from
|
|
394
|
-
* `AnalystRunInputs.custom[<analyst id>]`.
|
|
395
|
-
*/
|
|
396
|
-
type AnalystInputKind = 'trace-store' | 'artifact-dir' | 'run-record' | 'judge-input' | 'custom';
|
|
397
|
-
interface AnalystCost {
|
|
398
|
-
/** `deterministic` analysts MUST NOT call the LLM. */
|
|
399
|
-
kind: 'deterministic' | 'llm';
|
|
400
|
-
/** Optional declared upper bound; the registry can enforce a budget. */
|
|
401
|
-
est_usd_per_run?: number;
|
|
402
|
-
/** Models the analyst expects to use (informational). */
|
|
403
|
-
models?: string[];
|
|
404
|
-
}
|
|
405
|
-
interface AnalystRequirements {
|
|
406
|
-
/** Min number of shots / samples the analyst needs to produce signal. */
|
|
407
|
-
min_shots?: number;
|
|
408
|
-
/** Capabilities the runtime must supply (e.g. ['network', 'gpu']). */
|
|
409
|
-
capabilities?: string[];
|
|
410
|
-
}
|
|
411
|
-
/**
|
|
412
|
-
* What's passed to every analyst call. The registry resolves which
|
|
413
|
-
* field the analyst's `inputKind` selects and asserts it's present.
|
|
414
|
-
*/
|
|
415
|
-
interface AnalystRunInputs {
|
|
416
|
-
traceStore?: TraceAnalysisStore;
|
|
417
|
-
artifactDir?: string;
|
|
418
|
-
runRecord?: RunRecord;
|
|
419
|
-
judgeInput?: JudgeInput;
|
|
420
|
-
/** Keyed by analyst id; populated by callers that registered custom analysts. */
|
|
421
|
-
custom?: Record<string, unknown>;
|
|
422
|
-
}
|
|
423
|
-
interface AnalystContext {
|
|
424
|
-
runId: string;
|
|
425
|
-
/** Stable correlation id so logs from a single registry.run() share a tag. */
|
|
426
|
-
correlationId: string;
|
|
427
|
-
/** Wall-clock deadline (epoch ms). Analysts SHOULD honor for graceful cancel. */
|
|
428
|
-
deadlineMs?: number;
|
|
429
|
-
/** Per-analyst USD budget. Analysts MAY check before issuing LLM calls. */
|
|
430
|
-
budgetUsd?: number;
|
|
431
|
-
/**
|
|
432
|
-
* Shared chat client. Analysts that call an LLM go through this so
|
|
433
|
-
* the operator picks transport (sandbox-sdk | router | cli-bridge |
|
|
434
|
-
* direct-provider | mock) at the registry boundary without touching
|
|
435
|
-
* analyst code.
|
|
436
|
-
*/
|
|
437
|
-
chat?: ChatClient;
|
|
438
|
-
/**
|
|
439
|
-
* Findings from a prior run the operator wants the analyst to see as
|
|
440
|
-
* retrieval context. Kinds that take advantage of cross-run memory
|
|
441
|
-
* (failure-mode "I saw this cluster last run", knowledge-gap "the wiki
|
|
442
|
-
* page I asked for is still missing") render these into the actor's
|
|
443
|
-
* working set. Filtering is the operator's job: pass the slice that
|
|
444
|
-
* matches the analyst's id, or pass everything and let the kind
|
|
445
|
-
* filter. Empty / absent means no cross-run context.
|
|
446
|
-
*/
|
|
447
|
-
priorFindings?: ReadonlyArray<AnalystFinding>;
|
|
448
|
-
/** Free-form runtime tags (env, host, op). Findings can echo these into metadata. */
|
|
449
|
-
tags?: Record<string, string>;
|
|
450
|
-
/** Logger callback — analysts SHOULD prefer this over console.* for testability. */
|
|
451
|
-
log?: (msg: string, fields?: Record<string, unknown>) => void;
|
|
452
|
-
/** Optional abort signal. Analysts SHOULD pass it through to LLM calls. */
|
|
453
|
-
signal?: AbortSignal;
|
|
454
|
-
}
|
|
455
|
-
/**
|
|
456
|
-
* The minimal contract. Concrete analysts can refine `TInput` so
|
|
457
|
-
* implementations stay type-safe (e.g. a trace analyst's `TInput` is
|
|
458
|
-
* `TraceAnalysisStore`); the registry passes the right field from
|
|
459
|
-
* `AnalystRunInputs` based on `inputKind`.
|
|
460
|
-
*/
|
|
461
|
-
interface Analyst<TInput = unknown> {
|
|
462
|
-
/** Stable identifier — appears in finding_id, telemetry, and registry exclusion lists. */
|
|
463
|
-
readonly id: string;
|
|
464
|
-
/** Human-readable. One sentence. */
|
|
465
|
-
readonly description: string;
|
|
466
|
-
readonly inputKind: AnalystInputKind;
|
|
467
|
-
readonly cost: AnalystCost;
|
|
468
|
-
readonly requires?: AnalystRequirements;
|
|
469
|
-
/** Bump on breaking changes to claim wording or area so old finding_ids don't collide. */
|
|
470
|
-
readonly version: string;
|
|
471
|
-
analyze(input: TInput, ctx: AnalystContext): Promise<AnalystFinding[]>;
|
|
472
|
-
}
|
|
473
|
-
/**
|
|
474
|
-
* Compute the stable finding_id from the identity-defining fields.
|
|
475
|
-
* Default implementation hashes {analyst_id, area, subject, normalized claim}.
|
|
476
|
-
* Analysts that emit findings whose claim text varies per run (timestamps,
|
|
477
|
-
* counts) SHOULD either: (a) pass an explicit `id_basis` to fix the hash,
|
|
478
|
-
* or (b) move the variable part into `rationale`/`metadata` and keep the
|
|
479
|
-
* `claim` static.
|
|
480
|
-
*/
|
|
481
|
-
declare function computeFindingId(input: {
|
|
482
|
-
analyst_id: string;
|
|
483
|
-
area: string;
|
|
484
|
-
subject?: string;
|
|
485
|
-
claim: string;
|
|
486
|
-
/** Override the claim for hashing — use when the displayed claim has run-specific bits. */
|
|
487
|
-
id_basis?: string;
|
|
488
|
-
}): string;
|
|
489
|
-
/**
|
|
490
|
-
* Convenience factory: produce a fully-formed AnalystFinding with the
|
|
491
|
-
* id computed automatically. Analyst code stays terse.
|
|
492
|
-
*/
|
|
493
|
-
declare function makeFinding(init: Omit<AnalystFinding, 'schema_version' | 'finding_id' | 'produced_at'> & {
|
|
494
|
-
id_basis?: string;
|
|
495
|
-
produced_at?: string;
|
|
496
|
-
}): AnalystFinding;
|
|
497
|
-
interface AnalystRunSummary {
|
|
498
|
-
analyst_id: string;
|
|
499
|
-
status: 'ok' | 'skipped' | 'failed';
|
|
500
|
-
/** Why skipped — missing input, budget exceeded, capability unmet. */
|
|
501
|
-
reason?: string;
|
|
502
|
-
findings_count: number;
|
|
503
|
-
latency_ms: number;
|
|
504
|
-
cost_usd: number;
|
|
505
|
-
/** When `status='failed'`: the error class + message, never the full stack. */
|
|
506
|
-
error?: {
|
|
507
|
-
class: string;
|
|
508
|
-
message: string;
|
|
509
|
-
};
|
|
510
|
-
}
|
|
511
|
-
interface AnalystRunResult {
|
|
512
|
-
run_id: string;
|
|
513
|
-
correlation_id: string;
|
|
514
|
-
started_at: string;
|
|
515
|
-
ended_at: string;
|
|
516
|
-
findings: AnalystFinding[];
|
|
517
|
-
per_analyst: AnalystRunSummary[];
|
|
518
|
-
/** Total LLM cost in USD across all analysts in this registry.run(). */
|
|
519
|
-
total_cost_usd: number;
|
|
520
|
-
}
|
|
521
|
-
/**
|
|
522
|
-
* Events emitted by `AnalystRegistry.runStream(...)` in real time as
|
|
523
|
-
* the registry executes. UIs subscribe via `for await (const ev of
|
|
524
|
-
* registry.runStream(...))`; `registry.run(...)` is a thin collector
|
|
525
|
-
* over the same stream, so the two surfaces share their invariants.
|
|
526
|
-
*
|
|
527
|
-
* Per-finding events are intentionally omitted — analyzers are batch
|
|
528
|
-
* operations (an Ax actor returns the full `findings:json[]` at the
|
|
529
|
-
* end of the responder), so streaming inside one analyst would only
|
|
530
|
-
* emit partial JSON consumers can't render. The kind-completion event
|
|
531
|
-
* is the right granularity; subscribers wanting per-finding rendering
|
|
532
|
-
* iterate `event.findings` themselves.
|
|
533
|
-
*/
|
|
534
|
-
type AnalystRunEvent = {
|
|
535
|
-
type: 'run-started';
|
|
536
|
-
run_id: string;
|
|
537
|
-
correlation_id: string;
|
|
538
|
-
started_at: string;
|
|
539
|
-
/** The ordered list of analyst ids the registry will run. */
|
|
540
|
-
analyst_ids: ReadonlyArray<string>;
|
|
541
|
-
} | {
|
|
542
|
-
type: 'analyst-skipped';
|
|
543
|
-
summary: AnalystRunSummary;
|
|
544
|
-
} | {
|
|
545
|
-
type: 'analyst-started';
|
|
546
|
-
analyst_id: string;
|
|
547
|
-
started_at: string;
|
|
548
|
-
} | {
|
|
549
|
-
type: 'analyst-completed';
|
|
550
|
-
/** `summary.status` is `'ok'` for clean completion or `'failed'` for thrown analysts. */
|
|
551
|
-
summary: AnalystRunSummary;
|
|
552
|
-
findings: ReadonlyArray<AnalystFinding>;
|
|
553
|
-
} | {
|
|
554
|
-
type: 'run-completed';
|
|
555
|
-
result: AnalystRunResult;
|
|
556
|
-
};
|
|
557
|
-
|
|
558
239
|
/**
|
|
559
240
|
* Adapter factories — lift each existing agent-eval primitive into the
|
|
560
241
|
* Analyst contract without re-implementing it.
|
|
@@ -1117,131 +798,6 @@ declare const KNOWLEDGE_POISONING_KIND_SPEC: TraceAnalystKindSpec;
|
|
|
1117
798
|
*/
|
|
1118
799
|
declare const DEFAULT_TRACE_ANALYST_KINDS: readonly TraceAnalystKindSpec[];
|
|
1119
800
|
|
|
1120
|
-
/**
|
|
1121
|
-
* AnalystRegistry — orchestrate N analysts against one run.
|
|
1122
|
-
*
|
|
1123
|
-
* Owns three responsibilities and only three:
|
|
1124
|
-
* 1. Registration — ids must be unique; bad registrations fail loudly
|
|
1125
|
-
* at register-time, not run-time.
|
|
1126
|
-
* 2. Routing — each analyst declares its `inputKind`; the registry
|
|
1127
|
-
* picks the matching field from AnalystRunInputs and skips the
|
|
1128
|
-
* analyst with a logged reason if it's missing.
|
|
1129
|
-
* 3. Isolation — one analyst's exception MUST NOT stop other analysts.
|
|
1130
|
-
* Failed analysts produce zero findings + a 'failed' summary row.
|
|
1131
|
-
*
|
|
1132
|
-
* Cross-cutting concerns (telemetry, error → finding conversion, cost
|
|
1133
|
-
* ingestion, storage rotation) live in `AnalystHooks`. Budget shaping
|
|
1134
|
-
* (equal split vs weighted vs custom) lives in `BudgetPolicy`. Both
|
|
1135
|
-
* have sensible defaults; consumers override only what they need.
|
|
1136
|
-
*/
|
|
1137
|
-
|
|
1138
|
-
interface AnalystHooks {
|
|
1139
|
-
/** Before analyze() — last chance to mutate ctx (e.g. inject tags, override budget). */
|
|
1140
|
-
onBeforeAnalyze?(args: {
|
|
1141
|
-
analyst: Analyst;
|
|
1142
|
-
ctx: AnalystContext;
|
|
1143
|
-
runId: string;
|
|
1144
|
-
}): void | Promise<void>;
|
|
1145
|
-
/** After every analyst (ok | failed | skipped). Use for telemetry, ingestion, rotation. */
|
|
1146
|
-
onAfterAnalyze?(args: {
|
|
1147
|
-
analyst: Analyst;
|
|
1148
|
-
summary: AnalystRunSummary;
|
|
1149
|
-
findings: AnalystFinding[];
|
|
1150
|
-
runId: string;
|
|
1151
|
-
}): void | Promise<void>;
|
|
1152
|
-
/**
|
|
1153
|
-
* On analyst exception. Hook MAY return findings to convert the
|
|
1154
|
-
* error into structured findings; the summary still reports 'failed'.
|
|
1155
|
-
* Return void to keep the default empty-findings behavior.
|
|
1156
|
-
*/
|
|
1157
|
-
onError?(args: {
|
|
1158
|
-
analyst: Analyst;
|
|
1159
|
-
error: Error;
|
|
1160
|
-
runId: string;
|
|
1161
|
-
}): AnalystFinding[] | undefined | Promise<AnalystFinding[] | undefined>;
|
|
1162
|
-
/** Once after registry.run() completes. Use for final aggregation, persistence. */
|
|
1163
|
-
onComplete?(args: {
|
|
1164
|
-
result: AnalystRunResult;
|
|
1165
|
-
}): void | Promise<void>;
|
|
1166
|
-
}
|
|
1167
|
-
interface BudgetPolicy {
|
|
1168
|
-
/** Overall USD cap across the registry.run(). */
|
|
1169
|
-
totalUsd?: number;
|
|
1170
|
-
/** Per-analyst weight for the default allocator. Missing ids get weight 1. */
|
|
1171
|
-
weights?: Record<string, number>;
|
|
1172
|
-
/**
|
|
1173
|
-
* Custom allocator — receives the analyst, remaining/total budget, and
|
|
1174
|
-
* the count of analysts that will run. Returns the per-analyst budget
|
|
1175
|
-
* (or undefined to leave it uncapped). Overrides weights when set.
|
|
1176
|
-
*/
|
|
1177
|
-
allocate?: (args: {
|
|
1178
|
-
analyst: Analyst;
|
|
1179
|
-
totalUsd: number | undefined;
|
|
1180
|
-
remainingUsd: number | undefined;
|
|
1181
|
-
runningCount: number;
|
|
1182
|
-
}) => number | undefined;
|
|
1183
|
-
}
|
|
1184
|
-
interface AnalystRegistryOptions {
|
|
1185
|
-
/** Shared chat client passed to every LLM analyst via AnalystContext. */
|
|
1186
|
-
chat?: ChatClient;
|
|
1187
|
-
/** Logger callback. Defaults to a no-op. */
|
|
1188
|
-
log?: (msg: string, fields?: Record<string, unknown>) => void;
|
|
1189
|
-
/** Hooks invoked around analyze() — observability + customization seam. */
|
|
1190
|
-
hooks?: AnalystHooks;
|
|
1191
|
-
/** Default budget when run() doesn't override. */
|
|
1192
|
-
defaultBudget?: BudgetPolicy;
|
|
1193
|
-
}
|
|
1194
|
-
interface RegistryRunOpts {
|
|
1195
|
-
/** Restrict to a subset of registered analysts by id. */
|
|
1196
|
-
only?: string[];
|
|
1197
|
-
/** Skip these analysts even if registered. Useful for cheap iteration. */
|
|
1198
|
-
skip?: string[];
|
|
1199
|
-
/** Budget policy — totalUsd + optional weights/allocator. Falls back to options.defaultBudget. */
|
|
1200
|
-
budget?: BudgetPolicy;
|
|
1201
|
-
/** Wall-clock cap. Analysts SHOULD honor `ctx.deadlineMs`. */
|
|
1202
|
-
timeoutMs?: number;
|
|
1203
|
-
/** Abort signal — forwarded into every analyst's context. */
|
|
1204
|
-
signal?: AbortSignal;
|
|
1205
|
-
/** Tags echoed into AnalystContext.tags — useful for tracking environment/version in findings. */
|
|
1206
|
-
tags?: Record<string, string>;
|
|
1207
|
-
/**
|
|
1208
|
-
* Prior-run findings made available as retrieval context to every
|
|
1209
|
-
* analyst via `ctx.priorFindings`. The registry forwards the slice
|
|
1210
|
-
* whose `analyst_id` matches each registered analyst so a kind sees
|
|
1211
|
-
* only its own history. Pass `{ '*': findings }` to broadcast to
|
|
1212
|
-
* every analyst (useful for cross-kind chaining where the improvement
|
|
1213
|
-
* analyst consumes upstream failure findings).
|
|
1214
|
-
*/
|
|
1215
|
-
priorFindings?: ReadonlyArray<AnalystFinding> | Record<string, ReadonlyArray<AnalystFinding>>;
|
|
1216
|
-
}
|
|
1217
|
-
declare class AnalystRegistry {
|
|
1218
|
-
private readonly analysts;
|
|
1219
|
-
private readonly options;
|
|
1220
|
-
constructor(options?: AnalystRegistryOptions);
|
|
1221
|
-
register(analyst: Analyst): void;
|
|
1222
|
-
list(): ReadonlyArray<{
|
|
1223
|
-
id: string;
|
|
1224
|
-
description: string;
|
|
1225
|
-
version: string;
|
|
1226
|
-
cost: Analyst['cost'];
|
|
1227
|
-
}>;
|
|
1228
|
-
run(runId: string, inputs: AnalystRunInputs, runOpts?: RegistryRunOpts): Promise<AnalystRunResult>;
|
|
1229
|
-
/**
|
|
1230
|
-
* Streaming counterpart to `run()`. Emits `AnalystRunEvent` values
|
|
1231
|
-
* in real time — `run-started`, then per-analyst `skipped` /
|
|
1232
|
-
* `started` / `completed`, then a terminal `run-completed` whose
|
|
1233
|
-
* payload is the full `AnalystRunResult`. UIs use this to render
|
|
1234
|
-
* progress; persistence consumers use `run()` and read the result.
|
|
1235
|
-
*
|
|
1236
|
-
* Hooks (`onBeforeAnalyze` / `onAfterAnalyze` / `onError` /
|
|
1237
|
-
* `onComplete`) fire as before — streaming is additive, not a hook
|
|
1238
|
-
* replacement.
|
|
1239
|
-
*/
|
|
1240
|
-
runStream(runId: string, inputs: AnalystRunInputs, runOpts?: RegistryRunOpts): AsyncGenerator<AnalystRunEvent, void, void>;
|
|
1241
|
-
private selectAnalysts;
|
|
1242
|
-
private routeInput;
|
|
1243
|
-
}
|
|
1244
|
-
|
|
1245
801
|
/**
|
|
1246
802
|
* Pre-curated tool subsets for analyst kinds.
|
|
1247
803
|
*
|
|
@@ -5928,4 +5484,4 @@ declare function traceJudge(judge: JudgeFn, judgeName: string, opts: TracedJudge
|
|
|
5928
5484
|
*/
|
|
5929
5485
|
declare function traceJudgeEnsemble(judges: JudgeFn[], judgeNames: string[], opts: TracedJudgeOptions): JudgeFn;
|
|
5930
5486
|
|
|
5931
|
-
export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AgentProfile, type AlignmentOp, type Analyst, type AnalystContext, type AnalystCost, type AnalystFinding, type AnalystHooks, type AnalystInputKind, AnalystRegistry, type AnalystRegistryOptions, type AnalystRequirements, type AnalystRunEvent, type AnalystRunInputs, type AnalystRunResult, type AnalystRunSummary, type AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, type BudgetPolicy, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, type ChatCallOpts, type ChatClient, type ChatRequest, type ChatResponse, type ChatTransport, CheckResult, type CliBridgeTransportOpts, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateChatClientOpts, type 
CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type DirectProviderTransportOpts, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvidenceRef, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type 
HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallRequest, LlmCallResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MockTransportOpts, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type 
PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegistryRunOpts, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouterTransportOpts, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, 
SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxSdkTransportOpts, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, agentProfileHash, aggregatePrReviewScore, aggregateRunScore, 
analyzeAntiSlop, analyzeSeries, appendScorecard, assertRealBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, computeFindingId, containsAll, createAntiSlopJudge, createChatClient, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, makeFinding, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, 
paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
|
|
5487
|
+
/**
 * Public export list of this declaration module, written as a single
 * `export { ... }` statement.
 *
 * - Entries prefixed with `type` use TypeScript's inline type-only export
 *   modifier: they export a type and no runtime value.
 * - Entries without the modifier are exported as-is; whether a given
 *   PascalCase name is a class, an interface, or a type alias cannot be
 *   determined from this statement alone.
 * - Five bindings are renamed on export:
 *     `Artifact$1 as Artifact`
 *     `type Artifact as ArtifactCheckArtifact`
 *     `type Run as ExperimentRun`
 *     `Run$1 as Run`
 *     `precision as goldenPrecision`
 *   NOTE(review): the `$1` suffixes look like bundler-generated names used to
 *   resolve a local naming collision (two different `Artifact` / `Run`
 *   declarations) - confirm against the build configuration.
 *
 * NOTE(review): this list appears to be generated build output; changes
 * presumably belong in the source entry point rather than here - confirm.
 */
export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AgentProfile, type AlignmentOp, Analyst, AnalystContext, AnalystCost, AnalystFinding, AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, CheckResult, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, 
DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, 
InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type 
RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type 
SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, agentProfileHash, aggregatePrReviewScore, aggregateRunScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertRealBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, containsAll, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, 
createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, 
rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
|