@tangle-network/agent-eval 0.38.0 → 0.40.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +695 -0
- package/dist/campaign/index.js +741 -0
- package/dist/campaign/index.js.map +1 -0
- package/dist/chunk-5U2DOJU4.js +565 -0
- package/dist/chunk-5U2DOJU4.js.map +1 -0
- package/dist/{chunk-KE7TDJUO.js → chunk-AU2JLNSZ.js} +2 -2
- package/dist/{chunk-TSPOEDM3.js → chunk-BWZEGTES.js} +2 -5
- package/dist/chunk-BWZEGTES.js.map +1 -0
- package/dist/{chunk-3HYQXPC2.js → chunk-DMW5VENN.js} +3 -3
- package/dist/{chunk-TQL7BAOY.js → chunk-EGIPWXHL.js} +2 -2
- package/dist/chunk-GGE4NNQT.js +65 -0
- package/dist/chunk-GGE4NNQT.js.map +1 -0
- package/dist/{chunk-7PR3WPWE.js → chunk-L7XMNXLO.js} +2 -2
- package/dist/{chunk-RL6TERL2.js → chunk-LCIDRYGP.js} +3 -3
- package/dist/{chunk-L5UNCDAJ.js → chunk-MAOZCN36.js} +2 -64
- package/dist/chunk-MAOZCN36.js.map +1 -0
- package/dist/{chunk-LGAPK7NA.js → chunk-NKLGKF2Q.js} +2 -2
- package/dist/chunk-TMXPFWC7.js +305 -0
- package/dist/chunk-TMXPFWC7.js.map +1 -0
- package/dist/{chunk-KHZRNY3F.js → chunk-WP7SY7AI.js} +5 -4
- package/dist/chunk-WP7SY7AI.js.map +1 -0
- package/dist/chunk-YV7J7X5N.js +313 -0
- package/dist/chunk-YV7J7X5N.js.map +1 -0
- package/dist/{control-DVrmvM_k.d.ts → control-CmLJk3IG.d.ts} +1 -1
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/{dataset-ueRVTUoY.d.ts → dataset-BlwAtYYf.d.ts} +1 -1
- package/dist/{feedback-trajectory-iATEAHmc.d.ts → feedback-trajectory-Dvy-bt7x.d.ts} +1 -1
- package/dist/governance/index.d.ts +133 -5
- package/dist/index.d.ts +35 -34
- package/dist/index.js +97 -630
- package/dist/index.js.map +1 -1
- package/dist/multishot/index.d.ts +21 -21
- package/dist/multishot/index.js +64 -15
- package/dist/multishot/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +2 -2
- package/dist/optimization.js +5 -5
- package/dist/pipelines/index.js +2 -2
- package/dist/red-team-30II1T4o.d.ts +63 -0
- package/dist/{release-report-D2ykiLSe.d.ts → release-report-Di84bXD7.d.ts} +5 -2
- package/dist/reporting.d.ts +2 -2
- package/dist/reporting.js +3 -3
- package/dist/rl.js +15 -315
- package/dist/rl.js.map +1 -1
- package/dist/run-campaign-JYJXYHHL.js +10 -0
- package/dist/run-campaign-JYJXYHHL.js.map +1 -0
- package/dist/traces.js +7 -5
- package/dist/wire/index.d.ts +2 -2
- package/docs/design/loop-taxonomy.md +233 -0
- package/package.json +33 -24
- package/dist/chunk-KHZRNY3F.js.map +0 -1
- package/dist/chunk-L5UNCDAJ.js.map +0 -1
- package/dist/chunk-TSPOEDM3.js.map +0 -1
- package/dist/index-CN2agEaO.d.ts +0 -191
- /package/dist/{chunk-KE7TDJUO.js.map → chunk-AU2JLNSZ.js.map} +0 -0
- /package/dist/{chunk-3HYQXPC2.js.map → chunk-DMW5VENN.js.map} +0 -0
- /package/dist/{chunk-TQL7BAOY.js.map → chunk-EGIPWXHL.js.map} +0 -0
- /package/dist/{chunk-7PR3WPWE.js.map → chunk-L7XMNXLO.js.map} +0 -0
- /package/dist/{chunk-RL6TERL2.js.map → chunk-LCIDRYGP.js.map} +0 -0
- /package/dist/{chunk-LGAPK7NA.js.map → chunk-NKLGKF2Q.js.map} +0 -0
package/dist/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-DVrmvM_k.js';
|
|
1
|
+
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-CmLJk3IG.js';
|
|
2
2
|
import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
|
|
3
3
|
export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, b as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-BGY6bHRh.js';
|
|
4
4
|
import { AxAIService, AxFunction } from '@ax-llm/ax';
|
|
@@ -10,16 +10,16 @@ import { L as LlmClientOptions, m as LlmCallRequest, n as LlmCallResult } from '
|
|
|
10
10
|
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, o as LlmCallError, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as backoffMs, x as callLlm, y as callLlmJson, z as isTransientLlmError, A as probeLlm, r as runEvalCampaign, B as stripFencedJson } from './researcher-DeZ_EArp.js';
|
|
11
11
|
import { TraceAnalysisStore, AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInput, AnalyzeTracesResult } from './traces.js';
|
|
12
12
|
export { AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, ExportableSpan, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystHookOptions, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
|
|
13
|
-
import { t as JudgeInput, u as JudgeFn, v as BenchmarkRunnerConfig, S as Scenario, x as BenchmarkReport, y as ProductClientConfig, C as CheckResult, T as TestResult, z as PersonaConfig, D as DriverResult, A as DriverState, E as CollectedArtifacts, F as ScenarioResult, i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard, G as TurnMetrics, H as ScenarioFile, I as CompletionCriterion } from './release-report-D2ykiLSe.js';
|
|
14
|
-
export { K as ArtifactCheck, L as ArtifactResult, B as BootstrapOptions, a as BootstrapResult, M as CorpusAgreementOptions, N as CorpusAgreementPerDimension, O as CorpusAgreementReport, Q as CorpusScoreRecord, U as EvalResult, W as FeedbackPattern, X as JudgeConfig, J as JudgeReplayGateArgs, Y as JudgeRubric, Z as JudgeScore, P as PairedBootstrapOptions, b as PairedBootstrapResult, _ as PersonaRigor, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, $ as RouteMap, a0 as RubricDimension, a1 as Turn, a2 as TurnResult, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, a3 as bonferroni, n as bootstrapCi, a4 as cohensD, a5 as confidenceInterval, a6 as corpusInterRaterAgreement, a7 as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, a8 as interRaterReliability, p as judgeReplayGate, a9 as mannWhitneyU, aa as normalizeScores, q as pairedBootstrap, ab as pairedMde, ac as pairedTTest, ad as partialCredit, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, ae as requiredSampleSize, af as weightedMean, w as wilcoxonSignedRank } from './release-report-D2ykiLSe.js';
|
|
13
|
+
import { t as JudgeInput, u as JudgeFn, v as BenchmarkRunnerConfig, S as Scenario, x as BenchmarkReport, y as ProductClientConfig, C as CheckResult, T as TestResult, z as PersonaConfig, D as DriverResult, A as DriverState, E as CollectedArtifacts, F as ScenarioResult, i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard, G as TurnMetrics, H as ScenarioFile, I as CompletionCriterion } from './release-report-Di84bXD7.js';
|
|
14
|
+
export { K as ArtifactCheck, L as ArtifactResult, B as BootstrapOptions, a as BootstrapResult, M as CorpusAgreementOptions, N as CorpusAgreementPerDimension, O as CorpusAgreementReport, Q as CorpusScoreRecord, U as EvalResult, W as FeedbackPattern, X as JudgeConfig, J as JudgeReplayGateArgs, Y as JudgeRubric, Z as JudgeScore, P as PairedBootstrapOptions, b as PairedBootstrapResult, _ as PersonaRigor, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, $ as RouteMap, a0 as RubricDimension, a1 as Turn, a2 as TurnResult, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, a3 as bonferroni, n as bootstrapCi, a4 as cohensD, a5 as confidenceInterval, a6 as corpusInterRaterAgreement, a7 as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, a8 as interRaterReliability, p as judgeReplayGate, a9 as mannWhitneyU, aa as normalizeScores, q as pairedBootstrap, ab as pairedMde, ac as pairedTTest, ad as partialCredit, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, ae as requiredSampleSize, af as weightedMean, w as wilcoxonSignedRank } from './release-report-Di84bXD7.js';
|
|
15
15
|
import { TCloud } from '@tangle-network/tcloud';
|
|
16
16
|
import { z } from 'zod';
|
|
17
17
|
import { C as ControlEvalResult } from './control-runtime-BZ_lVLYW.js';
|
|
18
18
|
export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
|
|
19
19
|
import { A as AgentEvalError } from './errors-mje_cKOs.js';
|
|
20
20
|
export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-mje_cKOs.js';
|
|
21
|
-
import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-iATEAHmc.js';
|
|
22
|
-
export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-iATEAHmc.js';
|
|
21
|
+
import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-Dvy-bt7x.js';
|
|
22
|
+
export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-Dvy-bt7x.js';
|
|
23
23
|
import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-DuZXOk7K.js';
|
|
24
24
|
export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-DuZXOk7K.js';
|
|
25
25
|
export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
|
|
@@ -35,11 +35,12 @@ import { a as BaselineReport } from './baseline-4R5deP0N.js';
|
|
|
35
35
|
export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-4R5deP0N.js';
|
|
36
36
|
import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
|
|
37
37
|
export { b as buildTrajectory } from './trajectory-CnoBo-JY.js';
|
|
38
|
-
import { a as DatasetScenario,
|
|
39
|
-
export { d as DatasetDifficulty,
|
|
38
|
+
import { a as DatasetScenario, b as Dataset } from './dataset-BlwAtYYf.js';
|
|
39
|
+
export { d as DatasetDifficulty, c as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-BlwAtYYf.js';
|
|
40
40
|
export { b as CalibrationResult, c as CandidateScore, a as ContinuousAgreement, C as ContinuousAgreementOptions, d as ContinuousCalibrationResult, G as GoldenItem, P as PositionalBiasResult, S as SelfPreferenceResult, V as VerbosityBiasResult, e as calibrateJudge, f as calibrateJudgeContinuous, g as continuousAgreement, p as positionalBias, s as selfPreference, v as verbosityBias } from './judge-calibration-DilmB3Ml.js';
|
|
41
|
-
export { D as DEFAULT_RED_TEAM_CORPUS,
|
|
41
|
+
export { D as DEFAULT_RED_TEAM_CORPUS, R as RedTeamCase, a as RedTeamCategory, b as RedTeamFinding, c as RedTeamPayload, d as RedTeamReport, r as redTeamDataset, e as redTeamReport, s as scoreRedTeamOutput, t as toolNamesForRun } from './red-team-30II1T4o.js';
|
|
42
42
|
import { a as PrmGrader } from './rubric-D5tjHNJQ.js';
|
|
43
|
+
export { EuRiskClass, GovernanceContext, GovernanceFinding, GovernanceReport, UseCaseSignals, classifyEuAiRisk, euAiActReport, nistAiRmfReport, renderMarkdown, soc2Report, summarize } from './governance/index.js';
|
|
43
44
|
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-0pu_fBwZ.js';
|
|
44
45
|
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
45
46
|
import './outcome-store-D6KWmYvj.js';
|
|
@@ -1159,7 +1160,7 @@ interface AnalystHooks {
|
|
|
1159
1160
|
analyst: Analyst;
|
|
1160
1161
|
error: Error;
|
|
1161
1162
|
runId: string;
|
|
1162
|
-
}): AnalystFinding[] |
|
|
1163
|
+
}): AnalystFinding[] | undefined | Promise<AnalystFinding[] | undefined>;
|
|
1163
1164
|
/** Once after registry.run() completes. Use for final aggregation, persistence. */
|
|
1164
1165
|
onComplete?(args: {
|
|
1165
1166
|
result: AnalystRunResult;
|
|
@@ -6311,6 +6312,31 @@ declare function withOtelPipeline(opts?: OtelPipelineOptions): OtelPipelineHandl
|
|
|
6311
6312
|
*/
|
|
6312
6313
|
declare function isOtelConfigured(): boolean;
|
|
6313
6314
|
|
|
6315
|
+
/**
|
|
6316
|
+
* Traced analyst wrapper — instruments `analyzeTraces` with spans so the
|
|
6317
|
+
* analyst's internal LLM calls (actor + responder turns) appear in the
|
|
6318
|
+
* trace tree. Also wraps each actor turn callback with a span.
|
|
6319
|
+
*
|
|
6320
|
+
* Since the analyst uses @ax-llm/ax internally (an agent framework with
|
|
6321
|
+
* its own turn loop), we cannot wrap individual `tc.chat()` calls without
|
|
6322
|
+
* forking ax. Instead, we wrap at the boundary:
|
|
6323
|
+
* 1. A parent span for the entire analyst run.
|
|
6324
|
+
* 2. Per-turn child spans from the `onTurn` callback (captures code,
|
|
6325
|
+
* output size, error status).
|
|
6326
|
+
* 3. Summary attributes on the parent (total turns, usage, findings).
|
|
6327
|
+
*/
|
|
6328
|
+
|
|
6329
|
+
interface TracedAnalystOptions {
|
|
6330
|
+
/** TraceEmitter for span emission. */
|
|
6331
|
+
emitter: TraceEmitter;
|
|
6332
|
+
/** Parent span id. If omitted, uses emitter stack. */
|
|
6333
|
+
parentSpanId?: string;
|
|
6334
|
+
}
|
|
6335
|
+
/**
|
|
6336
|
+
* Run `analyzeTraces` wrapped in a parent span with per-turn child spans.
|
|
6337
|
+
*/
|
|
6338
|
+
declare function tracedAnalyzeTraces(input: AnalyzeTracesInput, options: AnalyzeTracesOptions, traceOpts: TracedAnalystOptions): Promise<AnalyzeTracesResult>;
|
|
6339
|
+
|
|
6314
6340
|
/**
|
|
6315
6341
|
* Traced judge wrappers — instruments every LLM call inside the judge
|
|
6316
6342
|
* ensemble with child spans so OTEL sinks see per-judge latency, model,
|
|
@@ -6337,31 +6363,6 @@ declare function traceJudge(judge: JudgeFn, judgeName: string, opts: TracedJudge
|
|
|
6337
6363
|
*/
|
|
6338
6364
|
declare function traceJudgeEnsemble(judges: JudgeFn[], judgeNames: string[], opts: TracedJudgeOptions): JudgeFn;
|
|
6339
6365
|
|
|
6340
|
-
/**
|
|
6341
|
-
* Traced analyst wrapper — instruments `analyzeTraces` with spans so the
|
|
6342
|
-
* analyst's internal LLM calls (actor + responder turns) appear in the
|
|
6343
|
-
* trace tree. Also wraps each actor turn callback with a span.
|
|
6344
|
-
*
|
|
6345
|
-
* Since the analyst uses @ax-llm/ax internally (an agent framework with
|
|
6346
|
-
* its own turn loop), we cannot wrap individual `tc.chat()` calls without
|
|
6347
|
-
* forking ax. Instead, we wrap at the boundary:
|
|
6348
|
-
* 1. A parent span for the entire analyst run.
|
|
6349
|
-
* 2. Per-turn child spans from the `onTurn` callback (captures code,
|
|
6350
|
-
* output size, error status).
|
|
6351
|
-
* 3. Summary attributes on the parent (total turns, usage, findings).
|
|
6352
|
-
*/
|
|
6353
|
-
|
|
6354
|
-
interface TracedAnalystOptions {
|
|
6355
|
-
/** TraceEmitter for span emission. */
|
|
6356
|
-
emitter: TraceEmitter;
|
|
6357
|
-
/** Parent span id. If omitted, uses emitter stack. */
|
|
6358
|
-
parentSpanId?: string;
|
|
6359
|
-
}
|
|
6360
|
-
/**
|
|
6361
|
-
* Run `analyzeTraces` wrapped in a parent span with per-turn child spans.
|
|
6362
|
-
*/
|
|
6363
|
-
declare function tracedAnalyzeTraces(input: AnalyzeTracesInput, options: AnalyzeTracesOptions, traceOpts: TracedAnalystOptions): Promise<AnalyzeTracesResult>;
|
|
6364
|
-
|
|
6365
6366
|
/**
|
|
6366
6367
|
* Traced mutator wrapper — instruments reflective-mutation LLM calls.
|
|
6367
6368
|
*
|