@tangle-network/agent-eval 0.37.0 → 0.40.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +695 -0
- package/dist/campaign/index.js +741 -0
- package/dist/campaign/index.js.map +1 -0
- package/dist/chunk-5U2DOJU4.js +565 -0
- package/dist/chunk-5U2DOJU4.js.map +1 -0
- package/dist/{chunk-KE7TDJUO.js → chunk-AU2JLNSZ.js} +2 -2
- package/dist/{chunk-TSPOEDM3.js → chunk-BWZEGTES.js} +2 -5
- package/dist/chunk-BWZEGTES.js.map +1 -0
- package/dist/{chunk-3HYQXPC2.js → chunk-DMW5VENN.js} +3 -3
- package/dist/{chunk-TQL7BAOY.js → chunk-EGIPWXHL.js} +2 -2
- package/dist/chunk-GGE4NNQT.js +65 -0
- package/dist/chunk-GGE4NNQT.js.map +1 -0
- package/dist/{chunk-7PR3WPWE.js → chunk-L7XMNXLO.js} +2 -2
- package/dist/{chunk-RL6TERL2.js → chunk-LCIDRYGP.js} +3 -3
- package/dist/{chunk-L5UNCDAJ.js → chunk-MAOZCN36.js} +2 -64
- package/dist/chunk-MAOZCN36.js.map +1 -0
- package/dist/{chunk-LGAPK7NA.js → chunk-NKLGKF2Q.js} +2 -2
- package/dist/chunk-QWV226SL.js +276 -0
- package/dist/chunk-QWV226SL.js.map +1 -0
- package/dist/chunk-TMXPFWC7.js +305 -0
- package/dist/chunk-TMXPFWC7.js.map +1 -0
- package/dist/{chunk-KHZRNY3F.js → chunk-WP7SY7AI.js} +5 -4
- package/dist/chunk-WP7SY7AI.js.map +1 -0
- package/dist/chunk-YV7J7X5N.js +313 -0
- package/dist/chunk-YV7J7X5N.js.map +1 -0
- package/dist/{control-DVrmvM_k.d.ts → control-CmLJk3IG.d.ts} +1 -1
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/{dataset-ueRVTUoY.d.ts → dataset-BlwAtYYf.d.ts} +1 -1
- package/dist/{feedback-trajectory-iATEAHmc.d.ts → feedback-trajectory-Dvy-bt7x.d.ts} +1 -1
- package/dist/governance/index.d.ts +133 -5
- package/dist/index.d.ts +35 -34
- package/dist/index.js +97 -630
- package/dist/index.js.map +1 -1
- package/dist/matrix/index.d.ts +2 -109
- package/dist/matrix/index.js +5 -270
- package/dist/matrix/index.js.map +1 -1
- package/dist/multishot/index.d.ts +276 -0
- package/dist/multishot/index.js +516 -0
- package/dist/multishot/index.js.map +1 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +2 -2
- package/dist/optimization.js +5 -5
- package/dist/pipelines/index.js +2 -2
- package/dist/red-team-30II1T4o.d.ts +63 -0
- package/dist/{release-report-D2ykiLSe.d.ts → release-report-Di84bXD7.d.ts} +5 -2
- package/dist/reporting.d.ts +2 -2
- package/dist/reporting.js +3 -3
- package/dist/rl.js +15 -315
- package/dist/rl.js.map +1 -1
- package/dist/run-campaign-JYJXYHHL.js +10 -0
- package/dist/run-campaign-JYJXYHHL.js.map +1 -0
- package/dist/traces.js +7 -5
- package/dist/types-DHqkLwEU.d.ts +110 -0
- package/dist/wire/index.d.ts +2 -2
- package/docs/design/loop-taxonomy.md +233 -0
- package/package.json +38 -24
- package/dist/chunk-KHZRNY3F.js.map +0 -1
- package/dist/chunk-L5UNCDAJ.js.map +0 -1
- package/dist/chunk-TSPOEDM3.js.map +0 -1
- package/dist/index-CN2agEaO.d.ts +0 -191
- /package/dist/{chunk-KE7TDJUO.js.map → chunk-AU2JLNSZ.js.map} +0 -0
- /package/dist/{chunk-3HYQXPC2.js.map → chunk-DMW5VENN.js.map} +0 -0
- /package/dist/{chunk-TQL7BAOY.js.map → chunk-EGIPWXHL.js.map} +0 -0
- /package/dist/{chunk-7PR3WPWE.js.map → chunk-L7XMNXLO.js.map} +0 -0
- /package/dist/{chunk-RL6TERL2.js.map → chunk-LCIDRYGP.js.map} +0 -0
- /package/dist/{chunk-LGAPK7NA.js.map → chunk-NKLGKF2Q.js.map} +0 -0
|
@@ -112,4 +112,4 @@ declare class Dataset {
|
|
|
112
112
|
}
|
|
113
113
|
declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
|
|
114
114
|
|
|
115
|
-
export { type DatasetSplit as D, HoldoutLockedError as H, type SliceOptions as S, type DatasetScenario as a,
|
|
115
|
+
export { type DatasetSplit as D, HoldoutLockedError as H, type SliceOptions as S, type DatasetScenario as a, Dataset as b, type DatasetManifest as c, type DatasetDifficulty as d, type DatasetProvenance as e, hashScenarios as h };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { C as ControlEvalResult, a as ControlRunResult, b as ControlStep } from './control-runtime-BZ_lVLYW.js';
|
|
2
|
-
import { D as DatasetSplit, a as DatasetScenario } from './dataset-ueRVTUoY.js';
|
|
2
|
+
import { D as DatasetSplit, a as DatasetScenario } from './dataset-BlwAtYYf.js';
|
|
3
3
|
|
|
4
4
|
type FeedbackArtifactType = 'text' | 'code' | 'plan' | 'research' | 'action' | 'ui' | 'decision' | 'data' | 'other';
|
|
5
5
|
type FeedbackLabelSource = 'user' | 'judge' | 'environment' | 'metric' | 'policy' | 'system';
|
|
@@ -1,6 +1,134 @@
|
|
|
1
|
-
|
|
2
|
-
import '../
|
|
1
|
+
import { c as DatasetManifest } from '../dataset-BlwAtYYf.js';
|
|
2
|
+
import { b as CalibrationResult } from '../judge-calibration-DilmB3Ml.js';
|
|
3
|
+
import { O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
|
|
4
|
+
import { d as RedTeamReport } from '../red-team-30II1T4o.js';
|
|
5
|
+
import { T as TraceStore } from '../store-Db2Bv8Cf.js';
|
|
3
6
|
import '../errors-mje_cKOs.js';
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* Governance reporting — shared types.
|
|
10
|
+
*
|
|
11
|
+
* The framework collects a `GovernanceContext` (traces + outcomes +
|
|
12
|
+
* dataset manifests + red-team results + judge calibration) and each
|
|
13
|
+
* specific template (NIST AI RMF, SOC2, EU AI Act) renders a
|
|
14
|
+
* structured report from it.
|
|
15
|
+
*
|
|
16
|
+
* Reports are machine-readable JSON first; human-readable Markdown is a
|
|
17
|
+
* pure transform on top. External auditors consume the Markdown; CI
|
|
18
|
+
* consumes the JSON.
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
interface GovernanceContext {
|
|
22
|
+
/** Legal / org identity for the report. */
|
|
23
|
+
organization: string;
|
|
24
|
+
/** System / agent identifier. */
|
|
25
|
+
systemName: string;
|
|
26
|
+
/** ISO8601 period the report covers. */
|
|
27
|
+
periodStart: string;
|
|
28
|
+
periodEnd: string;
|
|
29
|
+
/** Versioned dataset manifests used during the period. */
|
|
30
|
+
datasets: DatasetManifest[];
|
|
31
|
+
traceStore: TraceStore;
|
|
32
|
+
outcomeStore?: OutcomeStore;
|
|
33
|
+
/** Cached red-team results for the period, if available. */
|
|
34
|
+
redTeam?: RedTeamReport;
|
|
35
|
+
/** Judge-vs-human calibration results, if measured. */
|
|
36
|
+
judgeCalibration?: CalibrationResult[];
|
|
37
|
+
/** Responsible owner for the system — role + name + email. */
|
|
38
|
+
owner: {
|
|
39
|
+
role: string;
|
|
40
|
+
name: string;
|
|
41
|
+
email: string;
|
|
42
|
+
};
|
|
43
|
+
}
|
|
44
|
+
interface GovernanceFinding {
|
|
45
|
+
id: string;
|
|
46
|
+
severity: 'info' | 'low' | 'medium' | 'high' | 'critical';
|
|
47
|
+
/** Control reference the finding maps to (e.g. "NIST-AI-RMF:MEASURE-2.1"). */
|
|
48
|
+
control: string;
|
|
49
|
+
summary: string;
|
|
50
|
+
evidence?: string;
|
|
51
|
+
remediation?: string;
|
|
52
|
+
}
|
|
53
|
+
interface GovernanceReport {
|
|
54
|
+
framework: 'NIST-AI-RMF' | 'SOC2' | 'EU-AI-ACT';
|
|
55
|
+
version: string;
|
|
56
|
+
context: Pick<GovernanceContext, 'organization' | 'systemName' | 'periodStart' | 'periodEnd' | 'owner'>;
|
|
57
|
+
summary: {
|
|
58
|
+
findings: number;
|
|
59
|
+
bySeverity: Record<GovernanceFinding['severity'], number>;
|
|
60
|
+
overall: 'compliant' | 'compliant-with-findings' | 'non-compliant';
|
|
61
|
+
};
|
|
62
|
+
findings: GovernanceFinding[];
|
|
63
|
+
/** Framework-specific structured payload (mapped controls, risk class, etc.). */
|
|
64
|
+
payload: Record<string, unknown>;
|
|
65
|
+
generatedAt: string;
|
|
66
|
+
}
|
|
67
|
+
declare function renderMarkdown(report: GovernanceReport): string;
|
|
68
|
+
declare function summarize(findings: GovernanceFinding[]): GovernanceReport['summary'];
|
|
69
|
+
|
|
70
|
+
/**
|
|
71
|
+
* EU AI Act — risk-class classification + compliance checklist.
|
|
72
|
+
*
|
|
73
|
+
* Classification is declarative: caller supplies the domain/use-case
|
|
74
|
+
* signals (biometric? critical infrastructure? education? employment?
|
|
75
|
+
* access to services?) and we map to the Act's risk tiers:
|
|
76
|
+
* - "unacceptable" (prohibited)
|
|
77
|
+
* - "high" (Annex III — strict obligations)
|
|
78
|
+
* - "limited" (transparency obligations)
|
|
79
|
+
* - "minimal" (voluntary codes of conduct)
|
|
80
|
+
*
|
|
81
|
+
* Then the compliance checklist enumerates Article 9 (risk mgmt),
|
|
82
|
+
* 10 (data + data governance), 11 (technical documentation), 13
|
|
83
|
+
* (transparency), 14 (human oversight), 15 (accuracy + robustness)
|
|
84
|
+
* requirements and flags gaps.
|
|
85
|
+
*/
|
|
86
|
+
|
|
87
|
+
type EuRiskClass = 'unacceptable' | 'high' | 'limited' | 'minimal';
|
|
88
|
+
interface UseCaseSignals {
|
|
89
|
+
/** Used for biometric identification in public spaces? (Art. 5 — unacceptable). */
|
|
90
|
+
biometricPublic?: boolean;
|
|
91
|
+
/** Social scoring by public authorities? (Art. 5). */
|
|
92
|
+
socialScoring?: boolean;
|
|
93
|
+
/** Subliminal manipulation? (Art. 5). */
|
|
94
|
+
subliminal?: boolean;
|
|
95
|
+
/** Annex III sector: critical infrastructure / education / employment /
|
|
96
|
+
* access to essential services / law enforcement / migration /
|
|
97
|
+
* administration of justice / democratic processes? */
|
|
98
|
+
annexIII?: boolean;
|
|
99
|
+
/** Interacts directly with natural persons (chatbot, agent)? — limited risk. */
|
|
100
|
+
chatbot?: boolean;
|
|
101
|
+
/** Generates synthetic media (image/audio/video/text deepfakes)? — limited risk. */
|
|
102
|
+
generatesSyntheticMedia?: boolean;
|
|
103
|
+
}
|
|
104
|
+
declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
|
|
105
|
+
declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
|
|
106
|
+
|
|
107
|
+
/**
|
|
108
|
+
* NIST AI RMF 1.0 — Govern / Map / Measure / Manage mapping.
|
|
109
|
+
*
|
|
110
|
+
* Each subcategory derives its status from concrete framework state:
|
|
111
|
+
* MEASURE 2.x: do we have a calibration regime? contamination controls?
|
|
112
|
+
* MEASURE 2.7: are red-team results available?
|
|
113
|
+
* MANAGE 1.x: are outcome metrics captured? correlation measured?
|
|
114
|
+
* GOVERN 1.x: dataset + prompt provenance recorded?
|
|
115
|
+
*
|
|
116
|
+
* We ship the mapping and the derivation rules; consumers supply the
|
|
117
|
+
* GovernanceContext.
|
|
118
|
+
*/
|
|
119
|
+
|
|
120
|
+
declare function nistAiRmfReport(ctx: GovernanceContext): Promise<GovernanceReport>;
|
|
121
|
+
|
|
122
|
+
/**
|
|
123
|
+
* SOC 2 — Common Criteria 7 (system operations + change management)
|
|
124
|
+
* audit trail derived from the trace corpus.
|
|
125
|
+
*
|
|
126
|
+
* This is NOT a formal SOC2 report — that requires an external
|
|
127
|
+
* auditor. What we ship is the machine-readable *evidence* package
|
|
128
|
+
* that an auditor consumes: run counts, deploy events, access log
|
|
129
|
+
* summary, anomaly tracking, response-time SLOs.
|
|
130
|
+
*/
|
|
131
|
+
|
|
132
|
+
declare function soc2Report(ctx: GovernanceContext): Promise<GovernanceReport>;
|
|
133
|
+
|
|
134
|
+
export { type EuRiskClass, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type UseCaseSignals, classifyEuAiRisk, euAiActReport, nistAiRmfReport, renderMarkdown, soc2Report, summarize };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-
|
|
1
|
+
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-CmLJk3IG.js';
|
|
2
2
|
import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
|
|
3
3
|
export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, b as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-BGY6bHRh.js';
|
|
4
4
|
import { AxAIService, AxFunction } from '@ax-llm/ax';
|
|
@@ -10,16 +10,16 @@ import { L as LlmClientOptions, m as LlmCallRequest, n as LlmCallResult } from '
|
|
|
10
10
|
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, o as LlmCallError, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as backoffMs, x as callLlm, y as callLlmJson, z as isTransientLlmError, A as probeLlm, r as runEvalCampaign, B as stripFencedJson } from './researcher-DeZ_EArp.js';
|
|
11
11
|
import { TraceAnalysisStore, AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInput, AnalyzeTracesResult } from './traces.js';
|
|
12
12
|
export { AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, ExportableSpan, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystHookOptions, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
|
|
13
|
-
import { t as JudgeInput, u as JudgeFn, v as BenchmarkRunnerConfig, S as Scenario, x as BenchmarkReport, y as ProductClientConfig, C as CheckResult, T as TestResult, z as PersonaConfig, D as DriverResult, A as DriverState, E as CollectedArtifacts, F as ScenarioResult, i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard, G as TurnMetrics, H as ScenarioFile, I as CompletionCriterion } from './release-report-D2ykiLSe.js';
|
|
14
|
-
export { K as ArtifactCheck, L as ArtifactResult, B as BootstrapOptions, a as BootstrapResult, M as CorpusAgreementOptions, N as CorpusAgreementPerDimension, O as CorpusAgreementReport, Q as CorpusScoreRecord, U as EvalResult, W as FeedbackPattern, X as JudgeConfig, J as JudgeReplayGateArgs, Y as JudgeRubric, Z as JudgeScore, P as PairedBootstrapOptions, b as PairedBootstrapResult, _ as PersonaRigor, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, $ as RouteMap, a0 as RubricDimension, a1 as Turn, a2 as TurnResult, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, a3 as bonferroni, n as bootstrapCi, a4 as cohensD, a5 as confidenceInterval, a6 as corpusInterRaterAgreement, a7 as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, a8 as interRaterReliability, p as judgeReplayGate, a9 as mannWhitneyU, aa as normalizeScores, q as pairedBootstrap, ab as pairedMde, ac as pairedTTest, ad as partialCredit, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, ae as requiredSampleSize, af as weightedMean, w as wilcoxonSignedRank } from './release-report-
|
|
13
|
+
import { t as JudgeInput, u as JudgeFn, v as BenchmarkRunnerConfig, S as Scenario, x as BenchmarkReport, y as ProductClientConfig, C as CheckResult, T as TestResult, z as PersonaConfig, D as DriverResult, A as DriverState, E as CollectedArtifacts, F as ScenarioResult, i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard, G as TurnMetrics, H as ScenarioFile, I as CompletionCriterion } from './release-report-Di84bXD7.js';
|
|
14
|
+
export { K as ArtifactCheck, L as ArtifactResult, B as BootstrapOptions, a as BootstrapResult, M as CorpusAgreementOptions, N as CorpusAgreementPerDimension, O as CorpusAgreementReport, Q as CorpusScoreRecord, U as EvalResult, W as FeedbackPattern, X as JudgeConfig, J as JudgeReplayGateArgs, Y as JudgeRubric, Z as JudgeScore, P as PairedBootstrapOptions, b as PairedBootstrapResult, _ as PersonaRigor, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, $ as RouteMap, a0 as RubricDimension, a1 as Turn, a2 as TurnResult, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, a3 as bonferroni, n as bootstrapCi, a4 as cohensD, a5 as confidenceInterval, a6 as corpusInterRaterAgreement, a7 as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, a8 as interRaterReliability, p as judgeReplayGate, a9 as mannWhitneyU, aa as normalizeScores, q as pairedBootstrap, ab as pairedMde, ac as pairedTTest, ad as partialCredit, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, ae as requiredSampleSize, af as weightedMean, w as wilcoxonSignedRank } from './release-report-Di84bXD7.js';
|
|
15
15
|
import { TCloud } from '@tangle-network/tcloud';
|
|
16
16
|
import { z } from 'zod';
|
|
17
17
|
import { C as ControlEvalResult } from './control-runtime-BZ_lVLYW.js';
|
|
18
18
|
export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
|
|
19
19
|
import { A as AgentEvalError } from './errors-mje_cKOs.js';
|
|
20
20
|
export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-mje_cKOs.js';
|
|
21
|
-
import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-iATEAHmc.js';
|
|
22
|
-
export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-
|
|
21
|
+
import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-Dvy-bt7x.js';
|
|
22
|
+
export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-Dvy-bt7x.js';
|
|
23
23
|
import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-DuZXOk7K.js';
|
|
24
24
|
export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-DuZXOk7K.js';
|
|
25
25
|
export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
|
|
@@ -35,11 +35,12 @@ import { a as BaselineReport } from './baseline-4R5deP0N.js';
|
|
|
35
35
|
export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-4R5deP0N.js';
|
|
36
36
|
import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
|
|
37
37
|
export { b as buildTrajectory } from './trajectory-CnoBo-JY.js';
|
|
38
|
-
import { a as DatasetScenario,
|
|
39
|
-
export { d as DatasetDifficulty,
|
|
38
|
+
import { a as DatasetScenario, b as Dataset } from './dataset-BlwAtYYf.js';
|
|
39
|
+
export { d as DatasetDifficulty, c as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-BlwAtYYf.js';
|
|
40
40
|
export { b as CalibrationResult, c as CandidateScore, a as ContinuousAgreement, C as ContinuousAgreementOptions, d as ContinuousCalibrationResult, G as GoldenItem, P as PositionalBiasResult, S as SelfPreferenceResult, V as VerbosityBiasResult, e as calibrateJudge, f as calibrateJudgeContinuous, g as continuousAgreement, p as positionalBias, s as selfPreference, v as verbosityBias } from './judge-calibration-DilmB3Ml.js';
|
|
41
|
-
export { D as DEFAULT_RED_TEAM_CORPUS,
|
|
41
|
+
export { D as DEFAULT_RED_TEAM_CORPUS, R as RedTeamCase, a as RedTeamCategory, b as RedTeamFinding, c as RedTeamPayload, d as RedTeamReport, r as redTeamDataset, e as redTeamReport, s as scoreRedTeamOutput, t as toolNamesForRun } from './red-team-30II1T4o.js';
|
|
42
42
|
import { a as PrmGrader } from './rubric-D5tjHNJQ.js';
|
|
43
|
+
export { EuRiskClass, GovernanceContext, GovernanceFinding, GovernanceReport, UseCaseSignals, classifyEuAiRisk, euAiActReport, nistAiRmfReport, renderMarkdown, soc2Report, summarize } from './governance/index.js';
|
|
43
44
|
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-0pu_fBwZ.js';
|
|
44
45
|
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
45
46
|
import './outcome-store-D6KWmYvj.js';
|
|
@@ -1159,7 +1160,7 @@ interface AnalystHooks {
|
|
|
1159
1160
|
analyst: Analyst;
|
|
1160
1161
|
error: Error;
|
|
1161
1162
|
runId: string;
|
|
1162
|
-
}): AnalystFinding[] |
|
|
1163
|
+
}): AnalystFinding[] | undefined | Promise<AnalystFinding[] | undefined>;
|
|
1163
1164
|
/** Once after registry.run() completes. Use for final aggregation, persistence. */
|
|
1164
1165
|
onComplete?(args: {
|
|
1165
1166
|
result: AnalystRunResult;
|
|
@@ -6311,6 +6312,31 @@ declare function withOtelPipeline(opts?: OtelPipelineOptions): OtelPipelineHandl
|
|
|
6311
6312
|
*/
|
|
6312
6313
|
declare function isOtelConfigured(): boolean;
|
|
6313
6314
|
|
|
6315
|
+
/**
|
|
6316
|
+
* Traced analyst wrapper — instruments `analyzeTraces` with spans so the
|
|
6317
|
+
* analyst's internal LLM calls (actor + responder turns) appear in the
|
|
6318
|
+
* trace tree. Also wraps each actor turn callback with a span.
|
|
6319
|
+
*
|
|
6320
|
+
* Since the analyst uses @ax-llm/ax internally (an agent framework with
|
|
6321
|
+
* its own turn loop), we cannot wrap individual `tc.chat()` calls without
|
|
6322
|
+
* forking ax. Instead, we wrap at the boundary:
|
|
6323
|
+
* 1. A parent span for the entire analyst run.
|
|
6324
|
+
* 2. Per-turn child spans from the `onTurn` callback (captures code,
|
|
6325
|
+
* output size, error status).
|
|
6326
|
+
* 3. Summary attributes on the parent (total turns, usage, findings).
|
|
6327
|
+
*/
|
|
6328
|
+
|
|
6329
|
+
interface TracedAnalystOptions {
|
|
6330
|
+
/** TraceEmitter for span emission. */
|
|
6331
|
+
emitter: TraceEmitter;
|
|
6332
|
+
/** Parent span id. If omitted, uses emitter stack. */
|
|
6333
|
+
parentSpanId?: string;
|
|
6334
|
+
}
|
|
6335
|
+
/**
|
|
6336
|
+
* Run `analyzeTraces` wrapped in a parent span with per-turn child spans.
|
|
6337
|
+
*/
|
|
6338
|
+
declare function tracedAnalyzeTraces(input: AnalyzeTracesInput, options: AnalyzeTracesOptions, traceOpts: TracedAnalystOptions): Promise<AnalyzeTracesResult>;
|
|
6339
|
+
|
|
6314
6340
|
/**
|
|
6315
6341
|
* Traced judge wrappers — instruments every LLM call inside the judge
|
|
6316
6342
|
* ensemble with child spans so OTEL sinks see per-judge latency, model,
|
|
@@ -6337,31 +6363,6 @@ declare function traceJudge(judge: JudgeFn, judgeName: string, opts: TracedJudge
|
|
|
6337
6363
|
*/
|
|
6338
6364
|
declare function traceJudgeEnsemble(judges: JudgeFn[], judgeNames: string[], opts: TracedJudgeOptions): JudgeFn;
|
|
6339
6365
|
|
|
6340
|
-
/**
|
|
6341
|
-
* Traced analyst wrapper — instruments `analyzeTraces` with spans so the
|
|
6342
|
-
* analyst's internal LLM calls (actor + responder turns) appear in the
|
|
6343
|
-
* trace tree. Also wraps each actor turn callback with a span.
|
|
6344
|
-
*
|
|
6345
|
-
* Since the analyst uses @ax-llm/ax internally (an agent framework with
|
|
6346
|
-
* its own turn loop), we cannot wrap individual `tc.chat()` calls without
|
|
6347
|
-
* forking ax. Instead, we wrap at the boundary:
|
|
6348
|
-
* 1. A parent span for the entire analyst run.
|
|
6349
|
-
* 2. Per-turn child spans from the `onTurn` callback (captures code,
|
|
6350
|
-
* output size, error status).
|
|
6351
|
-
* 3. Summary attributes on the parent (total turns, usage, findings).
|
|
6352
|
-
*/
|
|
6353
|
-
|
|
6354
|
-
interface TracedAnalystOptions {
|
|
6355
|
-
/** TraceEmitter for span emission. */
|
|
6356
|
-
emitter: TraceEmitter;
|
|
6357
|
-
/** Parent span id. If omitted, uses emitter stack. */
|
|
6358
|
-
parentSpanId?: string;
|
|
6359
|
-
}
|
|
6360
|
-
/**
|
|
6361
|
-
* Run `analyzeTraces` wrapped in a parent span with per-turn child spans.
|
|
6362
|
-
*/
|
|
6363
|
-
declare function tracedAnalyzeTraces(input: AnalyzeTracesInput, options: AnalyzeTracesOptions, traceOpts: TracedAnalystOptions): Promise<AnalyzeTracesResult>;
|
|
6364
|
-
|
|
6365
6366
|
/**
|
|
6366
6367
|
* Traced mutator wrapper — instruments reflective-mutation LLM calls.
|
|
6367
6368
|
*
|