@tangle-network/agent-eval 0.71.0 → 0.72.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +63 -0
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +3 -2
- package/dist/agent-profile-DYRboYWu.d.ts +364 -0
- package/dist/analyst/index.d.ts +221 -0
- package/dist/analyst/index.js +371 -0
- package/dist/analyst/index.js.map +1 -0
- package/dist/analyst-t7zZS3TV.d.ts +88 -0
- package/dist/campaign/index.d.ts +485 -9
- package/dist/campaign/index.js +618 -30
- package/dist/campaign/index.js.map +1 -1
- package/dist/chunk-7W4SM7FD.js +1075 -0
- package/dist/chunk-7W4SM7FD.js.map +1 -0
- package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
- package/dist/chunk-JHA3ZGSO.js +1496 -0
- package/dist/chunk-JHA3ZGSO.js.map +1 -0
- package/dist/{chunk-VMAYE3LM.js → chunk-JYE3WOTE.js} +57 -9
- package/dist/{chunk-VMAYE3LM.js.map → chunk-JYE3WOTE.js.map} +1 -1
- package/dist/chunk-LB2UOI5F.js +412 -0
- package/dist/chunk-LB2UOI5F.js.map +1 -0
- package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
- package/dist/chunk-VUINJM5M.js.map +1 -0
- package/dist/chunk-WYIHD6EB.js +1044 -0
- package/dist/chunk-WYIHD6EB.js.map +1 -0
- package/dist/{chunk-6QZUCFKM.js → chunk-XPILG2CA.js} +120 -3
- package/dist/chunk-XPILG2CA.js.map +1 -0
- package/dist/{chunk-6XQIEUQ2.js → chunk-ZPSKPT3V.js} +5 -3
- package/dist/{chunk-6XQIEUQ2.js.map → chunk-ZPSKPT3V.js.map} +1 -1
- package/dist/contract/index.d.ts +17 -13
- package/dist/contract/index.js +14 -8
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
- package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
- package/dist/hosted/index.d.ts +223 -2
- package/dist/index.d.ts +49 -1323
- package/dist/index.js +339 -2627
- package/dist/index.js.map +1 -1
- package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
- package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
- package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
- package/dist/openapi.json +1 -1
- package/dist/pareto-E-pembql.d.ts +81 -0
- package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
- package/dist/redact-B40YG2M_.d.ts +45 -0
- package/dist/registry-DuVYiTvw.d.ts +128 -0
- package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
- package/dist/rl.d.ts +4 -3
- package/dist/rl.js +4 -4
- package/dist/{run-campaign-BVY3RGAZ.js → run-campaign-OVEZF24D.js} +2 -2
- package/dist/run-critic-BAIjX99r.d.ts +56 -0
- package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
- package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
- package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
- package/dist/traces.d.ts +371 -308
- package/dist/traces.js +43 -18
- package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
- package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
- package/dist/wire/index.d.ts +1 -1
- package/dist/workflow/index.d.ts +494 -0
- package/dist/workflow/index.js +2177 -0
- package/dist/workflow/index.js.map +1 -0
- package/docs/design/self-improvement-roadmap.md +106 -0
- package/package.json +36 -12
- package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
- package/dist/chunk-6QZUCFKM.js.map +0 -1
- package/dist/chunk-ODGETRTM.js.map +0 -1
- package/dist/chunk-PQV2TKC3.js +0 -27
- package/dist/chunk-PQV2TKC3.js.map +0 -1
- /package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
- /package/dist/{run-campaign-BVY3RGAZ.js.map → run-campaign-OVEZF24D.js.map} +0 -0
package/dist/index.d.ts
CHANGED
|
@@ -1,36 +1,34 @@
|
|
|
1
|
-
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-
|
|
1
|
+
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-BgA6BYTm.js';
|
|
2
2
|
import { R as RunRecord } from './run-record-BgTFzO2r.js';
|
|
3
3
|
export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, b as RunSplitTag, a as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-BgTFzO2r.js';
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
export {
|
|
7
|
-
import {
|
|
8
|
-
export {
|
|
9
|
-
import { T as TraceStore, R as RunFilter } from './store-CKUAgsJz.js';
|
|
10
|
-
export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreOptions, I as InMemoryTraceStore, S as SpanFilter } from './store-CKUAgsJz.js';
|
|
11
|
-
import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
|
|
12
|
-
export { d as LlmCallError, b as LlmCallRequest, c as LlmCallResult, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-DbjLfz-K.js';
|
|
13
|
-
import { AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInput, AnalyzeTracesResult } from './traces.js';
|
|
14
|
-
export { AnalyzeTracesTurnSnapshot, CaptureFetchContext, CaptureFetchOptions, DEFAULT_REDACTION_RULES, ExportableSpan, FlattenOtlpOptions, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpFlatLine, OtlpResourceSpans, OtlpSpan, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAnalystHookOptions, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, captureFetchToRawSink, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, flattenOtlpExportToNdjson, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
|
|
15
|
-
import { T as TraceAnalysisStore } from './store-jzKpMl16.js';
|
|
16
|
-
export { D as DEFAULT_TRACE_ANALYST_BUDGETS, a as DatasetOverview, Q as QueryTracesPage, S as SearchSpanResult, b as SearchTraceResult, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, f as TraceAnalystFilters, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, V as ViewSpansResult, k as ViewTraceOversized, l as ViewTraceResult } from './store-jzKpMl16.js';
|
|
17
|
-
import { b as JudgeFn, a as JudgeInput, B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-Croy5h7V.js';
|
|
18
|
-
export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, m as JudgeRubric, J as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-Croy5h7V.js';
|
|
19
|
-
import { a as Analyst, b as AnalystSeverity, c as AnalystFinding, d as AnalystCost, e as AnalystContext, C as ChatRequest, f as CreateChatClientOpts } from './registry-BGKyX6bw.js';
|
|
20
|
-
export { g as AnalystHooks, h as AnalystInputKind, A as AnalystRegistry, i as AnalystRegistryOptions, j as AnalystRequirements, k as AnalystRunEvent, l as AnalystRunInputs, m as AnalystRunResult, n as AnalystRunSummary, B as BudgetPolicy, o as ChatCallOpts, p as ChatClient, q as ChatResponse, r as ChatTransport, s as CliBridgeTransportOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RegistryRunOpts, t as RouterTransportOpts, S as SandboxSdkTransportOpts, u as computeFindingId, v as createChatClient, w as makeFinding } from './registry-BGKyX6bw.js';
|
|
4
|
+
export { B as BehavioralMetrics, z as ConceptComplexity, A as ConceptFinding, E as ConceptSpec, G as ConceptWeightStrategy, C as CreateAnalystAiConfig, H as DEFAULT_COMPLEXITY_WEIGHTS, D as DEFAULT_TRACE_ANALYST_KINDS, b as DefaultAnalystRegistryOptions, c as DiffPolicy, F as FAILURE_MODE_KIND_SPEC, f as FindingSubject, g as FindingSubjectKind, i as FindingsDiff, j as FindingsStore, I as IMPROVEMENT_KIND_SPEC, k as KNOWLEDGE_GAP_KIND_SPEC, l as KNOWLEDGE_POISONING_KIND_SPEC, P as PersistedFinding, J as SEMANTIC_CONCEPT_JUDGE_VERSION, m as SKILL_USAGE_ANALYST, a as SemanticConceptJudgeInput, S as SemanticConceptJudgeOptions, L as SemanticConceptJudgeResult, n as SkillUsageAnalyst, M as SuboptimalCode, N as SuboptimalSignal, r as buildDefaultAnalystRegistry, O as computeTraceMetrics, t as createAnalystAi, Q as createSemanticConceptJudge, u as defaultIsMaterial, v as diffFindings, R as runSemanticConceptJudge } from './semantic-concept-judge-CV9Wlx4t.js';
|
|
5
|
+
export { C as CreateTraceAnalystKindOpts, a as RawAnalystFinding, T as TraceAnalystGolden, c as TraceAnalystKindSpec, d as createTraceAnalystKind, r as renderPriorFindings } from './kind-factory-DW9XWPvM.js';
|
|
6
|
+
export { A as AnalystHooks, a as AnalystRegistry, b as AnalystRegistryOptions, B as BudgetPolicy, R as RegistryRunOpts } from './registry-DuVYiTvw.js';
|
|
7
|
+
import { l as ChatRequest, p as CreateChatClientOpts } from './types-CRD68aH7.js';
|
|
8
|
+
export { A as Analyst, a as AnalystContext, g as AnalystCost, c as AnalystFinding, i as AnalystInputKind, j as AnalystRequirements, f as AnalystRunEvent, e as AnalystRunInputs, d as AnalystRunResult, b as AnalystRunSummary, h as AnalystSeverity, E as EvidenceRef, q as computeFindingId, s as makeFinding } from './types-CRD68aH7.js';
|
|
21
9
|
import { TCloud } from '@tangle-network/tcloud';
|
|
22
|
-
import {
|
|
10
|
+
import { B as BenchmarkRunnerConfig, S as Scenario, c as BenchmarkReport, P as ProductClientConfig, C as CheckResult, T as TestResult, d as PersonaConfig, D as DriverResult, e as DriverState, b as JudgeFn, f as CollectedArtifacts, g as ScenarioResult, h as TurnMetrics, i as ScenarioFile, j as CompletionCriterion } from './types-Croy5h7V.js';
|
|
11
|
+
export { A as ArtifactCheck, k as ArtifactResult, E as EvalResult, F as FeedbackPattern, l as JudgeConfig, a as JudgeInput, m as JudgeRubric, J as JudgeScore, n as PersonaRigor, R as RouteMap, o as RubricDimension, p as Turn, q as TurnResult } from './types-Croy5h7V.js';
|
|
23
12
|
export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-DuFBYg7A.js';
|
|
24
13
|
import { A as AgentEvalError } from './errors-Dwqw-T_m.js';
|
|
25
14
|
export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-Dwqw-T_m.js';
|
|
26
|
-
import {
|
|
27
|
-
export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind, f as FeedbackLabelSource, g as FeedbackOptimizerRow, h as FeedbackOutcome, i as FeedbackReplayAdapter, j as FeedbackReplayResult, k as FeedbackSeverity, l as FeedbackSplitPolicy, m as FeedbackTask, n as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-
|
|
28
|
-
import { A as AgentProfile$1 } from './agent-profile-
|
|
29
|
-
export {
|
|
15
|
+
import { b as FeedbackLabel, F as FeedbackTrajectoryStore, a as FeedbackTrajectory } from './feedback-trajectory-B3rErRsh.js';
|
|
16
|
+
export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind, f as FeedbackLabelSource, g as FeedbackOptimizerRow, h as FeedbackOutcome, i as FeedbackReplayAdapter, j as FeedbackReplayResult, k as FeedbackSeverity, l as FeedbackSplitPolicy, m as FeedbackTask, n as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-B3rErRsh.js';
|
|
17
|
+
import { A as AgentProfile$1 } from './agent-profile-DYRboYWu.js';
|
|
18
|
+
export { c as ArtifactCheckArtifact, d as ArtifactEventLike, e as ArtifactValidator, f as BackendIntegrityError, B as BackendIntegrityReport, C as CompletionRequirement, a as CompletionVerdict, b as CorrectnessChecker, L as LlmCorrectnessCheckerOpts, g as ProducedProposal, P as ProducedState, h as ProposalEventLike, i as RequirementCheck, R as RuntimeEventLike, S as SatisfiedBy, T as TaskGold, j as ToolCallEventLike, V as ValidationContext, k as ValidationIssue, l as ValidationResult, m as agentProfileHash, n as assertRealBackend, o as byteLengthRange, p as composeValidators, q as containsAll, r as createLlmCorrectnessChecker, s as extractProducedState, t as jsonHasKeys, u as parseCorrectnessResponse, v as regexMatch, w as summarizeBackendIntegrity, x as verifyCompletion } from './agent-profile-DYRboYWu.js';
|
|
30
19
|
export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
|
|
31
20
|
import { h as ReleaseConfidenceThresholds, f as ReleaseConfidenceScorecard } from './release-report-CN8hJlhk.js';
|
|
32
21
|
export { A as ActionableSideInfo, o as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, R as ReleaseConfidenceAxis, b as ReleaseConfidenceAxisName, c as ReleaseConfidenceInput, d as ReleaseConfidenceIssue, e as ReleaseConfidenceMetrics, g as ReleaseConfidenceStatus, i as ReleaseTraceEvidence, j as RenderReleaseReportOptions, V as Verdict, k as assertReleaseConfidence, l as bootstrapCi, m as evaluateReleaseConfidence, n as judgeReplayGate, r as renderReleaseReport } from './release-report-CN8hJlhk.js';
|
|
33
22
|
export { C as CliffsMagnitude, c as CorpusAgreementOptions, d as CorpusAgreementPerDimension, e as CorpusAgreementReport, f as CorpusScoreRecord, P as PairedBootstrapOptions, a as PairedBootstrapResult, W as WeightedCompositeInput, g as WeightedCompositeResult, b as benjaminiHochberg, h as bonferroni, i as cliffsDelta, j as cohensD, k as confidenceInterval, l as corpusInterRaterAgreement, m as corpusInterRaterAgreementFromJudgeScores, n as interRaterReliability, o as interpretCliffs, q as mannWhitneyU, r as normalizeScores, p as pairedBootstrap, s as pairedMde, t as pairedTTest, u as partialCredit, v as requiredSampleSize, x as weightedComposite, y as weightedMean, w as wilcoxonSignedRank } from './statistics-B7yCbi9i.js';
|
|
23
|
+
import { a as AnalyzeTracesInput, A as AnalyzeTracesOptions, b as AnalyzeTracesResult } from './analyst-t7zZS3TV.js';
|
|
24
|
+
export { c as AnalyzeTracesTurnSnapshot, d as analyzeTraces } from './analyst-t7zZS3TV.js';
|
|
25
|
+
import { OtelExporter, OtelExportConfig } from './traces.js';
|
|
26
|
+
export { CaptureFetchContext, CaptureFetchOptions, ExportableSpan, FlattenOtlpOptions, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpFlatLine, OtlpResourceSpans, OtlpSpan, OtlpToRunRecordsOptions, OtlpTraceRunRecord, ProjectedOtlpSpan, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TraceAggregate, TraceAnalystHookOptions, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, asNumber, asString, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, captureFetchToRawSink, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, extractOtlpAttributes, firstNumberAttr, firstStringAttr, flattenOtlpExportToNdjson, inferDomainKeywords, inferOtlpKind, iterateRawCalls, otelRunCompleteHook, otlpToRunRecords, otlpToTraceRunRecords, planTraceInsightQuestions, projectOtlpFlatLine, readOtlpStatus, scoreTraceInsightReadiness, stringField, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
|
|
27
|
+
export { D as DEFAULT_TRACE_ANALYST_BUDGETS, b as DatasetOverview, Q as QueryTracesPage, S as SearchSpanResult, c as SearchTraceResult, d as SpanMatchRecord, e as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, T as TraceAnalysisStore, f as TraceAnalystByteBudgets, g as TraceAnalystFilters, a as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, V as ViewSpansResult, k as ViewTraceOversized, l as ViewTraceResult } from './store-GmBE2pZZ.js';
|
|
28
|
+
import { d as RunCriticOptions, a as RunTrace, b as RunScoreWeights, R as RunScore } from './run-critic-BAIjX99r.js';
|
|
29
|
+
export { D as DEFAULT_RUN_SCORE_WEIGHTS, c as RunCritic, e as aggregateRunScore, f as clamp01 } from './run-critic-BAIjX99r.js';
|
|
30
|
+
import { S as SteeringBundle } from './harness-optimizer-EnEnQPsr.js';
|
|
31
|
+
export { D as DEFAULT_HARNESS_OBJECTIVES, H as HarnessAdapter, a as HarnessExperimentConfig, b as HarnessExperimentResult, c as HarnessIntervention, d as HarnessRunRequest, e as HarnessRunResult, f as HarnessScenario, g as HarnessSelection, h as HarnessVariant, i as HarnessVariantReport, M as MeasurementPolicy, j as SteeringDelta, k as SteeringRolePrompt, W as WorkflowTopology, m as mergeSteeringBundle, r as renderSteeringText, l as runHarnessExperiment, s as selectHarnessVariant, n as summarizeHarnessResults } from './harness-optimizer-EnEnQPsr.js';
|
|
34
32
|
import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-BdVaPyHT.js';
|
|
35
33
|
export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-BdVaPyHT.js';
|
|
36
34
|
import { T as TraceEmitter } from './emitter-DEZwY14K.js';
|
|
@@ -38,11 +36,17 @@ export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b a
|
|
|
38
36
|
export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CJzrpUua.js';
|
|
39
37
|
export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-CqTxMwDw.js';
|
|
40
38
|
export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, c as RawProviderDirection, d as RawProviderEvent, R as RawProviderSink, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
|
|
39
|
+
export { D as DEFAULT_REDACTION_RULES, b as REDACTION_VERSION, a as RedactionReport, R as RedactionRule, r as redactString, c as redactValue } from './redact-B40YG2M_.js';
|
|
40
|
+
import { h as BudgetSpec, B as BudgetLedgerEntry, R as Run$1, L as LlmSpan } from './schema-m0gsnbt3.js';
|
|
41
|
+
export { A as Artifact, E as EventKind, i as FAILURE_CLASSES, F as FailureClass, G as GenericSpan, J as JudgeSpan, M as Message, d as RetrievalSpan, g as RunLayer, f as RunStatus, e as SandboxSpan, S as Span, j as SpanBase, c as SpanKind, k as SpanStatus, l as TRACE_SCHEMA_VERSION, T as ToolSpan, a as TraceEvent, m as isJudgeSpan, n as isLlmSpan, o as isRetrievalSpan, p as isSandboxSpan, q as isToolSpan } from './schema-m0gsnbt3.js';
|
|
42
|
+
import { T as TraceStore, R as RunFilter } from './store-CKUAgsJz.js';
|
|
43
|
+
export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreOptions, I as InMemoryTraceStore, S as SpanFilter } from './store-CKUAgsJz.js';
|
|
41
44
|
export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-CL7IVgkJ.js';
|
|
42
45
|
import { a as BaselineReport } from './baseline-DE36-Np7.js';
|
|
43
46
|
export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-DE36-Np7.js';
|
|
44
47
|
import { T as Trajectory, a as TrajectoryStep } from './trajectory-GEdXJCL5.js';
|
|
45
48
|
export { b as buildTrajectory } from './trajectory-GEdXJCL5.js';
|
|
49
|
+
export { D as Direction, O as Objective, P as ParetoResult, c as crowdingDistance, d as dominates, p as paretoFrontier, a as paretoFrontierWithCrowding, s as scalarScore } from './pareto-E-pembql.js';
|
|
46
50
|
export { D as DefaultVerdict } from './verdict-CeEgtjyI.js';
|
|
47
51
|
import { a as DatasetScenario, b as Dataset } from './dataset-B2kL-fSM.js';
|
|
48
52
|
export { d as DatasetDifficulty, c as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-B2kL-fSM.js';
|
|
@@ -50,878 +54,20 @@ export { b as CalibrationResult, c as CandidateScore, a as ContinuousAgreement,
|
|
|
50
54
|
export { D as DEFAULT_RED_TEAM_CORPUS, R as RedTeamCase, a as RedTeamCategory, b as RedTeamFinding, c as RedTeamPayload, d as RedTeamReport, r as redTeamDataset, e as redTeamReport, s as scoreRedTeamOutput, t as toolNamesForRun } from './red-team-DW9Ca_tj.js';
|
|
51
55
|
import { a as PrmGrader } from './rubric-BOfxn4ja.js';
|
|
52
56
|
export { EuRiskClass, GovernanceContext, GovernanceFinding, GovernanceReport, UseCaseSignals, classifyEuAiRisk, euAiActReport, nistAiRmfReport, renderMarkdown, soc2Report, summarize } from './governance/index.js';
|
|
57
|
+
import { b as Layer, S as Severity, L as LayerResult, c as VerifyContext } from './multi-layer-verifier-DlWCXuxL.js';
|
|
58
|
+
export { F as Finding, d as LayerStatus, M as MultiLayerVerifier, a as VerificationReport, V as VerifyOptions, g as gradeSemanticStatus } from './multi-layer-verifier-DlWCXuxL.js';
|
|
59
|
+
import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
|
|
60
|
+
export { d as LlmCallError, b as LlmCallRequest, c as LlmCallResult, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-DbjLfz-K.js';
|
|
53
61
|
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-DsnOpCO6.js';
|
|
62
|
+
export { C as CallbackResearcher, d as CallbackResearcherOptions, e as CampaignFactoryParams, f as CampaignIntegrityPolicy, g as CampaignRunContext, h as CampaignRunOutcome, i as CampaignRunner, j as CampaignScenario, k as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, l as FailedRun, F as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-C_KJyIGg.js';
|
|
54
63
|
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, m as GateDecision, n as GateEvidence, H as HeldOutGate, o as HeldOutGateConfig, q as HeldOutGateRejectionCode, P as ParetoFigureSpec, c as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, d as ResearchReport, e as ResearchReportCandidate, f as ResearchReportDecision, g as ResearchReportMethodology, h as ResearchReportOptions, i as ResearchReportRecommendation, S as SummaryTable, j as SummaryTableOptions, k as SummaryTableRow, l as gainHistogram, p as paretoChart, r as researchReport, s as summaryTable } from './summary-report-ByiOUrHj.js';
|
|
55
64
|
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
56
|
-
import { S as Scenario$1, a as JudgeConfig, G as Gate } from './types-
|
|
57
|
-
import { d as GepaDriverConstraints, R as RunImprovementLoopResult } from './run-improvement-loop-
|
|
65
|
+
import { S as Scenario$1, a as JudgeConfig, G as Gate } from './types-Bba0vl1V.js';
|
|
66
|
+
import { d as GepaDriverConstraints, R as RunImprovementLoopResult } from './run-improvement-loop-BqYH2vCR.js';
|
|
67
|
+
import '@ax-llm/ax';
|
|
68
|
+
import 'zod';
|
|
58
69
|
import './outcome-store-D6KWmYvj.js';
|
|
59
70
|
|
|
60
|
-
interface RunScore {
|
|
61
|
-
success: number;
|
|
62
|
-
goalProgress: number;
|
|
63
|
-
repoGroundedness: number;
|
|
64
|
-
driftPenalty: number;
|
|
65
|
-
toolUseQuality: number;
|
|
66
|
-
patchQuality: number;
|
|
67
|
-
testReality: number;
|
|
68
|
-
finalGate: number;
|
|
69
|
-
reviewerBlockers: number;
|
|
70
|
-
costUsd: number;
|
|
71
|
-
wallSeconds: number;
|
|
72
|
-
notes?: string[];
|
|
73
|
-
}
|
|
74
|
-
interface RunScoreWeights {
|
|
75
|
-
success: number;
|
|
76
|
-
goalProgress: number;
|
|
77
|
-
repoGroundedness: number;
|
|
78
|
-
driftPenalty: number;
|
|
79
|
-
toolUseQuality: number;
|
|
80
|
-
patchQuality: number;
|
|
81
|
-
testReality: number;
|
|
82
|
-
finalGate: number;
|
|
83
|
-
reviewerBlockers: number;
|
|
84
|
-
costUsd: number;
|
|
85
|
-
wallSeconds: number;
|
|
86
|
-
}
|
|
87
|
-
declare const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights;
|
|
88
|
-
declare function aggregateRunScore(score: RunScore, weights?: Partial<RunScoreWeights>): number;
|
|
89
|
-
declare function clamp01(value: number): number;
|
|
90
|
-
|
|
91
|
-
interface RunTrace {
|
|
92
|
-
run: Run$1;
|
|
93
|
-
spans: Span[];
|
|
94
|
-
events: TraceEvent[];
|
|
95
|
-
artifacts: Artifact$1[];
|
|
96
|
-
budget: BudgetLedgerEntry[];
|
|
97
|
-
}
|
|
98
|
-
interface RunCriticOptions {
|
|
99
|
-
weights?: Partial<RunScoreWeights>;
|
|
100
|
-
driftPatterns?: RegExp[];
|
|
101
|
-
}
|
|
102
|
-
declare class RunCritic {
|
|
103
|
-
private readonly weights?;
|
|
104
|
-
private readonly driftPatterns;
|
|
105
|
-
constructor(options?: RunCriticOptions);
|
|
106
|
-
score(store: TraceStore, runId: string): Promise<RunScore>;
|
|
107
|
-
scoreTrace(trace: RunTrace): RunScore;
|
|
108
|
-
rank(score: RunScore): number;
|
|
109
|
-
private isDrift;
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
/**
|
|
113
|
-
* Semantic concept judge — "does the built artifact actually implement
|
|
114
|
-
* the features the user asked for?"
|
|
115
|
-
*
|
|
116
|
-
* Distinct from the domain/code/coherence judges in `judges.ts`:
|
|
117
|
-
* - those judges score free-form conversational agent outputs along
|
|
118
|
-
* quality dimensions (accuracy, depth, etc.)
|
|
119
|
-
* - this judge scores a *built artifact* (served HTML + source files)
|
|
120
|
-
* against an explicit list of expected concepts, returning per-concept
|
|
121
|
-
* {present, score 0-10, evidence, severity}.
|
|
122
|
-
*
|
|
123
|
-
* The judge is strict about distinguishing (a) a working implementation
|
|
124
|
-
* from (b) a keyword-present stub. "// TODO: mint button" is NOT present.
|
|
125
|
-
* Only real, functional, wired-up code counts.
|
|
126
|
-
*
|
|
127
|
-
* Use via {@link createSemanticConceptJudge} or directly via
|
|
128
|
-
* {@link runSemanticConceptJudge}. Soft-fails (available=false) on LLM
|
|
129
|
-
* or JSON-parse errors so the caller can treat that as "layer skipped"
|
|
130
|
-
* rather than "layer failed" in a multi-layer pipeline.
|
|
131
|
-
*/
|
|
132
|
-
|
|
133
|
-
/**
|
|
134
|
-
* Implementation complexity class for weighted scoring.
|
|
135
|
-
*
|
|
136
|
-
* - `render` (default): the concept is a UI surface that displays static
|
|
137
|
-
* data — render a list, show a counter, lay out a button. Single-file
|
|
138
|
-
* work, no external integration.
|
|
139
|
-
* - `integrate`: the concept requires wiring a real external system —
|
|
140
|
-
* wallet connect (wagmi + RainbowKit + chain config), payment provider
|
|
141
|
-
* (Stripe Elements + intent + webhook), an API client with auth.
|
|
142
|
-
* Multi-file, library-knowledge, runtime correctness matters.
|
|
143
|
-
* - `compute`: the concept requires algorithmic work — solver, simulator,
|
|
144
|
-
* constraint propagation, ML inference. Correctness > UI polish.
|
|
145
|
-
*
|
|
146
|
-
* Default weights (when applied via `weightConcepts: 'complexity'`):
|
|
147
|
-
* render=1.0, integrate=2.0, compute=2.5
|
|
148
|
-
*
|
|
149
|
-
* Cross-vertical scoring without complexity weighting silently inflates
|
|
150
|
-
* the rate of UI-heavy verticals (healthcare, fintech dashboards) vs
|
|
151
|
-
* integration-heavy verticals (DeFi, wallets) — all concepts treated
|
|
152
|
-
* equally even though the agent does 2-3x the work for `integrate`.
|
|
153
|
-
*/
|
|
154
|
-
type ConceptComplexity = 'render' | 'integrate' | 'compute';
|
|
155
|
-
interface ConceptSpec {
|
|
156
|
-
name: string;
|
|
157
|
-
/** Short hints that help the judge; not used for matching. */
|
|
158
|
-
keywords?: string[];
|
|
159
|
-
/** Optional explicit weight; default 1.0. Overrides complexity-derived weight. */
|
|
160
|
-
weight?: number;
|
|
161
|
-
/** Implementation complexity class. Default `render`. */
|
|
162
|
-
complexity?: ConceptComplexity;
|
|
163
|
-
}
|
|
164
|
-
interface ConceptFinding {
|
|
165
|
-
concept: string;
|
|
166
|
-
present: boolean;
|
|
167
|
-
/** 0..10. 10 = production-ready; 7 = functional thin; 4 = partial; 0 = absent. */
|
|
168
|
-
score: number;
|
|
169
|
-
evidence: string;
|
|
170
|
-
severity: Severity;
|
|
171
|
-
}
|
|
172
|
-
interface SemanticConceptJudgeInput {
|
|
173
|
-
/** Full natural-language prompt the agent was handed. */
|
|
174
|
-
userRequest: string;
|
|
175
|
-
/** Rendered HTML the preview returns (UI artifacts). Optional. */
|
|
176
|
-
servedHtml?: string;
|
|
177
|
-
/** Top-level source files from the agent's workdir. */
|
|
178
|
-
sourceFiles: Array<{
|
|
179
|
-
path: string;
|
|
180
|
-
content: string;
|
|
181
|
-
}>;
|
|
182
|
-
/** The expected concept list. */
|
|
183
|
-
expectedConcepts: ConceptSpec[];
|
|
184
|
-
/** Free-form metadata (id, difficulty) to inject into the prompt. */
|
|
185
|
-
artifactLabel?: string;
|
|
186
|
-
artifactDescription?: string;
|
|
187
|
-
}
|
|
188
|
-
interface SemanticConceptJudgeResult {
|
|
189
|
-
kind: 'semantic-concept';
|
|
190
|
-
version: string;
|
|
191
|
-
/** Normalized 0..1 score — mean of per-concept scores / 10. */
|
|
192
|
-
score: number;
|
|
193
|
-
presentCount: number;
|
|
194
|
-
totalCount: number;
|
|
195
|
-
findings: ConceptFinding[];
|
|
196
|
-
summary: string;
|
|
197
|
-
durationMs: number;
|
|
198
|
-
costUsd: number | null;
|
|
199
|
-
/** False on LLM/JSON error — treat as "skipped / unable to judge" in pipelines. */
|
|
200
|
-
available: boolean;
|
|
201
|
-
error?: string;
|
|
202
|
-
}
|
|
203
|
-
/**
|
|
204
|
-
* Score-aggregation strategy. `mean` averages 0-10 scores uniformly.
|
|
205
|
-
* `complexity` applies the default weight table (render=1, integrate=2,
|
|
206
|
-
* compute=2.5) unless a concept has an explicit `weight`. `explicit`
|
|
207
|
-
* honors only `weight` (defaulting to 1 for unspecified).
|
|
208
|
-
*/
|
|
209
|
-
type ConceptWeightStrategy = 'mean' | 'complexity' | 'explicit';
|
|
210
|
-
declare const DEFAULT_COMPLEXITY_WEIGHTS: Record<ConceptComplexity, number>;
|
|
211
|
-
interface SemanticConceptJudgeOptions {
|
|
212
|
-
/** Model id to call. Default 'claude-sonnet-4-6' via agent-eval defaults. */
|
|
213
|
-
model?: string;
|
|
214
|
-
/** Per-call timeout. Default 180s. */
|
|
215
|
-
timeoutMs?: number;
|
|
216
|
-
/** Pipeline budget for the prompt (source blob truncation). Default 45000. */
|
|
217
|
-
maxSourceChars?: number;
|
|
218
|
-
/** Per-file cap before inclusion. Default 20000. */
|
|
219
|
-
maxPerFileChars?: number;
|
|
220
|
-
/** HTML cap. Default 30000. */
|
|
221
|
-
maxHtmlChars?: number;
|
|
222
|
-
/** LlmClient config (baseUrl, apiKey, authHeader, …). */
|
|
223
|
-
llm?: LlmClientOptions;
|
|
224
|
-
/**
|
|
225
|
-
* Score aggregation strategy. Default `mean` — uniform average across
|
|
226
|
-
* concepts. Cross-vertical comparisons should use `complexity` to
|
|
227
|
-
* neutralize the integrate-vs-render asymmetry.
|
|
228
|
-
*/
|
|
229
|
-
weightConcepts?: ConceptWeightStrategy;
|
|
230
|
-
/** Override the default complexity → weight table. */
|
|
231
|
-
complexityWeights?: Partial<Record<ConceptComplexity, number>>;
|
|
232
|
-
}
|
|
233
|
-
declare const SEMANTIC_CONCEPT_JUDGE_VERSION = "semantic-concept-judge-v1-2026-04-24";
|
|
234
|
-
/**
|
|
235
|
-
* Run the semantic concept judge. Soft-fails to available=false on
|
|
236
|
-
* LLM/JSON errors — callers in a MultiLayerVerifier pipeline can treat
|
|
237
|
-
* that as "skip" rather than "fail."
|
|
238
|
-
*/
|
|
239
|
-
declare function runSemanticConceptJudge(input: SemanticConceptJudgeInput, options?: SemanticConceptJudgeOptions): Promise<SemanticConceptJudgeResult>;
|
|
240
|
-
/**
|
|
241
|
-
* Factory: pin LLM options once, return a closure that accepts inputs.
|
|
242
|
-
* Convenient for pipelines that want to share a single LlmClient config.
|
|
243
|
-
*/
|
|
244
|
-
declare function createSemanticConceptJudge(options?: SemanticConceptJudgeOptions): (input: SemanticConceptJudgeInput) => Promise<SemanticConceptJudgeResult>;
|
|
245
|
-
|
|
246
|
-
/**
|
|
247
|
-
* Adapter factories — lift each existing agent-eval primitive into the
|
|
248
|
-
* Analyst contract without re-implementing it.
|
|
249
|
-
*
|
|
250
|
-
* Five primitives, five factories. Each one:
|
|
251
|
-
* - Builds an Analyst with a stable id (caller chooses; defaults
|
|
252
|
-
* given), a sensible default `inputKind`, a version derived from
|
|
253
|
-
* the wrapped primitive's version + an adapter revision, and an
|
|
254
|
-
* `analyze()` that calls the primitive and lifts its output to
|
|
255
|
-
* AnalystFinding[] using `makeFinding()`.
|
|
256
|
-
* - Maps severities: the existing `Severity` ('critical' | 'major' |
|
|
257
|
-
* 'minor' | 'info') projects onto AnalystSeverity ('critical' |
|
|
258
|
-
* 'high' | 'medium' | 'low' | 'info'); 'major' → 'high', 'minor' →
|
|
259
|
-
* 'medium'. Domain analysts that want finer-grained mapping override.
|
|
260
|
-
*
|
|
261
|
-
* Adapters never own state. Calling the same factory twice with the
|
|
262
|
-
* same primitive instance is safe.
|
|
263
|
-
*/
|
|
264
|
-
|
|
265
|
-
declare function liftSeverity(s: Severity): AnalystSeverity;
|
|
266
|
-
interface TraceAnalystAdapterOpts {
|
|
267
|
-
id?: string;
|
|
268
|
-
area?: string;
|
|
269
|
-
/** The natural-language question(s) put to the analyst. One finding per question. */
|
|
270
|
-
questions: string[];
|
|
271
|
-
/** Caller-provided AxAI service — same one trace-analyst.ts expects. */
|
|
272
|
-
ai: AxAIService;
|
|
273
|
-
model?: string;
|
|
274
|
-
/** Forwarded to analyzeTraces. */
|
|
275
|
-
extra?: Omit<AnalyzeTracesOptions, 'source' | 'ai' | 'model'>;
|
|
276
|
-
}
|
|
277
|
-
/**
|
|
278
|
-
* @deprecated Prefer `createTraceAnalystKind` + one of the failure /
|
|
279
|
-
* improvement kinds from `./kinds`. This adapter wraps the legacy
|
|
280
|
-
* `analyzeTraces` flow whose output is `findings:string[]` — every
|
|
281
|
-
* bullet gets flat-defaulted severity `medium` / confidence `0.6`,
|
|
282
|
-
* which loses the per-finding grading kinds provide via Ax structured
|
|
283
|
-
* output + Zod validation. Kept for one minor while consumers migrate.
|
|
284
|
-
*/
|
|
285
|
-
declare function createTraceAnalystAdapter(opts: TraceAnalystAdapterOpts): Analyst<TraceAnalysisStore>;
|
|
286
|
-
interface VerifierAdapterOpts<Env> {
|
|
287
|
-
id?: string;
|
|
288
|
-
area?: string;
|
|
289
|
-
verifier: MultiLayerVerifier<Env>;
|
|
290
|
-
/**
|
|
291
|
-
* The verifier expects an `env` per run. Adapters take it from
|
|
292
|
-
* `AnalystRunInputs.custom[<id>]` via the registry's 'custom' routing.
|
|
293
|
-
*/
|
|
294
|
-
options?: Omit<VerifyOptions<Env>, 'env'>;
|
|
295
|
-
}
|
|
296
|
-
declare function createVerifierAdapter<Env>(opts: VerifierAdapterOpts<Env>): Analyst<Env>;
|
|
297
|
-
interface RunCriticAdapterOpts {
|
|
298
|
-
id?: string;
|
|
299
|
-
area?: string;
|
|
300
|
-
critic?: RunCritic;
|
|
301
|
-
/** Optional threshold below which a dimension is reported as a finding. Default 0.5. */
|
|
302
|
-
threshold?: number;
|
|
303
|
-
}
|
|
304
|
-
declare function createRunCriticAdapter(opts?: RunCriticAdapterOpts): Analyst<RunTrace>;
|
|
305
|
-
interface JudgeAdapterOpts {
|
|
306
|
-
id?: string;
|
|
307
|
-
area?: string;
|
|
308
|
-
judge: JudgeFn;
|
|
309
|
-
/** TCloud handle the JudgeFn calls. */
|
|
310
|
-
tcloud: TCloud;
|
|
311
|
-
/** Optional cost classification — most judges call an LLM. */
|
|
312
|
-
cost?: Analyst['cost'];
|
|
313
|
-
/** Optional threshold below which a JudgeScore becomes a finding. Default 6 (on 0-10 scale). */
|
|
314
|
-
threshold?: number;
|
|
315
|
-
}
|
|
316
|
-
declare function createJudgeAdapter(opts: JudgeAdapterOpts): Analyst<JudgeInput>;
|
|
317
|
-
interface SemanticConceptJudgeAdapterOpts {
|
|
318
|
-
id?: string;
|
|
319
|
-
area?: string;
|
|
320
|
-
options?: SemanticConceptJudgeOptions;
|
|
321
|
-
}
|
|
322
|
-
declare function createSemanticConceptJudgeAdapter(opts?: SemanticConceptJudgeAdapterOpts): Analyst<SemanticConceptJudgeInput>;
|
|
323
|
-
|
|
324
|
-
/**
|
|
325
|
-
* Typed Ax output for analyst findings.
|
|
326
|
-
*
|
|
327
|
-
* Replaces the legacy `findings:string[]` pattern (where every bullet
|
|
328
|
-
* became a flat-severity `AnalystFinding`) with a structured object
|
|
329
|
-
* array. Ax binds the field as `findings:json[]` so the provider emits
|
|
330
|
-
* native structured output; at the kind-factory boundary we Zod-validate
|
|
331
|
-
* each emitted finding so malformed rows fail loud instead of being
|
|
332
|
-
* silently lifted with default severity.
|
|
333
|
-
*
|
|
334
|
-
* Why not `f.object().array()` directly in the signature? The Ax
|
|
335
|
-
* signature string `question:string -> findings:json[]` already lets
|
|
336
|
-
* the provider emit JSON arrays. A Zod boundary is required either
|
|
337
|
-
* way (the provider can return any JSON), and Zod gives us a single
|
|
338
|
-
* validation surface independent of which Ax version is installed.
|
|
339
|
-
*/
|
|
340
|
-
|
|
341
|
-
declare const ANALYST_SEVERITIES: readonly ["critical", "high", "medium", "low", "info"];
|
|
342
|
-
declare const RawAnalystFindingSchema: z.ZodObject<{
|
|
343
|
-
severity: z.ZodEnum<{
|
|
344
|
-
info: "info";
|
|
345
|
-
critical: "critical";
|
|
346
|
-
medium: "medium";
|
|
347
|
-
low: "low";
|
|
348
|
-
high: "high";
|
|
349
|
-
}>;
|
|
350
|
-
claim: z.ZodString;
|
|
351
|
-
subject: z.ZodOptional<z.ZodString>;
|
|
352
|
-
evidence_uri: z.ZodString;
|
|
353
|
-
evidence_excerpt: z.ZodOptional<z.ZodString>;
|
|
354
|
-
confidence: z.ZodNumber;
|
|
355
|
-
rationale: z.ZodOptional<z.ZodString>;
|
|
356
|
-
recommended_action: z.ZodOptional<z.ZodString>;
|
|
357
|
-
}, z.core.$strict>;
|
|
358
|
-
type RawAnalystFinding = z.infer<typeof RawAnalystFindingSchema>;
|
|
359
|
-
/**
|
|
360
|
-
* Description embedded into the actor prompt so the LLM knows what
|
|
361
|
-
* shape to emit. Kept here so kinds share one source of truth rather
|
|
362
|
-
* than restating the schema in every prompt.
|
|
363
|
-
*/
|
|
364
|
-
declare const RAW_FINDING_SCHEMA_PROMPT = "Each finding MUST be a JSON object with these fields:\n - severity: one of \"critical\" | \"high\" | \"medium\" | \"low\" | \"info\"\n - claim: one-sentence statement (max 2000 chars)\n - subject?: the leaf id, agent id, span id, tool name, or noun phrase the finding is about\n - evidence_uri: \"span://<trace_id>/<span_id>\" for trace evidence, \"artifact://<relative-path>\" for files, \"metric://<name>\" for named scalars \u2014 ALWAYS cite a real id surfaced by the tools\n - evidence_excerpt?: short quote (<=2000 chars) from the cited span/artifact\n - confidence: number 0..1 \u2014 0.9+ when backed by exact quotes, 0.6-0.8 for inferred patterns, <0.5 for speculative\n - rationale?: one or two sentences explaining the reasoning\n - recommended_action?: concrete change phrased as an imperative (\"Add ...\", \"Replace ...\", \"Stop ...\") \u2014 omit when the finding is purely descriptive\n\nEmit an empty array when the question has no findings to report. Do not fabricate evidence.";
|
|
365
|
-
/**
|
|
366
|
-
* Validate one row emitted by the LLM. Returns the typed finding on
|
|
367
|
-
* success; returns `null` and logs the reason on failure so the kind
|
|
368
|
-
* factory can skip-and-count rather than abort the whole analyst run.
|
|
369
|
-
*/
|
|
370
|
-
declare function parseRawFinding(row: unknown, log?: (msg: string, fields?: Record<string, unknown>) => void): RawAnalystFinding | null;
|
|
371
|
-
|
|
372
|
-
/**
|
|
373
|
-
* Typed `FindingSubject` — the canonical grammar every analyst kind emits.
|
|
374
|
-
*
|
|
375
|
-
* Background: kind actor prompts have always documented a subject grammar
|
|
376
|
-
* (e.g. `system-prompt:<section>`, `agent-knowledge:wiki:<slug>`) but the
|
|
377
|
-
* LLM was unconstrained — it could emit `subject: "fix the prompt"`
|
|
378
|
-
* (prose) and downstream adapters routed on `startsWith(...)` would
|
|
379
|
-
* silently skip it. Every per-vertical `ImprovementAdapter` had a
|
|
380
|
-
* routing table that mostly caught nothing.
|
|
381
|
-
*
|
|
382
|
-
* This module fixes that:
|
|
383
|
-
* - `parseFindingSubject(raw)` — returns the typed `FindingSubject`
|
|
384
|
-
* when `raw` matches the grammar, else `null`. Used at the
|
|
385
|
-
* `RawAnalystFindingSchema` boundary so malformed subjects are
|
|
386
|
-
* rejected loudly instead of silently lifted into the registry.
|
|
387
|
-
* - `FindingSubjectKind` — the union of valid locus categories. Each
|
|
388
|
-
* variant carries the typed components downstream adapters resolve
|
|
389
|
-
* against the agent's surface manifest (no string parsing in the
|
|
390
|
-
* adapter).
|
|
391
|
-
* - `FINDING_SUBJECT_GRAMMAR_PROMPT` — single source of truth for the
|
|
392
|
-
* grammar string embedded in kind actor prompts. Drift between
|
|
393
|
-
* prompt and parser is impossible if every kind imports this.
|
|
394
|
-
*
|
|
395
|
-
* The grammar is intentionally NARROW — only loci the substrate's
|
|
396
|
-
* default `ImprovementAdapter` / `KnowledgeAdapter` can act on. A
|
|
397
|
-
* finding with a subject outside this set fails the parser; the kind
|
|
398
|
-
* author either extends the grammar here (and adds adapter routing)
|
|
399
|
-
* or rephrases the prompt to map onto an existing variant.
|
|
400
|
-
*
|
|
401
|
-
* `failure-mode` is the one exception — its subjects are free-form
|
|
402
|
-
* cluster labels, not loci. The schema preserves them as
|
|
403
|
-
* `{ kind: 'cluster', label }` and the adapters skip them (cluster
|
|
404
|
-
* findings are evidence, not actionable mutations).
|
|
405
|
-
*/
|
|
406
|
-
|
|
407
|
-
/**
|
|
408
|
-
* Discriminated union of every locus the substrate can route findings to.
|
|
409
|
-
*
|
|
410
|
-
* Adapters narrow on `kind` and use the typed components (no string
|
|
411
|
-
* parsing). Adding a variant here REQUIRES updating the parser, the
|
|
412
|
-
* grammar prompt, and at least one adapter — by design.
|
|
413
|
-
*/
|
|
414
|
-
type FindingSubject = {
|
|
415
|
-
kind: 'knowledge.wiki';
|
|
416
|
-
slug: string;
|
|
417
|
-
heading?: string;
|
|
418
|
-
} | {
|
|
419
|
-
kind: 'knowledge.claim';
|
|
420
|
-
topic: string;
|
|
421
|
-
} | {
|
|
422
|
-
kind: 'knowledge.raw';
|
|
423
|
-
sourceId: string;
|
|
424
|
-
} | {
|
|
425
|
-
kind: 'knowledge.stale';
|
|
426
|
-
slug: string;
|
|
427
|
-
} | {
|
|
428
|
-
kind: 'system-prompt';
|
|
429
|
-
section: string;
|
|
430
|
-
} | {
|
|
431
|
-
kind: 'tool-doc';
|
|
432
|
-
tool: string;
|
|
433
|
-
aspect?: string;
|
|
434
|
-
} | {
|
|
435
|
-
kind: 'new-tool';
|
|
436
|
-
name: string;
|
|
437
|
-
} | {
|
|
438
|
-
kind: 'rag';
|
|
439
|
-
corpus: string;
|
|
440
|
-
docId: string;
|
|
441
|
-
} | {
|
|
442
|
-
kind: 'memory';
|
|
443
|
-
key: string;
|
|
444
|
-
} | {
|
|
445
|
-
kind: 'scaffolding';
|
|
446
|
-
concern: string;
|
|
447
|
-
} | {
|
|
448
|
-
kind: 'output-schema';
|
|
449
|
-
field: string;
|
|
450
|
-
} | {
|
|
451
|
-
kind: 'websearch.outdated';
|
|
452
|
-
topic: string;
|
|
453
|
-
} | {
|
|
454
|
-
kind: 'prior-run-summary';
|
|
455
|
-
topic: string;
|
|
456
|
-
} | {
|
|
457
|
-
kind: 'cluster';
|
|
458
|
-
label: string;
|
|
459
|
-
};
|
|
460
|
-
type FindingSubjectKind = FindingSubject['kind'];
|
|
461
|
-
declare const FINDING_SUBJECT_KINDS: ReadonlyArray<FindingSubjectKind>;
|
|
462
|
-
/**
|
|
463
|
-
* Parse a raw subject string emitted by an analyst kind's actor.
|
|
464
|
-
*
|
|
465
|
-
* Returns the typed `FindingSubject` when `raw` matches the grammar,
|
|
466
|
-
* else `null`. Callers use the `null` return as a signal to either
|
|
467
|
-
* (a) reject the finding at parse time (kinds that emit typed loci —
|
|
468
|
-
* knowledge-gap, improvement, knowledge-poisoning) or (b) lift it as
|
|
469
|
-
* a cluster label (failure-mode).
|
|
470
|
-
*
|
|
471
|
-
* Slugs are constrained to `[a-z0-9-]+` (lowercase kebab) to keep file
|
|
472
|
-
* paths sane downstream. Topics / keys / sections allow any non-empty
|
|
473
|
-
* string (free-form for the LLM's voice) but get trimmed.
|
|
474
|
-
*
|
|
475
|
-
* Empty / whitespace-only inputs return `null`. `undefined` returns
|
|
476
|
-
* `null`. Both are surfaced by the caller as a rejected subject.
|
|
477
|
-
*/
|
|
478
|
-
declare function parseFindingSubject(raw: string | null | undefined): FindingSubject | null;
|
|
479
|
-
/**
|
|
480
|
-
* Render the parsed subject back to its canonical string form. Inverse
|
|
481
|
-
* of `parseFindingSubject`; useful when the substrate constructs new
|
|
482
|
-
* findings programmatically (e.g. for tests, replays, or
|
|
483
|
-
* `id_basis` carry-forward).
|
|
484
|
-
*/
|
|
485
|
-
declare function renderFindingSubject(s: FindingSubject): string;
|
|
486
|
-
/**
|
|
487
|
-
* The grammar text embedded into kind actor prompts. Kinds opt into
|
|
488
|
-
* the subset of variants they emit (e.g. `improvement` excludes the
|
|
489
|
-
* cluster variant; `failure-mode` includes ONLY the cluster variant).
|
|
490
|
-
*
|
|
491
|
-
* Drift between prompt and parser is impossible: every kind imports
|
|
492
|
-
* this constant + the matching `expects` set, and the unit tests below
|
|
493
|
-
* lock the table to the parser.
|
|
494
|
-
*/
|
|
495
|
-
declare const FINDING_SUBJECT_GRAMMAR_PROMPT: string;
|
|
496
|
-
/**
|
|
497
|
-
* The variants each kind is allowed to emit. Used at the kind factory
|
|
498
|
-
* boundary so a knowledge-gap finding can't sneak in a `system-prompt:*`
|
|
499
|
-
* subject (the improvement-analyst's job) and vice versa.
|
|
500
|
-
*
|
|
501
|
-
* `failure-mode` is restricted to `cluster` — the only kind that emits
|
|
502
|
-
* a non-locus subject.
|
|
503
|
-
*/
|
|
504
|
-
declare const KIND_EXPECTED_SUBJECTS: Record<string, ReadonlyArray<FindingSubjectKind>>;
|
|
505
|
-
/**
|
|
506
|
-
* Zod schema that validates a raw subject string and returns the parsed
|
|
507
|
-
* `FindingSubject`. Embedded in `RawAnalystFindingSchema` via
|
|
508
|
-
* `transform`, so `subject` arrives at the kind factory either as a
|
|
509
|
-
* typed locus or as a parse error attached to a single Zod issue.
|
|
510
|
-
*
|
|
511
|
-
* Optionality is preserved: subjects ARE optional on the wire (some
|
|
512
|
-
* findings are descriptive, not actionable). When present, they MUST
|
|
513
|
-
* parse — emitting a malformed subject is a contract violation, not a
|
|
514
|
-
* soft signal.
|
|
515
|
-
*/
|
|
516
|
-
declare const FindingSubjectStringSchema: z.ZodString;
|
|
517
|
-
|
|
518
|
-
/**
|
|
519
|
-
* FindingsStore — durable persistence for AnalystFinding rows + a diff
|
|
520
|
-
* helper so we can answer "what changed since the last run?" without
|
|
521
|
-
* recomputing analysts.
|
|
522
|
-
*
|
|
523
|
-
* On-disk shape is JSONL: one finding per line, append-only, locked via
|
|
524
|
-
* LockedJsonlAppender. Operators get crash-safety (no partial JSON),
|
|
525
|
-
* cheap reads (sequential parse), and trivial backup (rsync the file).
|
|
526
|
-
*
|
|
527
|
-
* Reads are non-locking: a reader sees a consistent snapshot of all
|
|
528
|
-
* fully-written lines and skips an incomplete trailing line if the
|
|
529
|
-
* writer is mid-append. Cross-process locking is intentionally out of
|
|
530
|
-
* scope (see locked-jsonl-appender.ts).
|
|
531
|
-
*
|
|
532
|
-
* The store is run-scoped: callers pass `runId` on append and on load,
|
|
533
|
-
* which keeps multi-run files cleanly partitioned. The `diffFindings`
|
|
534
|
-
* helper compares two run-id sets using stable `finding_id` semantics —
|
|
535
|
-
* the diff is the cross-run signal the regression dashboard renders.
|
|
536
|
-
*/
|
|
537
|
-
|
|
538
|
-
/**
|
|
539
|
-
* One persisted row. We attach `run_id` on disk so a single file can
|
|
540
|
-
* hold multiple runs and the diff helper can query without re-walking
|
|
541
|
-
* separate files.
|
|
542
|
-
*/
|
|
543
|
-
interface PersistedFinding extends AnalystFinding {
|
|
544
|
-
run_id: string;
|
|
545
|
-
}
|
|
546
|
-
declare class FindingsStore {
|
|
547
|
-
readonly path: string;
|
|
548
|
-
private readonly appender;
|
|
549
|
-
constructor(path: string);
|
|
550
|
-
append(runId: string, findings: AnalystFinding[]): Promise<void>;
|
|
551
|
-
/** Load every persisted finding. Discards malformed trailing lines silently. */
|
|
552
|
-
loadAll(): PersistedFinding[];
|
|
553
|
-
/** Filter to a single run. */
|
|
554
|
-
loadRun(runId: string): PersistedFinding[];
|
|
555
|
-
}
|
|
556
|
-
interface FindingsDiff {
|
|
557
|
-
/** New finding ids in `current` that weren't in `previous`. */
|
|
558
|
-
appeared: PersistedFinding[];
|
|
559
|
-
/** Finding ids in `previous` that aren't in `current`. */
|
|
560
|
-
disappeared: PersistedFinding[];
|
|
561
|
-
/** Same finding id present in both runs and unchanged per the materiality test. */
|
|
562
|
-
persisted: PersistedFinding[];
|
|
563
|
-
/**
|
|
564
|
-
* Same finding id in both runs but at least one non-identity field
|
|
565
|
-
* shifted per `DiffPolicy.isMaterial`. Reported as [previous, current].
|
|
566
|
-
*/
|
|
567
|
-
changed: Array<{
|
|
568
|
-
previous: PersistedFinding;
|
|
569
|
-
current: PersistedFinding;
|
|
570
|
-
}>;
|
|
571
|
-
}
|
|
572
|
-
interface DiffPolicy {
|
|
573
|
-
/**
|
|
574
|
-
* Predicate that decides whether two findings (same finding_id) count
|
|
575
|
-
* as a material change. Defaults to {@link defaultIsMaterial}: severity
|
|
576
|
-
* shift, confidence Δ > 0.05, or evidence count change. Compliance /
|
|
577
|
-
* perf consumers MAY supply a stricter predicate (e.g. rationale text
|
|
578
|
-
* diff, metric Δ thresholds).
|
|
579
|
-
*/
|
|
580
|
-
isMaterial?: (previous: AnalystFinding, current: AnalystFinding) => boolean;
|
|
581
|
-
}
|
|
582
|
-
/**
|
|
583
|
-
* Default materiality test. Deliberately narrow so LLM-reword churn
|
|
584
|
-
* doesn't flood the diff. Stricter tests are opt-in via DiffPolicy.
|
|
585
|
-
*/
|
|
586
|
-
declare function defaultIsMaterial(a: AnalystFinding, b: AnalystFinding): boolean;
|
|
587
|
-
/**
|
|
588
|
-
* Diff two findings sets by stable finding_id. Callers typically load
|
|
589
|
-
* the two run-id slices from the same store and pass them in.
|
|
590
|
-
*/
|
|
591
|
-
declare function diffFindings(previous: PersistedFinding[], current: PersistedFinding[], policy?: DiffPolicy): FindingsDiff;
|
|
592
|
-
|
|
593
|
-
/**
|
|
594
|
-
* Analyst-kind factory — the typed, focused replacement for the
|
|
595
|
-
* legacy `createTraceAnalystAdapter`.
|
|
596
|
-
*
|
|
597
|
-
* A "kind" is a specialized analyst whose actor prompt, tool subset,
|
|
598
|
-
* and Ax recursion config target one failure-mode lens (failure-mode
|
|
599
|
-
* classification, knowledge gap discovery, knowledge poisoning, recursive
|
|
600
|
-
* self-improvement, ...). Kinds emit findings in the typed `RawAnalystFinding`
|
|
601
|
-
* shape via a JSON-array Ax output; the factory validates each row with
|
|
602
|
-
* Zod and lifts it into `AnalystFinding[]` with no shape guessing.
|
|
603
|
-
*
|
|
604
|
-
* Composition rules:
|
|
605
|
-
* - Each kind owns its actor description. No generic "answer this
|
|
606
|
-
* question" prompt — the prompt names the failure lens.
|
|
607
|
-
* - Each kind picks a narrow tool subset from `ANALYST_TOOL_GROUPS`.
|
|
608
|
-
* A kind that never needs full-trace dumps can drop `viewTrace` /
|
|
609
|
-
* `viewSpans` and stay cheap.
|
|
610
|
-
* - Each kind declares its recursion + parallelism budget. Discovery-
|
|
611
|
-
* heavy kinds (failure-mode) get higher `maxDepth`; lens kinds
|
|
612
|
-
* (poisoning) usually stay at 0 since they have a tighter brief.
|
|
613
|
-
*
|
|
614
|
-
* Optimizer hook: kinds may declare `goldens` — labeled examples used
|
|
615
|
-
* by `AxMiPRO` / `AxBootstrapFewShot` / `AxGEPA` to fit the actor
|
|
616
|
-
* description programmatically. Stored on the kind, not the registry,
|
|
617
|
-
* because the right metric is kind-specific.
|
|
618
|
-
*/
|
|
619
|
-
|
|
620
|
-
/**
|
|
621
|
-
* Per-kind specification. The factory turns this into a regular
|
|
622
|
-
* `Analyst<TraceAnalysisStore>` ready for `AnalystRegistry.register()`.
|
|
623
|
-
*/
|
|
624
|
-
interface TraceAnalystKindSpec {
|
|
625
|
-
/** Stable id. Appears in finding_id, telemetry, and registry exclusions. */
|
|
626
|
-
id: string;
|
|
627
|
-
/** One-sentence description shown in `registry.list()`. */
|
|
628
|
-
description: string;
|
|
629
|
-
/** Coarse classification stamped on every emitted finding (`failure-mode`, `knowledge-gap`, ...). */
|
|
630
|
-
area: string;
|
|
631
|
-
/** Bump on any breaking change to the actor prompt or output schema. */
|
|
632
|
-
version: string;
|
|
633
|
-
/** Actor system prompt. Must instruct the LLM to emit `findings` per the schema. */
|
|
634
|
-
actorDescription: string;
|
|
635
|
-
/** Responder system prompt; falls back to a minimal "format the findings" instruction. */
|
|
636
|
-
responderDescription?: string;
|
|
637
|
-
/** Tool functions the actor may call. Pick narrow subsets via `ANALYST_TOOL_GROUPS`. */
|
|
638
|
-
buildTools: (store: TraceAnalysisStore) => AxFunction[];
|
|
639
|
-
/** Recursion budget. `maxDepth: 0` disables subagents. */
|
|
640
|
-
recursion?: {
|
|
641
|
-
maxDepth: number;
|
|
642
|
-
maxParallelSubagents?: number;
|
|
643
|
-
};
|
|
644
|
-
/** Actor turn cap. Default 12. */
|
|
645
|
-
maxTurns?: number;
|
|
646
|
-
/** Runtime char cap. Default 6000. */
|
|
647
|
-
maxRuntimeChars?: number;
|
|
648
|
-
/** Cost classification surfaced in `registry.list()` and budget enforcement. */
|
|
649
|
-
cost: AnalystCost;
|
|
650
|
-
/** Per-finding-row hook — kinds may reject / rewrite before lifting. */
|
|
651
|
-
postProcess?: (row: RawAnalystFinding, ctx: AnalystContext) => RawAnalystFinding | null;
|
|
652
|
-
/** Optional optimizer hook — populated when a kind wants to fit its prompt against labeled examples. */
|
|
653
|
-
goldens?: TraceAnalystGolden[];
|
|
654
|
-
}
|
|
655
|
-
/**
|
|
656
|
-
* One labeled example consumed by Ax optimizers (MIPRO / GEPA / Bootstrap).
|
|
657
|
-
* Each input is the same `{question}` an analyst would receive; `expected`
|
|
658
|
-
* is the ground-truth finding set a fitted prompt should produce on this
|
|
659
|
-
* input. Metric: kind-specific (default: F1 on `finding_id` overlap).
|
|
660
|
-
*/
|
|
661
|
-
interface TraceAnalystGolden {
|
|
662
|
-
question: string;
|
|
663
|
-
expected: ReadonlyArray<Omit<RawAnalystFinding, 'confidence'>>;
|
|
664
|
-
}
|
|
665
|
-
interface CreateTraceAnalystKindOpts {
|
|
666
|
-
/** AxAIService bound at registration time. */
|
|
667
|
-
ai: AxAIService;
|
|
668
|
-
/** Optional model override; falls back to the AI service's default. */
|
|
669
|
-
model?: string;
|
|
670
|
-
/** Override the spec's `version` (e.g. when an optimizer has fitted a new prompt). */
|
|
671
|
-
versionSuffix?: string;
|
|
672
|
-
}
|
|
673
|
-
/**
|
|
674
|
-
* Build an `Analyst<TraceAnalysisStore>` from a kind spec.
|
|
675
|
-
*
|
|
676
|
-
* Lifts the Ax pipeline once at registration time so the registry
|
|
677
|
-
* gets a stateless analyst. The Ax agent is freshly constructed per
|
|
678
|
-
* `analyze()` call (the agent carries chat-log + usage state we don't
|
|
679
|
-
* want shared across analyst runs).
|
|
680
|
-
*/
|
|
681
|
-
declare function createTraceAnalystKind(spec: TraceAnalystKindSpec, opts: CreateTraceAnalystKindOpts): Analyst<TraceAnalysisStore>;
|
|
682
|
-
/**
|
|
683
|
-
* Render a compact prior-findings block the actor reads alongside its
|
|
684
|
-
* brief. Each row is one line so the actor can scan dozens cheaply.
|
|
685
|
-
* The kind's prompt instructs the actor to (a) check whether a new
|
|
686
|
-
* cluster matches a prior `finding_id` (carry the id forward via
|
|
687
|
-
* `id_basis` to keep diffs stable) and (b) raise severity / confidence
|
|
688
|
-
* when a prior finding has reappeared without remediation.
|
|
689
|
-
*
|
|
690
|
-
* Returns the empty string when there are no prior findings — most
|
|
691
|
-
* runs are "first-of-its-kind" and the prompt stays unchanged.
|
|
692
|
-
*
|
|
693
|
-
* Exported for tests + for consumers that build their own actor
|
|
694
|
-
* prompts (e.g. specialized analysts living outside the default kinds).
|
|
695
|
-
*/
|
|
696
|
-
declare function renderPriorFindings(prior: AnalystContext['priorFindings']): string;
|
|
697
|
-
|
|
698
|
-
/**
|
|
699
|
-
* Failure-mode analyst — classifies what went wrong and why.
|
|
700
|
-
*
|
|
701
|
-
* Brief: read the trace dataset, identify the top failure modes across
|
|
702
|
-
* runs, classify each with severity + evidence, and surface them as
|
|
703
|
-
* findings. The actor's job is *taxonomy + evidence*, not fix-design —
|
|
704
|
-
* that's the improvement-analyst's job.
|
|
705
|
-
*
|
|
706
|
-
* Recursion is deep (`maxDepth: 3`) because real failure-mode
|
|
707
|
-
* discovery is genuinely tree-shaped: the actor splits the dataset
|
|
708
|
-
* into candidate clusters, each cluster spawns a focused investigator
|
|
709
|
-
* that drills into representative traces, and a deeply-recursed
|
|
710
|
-
* investigator may itself split a confounded mode into two sub-modes.
|
|
711
|
-
* Each level fans out 4-way, so the analyst can investigate up to
|
|
712
|
-
* ~16 leaf clusters before hitting the depth ceiling.
|
|
713
|
-
*/
|
|
714
|
-
|
|
715
|
-
declare const FAILURE_MODE_KIND_SPEC: TraceAnalystKindSpec;
|
|
716
|
-
|
|
717
|
-
/**
|
|
718
|
-
* Improvement analyst — actionable, recursive self-improvement findings.
|
|
719
|
-
*
|
|
720
|
-
* Brief: read findings from upstream analysts (failure-mode,
|
|
721
|
-
* knowledge-gap, knowledge-poisoning) AND the trace dataset itself,
|
|
722
|
-
* then propose **concrete edits** to the agent's runtime: prompt
|
|
723
|
-
* additions, RAG documents to ingest, tool descriptions to rewrite,
|
|
724
|
-
* scaffolding changes to make, memory entries to invalidate. Each
|
|
725
|
-
* finding is one proposed edit with the locus, the diff, and the
|
|
726
|
-
* expected effect.
|
|
727
|
-
*
|
|
728
|
-
* This is the recursive-self-improvement loop's last mile: the prior
|
|
729
|
-
* kinds describe *what's wrong*; this kind describes *what to change*.
|
|
730
|
-
*
|
|
731
|
-
* Recursion is deep (`maxDepth: 3`) because real improvement proposals
|
|
732
|
-
* are competitive: for each failure-mode there are usually 2-3 viable
|
|
733
|
-
* fix directions (tighten prompt vs add tool vs adjust scaffolding),
|
|
734
|
-
* and the actor should explore each with a focused subagent before
|
|
735
|
-
* picking the highest-leverage one to recommend.
|
|
736
|
-
*/
|
|
737
|
-
|
|
738
|
-
declare const IMPROVEMENT_KIND_SPEC: TraceAnalystKindSpec;
|
|
739
|
-
|
|
740
|
-
/**
|
|
741
|
-
* Knowledge-gap analyst — what did the agent NOT know that it needed?
|
|
742
|
-
*
|
|
743
|
-
* Brief: find moments in the trace where the agent had to guess, ask
|
|
744
|
-
* the user to fill in context, recover from a wrong assumption, or
|
|
745
|
-
* loop on a retrieval. Each finding names a *missing or outdated piece
|
|
746
|
-
* of knowledge* the agent's curated knowledge base should have held —
|
|
747
|
-
* or a downstream lookup (web, docs, tool description) that surfaced
|
|
748
|
-
* stale or outdated information.
|
|
749
|
-
*
|
|
750
|
-
* The primary expected store is `@tangle-network/agent-knowledge`: a
|
|
751
|
-
* Karpathy-style wiki the agent maintains with raw ↔ curated pages,
|
|
752
|
-
* source anchors, and claim/relation triples. A gap is anything the
|
|
753
|
-
* agent had to discover at run-time that should already have lived
|
|
754
|
-
* there. Secondary loci: web-search results that returned outdated
|
|
755
|
-
* pages, tool descriptions that omitted critical behavior, system-
|
|
756
|
-
* prompt sections that didn't cover the case.
|
|
757
|
-
*
|
|
758
|
-
* Distinct from failure-mode: failure-mode classifies *how* it broke;
|
|
759
|
-
* knowledge-gap names the *information* whose absence (or staleness)
|
|
760
|
-
* caused the break. One failure-mode often maps to several gaps.
|
|
761
|
-
*
|
|
762
|
-
* Recursion (`maxDepth: 2`) is enough to fan out one subagent per
|
|
763
|
-
* candidate gap-source layer; each subagent runs a focused detection.
|
|
764
|
-
*/
|
|
765
|
-
|
|
766
|
-
declare const KNOWLEDGE_GAP_KIND_SPEC: TraceAnalystKindSpec;
|
|
767
|
-
|
|
768
|
-
/**
|
|
769
|
-
* Knowledge-poisoning analyst — what FALSE information misled the agent?
|
|
770
|
-
*
|
|
771
|
-
* Brief: find moments where the agent acted on information that was
|
|
772
|
-
* *wrong* — stale memory, RAG documents that contradicted ground truth,
|
|
773
|
-
* tool descriptions that lied about return shapes, system-prompt
|
|
774
|
-
* instructions that no longer matched reality, prior-run summaries that
|
|
775
|
-
* cached a wrong decision.
|
|
776
|
-
*
|
|
777
|
-
* Distinct from knowledge-gap: a gap is "the agent didn't know X"; a
|
|
778
|
-
* poisoning is "the agent confidently used X, but X was wrong." Gaps
|
|
779
|
-
* surface as questions / self-correction; poisonings surface as
|
|
780
|
-
* confident-but-wrong actions that downstream evidence contradicts.
|
|
781
|
-
*
|
|
782
|
-
* Recursion is moderate (`maxDepth: 2`) because each candidate
|
|
783
|
-
* poisoning typically needs two sub-investigations: one to confirm
|
|
784
|
-
* the agent acted on the false belief, one to confirm the belief
|
|
785
|
-
* itself is actually false in ground truth.
|
|
786
|
-
*/
|
|
787
|
-
|
|
788
|
-
declare const KNOWLEDGE_POISONING_KIND_SPEC: TraceAnalystKindSpec;
|
|
789
|
-
|
|
790
|
-
/**
|
|
791
|
-
* Default analyst kinds focused on agent failure + recursive
|
|
792
|
-
* self-improvement.
|
|
793
|
-
*
|
|
794
|
-
* The four kinds chain: failure-mode classifies; knowledge-gap and
|
|
795
|
-
* knowledge-poisoning explain *why* in two orthogonal ways; improvement
|
|
796
|
-
* proposes concrete edits. Register all four against the same trace
|
|
797
|
-
* store and the registry runs them in dependency order if the operator
|
|
798
|
-
* pipes findings between them.
|
|
799
|
-
*/
|
|
800
|
-
|
|
801
|
-
/**
|
|
802
|
-
* The default kind suite. Order is the run order operators should
|
|
803
|
-
* use: failure-mode first (no upstream deps), gap + poisoning next
|
|
804
|
-
* (both depend on failures), improvement last (chains all three).
|
|
805
|
-
*/
|
|
806
|
-
declare const DEFAULT_TRACE_ANALYST_KINDS: readonly TraceAnalystKindSpec[];
|
|
807
|
-
|
|
808
|
-
/**
|
|
809
|
-
* Skill-usage analyst — a DETERMINISTIC `Analyst` over a Claude/Codex skill
|
|
810
|
-
* library + its trace corpus. Unlike the trace-store kinds (failure-mode,
|
|
811
|
-
* improvement, ...) this kind calls no LLM: it mines real usage and skill
|
|
812
|
-
* structure and emits findings by rule.
|
|
813
|
-
*
|
|
814
|
-
* It exists because the naive "Skill-tool invocation count" lies low — it
|
|
815
|
-
* misses orchestrated sub-dispatch (a leaf skill run BY /pursue or /governor
|
|
816
|
-
* logs under the parent), slash-command entry, local-script bypass, and
|
|
817
|
-
* on-disk artifacts. The 2026-05-30 skill audit found 39/53 skills at zero
|
|
818
|
-
* direct invocations, yet only one was a genuine cut: the rest were
|
|
819
|
-
* measurement-invisible or discovery-limited. This analyst encodes that
|
|
820
|
-
* lesson as a multi-signal usage model so a cheap repeatable pass can keep
|
|
821
|
-
* the library honest, and so the expensive audit workflow's verdicts can
|
|
822
|
-
* GEPA-distill it toward agreement (see `gold/skill-verdicts.gold.jsonl`).
|
|
823
|
-
*
|
|
824
|
-
* Report-building (`buildSkillUsageReport`, an fs scan) is separated from
|
|
825
|
-
* finding emission (`SkillUsageAnalyst.analyze`, pure) so the slow scan runs
|
|
826
|
-
* once at the registry boundary and the rule logic stays unit-testable.
|
|
827
|
-
*/
|
|
828
|
-
|
|
829
|
-
type SkillKind = 'public' | 'private';
|
|
830
|
-
/** One skill's multi-signal usage + structure. All counts are deterministic. */
|
|
831
|
-
interface SkillUsageRecord {
|
|
832
|
-
name: string;
|
|
833
|
-
kind: SkillKind;
|
|
834
|
-
/** Absolute path to the skill's SKILL.md. */
|
|
835
|
-
path: string;
|
|
836
|
-
lines: number;
|
|
837
|
-
/** `"skill":"<name>"` Skill-tool invocations across the trace corpus. */
|
|
838
|
-
directInvocations: number;
|
|
839
|
-
/** `<command-name>/<name>` slash invocations across the trace corpus. */
|
|
840
|
-
slashInvocations: number;
|
|
841
|
-
/** Sibling skills whose SKILL.md dispatches to this one (`/<name>`). Proxy
|
|
842
|
-
* for orchestrated sub-dispatch the per-skill counter cannot see. */
|
|
843
|
-
inboundRefs: number;
|
|
844
|
-
/** On-disk artifacts attributable to the skill (e.g. `.evolve/<name>/**`). */
|
|
845
|
-
artifactCount: number;
|
|
846
|
-
/** Tangle-private reference count in the body (leak signal for public skills). */
|
|
847
|
-
tanglePrivateRefs: number;
|
|
848
|
-
hasReferencesDir: boolean;
|
|
849
|
-
hasEvalsDir: boolean;
|
|
850
|
-
/** Body mentions `skill-runs.jsonl` (visible to /reflect + /governor). */
|
|
851
|
-
logsRuns: boolean;
|
|
852
|
-
/** Description carries an explicit `Triggers:` clause / trigger phrases. */
|
|
853
|
-
hasTriggerPhrases: boolean;
|
|
854
|
-
}
|
|
855
|
-
interface SkillUsageReport {
|
|
856
|
-
generatedFromTraces: number;
|
|
857
|
-
records: SkillUsageRecord[];
|
|
858
|
-
}
|
|
859
|
-
interface SkillUsageScanConfig {
|
|
860
|
-
/** Dirs holding `*.jsonl` transcripts (Claude `~/.claude/projects`, Codex sessions). */
|
|
861
|
-
transcriptDirs: string[];
|
|
862
|
-
/** Skill roots to scan; each dir directly under `root` with a `SKILL.md` is a skill. */
|
|
863
|
-
skillRoots: {
|
|
864
|
-
root: string;
|
|
865
|
-
kind: SkillKind;
|
|
866
|
-
}[];
|
|
867
|
-
/** Roots scanned for `<root>/.evolve/<skill>` artifact dirs. */
|
|
868
|
-
artifactRoots?: string[];
|
|
869
|
-
/** Token-prefixed mappings: skill name → extra artifact subpaths under an artifactRoot
|
|
870
|
-
* (e.g. reflect → `.evolve/reflections`). Catches non-eponymous artifact dirs. */
|
|
871
|
-
artifactAliases?: Record<string, string[]>;
|
|
872
|
-
/** Cap files read per transcript dir (bounds a huge corpus); 0 = unbounded. */
|
|
873
|
-
maxTranscriptsPerDir?: number;
|
|
874
|
-
}
|
|
875
|
-
/** Scan the corpus + skill roots into a {@link SkillUsageReport}. Deterministic. */
|
|
876
|
-
declare function buildSkillUsageReport(config: SkillUsageScanConfig): SkillUsageReport;
|
|
877
|
-
/** Pure rule pass over a report → findings. Exported for direct/unit use. */
|
|
878
|
-
declare function emitSkillUsageFindings(report: SkillUsageReport, producedAt: string): AnalystFinding[];
|
|
879
|
-
declare class SkillUsageAnalyst implements Analyst<SkillUsageReport> {
|
|
880
|
-
readonly id = "skill-usage";
|
|
881
|
-
readonly description = "Deterministic multi-signal skill-usage analysis: flags dead skills, measurement-invisible (orchestrated) usage, discovery gaps, public-repo leaks, bloat, missing evals, and missing run-logging.";
|
|
882
|
-
readonly inputKind: "custom";
|
|
883
|
-
readonly cost: {
|
|
884
|
-
kind: "deterministic";
|
|
885
|
-
est_usd_per_run: number;
|
|
886
|
-
};
|
|
887
|
-
readonly version = "1.0.0";
|
|
888
|
-
analyze(input: SkillUsageReport, ctx: AnalystContext): Promise<AnalystFinding[]>;
|
|
889
|
-
}
|
|
890
|
-
declare const SKILL_USAGE_ANALYST: SkillUsageAnalyst;
|
|
891
|
-
|
|
892
|
-
/**
|
|
893
|
-
* Pre-curated tool subsets for analyst kinds.
|
|
894
|
-
*
|
|
895
|
-
* The full trace-analyst tool set is seven functions. Most kinds only
|
|
896
|
-
* need three or four. Picking from named groups instead of importing
|
|
897
|
-
* the whole bundle keeps every kind's actor-context budget tight and
|
|
898
|
-
* makes "what can this analyst see?" obvious at registration time.
|
|
899
|
-
*
|
|
900
|
-
* Each function in the group keeps its full `name`/`description` from
|
|
901
|
-
* `buildTraceAnalystTools` — we filter, we don't re-implement.
|
|
902
|
-
*/
|
|
903
|
-
|
|
904
|
-
/** Named tool sets. Kinds select one by name, e.g. `buildTools: (store) => buildTraceToolsForGroup('discoveryAndRead', store)`. */
|
|
905
|
-
type TraceToolGroupName =
|
|
906
|
-
/** All seven tools. Use for open-ended discovery kinds. */
|
|
907
|
-
'all'
|
|
908
|
-
/** Overview + paginated query + count. No deep reads. Cheap. */
|
|
909
|
-
| 'discovery'
|
|
910
|
-
/** Discovery + viewTrace + viewSpans. Deep-read but no regex search. */
|
|
911
|
-
| 'discoveryAndRead'
|
|
912
|
-
/** Discovery + search tools. For pattern-matching across many traces. */
|
|
913
|
-
| 'discoveryAndSearch'
|
|
914
|
-
/** Discovery + viewSpans + searchSpan. Targeted-span work after another kind narrows down. */
|
|
915
|
-
| 'targeted';
|
|
916
|
-
/**
|
|
917
|
-
* Build the tool set for a named group bound to a specific trace store.
|
|
918
|
-
*
|
|
919
|
-
* `all` returns every tool. Other groups filter `buildTraceAnalystTools`
|
|
920
|
-
* by name to the documented subset. An unrecognised group name throws —
|
|
921
|
-
* silently returning all tools would defeat the cost-control point.
|
|
922
|
-
*/
|
|
923
|
-
declare function buildTraceToolsForGroup(group: TraceToolGroupName, store: TraceAnalysisStore): AxFunction[];
|
|
924
|
-
|
|
925
71
|
/**
|
|
926
72
|
* Automated pull request opener for the production loop.
|
|
927
73
|
*
|
|
@@ -1653,193 +799,6 @@ declare function analyzeAntiSlop(outputs: string[], config: Omit<Required<AntiSl
|
|
|
1653
799
|
penaltyWeights: Record<SlopCategory, number>;
|
|
1654
800
|
}): AntiSlopReport;
|
|
1655
801
|
|
|
1656
|
-
/**
|
|
1657
|
-
* Artifact validators.
|
|
1658
|
-
*
|
|
1659
|
-
* Generic "score a produced artifact" primitive. Tax uses it for PDF form
|
|
1660
|
-
* correctness, research for sourced briefs, browser for task assertions, coding
|
|
1661
|
-
* for social posts. One interface, many validators; all plug into
|
|
1662
|
-
* `BenchmarkRunner` the same way.
|
|
1663
|
-
*
|
|
1664
|
-
* A validator receives an `Artifact` (file on disk, JSON blob, text, binary)
|
|
1665
|
-
* plus a `ValidationContext` (scenario id, the turns that produced it) and
|
|
1666
|
-
* returns a `ValidationResult` with pass/fail + 0..1 score + structured
|
|
1667
|
-
* issues.
|
|
1668
|
-
*/
|
|
1669
|
-
interface Artifact {
|
|
1670
|
-
/** Logical kind — validators type-guard on this */
|
|
1671
|
-
kind: 'file' | 'json' | 'text' | 'binary' | string;
|
|
1672
|
-
/** Filesystem-style path, optional */
|
|
1673
|
-
path?: string;
|
|
1674
|
-
/** String content for text/json/file kinds */
|
|
1675
|
-
content?: string;
|
|
1676
|
-
/** Binary content (if kind === 'binary') */
|
|
1677
|
-
bytes?: Uint8Array;
|
|
1678
|
-
/** Caller-supplied metadata (mimeType, sha256, size, etc.) */
|
|
1679
|
-
metadata?: Record<string, unknown>;
|
|
1680
|
-
}
|
|
1681
|
-
interface ValidationContext {
|
|
1682
|
-
scenarioId: string;
|
|
1683
|
-
turnIndex?: number;
|
|
1684
|
-
/** Prior artifacts for multi-artifact scenarios */
|
|
1685
|
-
priorArtifacts?: Artifact[];
|
|
1686
|
-
/** Free-form hints the validator uses for domain-specific checks */
|
|
1687
|
-
hints?: Record<string, unknown>;
|
|
1688
|
-
}
|
|
1689
|
-
interface ValidationIssue {
|
|
1690
|
-
severity: 'error' | 'warning' | 'info';
|
|
1691
|
-
message: string;
|
|
1692
|
-
/** Optional path into the artifact (e.g. JSON path or byte offset) */
|
|
1693
|
-
locus?: string;
|
|
1694
|
-
}
|
|
1695
|
-
interface ValidationResult {
|
|
1696
|
-
pass: boolean;
|
|
1697
|
-
/** 0–1 normalized score. Validators should be monotonic in pass-ness. */
|
|
1698
|
-
score: number;
|
|
1699
|
-
issues: ValidationIssue[];
|
|
1700
|
-
/** Diagnostic payload for reporters */
|
|
1701
|
-
evidence?: Record<string, unknown>;
|
|
1702
|
-
}
|
|
1703
|
-
interface ArtifactValidator {
|
|
1704
|
-
/** Stable identifier for the validator; appears in reports. */
|
|
1705
|
-
name: string;
|
|
1706
|
-
/** Optional description for human-facing reports. */
|
|
1707
|
-
description?: string;
|
|
1708
|
-
/** Called once per artifact; validators are expected to be pure + idempotent. */
|
|
1709
|
-
validate(artifact: Artifact, context: ValidationContext): Promise<ValidationResult>;
|
|
1710
|
-
}
|
|
1711
|
-
/**
|
|
1712
|
-
* Run every validator on the same artifact; aggregate pass as AND, score as
|
|
1713
|
-
* (weighted) mean, issues concatenated. Weights default to 1 each.
|
|
1714
|
-
*/
|
|
1715
|
-
declare function composeValidators(validators: ArtifactValidator[], options?: {
|
|
1716
|
-
name?: string;
|
|
1717
|
-
weights?: number[];
|
|
1718
|
-
}): ArtifactValidator;
|
|
1719
|
-
/** Pass if the artifact body matches a provided regex. */
|
|
1720
|
-
declare function regexMatch(name: string, pattern: RegExp): ArtifactValidator;
|
|
1721
|
-
/** Pass if JSON parses and every required key is present. */
|
|
1722
|
-
declare function jsonHasKeys(name: string, requiredPaths: string[]): ArtifactValidator;
|
|
1723
|
-
/** Pass if min ≤ byte length ≤ max. */
|
|
1724
|
-
declare function byteLengthRange(name: string, min: number, max: number): ArtifactValidator;
|
|
1725
|
-
/** Pass if the artifact contains every required substring (case-insensitive by default). */
|
|
1726
|
-
declare function containsAll(name: string, required: string[], options?: {
|
|
1727
|
-
caseSensitive?: boolean;
|
|
1728
|
-
}): ArtifactValidator;
|
|
1729
|
-
|
|
1730
|
-
/**
|
|
1731
|
-
* Completion verifier — the task-completion oracle.
|
|
1732
|
-
*
|
|
1733
|
-
* Answers the only eval question that is not a proxy: did the agent actually
|
|
1734
|
-
* COMPLETE the task — produce every required deliverable, persisted and
|
|
1735
|
-
* correct — rather than describe what should be done. A fluent transcript
|
|
1736
|
-
* that never produces the artifact scores zero here.
|
|
1737
|
-
*
|
|
1738
|
-
* Per requirement, a two-stage check:
|
|
1739
|
-
* 1. Structural — a produced item (vault artifact / approved proposal /
|
|
1740
|
-
* tool call) of the right kind is matched against the requirement and
|
|
1741
|
-
* carries non-empty content. Deterministic; no LLM.
|
|
1742
|
-
* 2. Correctness — only if structurally present AND the matched item
|
|
1743
|
-
* carries content, one targeted check decides whether that item
|
|
1744
|
-
* actually fulfils the requirement. A hallucinated artifact fails here;
|
|
1745
|
-
* an absent one already failed stage 1.
|
|
1746
|
-
*
|
|
1747
|
-
* `completionRate` is satisfied / total. Quality dimensions are meaningless
|
|
1748
|
-
* on an incomplete task — callers gate on `fullyComplete` / `completionRate`
|
|
1749
|
-
* before scoring quality.
|
|
1750
|
-
*/
|
|
1751
|
-
|
|
1752
|
-
/** What kind of produced state can satisfy a requirement structurally. */
|
|
1753
|
-
type SatisfiedBy = 'artifact' | 'proposal' | 'tool-call' | 'any';
|
|
1754
|
-
interface CompletionRequirement {
|
|
1755
|
-
/** Stable id from the task gold (e.g. a persona's `expected_requirements[].req_id`). */
|
|
1756
|
-
reqId: string;
|
|
1757
|
-
/** Human-readable description of the required deliverable. */
|
|
1758
|
-
title: string;
|
|
1759
|
-
/** Optional kind/category hint, matched against a produced item's kind. */
|
|
1760
|
-
category?: string;
|
|
1761
|
-
/** What produced state satisfies this requirement. Defaults to 'any'. */
|
|
1762
|
-
satisfiedBy?: SatisfiedBy;
|
|
1763
|
-
}
|
|
1764
|
-
interface TaskGold {
|
|
1765
|
-
taskId: string;
|
|
1766
|
-
requirements: CompletionRequirement[];
|
|
1767
|
-
}
|
|
1768
|
-
interface ProducedProposal {
|
|
1769
|
-
id: string;
|
|
1770
|
-
title: string;
|
|
1771
|
-
status: 'pending' | 'approved' | 'rejected';
|
|
1772
|
-
/** Optional persisted body — when present, enables a correctness check. */
|
|
1773
|
-
content?: string;
|
|
1774
|
-
}
|
|
1775
|
-
/** Everything observable about what a run actually produced. */
|
|
1776
|
-
interface ProducedState {
|
|
1777
|
-
/** Persisted vault artifacts. Reuses the shared `Artifact` shape. */
|
|
1778
|
-
artifacts: Artifact[];
|
|
1779
|
-
/** Proposals / filings the agent created. */
|
|
1780
|
-
proposals: ProducedProposal[];
|
|
1781
|
-
/** Names of tools the agent invoked. */
|
|
1782
|
-
toolCalls: string[];
|
|
1783
|
-
}
|
|
1784
|
-
interface RequirementCheck {
|
|
1785
|
-
reqId: string;
|
|
1786
|
-
title: string;
|
|
1787
|
-
/** A produced item of the right kind matched the requirement, non-empty. */
|
|
1788
|
-
structurallyPresent: boolean;
|
|
1789
|
-
/**
|
|
1790
|
-
* Whether the matched item actually fulfils the requirement. `null` when
|
|
1791
|
-
* not structurally present, or when the matched item carries no content
|
|
1792
|
-
* to assess.
|
|
1793
|
-
*/
|
|
1794
|
-
correct: boolean | null;
|
|
1795
|
-
/** structurallyPresent && correct !== false. */
|
|
1796
|
-
satisfied: boolean;
|
|
1797
|
-
/** Human-readable evidence for the verdict. */
|
|
1798
|
-
evidence: string[];
|
|
1799
|
-
}
|
|
1800
|
-
interface CompletionVerdict {
|
|
1801
|
-
taskId: string;
|
|
1802
|
-
requirements: RequirementCheck[];
|
|
1803
|
-
/** satisfied / total requirements. */
|
|
1804
|
-
completionRate: number;
|
|
1805
|
-
/** Every requirement satisfied. */
|
|
1806
|
-
fullyComplete: boolean;
|
|
1807
|
-
}
|
|
1808
|
-
/**
|
|
1809
|
-
* Decides whether a produced item's content actually fulfils a requirement.
|
|
1810
|
-
* Injected so the structural verifier stays pure and unit-testable; the
|
|
1811
|
-
* production implementation is `createLlmCorrectnessChecker`.
|
|
1812
|
-
*/
|
|
1813
|
-
type CorrectnessChecker = (requirement: CompletionRequirement, content: string) => Promise<{
|
|
1814
|
-
correct: boolean;
|
|
1815
|
-
reason: string;
|
|
1816
|
-
}>;
|
|
1817
|
-
/**
|
|
1818
|
-
* Verify whether a run completed the task. `checkCorrectness` is injected —
|
|
1819
|
-
* `createLlmCorrectnessChecker` for production, a deterministic stub in tests.
|
|
1820
|
-
*
|
|
1821
|
-
* Throws on a gold spec with no requirements: an eval task that requires
|
|
1822
|
-
* nothing is a misconfiguration, not a vacuously-complete task.
|
|
1823
|
-
*/
|
|
1824
|
-
declare function verifyCompletion(gold: TaskGold, state: ProducedState, checkCorrectness: CorrectnessChecker): Promise<CompletionVerdict>;
|
|
1825
|
-
interface LlmCorrectnessCheckerOpts {
|
|
1826
|
-
model?: string;
|
|
1827
|
-
/** Max chars of artifact content sent to the checker. */
|
|
1828
|
-
maxContentChars?: number;
|
|
1829
|
-
}
|
|
1830
|
-
/** Parse the correctness checker's model response. Fails loud on a bad shape. */
|
|
1831
|
-
declare function parseCorrectnessResponse(raw: string): {
|
|
1832
|
-
correct: boolean;
|
|
1833
|
-
reason: string;
|
|
1834
|
-
};
|
|
1835
|
-
/**
|
|
1836
|
-
* Production `CorrectnessChecker` — one LLM call per matched artifact,
|
|
1837
|
-
* deterministic (temperature 0), structured JSON out. Judges fulfilment
|
|
1838
|
-
* only: a plan, a gesture, or a description of what should be done does not
|
|
1839
|
-
* fulfil a requirement — the artifact must BE the deliverable.
|
|
1840
|
-
*/
|
|
1841
|
-
declare function createLlmCorrectnessChecker(tc: TCloud, opts?: LlmCorrectnessCheckerOpts): CorrectnessChecker;
|
|
1842
|
-
|
|
1843
802
|
/**
|
|
1844
803
|
* ConvergenceTracker — tracks completion percentage over turns.
|
|
1845
804
|
*
|
|
@@ -2215,188 +1174,6 @@ declare class FileSystemExperimentStore implements ExperimentStore {
|
|
|
2215
1174
|
private load;
|
|
2216
1175
|
}
|
|
2217
1176
|
|
|
2218
|
-
/**
|
|
2219
|
-
* Pareto frontier — multi-objective optimization over candidate runs.
|
|
2220
|
-
*
|
|
2221
|
-
* Lifted from ADC pareto.ts and blueprint-agent frontier.ts. When you're
|
|
2222
|
-
* trading off (cost, latency, quality) or (passRate, tokenBudget,
|
|
2223
|
-
* ttfb), you rarely have a single "winner" — you have a set of
|
|
2224
|
-
* non-dominated candidates. This module exposes:
|
|
2225
|
-
*
|
|
2226
|
-
* - `paretoFrontier`: filter a set of candidates to the non-dominated ones
|
|
2227
|
-
* - `dominates`: does A dominate B across all objectives?
|
|
2228
|
-
*
|
|
2229
|
-
* Each objective is declared with a direction: 'maximize' (higher=better)
|
|
2230
|
-
* or 'minimize' (lower=better). Candidates are any object; pass an
|
|
2231
|
-
* `objective(candidate)` accessor.
|
|
2232
|
-
*/
|
|
2233
|
-
type Direction = 'maximize' | 'minimize';
|
|
2234
|
-
interface Objective<T> {
|
|
2235
|
-
/** Stable label used in reports. */
|
|
2236
|
-
name: string;
|
|
2237
|
-
direction: Direction;
|
|
2238
|
-
value: (candidate: T) => number;
|
|
2239
|
-
}
|
|
2240
|
-
interface ParetoResult<T> {
|
|
2241
|
-
frontier: T[];
|
|
2242
|
-
dominated: T[];
|
|
2243
|
-
/** Dominance map: each entry's `dominator` (a frontier member) dominates every candidate in that entry's `dominated` list. */
|
|
2244
|
-
dominanceMap: Array<{
|
|
2245
|
-
dominator: T;
|
|
2246
|
-
dominated: T[];
|
|
2247
|
-
}>;
|
|
2248
|
-
}
|
|
2249
|
-
/** Does candidate A weakly dominate B — strictly better on at least one objective and no worse on any? */
|
|
2250
|
-
declare function dominates<T>(a: T, b: T, objectives: Objective<T>[]): boolean;
|
|
2251
|
-
/**
|
|
2252
|
-
* Compute the non-dominated frontier. Candidates with NaN/Infinity on any
|
|
2253
|
-
* objective are excluded (can't rank them). A candidate enters the frontier
|
|
2254
|
-
* iff no other candidate dominates it.
|
|
2255
|
-
*/
|
|
2256
|
-
declare function paretoFrontier<T>(candidates: T[], objectives: Objective<T>[]): ParetoResult<T>;
|
|
2257
|
-
/**
|
|
2258
|
-
* Weighted-sum scalarisation. Use as a tie-break / single-winner selector
|
|
2259
|
-
* when callers don't want to consume a frontier. Each objective contributes
|
|
2260
|
-
* its normalised value (0..1 via min-max across the candidate pool) times
|
|
2261
|
-
* its weight; missing weights default to 1/N.
|
|
2262
|
-
*
|
|
2263
|
-
* Direction is honoured automatically — `minimize` axes have their values
|
|
2264
|
-
* inverted before scaling so "higher scalar = better" always holds.
|
|
2265
|
-
*/
|
|
2266
|
-
declare function scalarScore<T>(candidates: T[], objectives: Objective<T>[], options?: {
|
|
2267
|
-
weights?: Partial<Record<string, number>>;
|
|
2268
|
-
}): Array<{
|
|
2269
|
-
candidate: T;
|
|
2270
|
-
score: number;
|
|
2271
|
-
}>;
|
|
2272
|
-
/**
|
|
2273
|
-
* NSGA-II crowding distance — secondary sort for ties on the frontier.
|
|
2274
|
-
*
|
|
2275
|
-
* When the Pareto front collapses to a single point (or many candidates tie
|
|
2276
|
-
* on dominance), naive selection picks arbitrarily and the population
|
|
2277
|
-
* degenerates over generations. NSGA-II preserves diversity by preferring
|
|
2278
|
-
* candidates with more empty space around them on the frontier.
|
|
2279
|
-
*
|
|
2280
|
-
* Returns an array of `{ candidate, distance }` in the SAME order as the
|
|
2281
|
-
* input. Higher distance = more isolated = should be preferred when
|
|
2282
|
-
* preserving diversity.
|
|
2283
|
-
*/
|
|
2284
|
-
declare function crowdingDistance<T>(candidates: T[], objectives: Objective<T>[]): Array<{
|
|
2285
|
-
candidate: T;
|
|
2286
|
-
distance: number;
|
|
2287
|
-
}>;
|
|
2288
|
-
/**
|
|
2289
|
-
* Pareto frontier with tie-break by crowding distance — the canonical
|
|
2290
|
-
* NSGA-II selection step. Returns the frontier sorted by descending crowding
|
|
2291
|
-
* distance so callers can `.slice(0, k)` to pick K diverse winners.
|
|
2292
|
-
*/
|
|
2293
|
-
declare function paretoFrontierWithCrowding<T>(candidates: T[], objectives: Objective<T>[]): Array<{
|
|
2294
|
-
candidate: T;
|
|
2295
|
-
distance: number;
|
|
2296
|
-
}>;
|
|
2297
|
-
|
|
2298
|
-
interface SteeringRolePrompt {
|
|
2299
|
-
system?: string;
|
|
2300
|
-
append?: string;
|
|
2301
|
-
}
|
|
2302
|
-
interface SteeringBundle {
|
|
2303
|
-
id: string;
|
|
2304
|
-
coderPrompt?: string;
|
|
2305
|
-
continuePrompt?: string;
|
|
2306
|
-
reviewerPrompts?: Record<string, string>;
|
|
2307
|
-
skills?: string[];
|
|
2308
|
-
rolePrompts?: Record<string, SteeringRolePrompt>;
|
|
2309
|
-
metadata?: Record<string, unknown>;
|
|
2310
|
-
}
|
|
2311
|
-
interface SteeringDelta {
|
|
2312
|
-
coderPrompt?: string;
|
|
2313
|
-
continuePrompt?: string;
|
|
2314
|
-
reviewerPrompts?: Record<string, string>;
|
|
2315
|
-
skills?: string[];
|
|
2316
|
-
rolePrompts?: Record<string, SteeringRolePrompt>;
|
|
2317
|
-
metadata?: Record<string, unknown>;
|
|
2318
|
-
}
|
|
2319
|
-
declare function mergeSteeringBundle(base: SteeringBundle, delta: SteeringDelta): SteeringBundle;
|
|
2320
|
-
declare function renderSteeringText(bundle: SteeringBundle): string;
|
|
2321
|
-
|
|
2322
|
-
type HarnessIntervention = 'continue' | 'plan' | 'audit' | 'recover' | 'repair' | 'verify' | 'final_gate' | 'wait_for_measurement' | 'abort';
|
|
2323
|
-
interface WorkflowTopology {
|
|
2324
|
-
id: string;
|
|
2325
|
-
interventions: HarnessIntervention[];
|
|
2326
|
-
maxParallelBranches?: number;
|
|
2327
|
-
metadata?: Record<string, unknown>;
|
|
2328
|
-
}
|
|
2329
|
-
interface MeasurementPolicy {
|
|
2330
|
-
required: string[];
|
|
2331
|
-
optional?: string[];
|
|
2332
|
-
promoteOn?: Array<keyof RunScore | 'aggregate'>;
|
|
2333
|
-
}
|
|
2334
|
-
interface HarnessVariant {
|
|
2335
|
-
id: string;
|
|
2336
|
-
steering?: SteeringBundle;
|
|
2337
|
-
topology?: WorkflowTopology;
|
|
2338
|
-
measurement?: MeasurementPolicy;
|
|
2339
|
-
budgets?: Record<string, number>;
|
|
2340
|
-
models?: Record<string, string>;
|
|
2341
|
-
reviewers?: Record<string, string>;
|
|
2342
|
-
metadata?: Record<string, unknown>;
|
|
2343
|
-
}
|
|
2344
|
-
interface HarnessScenario {
|
|
2345
|
-
id: string;
|
|
2346
|
-
task: string;
|
|
2347
|
-
split?: 'train' | 'validation' | 'test' | string;
|
|
2348
|
-
metadata?: Record<string, unknown>;
|
|
2349
|
-
}
|
|
2350
|
-
interface HarnessRunRequest {
|
|
2351
|
-
variant: HarnessVariant;
|
|
2352
|
-
scenario: HarnessScenario;
|
|
2353
|
-
trialIndex: number;
|
|
2354
|
-
}
|
|
2355
|
-
interface HarnessAdapter {
|
|
2356
|
-
run(request: HarnessRunRequest): Promise<RunTrace>;
|
|
2357
|
-
}
|
|
2358
|
-
interface HarnessRunResult {
|
|
2359
|
-
variant: HarnessVariant;
|
|
2360
|
-
scenario: HarnessScenario;
|
|
2361
|
-
trialIndex: number;
|
|
2362
|
-
trace: RunTrace;
|
|
2363
|
-
score: RunScore;
|
|
2364
|
-
aggregate: number;
|
|
2365
|
-
}
|
|
2366
|
-
interface HarnessVariantReport {
|
|
2367
|
-
variant: HarnessVariant;
|
|
2368
|
-
runs: HarnessRunResult[];
|
|
2369
|
-
aggregateMean: number;
|
|
2370
|
-
passRate: number;
|
|
2371
|
-
costUsdMean: number;
|
|
2372
|
-
wallSecondsMean: number;
|
|
2373
|
-
scoreMean: RunScore;
|
|
2374
|
-
}
|
|
2375
|
-
interface HarnessSelection {
|
|
2376
|
-
winner: HarnessVariantReport;
|
|
2377
|
-
frontier: ParetoResult<HarnessVariantReport>;
|
|
2378
|
-
reports: HarnessVariantReport[];
|
|
2379
|
-
}
|
|
2380
|
-
interface HarnessExperimentResult {
|
|
2381
|
-
results: HarnessRunResult[];
|
|
2382
|
-
selection: HarnessSelection;
|
|
2383
|
-
}
|
|
2384
|
-
interface HarnessExperimentConfig {
|
|
2385
|
-
adapter: HarnessAdapter;
|
|
2386
|
-
variants: HarnessVariant[];
|
|
2387
|
-
scenarios: HarnessScenario[];
|
|
2388
|
-
trialsPerScenario?: number;
|
|
2389
|
-
parallelism?: number;
|
|
2390
|
-
weights?: Partial<RunScoreWeights>;
|
|
2391
|
-
objectives?: Array<Objective<HarnessVariantReport>>;
|
|
2392
|
-
score?: (trace: RunTrace, request: HarnessRunRequest) => RunScore | Promise<RunScore>;
|
|
2393
|
-
onResult?: (result: HarnessRunResult) => void | Promise<void>;
|
|
2394
|
-
}
|
|
2395
|
-
declare const DEFAULT_HARNESS_OBJECTIVES: Array<Objective<HarnessVariantReport>>;
|
|
2396
|
-
declare function runHarnessExperiment(config: HarnessExperimentConfig): Promise<HarnessExperimentResult>;
|
|
2397
|
-
declare function selectHarnessVariant(results: HarnessRunResult[], objectives?: Array<Objective<HarnessVariantReport>>): HarnessSelection;
|
|
2398
|
-
declare function summarizeHarnessResults(results: HarnessRunResult[]): HarnessVariantReport[];
|
|
2399
|
-
|
|
2400
1177
|
type SandboxJudgeKind = 'compiler' | 'test' | 'linter' | 'security';
|
|
2401
1178
|
interface SandboxJudgeSpec {
|
|
2402
1179
|
id: string;
|
|
@@ -2442,68 +1219,6 @@ declare function distillPlaybook(entries: PlaybookEntry[], options?: {
|
|
|
2442
1219
|
}): Playbook;
|
|
2443
1220
|
declare function renderPlaybookMarkdown(playbook: Playbook): string;
|
|
2444
1221
|
|
|
2445
|
-
/**
|
|
2446
|
-
* Produced-state extraction — normalize a run's runtime event stream into the
|
|
2447
|
-
* typed `ProducedState` the completion oracle consumes.
|
|
2448
|
-
*
|
|
2449
|
-
* `ProducedState` answers "what did the agent actually produce" — vault
|
|
2450
|
-
* artifacts, proposals, tool calls. The runtime emits these as a stream of
|
|
2451
|
-
* events; this module is the single normalization point from that stream to
|
|
2452
|
-
* the shape `verifyCompletion` expects.
|
|
2453
|
-
*
|
|
2454
|
-
* Input is structurally typed (`RuntimeEventLike`) so this module does not
|
|
2455
|
-
* depend on agent-runtime — agent-runtime's `RuntimeStreamEvent` satisfies it
|
|
2456
|
-
* structurally. The `content` on `ArtifactEventLike` and the whole
|
|
2457
|
-
* `proposal_created` variant are the runtime-side enrichments this contract
|
|
2458
|
-
* requires; the runtime emits them, this module consumes them.
|
|
2459
|
-
*/
|
|
2460
|
-
|
|
2461
|
-
/** A tool the agent invoked. */
|
|
2462
|
-
interface ToolCallEventLike {
|
|
2463
|
-
type: 'tool_call';
|
|
2464
|
-
toolName: string;
|
|
2465
|
-
}
|
|
2466
|
-
/**
|
|
2467
|
-
* An artifact the agent produced. `content` is the enriched field — the
|
|
2468
|
-
* runtime's base `artifact` event carries only metadata; the completion
|
|
2469
|
-
* oracle needs the body to verify the deliverable, so the runtime emits it.
|
|
2470
|
-
*/
|
|
2471
|
-
interface ArtifactEventLike {
|
|
2472
|
-
type: 'artifact';
|
|
2473
|
-
artifactId: string;
|
|
2474
|
-
name?: string;
|
|
2475
|
-
mimeType?: string;
|
|
2476
|
-
uri?: string;
|
|
2477
|
-
content?: string;
|
|
2478
|
-
}
|
|
2479
|
-
/** A proposal / filing the agent created. */
|
|
2480
|
-
interface ProposalEventLike {
|
|
2481
|
-
type: 'proposal_created';
|
|
2482
|
-
proposalId: string;
|
|
2483
|
-
title: string;
|
|
2484
|
-
status?: 'pending' | 'approved' | 'rejected';
|
|
2485
|
-
}
|
|
2486
|
-
/**
|
|
2487
|
-
* The subset of runtime stream events `extractProducedState` consumes.
|
|
2488
|
-
* agent-runtime's full `RuntimeStreamEvent` union satisfies this structurally;
|
|
2489
|
-
* the `{ type: string }` catch-all keeps the input permissive so callers can
|
|
2490
|
-
* pass the whole unfiltered telemetry stream — unrecognized events are skipped.
|
|
2491
|
-
*/
|
|
2492
|
-
type RuntimeEventLike = ToolCallEventLike | ArtifactEventLike | ProposalEventLike | {
|
|
2493
|
-
type: string;
|
|
2494
|
-
};
|
|
2495
|
-
/**
|
|
2496
|
-
* Normalize a run's runtime event stream into `ProducedState`.
|
|
2497
|
-
*
|
|
2498
|
-
* Pure and total — unrecognized event types are skipped. `toolCalls` is
|
|
2499
|
-
* deduplicated by name in first-seen order (completion cares about a tool's
|
|
2500
|
-
* presence, not its call count). An artifact with neither a name nor a uri
|
|
2501
|
-
* still yields an entry keyed by its `artifactId` so it is never silently
|
|
2502
|
-
* dropped; an artifact with no `content` yields empty content, which the
|
|
2503
|
-
* completion oracle's structural check then rejects on its own.
|
|
2504
|
-
*/
|
|
2505
|
-
declare function extractProducedState(events: readonly RuntimeEventLike[]): ProducedState;
|
|
2506
|
-
|
|
2507
1222
|
/**
|
|
2508
1223
|
* Versioned prompt registry.
|
|
2509
1224
|
*
|
|
@@ -2569,6 +1284,17 @@ interface SteeringOptimizationResult {
|
|
|
2569
1284
|
runs: number;
|
|
2570
1285
|
}>;
|
|
2571
1286
|
selector?: SteeringOptimizationSelector;
|
|
1287
|
+
/** Runnable handle on the trained classifier. Present only when the
|
|
1288
|
+
* ax-gepa backend completed training; calls the optimized selector
|
|
1289
|
+
* program via ax's `forward`. */
|
|
1290
|
+
selectVariant?: (row: {
|
|
1291
|
+
task: string;
|
|
1292
|
+
split: string;
|
|
1293
|
+
seedPreview: string;
|
|
1294
|
+
}) => Promise<{
|
|
1295
|
+
variantId: string;
|
|
1296
|
+
rationale: string;
|
|
1297
|
+
}>;
|
|
2572
1298
|
skipped?: boolean;
|
|
2573
1299
|
}
|
|
2574
1300
|
interface SteeringOptimizerConfig {
|
|
@@ -2579,7 +1305,7 @@ interface AxSteeringOptimizerConfig extends SteeringOptimizerConfig {
|
|
|
2579
1305
|
apiKey: string;
|
|
2580
1306
|
model: string;
|
|
2581
1307
|
teacherModel?: string;
|
|
2582
|
-
|
|
1308
|
+
minScenarioWinners?: number;
|
|
2583
1309
|
}
|
|
2584
1310
|
declare class PairwiseSteeringOptimizer {
|
|
2585
1311
|
optimize(rows: SteeringOptimizationRow[], config?: SteeringOptimizerConfig): SteeringOptimizationResult;
|
|
@@ -5992,4 +4718,4 @@ declare namespace index {
|
|
|
5992
4718
|
export { type index_AgentProfile as AgentProfile, type index_AgentProfileSection as AgentProfileSection, index_BASELINE_ROLES as BASELINE_ROLES, type index_BaselineRoleKey as BaselineRoleKey, type index_ProfileSkill as ProfileSkill, index_applyDomainPatch as applyDomainPatch, index_baselineProfile as baselineProfile, index_baselineProfileFromRole as baselineProfileFromRole, index_engineerRole as engineerRole, index_generalistRole as generalistRole, index_prodProfile as prodProfile, index_profileToSurface as profileToSurface, index_renderProfile as renderProfile, index_researcherRole as researcherRole, index_sectionHash as sectionHash };
|
|
5993
4719
|
}
|
|
5994
4720
|
|
|
5995
|
-
export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, AgentProfile$1 as AgentProfile, type AgreementResult, type AlignmentOp, Analyst, AnalystContext, AnalystCost, AnalystFinding, AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, type BuildAgreementJudgeOptions, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, ChatRequest, CheckResult, CollectedArtifacts, type CommandRunner, type CompareLabels, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, CreateChatClientOpts, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, 
DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FieldAgreementSpec, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldScenario, type GoldSplit, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type 
HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFamily, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type ParseStudentLabel, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type 
ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RenderStudentPrompt, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, type RunDistillationOptions, type RunDistillationResult, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SKILL_USAGE_ANALYST, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, 
ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, SkillUsageAnalyst, type SkillUsageRecord, type SkillUsageReport, type SkillUsageScanConfig, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SplitGoldOptions, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregatePrReviewScore, aggregateRunScore, analyzeAntiSlop, analyzeSeries, 
appendScorecard, assertCrossFamily, assertSingleBackend, attributeCounterfactuals, bisect, buildAgreementJudge, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildSkillUsageReport, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, containsAll, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultParseStudentLabel, defaultReferenceReplayMatcher, defaultRenderStudentPrompt, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, emitSkillUsageFindings, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fieldAgreement, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isModelPriced, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, liftSeverity, linterJudge, loadGoldScenarios, loadScorecard, loadScorerFromGrader, 
localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseGoldJsonl, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, index as profile, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, resolveModelPricing, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runDistillation, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, splitGold, statusAdvanced, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
|
|
4721
|
+
export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, AgentProfile$1 as AgentProfile, type AgreementResult, type AlignmentOp, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, type BuildAgreementJudgeOptions, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, CheckResult, CollectedArtifacts, type CommandRunner, type CompareLabels, CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_FINDERS, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffScorecardOptions, type DirEntry, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type 
DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FieldAgreementSpec, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldScenario, type GoldSplit, type GoldenSeverity, type GoldenSpec, HarnessConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeFamily, type JudgeFleetOptions, JudgeFn, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, type 
MultiToolchainLayerConfig, type Mutator, Mutex, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParseStudentLabel, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RenderStudentPrompt, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type 
RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCriticOptions, type RunDiff, type RunDistillationOptions, type RunDistillationResult, RunFilter, RunRecord, RunScore, RunScoreWeights, RunTrace, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type SplitGoldOptions, SteeringBundle, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type StepAttribution, type SynthesisReason, type SynthesisTarget, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, TraceEmitter, TraceStore, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, VerifyContext, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregatePrReviewScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertCrossFamily, assertSingleBackend, attributeCounterfactuals, bisect, buildAgreementJudge, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, canaryLeakView, 
canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxPool, crossTraceDiff, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultParseStudentLabel, defaultReferenceReplayMatcher, defaultRenderStudentPrompt, deployGateLayer, diffScorecard, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fieldAgreement, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isModelPriced, isOtelConfigured, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, linterJudge, loadGoldScenarios, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, parseGoldJsonl, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, index as profile, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatches, renderMarkdownReport, renderPlaybookMarkdown, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, resolveModelPricing, rowCount, rowWhere, 
runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runDistillation, runE2EWorkflow, runExpectations, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, sentenceReorderMutator, signManifest, splitGold, statusAdvanced, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
|