@tangle-network/agent-eval 0.29.0 → 0.30.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +195 -1
- package/dist/index.js +252 -13
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/package.json +21 -12
package/dist/index.d.ts
CHANGED
|
@@ -773,6 +773,42 @@ interface AnalystRunResult {
|
|
|
773
773
|
/** Total LLM cost in USD across all analysts in this registry.run(). */
|
|
774
774
|
total_cost_usd: number;
|
|
775
775
|
}
|
|
776
|
+
/**
|
|
777
|
+
* Events emitted by `AnalystRegistry.runStream(...)` in real time as
|
|
778
|
+
* the registry executes. UIs subscribe via `for await (const ev of
|
|
779
|
+
* registry.runStream(...))`; `registry.run(...)` is a thin collector
|
|
780
|
+
* over the same stream, so the two surfaces share their invariants.
|
|
781
|
+
*
|
|
782
|
+
* Per-finding events are intentionally omitted — analyzers are batch
|
|
783
|
+
* operations (an Ax actor returns the full `findings:json[]` at the
|
|
784
|
+
* end of the responder), so streaming inside one analyst would only
|
|
785
|
+
* emit partial JSON consumers can't render. The kind-completion event
|
|
786
|
+
* is the right granularity; subscribers wanting per-finding rendering
|
|
787
|
+
* iterate `event.findings` themselves.
|
|
788
|
+
*/
|
|
789
|
+
type AnalystRunEvent = {
|
|
790
|
+
type: 'run-started';
|
|
791
|
+
run_id: string;
|
|
792
|
+
correlation_id: string;
|
|
793
|
+
started_at: string;
|
|
794
|
+
/** The ordered list of analyst ids the registry will run. */
|
|
795
|
+
analyst_ids: ReadonlyArray<string>;
|
|
796
|
+
} | {
|
|
797
|
+
type: 'analyst-skipped';
|
|
798
|
+
summary: AnalystRunSummary;
|
|
799
|
+
} | {
|
|
800
|
+
type: 'analyst-started';
|
|
801
|
+
analyst_id: string;
|
|
802
|
+
started_at: string;
|
|
803
|
+
} | {
|
|
804
|
+
type: 'analyst-completed';
|
|
805
|
+
/** `summary.status` is `'ok'` for clean completion or `'failed'` for thrown analysts. */
|
|
806
|
+
summary: AnalystRunSummary;
|
|
807
|
+
findings: ReadonlyArray<AnalystFinding>;
|
|
808
|
+
} | {
|
|
809
|
+
type: 'run-completed';
|
|
810
|
+
result: AnalystRunResult;
|
|
811
|
+
};
|
|
776
812
|
|
|
777
813
|
/**
|
|
778
814
|
* Adapter factories — lift each existing agent-eval primitive into the
|
|
@@ -900,6 +936,152 @@ declare const RAW_FINDING_SCHEMA_PROMPT = "Each finding MUST be a JSON object wi
|
|
|
900
936
|
*/
|
|
901
937
|
declare function parseRawFinding(row: unknown, log?: (msg: string, fields?: Record<string, unknown>) => void): RawAnalystFinding | null;
|
|
902
938
|
|
|
939
|
+
/**
|
|
940
|
+
* Typed `FindingSubject` — the canonical grammar every analyst kind emits.
|
|
941
|
+
*
|
|
942
|
+
* Background: kind actor prompts have always documented a subject grammar
|
|
943
|
+
* (e.g. `system-prompt:<section>`, `agent-knowledge:wiki:<slug>`) but the
|
|
944
|
+
* LLM was unconstrained — it could emit `subject: "fix the prompt"`
|
|
945
|
+
* (prose) and downstream adapters routed on `startsWith(...)` would
|
|
946
|
+
* silently skip it. Every per-vertical `ImprovementAdapter` had a
|
|
947
|
+
* routing table that mostly caught nothing.
|
|
948
|
+
*
|
|
949
|
+
* This module fixes that:
|
|
950
|
+
* - `parseFindingSubject(raw)` — returns the typed `FindingSubject`
|
|
951
|
+
* when `raw` matches the grammar, else `null`. Used at the
|
|
952
|
+
* `RawAnalystFindingSchema` boundary so malformed subjects are
|
|
953
|
+
* rejected loudly instead of silently lifted into the registry.
|
|
954
|
+
* - `FindingSubjectKind` — the union of valid locus categories. Each
|
|
955
|
+
* variant carries the typed components downstream adapters resolve
|
|
956
|
+
* against the agent's surface manifest (no string parsing in the
|
|
957
|
+
* adapter).
|
|
958
|
+
* - `FINDING_SUBJECT_GRAMMAR_PROMPT` — single source of truth for the
|
|
959
|
+
* grammar string embedded in kind actor prompts. Drift between
|
|
960
|
+
* prompt and parser is impossible if every kind imports this.
|
|
961
|
+
*
|
|
962
|
+
* The grammar is intentionally NARROW — only loci the substrate's
|
|
963
|
+
* default `ImprovementAdapter` / `KnowledgeAdapter` can act on. A
|
|
964
|
+
* finding with a subject outside this set fails the parser; the kind
|
|
965
|
+
* author either extends the grammar here (and adds adapter routing)
|
|
966
|
+
* or rephrases the prompt to map onto an existing variant.
|
|
967
|
+
*
|
|
968
|
+
* `failure-mode` is the one exception — its subjects are free-form
|
|
969
|
+
* cluster labels, not loci. The schema preserves them as
|
|
970
|
+
* `{ kind: 'cluster', label }` and the adapters skip them (cluster
|
|
971
|
+
* findings are evidence, not actionable mutations).
|
|
972
|
+
*/
|
|
973
|
+
|
|
974
|
+
/**
|
|
975
|
+
* Discriminated union of every locus the substrate can route findings to.
|
|
976
|
+
*
|
|
977
|
+
* Adapters narrow on `kind` and use the typed components (no string
|
|
978
|
+
* parsing). Adding a variant here REQUIRES updating the parser, the
|
|
979
|
+
* grammar prompt, and at least one adapter — by design.
|
|
980
|
+
*/
|
|
981
|
+
type FindingSubject = {
|
|
982
|
+
kind: 'knowledge.wiki';
|
|
983
|
+
slug: string;
|
|
984
|
+
heading?: string;
|
|
985
|
+
} | {
|
|
986
|
+
kind: 'knowledge.claim';
|
|
987
|
+
topic: string;
|
|
988
|
+
} | {
|
|
989
|
+
kind: 'knowledge.raw';
|
|
990
|
+
sourceId: string;
|
|
991
|
+
} | {
|
|
992
|
+
kind: 'knowledge.stale';
|
|
993
|
+
slug: string;
|
|
994
|
+
} | {
|
|
995
|
+
kind: 'system-prompt';
|
|
996
|
+
section: string;
|
|
997
|
+
} | {
|
|
998
|
+
kind: 'tool-doc';
|
|
999
|
+
tool: string;
|
|
1000
|
+
aspect?: string;
|
|
1001
|
+
} | {
|
|
1002
|
+
kind: 'new-tool';
|
|
1003
|
+
name: string;
|
|
1004
|
+
} | {
|
|
1005
|
+
kind: 'rag';
|
|
1006
|
+
corpus: string;
|
|
1007
|
+
docId: string;
|
|
1008
|
+
} | {
|
|
1009
|
+
kind: 'memory';
|
|
1010
|
+
key: string;
|
|
1011
|
+
} | {
|
|
1012
|
+
kind: 'scaffolding';
|
|
1013
|
+
concern: string;
|
|
1014
|
+
} | {
|
|
1015
|
+
kind: 'output-schema';
|
|
1016
|
+
field: string;
|
|
1017
|
+
} | {
|
|
1018
|
+
kind: 'websearch.outdated';
|
|
1019
|
+
topic: string;
|
|
1020
|
+
} | {
|
|
1021
|
+
kind: 'prior-run-summary';
|
|
1022
|
+
topic: string;
|
|
1023
|
+
} | {
|
|
1024
|
+
kind: 'cluster';
|
|
1025
|
+
label: string;
|
|
1026
|
+
};
|
|
1027
|
+
type FindingSubjectKind = FindingSubject['kind'];
|
|
1028
|
+
declare const FINDING_SUBJECT_KINDS: ReadonlyArray<FindingSubjectKind>;
|
|
1029
|
+
/**
|
|
1030
|
+
* Parse a raw subject string emitted by an analyst kind's actor.
|
|
1031
|
+
*
|
|
1032
|
+
* Returns the typed `FindingSubject` when `raw` matches the grammar,
|
|
1033
|
+
* else `null`. Callers use the `null` return as a signal to either
|
|
1034
|
+
* (a) reject the finding at parse time (kinds that emit typed loci —
|
|
1035
|
+
* knowledge-gap, improvement, knowledge-poisoning) or (b) lift it as
|
|
1036
|
+
* a cluster label (failure-mode).
|
|
1037
|
+
*
|
|
1038
|
+
* Slugs are constrained to `[a-z0-9-]+` (lowercase kebab) to keep file
|
|
1039
|
+
* paths sane downstream. Topics / keys / sections allow any non-empty
|
|
1040
|
+
* string (free-form for the LLM's voice) but get trimmed.
|
|
1041
|
+
*
|
|
1042
|
+
* Empty / whitespace-only inputs return `null`. `null` and `undefined`
|
|
1043
|
+
* also return `null`. All of these are surfaced by the caller as a rejected subject.
|
|
1044
|
+
*/
|
|
1045
|
+
declare function parseFindingSubject(raw: string | null | undefined): FindingSubject | null;
|
|
1046
|
+
/**
|
|
1047
|
+
* Render the parsed subject back to its canonical string form. Inverse
|
|
1048
|
+
* of `parseFindingSubject`; useful when the substrate constructs new
|
|
1049
|
+
* findings programmatically (e.g. for tests, replays, or
|
|
1050
|
+
* `id_basis` carry-forward).
|
|
1051
|
+
*/
|
|
1052
|
+
declare function renderFindingSubject(s: FindingSubject): string;
|
|
1053
|
+
/**
|
|
1054
|
+
* The grammar text embedded into kind actor prompts. Kinds opt into
|
|
1055
|
+
* the subset of variants they emit (e.g. `improvement` excludes the
|
|
1056
|
+
* cluster variant; `failure-mode` includes ONLY the cluster variant).
|
|
1057
|
+
*
|
|
1058
|
+
* Drift between prompt and parser is impossible: every kind imports
|
|
1059
|
+
* this constant + the matching `expects` set, and the unit tests below
|
|
1060
|
+
* lock the table to the parser.
|
|
1061
|
+
*/
|
|
1062
|
+
declare const FINDING_SUBJECT_GRAMMAR_PROMPT: string;
|
|
1063
|
+
/**
|
|
1064
|
+
* The variants each kind is allowed to emit. Used at the kind factory
|
|
1065
|
+
* boundary so a knowledge-gap finding can't sneak in a `system-prompt:*`
|
|
1066
|
+
* subject (the improvement-analyst's job) and vice versa.
|
|
1067
|
+
*
|
|
1068
|
+
* `failure-mode` is restricted to `cluster` — the only kind that emits
|
|
1069
|
+
* a non-locus subject.
|
|
1070
|
+
*/
|
|
1071
|
+
declare const KIND_EXPECTED_SUBJECTS: Record<string, ReadonlyArray<FindingSubjectKind>>;
|
|
1072
|
+
/**
|
|
1073
|
+
* Zod schema that validates a raw subject string against the `FindingSubject`
|
|
1074
|
+
* grammar. Embedded in `RawAnalystFindingSchema`, so a malformed `subject`
|
|
1075
|
+
* surfaces as a parse error attached to a single Zod issue. NOTE(review): the
|
|
1076
|
+
* declared type is `z.ZodString`, so the output stays a string, not a typed locus — confirm.
|
|
1077
|
+
*
|
|
1078
|
+
* Optionality is preserved: subjects ARE optional on the wire (some
|
|
1079
|
+
* findings are descriptive, not actionable). When present, they MUST
|
|
1080
|
+
* parse — emitting a malformed subject is a contract violation, not a
|
|
1081
|
+
* soft signal.
|
|
1082
|
+
*/
|
|
1083
|
+
declare const FindingSubjectStringSchema: z.ZodString;
|
|
1084
|
+
|
|
903
1085
|
/**
|
|
904
1086
|
* FindingsStore — durable persistence for AnalystFinding rows + a diff
|
|
905
1087
|
* helper so we can answer "what changed since the last run?" without
|
|
@@ -1299,6 +1481,18 @@ declare class AnalystRegistry {
|
|
|
1299
1481
|
cost: Analyst['cost'];
|
|
1300
1482
|
}>;
|
|
1301
1483
|
run(runId: string, inputs: AnalystRunInputs, runOpts?: RegistryRunOpts): Promise<AnalystRunResult>;
|
|
1484
|
+
/**
|
|
1485
|
+
* Streaming counterpart to `run()`. Emits `AnalystRunEvent` values
|
|
1486
|
+
* in real time — `run-started`, then per-analyst `analyst-skipped` /
|
|
1487
|
+
* `analyst-started` / `analyst-completed`, then a terminal `run-completed` whose
|
|
1488
|
+
* payload is the full `AnalystRunResult`. UIs use this to render
|
|
1489
|
+
* progress; persistence consumers use `run()` and read the result.
|
|
1490
|
+
*
|
|
1491
|
+
* Hooks (`onBeforeAnalyze` / `onAfterAnalyze` / `onError` /
|
|
1492
|
+
* `onComplete`) fire as before — streaming is additive, not a hook
|
|
1493
|
+
* replacement.
|
|
1494
|
+
*/
|
|
1495
|
+
runStream(runId: string, inputs: AnalystRunInputs, runOpts?: RegistryRunOpts): AsyncGenerator<AnalystRunEvent, void, void>;
|
|
1302
1496
|
private selectAnalysts;
|
|
1303
1497
|
private routeInput;
|
|
1304
1498
|
}
|
|
@@ -5979,4 +6173,4 @@ declare function aggregateTrialsByMode(trials: TrialResult[], opts: {
|
|
|
5979
6173
|
mode: AggregatorMode;
|
|
5980
6174
|
}): TrialAggregate;
|
|
5981
6175
|
|
|
5982
|
-
export { ANALYST_SEVERITIES, ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AggregatorMode, type AlignmentOp, type Analyst, type AnalystContext, type AnalystCost, type AnalystFinding, type AnalystHooks, type AnalystInputKind, AnalystRegistry, type AnalystRegistryOptions, type AnalystRequirements, type AnalystRunInputs, type AnalystRunResult, type AnalystRunSummary, type AnalystSeverity, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, type BudgetPolicy, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type ChatCallOpts, type ChatClient, type ChatRequest, type ChatResponse, type ChatTransport, type CheckResult, type CliBridgeTransportOpts, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, ContinuousAgreement, ContinuousAgreementOptions, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CorpusAgreementOptions, type CorpusAgreementPerDimension, type CorpusAgreementReport, type CorpusScoreRecord, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type 
CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateChatClientOpts, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DirEntry, type DirectProviderTransportOpts, type DiscoverPersonasOptions, type DiscoveredPersona, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvalResult, type EvidenceRef, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type 
HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAdapterOpts, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, type JudgeRubric, JudgeRunner, type JudgeScore, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallRequest, LlmCallResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MockTransportOpts, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type 
Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersistedFinding, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, ProductClient, type ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegistryRunOpts, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RouterTransportOpts, type RubricDimension, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, 
type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxSdkTransportOpts, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, 
adversarialJudge, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, computeFindingId, confidenceInterval, containsAll, corpusInterRaterAgreement, corpusInterRaterAgreementFromJudgeScores, createAntiSlopJudge, createChatClient, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createRunCriticAdapter, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, loadScorerFromGrader, localCommandRunner, 
lowercaseMutator, makeFinding, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, parseRawFinding, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedMean, weightedRecall, whitespaceCollapseMutator, wilcoxonSignedRank, withJudgeRetry, wranglerDeployRunner };
|
|
6176
|
+
export { ANALYST_SEVERITIES, ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AggregatorMode, type AlignmentOp, type Analyst, type AnalystContext, type AnalystCost, type AnalystFinding, type AnalystHooks, type AnalystInputKind, AnalystRegistry, type AnalystRegistryOptions, type AnalystRequirements, type AnalystRunEvent, type AnalystRunInputs, type AnalystRunResult, type AnalystRunSummary, type AnalystSeverity, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, type BudgetPolicy, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type ChatCallOpts, type ChatClient, type ChatRequest, type ChatResponse, type ChatTransport, type CheckResult, type CliBridgeTransportOpts, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, ContinuousAgreement, ContinuousAgreementOptions, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CorpusAgreementOptions, type CorpusAgreementPerDimension, type CorpusAgreementReport, type CorpusScoreRecord, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, 
CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateChatClientOpts, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DirEntry, type DirectProviderTransportOpts, type DiscoverPersonasOptions, type DiscoveredPersona, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvalResult, type EvidenceRef, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type 
GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAdapterOpts, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, type JudgeRubric, JudgeRunner, type JudgeScore, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallRequest, LlmCallResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MockTransportOpts, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, 
MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersistedFinding, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, ProductClient, type ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegistryRunOpts, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, 
type RobustnessResult, type RouteMap, type RouterTransportOpts, type RubricDimension, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxSdkTransportOpts, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type 
ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, computeFindingId, confidenceInterval, containsAll, corpusInterRaterAgreement, corpusInterRaterAgreementFromJudgeScores, createAntiSlopJudge, createChatClient, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createRunCriticAdapter, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, 
integrationManifestValidatedPayload, interRaterReliability, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, loadScorerFromGrader, localCommandRunner, lowercaseMutator, makeFinding, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, parseFindingSubject, parseRawFinding, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedMean, weightedRecall, whitespaceCollapseMutator, wilcoxonSignedRank, withJudgeRetry, wranglerDeployRunner };
|
package/dist/index.js
CHANGED
|
@@ -968,17 +968,194 @@ function resolveModel(req, defaultModel) {
|
|
|
968
968
|
}
|
|
969
969
|
|
|
970
970
|
// src/analyst/finding-signature.ts
|
|
971
|
+
import { z as z2 } from "zod";
|
|
972
|
+
|
|
973
|
+
// src/analyst/finding-subject.ts
|
|
971
974
|
import { z } from "zod";
|
|
975
|
+
/**
 * Every `kind` discriminant that `parseFindingSubject` can return,
 * grouped here by the prefix family of the subject string.
 */
var FINDING_SUBJECT_KINDS = [
  // `agent-knowledge:*` subjects
  "knowledge.wiki",
  "knowledge.claim",
  "knowledge.raw",
  "knowledge.stale",
  // prompt / tool / scaffolding subjects
  "system-prompt",
  "tool-doc",
  "new-tool",
  "rag",
  "memory",
  "scaffolding",
  "output-schema",
  // stale-signal subjects
  "websearch.outdated",
  "prior-run-summary",
  // bare kebab-case label
  "cluster"
];
|
|
991
|
+
function parseFindingSubject(raw) {
|
|
992
|
+
if (raw === null || raw === void 0) return null;
|
|
993
|
+
const trimmed = raw.trim();
|
|
994
|
+
if (trimmed.length === 0) return null;
|
|
995
|
+
const wiki = trimmed.match(
|
|
996
|
+
/^agent-knowledge:wiki:([a-z0-9][a-z0-9-]*)(?:#([a-z0-9][a-z0-9-]*))?$/
|
|
997
|
+
);
|
|
998
|
+
if (wiki)
|
|
999
|
+
return { kind: "knowledge.wiki", slug: wiki[1], ...wiki[2] ? { heading: wiki[2] } : {} };
|
|
1000
|
+
const claim = trimmed.match(/^agent-knowledge:claim:(.+)$/);
|
|
1001
|
+
if (claim && claim[1].trim().length > 0)
|
|
1002
|
+
return { kind: "knowledge.claim", topic: claim[1].trim() };
|
|
1003
|
+
const raw_ = trimmed.match(/^agent-knowledge:raw:(.+)$/);
|
|
1004
|
+
if (raw_ && raw_[1].trim().length > 0)
|
|
1005
|
+
return { kind: "knowledge.raw", sourceId: raw_[1].trim() };
|
|
1006
|
+
const stale = trimmed.match(/^agent-knowledge:stale:([a-z0-9][a-z0-9-]*)$/);
|
|
1007
|
+
if (stale) return { kind: "knowledge.stale", slug: stale[1] };
|
|
1008
|
+
const sp = trimmed.match(/^system-prompt:(.+)$/);
|
|
1009
|
+
if (sp && sp[1].trim().length > 0) return { kind: "system-prompt", section: sp[1].trim() };
|
|
1010
|
+
const tdAspect = trimmed.match(/^tool-doc:([a-z0-9][a-z0-9_-]*):(.+)$/);
|
|
1011
|
+
if (tdAspect && tdAspect[2].trim().length > 0) {
|
|
1012
|
+
return { kind: "tool-doc", tool: tdAspect[1], aspect: tdAspect[2].trim() };
|
|
1013
|
+
}
|
|
1014
|
+
const td = trimmed.match(/^tool-doc:([a-z0-9][a-z0-9_-]*)$/);
|
|
1015
|
+
if (td) return { kind: "tool-doc", tool: td[1] };
|
|
1016
|
+
const nt = trimmed.match(/^new-tool:([a-z0-9][a-z0-9_-]*)$/);
|
|
1017
|
+
if (nt) return { kind: "new-tool", name: nt[1] };
|
|
1018
|
+
const rag = trimmed.match(/^rag:([a-z0-9][a-z0-9_-]*):(.+)$/);
|
|
1019
|
+
if (rag && rag[2].trim().length > 0) {
|
|
1020
|
+
return { kind: "rag", corpus: rag[1], docId: rag[2].trim() };
|
|
1021
|
+
}
|
|
1022
|
+
const mem = trimmed.match(/^memory:(.+)$/);
|
|
1023
|
+
if (mem && mem[1].trim().length > 0) return { kind: "memory", key: mem[1].trim() };
|
|
1024
|
+
const sc = trimmed.match(/^scaffolding:(.+)$/);
|
|
1025
|
+
if (sc && sc[1].trim().length > 0) return { kind: "scaffolding", concern: sc[1].trim() };
|
|
1026
|
+
const os = trimmed.match(/^output-schema:(.+)$/);
|
|
1027
|
+
if (os && os[1].trim().length > 0) return { kind: "output-schema", field: os[1].trim() };
|
|
1028
|
+
const ws = trimmed.match(/^websearch:outdated:(.+)$/);
|
|
1029
|
+
if (ws && ws[1].trim().length > 0) return { kind: "websearch.outdated", topic: ws[1].trim() };
|
|
1030
|
+
const prs = trimmed.match(/^prior-run-summary:(.+)$/);
|
|
1031
|
+
if (prs && prs[1].trim().length > 0) return { kind: "prior-run-summary", topic: prs[1].trim() };
|
|
1032
|
+
if (/^[a-z0-9][a-z0-9-]*$/.test(trimmed) && trimmed.length <= 80) {
|
|
1033
|
+
return { kind: "cluster", label: trimmed };
|
|
1034
|
+
}
|
|
1035
|
+
return null;
|
|
1036
|
+
}
|
|
1037
|
+
function renderFindingSubject(s) {
|
|
1038
|
+
switch (s.kind) {
|
|
1039
|
+
case "knowledge.wiki":
|
|
1040
|
+
return s.heading ? `agent-knowledge:wiki:${s.slug}#${s.heading}` : `agent-knowledge:wiki:${s.slug}`;
|
|
1041
|
+
case "knowledge.claim":
|
|
1042
|
+
return `agent-knowledge:claim:${s.topic}`;
|
|
1043
|
+
case "knowledge.raw":
|
|
1044
|
+
return `agent-knowledge:raw:${s.sourceId}`;
|
|
1045
|
+
case "knowledge.stale":
|
|
1046
|
+
return `agent-knowledge:stale:${s.slug}`;
|
|
1047
|
+
case "system-prompt":
|
|
1048
|
+
return `system-prompt:${s.section}`;
|
|
1049
|
+
case "tool-doc":
|
|
1050
|
+
return s.aspect ? `tool-doc:${s.tool}:${s.aspect}` : `tool-doc:${s.tool}`;
|
|
1051
|
+
case "new-tool":
|
|
1052
|
+
return `new-tool:${s.name}`;
|
|
1053
|
+
case "rag":
|
|
1054
|
+
return `rag:${s.corpus}:${s.docId}`;
|
|
1055
|
+
case "memory":
|
|
1056
|
+
return `memory:${s.key}`;
|
|
1057
|
+
case "scaffolding":
|
|
1058
|
+
return `scaffolding:${s.concern}`;
|
|
1059
|
+
case "output-schema":
|
|
1060
|
+
return `output-schema:${s.field}`;
|
|
1061
|
+
case "websearch.outdated":
|
|
1062
|
+
return `websearch:outdated:${s.topic}`;
|
|
1063
|
+
case "prior-run-summary":
|
|
1064
|
+
return `prior-run-summary:${s.topic}`;
|
|
1065
|
+
case "cluster":
|
|
1066
|
+
return s.label;
|
|
1067
|
+
}
|
|
1068
|
+
}
|
|
1069
|
+
/**
 * Prompt fragment describing the finding-subject grammar in plain text.
 * It is assembled from sections: lines within a section are joined with a
 * single newline, and sections are separated by one blank line.
 */
var FINDING_SUBJECT_GRAMMAR_PROMPT = [
  // Opening instruction.
  ["Subjects MUST match this grammar \u2014 anything else is rejected at parse time and your work is wasted:"],
  // `agent-knowledge:*` subjects.
  [
    " Knowledge loci (write to the agent-knowledge base):",
    " agent-knowledge:wiki:<slug>[#<heading>] create / update a wiki page",
    " agent-knowledge:claim:<topic> draft a claim / relation triple",
    " agent-knowledge:raw:<source-id> lift a raw source into a curated page",
    " agent-knowledge:stale:<slug> mark a page superseded"
  ],
  // Prompt / tool / scaffolding subjects.
  [
    " Runtime mutable surfaces (write to prompts / tools / scaffolding):",
    " system-prompt:<section> add / replace a system-prompt section",
    " tool-doc:<tool>[:<aspect>] rewrite a tool description",
    " new-tool:<name> propose a new tool surface",
    " rag:<corpus>:<doc-id> ingest / correct a RAG document",
    " memory:<key> invalidate / set a memory entry",
    " scaffolding:<concern> change a precondition / retry / verifier",
    " output-schema:<field> constrain the agent output shape"
  ],
  // Stale-signal subjects.
  [
    " Stale signals (knowledge-poisoning only):",
    " websearch:outdated:<topic> stale web result",
    " prior-run-summary:<topic> stale prior-run summary"
  ],
  // Bare cluster label.
  [
    " Cluster label (failure-mode only):",
    ' <kebab-case-label> short cluster id, e.g. "tool-call-loop"'
  ],
  // Closing note on character classes.
  ["Slugs / tool ids: [a-z0-9-]+ (lowercase kebab). Topics / keys / sections: free-form, trimmed."]
].map((section) => section.join("\n")).join("\n\n");
|
|
1096
|
+
/**
 * Allow-list of subject kinds per analyst kind id. `createTraceAnalystKind`
 * looks up `KIND_EXPECTED_SUBJECTS[spec.id]` and rejects a finding whose
 * parsed subject kind is not in the list; analyst kinds with no entry here
 * are not filtered.
 */
var KIND_EXPECTED_SUBJECTS = {
  "failure-mode": ["cluster"],
  "knowledge-gap": [
    "knowledge.wiki",
    "knowledge.claim",
    "knowledge.raw",
    "knowledge.stale",
    "tool-doc",
    "system-prompt",
    "memory",
    "websearch.outdated",
    "prior-run-summary"
  ],
  // Same as knowledge-gap except that "knowledge.stale" is not listed.
  "knowledge-poisoning": [
    "knowledge.wiki",
    "knowledge.claim",
    "knowledge.raw",
    "tool-doc",
    "system-prompt",
    "memory",
    "websearch.outdated",
    "prior-run-summary"
  ],
  improvement: [
    "system-prompt",
    "tool-doc",
    "new-tool",
    "rag",
    "memory",
    "scaffolding",
    "output-schema",
    "knowledge.wiki",
    "knowledge.claim"
  ]
};
|
|
1131
|
+
/**
 * Zod string schema that only accepts values `parseFindingSubject` can
 * parse (i.e. the parser returns a non-null result for them).
 */
var FindingSubjectStringSchema = z.string().refine(
  (candidate) => parseFindingSubject(candidate) !== null,
  { message: "subject does not match the finding-subject grammar" }
);
|
|
1134
|
+
|
|
1135
|
+
// src/analyst/finding-signature.ts
|
|
972
1136
|
// Severity labels accepted by the `severity` enum of RawAnalystFindingSchema, listed from "critical" to "info".
var ANALYST_SEVERITIES = ["critical", "high", "medium", "low", "info"];
|
|
973
|
-
var RawAnalystFindingSchema =
|
|
974
|
-
severity:
|
|
975
|
-
claim:
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
1137
|
+
/**
 * Zod schema for a single raw finding row. The object is `.strict()`, so
 * rows carrying keys other than the ones declared here fail validation.
 */
var RawAnalystFindingSchema = z2.object({
  severity: z2.enum(ANALYST_SEVERITIES),
  claim: z2.string().min(1).max(2000),
  /**
   * Locus the finding is about, checked at parse time against the
   * finding-subject grammar via `parseFindingSubject`. A malformed
   * subject rejects the finding here instead of letting it be skipped
   * silently by downstream adapters.
   *
   * The field is optional: a purely descriptive finding with no
   * actionable locus is legitimate, it just does not route through the
   * KnowledgeAdapter / ImprovementAdapter.
   */
  subject: z2.string().max(400).refine(
    (candidate) => parseFindingSubject(candidate) !== null,
    { message: "subject does not match the finding-subject grammar" }
  ).optional(),
  evidence_uri: z2.string().min(1).max(2000),
  evidence_excerpt: z2.string().max(2000).optional(),
  confidence: z2.number().min(0).max(1),
  rationale: z2.string().max(4000).optional(),
  recommended_action: z2.string().max(2000).optional()
}).strict();
|
|
983
1160
|
var RAW_FINDING_SCHEMA_PROMPT = `Each finding MUST be a JSON object with these fields:
|
|
984
1161
|
- severity: one of "critical" | "high" | "medium" | "low" | "info"
|
|
@@ -1212,18 +1389,42 @@ function createTraceAnalystKind(spec, opts) {
|
|
|
1212
1389
|
tags: ctx.tags
|
|
1213
1390
|
});
|
|
1214
1391
|
const result = await ax.forward(opts.ai, { question: deriveQuestion(ctx, spec) });
|
|
1392
|
+
const expectedSubjects = KIND_EXPECTED_SUBJECTS[spec.id];
|
|
1215
1393
|
const out = [];
|
|
1216
1394
|
const rawRows = Array.isArray(result.findings) ? result.findings : [];
|
|
1395
|
+
let rejectedWrongKind = 0;
|
|
1217
1396
|
for (const row of rawRows) {
|
|
1218
1397
|
const parsed = parseRawFinding(row, ctx.log);
|
|
1219
1398
|
if (!parsed) continue;
|
|
1399
|
+
if (expectedSubjects && parsed.subject !== void 0) {
|
|
1400
|
+
const parsedSubject = parseFindingSubject(parsed.subject);
|
|
1401
|
+
if (parsedSubject === null) {
|
|
1402
|
+
ctx.log?.("finding rejected: subject failed to parse", {
|
|
1403
|
+
kind: spec.id,
|
|
1404
|
+
subject: parsed.subject
|
|
1405
|
+
});
|
|
1406
|
+
rejectedWrongKind += 1;
|
|
1407
|
+
continue;
|
|
1408
|
+
}
|
|
1409
|
+
if (!expectedSubjects.includes(parsedSubject.kind)) {
|
|
1410
|
+
ctx.log?.("finding rejected: subject variant not allowed for this kind", {
|
|
1411
|
+
kind: spec.id,
|
|
1412
|
+
subject_kind: parsedSubject.kind,
|
|
1413
|
+
subject: parsed.subject,
|
|
1414
|
+
allowed: expectedSubjects
|
|
1415
|
+
});
|
|
1416
|
+
rejectedWrongKind += 1;
|
|
1417
|
+
continue;
|
|
1418
|
+
}
|
|
1419
|
+
}
|
|
1220
1420
|
const postProcessed = spec.postProcess?.(parsed, ctx) ?? parsed;
|
|
1221
1421
|
if (!postProcessed) continue;
|
|
1222
1422
|
out.push(toAnalystFinding(spec, postProcessed));
|
|
1223
1423
|
}
|
|
1224
1424
|
ctx.log?.(`analyst.kind ${spec.id} done`, {
|
|
1225
1425
|
emitted: rawRows.length,
|
|
1226
|
-
accepted: out.length
|
|
1426
|
+
accepted: out.length,
|
|
1427
|
+
rejected_wrong_subject: rejectedWrongKind
|
|
1227
1428
|
});
|
|
1228
1429
|
return out;
|
|
1229
1430
|
}
|
|
@@ -1547,6 +1748,23 @@ var AnalystRegistry = class {
|
|
|
1547
1748
|
}));
|
|
1548
1749
|
}
|
|
1549
1750
|
async run(runId, inputs, runOpts = {}) {
|
|
1751
|
+
for await (const ev of this.runStream(runId, inputs, runOpts)) {
|
|
1752
|
+
if (ev.type === "run-completed") return ev.result;
|
|
1753
|
+
}
|
|
1754
|
+
throw new Error("AnalystRegistry.run: stream completed without run-completed event");
|
|
1755
|
+
}
|
|
1756
|
+
/**
|
|
1757
|
+
* Streaming counterpart to `run()`. Emits `AnalystRunEvent` values
|
|
1758
|
+
* in real time — `run-started`, then per-analyst `skipped` /
|
|
1759
|
+
* `started` / `completed`, then a terminal `run-completed` whose
|
|
1760
|
+
* payload is the full `AnalystRunResult`. UIs use this to render
|
|
1761
|
+
* progress; persistence consumers use `run()` and read the result.
|
|
1762
|
+
*
|
|
1763
|
+
* Hooks (`onBeforeAnalyze` / `onAfterAnalyze` / `onError` /
|
|
1764
|
+
* `onComplete`) fire as before — streaming is additive, not a hook
|
|
1765
|
+
* replacement.
|
|
1766
|
+
*/
|
|
1767
|
+
async *runStream(runId, inputs, runOpts = {}) {
|
|
1550
1768
|
const correlationId = `ar_${randomUUID().slice(0, 12)}`;
|
|
1551
1769
|
const log = this.options.log ?? (() => {
|
|
1552
1770
|
});
|
|
@@ -1556,6 +1774,13 @@ var AnalystRegistry = class {
|
|
|
1556
1774
|
const deadlineMs = runOpts.timeoutMs ? started + runOpts.timeoutMs : void 0;
|
|
1557
1775
|
const selected = this.selectAnalysts(runOpts);
|
|
1558
1776
|
const budget = runOpts.budget ?? this.options.defaultBudget;
|
|
1777
|
+
yield {
|
|
1778
|
+
type: "run-started",
|
|
1779
|
+
run_id: runId,
|
|
1780
|
+
correlation_id: correlationId,
|
|
1781
|
+
started_at: startedAt,
|
|
1782
|
+
analyst_ids: selected.map((a) => a.id)
|
|
1783
|
+
};
|
|
1559
1784
|
const summaries = [];
|
|
1560
1785
|
const allFindings = [];
|
|
1561
1786
|
let totalCost = 0;
|
|
@@ -1575,6 +1800,7 @@ var AnalystRegistry = class {
|
|
|
1575
1800
|
summaries.push(summary);
|
|
1576
1801
|
log(`[analyst] skip ${analyst.id} \u2014 missing input`, { runId, kind: analyst.inputKind });
|
|
1577
1802
|
await hooks.onAfterAnalyze?.({ analyst, summary, findings: [], runId });
|
|
1803
|
+
yield { type: "analyst-skipped", summary };
|
|
1578
1804
|
continue;
|
|
1579
1805
|
}
|
|
1580
1806
|
const perBudget = allocateBudget(budget, {
|
|
@@ -1594,6 +1820,11 @@ var AnalystRegistry = class {
|
|
|
1594
1820
|
priorFindings: selectPriorFindings(runOpts.priorFindings, analyst.id)
|
|
1595
1821
|
};
|
|
1596
1822
|
await hooks.onBeforeAnalyze?.({ analyst, ctx, runId });
|
|
1823
|
+
yield {
|
|
1824
|
+
type: "analyst-started",
|
|
1825
|
+
analyst_id: analyst.id,
|
|
1826
|
+
started_at: new Date(t0).toISOString()
|
|
1827
|
+
};
|
|
1597
1828
|
try {
|
|
1598
1829
|
const findings = await analyst.analyze(input.value, ctx);
|
|
1599
1830
|
const latency = Date.now() - t0;
|
|
@@ -1616,6 +1847,7 @@ var AnalystRegistry = class {
|
|
|
1616
1847
|
cost_usd: cost
|
|
1617
1848
|
});
|
|
1618
1849
|
await hooks.onAfterAnalyze?.({ analyst, summary, findings, runId });
|
|
1850
|
+
yield { type: "analyst-completed", summary, findings };
|
|
1619
1851
|
} catch (err) {
|
|
1620
1852
|
const latency = Date.now() - t0;
|
|
1621
1853
|
const e = err instanceof Error ? err : new Error(String(err));
|
|
@@ -1636,6 +1868,7 @@ var AnalystRegistry = class {
|
|
|
1636
1868
|
error: e.message
|
|
1637
1869
|
});
|
|
1638
1870
|
await hooks.onAfterAnalyze?.({ analyst, summary, findings: hookFindings, runId });
|
|
1871
|
+
yield { type: "analyst-completed", summary, findings: hookFindings };
|
|
1639
1872
|
}
|
|
1640
1873
|
}
|
|
1641
1874
|
const result = {
|
|
@@ -1648,7 +1881,7 @@ var AnalystRegistry = class {
|
|
|
1648
1881
|
total_cost_usd: totalCost
|
|
1649
1882
|
};
|
|
1650
1883
|
await hooks.onComplete?.({ result });
|
|
1651
|
-
|
|
1884
|
+
yield { type: "run-completed", result };
|
|
1652
1885
|
}
|
|
1653
1886
|
selectAnalysts(opts) {
|
|
1654
1887
|
let candidates = Array.from(this.analysts.values());
|
|
@@ -9142,8 +9375,8 @@ function chiSquareCritical(df, alpha) {
|
|
|
9142
9375
|
if (TABLE[df]) return TABLE[df][idx];
|
|
9143
9376
|
if (df > 30) {
|
|
9144
9377
|
const zMap = { 0: 1.282, 1: 1.645, 2: 1.96, 3: 2.326 };
|
|
9145
|
-
const
|
|
9146
|
-
const term = 1 - 2 / (9 * df) +
|
|
9378
|
+
const z3 = zMap[idx] ?? 1.96;
|
|
9379
|
+
const term = 1 - 2 / (9 * df) + z3 * Math.sqrt(2 / (9 * df));
|
|
9147
9380
|
return df * term ** 3;
|
|
9148
9381
|
}
|
|
9149
9382
|
const keys = Object.keys(TABLE).map((k) => Number(k)).sort((a, b) => a - b);
|
|
@@ -10095,10 +10328,13 @@ export {
|
|
|
10095
10328
|
ExperimentTracker,
|
|
10096
10329
|
FAILURE_CLASSES,
|
|
10097
10330
|
FAILURE_MODE_KIND_SPEC,
|
|
10331
|
+
FINDING_SUBJECT_GRAMMAR_PROMPT,
|
|
10332
|
+
FINDING_SUBJECT_KINDS,
|
|
10098
10333
|
FileSystemExperimentStore,
|
|
10099
10334
|
FileSystemFeedbackTrajectoryStore,
|
|
10100
10335
|
FileSystemRawProviderSink,
|
|
10101
10336
|
FileSystemTraceStore,
|
|
10337
|
+
FindingSubjectStringSchema,
|
|
10102
10338
|
FindingsStore,
|
|
10103
10339
|
HeldOutGate,
|
|
10104
10340
|
HoldoutAuditor,
|
|
@@ -10114,6 +10350,7 @@ export {
|
|
|
10114
10350
|
JsonlTrialCache,
|
|
10115
10351
|
JudgeError,
|
|
10116
10352
|
JudgeRunner,
|
|
10353
|
+
KIND_EXPECTED_SUBJECTS,
|
|
10117
10354
|
KNOWLEDGE_GAP_KIND_SPEC,
|
|
10118
10355
|
KNOWLEDGE_POISONING_KIND_SPEC,
|
|
10119
10356
|
LineageRecorder,
|
|
@@ -10339,6 +10576,7 @@ export {
|
|
|
10339
10576
|
paretoFrontier,
|
|
10340
10577
|
paretoFrontierWithCrowding,
|
|
10341
10578
|
parseFeedbackTrajectoriesJsonl,
|
|
10579
|
+
parseFindingSubject,
|
|
10342
10580
|
parseRawFinding,
|
|
10343
10581
|
parseReflectionResponse,
|
|
10344
10582
|
parseRunRecordSafe,
|
|
@@ -10363,6 +10601,7 @@ export {
|
|
|
10363
10601
|
regexMatch,
|
|
10364
10602
|
regexMatches,
|
|
10365
10603
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
10604
|
+
renderFindingSubject,
|
|
10366
10605
|
renderMarkdown,
|
|
10367
10606
|
renderMarkdownReport,
|
|
10368
10607
|
renderPlaybookMarkdown,
|