npm - @tangle-network/agent-eval - Versions diffs - 0.67.0 → 0.68.0 - Mend

@tangle-network/agent-eval 0.67.0 → 0.68.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/CHANGELOG.md +15 -0
package/dist/campaign/index.js +9 -9
package/dist/{chunk-MZ2IYGGN.js → chunk-E24XD7A2.js} +4 -278
package/dist/chunk-E24XD7A2.js.map +1 -0
package/dist/{chunk-NV2PF37Q.js → chunk-JFGZPUMU.js} +277 -3
package/dist/chunk-JFGZPUMU.js.map +1 -0
package/dist/contract/index.js +6 -6
package/dist/index.d.ts +113 -5
package/dist/index.js +100 -2
package/dist/index.js.map +1 -1
package/dist/openapi.json +1 -1
package/package.json +1 -1
package/dist/chunk-MZ2IYGGN.js.map +0 -1
package/dist/chunk-NV2PF37Q.js.map +0 -1

package/dist/index.d.ts CHANGED Viewed

@@ -25,7 +25,7 @@ import { A as AgentEvalError } from './errors-Dwqw-T_m.js';
 export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-Dwqw-T_m.js';
 import { a as FeedbackLabel, F as FeedbackTrajectoryStore, b as FeedbackTrajectory } from './feedback-trajectory-8hKC5EOb.js';
 export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind, f as FeedbackLabelSource, g as FeedbackOptimizerRow, h as FeedbackOutcome, i as FeedbackReplayAdapter, j as FeedbackReplayResult, k as FeedbackSeverity, l as FeedbackSplitPolicy, m as FeedbackTask, n as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-8hKC5EOb.js';
-import { A as AgentProfile } from './agent-profile-DzcPHR1Z.js';
+import { A as AgentProfile$1 } from './agent-profile-DzcPHR1Z.js';
 export { a as BackendIntegrityError, B as BackendIntegrityReport, b as agentProfileHash, c as assertRealBackend, s as summarizeBackendIntegrity } from './agent-profile-DzcPHR1Z.js';
 export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
 import { h as ReleaseConfidenceThresholds, f as ReleaseConfidenceScorecard } from './release-report-CN8hJlhk.js';
@@ -2962,19 +2962,19 @@ interface ScorecardCell {
 interface Scorecard {
     cells: ScorecardCell[];
     /** Profile definitions seen — keeps the scorecard self-describing. */
-    profiles: Record<string, AgentProfile>;
+    profiles: Record<string, AgentProfile$1>;
 }
 /** One append-only log line — a single cell's entry for a single commit. */
 interface ScorecardLogLine {
     scenarioId: string;
     profileHash: string;
     model: string;
-    profile: AgentProfile;
+    profile: AgentProfile$1;
     entry: ScorecardEntry;
 }
 interface RecordRunsOptions {
     /** The profile that produced these runs — keys the cell. */
-    profile: AgentProfile;
+    profile: AgentProfile$1;
     commitSha: string;
     /** Defaults to `new Date().toISOString()`. */
     timestamp?: string;
@@ -5821,4 +5821,112 @@ declare function defaultRenderStudentPrompt<TInput>(args: {
  *  Throws on failure so the cell is recorded as failed, not silently zeroed. */
 declare function defaultParseStudentLabel<TProduced>(rawContent: string, scenarioId: string): TProduced;
-export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, AgentProfile, type AgreementResult, type AlignmentOp, Analyst, AnalystContext, AnalystCost, AnalystFinding, AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, type BuildAgreementJudgeOptions, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, ChatRequest, CheckResult, CollectedArtifacts, type CommandRunner, type CompareLabels, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, CreateChatClientOpts, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FieldAgreementSpec, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldScenario, type GoldSplit, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFamily, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type ParseStudentLabel, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RenderStudentPrompt, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, type RunDistillationOptions, type RunDistillationResult, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SKILL_USAGE_ANALYST, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, SkillUsageAnalyst, type SkillUsageRecord, type SkillUsageReport, type SkillUsageScanConfig, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SplitGoldOptions, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregatePrReviewScore, aggregateRunScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertCrossFamily, assertSingleBackend, attributeCounterfactuals, bisect, buildAgreementJudge, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildSkillUsageReport, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, containsAll, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultParseStudentLabel, defaultReferenceReplayMatcher, defaultRenderStudentPrompt, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, emitSkillUsageFindings, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fieldAgreement, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isModelPriced, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, liftSeverity, linterJudge, loadGoldScenarios, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseGoldJsonl, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, resolveModelPricing, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runDistillation, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, splitGold, statusAdvanced, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
+/**
+ * @experimental — surface may evolve as production agents wire it in.
+ *
+ * Structured agent profile — the system prompt as named, addressable sections
+ * instead of one opaque blob. The self-improvement loop targets ONE evolvable
+ * `domain` section at a time (via `applyDomainPatch`); the role, environment,
+ * tool conventions, and skill roster stay fixed so a candidate diff is
+ * attributable to a single section rather than a whole-prompt rewrite.
+ *
+ * `renderProfile` emits the prompt in a fixed five-zone order — role,
+ * Environment, Tool conventions, Skills, Domain guidance — mirroring the layout
+ * of Harvey LAB's `harness/system_prompt.md` (workspace root, read-only
+ * documents, output dir, skills dir). `profileToSurface` is the bridge to the
+ * loop's string `MutableSurface`: a profile renders to exactly the text a
+ * candidate is scored on.
+ *
+ * Distinct from the benchmark-cell `AgentProfile` in `src/agent-profile.ts`,
+ * which fingerprints a (model, skills, prompt, tools) cell for the scorecard.
+ * That one is the unit of variation; this one is the prompt CONTENT being
+ * varied. Consume this module by its own path (`@tangle-network/agent-eval`
+ * exposes it under the `profile` namespace) to avoid the name clash.
+ */
+/** A named, addressable region of the system prompt. `evolvable` marks whether
+ *  the self-improvement loop is allowed to patch its body; fixed scaffolding
+ *  (e.g. a compliance preamble) sets `evolvable: false`. */
+interface AgentProfileSection {
+    id: string;
+    title: string;
+    body: string;
+    evolvable: boolean;
+}
+/** A skill the agent can invoke. `triggers` are the phrases/conditions that
+ *  should activate it; `scriptsDir` points at its executable scripts when the
+ *  skill ships code (mirrors Harvey's `skills/<name>/scripts/`). */
+interface ProfileSkill {
+    name: string;
+    description: string;
+    triggers: string[];
+    scriptsDir?: string;
+}
+/** The structured system prompt. The first four fields are fixed scaffolding;
+ *  `domain` is the evolvable surface the loop optimizes. */
+interface AgentProfile {
+    role: string;
+    environment: string;
+    toolConventions: string;
+    skills: ProfileSkill[];
+    domain: AgentProfileSection[];
+}
+/**
+ * Emit the sectioned system prompt in fixed order: role text, then
+ * `## Environment`, `## Tool conventions`, `## Skills` (each skill as
+ * `### <name>` + description + triggers), `## Domain guidance` (each section as
+ * `### <title>` + body). The order is load-bearing — the loop diffs rendered
+ * text, and reordering would make every candidate look like a full rewrite.
+ */
+declare function renderProfile(p: AgentProfile): string;
+/** The string `MutableSurface` the self-improvement loop scores — a profile
+ *  renders to exactly the text a candidate is graded on. */
+declare function profileToSurface(p: AgentProfile): string;
+/**
+ * The fixed-scaffolding baseline: a `role`/`environment`/`toolConventions`
+ * profile with the stock skill roster and NO domain guidance. The
+ * production profile is this plus shipped domain sections.
+ */
+declare function baselineProfile(args: {
+    role: string;
+    environment?: string;
+    toolConventions?: string;
+    skills?: ProfileSkill[];
+}): AgentProfile;
+/** The production profile: the baseline scaffolding plus the domain sections
+ *  shipped after self-improvement. Differs from the baseline ONLY in `domain`
+ *  (and any skills the caller layered into the baseline) — the role,
+ *  environment, and tool conventions are carried through unchanged. */
+declare function prodProfile(baseline: AgentProfile, shipped: AgentProfileSection[]): AgentProfile;
+/**
+ * Section-scoped edit — replace the body of ONE domain section by id, leaving
+ * every other section byte-identical. This is how the loop targets a single
+ * evolvable surface. Throws on an unknown section id (a patch that hits no
+ * section is a silent no-op otherwise — fail loud). Throws when the targeted
+ * section is `evolvable: false`.
+ */
+declare function applyDomainPatch(p: AgentProfile, sectionId: string, newBody: string): AgentProfile;
+/**
+ * Content hash of a single section — its loop identity. Delegates to the
+ * campaign provenance `surfaceContentHash` (the same helper the loop's
+ * provenance record uses for full surfaces), so a section hash is byte-for-byte
+ * comparable with the `sha256:`-prefixed content hashes that record carries.
+ * Two sections that render identically (same title + body) share a hash; the
+ * `id` and `evolvable` flag are excluded — they are routing metadata, not
+ * content.
+ */
+declare function sectionHash(section: AgentProfileSection): string;
+type index_AgentProfile = AgentProfile;
+type index_AgentProfileSection = AgentProfileSection;
+type index_ProfileSkill = ProfileSkill;
+declare const index_applyDomainPatch: typeof applyDomainPatch;
+declare const index_baselineProfile: typeof baselineProfile;
+declare const index_prodProfile: typeof prodProfile;
+declare const index_profileToSurface: typeof profileToSurface;
+declare const index_renderProfile: typeof renderProfile;
+declare const index_sectionHash: typeof sectionHash;
+declare namespace index {
+  export { type index_AgentProfile as AgentProfile, type index_AgentProfileSection as AgentProfileSection, type index_ProfileSkill as ProfileSkill, index_applyDomainPatch as applyDomainPatch, index_baselineProfile as baselineProfile, index_prodProfile as prodProfile, index_profileToSurface as profileToSurface, index_renderProfile as renderProfile, index_sectionHash as sectionHash };
+}
+export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, AgentProfile$1 as AgentProfile, type AgreementResult, type AlignmentOp, Analyst, AnalystContext, AnalystCost, AnalystFinding, AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AssertCrossFamilyOptions, type AssertSingleBackendOptions, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BackendDescriptor, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, type BuildAgreementJudgeOptions, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, ChatRequest, CheckResult, CollectedArtifacts, type CommandRunner, type CompareLabels, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, CreateChatClientOpts, type CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, CrossFamilyError, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FieldAgreementSpec, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldScenario, type GoldSplit, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFamily, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type ParseStudentLabel, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RenderStudentPrompt, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, type RunDistillationOptions, type RunDistillationResult, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SKILL_USAGE_ANALYST, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type SingleBackendDivergence, SingleBackendError, type SingleBackendField, type SingleBackendReport, SkillUsageAnalyst, type SkillUsageRecord, type SkillUsageReport, type SkillUsageScanConfig, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SplitGoldOptions, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregatePrReviewScore, aggregateRunScore, analyzeAntiSlop, analyzeSeries, appendScorecard, assertCrossFamily, assertSingleBackend, attributeCounterfactuals, bisect, buildAgreementJudge, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildSkillUsageReport, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, containsAll, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultParseStudentLabel, defaultReferenceReplayMatcher, defaultRenderStudentPrompt, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, emitSkillUsageFindings, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fieldAgreement, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isModelPriced, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeFamily, keyPreserved, liftSeverity, linterJudge, loadGoldScenarios, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseGoldJsonl, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, index as profile, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, resolveModelPricing, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runDistillation, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, splitGold, statusAdvanced, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };

package/dist/index.js CHANGED Viewed

@@ -29,8 +29,9 @@ import {
   runImprovementLoop,
   scalarScore,
   scoreRedTeamOutput,
+  surfaceContentHash,
   toolNamesForRun
-} from "./chunk-NV2PF37Q.js";
+} from "./chunk-JFGZPUMU.js";
 import {
   BackendIntegrityError,
   assertRealBackend,
@@ -274,7 +275,9 @@ import {
   ValidationError,
   VerificationError
 } from "./chunk-3BFEG2F6.js";
-import "./chunk-PZ5AY32C.js";
+import {
+  __export
+} from "./chunk-PZ5AY32C.js";
 // src/run-score.ts
 var DEFAULT_RUN_SCORE_WEIGHTS = {
@@ -10712,6 +10715,100 @@ function replacerSortKeys() {
     return value;
   };
 }
+// src/profile/index.ts
+var profile_exports = {};
+__export(profile_exports, {
+  applyDomainPatch: () => applyDomainPatch,
+  baselineProfile: () => baselineProfile,
+  prodProfile: () => prodProfile,
+  profileToSurface: () => profileToSurface,
+  renderProfile: () => renderProfile,
+  sectionHash: () => sectionHash
+});
+function renderSkill(skill) {
+  const lines = [`### ${skill.name}`, skill.description];
+  if (skill.triggers.length > 0) {
+    lines.push(`Triggers: ${skill.triggers.join(", ")}`);
+  }
+  if (skill.scriptsDir !== void 0) {
+    lines.push(`Scripts: ${skill.scriptsDir}`);
+  }
+  return lines.join("\n");
+}
+function renderProfile(p) {
+  const zones = [
+    p.role.trim(),
+    `## Environment
+${p.environment.trim()}`,
+    `## Tool conventions
+${p.toolConventions.trim()}`,
+    `## Skills
+${p.skills.length > 0 ? p.skills.map(renderSkill).join("\n\n") : "_No skills configured._"}`,
+    `## Domain guidance
+${p.domain.length > 0 ? p.domain.map((s) => `### ${s.title}
+${s.body.trim()}`).join("\n\n") : "_No domain guidance yet._"}`
+  ];
+  return zones.join("\n\n");
+}
+function profileToSurface(p) {
+  return renderProfile(p);
+}
+var STOCK_SKILLS = [
+  {
+    name: "read-documents",
+    description: "Read every source document under the read-only documents directory before drafting.",
+    triggers: [
+      "task references a source document",
+      "instructions cite an exhibit, filing, or input file"
+    ]
+  },
+  {
+    name: "write-deliverable",
+    description: "Write deliverables to the output directory; use markdown for notes and the file-type skills for binary artifacts.",
+    triggers: ["the task asks for a produced artifact", "a deliverable is named in the brief"]
+  }
+];
+var ENVIRONMENT_PREAMBLE = "You are an AI agent executing a task within a single workspace root.\n\n- Workspace root: bash starts here. Use it for notes, scratch files, and intermediate work.\n- Documents directory: task source documents live here and are read-only \u2014 never write to them.\n- Output directory: write every deliverable here; relative write/edit paths route here automatically.\n- Skills directory: skill scripts live at <workspace>/skills/<name>/scripts/.";
+var TOOL_CONVENTIONS_PREAMBLE = "Use read to consume input files. Use write only for plain markdown (typically a response summary). Use edit for incremental refinement of a file you already created. External-boundary calls return typed outcomes \u2014 inspect success before using a value.";
+function baselineProfile(args) {
+  return {
+    role: args.role,
+    environment: args.environment ?? ENVIRONMENT_PREAMBLE,
+    toolConventions: args.toolConventions ?? TOOL_CONVENTIONS_PREAMBLE,
+    skills: args.skills ?? STOCK_SKILLS,
+    domain: []
+  };
+}
+function prodProfile(baseline, shipped) {
+  return { ...baseline, domain: [...baseline.domain, ...shipped] };
+}
+function applyDomainPatch(p, sectionId, newBody) {
+  const idx = p.domain.findIndex((s) => s.id === sectionId);
+  if (idx === -1) {
+    throw new Error(
+      `applyDomainPatch: no domain section "${sectionId}" (have: ${p.domain.map((s) => s.id).join(", ") || "<none>"})`
+    );
+  }
+  const target = p.domain[idx];
+  if (target === void 0) {
+    throw new Error(`applyDomainPatch: domain section "${sectionId}" resolved to undefined`);
+  }
+  if (!target.evolvable) {
+    throw new Error(`applyDomainPatch: domain section "${sectionId}" is not evolvable`);
+  }
+  const next = [...p.domain];
+  next[idx] = { ...target, body: newBody };
+  return { ...p, domain: next };
+}
+function sectionHash(section) {
+  return surfaceContentHash(JSON.stringify({ title: section.title, body: section.body }));
+}
 export {
   AGENT_PROFILE_KINDS,
   ANALYST_SEVERITIES,
@@ -11058,6 +11155,7 @@ export {
   positionalBias,
   printDriverSummary,
   probeLlm,
+  profile_exports as profile,
   promptBisect,
   proposeAutomatedPullRequest,
   proposeSynthesisTargets,