@tangle-network/agent-eval 0.71.0 → 0.72.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +63 -0
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/otel.d.ts +3 -2
- package/dist/agent-profile-DYRboYWu.d.ts +364 -0
- package/dist/analyst/index.d.ts +221 -0
- package/dist/analyst/index.js +371 -0
- package/dist/analyst/index.js.map +1 -0
- package/dist/analyst-t7zZS3TV.d.ts +88 -0
- package/dist/campaign/index.d.ts +485 -9
- package/dist/campaign/index.js +618 -30
- package/dist/campaign/index.js.map +1 -1
- package/dist/chunk-7W4SM7FD.js +1075 -0
- package/dist/chunk-7W4SM7FD.js.map +1 -0
- package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
- package/dist/chunk-JHA3ZGSO.js +1496 -0
- package/dist/chunk-JHA3ZGSO.js.map +1 -0
- package/dist/{chunk-VMAYE3LM.js → chunk-JYE3WOTE.js} +57 -9
- package/dist/{chunk-VMAYE3LM.js.map → chunk-JYE3WOTE.js.map} +1 -1
- package/dist/chunk-LB2UOI5F.js +412 -0
- package/dist/chunk-LB2UOI5F.js.map +1 -0
- package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
- package/dist/chunk-VUINJM5M.js.map +1 -0
- package/dist/chunk-WYIHD6EB.js +1044 -0
- package/dist/chunk-WYIHD6EB.js.map +1 -0
- package/dist/{chunk-6QZUCFKM.js → chunk-XPILG2CA.js} +120 -3
- package/dist/chunk-XPILG2CA.js.map +1 -0
- package/dist/{chunk-6XQIEUQ2.js → chunk-ZPSKPT3V.js} +5 -3
- package/dist/{chunk-6XQIEUQ2.js.map → chunk-ZPSKPT3V.js.map} +1 -1
- package/dist/contract/index.d.ts +17 -13
- package/dist/contract/index.js +14 -8
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
- package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
- package/dist/hosted/index.d.ts +223 -2
- package/dist/index.d.ts +49 -1323
- package/dist/index.js +339 -2627
- package/dist/index.js.map +1 -1
- package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
- package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
- package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
- package/dist/openapi.json +1 -1
- package/dist/pareto-E-pembql.d.ts +81 -0
- package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
- package/dist/redact-B40YG2M_.d.ts +45 -0
- package/dist/registry-DuVYiTvw.d.ts +128 -0
- package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
- package/dist/rl.d.ts +4 -3
- package/dist/rl.js +4 -4
- package/dist/{run-campaign-BVY3RGAZ.js → run-campaign-OVEZF24D.js} +2 -2
- package/dist/run-critic-BAIjX99r.d.ts +56 -0
- package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
- package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
- package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
- package/dist/traces.d.ts +371 -308
- package/dist/traces.js +43 -18
- package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
- package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
- package/dist/wire/index.d.ts +1 -1
- package/dist/workflow/index.d.ts +494 -0
- package/dist/workflow/index.js +2177 -0
- package/dist/workflow/index.js.map +1 -0
- package/docs/design/self-improvement-roadmap.md +106 -0
- package/package.json +36 -12
- package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
- package/dist/chunk-6QZUCFKM.js.map +0 -1
- package/dist/chunk-ODGETRTM.js.map +0 -1
- package/dist/chunk-PQV2TKC3.js +0 -27
- package/dist/chunk-PQV2TKC3.js.map +0 -1
- /package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
- /package/dist/{run-campaign-BVY3RGAZ.js.map → run-campaign-OVEZF24D.js.map} +0 -0
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
import { AxAIService, AxFunction } from '@ax-llm/ax';
|
|
2
|
+
import { M as MultiLayerVerifier, V as VerifyOptions, S as Severity } from '../multi-layer-verifier-DlWCXuxL.js';
|
|
3
|
+
import { c as RunCritic, a as RunTrace } from '../run-critic-BAIjX99r.js';
|
|
4
|
+
import { S as SemanticConceptJudgeOptions, a as SemanticConceptJudgeInput, B as BehavioralMetrics } from '../semantic-concept-judge-CV9Wlx4t.js';
|
|
5
|
+
export { C as CreateAnalystAiConfig, D as DEFAULT_TRACE_ANALYST_KINDS, b as DefaultAnalystRegistryOptions, c as DiffPolicy, F as FAILURE_MODE_KIND_SPEC, d as FINDING_SUBJECT_GRAMMAR_PROMPT, e as FINDING_SUBJECT_KINDS, f as FindingSubject, g as FindingSubjectKind, h as FindingSubjectStringSchema, i as FindingsDiff, j as FindingsStore, I as IMPROVEMENT_KIND_SPEC, K as KIND_EXPECTED_SUBJECTS, k as KNOWLEDGE_GAP_KIND_SPEC, l as KNOWLEDGE_POISONING_KIND_SPEC, P as PersistedFinding, m as SKILL_USAGE_ANALYST, n as SkillUsageAnalyst, o as SkillUsageRecord, p as SkillUsageReport, q as SkillUsageScanConfig, r as buildDefaultAnalystRegistry, s as buildSkillUsageReport, t as createAnalystAi, u as defaultIsMaterial, v as diffFindings, w as emitSkillUsageFindings, x as parseFindingSubject, y as renderFindingSubject } from '../semantic-concept-judge-CV9Wlx4t.js';
|
|
6
|
+
import { A as AnalyzeTracesOptions } from '../analyst-t7zZS3TV.js';
|
|
7
|
+
import { T as TraceAnalysisStore } from '../store-GmBE2pZZ.js';
|
|
8
|
+
import { b as JudgeFn, a as JudgeInput } from '../types-Croy5h7V.js';
|
|
9
|
+
import { A as Analyst, h as AnalystSeverity, c as AnalystFinding } from '../types-CRD68aH7.js';
|
|
10
|
+
export { a as AnalystContext, g as AnalystCost, i as AnalystInputKind, j as AnalystRequirements, f as AnalystRunEvent, e as AnalystRunInputs, d as AnalystRunResult, b as AnalystRunSummary, k as ChatCallOpts, C as ChatClient, l as ChatRequest, m as ChatResponse, n as ChatTransport, o as CliBridgeTransportOpts, p as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RouterTransportOpts, S as SandboxSdkTransportOpts, q as computeFindingId, r as createChatClient, s as makeFinding } from '../types-CRD68aH7.js';
|
|
11
|
+
import { TCloud } from '@tangle-network/tcloud';
|
|
12
|
+
export { A as ANALYST_SEVERITIES, C as CreateTraceAnalystKindOpts, R as RAW_FINDING_SCHEMA_PROMPT, a as RawAnalystFinding, b as RawAnalystFindingSchema, T as TraceAnalystGolden, c as TraceAnalystKindSpec, d as createTraceAnalystKind, p as parseRawFinding, r as renderPriorFindings } from '../kind-factory-DW9XWPvM.js';
|
|
13
|
+
export { A as AnalystHooks, a as AnalystRegistry, b as AnalystRegistryOptions, B as BudgetPolicy, R as RegistryRunOpts } from '../registry-DuVYiTvw.js';
|
|
14
|
+
import { L as LlmClientOptions } from '../llm-client-DbjLfz-K.js';
|
|
15
|
+
import '../schema-m0gsnbt3.js';
|
|
16
|
+
import '../store-CKUAgsJz.js';
|
|
17
|
+
import 'zod';
|
|
18
|
+
import '../run-record-BgTFzO2r.js';
|
|
19
|
+
import '../errors-Dwqw-T_m.js';
|
|
20
|
+
import '../raw-provider-sink-C46HDghv.js';
|
|
21
|
+
|
|
22
|
+
/**
|
|
23
|
+
* Adapter factories — lift each existing agent-eval primitive into the
|
|
24
|
+
* Analyst contract without re-implementing it.
|
|
25
|
+
*
|
|
26
|
+
* Five primitives, five factories. Each one:
|
|
27
|
+
* - Builds an Analyst with a stable id (caller chooses; defaults
|
|
28
|
+
* given), a sensible default `inputKind`, a version derived from
|
|
29
|
+
* the wrapped primitive's version + an adapter revision, and an
|
|
30
|
+
* `analyze()` that calls the primitive and lifts its output to
|
|
31
|
+
* AnalystFinding[] using `makeFinding()`.
|
|
32
|
+
* - Maps severities: the existing `Severity` ('critical' | 'major' |
|
|
33
|
+
* 'minor' | 'info') projects onto AnalystSeverity ('critical' |
|
|
34
|
+
* 'high' | 'medium' | 'low' | 'info'); 'major' → 'high', 'minor' →
|
|
35
|
+
* 'medium'. Domain analysts that want finer-grained mapping override.
|
|
36
|
+
*
|
|
37
|
+
* Adapters never own state. Calling the same factory twice with the
|
|
38
|
+
* same primitive instance is safe.
|
|
39
|
+
*/
|
|
40
|
+
|
|
41
|
+
/**
 * Map a verifier `Severity` onto `AnalystSeverity`: 'critical' and 'info' keep
 * their names, 'major' becomes 'high' and 'minor' becomes 'medium'.
 */
declare function liftSeverity(s: Severity): AnalystSeverity;
|
|
42
|
+
/** Options for `createTraceAnalystAdapter`. */
interface TraceAnalystAdapterOpts {
    /** Analyst id stamped on every finding. Defaults to "trace-analyst". */
    id?: string;
    /** Area stamped on every finding. Defaults to "agent-reasoning". */
    area?: string;
    /**
     * The natural-language question(s) put to the analyst, asked one at a time.
     * Every bullet returned for a question becomes its own finding; a question
     * that yields no bullets produces a single `info` finding carrying the answer.
     */
    questions: string[];
    /** Caller-provided AxAI service — same one trace-analyst.ts expects. */
    ai: AxAIService;
    /** Model forwarded to analyzeTraces; also listed in the analyst's `cost.models` when set. */
    model?: string;
    /** Forwarded to analyzeTraces. */
    extra?: Omit<AnalyzeTracesOptions, 'source' | 'ai' | 'model'>;
}
|
|
53
|
+
/**
 * Wrap the legacy `analyzeTraces` flow as an `Analyst` over a trace store.
 *
 * Questions are asked sequentially. Each returned bullet becomes a `medium` /
 * confidence `0.6` finding; a question with no bullets yields one `info` /
 * confidence `0.5` finding whose claim is the first 200 characters of the
 * answer. The loop stops early once `ctx.signal` is aborted and returns what
 * has been collected so far.
 *
 * @param opts - Adapter options; `questions` and `ai` are required.
 * @returns An analyst whose `inputKind` is "trace-store".
 *
 * @deprecated Prefer `createTraceAnalystKind` + one of the failure /
 * improvement kinds from `./kinds`. This adapter wraps the legacy
 * `analyzeTraces` flow whose output is `findings:string[]` — every
 * bullet gets flat-defaulted severity `medium` / confidence `0.6`,
 * which loses the per-finding grading kinds provide via Ax structured
 * output + Zod validation. Kept for one minor while consumers migrate.
 */
declare function createTraceAnalystAdapter(opts: TraceAnalystAdapterOpts): Analyst<TraceAnalysisStore>;
|
|
62
|
+
/** Options for `createVerifierAdapter`. */
interface VerifierAdapterOpts<Env> {
    /** Analyst id stamped on every finding. Defaults to "multi-layer-verifier". */
    id?: string;
    /** Area stamped on every finding. Defaults to "verification". */
    area?: string;
    /** Verifier whose `run()` is invoked once per `analyze()` call. */
    verifier: MultiLayerVerifier<Env>;
    /**
     * Extra options passed to the verifier's `run()` alongside the env.
     * The verifier expects an `env` per run. Adapters take it from
     * `AnalystRunInputs.custom[<id>]` via the registry's 'custom' routing.
     */
    options?: Omit<VerifyOptions<Env>, 'env'>;
}
|
|
72
|
+
/**
 * Build an `Analyst` that runs `opts.verifier` against the env handed to
 * `analyze()`. It returns one finding per layer finding, plus one finding for
 * every layer whose status is "fail", "error" or "timeout".
 */
declare function createVerifierAdapter<Env>(opts: VerifierAdapterOpts<Env>): Analyst<Env>;
|
|
73
|
+
/** Options for `createRunCriticAdapter`. */
interface RunCriticAdapterOpts {
    /** Analyst id stamped on every finding. Defaults to "run-critic". */
    id?: string;
    /** Area stamped on every finding. Defaults to "run-quality". */
    area?: string;
    /** Critic used to score the trace. Defaults to a fresh `RunCritic`. */
    critic?: RunCritic;
    /**
     * Optional threshold below which a dimension is reported as a finding. Default 0.5.
     * The drift finding uses the complement: it is reported when
     * `driftPenalty > 1 - threshold`.
     */
    threshold?: number;
}
|
|
80
|
+
/**
 * Build an `Analyst` that scores a `RunTrace` with the critic. It returns one
 * finding per numeric score dimension that falls below the threshold, plus a
 * drift finding when the drift penalty exceeds `1 - threshold`.
 */
declare function createRunCriticAdapter(opts?: RunCriticAdapterOpts): Analyst<RunTrace>;
|
|
81
|
+
/** Options for `createJudgeAdapter`. */
interface JudgeAdapterOpts {
    /** Analyst id stamped on every finding. Defaults to "judge". */
    id?: string;
    /** Area stamped on every finding. Defaults to "judge". */
    area?: string;
    /** Judge function; invoked as `judge(tcloud, input)` on every `analyze()` call. */
    judge: JudgeFn;
    /** TCloud handle the JudgeFn calls. */
    tcloud: TCloud;
    /** Optional cost classification — most judges call an LLM. Defaults to `{ kind: 'llm' }`. */
    cost?: Analyst['cost'];
    /**
     * Optional threshold below which a JudgeScore becomes a finding. Default 6 (on 0-10 scale).
     * Scores of 1 or less are treated as 0-1 fractions and multiplied by 10
     * before the comparison.
     */
    threshold?: number;
}
|
|
92
|
+
/**
 * Build an `Analyst` around a `JudgeFn`. Each score that is below the
 * threshold after normalisation to a 0-10 scale becomes one finding.
 */
declare function createJudgeAdapter(opts: JudgeAdapterOpts): Analyst<JudgeInput>;
|
|
93
|
+
/** Options for `createSemanticConceptJudgeAdapter`. */
interface SemanticConceptJudgeAdapterOpts {
    /** Analyst id stamped on every finding. Defaults to "semantic-concept-judge". */
    id?: string;
    /** Area stamped on every finding. Defaults to "concept-coverage". */
    area?: string;
    /** Passed through to the semantic-concept judge; `options.model`, when set, is listed in `cost.models`. */
    options?: SemanticConceptJudgeOptions;
}
|
|
98
|
+
/**
 * Build an `Analyst` around the semantic-concept judge. Concepts that are
 * present with a score of 7 or more are skipped; every other concept becomes a
 * "missing" or "weak" finding. When the judge reports itself unavailable, a
 * single `info` finding is returned instead.
 */
declare function createSemanticConceptJudgeAdapter(opts?: SemanticConceptJudgeAdapterOpts): Analyst<SemanticConceptJudgeInput>;
|
|
99
|
+
|
|
100
|
+
/**
|
|
101
|
+
* `behavioralAnalyst` — a DETERMINISTIC analyst (cost.kind = 'deterministic',
|
|
102
|
+
* never calls the LLM). It produces the efficiency/behavioral findings a
|
|
103
|
+
* tolerant agentic analyzer (HALO) re-derives per run inside the model —
|
|
104
|
+
* context bloat, output decay, tool monoculture, missing self-verification —
|
|
105
|
+
* directly from arithmetic over spans (`computeTraceMetrics`).
|
|
106
|
+
*
|
|
107
|
+
* Why it matters: these findings are model-agnostic BY CONSTRUCTION (no model
|
|
108
|
+
* in the loop), so they cannot return 0 on a weak model the way the Ax-RLM
|
|
109
|
+
* does — and they are strictly more reliable than HALO, which spends tokens
|
|
110
|
+
* re-deriving the same numbers and can hallucinate the trend. The agentic
|
|
111
|
+
* RLM kinds remain for SEMANTIC findings that genuinely need a model; this
|
|
112
|
+
* analyst owns the behavioral class.
|
|
113
|
+
*/
|
|
114
|
+
|
|
115
|
+
/**
 * Map computed signals → structured AnalystFindings. Pure: no LLM, no clock
 * dependence beyond `produced_at` (overridable for deterministic tests).
 *
 * @param metrics - Behavioral metrics to turn into findings.
 * @param opts - Optional overrides.
 * @param opts.analystId - NOTE(review): presumably the analyst id stamped on
 *   each finding — confirm against the implementation.
 * @param opts.producedAt - NOTE(review): presumably the `produced_at` override
 *   mentioned above — confirm.
 * @returns The derived findings.
 */
declare function deriveEfficiencyFindings(metrics: BehavioralMetrics, opts?: {
    analystId?: string;
    producedAt?: string;
}): AnalystFinding[];
|
|
123
|
+
/**
 * The deterministic behavioral/efficiency analyst (no LLM, any-model).
 *
 * @returns An analyst that takes a `TraceAnalysisStore` as input.
 */
declare function behavioralAnalyst(): Analyst<TraceAnalysisStore>;
|
|
125
|
+
|
|
126
|
+
/**
|
|
127
|
+
* Forgiving pre-parse for analyst findings. Weak models routinely emit
|
|
128
|
+
* schema-correct content in an unusable wrapper — fenced ```json blocks, a
|
|
129
|
+
* single object where an array is expected, trailing commas. Measured: GPT-4o
|
|
130
|
+
* drops to 0% usable output purely from markdown-fence wrapping
|
|
131
|
+
* (arXiv:2605.02363). A five-line de-fence recovers most of it. This module is
|
|
132
|
+
* the de-fence/coerce step that runs BEFORE Zod, so a recoverable finding is
|
|
133
|
+
* repaired, not dropped.
|
|
134
|
+
*
|
|
135
|
+
* Pure + deterministic. No model, no network.
|
|
136
|
+
*/
|
|
137
|
+
/**
 * Strip a ```lang ... ``` (or bare ``` ... ```) code fence, if the string is one.
 *
 * @param text - Raw model output that may be wrapped in a Markdown code fence.
 * @returns The text with the fence removed. NOTE(review): presumably the input
 *   is returned as-is when it is not a fenced block — confirm.
 */
declare function stripCodeFences(text: string): string;
|
|
139
|
+
/**
 * Best-effort parse of a string into JSON. De-fences, drops trailing commas,
 * then `JSON.parse`. Returns `undefined` (never throws) when unrecoverable.
 *
 * @param text - Candidate JSON text, possibly fenced or with trailing commas.
 * @returns The parsed value, or `undefined` when the text cannot be recovered.
 */
declare function coerceJson(text: string): unknown;
|
|
144
|
+
/**
 * Coerce arbitrary actor/structurer output into an array of candidate finding
 * rows: a JSON string → parse; a single object → 1-element array; an array →
 * as-is; anything else → []. Callers still run each row through Zod
 * (`parseRawFinding`) — this only fixes the SHAPE, never invents fields.
 *
 * @param raw - Output of any shape (string, object, array, or other).
 * @returns Candidate rows; an empty array when nothing usable is found.
 */
declare function coerceToFindingRows(raw: unknown): unknown[];
|
|
151
|
+
|
|
152
|
+
/**
|
|
153
|
+
* `structureFindings` — the deferred structuring pass (DSPy TwoStepAdapter /
|
|
154
|
+
* HALO `synthesize_traces` analog). The agentic actor reasons FREE-FORM and
|
|
155
|
+
* emits a prose `report` (which any model does reliably); this separate, cheap
|
|
156
|
+
* call's ONLY job is to turn that report into `AnalystFinding[]`. Decoupling
|
|
157
|
+
* reasoning from structuring is what makes the SEMANTIC findings model-agnostic
|
|
158
|
+
* — the reasoning model never has to satisfy a strict typed-array contract
|
|
159
|
+
* while it diagnoses.
|
|
160
|
+
*
|
|
161
|
+
* Forgiving: the response runs through `coerceToFindingRows` (de-fence, lift
|
|
162
|
+
* single→array) before Zod, and on a zero-finding extraction from a substantive
|
|
163
|
+
* report it reasks ONCE with the schema restated. Returns a typed outcome so a
|
|
164
|
+
* legitimate "nothing to report" is distinguishable from a failed extraction
|
|
165
|
+
* (no silent empty).
|
|
166
|
+
*/
|
|
167
|
+
|
|
168
|
+
/** Options for `structureFindings`. */
interface StructureFindingsOptions {
    /** The actor's free-form diagnosis prose. */
    report: string;
    /** NOTE(review): presumably the analyst id stamped on every extracted finding — confirm. */
    analystId: string;
    /** Coarse classification stamped on every extracted finding. */
    area: string;
    /** NOTE(review): presumably the model used for the structuring call — confirm. */
    model: string;
    /** NOTE(review): presumably the base URL of the endpoint serving `model` — confirm. */
    baseUrl: string;
    /** NOTE(review): presumably the credential for `baseUrl`; optional — confirm. */
    apiKey?: string;
    /** Max reask attempts after a zero/invalid extraction. Default 1. */
    maxReasks?: number;
    /** Test seam: inject a fetch (no network in unit tests). */
    fetchImpl?: LlmClientOptions['fetch'];
}
|
|
182
|
+
/** Typed outcome of `structureFindings`. */
interface StructureFindingsResult {
    /** Findings extracted from the report; may be empty. */
    findings: AnalystFinding[];
    /**
     * Distinguishes a legitimate "nothing to report" ('ok') from a failed
     * extraction ('extraction_failed'), so an empty `findings` is never silent.
     */
    outcome: 'ok' | 'extraction_failed';
}
|
|
186
|
+
/**
 * Turn a free-form diagnosis report into structured findings.
 * Resolves to the extracted findings together with an `outcome` flag
 * ('ok' or 'extraction_failed').
 */
declare function structureFindings(opts: StructureFindingsOptions): Promise<StructureFindingsResult>;
|
|
187
|
+
|
|
188
|
+
/**
|
|
189
|
+
* Pre-curated tool subsets for analyst kinds.
|
|
190
|
+
*
|
|
191
|
+
* The full trace-analyst tool set is seven functions. Most kinds only
|
|
192
|
+
* need three or four. Picking from named groups instead of importing
|
|
193
|
+
* the whole bundle keeps every kind's actor-context budget tight and
|
|
194
|
+
* makes "what can this analyst see?" obvious at registration time.
|
|
195
|
+
*
|
|
196
|
+
* Each function in the group keeps its full `name`/`description` from
|
|
197
|
+
* `buildTraceAnalystTools` — we filter, we don't re-implement.
|
|
198
|
+
*/
|
|
199
|
+
|
|
200
|
+
/**
 * Named tool sets. A kind selects one by name, e.g.
 * `buildTraceToolsForGroup('discoveryAndRead', store)`.
 */
type TraceToolGroupName =
    /** All seven tools. Use for open-ended discovery kinds. */
    'all'
    /** Overview + paginated query + count. No deep reads. Cheap. */
    | 'discovery'
    /** Discovery + viewTrace + viewSpans. Deep-read but no regex search. */
    | 'discoveryAndRead'
    /** Discovery + search tools. For pattern-matching across many traces. */
    | 'discoveryAndSearch'
    /** Discovery + viewSpans + searchSpan. Targeted-span work after another kind narrows down. */
    | 'targeted';
|
|
212
|
+
/**
 * Build the tool set for a named group bound to a specific trace store.
 *
 * `all` returns every tool. Other groups filter `buildTraceAnalystTools`
 * by name to the documented subset. An unrecognised group name throws —
 * silently returning all tools would defeat the cost-control point.
 *
 * @param group - Name of the tool group to build.
 * @param store - Trace store the returned tools are bound to.
 * @returns The tools belonging to `group`.
 * @throws When `group` is not a recognised group name.
 */
declare function buildTraceToolsForGroup(group: TraceToolGroupName, store: TraceAnalysisStore): AxFunction[];
|
|
220
|
+
|
|
221
|
+
export { Analyst, AnalystFinding, AnalystSeverity, type JudgeAdapterOpts, type RunCriticAdapterOpts, type SemanticConceptJudgeAdapterOpts, type StructureFindingsOptions, type StructureFindingsResult, type TraceAnalystAdapterOpts, type TraceToolGroupName, type VerifierAdapterOpts, behavioralAnalyst, buildTraceToolsForGroup, coerceJson, coerceToFindingRows, createJudgeAdapter, createRunCriticAdapter, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createVerifierAdapter, deriveEfficiencyFindings, liftSeverity, stripCodeFences, structureFindings };
|
|
@@ -0,0 +1,371 @@
|
|
|
1
|
+
import {
|
|
2
|
+
FindingsStore,
|
|
3
|
+
RunCritic,
|
|
4
|
+
SEMANTIC_CONCEPT_JUDGE_VERSION,
|
|
5
|
+
SKILL_USAGE_ANALYST,
|
|
6
|
+
SkillUsageAnalyst,
|
|
7
|
+
behavioralAnalyst,
|
|
8
|
+
buildDefaultAnalystRegistry,
|
|
9
|
+
buildSkillUsageReport,
|
|
10
|
+
createAnalystAi,
|
|
11
|
+
createChatClient,
|
|
12
|
+
defaultIsMaterial,
|
|
13
|
+
deriveEfficiencyFindings,
|
|
14
|
+
diffFindings,
|
|
15
|
+
emitSkillUsageFindings,
|
|
16
|
+
runSemanticConceptJudge
|
|
17
|
+
} from "../chunk-7W4SM7FD.js";
|
|
18
|
+
import {
|
|
19
|
+
ANALYST_SEVERITIES,
|
|
20
|
+
AnalystRegistry,
|
|
21
|
+
DEFAULT_TRACE_ANALYST_KINDS,
|
|
22
|
+
FAILURE_MODE_KIND_SPEC,
|
|
23
|
+
FINDING_SUBJECT_GRAMMAR_PROMPT,
|
|
24
|
+
FINDING_SUBJECT_KINDS,
|
|
25
|
+
FindingSubjectStringSchema,
|
|
26
|
+
IMPROVEMENT_KIND_SPEC,
|
|
27
|
+
KIND_EXPECTED_SUBJECTS,
|
|
28
|
+
KNOWLEDGE_GAP_KIND_SPEC,
|
|
29
|
+
KNOWLEDGE_POISONING_KIND_SPEC,
|
|
30
|
+
RAW_FINDING_SCHEMA_PROMPT,
|
|
31
|
+
RawAnalystFindingSchema,
|
|
32
|
+
buildTraceToolsForGroup,
|
|
33
|
+
coerceJson,
|
|
34
|
+
coerceToFindingRows,
|
|
35
|
+
computeFindingId,
|
|
36
|
+
createTraceAnalystKind,
|
|
37
|
+
makeFinding,
|
|
38
|
+
parseFindingSubject,
|
|
39
|
+
parseRawFinding,
|
|
40
|
+
renderFindingSubject,
|
|
41
|
+
renderPriorFindings,
|
|
42
|
+
stripCodeFences,
|
|
43
|
+
structureFindings
|
|
44
|
+
} from "../chunk-WYIHD6EB.js";
|
|
45
|
+
import "../chunk-IHDHUN2X.js";
|
|
46
|
+
import {
|
|
47
|
+
analyzeTraces
|
|
48
|
+
} from "../chunk-VUINJM5M.js";
|
|
49
|
+
import "../chunk-PC4UYEBM.js";
|
|
50
|
+
import "../chunk-3BFEG2F6.js";
|
|
51
|
+
import "../chunk-PZ5AY32C.js";
|
|
52
|
+
|
|
53
|
+
// src/analyst/adapters.ts
|
|
54
|
+
// Adapter revision tag; interpolated into the `version` string of every adapter built in this module.
var ADAPTER_REV = "1";
|
|
55
|
+
/**
 * Translate a verifier severity label into the analyst severity vocabulary.
 *
 * "critical" and "info" keep their names, "major" becomes "high" and "minor"
 * becomes "medium". Any other value yields `undefined`.
 *
 * @param {string} s - Verifier severity label.
 * @returns {string | undefined} The matching analyst severity label.
 */
function liftSeverity(s) {
  if (s === "critical" || s === "info") {
    return s;
  }
  if (s === "major") {
    return "high";
  }
  return s === "minor" ? "medium" : undefined;
}
|
|
67
|
+
/**
 * Wrap the legacy `analyzeTraces` flow as an analyst over a trace store.
 *
 * Questions are asked one after another. Each bullet in `result.findings`
 * becomes a "medium" / 0.6-confidence finding; a question with no bullets
 * yields a single "info" / 0.5-confidence finding built from the answer text.
 *
 * @param {object} opts - Adapter options (`questions` and `ai` are required).
 * @returns {object} Analyst with `inputKind: "trace-store"`.
 */
function createTraceAnalystAdapter(opts) {
  const id = opts.id ?? "trace-analyst";
  const area = opts.area ?? "agent-reasoning";
  return {
    id,
    description: "Runs the agent-eval trace analyst over an OTLP trace store and lifts its bulleted findings.",
    inputKind: "trace-store",
    cost: { kind: "llm", models: opts.model ? [opts.model] : void 0 },
    version: `trace-analyst-${ADAPTER_REV}`,
    async analyze(store, ctx) {
      const out = [];
      for (const question of opts.questions) {
        // Stop asking once the run is aborted; findings gathered so far are still returned.
        if (ctx.signal?.aborted) break;
        const result = await analyzeTraces(
          { question },
          // `extra` is spread first so a stray source/ai/model key in it can never
          // displace the store, AI service, or model this adapter was configured with.
          { ...opts.extra, source: store, ai: opts.ai, model: opts.model }
        );
        // A caller-supplied subject tag wins; otherwise use the first 60 chars of the question.
        const subject = ctx.tags?.subject ?? question.slice(0, 60);
        if (result.findings.length === 0) {
          // No bullets: surface the answer itself as one informational finding.
          out.push(
            makeFinding({
              analyst_id: id,
              area,
              subject,
              claim: result.answer.slice(0, 200),
              rationale: result.answer,
              severity: "info",
              confidence: 0.5,
              evidence_refs: [],
              metadata: {
                actor_prompt_version: result.actorPromptVersion,
                turns: result.turnCount
              }
            })
          );
          continue;
        }
        result.findings.forEach((claim, i) => {
          out.push(
            makeFinding({
              analyst_id: id,
              area,
              subject,
              claim,
              // The full answer is attached to the first bullet only.
              rationale: i === 0 ? result.answer : void 0,
              severity: "medium",
              confidence: 0.6,
              evidence_refs: [],
              metadata: { question, turns: result.turnCount, finding_index: i }
            })
          );
        });
      }
      return out;
    }
  };
}
|
|
124
|
+
/**
 * Wrap a MultiLayerVerifier as an analyst.
 *
 * Every finding reported by a layer is lifted through `liftLayerFinding`, and
 * each layer whose status is "fail", "error" or "timeout" adds one extra
 * finding describing that status.
 *
 * @param {object} opts - Adapter options (`verifier` is required).
 * @returns {object} Analyst with `inputKind: "custom"`.
 */
function createVerifierAdapter(opts) {
  const id = opts.id ?? "multi-layer-verifier";
  const area = opts.area ?? "verification";
  return {
    id,
    description: "Runs a MultiLayerVerifier and lifts each layer's findings into the analyst envelope.",
    inputKind: "custom",
    cost: { kind: "deterministic" },
    version: `verifier-${ADAPTER_REV}`,
    async analyze(env, ctx) {
      // Options are spread first so the env handed to analyze() always wins,
      // even if an untyped caller left an `env` key in `opts.options`.
      const report = await opts.verifier.run({ ...opts.options, env });
      const out = [];
      for (const layer of report.layers) {
        for (const finding of layer.findings) {
          out.push(liftLayerFinding(id, area, layer.layer, finding));
        }
        if (layer.status === "fail" || layer.status === "error" || layer.status === "timeout") {
          out.push(
            makeFinding({
              analyst_id: id,
              area,
              subject: layer.layer,
              claim: `layer "${layer.layer}" ${layer.status}: ${layer.reason ?? "no reason given"}`,
              // Within this branch only a timeout is graded "medium"; fail and error are "high".
              severity: layer.status === "timeout" ? "medium" : "high",
              confidence: 1,
              evidence_refs: [],
              metadata: {
                layer_status: layer.status,
                duration_ms: layer.durationMs,
                score: layer.score,
                diagnostics: layer.diagnostics
              }
            })
          );
        }
      }
      ctx.log?.("verifier complete", {
        layers: report.layers.length,
        blended: report.blendedScore,
        all_pass: report.allPass
      });
      return out;
    }
  };
}
|
|
169
|
+
/**
 * Convert one verifier layer finding into an analyst finding.
 *
 * @param {string} analyst_id - Id of the analyst emitting the finding.
 * @param {string} area - Area stamped on the finding.
 * @param {string} layer - Name of the enclosing layer; used as the subject when
 *   the finding carries no `layer` of its own.
 * @param {object} f - Layer finding (`message`, `severity`, optional `layer`,
 *   `evidence`, `detail`).
 * @returns {*} Whatever `makeFinding` returns for the lifted fields.
 */
function liftLayerFinding(analyst_id, area, layer, f) {
  const subject = f.layer ?? layer;
  // Inline evidence becomes a single artifact reference; without it the list stays empty.
  const evidence_refs = [];
  if (f.evidence) {
    evidence_refs.push({ kind: "artifact", uri: "inline:evidence", excerpt: f.evidence });
  }
  return makeFinding({
    analyst_id,
    area,
    subject,
    claim: f.message,
    severity: liftSeverity(f.severity),
    confidence: 0.85,
    evidence_refs,
    metadata: f.detail
  });
}
|
|
181
|
+
/**
 * Wrap a RunCritic as an analyst over a single run trace.
 *
 * Each numeric score dimension below `threshold` yields one finding with a
 * fixed severity and message; a drift finding is added when
 * `driftPenalty > 1 - threshold`.
 *
 * @param {object} [opts] - Optional `id`, `area`, `critic` and `threshold` (default 0.5).
 * @returns {object} Analyst with `inputKind: "custom"`.
 */
function createRunCriticAdapter(opts = {}) {
  const id = opts.id ?? "run-critic";
  const area = opts.area ?? "run-quality";
  const critic = opts.critic ?? new RunCritic();
  const threshold = opts.threshold ?? 0.5;
  // Score dimension → severity and claim reported when it falls below the threshold.
  const dimensionRules = [
    { dim: "success", severity: "critical", claim: "run did not complete successfully" },
    { dim: "goalProgress", severity: "high", claim: "goal progress is low" },
    { dim: "repoGroundedness", severity: "high", claim: "output is poorly grounded in the repository" },
    { dim: "toolUseQuality", severity: "medium", claim: "tool use quality is low" },
    { dim: "patchQuality", severity: "medium", claim: "no real patch/edit evidence" },
    { dim: "testReality", severity: "high", claim: "no real test/build evidence" },
    { dim: "finalGate", severity: "critical", claim: "final gate is blocking" }
  ];
  return {
    id,
    description: "Scores a single run across success / grounding / drift / tool-quality and surfaces below-threshold dimensions.",
    inputKind: "custom",
    cost: { kind: "deterministic" },
    version: `run-critic-${ADAPTER_REV}`,
    async analyze(trace) {
      const score = critic.scoreTrace(trace);
      const out = [];
      for (const { dim, severity, claim } of dimensionRules) {
        const value = score[dim];
        // Non-numeric dimensions and those at or above the threshold are not reported.
        if (typeof value !== "number" || !(value < threshold)) {
          continue;
        }
        out.push(
          makeFinding({
            analyst_id: id,
            area,
            subject: dim,
            claim,
            rationale: `${dim}=${value.toFixed(2)} below threshold ${threshold}`,
            severity,
            confidence: 1,
            evidence_refs: [],
            metadata: { dimension: dim, value, threshold, run_id: trace.run.runId }
          })
        );
      }
      const driftLimit = 1 - threshold;
      if (score.driftPenalty > driftLimit) {
        out.push(
          makeFinding({
            analyst_id: id,
            area,
            subject: "drift",
            claim: "agent output drifted from repository signal",
            rationale: `driftPenalty=${score.driftPenalty.toFixed(2)}`,
            severity: "medium",
            confidence: 0.9,
            evidence_refs: [],
            metadata: { drift_penalty: score.driftPenalty, notes: score.notes }
          })
        );
      }
      return out;
    }
  };
}
|
|
241
|
+
/**
 * Wrap a JudgeFn as an analyst.
 *
 * The judge is called with `opts.tcloud` and the analyze input; every score
 * whose 0-10 normalised value is below `threshold` (default 6) is lifted into
 * a finding.
 *
 * @param {object} opts - Adapter options (`judge` and `tcloud` are required).
 * @returns {object} Analyst with `inputKind: "judge-input"`.
 */
function createJudgeAdapter(opts) {
  const id = opts.id ?? "judge";
  const area = opts.area ?? "judge";
  const threshold = opts.threshold ?? 6;
  const isBelowThreshold = (entry) => normalize10(entry.score) < threshold;
  const toFinding = (entry) => liftJudgeScore(id, area, entry);
  return {
    id,
    description: "Wraps an agent-eval JudgeFn into an analyst; below-threshold dimensions surface as findings.",
    inputKind: "judge-input",
    cost: opts.cost ?? { kind: "llm" },
    version: `judge-${ADAPTER_REV}`,
    async analyze(input) {
      const scores = await opts.judge(opts.tcloud, input);
      const failing = scores.filter(isBelowThreshold);
      return failing.map(toFinding);
    }
  };
}
|
|
257
|
+
/**
 * Bring a judge score onto a 0-10 scale.
 *
 * Values of 1 or less are treated as 0-1 fractions and multiplied by 10;
 * anything larger is returned unchanged.
 *
 * NOTE(review): a score of exactly 1 is scaled to 10, so a judge that already
 * reports on a 0-10 scale and returns 1 would be read as 10 — confirm which
 * scale callers use.
 *
 * @param {number} s - Raw score.
 * @returns {number} Score on a 0-10 scale.
 */
function normalize10(s) {
  if (s <= 1) {
    return s * 10;
  }
  return s;
}
|
|
260
|
+
/**
 * Convert one judge score into an analyst finding.
 *
 * Severity follows the 0-10 normalised score: below 3 is "critical", below 5
 * is "high", below 7 is "medium", anything else is "low".
 *
 * @param {string} analyst_id - Id of the analyst emitting the finding.
 * @param {string} area - Area stamped on the finding.
 * @param {object} s - Judge score (`score`, `dimension`, `judgeName`,
 *   `reasoning`, optional `evidence`).
 * @returns {*} Whatever `makeFinding` returns for the lifted fields.
 */
function liftJudgeScore(analyst_id, area, s) {
  const score10 = normalize10(s.score);
  let severity;
  if (score10 < 3) {
    severity = "critical";
  } else if (score10 < 5) {
    severity = "high";
  } else if (score10 < 7) {
    severity = "medium";
  } else {
    severity = "low";
  }
  // Inline evidence becomes a single artifact reference; without it the list stays empty.
  const evidence_refs = s.evidence
    ? [{ kind: "artifact", uri: "inline:evidence", excerpt: s.evidence }]
    : [];
  return makeFinding({
    analyst_id,
    area,
    subject: s.dimension,
    claim: `${s.judgeName}/${s.dimension} scored ${score10.toFixed(1)}/10`,
    rationale: s.reasoning,
    severity,
    confidence: 0.8,
    evidence_refs,
    metadata: { judge_name: s.judgeName, dimension: s.dimension, score_10: score10 }
  });
}
|
|
275
|
+
/**
 * Wrap the semantic-concept judge as an analyst.
 *
 * When the judge reports itself unavailable, a single "info" finding carrying
 * the error is returned. Otherwise every concept that is missing, or present
 * with a score below 7, becomes one finding.
 *
 * @param {object} [opts] - Optional `id`, `area` and judge `options`.
 * @returns {object} Analyst with `inputKind: "custom"`.
 */
function createSemanticConceptJudgeAdapter(opts = {}) {
  const id = opts.id ?? "semantic-concept-judge";
  const area = opts.area ?? "concept-coverage";
  return {
    id,
    description: "Runs the semantic-concept judge and surfaces missing / weak concepts as findings.",
    inputKind: "custom",
    cost: { kind: "llm", models: opts.options?.model ? [opts.options.model] : void 0 },
    version: `${SEMANTIC_CONCEPT_JUDGE_VERSION}-adapter-${ADAPTER_REV}`,
    async analyze(input) {
      const result = await runSemanticConceptJudge(input, opts.options);
      if (!result.available) {
        return [
          makeFinding({
            analyst_id: id,
            area,
            claim: "semantic-concept judge unavailable",
            rationale: result.error,
            severity: "info",
            confidence: 1,
            evidence_refs: [],
            metadata: { reason: result.error }
          })
        ];
      }
      const out = [];
      for (const f of result.findings) {
        // Concepts that are present and score 7 or more need no finding.
        if (f.present && f.score >= 7) continue;
        out.push(
          makeFinding({
            analyst_id: id,
            area,
            subject: f.concept,
            claim: f.present ? `concept "${f.concept}" is weak (${f.score}/10)` : `concept "${f.concept}" is missing`,
            rationale: f.evidence,
            severity: liftSeverity(f.severity),
            confidence: 0.85,
            // Only attach an evidence ref when the judge supplied evidence, matching
            // liftLayerFinding / liftJudgeScore; an empty excerpt is not evidence.
            evidence_refs: f.evidence ? [{ kind: "artifact", uri: "inline:evidence", excerpt: f.evidence }] : [],
            metadata: {
              concept: f.concept,
              present: f.present,
              score_10: f.score,
              cost_usd: result.costUsd ?? void 0
            }
          })
        );
      }
      return out;
    }
  };
}
|
|
326
|
+
export {
|
|
327
|
+
ANALYST_SEVERITIES,
|
|
328
|
+
AnalystRegistry,
|
|
329
|
+
DEFAULT_TRACE_ANALYST_KINDS,
|
|
330
|
+
FAILURE_MODE_KIND_SPEC,
|
|
331
|
+
FINDING_SUBJECT_GRAMMAR_PROMPT,
|
|
332
|
+
FINDING_SUBJECT_KINDS,
|
|
333
|
+
FindingSubjectStringSchema,
|
|
334
|
+
FindingsStore,
|
|
335
|
+
IMPROVEMENT_KIND_SPEC,
|
|
336
|
+
KIND_EXPECTED_SUBJECTS,
|
|
337
|
+
KNOWLEDGE_GAP_KIND_SPEC,
|
|
338
|
+
KNOWLEDGE_POISONING_KIND_SPEC,
|
|
339
|
+
RAW_FINDING_SCHEMA_PROMPT,
|
|
340
|
+
RawAnalystFindingSchema,
|
|
341
|
+
SKILL_USAGE_ANALYST,
|
|
342
|
+
SkillUsageAnalyst,
|
|
343
|
+
behavioralAnalyst,
|
|
344
|
+
buildDefaultAnalystRegistry,
|
|
345
|
+
buildSkillUsageReport,
|
|
346
|
+
buildTraceToolsForGroup,
|
|
347
|
+
coerceJson,
|
|
348
|
+
coerceToFindingRows,
|
|
349
|
+
computeFindingId,
|
|
350
|
+
createAnalystAi,
|
|
351
|
+
createChatClient,
|
|
352
|
+
createJudgeAdapter,
|
|
353
|
+
createRunCriticAdapter,
|
|
354
|
+
createSemanticConceptJudgeAdapter,
|
|
355
|
+
createTraceAnalystAdapter,
|
|
356
|
+
createTraceAnalystKind,
|
|
357
|
+
createVerifierAdapter,
|
|
358
|
+
defaultIsMaterial,
|
|
359
|
+
deriveEfficiencyFindings,
|
|
360
|
+
diffFindings,
|
|
361
|
+
emitSkillUsageFindings,
|
|
362
|
+
liftSeverity,
|
|
363
|
+
makeFinding,
|
|
364
|
+
parseFindingSubject,
|
|
365
|
+
parseRawFinding,
|
|
366
|
+
renderFindingSubject,
|
|
367
|
+
renderPriorFindings,
|
|
368
|
+
stripCodeFences,
|
|
369
|
+
structureFindings
|
|
370
|
+
};
|
|
371
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/analyst/adapters.ts"],"sourcesContent":["/**\n * Adapter factories — lift each existing agent-eval primitive into the\n * Analyst contract without re-implementing it.\n *\n * Five primitives, five factories. Each one:\n * - Builds an Analyst with a stable id (caller chooses; defaults\n * given), a sensible default `inputKind`, a version derived from\n * the wrapped primitive's version + an adapter revision, and an\n * `analyze()` that calls the primitive and lifts its output to\n * AnalystFinding[] using `makeFinding()`.\n * - Maps severities: the existing `Severity` ('critical' | 'major' |\n * 'minor' | 'info') projects onto AnalystSeverity ('critical' |\n * 'high' | 'medium' | 'low' | 'info'); 'major' → 'high', 'minor' →\n * 'medium'. Domain analysts that want finer-grained mapping override.\n *\n * Adapters never own state. Calling the same factory twice with the\n * same primitive instance is safe.\n */\n\nimport type { AxAIService } from '@ax-llm/ax'\nimport type {\n Finding as LayerFinding,\n Severity as LayerSeverity,\n MultiLayerVerifier,\n VerifyOptions,\n} from '../multi-layer-verifier'\nimport { RunCritic, type RunTrace } from '../run-critic'\nimport {\n runSemanticConceptJudge,\n SEMANTIC_CONCEPT_JUDGE_VERSION,\n type SemanticConceptJudgeInput,\n type SemanticConceptJudgeOptions,\n} from '../semantic-concept-judge'\nimport { type AnalyzeTracesOptions, analyzeTraces } from '../trace-analyst/analyst'\nimport type { TraceAnalysisStore } from '../trace-analyst/store'\nimport type { JudgeFn, JudgeInput, JudgeScore, TCloud } from '../types'\nimport type { Analyst, AnalystFinding, AnalystSeverity } from './types'\nimport { makeFinding } from './types'\n\nconst ADAPTER_REV = '1'\n\n// ── Severity bridges ───────────────────────────────────────────────\n\nexport function liftSeverity(s: LayerSeverity): AnalystSeverity {\n switch (s) {\n case 'critical':\n return 'critical'\n case 'major':\n return 'high'\n case 'minor':\n 
return 'medium'\n case 'info':\n return 'info'\n }\n}\n\n// ── 1. analyzeTraces → Analyst ─────────────────────────────────────\n\nexport interface TraceAnalystAdapterOpts {\n id?: string\n area?: string\n /** The natural-language question(s) put to the analyst. One finding per question. */\n questions: string[]\n /** Caller-provided AxAI service — same one trace-analyst.ts expects. */\n ai: AxAIService\n model?: string\n /** Forwarded to analyzeTraces. */\n extra?: Omit<AnalyzeTracesOptions, 'source' | 'ai' | 'model'>\n}\n\n/**\n * @deprecated Prefer `createTraceAnalystKind` + one of the failure /\n * improvement kinds from `./kinds`. This adapter wraps the legacy\n * `analyzeTraces` flow whose output is `findings:string[]` — every\n * bullet gets flat-defaulted severity `medium` / confidence `0.6`,\n * which loses the per-finding grading kinds provide via Ax structured\n * output + Zod validation. Kept for one minor while consumers migrate.\n */\nexport function createTraceAnalystAdapter(\n opts: TraceAnalystAdapterOpts,\n): Analyst<TraceAnalysisStore> {\n const id = opts.id ?? 'trace-analyst'\n const area = opts.area ?? 'agent-reasoning'\n return {\n id,\n description:\n 'Runs the agent-eval trace analyst over an OTLP trace store and lifts its bulleted findings.',\n inputKind: 'trace-store',\n cost: { kind: 'llm', models: opts.model ? [opts.model] : undefined },\n version: `trace-analyst-${ADAPTER_REV}`,\n async analyze(store, ctx) {\n const out: AnalystFinding[] = []\n for (const question of opts.questions) {\n if (ctx.signal?.aborted) break\n const result = await analyzeTraces(\n { question },\n { source: store, ai: opts.ai, model: opts.model, ...opts.extra },\n )\n const subject = ctx.tags?.subject ?? question.slice(0, 60)\n // The responder produces a list of bullet strings. 
Each becomes\n // one finding; the prose answer is attached as rationale on the\n // first (so renderers that show only top-N still get context).\n if (result.findings.length === 0) {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject,\n claim: result.answer.slice(0, 200),\n rationale: result.answer,\n severity: 'info',\n confidence: 0.5,\n evidence_refs: [],\n metadata: {\n actor_prompt_version: result.actorPromptVersion,\n turns: result.turnCount,\n },\n }),\n )\n continue\n }\n result.findings.forEach((claim, i) => {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject,\n claim,\n rationale: i === 0 ? result.answer : undefined,\n severity: 'medium',\n confidence: 0.6,\n evidence_refs: [],\n metadata: { question, turns: result.turnCount, finding_index: i },\n }),\n )\n })\n }\n return out\n },\n }\n}\n\n// ── 2. MultiLayerVerifier → Analyst ─────────────────────────────────\n\nexport interface VerifierAdapterOpts<Env> {\n id?: string\n area?: string\n verifier: MultiLayerVerifier<Env>\n /**\n * The verifier expects an `env` per run. Adapters take it from\n * `AnalystRunInputs.custom[<id>]` via the registry's 'custom' routing.\n */\n options?: Omit<VerifyOptions<Env>, 'env'>\n}\n\nexport function createVerifierAdapter<Env>(opts: VerifierAdapterOpts<Env>): Analyst<Env> {\n const id = opts.id ?? 'multi-layer-verifier'\n const area = opts.area ?? 
'verification'\n return {\n id,\n description:\n \"Runs a MultiLayerVerifier and lifts each layer's findings into the analyst envelope.\",\n inputKind: 'custom',\n cost: { kind: 'deterministic' },\n version: `verifier-${ADAPTER_REV}`,\n async analyze(env, ctx) {\n const report = await opts.verifier.run({ env, ...opts.options })\n const out: AnalystFinding[] = []\n for (const layer of report.layers) {\n for (const finding of layer.findings) {\n out.push(liftLayerFinding(id, area, layer.layer, finding))\n }\n // Layer-level signal: a failed/error layer is itself a finding\n // even if it didn't emit per-finding rows.\n if (layer.status === 'fail' || layer.status === 'error' || layer.status === 'timeout') {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: layer.layer,\n claim: `layer \"${layer.layer}\" ${layer.status}: ${layer.reason ?? 'no reason given'}`,\n severity:\n layer.status === 'error' ? 'high' : layer.status === 'timeout' ? 'medium' : 'high',\n confidence: 1,\n evidence_refs: [],\n metadata: {\n layer_status: layer.status,\n duration_ms: layer.durationMs,\n score: layer.score,\n diagnostics: layer.diagnostics,\n },\n }),\n )\n }\n }\n ctx.log?.('verifier complete', {\n layers: report.layers.length,\n blended: report.blendedScore,\n all_pass: report.allPass,\n })\n return out\n },\n }\n}\n\nfunction liftLayerFinding(\n analyst_id: string,\n area: string,\n layer: string,\n f: LayerFinding,\n): AnalystFinding {\n return makeFinding({\n analyst_id,\n area,\n subject: f.layer ?? layer,\n claim: f.message,\n severity: liftSeverity(f.severity),\n confidence: 0.85,\n evidence_refs: f.evidence\n ? [{ kind: 'artifact', uri: 'inline:evidence', excerpt: f.evidence }]\n : [],\n metadata: f.detail,\n })\n}\n\n// ── 3. RunCritic → Analyst ──────────────────────────────────────────\n\nexport interface RunCriticAdapterOpts {\n id?: string\n area?: string\n critic?: RunCritic\n /** Optional threshold below which a dimension is reported as a finding. 
Default 0.5. */\n threshold?: number\n}\n\nexport function createRunCriticAdapter(opts: RunCriticAdapterOpts = {}): Analyst<RunTrace> {\n const id = opts.id ?? 'run-critic'\n const area = opts.area ?? 'run-quality'\n const critic = opts.critic ?? new RunCritic()\n const threshold = opts.threshold ?? 0.5\n return {\n id,\n description:\n 'Scores a single run across success / grounding / drift / tool-quality and surfaces below-threshold dimensions.',\n inputKind: 'custom',\n cost: { kind: 'deterministic' },\n version: `run-critic-${ADAPTER_REV}`,\n async analyze(trace) {\n const score = critic.scoreTrace(trace)\n const out: AnalystFinding[] = []\n const dims: Array<[keyof typeof score, AnalystSeverity, string]> = [\n ['success', 'critical', 'run did not complete successfully'],\n ['goalProgress', 'high', 'goal progress is low'],\n ['repoGroundedness', 'high', 'output is poorly grounded in the repository'],\n ['toolUseQuality', 'medium', 'tool use quality is low'],\n ['patchQuality', 'medium', 'no real patch/edit evidence'],\n ['testReality', 'high', 'no real test/build evidence'],\n ['finalGate', 'critical', 'final gate is blocking'],\n ]\n for (const [dim, sev, msg] of dims) {\n const value = score[dim] as number\n if (typeof value === 'number' && value < threshold) {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: dim,\n claim: msg,\n rationale: `${dim}=${value.toFixed(2)} below threshold ${threshold}`,\n severity: sev,\n confidence: 1,\n evidence_refs: [],\n metadata: { dimension: dim, value, threshold, run_id: trace.run.runId },\n }),\n )\n }\n }\n // Drift penalty is high → surface as a finding (inverse threshold).\n if (score.driftPenalty > 1 - threshold) {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: 'drift',\n claim: 'agent output drifted from repository signal',\n rationale: `driftPenalty=${score.driftPenalty.toFixed(2)}`,\n severity: 'medium',\n confidence: 0.9,\n evidence_refs: [],\n metadata: { drift_penalty: 
score.driftPenalty, notes: score.notes },\n }),\n )\n }\n return out\n },\n }\n}\n\n// ── 4. JudgeFn → Analyst ────────────────────────────────────────────\n\nexport interface JudgeAdapterOpts {\n id?: string\n area?: string\n judge: JudgeFn\n /** TCloud handle the JudgeFn calls. */\n tcloud: TCloud\n /** Optional cost classification — most judges call an LLM. */\n cost?: Analyst['cost']\n /** Optional threshold below which a JudgeScore becomes a finding. Default 6 (on 0-10 scale). */\n threshold?: number\n}\n\nexport function createJudgeAdapter(opts: JudgeAdapterOpts): Analyst<JudgeInput> {\n const id = opts.id ?? 'judge'\n const area = opts.area ?? 'judge'\n const threshold = opts.threshold ?? 6\n return {\n id,\n description:\n 'Wraps an agent-eval JudgeFn into an analyst; below-threshold dimensions surface as findings.',\n inputKind: 'judge-input',\n cost: opts.cost ?? { kind: 'llm' },\n version: `judge-${ADAPTER_REV}`,\n async analyze(input) {\n const scores = await opts.judge(opts.tcloud, input)\n return scores\n .filter((s) => normalize10(s.score) < threshold)\n .map((s) => liftJudgeScore(id, area, s))\n },\n }\n}\n\nfunction normalize10(s: number): number {\n // JudgeScore convention is 0-10 but some judges emit 0-1. Coerce to 0-10.\n return s <= 1 ? s * 10 : s\n}\n\nfunction liftJudgeScore(analyst_id: string, area: string, s: JudgeScore): AnalystFinding {\n const score10 = normalize10(s.score)\n const severity: AnalystSeverity =\n score10 < 3 ? 'critical' : score10 < 5 ? 'high' : score10 < 7 ? 'medium' : 'low'\n return makeFinding({\n analyst_id,\n area,\n subject: s.dimension,\n claim: `${s.judgeName}/${s.dimension} scored ${score10.toFixed(1)}/10`,\n rationale: s.reasoning,\n severity,\n confidence: 0.8,\n evidence_refs: s.evidence\n ? [{ kind: 'artifact', uri: 'inline:evidence', excerpt: s.evidence }]\n : [],\n metadata: { judge_name: s.judgeName, dimension: s.dimension, score_10: score10 },\n })\n}\n\n// ── 5. 
SemanticConceptJudge → Analyst ──────────────────────────────\n\nexport interface SemanticConceptJudgeAdapterOpts {\n id?: string\n area?: string\n options?: SemanticConceptJudgeOptions\n}\n\nexport function createSemanticConceptJudgeAdapter(\n opts: SemanticConceptJudgeAdapterOpts = {},\n): Analyst<SemanticConceptJudgeInput> {\n const id = opts.id ?? 'semantic-concept-judge'\n const area = opts.area ?? 'concept-coverage'\n return {\n id,\n description:\n 'Runs the semantic-concept judge and surfaces missing / weak concepts as findings.',\n inputKind: 'custom',\n cost: { kind: 'llm', models: opts.options?.model ? [opts.options.model] : undefined },\n version: `${SEMANTIC_CONCEPT_JUDGE_VERSION}-adapter-${ADAPTER_REV}`,\n async analyze(input) {\n const result = await runSemanticConceptJudge(input, opts.options)\n if (!result.available) {\n return [\n makeFinding({\n analyst_id: id,\n area,\n claim: 'semantic-concept judge unavailable',\n rationale: result.error,\n severity: 'info',\n confidence: 1,\n evidence_refs: [],\n metadata: { reason: result.error },\n }),\n ]\n }\n const out: AnalystFinding[] = []\n for (const f of result.findings) {\n // Only surface gaps: missing concepts or low scores. Concepts at\n // 7+/10 with present=true are not findings — they're successes.\n if (f.present && f.score >= 7) continue\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: f.concept,\n claim: f.present\n ? `concept \"${f.concept}\" is weak (${f.score}/10)`\n : `concept \"${f.concept}\" is missing`,\n rationale: f.evidence,\n severity: liftSeverity(f.severity),\n confidence: 0.85,\n evidence_refs: [{ kind: 'artifact', uri: 'inline:evidence', excerpt: f.evidence }],\n metadata: {\n concept: f.concept,\n present: f.present,\n score_10: f.score,\n cost_usd: result.costUsd ?? 
undefined,\n },\n }),\n )\n }\n return out\n },\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAuCA,IAAM,cAAc;AAIb,SAAS,aAAa,GAAmC;AAC9D,UAAQ,GAAG;AAAA,IACT,KAAK;AACH,aAAO;AAAA,IACT,KAAK;AACH,aAAO;AAAA,IACT,KAAK;AACH,aAAO;AAAA,IACT,KAAK;AACH,aAAO;AAAA,EACX;AACF;AAwBO,SAAS,0BACd,MAC6B;AAC7B,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,OAAO,QAAQ,KAAK,QAAQ,CAAC,KAAK,KAAK,IAAI,OAAU;AAAA,IACnE,SAAS,iBAAiB,WAAW;AAAA,IACrC,MAAM,QAAQ,OAAO,KAAK;AACxB,YAAM,MAAwB,CAAC;AAC/B,iBAAW,YAAY,KAAK,WAAW;AACrC,YAAI,IAAI,QAAQ,QAAS;AACzB,cAAM,SAAS,MAAM;AAAA,UACnB,EAAE,SAAS;AAAA,UACX,EAAE,QAAQ,OAAO,IAAI,KAAK,IAAI,OAAO,KAAK,OAAO,GAAG,KAAK,MAAM;AAAA,QACjE;AACA,cAAM,UAAU,IAAI,MAAM,WAAW,SAAS,MAAM,GAAG,EAAE;AAIzD,YAAI,OAAO,SAAS,WAAW,GAAG;AAChC,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA;AAAA,cACA,OAAO,OAAO,OAAO,MAAM,GAAG,GAAG;AAAA,cACjC,WAAW,OAAO;AAAA,cAClB,UAAU;AAAA,cACV,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU;AAAA,gBACR,sBAAsB,OAAO;AAAA,gBAC7B,OAAO,OAAO;AAAA,cAChB;AAAA,YACF,CAAC;AAAA,UACH;AACA;AAAA,QACF;AACA,eAAO,SAAS,QAAQ,CAAC,OAAO,MAAM;AACpC,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA;AAAA,cACA;AAAA,cACA,WAAW,MAAM,IAAI,OAAO,SAAS;AAAA,cACrC,UAAU;AAAA,cACV,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU,EAAE,UAAU,OAAO,OAAO,WAAW,eAAe,EAAE;AAAA,YAClE,CAAC;AAAA,UACH;AAAA,QACF,CAAC;AAAA,MACH;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAeO,SAAS,sBAA2B,MAA8C;AACvF,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,gBAAgB;AAAA,IAC9B,SAAS,YAAY,WAAW;AAAA,IAChC,MAAM,QAAQ,KAAK,KAAK;AACtB,YAAM,SAAS,MAAM,KAAK,SAAS,IAAI,EAAE,KAAK,GAAG,KAAK,QAAQ,CAAC;AAC/D,YAAM,MAAwB,CAAC;AAC/B,iBAAW,SAAS,OAAO,QAAQ;AACjC,mBAAW,WAAW,MAAM,UAAU;AACpC,cAAI,KAAK,iBAAiB,IAAI,MAAM,MAAM,OAAO,OAAO,CAAC;AAAA,QAC3D;AAGA,YAAI,MAAM,WAAW,UAAU,MAAM,WAAW,WAAW,MAAM,WAAW,WAAW;AACrF,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA,SAAS,MAAM;AAAA,cACf,OAA
O,UAAU,MAAM,KAAK,KAAK,MAAM,MAAM,KAAK,MAAM,UAAU,iBAAiB;AAAA,cACnF,UACE,MAAM,WAAW,UAAU,SAAS,MAAM,WAAW,YAAY,WAAW;AAAA,cAC9E,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU;AAAA,gBACR,cAAc,MAAM;AAAA,gBACpB,aAAa,MAAM;AAAA,gBACnB,OAAO,MAAM;AAAA,gBACb,aAAa,MAAM;AAAA,cACrB;AAAA,YACF,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF;AACA,UAAI,MAAM,qBAAqB;AAAA,QAC7B,QAAQ,OAAO,OAAO;AAAA,QACtB,SAAS,OAAO;AAAA,QAChB,UAAU,OAAO;AAAA,MACnB,CAAC;AACD,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAEA,SAAS,iBACP,YACA,MACA,OACA,GACgB;AAChB,SAAO,YAAY;AAAA,IACjB;AAAA,IACA;AAAA,IACA,SAAS,EAAE,SAAS;AAAA,IACpB,OAAO,EAAE;AAAA,IACT,UAAU,aAAa,EAAE,QAAQ;AAAA,IACjC,YAAY;AAAA,IACZ,eAAe,EAAE,WACb,CAAC,EAAE,MAAM,YAAY,KAAK,mBAAmB,SAAS,EAAE,SAAS,CAAC,IAClE,CAAC;AAAA,IACL,UAAU,EAAE;AAAA,EACd,CAAC;AACH;AAYO,SAAS,uBAAuB,OAA6B,CAAC,GAAsB;AACzF,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,SAAS,KAAK,UAAU,IAAI,UAAU;AAC5C,QAAM,YAAY,KAAK,aAAa;AACpC,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,gBAAgB;AAAA,IAC9B,SAAS,cAAc,WAAW;AAAA,IAClC,MAAM,QAAQ,OAAO;AACnB,YAAM,QAAQ,OAAO,WAAW,KAAK;AACrC,YAAM,MAAwB,CAAC;AAC/B,YAAM,OAA6D;AAAA,QACjE,CAAC,WAAW,YAAY,mCAAmC;AAAA,QAC3D,CAAC,gBAAgB,QAAQ,sBAAsB;AAAA,QAC/C,CAAC,oBAAoB,QAAQ,6CAA6C;AAAA,QAC1E,CAAC,kBAAkB,UAAU,yBAAyB;AAAA,QACtD,CAAC,gBAAgB,UAAU,6BAA6B;AAAA,QACxD,CAAC,eAAe,QAAQ,6BAA6B;AAAA,QACrD,CAAC,aAAa,YAAY,wBAAwB;AAAA,MACpD;AACA,iBAAW,CAAC,KAAK,KAAK,GAAG,KAAK,MAAM;AAClC,cAAM,QAAQ,MAAM,GAAG;AACvB,YAAI,OAAO,UAAU,YAAY,QAAQ,WAAW;AAClD,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA,SAAS;AAAA,cACT,OAAO;AAAA,cACP,WAAW,GAAG,GAAG,IAAI,MAAM,QAAQ,CAAC,CAAC,oBAAoB,SAAS;AAAA,cAClE,UAAU;AAAA,cACV,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU,EAAE,WAAW,KAAK,OAAO,WAAW,QAAQ,MAAM,IAAI,MAAM;AAAA,YACxE,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF;AAEA,UAAI,MAAM,eAAe,IAAI,WAAW;AACtC,YAAI;AAAA,UACF,YAAY;AAAA,YACV,YAAY;AAAA,YACZ;AAAA,YACA,SAAS;AAAA,YACT,OAAO;AAAA,YACP,WAAW,gBAAgB,MAAM,aAAa,QAAQ,CAAC,CAAC;AAAA,YACxD,UAAU;AAAA,YACV,YAAY;AAAA,YACZ,eAAe,CAAC;AAAA,YAChB,UAAU,EAAE,eAAe,MAAM,cAAc,OAAO,MAAM
,MAAM;AAAA,UACpE,CAAC;AAAA,QACH;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAgBO,SAAS,mBAAmB,MAA6C;AAC9E,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,YAAY,KAAK,aAAa;AACpC,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,KAAK,QAAQ,EAAE,MAAM,MAAM;AAAA,IACjC,SAAS,SAAS,WAAW;AAAA,IAC7B,MAAM,QAAQ,OAAO;AACnB,YAAM,SAAS,MAAM,KAAK,MAAM,KAAK,QAAQ,KAAK;AAClD,aAAO,OACJ,OAAO,CAAC,MAAM,YAAY,EAAE,KAAK,IAAI,SAAS,EAC9C,IAAI,CAAC,MAAM,eAAe,IAAI,MAAM,CAAC,CAAC;AAAA,IAC3C;AAAA,EACF;AACF;AAEA,SAAS,YAAY,GAAmB;AAEtC,SAAO,KAAK,IAAI,IAAI,KAAK;AAC3B;AAEA,SAAS,eAAe,YAAoB,MAAc,GAA+B;AACvF,QAAM,UAAU,YAAY,EAAE,KAAK;AACnC,QAAM,WACJ,UAAU,IAAI,aAAa,UAAU,IAAI,SAAS,UAAU,IAAI,WAAW;AAC7E,SAAO,YAAY;AAAA,IACjB;AAAA,IACA;AAAA,IACA,SAAS,EAAE;AAAA,IACX,OAAO,GAAG,EAAE,SAAS,IAAI,EAAE,SAAS,WAAW,QAAQ,QAAQ,CAAC,CAAC;AAAA,IACjE,WAAW,EAAE;AAAA,IACb;AAAA,IACA,YAAY;AAAA,IACZ,eAAe,EAAE,WACb,CAAC,EAAE,MAAM,YAAY,KAAK,mBAAmB,SAAS,EAAE,SAAS,CAAC,IAClE,CAAC;AAAA,IACL,UAAU,EAAE,YAAY,EAAE,WAAW,WAAW,EAAE,WAAW,UAAU,QAAQ;AAAA,EACjF,CAAC;AACH;AAUO,SAAS,kCACd,OAAwC,CAAC,GACL;AACpC,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,OAAO,QAAQ,KAAK,SAAS,QAAQ,CAAC,KAAK,QAAQ,KAAK,IAAI,OAAU;AAAA,IACpF,SAAS,GAAG,8BAA8B,YAAY,WAAW;AAAA,IACjE,MAAM,QAAQ,OAAO;AACnB,YAAM,SAAS,MAAM,wBAAwB,OAAO,KAAK,OAAO;AAChE,UAAI,CAAC,OAAO,WAAW;AACrB,eAAO;AAAA,UACL,YAAY;AAAA,YACV,YAAY;AAAA,YACZ;AAAA,YACA,OAAO;AAAA,YACP,WAAW,OAAO;AAAA,YAClB,UAAU;AAAA,YACV,YAAY;AAAA,YACZ,eAAe,CAAC;AAAA,YAChB,UAAU,EAAE,QAAQ,OAAO,MAAM;AAAA,UACnC,CAAC;AAAA,QACH;AAAA,MACF;AACA,YAAM,MAAwB,CAAC;AAC/B,iBAAW,KAAK,OAAO,UAAU;AAG/B,YAAI,EAAE,WAAW,EAAE,SAAS,EAAG;AAC/B,YAAI;AAAA,UACF,YAAY;AAAA,YACV,YAAY;AAAA,YACZ;AAAA,YACA,SAAS,EAAE;AAAA,YACX,OAAO,EAAE,UACL,YAAY,EAAE,OAAO,cAAc,EAAE,KAAK,SAC1C,YAAY,EAAE,OAAO;AAAA,YACzB,WAAW,EAAE;AAAA,YACb,UAAU,aAAa,EAAE,QAAQ;AAAA,YACjC,YAAY;AAAA,YACZ,eAAe,CAAC,EAAE,MAAM,YAAY,KAAK,mBAAmB,SAAS,EAAE,SAAS,CAAC;AAAA,YACjF,UAAU;AAAA,cACR,SAAS,EAAE;AAAA,cACX,SAAS,EAAE;AA
AA,cACX,UAAU,EAAE;AAAA,cACZ,UAAU,OAAO,WAAW;AAAA,YAC9B;AAAA,UACF,CAAC;AAAA,QACH;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;","names":[]}
|