@tangle-network/agent-eval 0.77.0 → 0.79.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/http.d.ts +2 -2
- package/dist/adapters/langchain.d.ts +2 -2
- package/dist/adapters/otel.d.ts +4 -4
- package/dist/{agent-profile-DYRboYWu.d.ts → agent-profile-aSEaJ9Pl.d.ts} +1 -1
- package/dist/analyst/index.d.ts +42 -8
- package/dist/analyst/index.js +32 -2
- package/dist/analyst/index.js.map +1 -1
- package/dist/authenticity/index.d.ts +54 -1
- package/dist/authenticity/index.js +88 -1
- package/dist/authenticity/index.js.map +1 -1
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/campaign/index.d.ts +11 -11
- package/dist/campaign/index.js +4 -4
- package/dist/{chunk-7W4SM7FD.js → chunk-5LVWPNS5.js} +91 -91
- package/dist/chunk-5LVWPNS5.js.map +1 -0
- package/dist/{chunk-WYIHD6EB.js → chunk-CF67I6QY.js} +1 -1
- package/dist/chunk-CF67I6QY.js.map +1 -0
- package/dist/{chunk-XPILG2CA.js → chunk-GXHLRXDI.js} +2 -2
- package/dist/{chunk-F3SRAAZO.js → chunk-KWRRMR3J.js} +15 -1
- package/dist/chunk-KWRRMR3J.js.map +1 -0
- package/dist/{chunk-JYE3WOTE.js → chunk-RPLZ4OIB.js} +10 -1
- package/dist/chunk-RPLZ4OIB.js.map +1 -0
- package/dist/{chunk-6EKXFFGQ.js → chunk-RTWFUK6A.js} +2 -2
- package/dist/{chunk-XGNCBAVZ.js → chunk-XQL22JDG.js} +2 -2
- package/dist/{chunk-GJJNJVIR.js → chunk-XXNIODOM.js} +2 -2
- package/dist/contract/index.d.ts +12 -12
- package/dist/contract/index.js +2 -2
- package/dist/{control-BgA6BYTm.d.ts → control-CehLtoET.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/control.js +2 -2
- package/dist/hosted/index.d.ts +4 -4
- package/dist/{index-DsnOpCO6.d.ts → index-B1RKber3.d.ts} +1 -1
- package/dist/index.d.ts +126 -25
- package/dist/index.js +32 -7
- package/dist/index.js.map +1 -1
- package/dist/{insight-report-Df3lxYXM.d.ts → insight-report-dlpEzQDi.d.ts} +1 -1
- package/dist/{kind-factory-DW9XWPvM.d.ts → kind-factory-DqV2t1Xk.d.ts} +1 -1
- package/dist/meta-eval/index.d.ts +2 -2
- package/dist/openapi.json +1 -1
- package/dist/{provenance-B-TFszPW.d.ts → provenance-CEAJI9rm.d.ts} +3 -3
- package/dist/{registry-DuVYiTvw.d.ts → registry-BmEuU94S.d.ts} +2 -2
- package/dist/{release-report-CN8hJlhk.d.ts → release-report-CXXZlR8g.d.ts} +2 -2
- package/dist/reporting.d.ts +4 -4
- package/dist/{researcher-C_KJyIGg.d.ts → researcher-rInLj9De.d.ts} +2 -2
- package/dist/rl.d.ts +6 -6
- package/dist/rl.js +2 -2
- package/dist/{rubric-predictive-validity-D_4BSXGV.d.ts → rubric-predictive-validity-CWyWWLBg.d.ts} +1 -1
- package/dist/{run-improvement-loop-BqYH2vCR.d.ts → run-improvement-loop-Bgu4C59E.d.ts} +2 -4
- package/dist/{run-record-BgTFzO2r.d.ts → run-record-sItO5ftF.d.ts} +11 -0
- package/dist/{semantic-concept-judge-CV9Wlx4t.d.ts → semantic-concept-judge-Du4ZVyef.d.ts} +3 -3
- package/dist/{summary-report-ByiOUrHj.d.ts → summary-report-BTaXq1TS.d.ts} +1 -1
- package/dist/traces.d.ts +1 -1
- package/dist/traces.js +2 -2
- package/dist/{types-CRD68aH7.d.ts → types-DRvV0zRo.d.ts} +10 -1
- package/dist/{types-Bba0vl1V.d.ts → types-QHG0KnkF.d.ts} +11 -3
- package/dist/workflow/index.d.ts +4 -4
- package/dist/workflow/index.js +1 -1
- package/docs/auto-research-loop-end-to-end.md +1 -1
- package/docs/feature-guide.md +4 -4
- package/docs/multi-shot-optimization.md +61 -115
- package/docs/product-eval-adoption.md +1 -1
- package/docs/three-package-architecture.md +1 -1
- package/docs/trace-analysis.md +19 -0
- package/package.json +1 -1
- package/dist/chunk-7W4SM7FD.js.map +0 -1
- package/dist/chunk-F3SRAAZO.js.map +0 -1
- package/dist/chunk-JYE3WOTE.js.map +0 -1
- package/dist/chunk-WYIHD6EB.js.map +0 -1
- /package/dist/{chunk-XPILG2CA.js.map → chunk-GXHLRXDI.js.map} +0 -0
- /package/dist/{chunk-6EKXFFGQ.js.map → chunk-RTWFUK6A.js.map} +0 -0
- /package/dist/{chunk-XGNCBAVZ.js.map → chunk-XQL22JDG.js.map} +0 -0
- /package/dist/{chunk-GJJNJVIR.js.map → chunk-XXNIODOM.js.map} +0 -0
package/dist/adapters/http.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-
|
|
2
|
-
import '../run-record-
|
|
1
|
+
import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-QHG0KnkF.js';
|
|
2
|
+
import '../run-record-sItO5ftF.js';
|
|
3
3
|
import '../errors-Dwqw-T_m.js';
|
|
4
4
|
import '../schema-m0gsnbt3.js';
|
|
5
5
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-
|
|
2
|
-
import '../run-record-
|
|
1
|
+
import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-QHG0KnkF.js';
|
|
2
|
+
import '../run-record-sItO5ftF.js';
|
|
3
3
|
import '../errors-Dwqw-T_m.js';
|
|
4
4
|
import '../schema-m0gsnbt3.js';
|
|
5
5
|
|
package/dist/adapters/otel.d.ts
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import { TraceSpanEvent, HostedClient } from '../hosted/index.js';
|
|
2
|
-
import '../types-
|
|
3
|
-
import '../run-record-
|
|
2
|
+
import '../types-QHG0KnkF.js';
|
|
3
|
+
import '../run-record-sItO5ftF.js';
|
|
4
4
|
import '../errors-Dwqw-T_m.js';
|
|
5
5
|
import '../schema-m0gsnbt3.js';
|
|
6
|
-
import '../insight-report-
|
|
7
|
-
import '../summary-report-
|
|
6
|
+
import '../insight-report-dlpEzQDi.js';
|
|
7
|
+
import '../summary-report-BTaXq1TS.js';
|
|
8
8
|
import '../failure-cluster-CL7IVgkJ.js';
|
|
9
9
|
import '../store-CKUAgsJz.js';
|
|
10
10
|
import '../judge-calibration-DilmB3Ml.js';
|
package/dist/analyst/index.d.ts
CHANGED
|
@@ -1,21 +1,21 @@
|
|
|
1
1
|
import { AxAIService, AxFunction } from '@ax-llm/ax';
|
|
2
2
|
import { M as MultiLayerVerifier, V as VerifyOptions, S as Severity } from '../multi-layer-verifier-DlWCXuxL.js';
|
|
3
3
|
import { c as RunCritic, a as RunTrace } from '../run-critic-BAIjX99r.js';
|
|
4
|
-
import { S as SemanticConceptJudgeOptions, a as SemanticConceptJudgeInput, B as BehavioralMetrics } from '../semantic-concept-judge-
|
|
5
|
-
export { C as CreateAnalystAiConfig, D as DEFAULT_TRACE_ANALYST_KINDS, b as DefaultAnalystRegistryOptions, c as DiffPolicy, F as FAILURE_MODE_KIND_SPEC, d as FINDING_SUBJECT_GRAMMAR_PROMPT, e as FINDING_SUBJECT_KINDS, f as FindingSubject, g as FindingSubjectKind, h as FindingSubjectStringSchema, i as FindingsDiff, j as FindingsStore, I as IMPROVEMENT_KIND_SPEC, K as KIND_EXPECTED_SUBJECTS, k as KNOWLEDGE_GAP_KIND_SPEC, l as KNOWLEDGE_POISONING_KIND_SPEC, P as PersistedFinding, m as SKILL_USAGE_ANALYST, n as SkillUsageAnalyst, o as SkillUsageRecord, p as SkillUsageReport, q as SkillUsageScanConfig, r as buildDefaultAnalystRegistry, s as buildSkillUsageReport, t as createAnalystAi, u as defaultIsMaterial, v as diffFindings, w as emitSkillUsageFindings, x as parseFindingSubject, y as renderFindingSubject } from '../semantic-concept-judge-
|
|
4
|
+
import { S as SemanticConceptJudgeOptions, a as SemanticConceptJudgeInput, B as BehavioralMetrics } from '../semantic-concept-judge-Du4ZVyef.js';
|
|
5
|
+
export { C as CreateAnalystAiConfig, D as DEFAULT_TRACE_ANALYST_KINDS, b as DefaultAnalystRegistryOptions, c as DiffPolicy, F as FAILURE_MODE_KIND_SPEC, d as FINDING_SUBJECT_GRAMMAR_PROMPT, e as FINDING_SUBJECT_KINDS, f as FindingSubject, g as FindingSubjectKind, h as FindingSubjectStringSchema, i as FindingsDiff, j as FindingsStore, I as IMPROVEMENT_KIND_SPEC, K as KIND_EXPECTED_SUBJECTS, k as KNOWLEDGE_GAP_KIND_SPEC, l as KNOWLEDGE_POISONING_KIND_SPEC, P as PersistedFinding, m as SKILL_USAGE_ANALYST, n as SkillUsageAnalyst, o as SkillUsageRecord, p as SkillUsageReport, q as SkillUsageScanConfig, r as buildDefaultAnalystRegistry, s as buildSkillUsageReport, t as createAnalystAi, u as defaultIsMaterial, v as diffFindings, w as emitSkillUsageFindings, x as parseFindingSubject, y as renderFindingSubject } from '../semantic-concept-judge-Du4ZVyef.js';
|
|
6
6
|
import { A as AnalyzeTracesOptions } from '../analyst-t7zZS3TV.js';
|
|
7
7
|
import { T as TraceAnalysisStore } from '../store-GmBE2pZZ.js';
|
|
8
8
|
import { b as JudgeFn, a as JudgeInput } from '../types-Croy5h7V.js';
|
|
9
|
-
import { A as Analyst, h as AnalystSeverity, c as AnalystFinding } from '../types-
|
|
10
|
-
export { a as AnalystContext, g as AnalystCost, i as AnalystInputKind, j as AnalystRequirements, f as AnalystRunEvent, e as AnalystRunInputs, d as AnalystRunResult, b as AnalystRunSummary, k as ChatCallOpts, C as ChatClient, l as ChatRequest, m as ChatResponse, n as ChatTransport, o as CliBridgeTransportOpts, p as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RouterTransportOpts, S as SandboxSdkTransportOpts, q as computeFindingId, r as createChatClient, s as makeFinding } from '../types-
|
|
9
|
+
import { A as Analyst, h as AnalystSeverity, c as AnalystFinding } from '../types-DRvV0zRo.js';
|
|
10
|
+
export { a as AnalystContext, g as AnalystCost, i as AnalystInputKind, j as AnalystRequirements, f as AnalystRunEvent, e as AnalystRunInputs, d as AnalystRunResult, b as AnalystRunSummary, k as ChatCallOpts, C as ChatClient, l as ChatRequest, m as ChatResponse, n as ChatTransport, o as CliBridgeTransportOpts, p as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RouterTransportOpts, S as SandboxSdkTransportOpts, q as computeFindingId, r as createChatClient, s as makeFinding } from '../types-DRvV0zRo.js';
|
|
11
11
|
import { TCloud } from '@tangle-network/tcloud';
|
|
12
|
-
export { A as ANALYST_SEVERITIES, C as CreateTraceAnalystKindOpts, R as RAW_FINDING_SCHEMA_PROMPT, a as RawAnalystFinding, b as RawAnalystFindingSchema, T as TraceAnalystGolden, c as TraceAnalystKindSpec, d as createTraceAnalystKind, p as parseRawFinding, r as renderPriorFindings } from '../kind-factory-
|
|
13
|
-
export {
|
|
12
|
+
export { A as ANALYST_SEVERITIES, C as CreateTraceAnalystKindOpts, R as RAW_FINDING_SCHEMA_PROMPT, a as RawAnalystFinding, b as RawAnalystFindingSchema, T as TraceAnalystGolden, c as TraceAnalystKindSpec, d as createTraceAnalystKind, p as parseRawFinding, r as renderPriorFindings } from '../kind-factory-DqV2t1Xk.js';
|
|
13
|
+
export { a as AnalystHooks, A as AnalystRegistry, b as AnalystRegistryOptions, B as BudgetPolicy, R as RegistryRunOpts } from '../registry-BmEuU94S.js';
|
|
14
14
|
import { L as LlmClientOptions } from '../llm-client-DbjLfz-K.js';
|
|
15
15
|
import '../schema-m0gsnbt3.js';
|
|
16
16
|
import '../store-CKUAgsJz.js';
|
|
17
17
|
import 'zod';
|
|
18
|
-
import '../run-record-
|
|
18
|
+
import '../run-record-sItO5ftF.js';
|
|
19
19
|
import '../errors-Dwqw-T_m.js';
|
|
20
20
|
import '../raw-provider-sink-C46HDghv.js';
|
|
21
21
|
|
|
@@ -149,6 +149,40 @@ declare function coerceJson(text: string): unknown;
|
|
|
149
149
|
*/
|
|
150
150
|
declare function coerceToFindingRows(raw: unknown): unknown[];
|
|
151
151
|
|
|
152
|
+
/** DESCRIPTIVE predicate: does the finding cite at least one observable
|
|
153
|
+
* (span/event/artifact) evidence ref. Useful for ranking evidence quality or
|
|
154
|
+
* rendering — it is NOT the steer gate. Evidence presence is the WRONG
|
|
155
|
+
* discriminator for steering: a legitimate trace-analyst observation may cite
|
|
156
|
+
* nothing (it would be wrongly rejected), and a judge verdict may cite an
|
|
157
|
+
* artifact (it would be wrongly admitted). Use `assertNoJudgeVerdict` to gate
|
|
158
|
+
* steering; use this only where "is this grounded in observable evidence" is the
|
|
159
|
+
* literal question. */
|
|
160
|
+
declare function isTraceObservable(finding: AnalystFinding): boolean;
|
|
161
|
+
/** True iff the finding is a JUDGE VERDICT (an acceptance score lifted into a
|
|
162
|
+
* finding), identified by provenance set at the lift site — independent of
|
|
163
|
+
* whatever evidence it cites. */
|
|
164
|
+
declare function isJudgeVerdict(finding: AnalystFinding): boolean;
|
|
165
|
+
/**
|
|
166
|
+
* THE steer firewall. Fail-loud guard for any path that admits analyst findings
|
|
167
|
+
* as STEERING input (the `f(trace)` role): rejects — naming the offenders — any
|
|
168
|
+
* finding whose provenance is a judge verdict, rather than let `J` leak into the
|
|
169
|
+
* loop. Returns the findings unchanged for chaining.
|
|
170
|
+
*
|
|
171
|
+
* Call this at the chokepoint where a detector that ALSO scores/gates has its
|
|
172
|
+
* findings turned into a steer (the judge-and-steer dual-role case). It keys on
|
|
173
|
+
* provenance, so it correctly admits evidence-less trace-analyst observations and
|
|
174
|
+
* correctly rejects an artifact-citing judge verdict — the cases an evidence
|
|
175
|
+
* check gets backwards.
|
|
176
|
+
*
|
|
177
|
+
* It is necessary, not sufficient: it stops PROVENANCE-tagged verdicts. A judge
|
|
178
|
+
* whose output is laundered through a hand-built finding with no provenance flag
|
|
179
|
+
* is out of its reach — provenance must be honestly set at every judge→finding
|
|
180
|
+
* lift (today: createJudgeAdapter). That is why the integrity rule lives at the
|
|
181
|
+
* lift site, and why ProposeContext.judgeScores?: never is the complementary
|
|
182
|
+
* compile-time tripwire on the obvious direct channel.
|
|
183
|
+
*/
|
|
184
|
+
declare function assertNoJudgeVerdict(findings: ReadonlyArray<AnalystFinding>, context?: string): ReadonlyArray<AnalystFinding>;
|
|
185
|
+
|
|
152
186
|
/**
|
|
153
187
|
* `structureFindings` — the deferred structuring pass (DSPy TwoStepAdapter /
|
|
154
188
|
* HALO `synthesize_traces` analog). The agentic actor reasons FREE-FORM and
|
|
@@ -218,4 +252,4 @@ type TraceToolGroupName =
|
|
|
218
252
|
*/
|
|
219
253
|
declare function buildTraceToolsForGroup(group: TraceToolGroupName, store: TraceAnalysisStore): AxFunction[];
|
|
220
254
|
|
|
221
|
-
export { Analyst, AnalystFinding, AnalystSeverity, type JudgeAdapterOpts, type RunCriticAdapterOpts, type SemanticConceptJudgeAdapterOpts, type StructureFindingsOptions, type StructureFindingsResult, type TraceAnalystAdapterOpts, type TraceToolGroupName, type VerifierAdapterOpts, behavioralAnalyst, buildTraceToolsForGroup, coerceJson, coerceToFindingRows, createJudgeAdapter, createRunCriticAdapter, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createVerifierAdapter, deriveEfficiencyFindings, liftSeverity, stripCodeFences, structureFindings };
|
|
255
|
+
export { Analyst, AnalystFinding, AnalystSeverity, type JudgeAdapterOpts, type RunCriticAdapterOpts, type SemanticConceptJudgeAdapterOpts, type StructureFindingsOptions, type StructureFindingsResult, type TraceAnalystAdapterOpts, type TraceToolGroupName, type VerifierAdapterOpts, assertNoJudgeVerdict, behavioralAnalyst, buildTraceToolsForGroup, coerceJson, coerceToFindingRows, createJudgeAdapter, createRunCriticAdapter, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createVerifierAdapter, deriveEfficiencyFindings, isJudgeVerdict, isTraceObservable, liftSeverity, stripCodeFences, structureFindings };
|
package/dist/analyst/index.js
CHANGED
|
@@ -14,7 +14,7 @@ import {
|
|
|
14
14
|
diffFindings,
|
|
15
15
|
emitSkillUsageFindings,
|
|
16
16
|
runSemanticConceptJudge
|
|
17
|
-
} from "../chunk-
|
|
17
|
+
} from "../chunk-5LVWPNS5.js";
|
|
18
18
|
import {
|
|
19
19
|
ANALYST_SEVERITIES,
|
|
20
20
|
AnalystRegistry,
|
|
@@ -41,7 +41,7 @@ import {
|
|
|
41
41
|
renderPriorFindings,
|
|
42
42
|
stripCodeFences,
|
|
43
43
|
structureFindings
|
|
44
|
-
} from "../chunk-
|
|
44
|
+
} from "../chunk-CF67I6QY.js";
|
|
45
45
|
import "../chunk-IHDHUN2X.js";
|
|
46
46
|
import {
|
|
47
47
|
analyzeTraces
|
|
@@ -269,6 +269,11 @@ function liftJudgeScore(analyst_id, area, s) {
|
|
|
269
269
|
severity,
|
|
270
270
|
confidence: 0.8,
|
|
271
271
|
evidence_refs: s.evidence ? [{ kind: "artifact", uri: "inline:evidence", excerpt: s.evidence }] : [],
|
|
272
|
+
// Provenance: this finding IS a judge verdict (an acceptance score), not an
|
|
273
|
+
// observation of behavior. The steer firewall (assertNoJudgeVerdict) rejects
|
|
274
|
+
// it from steering — even when it cites an artifact above — because letting a
|
|
275
|
+
// verdict steer the next attempt is the held-out judge leaking into the loop.
|
|
276
|
+
derived_from_judge: true,
|
|
272
277
|
metadata: { judge_name: s.judgeName, dimension: s.dimension, score_10: score10 }
|
|
273
278
|
});
|
|
274
279
|
}
|
|
@@ -323,6 +328,28 @@ function createSemanticConceptJudgeAdapter(opts = {}) {
|
|
|
323
328
|
}
|
|
324
329
|
};
|
|
325
330
|
}
|
|
331
|
+
|
|
332
|
+
// src/analyst/steer-firewall.ts
|
|
333
|
+
var OBSERVABLE_KINDS = /* @__PURE__ */ new Set([
|
|
334
|
+
"span",
|
|
335
|
+
"event",
|
|
336
|
+
"artifact"
|
|
337
|
+
]);
|
|
338
|
+
function isTraceObservable(finding) {
|
|
339
|
+
return finding.evidence_refs.some((ref) => OBSERVABLE_KINDS.has(ref.kind));
|
|
340
|
+
}
|
|
341
|
+
function isJudgeVerdict(finding) {
|
|
342
|
+
return finding.derived_from_judge === true;
|
|
343
|
+
}
|
|
344
|
+
function assertNoJudgeVerdict(findings, context = "steer") {
|
|
345
|
+
const leaks = findings.filter(isJudgeVerdict);
|
|
346
|
+
if (leaks.length > 0) {
|
|
347
|
+
throw new Error(
|
|
348
|
+
`${context}: a judge verdict cannot be admitted as steering input \u2014 that is the held-out judge leaking into the loop. Offending judge-derived findings: [${leaks.map((f) => f.finding_id).join(", ")}]. Steering consumes observations of behavior, never acceptance verdicts.`
|
|
349
|
+
);
|
|
350
|
+
}
|
|
351
|
+
return findings;
|
|
352
|
+
}
|
|
326
353
|
export {
|
|
327
354
|
ANALYST_SEVERITIES,
|
|
328
355
|
AnalystRegistry,
|
|
@@ -340,6 +367,7 @@ export {
|
|
|
340
367
|
RawAnalystFindingSchema,
|
|
341
368
|
SKILL_USAGE_ANALYST,
|
|
342
369
|
SkillUsageAnalyst,
|
|
370
|
+
assertNoJudgeVerdict,
|
|
343
371
|
behavioralAnalyst,
|
|
344
372
|
buildDefaultAnalystRegistry,
|
|
345
373
|
buildSkillUsageReport,
|
|
@@ -359,6 +387,8 @@ export {
|
|
|
359
387
|
deriveEfficiencyFindings,
|
|
360
388
|
diffFindings,
|
|
361
389
|
emitSkillUsageFindings,
|
|
390
|
+
isJudgeVerdict,
|
|
391
|
+
isTraceObservable,
|
|
362
392
|
liftSeverity,
|
|
363
393
|
makeFinding,
|
|
364
394
|
parseFindingSubject,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/analyst/adapters.ts"],"sourcesContent":["/**\n * Adapter factories — lift each existing agent-eval primitive into the\n * Analyst contract without re-implementing it.\n *\n * Five primitives, five factories. Each one:\n * - Builds an Analyst with a stable id (caller chooses; defaults\n * given), a sensible default `inputKind`, a version derived from\n * the wrapped primitive's version + an adapter revision, and an\n * `analyze()` that calls the primitive and lifts its output to\n * AnalystFinding[] using `makeFinding()`.\n * - Maps severities: the existing `Severity` ('critical' | 'major' |\n * 'minor' | 'info') projects onto AnalystSeverity ('critical' |\n * 'high' | 'medium' | 'low' | 'info'); 'major' → 'high', 'minor' →\n * 'medium'. Domain analysts that want finer-grained mapping override.\n *\n * Adapters never own state. Calling the same factory twice with the\n * same primitive instance is safe.\n */\n\nimport type { AxAIService } from '@ax-llm/ax'\nimport type {\n Finding as LayerFinding,\n Severity as LayerSeverity,\n MultiLayerVerifier,\n VerifyOptions,\n} from '../multi-layer-verifier'\nimport { RunCritic, type RunTrace } from '../run-critic'\nimport {\n runSemanticConceptJudge,\n SEMANTIC_CONCEPT_JUDGE_VERSION,\n type SemanticConceptJudgeInput,\n type SemanticConceptJudgeOptions,\n} from '../semantic-concept-judge'\nimport { type AnalyzeTracesOptions, analyzeTraces } from '../trace-analyst/analyst'\nimport type { TraceAnalysisStore } from '../trace-analyst/store'\nimport type { JudgeFn, JudgeInput, JudgeScore, TCloud } from '../types'\nimport type { Analyst, AnalystFinding, AnalystSeverity } from './types'\nimport { makeFinding } from './types'\n\nconst ADAPTER_REV = '1'\n\n// ── Severity bridges ───────────────────────────────────────────────\n\nexport function liftSeverity(s: LayerSeverity): AnalystSeverity {\n switch (s) {\n case 'critical':\n return 'critical'\n case 'major':\n return 'high'\n case 'minor':\n return 'medium'\n case 'info':\n return 'info'\n }\n}\n\n// ── 1. analyzeTraces → Analyst ─────────────────────────────────────\n\nexport interface TraceAnalystAdapterOpts {\n id?: string\n area?: string\n /** The natural-language question(s) put to the analyst. One finding per question. */\n questions: string[]\n /** Caller-provided AxAI service — same one trace-analyst.ts expects. */\n ai: AxAIService\n model?: string\n /** Forwarded to analyzeTraces. */\n extra?: Omit<AnalyzeTracesOptions, 'source' | 'ai' | 'model'>\n}\n\n/**\n * @deprecated Prefer `createTraceAnalystKind` + one of the failure /\n * improvement kinds from `./kinds`. This adapter wraps the legacy\n * `analyzeTraces` flow whose output is `findings:string[]` — every\n * bullet gets flat-defaulted severity `medium` / confidence `0.6`,\n * which loses the per-finding grading kinds provide via Ax structured\n * output + Zod validation. Kept for one minor while consumers migrate.\n */\nexport function createTraceAnalystAdapter(\n opts: TraceAnalystAdapterOpts,\n): Analyst<TraceAnalysisStore> {\n const id = opts.id ?? 'trace-analyst'\n const area = opts.area ?? 'agent-reasoning'\n return {\n id,\n description:\n 'Runs the agent-eval trace analyst over an OTLP trace store and lifts its bulleted findings.',\n inputKind: 'trace-store',\n cost: { kind: 'llm', models: opts.model ? [opts.model] : undefined },\n version: `trace-analyst-${ADAPTER_REV}`,\n async analyze(store, ctx) {\n const out: AnalystFinding[] = []\n for (const question of opts.questions) {\n if (ctx.signal?.aborted) break\n const result = await analyzeTraces(\n { question },\n { source: store, ai: opts.ai, model: opts.model, ...opts.extra },\n )\n const subject = ctx.tags?.subject ?? question.slice(0, 60)\n // The responder produces a list of bullet strings. Each becomes\n // one finding; the prose answer is attached as rationale on the\n // first (so renderers that show only top-N still get context).\n if (result.findings.length === 0) {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject,\n claim: result.answer.slice(0, 200),\n rationale: result.answer,\n severity: 'info',\n confidence: 0.5,\n evidence_refs: [],\n metadata: {\n actor_prompt_version: result.actorPromptVersion,\n turns: result.turnCount,\n },\n }),\n )\n continue\n }\n result.findings.forEach((claim, i) => {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject,\n claim,\n rationale: i === 0 ? result.answer : undefined,\n severity: 'medium',\n confidence: 0.6,\n evidence_refs: [],\n metadata: { question, turns: result.turnCount, finding_index: i },\n }),\n )\n })\n }\n return out\n },\n }\n}\n\n// ── 2. MultiLayerVerifier → Analyst ─────────────────────────────────\n\nexport interface VerifierAdapterOpts<Env> {\n id?: string\n area?: string\n verifier: MultiLayerVerifier<Env>\n /**\n * The verifier expects an `env` per run. Adapters take it from\n * `AnalystRunInputs.custom[<id>]` via the registry's 'custom' routing.\n */\n options?: Omit<VerifyOptions<Env>, 'env'>\n}\n\nexport function createVerifierAdapter<Env>(opts: VerifierAdapterOpts<Env>): Analyst<Env> {\n const id = opts.id ?? 'multi-layer-verifier'\n const area = opts.area ?? 'verification'\n return {\n id,\n description:\n \"Runs a MultiLayerVerifier and lifts each layer's findings into the analyst envelope.\",\n inputKind: 'custom',\n cost: { kind: 'deterministic' },\n version: `verifier-${ADAPTER_REV}`,\n async analyze(env, ctx) {\n const report = await opts.verifier.run({ env, ...opts.options })\n const out: AnalystFinding[] = []\n for (const layer of report.layers) {\n for (const finding of layer.findings) {\n out.push(liftLayerFinding(id, area, layer.layer, finding))\n }\n // Layer-level signal: a failed/error layer is itself a finding\n // even if it didn't emit per-finding rows.\n if (layer.status === 'fail' || layer.status === 'error' || layer.status === 'timeout') {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: layer.layer,\n claim: `layer \"${layer.layer}\" ${layer.status}: ${layer.reason ?? 'no reason given'}`,\n severity:\n layer.status === 'error' ? 'high' : layer.status === 'timeout' ? 'medium' : 'high',\n confidence: 1,\n evidence_refs: [],\n metadata: {\n layer_status: layer.status,\n duration_ms: layer.durationMs,\n score: layer.score,\n diagnostics: layer.diagnostics,\n },\n }),\n )\n }\n }\n ctx.log?.('verifier complete', {\n layers: report.layers.length,\n blended: report.blendedScore,\n all_pass: report.allPass,\n })\n return out\n },\n }\n}\n\nfunction liftLayerFinding(\n analyst_id: string,\n area: string,\n layer: string,\n f: LayerFinding,\n): AnalystFinding {\n return makeFinding({\n analyst_id,\n area,\n subject: f.layer ?? layer,\n claim: f.message,\n severity: liftSeverity(f.severity),\n confidence: 0.85,\n evidence_refs: f.evidence\n ? [{ kind: 'artifact', uri: 'inline:evidence', excerpt: f.evidence }]\n : [],\n metadata: f.detail,\n })\n}\n\n// ── 3. RunCritic → Analyst ──────────────────────────────────────────\n\nexport interface RunCriticAdapterOpts {\n id?: string\n area?: string\n critic?: RunCritic\n /** Optional threshold below which a dimension is reported as a finding. Default 0.5. */\n threshold?: number\n}\n\nexport function createRunCriticAdapter(opts: RunCriticAdapterOpts = {}): Analyst<RunTrace> {\n const id = opts.id ?? 'run-critic'\n const area = opts.area ?? 'run-quality'\n const critic = opts.critic ?? new RunCritic()\n const threshold = opts.threshold ?? 0.5\n return {\n id,\n description:\n 'Scores a single run across success / grounding / drift / tool-quality and surfaces below-threshold dimensions.',\n inputKind: 'custom',\n cost: { kind: 'deterministic' },\n version: `run-critic-${ADAPTER_REV}`,\n async analyze(trace) {\n const score = critic.scoreTrace(trace)\n const out: AnalystFinding[] = []\n const dims: Array<[keyof typeof score, AnalystSeverity, string]> = [\n ['success', 'critical', 'run did not complete successfully'],\n ['goalProgress', 'high', 'goal progress is low'],\n ['repoGroundedness', 'high', 'output is poorly grounded in the repository'],\n ['toolUseQuality', 'medium', 'tool use quality is low'],\n ['patchQuality', 'medium', 'no real patch/edit evidence'],\n ['testReality', 'high', 'no real test/build evidence'],\n ['finalGate', 'critical', 'final gate is blocking'],\n ]\n for (const [dim, sev, msg] of dims) {\n const value = score[dim] as number\n if (typeof value === 'number' && value < threshold) {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: dim,\n claim: msg,\n rationale: `${dim}=${value.toFixed(2)} below threshold ${threshold}`,\n severity: sev,\n confidence: 1,\n evidence_refs: [],\n metadata: { dimension: dim, value, threshold, run_id: trace.run.runId },\n }),\n )\n }\n }\n // Drift penalty is high → surface as a finding (inverse threshold).\n if (score.driftPenalty > 1 - threshold) {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: 'drift',\n claim: 'agent output drifted from repository signal',\n rationale: `driftPenalty=${score.driftPenalty.toFixed(2)}`,\n severity: 'medium',\n confidence: 0.9,\n evidence_refs: [],\n metadata: { drift_penalty: score.driftPenalty, notes: score.notes },\n }),\n )\n }\n return out\n },\n }\n}\n\n// ── 4. JudgeFn → Analyst ────────────────────────────────────────────\n\nexport interface JudgeAdapterOpts {\n id?: string\n area?: string\n judge: JudgeFn\n /** TCloud handle the JudgeFn calls. */\n tcloud: TCloud\n /** Optional cost classification — most judges call an LLM. */\n cost?: Analyst['cost']\n /** Optional threshold below which a JudgeScore becomes a finding. Default 6 (on 0-10 scale). */\n threshold?: number\n}\n\nexport function createJudgeAdapter(opts: JudgeAdapterOpts): Analyst<JudgeInput> {\n const id = opts.id ?? 'judge'\n const area = opts.area ?? 'judge'\n const threshold = opts.threshold ?? 6\n return {\n id,\n description:\n 'Wraps an agent-eval JudgeFn into an analyst; below-threshold dimensions surface as findings.',\n inputKind: 'judge-input',\n cost: opts.cost ?? { kind: 'llm' },\n version: `judge-${ADAPTER_REV}`,\n async analyze(input) {\n const scores = await opts.judge(opts.tcloud, input)\n return scores\n .filter((s) => normalize10(s.score) < threshold)\n .map((s) => liftJudgeScore(id, area, s))\n },\n }\n}\n\nfunction normalize10(s: number): number {\n // JudgeScore convention is 0-10 but some judges emit 0-1. Coerce to 0-10.\n return s <= 1 ? s * 10 : s\n}\n\nfunction liftJudgeScore(analyst_id: string, area: string, s: JudgeScore): AnalystFinding {\n const score10 = normalize10(s.score)\n const severity: AnalystSeverity =\n score10 < 3 ? 'critical' : score10 < 5 ? 'high' : score10 < 7 ? 'medium' : 'low'\n return makeFinding({\n analyst_id,\n area,\n subject: s.dimension,\n claim: `${s.judgeName}/${s.dimension} scored ${score10.toFixed(1)}/10`,\n rationale: s.reasoning,\n severity,\n confidence: 0.8,\n evidence_refs: s.evidence\n ? [{ kind: 'artifact', uri: 'inline:evidence', excerpt: s.evidence }]\n : [],\n metadata: { judge_name: s.judgeName, dimension: s.dimension, score_10: score10 },\n })\n}\n\n// ── 5. SemanticConceptJudge → Analyst ──────────────────────────────\n\nexport interface SemanticConceptJudgeAdapterOpts {\n id?: string\n area?: string\n options?: SemanticConceptJudgeOptions\n}\n\nexport function createSemanticConceptJudgeAdapter(\n opts: SemanticConceptJudgeAdapterOpts = {},\n): Analyst<SemanticConceptJudgeInput> {\n const id = opts.id ?? 'semantic-concept-judge'\n const area = opts.area ?? 'concept-coverage'\n return {\n id,\n description:\n 'Runs the semantic-concept judge and surfaces missing / weak concepts as findings.',\n inputKind: 'custom',\n cost: { kind: 'llm', models: opts.options?.model ? [opts.options.model] : undefined },\n version: `${SEMANTIC_CONCEPT_JUDGE_VERSION}-adapter-${ADAPTER_REV}`,\n async analyze(input) {\n const result = await runSemanticConceptJudge(input, opts.options)\n if (!result.available) {\n return [\n makeFinding({\n analyst_id: id,\n area,\n claim: 'semantic-concept judge unavailable',\n rationale: result.error,\n severity: 'info',\n confidence: 1,\n evidence_refs: [],\n metadata: { reason: result.error },\n }),\n ]\n }\n const out: AnalystFinding[] = []\n for (const f of result.findings) {\n // Only surface gaps: missing concepts or low scores. Concepts at\n // 7+/10 with present=true are not findings — they're successes.\n if (f.present && f.score >= 7) continue\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: f.concept,\n claim: f.present\n ? `concept \"${f.concept}\" is weak (${f.score}/10)`\n : `concept \"${f.concept}\" is missing`,\n rationale: f.evidence,\n severity: liftSeverity(f.severity),\n confidence: 0.85,\n evidence_refs: [{ kind: 'artifact', uri: 'inline:evidence', excerpt: f.evidence }],\n metadata: {\n concept: f.concept,\n present: f.present,\n score_10: f.score,\n cost_usd: result.costUsd ?? undefined,\n },\n }),\n )\n }\n return out\n },\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAuCA,IAAM,cAAc;AAIb,SAAS,aAAa,GAAmC;AAC9D,UAAQ,GAAG;AAAA,IACT,KAAK;AACH,aAAO;AAAA,IACT,KAAK;AACH,aAAO;AAAA,IACT,KAAK;AACH,aAAO;AAAA,IACT,KAAK;AACH,aAAO;AAAA,EACX;AACF;AAwBO,SAAS,0BACd,MAC6B;AAC7B,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,OAAO,QAAQ,KAAK,QAAQ,CAAC,KAAK,KAAK,IAAI,OAAU;AAAA,IACnE,SAAS,iBAAiB,WAAW;AAAA,IACrC,MAAM,QAAQ,OAAO,KAAK;AACxB,YAAM,MAAwB,CAAC;AAC/B,iBAAW,YAAY,KAAK,WAAW;AACrC,YAAI,IAAI,QAAQ,QAAS;AACzB,cAAM,SAAS,MAAM;AAAA,UACnB,EAAE,SAAS;AAAA,UACX,EAAE,QAAQ,OAAO,IAAI,KAAK,IAAI,OAAO,KAAK,OAAO,GAAG,KAAK,MAAM;AAAA,QACjE;AACA,cAAM,UAAU,IAAI,MAAM,WAAW,SAAS,MAAM,GAAG,EAAE;AAIzD,YAAI,OAAO,SAAS,WAAW,GAAG;AAChC,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA;AAAA,cACA,OAAO,OAAO,OAAO,MAAM,GAAG,GAAG;AAAA,cACjC,WAAW,OAAO;AAAA,cAClB,UAAU;AAAA,cACV,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU;AAAA,gBACR,sBAAsB,OAAO;AAAA,gBAC7B,OAAO,OAAO;AAAA,cAChB;AAAA,YACF,CAAC;AAAA,UACH;AACA;AAAA,QACF;AACA,eAAO,SAAS,QAAQ,CAAC,OAAO,MAAM;AACpC,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA;AAAA,cACA;AAAA,cACA,WAAW,MAAM,IAAI,OAAO,SAAS;AAAA,cACrC,UAAU;AAAA,cACV,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU,EAAE,UAAU,OAAO,OAAO,WAAW,eAAe,EAAE;AAAA,YAClE,CAAC;AAAA,UACH;AAAA,QACF,CAAC;AAAA,MACH;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAeO,SAAS,sBAA2B,MAA8C;AACvF,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,gBAAgB;AAAA,IAC9B,SAAS,YAAY,WAAW;AAAA,IAChC,MAAM,QAAQ,KAAK,KAAK;AACtB,YAAM,SAAS,MAAM,KAAK,SAAS,IAAI,EAAE,KAAK,GAAG,KAAK,QAAQ,CAAC;AAC/D,YAAM,MAAwB,CAAC;AAC/B,iBAAW,SAAS,OAAO,QAAQ;AACjC,mBAAW,WAAW,MAAM,UAAU;AACpC,cAAI,KAAK,iBAAiB,IAAI,MAAM,MAAM,OAAO,OAAO,CAAC;AAAA,QAC3D;AAGA,YAAI,MAAM,WAAW,UAAU,MAAM,WAAW,WAAW,MAAM,WAAW,WAAW;AACrF,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA,SAAS,MAAM;AAAA,cACf,OAAO,UAAU,MAAM,KAAK,KAAK,MAAM,MAAM,KAAK,MAAM,UAAU,iBAAiB;AAAA,cACnF,UACE,MAAM,WAAW,UAAU,SAAS,MAAM,WAAW,YAAY,WAAW;AAAA,cAC9E,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU;AAAA,gBACR,cAAc,MAAM;AAAA,gBACpB,aAAa,MAAM;AAAA,gBACnB,OAAO,MAAM;AAAA,gBACb,aAAa,MAAM;AAAA,cACrB;AAAA,YACF,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF;AACA,UAAI,MAAM,qBAAqB;AAAA,QAC7B,QAAQ,OAAO,OAAO;AAAA,QACtB,SAAS,OAAO;AAAA,QAChB,UAAU,OAAO;AAAA,MACnB,CAAC;AACD,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAEA,SAAS,iBACP,YACA,MACA,OACA,GACgB;AAChB,SAAO,YAAY;AAAA,IACjB;AAAA,IACA;AAAA,IACA,SAAS,EAAE,SAAS;AAAA,IACpB,OAAO,EAAE;AAAA,IACT,UAAU,aAAa,EAAE,QAAQ;AAAA,IACjC,YAAY;AAAA,IACZ,eAAe,EAAE,WACb,CAAC,EAAE,MAAM,YAAY,KAAK,mBAAmB,SAAS,EAAE,SAAS,CAAC,IAClE,CAAC;AAAA,IACL,UAAU,EAAE;AAAA,EACd,CAAC;AACH;AAYO,SAAS,uBAAuB,OAA6B,CAAC,GAAsB;AACzF,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,SAAS,KAAK,UAAU,IAAI,UAAU;AAC5C,QAAM,YAAY,KAAK,aAAa;AACpC,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,gBAAgB;AAAA,IAC9B,SAAS,cAAc,WAAW;AAAA,IAClC,MAAM,QAAQ,OAAO;AACnB,YAAM,QAAQ,OAAO,WAAW,KAAK;AACrC,YAAM,MAAwB,CAAC;AAC/B,YAAM,OAA6D;AAAA,QACjE,CAAC,WAAW,YAAY,mCAAmC;AAAA,QAC3D,CAAC,gBAAgB,QAAQ,sBAAsB;AAAA,QAC/C,CAAC,oBAAoB,QAAQ,6CAA6C;AAAA,QAC1E,CAAC,kBAAkB,UAAU,yBAAyB;AAAA,QACtD,CAAC,gBAAgB,UAAU,6BAA6B;AAAA,QACxD,CAAC,eAAe,QAAQ,6BAA6B;AAAA,QACrD,CAAC,aAAa,YAAY,wBAAwB;AAAA,MACpD;AACA,iBAAW,CAAC,KAAK,KAAK,GAAG,KAAK,MAAM;AAClC,cAAM,QAAQ,MAAM,GAAG;AACvB,YAAI,OAAO,UAAU,YAAY,QAAQ,WAAW;AAClD,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA,SAAS;AAAA,cACT,OAAO;AAAA,cACP,WAAW,GAAG,GAAG,IAAI,MAAM,QAAQ,CAAC,CAAC,oBAAoB,SAAS;AAAA,cAClE,UAAU;AAAA,cACV,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU,EAAE,WAAW,KAAK,OAAO,WAAW,QAAQ,MAAM,IAAI,MAAM;AAAA,YACxE,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF;AAEA,UAAI,MAAM,eAAe,IAAI,WAAW;AACtC,YAAI;AAAA,UACF,YAAY;AAAA,YACV,YAAY;AAAA,YACZ;AAAA,YACA,SAAS;AAAA,YACT,OAAO;AAAA,YACP,WAAW,gBAAgB,MAAM,aAAa,QAAQ,CAAC,CAAC;AAAA,YACxD,UAAU;AAAA,YACV,YAAY;AAAA,YACZ,eAAe,CAAC;AAAA,YAChB,UAAU,EAAE,eAAe,MAAM,cAAc,OAAO,MAAM,MAAM;AAAA,UACpE,CAAC;AAAA,QACH;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAgBO,SAAS,mBAAmB,MAA6C;AAC9E,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,YAAY,KAAK,aAAa;AACpC,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,KAAK,QAAQ,EAAE,MAAM,MAAM;AAAA,IACjC,SAAS,SAAS,WAAW;AAAA,IAC7B,MAAM,QAAQ,OAAO;AACnB,YAAM,SAAS,MAAM,KAAK,MAAM,KAAK,QAAQ,KAAK;AAClD,aAAO,OACJ,OAAO,CAAC,MAAM,YAAY,EAAE,KAAK,IAAI,SAAS,EAC9C,IAAI,CAAC,MAAM,eAAe,IAAI,MAAM,CAAC,CAAC;AAAA,IAC3C;AAAA,EACF;AACF;AAEA,SAAS,YAAY,GAAmB;AAEtC,SAAO,KAAK,IAAI,IAAI,KAAK;AAC3B;AAEA,SAAS,eAAe,YAAoB,MAAc,GAA+B;AACvF,QAAM,UAAU,YAAY,EAAE,KAAK;AACnC,QAAM,WACJ,UAAU,IAAI,aAAa,UAAU,IAAI,SAAS,UAAU,IAAI,WAAW;AAC7E,SAAO,YAAY;AAAA,IACjB;AAAA,IACA;AAAA,IACA,SAAS,EAAE;AAAA,IACX,OAAO,GAAG,EAAE,SAAS,IAAI,EAAE,SAAS,WAAW,QAAQ,QAAQ,CAAC,CAAC;AAAA,IACjE,WAAW,EAAE;AAAA,IACb;AAAA,IACA,YAAY;AAAA,IACZ,eAAe,EAAE,WACb,CAAC,EAAE,MAAM,YAAY,KAAK,mBAAmB,SAAS,EAAE,SAAS,CAAC,IAClE,CAAC;AAAA,IACL,UAAU,EAAE,YAAY,EAAE,WAAW,WAAW,EAAE,WAAW,UAAU,QAAQ;AAAA,EACjF,CAAC;AACH;AAUO,SAAS,kCACd,OAAwC,CAAC,GACL;AACpC,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,OAAO,QAAQ,KAAK,SAAS,QAAQ,CAAC,KAAK,QAAQ,KAAK,IAAI,OAAU;AAAA,IACpF,SAAS,GAAG,8BAA8B,YAAY,WAAW;AAAA,IACjE,MAAM,QAAQ,OAAO;AACnB,YAAM,SAAS,MAAM,wBAAwB,OAAO,KAAK,OAAO;AAChE,UAAI,CAAC,OAAO,WAAW;AACrB,eAAO;AAAA,UACL,YAAY;AAAA,YACV,YAAY;AAAA,YACZ;AAAA,YACA,OAAO;AAAA,YACP,WAAW,OAAO;AAAA,YAClB,UAAU;AAAA,YACV,YAAY;AAAA,YACZ,eAAe,CAAC;AAAA,YAChB,UAAU,EAAE,QAAQ,OAAO,MAAM;AAAA,UACnC,CAAC;AAAA,QACH;AAAA,MACF;AACA,YAAM,MAAwB,CAAC;AAC/B,iBAAW,KAAK,OAAO,UAAU;AAG/B,YAAI,EAAE,WAAW,EAAE,SAAS,EAAG;AAC/B,YAAI;AAAA,UACF,YAAY;AAAA,YACV,YAAY;AAAA,YACZ;AAAA,YACA,SAAS,EAAE;AAAA,YACX,OAAO,EAAE,UACL,YAAY,EAAE,OAAO,cAAc,EAAE,KAAK,SAC1C,YAAY,EAAE,OAAO;AAAA,YACzB,WAAW,EAAE;AAAA,YACb,UAAU,aAAa,EAAE,QAAQ;AAAA,YACjC,YAAY;AAAA,YACZ,eAAe,CAAC,EAAE,MAAM,YAAY,KAAK,mBAAmB,SAAS,EAAE,SAAS,CAAC;AAAA,YACjF,UAAU;AAAA,cACR,SAAS,EAAE;AAAA,cACX,SAAS,EAAE;AAAA,cACX,UAAU,EAAE;AAAA,cACZ,UAAU,OAAO,WAAW;AAAA,YAC9B;AAAA,UACF,CAAC;AAAA,QACH;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../../src/analyst/adapters.ts","../../src/analyst/steer-firewall.ts"],"sourcesContent":["/**\n * Adapter factories — lift each existing agent-eval primitive into the\n * Analyst contract without re-implementing it.\n *\n * Five primitives, five factories. Each one:\n * - Builds an Analyst with a stable id (caller chooses; defaults\n * given), a sensible default `inputKind`, a version derived from\n * the wrapped primitive's version + an adapter revision, and an\n * `analyze()` that calls the primitive and lifts its output to\n * AnalystFinding[] using `makeFinding()`.\n * - Maps severities: the existing `Severity` ('critical' | 'major' |\n * 'minor' | 'info') projects onto AnalystSeverity ('critical' |\n * 'high' | 'medium' | 'low' | 'info'); 'major' → 'high', 'minor' →\n * 'medium'. Domain analysts that want finer-grained mapping override.\n *\n * Adapters never own state. Calling the same factory twice with the\n * same primitive instance is safe.\n */\n\nimport type { AxAIService } from '@ax-llm/ax'\nimport type {\n Finding as LayerFinding,\n Severity as LayerSeverity,\n MultiLayerVerifier,\n VerifyOptions,\n} from '../multi-layer-verifier'\nimport { RunCritic, type RunTrace } from '../run-critic'\nimport {\n runSemanticConceptJudge,\n SEMANTIC_CONCEPT_JUDGE_VERSION,\n type SemanticConceptJudgeInput,\n type SemanticConceptJudgeOptions,\n} from '../semantic-concept-judge'\nimport { type AnalyzeTracesOptions, analyzeTraces } from '../trace-analyst/analyst'\nimport type { TraceAnalysisStore } from '../trace-analyst/store'\nimport type { JudgeFn, JudgeInput, JudgeScore, TCloud } from '../types'\nimport type { Analyst, AnalystFinding, AnalystSeverity } from './types'\nimport { makeFinding } from './types'\n\nconst ADAPTER_REV = '1'\n\n// ── Severity bridges ───────────────────────────────────────────────\n\nexport function liftSeverity(s: LayerSeverity): AnalystSeverity {\n switch (s) {\n case 'critical':\n return 'critical'\n case 'major':\n return 'high'\n case 'minor':\n return 'medium'\n case 'info':\n return 'info'\n }\n}\n\n// ── 1. analyzeTraces → Analyst ─────────────────────────────────────\n\nexport interface TraceAnalystAdapterOpts {\n id?: string\n area?: string\n /** The natural-language question(s) put to the analyst. One finding per question. */\n questions: string[]\n /** Caller-provided AxAI service — same one trace-analyst.ts expects. */\n ai: AxAIService\n model?: string\n /** Forwarded to analyzeTraces. */\n extra?: Omit<AnalyzeTracesOptions, 'source' | 'ai' | 'model'>\n}\n\n/**\n * @deprecated Prefer `createTraceAnalystKind` + one of the failure /\n * improvement kinds from `./kinds`. This adapter wraps the legacy\n * `analyzeTraces` flow whose output is `findings:string[]` — every\n * bullet gets flat-defaulted severity `medium` / confidence `0.6`,\n * which loses the per-finding grading kinds provide via Ax structured\n * output + Zod validation. Kept for one minor while consumers migrate.\n */\nexport function createTraceAnalystAdapter(\n opts: TraceAnalystAdapterOpts,\n): Analyst<TraceAnalysisStore> {\n const id = opts.id ?? 'trace-analyst'\n const area = opts.area ?? 'agent-reasoning'\n return {\n id,\n description:\n 'Runs the agent-eval trace analyst over an OTLP trace store and lifts its bulleted findings.',\n inputKind: 'trace-store',\n cost: { kind: 'llm', models: opts.model ? [opts.model] : undefined },\n version: `trace-analyst-${ADAPTER_REV}`,\n async analyze(store, ctx) {\n const out: AnalystFinding[] = []\n for (const question of opts.questions) {\n if (ctx.signal?.aborted) break\n const result = await analyzeTraces(\n { question },\n { source: store, ai: opts.ai, model: opts.model, ...opts.extra },\n )\n const subject = ctx.tags?.subject ?? question.slice(0, 60)\n // The responder produces a list of bullet strings. Each becomes\n // one finding; the prose answer is attached as rationale on the\n // first (so renderers that show only top-N still get context).\n if (result.findings.length === 0) {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject,\n claim: result.answer.slice(0, 200),\n rationale: result.answer,\n severity: 'info',\n confidence: 0.5,\n evidence_refs: [],\n metadata: {\n actor_prompt_version: result.actorPromptVersion,\n turns: result.turnCount,\n },\n }),\n )\n continue\n }\n result.findings.forEach((claim, i) => {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject,\n claim,\n rationale: i === 0 ? result.answer : undefined,\n severity: 'medium',\n confidence: 0.6,\n evidence_refs: [],\n metadata: { question, turns: result.turnCount, finding_index: i },\n }),\n )\n })\n }\n return out\n },\n }\n}\n\n// ── 2. MultiLayerVerifier → Analyst ─────────────────────────────────\n\nexport interface VerifierAdapterOpts<Env> {\n id?: string\n area?: string\n verifier: MultiLayerVerifier<Env>\n /**\n * The verifier expects an `env` per run. Adapters take it from\n * `AnalystRunInputs.custom[<id>]` via the registry's 'custom' routing.\n */\n options?: Omit<VerifyOptions<Env>, 'env'>\n}\n\nexport function createVerifierAdapter<Env>(opts: VerifierAdapterOpts<Env>): Analyst<Env> {\n const id = opts.id ?? 'multi-layer-verifier'\n const area = opts.area ?? 'verification'\n return {\n id,\n description:\n \"Runs a MultiLayerVerifier and lifts each layer's findings into the analyst envelope.\",\n inputKind: 'custom',\n cost: { kind: 'deterministic' },\n version: `verifier-${ADAPTER_REV}`,\n async analyze(env, ctx) {\n const report = await opts.verifier.run({ env, ...opts.options })\n const out: AnalystFinding[] = []\n for (const layer of report.layers) {\n for (const finding of layer.findings) {\n out.push(liftLayerFinding(id, area, layer.layer, finding))\n }\n // Layer-level signal: a failed/error layer is itself a finding\n // even if it didn't emit per-finding rows.\n if (layer.status === 'fail' || layer.status === 'error' || layer.status === 'timeout') {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: layer.layer,\n claim: `layer \"${layer.layer}\" ${layer.status}: ${layer.reason ?? 'no reason given'}`,\n severity:\n layer.status === 'error' ? 'high' : layer.status === 'timeout' ? 'medium' : 'high',\n confidence: 1,\n evidence_refs: [],\n metadata: {\n layer_status: layer.status,\n duration_ms: layer.durationMs,\n score: layer.score,\n diagnostics: layer.diagnostics,\n },\n }),\n )\n }\n }\n ctx.log?.('verifier complete', {\n layers: report.layers.length,\n blended: report.blendedScore,\n all_pass: report.allPass,\n })\n return out\n },\n }\n}\n\nfunction liftLayerFinding(\n analyst_id: string,\n area: string,\n layer: string,\n f: LayerFinding,\n): AnalystFinding {\n return makeFinding({\n analyst_id,\n area,\n subject: f.layer ?? layer,\n claim: f.message,\n severity: liftSeverity(f.severity),\n confidence: 0.85,\n evidence_refs: f.evidence\n ? [{ kind: 'artifact', uri: 'inline:evidence', excerpt: f.evidence }]\n : [],\n metadata: f.detail,\n })\n}\n\n// ── 3. RunCritic → Analyst ──────────────────────────────────────────\n\nexport interface RunCriticAdapterOpts {\n id?: string\n area?: string\n critic?: RunCritic\n /** Optional threshold below which a dimension is reported as a finding. Default 0.5. */\n threshold?: number\n}\n\nexport function createRunCriticAdapter(opts: RunCriticAdapterOpts = {}): Analyst<RunTrace> {\n const id = opts.id ?? 'run-critic'\n const area = opts.area ?? 'run-quality'\n const critic = opts.critic ?? new RunCritic()\n const threshold = opts.threshold ?? 0.5\n return {\n id,\n description:\n 'Scores a single run across success / grounding / drift / tool-quality and surfaces below-threshold dimensions.',\n inputKind: 'custom',\n cost: { kind: 'deterministic' },\n version: `run-critic-${ADAPTER_REV}`,\n async analyze(trace) {\n const score = critic.scoreTrace(trace)\n const out: AnalystFinding[] = []\n const dims: Array<[keyof typeof score, AnalystSeverity, string]> = [\n ['success', 'critical', 'run did not complete successfully'],\n ['goalProgress', 'high', 'goal progress is low'],\n ['repoGroundedness', 'high', 'output is poorly grounded in the repository'],\n ['toolUseQuality', 'medium', 'tool use quality is low'],\n ['patchQuality', 'medium', 'no real patch/edit evidence'],\n ['testReality', 'high', 'no real test/build evidence'],\n ['finalGate', 'critical', 'final gate is blocking'],\n ]\n for (const [dim, sev, msg] of dims) {\n const value = score[dim] as number\n if (typeof value === 'number' && value < threshold) {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: dim,\n claim: msg,\n rationale: `${dim}=${value.toFixed(2)} below threshold ${threshold}`,\n severity: sev,\n confidence: 1,\n evidence_refs: [],\n metadata: { dimension: dim, value, threshold, run_id: trace.run.runId },\n }),\n )\n }\n }\n // Drift penalty is high → surface as a finding (inverse threshold).\n if (score.driftPenalty > 1 - threshold) {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: 'drift',\n claim: 'agent output drifted from repository signal',\n rationale: `driftPenalty=${score.driftPenalty.toFixed(2)}`,\n severity: 'medium',\n confidence: 0.9,\n evidence_refs: [],\n metadata: { drift_penalty: score.driftPenalty, notes: score.notes },\n }),\n )\n }\n return out\n },\n }\n}\n\n// ── 4. JudgeFn → Analyst ────────────────────────────────────────────\n\nexport interface JudgeAdapterOpts {\n id?: string\n area?: string\n judge: JudgeFn\n /** TCloud handle the JudgeFn calls. */\n tcloud: TCloud\n /** Optional cost classification — most judges call an LLM. */\n cost?: Analyst['cost']\n /** Optional threshold below which a JudgeScore becomes a finding. Default 6 (on 0-10 scale). */\n threshold?: number\n}\n\nexport function createJudgeAdapter(opts: JudgeAdapterOpts): Analyst<JudgeInput> {\n const id = opts.id ?? 'judge'\n const area = opts.area ?? 'judge'\n const threshold = opts.threshold ?? 6\n return {\n id,\n description:\n 'Wraps an agent-eval JudgeFn into an analyst; below-threshold dimensions surface as findings.',\n inputKind: 'judge-input',\n cost: opts.cost ?? { kind: 'llm' },\n version: `judge-${ADAPTER_REV}`,\n async analyze(input) {\n const scores = await opts.judge(opts.tcloud, input)\n return scores\n .filter((s) => normalize10(s.score) < threshold)\n .map((s) => liftJudgeScore(id, area, s))\n },\n }\n}\n\nfunction normalize10(s: number): number {\n // JudgeScore convention is 0-10 but some judges emit 0-1. Coerce to 0-10.\n return s <= 1 ? s * 10 : s\n}\n\nfunction liftJudgeScore(analyst_id: string, area: string, s: JudgeScore): AnalystFinding {\n const score10 = normalize10(s.score)\n const severity: AnalystSeverity =\n score10 < 3 ? 'critical' : score10 < 5 ? 'high' : score10 < 7 ? 'medium' : 'low'\n return makeFinding({\n analyst_id,\n area,\n subject: s.dimension,\n claim: `${s.judgeName}/${s.dimension} scored ${score10.toFixed(1)}/10`,\n rationale: s.reasoning,\n severity,\n confidence: 0.8,\n evidence_refs: s.evidence\n ? [{ kind: 'artifact', uri: 'inline:evidence', excerpt: s.evidence }]\n : [],\n // Provenance: this finding IS a judge verdict (an acceptance score), not an\n // observation of behavior. The steer firewall (assertNoJudgeVerdict) rejects\n // it from steering — even when it cites an artifact above — because letting a\n // verdict steer the next attempt is the held-out judge leaking into the loop.\n derived_from_judge: true,\n metadata: { judge_name: s.judgeName, dimension: s.dimension, score_10: score10 },\n })\n}\n\n// ── 5. SemanticConceptJudge → Analyst ──────────────────────────────\n\nexport interface SemanticConceptJudgeAdapterOpts {\n id?: string\n area?: string\n options?: SemanticConceptJudgeOptions\n}\n\nexport function createSemanticConceptJudgeAdapter(\n opts: SemanticConceptJudgeAdapterOpts = {},\n): Analyst<SemanticConceptJudgeInput> {\n const id = opts.id ?? 'semantic-concept-judge'\n const area = opts.area ?? 'concept-coverage'\n return {\n id,\n description:\n 'Runs the semantic-concept judge and surfaces missing / weak concepts as findings.',\n inputKind: 'custom',\n cost: { kind: 'llm', models: opts.options?.model ? [opts.options.model] : undefined },\n version: `${SEMANTIC_CONCEPT_JUDGE_VERSION}-adapter-${ADAPTER_REV}`,\n async analyze(input) {\n const result = await runSemanticConceptJudge(input, opts.options)\n if (!result.available) {\n return [\n makeFinding({\n analyst_id: id,\n area,\n claim: 'semantic-concept judge unavailable',\n rationale: result.error,\n severity: 'info',\n confidence: 1,\n evidence_refs: [],\n metadata: { reason: result.error },\n }),\n ]\n }\n const out: AnalystFinding[] = []\n for (const f of result.findings) {\n // Only surface gaps: missing concepts or low scores. Concepts at\n // 7+/10 with present=true are not findings — they're successes.\n if (f.present && f.score >= 7) continue\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: f.concept,\n claim: f.present\n ? `concept \"${f.concept}\" is weak (${f.score}/10)`\n : `concept \"${f.concept}\" is missing`,\n rationale: f.evidence,\n severity: liftSeverity(f.severity),\n confidence: 0.85,\n evidence_refs: [{ kind: 'artifact', uri: 'inline:evidence', excerpt: f.evidence }],\n metadata: {\n concept: f.concept,\n present: f.present,\n score_10: f.score,\n cost_usd: result.costUsd ?? undefined,\n },\n }),\n )\n }\n return out\n },\n }\n}\n","// The realness-oracle firewall (docs/learning-flywheel.md, \"The steer is f(trace)\").\n//\n// A realness/authenticity signal has TWO legitimate roles that must stay\n// separated by a firewall:\n// (a) anchor judge J — write-only: scores the chosen output, gates promotion,\n// NEVER seen by the worker/optimizer mid-run (else the loop games it).\n// (b) steer f(trace) — an analyst observes the agent's OWN behavior in the\n// trace (\"imported a stub\", \"used a non-crypto PRNG where encryption was\n// required\") and steers the next attempt. Legitimate, because it is derived\n// from OBSERVABLE BEHAVIOR, not from J's held-out verdict.\n//\n// The correct discriminator is PROVENANCE, not evidence presence. A judge verdict\n// lifted into a finding (createJudgeAdapter → liftJudgeScore) is a verdict even\n// when it cites an artifact; an evidence-less trace-analyst bullet is an\n// observation even though it cites nothing. So the firewall keys on\n// `AnalystFinding.derived_from_judge` (set at the judge lift site), NOT on whether\n// evidence_refs is populated. The instant a verdict steers the next attempt it is\n// a back-channel for J and the loop Goodharts realness exactly as it would\n// Goodhart pass-rate.\n\nimport type { AnalystFinding, EvidenceRef } from './types'\n\n/** Evidence grounded in the agent's OWN execution: OTLP trace elements\n * (`span`/`event`) or the artifact it produced (`artifact`). */\nconst OBSERVABLE_KINDS: ReadonlySet<EvidenceRef['kind']> = new Set<EvidenceRef['kind']>([\n 'span',\n 'event',\n 'artifact',\n])\n\n/** DESCRIPTIVE predicate: does the finding cite at least one observable\n * (span/event/artifact) evidence ref. Useful for ranking evidence quality or\n * rendering — it is NOT the steer gate. Evidence presence is the WRONG\n * discriminator for steering: a legitimate trace-analyst observation may cite\n * nothing (it would be wrongly rejected), and a judge verdict may cite an\n * artifact (it would be wrongly admitted). Use `assertNoJudgeVerdict` to gate\n * steering; use this only where \"is this grounded in observable evidence\" is the\n * literal question. */\nexport function isTraceObservable(finding: AnalystFinding): boolean {\n return finding.evidence_refs.some((ref) => OBSERVABLE_KINDS.has(ref.kind))\n}\n\n/** True iff the finding is a JUDGE VERDICT (an acceptance score lifted into a\n * finding), identified by provenance set at the lift site — independent of\n * whatever evidence it cites. */\nexport function isJudgeVerdict(finding: AnalystFinding): boolean {\n return finding.derived_from_judge === true\n}\n\n/**\n * THE steer firewall. Fail-loud guard for any path that admits analyst findings\n * as STEERING input (the `f(trace)` role): rejects — naming the offenders — any\n * finding whose provenance is a judge verdict, rather than let `J` leak into the\n * loop. Returns the findings unchanged for chaining.\n *\n * Call this at the chokepoint where a detector that ALSO scores/gates has its\n * findings turned into a steer (the judge-and-steer dual-role case). It keys on\n * provenance, so it correctly admits evidence-less trace-analyst observations and\n * correctly rejects an artifact-citing judge verdict — the cases an evidence\n * check gets backwards.\n *\n * It is necessary, not sufficient: it stops PROVENANCE-tagged verdicts. A judge\n * whose output is laundered through a hand-built finding with no provenance flag\n * is out of its reach — provenance must be honestly set at every judge→finding\n * lift (today: createJudgeAdapter). That is why the integrity rule lives at the\n * lift site, and why ProposeContext.judgeScores?: never is the complementary\n * compile-time tripwire on the obvious direct channel.\n */\nexport function assertNoJudgeVerdict(\n findings: ReadonlyArray<AnalystFinding>,\n context = 'steer',\n): ReadonlyArray<AnalystFinding> {\n const leaks = findings.filter(isJudgeVerdict)\n if (leaks.length > 0) {\n throw new Error(\n `${context}: a judge verdict cannot be admitted as steering input — that is the ` +\n `held-out judge leaking into the loop. Offending judge-derived findings: [${leaks\n .map((f) => f.finding_id)\n .join(', ')}]. Steering consumes observations of behavior, never acceptance verdicts.`,\n )\n }\n return findings\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAuCA,IAAM,cAAc;AAIb,SAAS,aAAa,GAAmC;AAC9D,UAAQ,GAAG;AAAA,IACT,KAAK;AACH,aAAO;AAAA,IACT,KAAK;AACH,aAAO;AAAA,IACT,KAAK;AACH,aAAO;AAAA,IACT,KAAK;AACH,aAAO;AAAA,EACX;AACF;AAwBO,SAAS,0BACd,MAC6B;AAC7B,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,OAAO,QAAQ,KAAK,QAAQ,CAAC,KAAK,KAAK,IAAI,OAAU;AAAA,IACnE,SAAS,iBAAiB,WAAW;AAAA,IACrC,MAAM,QAAQ,OAAO,KAAK;AACxB,YAAM,MAAwB,CAAC;AAC/B,iBAAW,YAAY,KAAK,WAAW;AACrC,YAAI,IAAI,QAAQ,QAAS;AACzB,cAAM,SAAS,MAAM;AAAA,UACnB,EAAE,SAAS;AAAA,UACX,EAAE,QAAQ,OAAO,IAAI,KAAK,IAAI,OAAO,KAAK,OAAO,GAAG,KAAK,MAAM;AAAA,QACjE;AACA,cAAM,UAAU,IAAI,MAAM,WAAW,SAAS,MAAM,GAAG,EAAE;AAIzD,YAAI,OAAO,SAAS,WAAW,GAAG;AAChC,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA;AAAA,cACA,OAAO,OAAO,OAAO,MAAM,GAAG,GAAG;AAAA,cACjC,WAAW,OAAO;AAAA,cAClB,UAAU;AAAA,cACV,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU;AAAA,gBACR,sBAAsB,OAAO;AAAA,gBAC7B,OAAO,OAAO;AAAA,cAChB;AAAA,YACF,CAAC;AAAA,UACH;AACA;AAAA,QACF;AACA,eAAO,SAAS,QAAQ,CAAC,OAAO,MAAM;AACpC,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA;AAAA,cACA;AAAA,cACA,WAAW,MAAM,IAAI,OAAO,SAAS;AAAA,cACrC,UAAU;AAAA,cACV,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU,EAAE,UAAU,OAAO,OAAO,WAAW,eAAe,EAAE;AAAA,YAClE,CAAC;AAAA,UACH;AAAA,QACF,CAAC;AAAA,MACH;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAeO,SAAS,sBAA2B,MAA8C;AACvF,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,gBAAgB;AAAA,IAC9B,SAAS,YAAY,WAAW;AAAA,IAChC,MAAM,QAAQ,KAAK,KAAK;AACtB,YAAM,SAAS,MAAM,KAAK,SAAS,IAAI,EAAE,KAAK,GAAG,KAAK,QAAQ,CAAC;AAC/D,YAAM,MAAwB,CAAC;AAC/B,iBAAW,SAAS,OAAO,QAAQ;AACjC,mBAAW,WAAW,MAAM,UAAU;AACpC,cAAI,KAAK,iBAAiB,IAAI,MAAM,MAAM,OAAO,OAAO,CAAC;AAAA,QAC3D;AAGA,YAAI,MAAM,WAAW,UAAU,MAAM,WAAW,WAAW,MAAM,WAAW,WAAW;AACrF,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA,SAAS,MAAM;AAAA,cACf,OAAO,UAAU,MAAM,KAAK,KAAK,MAAM,MAAM,KAAK,MAAM,UAAU,iBAAiB;AAAA,cACnF,UACE,MAAM,WAAW,UAAU,SAAS,MAAM,WAAW,YAAY,WAAW;AAAA,cAC9E,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU;AAAA,gBACR,cAAc,MAAM;AAAA,gBACpB,aAAa,MAAM;AAAA,gBACnB,OAAO,MAAM;AAAA,gBACb,aAAa,MAAM;AAAA,cACrB;AAAA,YACF,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF;AACA,UAAI,MAAM,qBAAqB;AAAA,QAC7B,QAAQ,OAAO,OAAO;AAAA,QACtB,SAAS,OAAO;AAAA,QAChB,UAAU,OAAO;AAAA,MACnB,CAAC;AACD,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAEA,SAAS,iBACP,YACA,MACA,OACA,GACgB;AAChB,SAAO,YAAY;AAAA,IACjB;AAAA,IACA;AAAA,IACA,SAAS,EAAE,SAAS;AAAA,IACpB,OAAO,EAAE;AAAA,IACT,UAAU,aAAa,EAAE,QAAQ;AAAA,IACjC,YAAY;AAAA,IACZ,eAAe,EAAE,WACb,CAAC,EAAE,MAAM,YAAY,KAAK,mBAAmB,SAAS,EAAE,SAAS,CAAC,IAClE,CAAC;AAAA,IACL,UAAU,EAAE;AAAA,EACd,CAAC;AACH;AAYO,SAAS,uBAAuB,OAA6B,CAAC,GAAsB;AACzF,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,SAAS,KAAK,UAAU,IAAI,UAAU;AAC5C,QAAM,YAAY,KAAK,aAAa;AACpC,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,gBAAgB;AAAA,IAC9B,SAAS,cAAc,WAAW;AAAA,IAClC,MAAM,QAAQ,OAAO;AACnB,YAAM,QAAQ,OAAO,WAAW,KAAK;AACrC,YAAM,MAAwB,CAAC;AAC/B,YAAM,OAA6D;AAAA,QACjE,CAAC,WAAW,YAAY,mCAAmC;AAAA,QAC3D,CAAC,gBAAgB,QAAQ,sBAAsB;AAAA,QAC/C,CAAC,oBAAoB,QAAQ,6CAA6C;AAAA,QAC1E,CAAC,kBAAkB,UAAU,yBAAyB;AAAA,QACtD,CAAC,gBAAgB,UAAU,6BAA6B;AAAA,QACxD,CAAC,eAAe,QAAQ,6BAA6B;AAAA,QACrD,CAAC,aAAa,YAAY,wBAAwB;AAAA,MACpD;AACA,iBAAW,CAAC,KAAK,KAAK,GAAG,KAAK,MAAM;AAClC,cAAM,QAAQ,MAAM,GAAG;AACvB,YAAI,OAAO,UAAU,YAAY,QAAQ,WAAW;AAClD,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA,SAAS;AAAA,cACT,OAAO;AAAA,cACP,WAAW,GAAG,GAAG,IAAI,MAAM,QAAQ,CAAC,CAAC,oBAAoB,SAAS;AAAA,cAClE,UAAU;AAAA,cACV,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU,EAAE,WAAW,KAAK,OAAO,WAAW,QAAQ,MAAM,IAAI,MAAM;AAAA,YACxE,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF;AAEA,UAAI,MAAM,eAAe,IAAI,WAAW;AACtC,YAAI;AAAA,UACF,YAAY;AAAA,YACV,YAAY;AAAA,YACZ;AAAA,YACA,SAAS;AAAA,YACT,OAAO;AAAA,YACP,WAAW,gBAAgB,MAAM,aAAa,QAAQ,CAAC,CAAC;AAAA,YACxD,UAAU;AAAA,YACV,YAAY;AAAA,YACZ,eAAe,CAAC;AAAA,YAChB,UAAU,EAAE,eAAe,MAAM,cAAc,OAAO,MAAM,MAAM;AAAA,UACpE,CAAC;AAAA,QACH;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAgBO,SAAS,mBAAmB,MAA6C;AAC9E,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,YAAY,KAAK,aAAa;AACpC,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,KAAK,QAAQ,EAAE,MAAM,MAAM;AAAA,IACjC,SAAS,SAAS,WAAW;AAAA,IAC7B,MAAM,QAAQ,OAAO;AACnB,YAAM,SAAS,MAAM,KAAK,MAAM,KAAK,QAAQ,KAAK;AAClD,aAAO,OACJ,OAAO,CAAC,MAAM,YAAY,EAAE,KAAK,IAAI,SAAS,EAC9C,IAAI,CAAC,MAAM,eAAe,IAAI,MAAM,CAAC,CAAC;AAAA,IAC3C;AAAA,EACF;AACF;AAEA,SAAS,YAAY,GAAmB;AAEtC,SAAO,KAAK,IAAI,IAAI,KAAK;AAC3B;AAEA,SAAS,eAAe,YAAoB,MAAc,GAA+B;AACvF,QAAM,UAAU,YAAY,EAAE,KAAK;AACnC,QAAM,WACJ,UAAU,IAAI,aAAa,UAAU,IAAI,SAAS,UAAU,IAAI,WAAW;AAC7E,SAAO,YAAY;AAAA,IACjB;AAAA,IACA;AAAA,IACA,SAAS,EAAE;AAAA,IACX,OAAO,GAAG,EAAE,SAAS,IAAI,EAAE,SAAS,WAAW,QAAQ,QAAQ,CAAC,CAAC;AAAA,IACjE,WAAW,EAAE;AAAA,IACb;AAAA,IACA,YAAY;AAAA,IACZ,eAAe,EAAE,WACb,CAAC,EAAE,MAAM,YAAY,KAAK,mBAAmB,SAAS,EAAE,SAAS,CAAC,IAClE,CAAC;AAAA;AAAA;AAAA;AAAA;AAAA,IAKL,oBAAoB;AAAA,IACpB,UAAU,EAAE,YAAY,EAAE,WAAW,WAAW,EAAE,WAAW,UAAU,QAAQ;AAAA,EACjF,CAAC;AACH;AAUO,SAAS,kCACd,OAAwC,CAAC,GACL;AACpC,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,OAAO,QAAQ,KAAK,SAAS,QAAQ,CAAC,KAAK,QAAQ,KAAK,IAAI,OAAU;AAAA,IACpF,SAAS,GAAG,8BAA8B,YAAY,WAAW;AAAA,IACjE,MAAM,QAAQ,OAAO;AACnB,YAAM,SAAS,MAAM,wBAAwB,OAAO,KAAK,OAAO;AAChE,UAAI,CAAC,OAAO,WAAW;AACrB,eAAO;AAAA,UACL,YAAY;AAAA,YACV,YAAY;AAAA,YACZ;AAAA,YACA,OAAO;AAAA,YACP,WAAW,OAAO;AAAA,YAClB,UAAU;AAAA,YACV,YAAY;AAAA,YACZ,eAAe,CAAC;AAAA,YAChB,UAAU,EAAE,QAAQ,OAAO,MAAM;AAAA,UACnC,CAAC;AAAA,QACH;AAAA,MACF;AACA,YAAM,MAAwB,CAAC;AAC/B,iBAAW,KAAK,OAAO,UAAU;AAG/B,YAAI,EAAE,WAAW,EAAE,SAAS,EAAG;AAC/B,YAAI;AAAA,UACF,YAAY;AAAA,YACV,YAAY;AAAA,YACZ;AAAA,YACA,SAAS,EAAE;AAAA,YACX,OAAO,EAAE,UACL,YAAY,EAAE,OAAO,cAAc,EAAE,KAAK,SAC1C,YAAY,EAAE,OAAO;AAAA,YACzB,WAAW,EAAE;AAAA,YACb,UAAU,aAAa,EAAE,QAAQ;AAAA,YACjC,YAAY;AAAA,YACZ,eAAe,CAAC,EAAE,MAAM,YAAY,KAAK,mBAAmB,SAAS,EAAE,SAAS,CAAC;AAAA,YACjF,UAAU;AAAA,cACR,SAAS,EAAE;AAAA,cACX,SAAS,EAAE;AAAA,cACX,UAAU,EAAE;AAAA,cACZ,UAAU,OAAO,WAAW;AAAA,YAC9B;AAAA,UACF,CAAC;AAAA,QACH;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;;;AClZA,IAAM,mBAAqD,oBAAI,IAAyB;AAAA,EACtF;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAUM,SAAS,kBAAkB,SAAkC;AAClE,SAAO,QAAQ,cAAc,KAAK,CAAC,QAAQ,iBAAiB,IAAI,IAAI,IAAI,CAAC;AAC3E;AAKO,SAAS,eAAe,SAAkC;AAC/D,SAAO,QAAQ,uBAAuB;AACxC;AAqBO,SAAS,qBACd,UACA,UAAU,SACqB;AAC/B,QAAM,QAAQ,SAAS,OAAO,cAAc;AAC5C,MAAI,MAAM,SAAS,GAAG;AACpB,UAAM,IAAI;AAAA,MACR,GAAG,OAAO,sJACoE,MACzE,IAAI,CAAC,MAAM,EAAE,UAAU,EACvB,KAAK,IAAI,CAAC;AAAA,IACjB;AAAA,EACF;AACA,SAAO;AACT;","names":[]}
|
|
@@ -63,6 +63,13 @@ interface AuthenticityResult {
|
|
|
63
63
|
usesRealImpl: boolean;
|
|
64
64
|
realInfra: boolean;
|
|
65
65
|
wired: boolean;
|
|
66
|
+
/** The required artifact is actually referenced/imported by other (non-artifact)
|
|
67
|
+
* files — i.e. wired into the rest of the system, not dead code. Domain-agnostic:
|
|
68
|
+
* a deliverable nothing else uses is suspect in any vertical. */
|
|
69
|
+
artifactReferenced: boolean;
|
|
70
|
+
/** Convenience: the artifact is connected to the running system, via either the
|
|
71
|
+
* domain wiring signal OR a structural reference. */
|
|
72
|
+
artifactWired: boolean;
|
|
66
73
|
fakeShim: boolean;
|
|
67
74
|
/** mock/stub markers per 1000 LOC, capped at 100. */
|
|
68
75
|
mockDensity: number;
|
|
@@ -81,6 +88,7 @@ interface RealnessGate {
|
|
|
81
88
|
declare function gateRealness(r: AuthenticityResult, opts?: {
|
|
82
89
|
floor?: number;
|
|
83
90
|
requireArtifact?: boolean;
|
|
91
|
+
requireArtifactWired?: boolean;
|
|
84
92
|
}): RealnessGate;
|
|
85
93
|
interface AuthenticityNuance {
|
|
86
94
|
/** 0 (nothing mocked) … 100 (entirely mocked). */
|
|
@@ -104,5 +112,50 @@ declare function scoreAuthenticityNuance(files: readonly ProducedFile[], complet
|
|
|
104
112
|
intent?: string;
|
|
105
113
|
prioritize?: RegExp;
|
|
106
114
|
}): Promise<AuthenticityNuance>;
|
|
115
|
+
interface RealnessJudgment {
|
|
116
|
+
/** 0 (facade/simulator) … 100 (real implementation on the intended infra). */
|
|
117
|
+
isReal: number;
|
|
118
|
+
rationale: string;
|
|
119
|
+
}
|
|
120
|
+
/**
|
|
121
|
+
* Ask an LLM to rate realness DIRECTLY on a 0-100 scale — the axis that matched
|
|
122
|
+
* human blind-labels in validation (F1 0.80→0.88 on the gray band; a fakePct/
|
|
123
|
+
* hollowness proxy over-penalized "real core + stubbed periphery" partials, and a
|
|
124
|
+
* weak judge model over-flagged — use a strong one). Domain-agnostic skeleton; the
|
|
125
|
+
* consumer supplies `intent` (what the deliverable should be) and `rubric` (domain
|
|
126
|
+
* specifics of real-vs-fake). Fail-closed: a bad response reads as fully fake.
|
|
127
|
+
*/
|
|
128
|
+
declare function judgeRealnessLlm(files: readonly ProducedFile[], complete: CompleteFn, opts?: {
|
|
129
|
+
intent?: string;
|
|
130
|
+
rubric?: string;
|
|
131
|
+
prioritize?: RegExp;
|
|
132
|
+
}): Promise<RealnessJudgment>;
|
|
133
|
+
type RealnessBand = 'clean-real' | 'clean-fake' | 'gray';
|
|
134
|
+
interface BlendedRealness extends AuthenticityResult {
|
|
135
|
+
/** Final realness after (only-when-needed) LLM adjudication, 0…100. */
|
|
136
|
+
blendedRealness: number;
|
|
137
|
+
band: RealnessBand;
|
|
138
|
+
/** True iff the LLM judge was actually consulted (gray band only). */
|
|
139
|
+
consultedLlm: boolean;
|
|
140
|
+
/** Present iff the LLM was consulted. */
|
|
141
|
+
judgment?: RealnessJudgment;
|
|
142
|
+
}
|
|
143
|
+
/**
|
|
144
|
+
* Score realness using the cheapest sufficient signal: trust the deterministic
|
|
145
|
+
* scorer on the CLEAN extremes (obvious fakes / obviously-real-and-wired), and only
|
|
146
|
+
* spend an LLM call on the GRAY band — cells that look real structurally but carry
|
|
147
|
+
* fakeness markers (a fake shim, an unwired/dead artifact, high mock density) or land
|
|
148
|
+
* mid-range. This caps LLM cost at the fraction of cells static analysis can't
|
|
149
|
+
* resolve, which matters at multi-vertical / multi-partner scale.
|
|
150
|
+
*
|
|
151
|
+
* Domain-agnostic: the gray-band TRIGGER is structural; the LLM judges via the
|
|
152
|
+
* consumer-supplied `intent`. Fail-closed (a bad LLM response reads as fully fake).
|
|
153
|
+
*/
|
|
154
|
+
declare function scoreRealnessBlended(files: readonly ProducedFile[], signals: AuthenticitySignals, complete: CompleteFn, opts?: {
|
|
155
|
+
intent?: string;
|
|
156
|
+
rubric?: string;
|
|
157
|
+
grayBand?: [number, number];
|
|
158
|
+
mockGrayThreshold?: number;
|
|
159
|
+
}): Promise<BlendedRealness>;
|
|
107
160
|
|
|
108
|
-
export { type AuthenticityNuance, type AuthenticityResult, type AuthenticitySignals, type CompleteFn, type ProducedFile, type RealnessGate, gateRealness, scoreAuthenticity, scoreAuthenticityNuance };
|
|
161
|
+
export { type AuthenticityNuance, type AuthenticityResult, type AuthenticitySignals, type BlendedRealness, type CompleteFn, type ProducedFile, type RealnessBand, type RealnessGate, type RealnessJudgment, gateRealness, judgeRealnessLlm, scoreAuthenticity, scoreAuthenticityNuance, scoreRealnessBlended };
|
|
@@ -5,6 +5,33 @@ var DEFAULT_MOCK = /\bmock|\bfake|\bdummy|\bstub\b|simulat|hardcoded|placeholder
|
|
|
5
5
|
function basename(p) {
|
|
6
6
|
return p.split("/").pop() ?? p;
|
|
7
7
|
}
|
|
8
|
+
function escapeRe(s) {
|
|
9
|
+
return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
10
|
+
}
|
|
11
|
+
function declaredNames(content) {
|
|
12
|
+
const names = /* @__PURE__ */ new Set();
|
|
13
|
+
const re = /\b(?:contract|library|interface|abstract\s+contract|class|enum|struct|module|package)\s+([A-Za-z_]\w*)/g;
|
|
14
|
+
let m;
|
|
15
|
+
while (m = re.exec(content)) {
|
|
16
|
+
const name = m[1];
|
|
17
|
+
if (name && name.length >= 4) names.add(name);
|
|
18
|
+
}
|
|
19
|
+
return [...names];
|
|
20
|
+
}
|
|
21
|
+
function isArtifactReferenced(required, others) {
|
|
22
|
+
if (!required.length || !others.length) return false;
|
|
23
|
+
return required.some((rf) => {
|
|
24
|
+
const stem = rf.path.replace(/\.[^.]+$/, "");
|
|
25
|
+
const base = basename(rf.path);
|
|
26
|
+
const names = declaredNames(rf.content ?? "");
|
|
27
|
+
return others.some((o) => {
|
|
28
|
+
const c = o.content ?? "";
|
|
29
|
+
if (!c) return false;
|
|
30
|
+
if (c.includes(base) || c.includes(stem)) return true;
|
|
31
|
+
return names.some((n) => new RegExp(`\\b${escapeRe(n)}\\b`).test(c));
|
|
32
|
+
});
|
|
33
|
+
});
|
|
34
|
+
}
|
|
8
35
|
function scoreAuthenticity(files, signals) {
|
|
9
36
|
const w = {
|
|
10
37
|
artifact: signals.weights?.artifact ?? 40,
|
|
@@ -24,6 +51,8 @@ function scoreAuthenticity(files, signals) {
|
|
|
24
51
|
const usesRealImpl = signals.realImpl.test(signals.requiredArtifact ? requiredText : allText);
|
|
25
52
|
const realInfra = signals.realInfra.test(allText);
|
|
26
53
|
const wired = signals.wiring ? signals.wiring.test(otherText || allText) : false;
|
|
54
|
+
const artifactReferenced = isArtifactReferenced(required, others);
|
|
55
|
+
const artifactWired = wired || artifactReferenced;
|
|
27
56
|
const fakeShim = files.some(
|
|
28
57
|
(f) => signals.fakeShim.test(basename(f.path)) || signals.fakeShim.test(f.content ?? "")
|
|
29
58
|
);
|
|
@@ -32,6 +61,7 @@ function scoreAuthenticity(files, signals) {
|
|
|
32
61
|
) ?? []).length;
|
|
33
62
|
const loc = Math.max(1, allText.split("\n").length);
|
|
34
63
|
const mockDensity = Math.min(100, Math.round(mockHits / loc * 1e3));
|
|
64
|
+
const decorativeArtifact = requiredArtifactPresent && usesRealImpl && !artifactWired;
|
|
35
65
|
let realness = 0;
|
|
36
66
|
if (requiredArtifactPresent) realness += w.artifact;
|
|
37
67
|
if (usesRealImpl) realness += w.impl;
|
|
@@ -56,6 +86,10 @@ function scoreAuthenticity(files, signals) {
|
|
|
56
86
|
flags.push(`HIGH_MOCK_DENSITY: ${mockDensity} mock/stub markers per 1000 LOC`);
|
|
57
87
|
if (signals.wiring && requiredArtifactPresent && !wired)
|
|
58
88
|
flags.push("NOT_WIRED: artifact exists but is never used by the client");
|
|
89
|
+
if (decorativeArtifact)
|
|
90
|
+
flags.push(
|
|
91
|
+
"DEAD_ARTIFACT: required artifact is not referenced/imported anywhere \u2014 decorative or dead code"
|
|
92
|
+
);
|
|
59
93
|
return {
|
|
60
94
|
realness,
|
|
61
95
|
requiredArtifactPresent,
|
|
@@ -63,6 +97,8 @@ function scoreAuthenticity(files, signals) {
|
|
|
63
97
|
usesRealImpl,
|
|
64
98
|
realInfra,
|
|
65
99
|
wired,
|
|
100
|
+
artifactReferenced,
|
|
101
|
+
artifactWired,
|
|
66
102
|
fakeShim,
|
|
67
103
|
mockDensity,
|
|
68
104
|
flags
|
|
@@ -76,6 +112,9 @@ function gateRealness(r, opts = {}) {
|
|
|
76
112
|
if (r.fakeShim && !r.usesRealImpl) {
|
|
77
113
|
return { gated: true, reason: "fake shim with no real implementation" };
|
|
78
114
|
}
|
|
115
|
+
if (opts.requireArtifactWired && r.requiredArtifactPresent && r.usesRealImpl && !r.artifactWired) {
|
|
116
|
+
return { gated: true, reason: "required artifact present but never wired into the system" };
|
|
117
|
+
}
|
|
79
118
|
if (r.realness < floor)
|
|
80
119
|
return { gated: true, reason: `realness ${r.realness} below floor ${floor}` };
|
|
81
120
|
return { gated: false };
|
|
@@ -120,9 +159,57 @@ ${fileDigest(files, { prioritize: opts.prioritize })}`;
|
|
|
120
159
|
};
|
|
121
160
|
}
|
|
122
161
|
}
|
|
162
|
+
async function judgeRealnessLlm(files, complete, opts = {}) {
|
|
163
|
+
const system = "You are a skeptical auditor. Rate how REAL an agent's build is vs the intended deliverable, 0-100. A genuine implementation of the HARD part on the intended infrastructure is SUBSTANTIALLY REAL (>=50) even if peripheral layers are stubbed; a pure simulator / facade / branded-type stand-in / no-op-stubbed dependency with no real implementation is FAKE (<=25). Judge the core on its merits and note the runtime. " + (opts.rubric ? `Domain rubric: ${opts.rubric} ` : "") + 'Respond with ONLY JSON: {"isReal":0-100,"why":"one sentence"}.';
|
|
164
|
+
const user = (opts.intent ? `Intended deliverable: ${opts.intent}
|
|
165
|
+
|
|
166
|
+
` : "") + `Produced files:
|
|
167
|
+
${fileDigest(files, { prioritize: opts.prioritize })}`;
|
|
168
|
+
try {
|
|
169
|
+
const raw = await complete(system, user);
|
|
170
|
+
const m = raw.match(/\{[\s\S]*\}/);
|
|
171
|
+
if (!m) return { isReal: 0, rationale: "unparseable judge response" };
|
|
172
|
+
const j = JSON.parse(m[0]);
|
|
173
|
+
return {
|
|
174
|
+
isReal: clampPct(j.isReal),
|
|
175
|
+
rationale: typeof j.why === "string" ? j.why : ""
|
|
176
|
+
};
|
|
177
|
+
} catch (err) {
|
|
178
|
+
return {
|
|
179
|
+
isReal: 0,
|
|
180
|
+
rationale: `judge error: ${err instanceof Error ? err.message : String(err)}`
|
|
181
|
+
};
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
async function scoreRealnessBlended(files, signals, complete, opts = {}) {
|
|
185
|
+
const det = scoreAuthenticity(files, signals);
|
|
186
|
+
const [lo, hi] = opts.grayBand ?? [30, 70];
|
|
187
|
+
const mockGray = opts.mockGrayThreshold ?? 8;
|
|
188
|
+
const conflict = det.requiredArtifactPresent && det.usesRealImpl && (det.fakeShim || !det.wired || det.mockDensity >= mockGray);
|
|
189
|
+
const midRange = det.realness >= lo && det.realness <= hi;
|
|
190
|
+
let band;
|
|
191
|
+
if (conflict || midRange) band = "gray";
|
|
192
|
+
else if (det.realness < lo) band = "clean-fake";
|
|
193
|
+
else band = "clean-real";
|
|
194
|
+
if (band !== "gray") {
|
|
195
|
+
return { ...det, blendedRealness: det.realness, band, consultedLlm: false };
|
|
196
|
+
}
|
|
197
|
+
const judgment = await judgeRealnessLlm(files, complete, {
|
|
198
|
+
intent: opts.intent,
|
|
199
|
+
rubric: opts.rubric,
|
|
200
|
+
prioritize: signals.requiredArtifact
|
|
201
|
+
});
|
|
202
|
+
const blendedRealness = Math.max(
|
|
203
|
+
0,
|
|
204
|
+
Math.min(100, Math.round(0.25 * det.realness + 0.75 * judgment.isReal))
|
|
205
|
+
);
|
|
206
|
+
return { ...det, blendedRealness, band, consultedLlm: true, judgment };
|
|
207
|
+
}
|
|
123
208
|
export {
|
|
124
209
|
gateRealness,
|
|
210
|
+
judgeRealnessLlm,
|
|
125
211
|
scoreAuthenticity,
|
|
126
|
-
scoreAuthenticityNuance
|
|
212
|
+
scoreAuthenticityNuance,
|
|
213
|
+
scoreRealnessBlended
|
|
127
214
|
};
|
|
128
215
|
//# sourceMappingURL=index.js.map
|