@tangle-network/agent-eval 0.77.0 → 0.79.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/dist/adapters/http.d.ts +2 -2
  2. package/dist/adapters/langchain.d.ts +2 -2
  3. package/dist/adapters/otel.d.ts +4 -4
  4. package/dist/{agent-profile-DYRboYWu.d.ts → agent-profile-aSEaJ9Pl.d.ts} +1 -1
  5. package/dist/analyst/index.d.ts +42 -8
  6. package/dist/analyst/index.js +32 -2
  7. package/dist/analyst/index.js.map +1 -1
  8. package/dist/authenticity/index.d.ts +54 -1
  9. package/dist/authenticity/index.js +88 -1
  10. package/dist/authenticity/index.js.map +1 -1
  11. package/dist/benchmarks/index.d.ts +2 -2
  12. package/dist/campaign/index.d.ts +11 -11
  13. package/dist/campaign/index.js +4 -4
  14. package/dist/{chunk-7W4SM7FD.js → chunk-5LVWPNS5.js} +91 -91
  15. package/dist/chunk-5LVWPNS5.js.map +1 -0
  16. package/dist/{chunk-WYIHD6EB.js → chunk-CF67I6QY.js} +1 -1
  17. package/dist/chunk-CF67I6QY.js.map +1 -0
  18. package/dist/{chunk-XPILG2CA.js → chunk-GXHLRXDI.js} +2 -2
  19. package/dist/{chunk-F3SRAAZO.js → chunk-KWRRMR3J.js} +15 -1
  20. package/dist/chunk-KWRRMR3J.js.map +1 -0
  21. package/dist/{chunk-JYE3WOTE.js → chunk-RPLZ4OIB.js} +10 -1
  22. package/dist/chunk-RPLZ4OIB.js.map +1 -0
  23. package/dist/{chunk-6EKXFFGQ.js → chunk-RTWFUK6A.js} +2 -2
  24. package/dist/{chunk-XGNCBAVZ.js → chunk-XQL22JDG.js} +2 -2
  25. package/dist/{chunk-GJJNJVIR.js → chunk-XXNIODOM.js} +2 -2
  26. package/dist/contract/index.d.ts +12 -12
  27. package/dist/contract/index.js +2 -2
  28. package/dist/{control-BgA6BYTm.d.ts → control-CehLtoET.d.ts} +1 -1
  29. package/dist/control.d.ts +2 -2
  30. package/dist/control.js +2 -2
  31. package/dist/hosted/index.d.ts +4 -4
  32. package/dist/{index-DsnOpCO6.d.ts → index-B1RKber3.d.ts} +1 -1
  33. package/dist/index.d.ts +126 -25
  34. package/dist/index.js +32 -7
  35. package/dist/index.js.map +1 -1
  36. package/dist/{insight-report-Df3lxYXM.d.ts → insight-report-dlpEzQDi.d.ts} +1 -1
  37. package/dist/{kind-factory-DW9XWPvM.d.ts → kind-factory-DqV2t1Xk.d.ts} +1 -1
  38. package/dist/meta-eval/index.d.ts +2 -2
  39. package/dist/openapi.json +1 -1
  40. package/dist/{provenance-B-TFszPW.d.ts → provenance-CEAJI9rm.d.ts} +3 -3
  41. package/dist/{registry-DuVYiTvw.d.ts → registry-BmEuU94S.d.ts} +2 -2
  42. package/dist/{release-report-CN8hJlhk.d.ts → release-report-CXXZlR8g.d.ts} +2 -2
  43. package/dist/reporting.d.ts +4 -4
  44. package/dist/{researcher-C_KJyIGg.d.ts → researcher-rInLj9De.d.ts} +2 -2
  45. package/dist/rl.d.ts +6 -6
  46. package/dist/rl.js +2 -2
  47. package/dist/{rubric-predictive-validity-D_4BSXGV.d.ts → rubric-predictive-validity-CWyWWLBg.d.ts} +1 -1
  48. package/dist/{run-improvement-loop-BqYH2vCR.d.ts → run-improvement-loop-Bgu4C59E.d.ts} +2 -4
  49. package/dist/{run-record-BgTFzO2r.d.ts → run-record-sItO5ftF.d.ts} +11 -0
  50. package/dist/{semantic-concept-judge-CV9Wlx4t.d.ts → semantic-concept-judge-Du4ZVyef.d.ts} +3 -3
  51. package/dist/{summary-report-ByiOUrHj.d.ts → summary-report-BTaXq1TS.d.ts} +1 -1
  52. package/dist/traces.d.ts +1 -1
  53. package/dist/traces.js +2 -2
  54. package/dist/{types-CRD68aH7.d.ts → types-DRvV0zRo.d.ts} +10 -1
  55. package/dist/{types-Bba0vl1V.d.ts → types-QHG0KnkF.d.ts} +11 -3
  56. package/dist/workflow/index.d.ts +4 -4
  57. package/dist/workflow/index.js +1 -1
  58. package/docs/auto-research-loop-end-to-end.md +1 -1
  59. package/docs/feature-guide.md +4 -4
  60. package/docs/multi-shot-optimization.md +61 -115
  61. package/docs/product-eval-adoption.md +1 -1
  62. package/docs/three-package-architecture.md +1 -1
  63. package/docs/trace-analysis.md +19 -0
  64. package/package.json +1 -1
  65. package/dist/chunk-7W4SM7FD.js.map +0 -1
  66. package/dist/chunk-F3SRAAZO.js.map +0 -1
  67. package/dist/chunk-JYE3WOTE.js.map +0 -1
  68. package/dist/chunk-WYIHD6EB.js.map +0 -1
  69. /package/dist/{chunk-XPILG2CA.js.map → chunk-GXHLRXDI.js.map} +0 -0
  70. /package/dist/{chunk-6EKXFFGQ.js.map → chunk-RTWFUK6A.js.map} +0 -0
  71. /package/dist/{chunk-XGNCBAVZ.js.map → chunk-XQL22JDG.js.map} +0 -0
  72. /package/dist/{chunk-GJJNJVIR.js.map → chunk-XXNIODOM.js.map} +0 -0
@@ -1,5 +1,5 @@
1
- import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-Bba0vl1V.js';
2
- import '../run-record-BgTFzO2r.js';
1
+ import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-QHG0KnkF.js';
2
+ import '../run-record-sItO5ftF.js';
3
3
  import '../errors-Dwqw-T_m.js';
4
4
  import '../schema-m0gsnbt3.js';
5
5
 
@@ -1,5 +1,5 @@
1
- import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-Bba0vl1V.js';
2
- import '../run-record-BgTFzO2r.js';
1
+ import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-QHG0KnkF.js';
2
+ import '../run-record-sItO5ftF.js';
3
3
  import '../errors-Dwqw-T_m.js';
4
4
  import '../schema-m0gsnbt3.js';
5
5
 
@@ -1,10 +1,10 @@
1
1
  import { TraceSpanEvent, HostedClient } from '../hosted/index.js';
2
- import '../types-Bba0vl1V.js';
3
- import '../run-record-BgTFzO2r.js';
2
+ import '../types-QHG0KnkF.js';
3
+ import '../run-record-sItO5ftF.js';
4
4
  import '../errors-Dwqw-T_m.js';
5
5
  import '../schema-m0gsnbt3.js';
6
- import '../insight-report-Df3lxYXM.js';
7
- import '../summary-report-ByiOUrHj.js';
6
+ import '../insight-report-dlpEzQDi.js';
7
+ import '../summary-report-BTaXq1TS.js';
8
8
  import '../failure-cluster-CL7IVgkJ.js';
9
9
  import '../store-CKUAgsJz.js';
10
10
  import '../judge-calibration-DilmB3Ml.js';
@@ -1,5 +1,5 @@
1
1
  import { A as AgentEvalError } from './errors-Dwqw-T_m.js';
2
- import { R as RunRecord } from './run-record-BgTFzO2r.js';
2
+ import { R as RunRecord } from './run-record-sItO5ftF.js';
3
3
  import { TCloud } from '@tangle-network/tcloud';
4
4
 
5
5
  /**
@@ -1,21 +1,21 @@
1
1
  import { AxAIService, AxFunction } from '@ax-llm/ax';
2
2
  import { M as MultiLayerVerifier, V as VerifyOptions, S as Severity } from '../multi-layer-verifier-DlWCXuxL.js';
3
3
  import { c as RunCritic, a as RunTrace } from '../run-critic-BAIjX99r.js';
4
- import { S as SemanticConceptJudgeOptions, a as SemanticConceptJudgeInput, B as BehavioralMetrics } from '../semantic-concept-judge-CV9Wlx4t.js';
5
- export { C as CreateAnalystAiConfig, D as DEFAULT_TRACE_ANALYST_KINDS, b as DefaultAnalystRegistryOptions, c as DiffPolicy, F as FAILURE_MODE_KIND_SPEC, d as FINDING_SUBJECT_GRAMMAR_PROMPT, e as FINDING_SUBJECT_KINDS, f as FindingSubject, g as FindingSubjectKind, h as FindingSubjectStringSchema, i as FindingsDiff, j as FindingsStore, I as IMPROVEMENT_KIND_SPEC, K as KIND_EXPECTED_SUBJECTS, k as KNOWLEDGE_GAP_KIND_SPEC, l as KNOWLEDGE_POISONING_KIND_SPEC, P as PersistedFinding, m as SKILL_USAGE_ANALYST, n as SkillUsageAnalyst, o as SkillUsageRecord, p as SkillUsageReport, q as SkillUsageScanConfig, r as buildDefaultAnalystRegistry, s as buildSkillUsageReport, t as createAnalystAi, u as defaultIsMaterial, v as diffFindings, w as emitSkillUsageFindings, x as parseFindingSubject, y as renderFindingSubject } from '../semantic-concept-judge-CV9Wlx4t.js';
4
+ import { S as SemanticConceptJudgeOptions, a as SemanticConceptJudgeInput, B as BehavioralMetrics } from '../semantic-concept-judge-Du4ZVyef.js';
5
+ export { C as CreateAnalystAiConfig, D as DEFAULT_TRACE_ANALYST_KINDS, b as DefaultAnalystRegistryOptions, c as DiffPolicy, F as FAILURE_MODE_KIND_SPEC, d as FINDING_SUBJECT_GRAMMAR_PROMPT, e as FINDING_SUBJECT_KINDS, f as FindingSubject, g as FindingSubjectKind, h as FindingSubjectStringSchema, i as FindingsDiff, j as FindingsStore, I as IMPROVEMENT_KIND_SPEC, K as KIND_EXPECTED_SUBJECTS, k as KNOWLEDGE_GAP_KIND_SPEC, l as KNOWLEDGE_POISONING_KIND_SPEC, P as PersistedFinding, m as SKILL_USAGE_ANALYST, n as SkillUsageAnalyst, o as SkillUsageRecord, p as SkillUsageReport, q as SkillUsageScanConfig, r as buildDefaultAnalystRegistry, s as buildSkillUsageReport, t as createAnalystAi, u as defaultIsMaterial, v as diffFindings, w as emitSkillUsageFindings, x as parseFindingSubject, y as renderFindingSubject } from '../semantic-concept-judge-Du4ZVyef.js';
6
6
  import { A as AnalyzeTracesOptions } from '../analyst-t7zZS3TV.js';
7
7
  import { T as TraceAnalysisStore } from '../store-GmBE2pZZ.js';
8
8
  import { b as JudgeFn, a as JudgeInput } from '../types-Croy5h7V.js';
9
- import { A as Analyst, h as AnalystSeverity, c as AnalystFinding } from '../types-CRD68aH7.js';
10
- export { a as AnalystContext, g as AnalystCost, i as AnalystInputKind, j as AnalystRequirements, f as AnalystRunEvent, e as AnalystRunInputs, d as AnalystRunResult, b as AnalystRunSummary, k as ChatCallOpts, C as ChatClient, l as ChatRequest, m as ChatResponse, n as ChatTransport, o as CliBridgeTransportOpts, p as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RouterTransportOpts, S as SandboxSdkTransportOpts, q as computeFindingId, r as createChatClient, s as makeFinding } from '../types-CRD68aH7.js';
9
+ import { A as Analyst, h as AnalystSeverity, c as AnalystFinding } from '../types-DRvV0zRo.js';
10
+ export { a as AnalystContext, g as AnalystCost, i as AnalystInputKind, j as AnalystRequirements, f as AnalystRunEvent, e as AnalystRunInputs, d as AnalystRunResult, b as AnalystRunSummary, k as ChatCallOpts, C as ChatClient, l as ChatRequest, m as ChatResponse, n as ChatTransport, o as CliBridgeTransportOpts, p as CreateChatClientOpts, D as DirectProviderTransportOpts, E as EvidenceRef, M as MockTransportOpts, R as RouterTransportOpts, S as SandboxSdkTransportOpts, q as computeFindingId, r as createChatClient, s as makeFinding } from '../types-DRvV0zRo.js';
11
11
  import { TCloud } from '@tangle-network/tcloud';
12
- export { A as ANALYST_SEVERITIES, C as CreateTraceAnalystKindOpts, R as RAW_FINDING_SCHEMA_PROMPT, a as RawAnalystFinding, b as RawAnalystFindingSchema, T as TraceAnalystGolden, c as TraceAnalystKindSpec, d as createTraceAnalystKind, p as parseRawFinding, r as renderPriorFindings } from '../kind-factory-DW9XWPvM.js';
13
- export { A as AnalystHooks, a as AnalystRegistry, b as AnalystRegistryOptions, B as BudgetPolicy, R as RegistryRunOpts } from '../registry-DuVYiTvw.js';
12
+ export { A as ANALYST_SEVERITIES, C as CreateTraceAnalystKindOpts, R as RAW_FINDING_SCHEMA_PROMPT, a as RawAnalystFinding, b as RawAnalystFindingSchema, T as TraceAnalystGolden, c as TraceAnalystKindSpec, d as createTraceAnalystKind, p as parseRawFinding, r as renderPriorFindings } from '../kind-factory-DqV2t1Xk.js';
13
+ export { a as AnalystHooks, A as AnalystRegistry, b as AnalystRegistryOptions, B as BudgetPolicy, R as RegistryRunOpts } from '../registry-BmEuU94S.js';
14
14
  import { L as LlmClientOptions } from '../llm-client-DbjLfz-K.js';
15
15
  import '../schema-m0gsnbt3.js';
16
16
  import '../store-CKUAgsJz.js';
17
17
  import 'zod';
18
- import '../run-record-BgTFzO2r.js';
18
+ import '../run-record-sItO5ftF.js';
19
19
  import '../errors-Dwqw-T_m.js';
20
20
  import '../raw-provider-sink-C46HDghv.js';
21
21
 
@@ -149,6 +149,40 @@ declare function coerceJson(text: string): unknown;
149
149
  */
150
150
  declare function coerceToFindingRows(raw: unknown): unknown[];
151
151
 
152
+ /** DESCRIPTIVE predicate: does the finding cite at least one observable
153
+ * (span/event/artifact) evidence ref. Useful for ranking evidence quality or
154
+ * rendering — it is NOT the steer gate. Evidence presence is the WRONG
155
+ * discriminator for steering: a legitimate trace-analyst observation may cite
156
+ * nothing (it would be wrongly rejected), and a judge verdict may cite an
157
+ * artifact (it would be wrongly admitted). Use `assertNoJudgeVerdict` to gate
158
+ * steering; use this only where "is this grounded in observable evidence" is the
159
+ * literal question. */
160
+ declare function isTraceObservable(finding: AnalystFinding): boolean;
161
+ /** True iff the finding is a JUDGE VERDICT (an acceptance score lifted into a
162
+ * finding), identified by provenance set at the lift site — independent of
163
+ * whatever evidence it cites. */
164
+ declare function isJudgeVerdict(finding: AnalystFinding): boolean;
165
+ /**
166
+ * THE steer firewall. Fail-loud guard for any path that admits analyst findings
167
+ * as STEERING input (the `f(trace)` role): rejects — naming the offenders — any
168
+ * finding whose provenance is a judge verdict, rather than let `J` leak into the
169
+ * loop. Returns the findings unchanged for chaining.
170
+ *
171
+ * Call this at the chokepoint where a detector that ALSO scores/gates has its
172
+ * findings turned into a steer (the judge-and-steer dual-role case). It keys on
173
+ * provenance, so it correctly admits evidence-less trace-analyst observations and
174
+ * correctly rejects an artifact-citing judge verdict — the cases an evidence
175
+ * check gets backwards.
176
+ *
177
+ * It is necessary, not sufficient: it stops PROVENANCE-tagged verdicts. A judge
178
+ * whose output is laundered through a hand-built finding with no provenance flag
179
+ * is out of its reach — provenance must be honestly set at every judge→finding
180
+ * lift (today: createJudgeAdapter). That is why the integrity rule lives at the
181
+ * lift site, and why ProposeContext.judgeScores?: never is the complementary
182
+ * compile-time tripwire on the obvious direct channel.
183
+ */
184
+ declare function assertNoJudgeVerdict(findings: ReadonlyArray<AnalystFinding>, context?: string): ReadonlyArray<AnalystFinding>;
185
+
152
186
  /**
153
187
  * `structureFindings` — the deferred structuring pass (DSPy TwoStepAdapter /
154
188
  * HALO `synthesize_traces` analog). The agentic actor reasons FREE-FORM and
@@ -218,4 +252,4 @@ type TraceToolGroupName =
218
252
  */
219
253
  declare function buildTraceToolsForGroup(group: TraceToolGroupName, store: TraceAnalysisStore): AxFunction[];
220
254
 
221
- export { Analyst, AnalystFinding, AnalystSeverity, type JudgeAdapterOpts, type RunCriticAdapterOpts, type SemanticConceptJudgeAdapterOpts, type StructureFindingsOptions, type StructureFindingsResult, type TraceAnalystAdapterOpts, type TraceToolGroupName, type VerifierAdapterOpts, behavioralAnalyst, buildTraceToolsForGroup, coerceJson, coerceToFindingRows, createJudgeAdapter, createRunCriticAdapter, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createVerifierAdapter, deriveEfficiencyFindings, liftSeverity, stripCodeFences, structureFindings };
255
+ export { Analyst, AnalystFinding, AnalystSeverity, type JudgeAdapterOpts, type RunCriticAdapterOpts, type SemanticConceptJudgeAdapterOpts, type StructureFindingsOptions, type StructureFindingsResult, type TraceAnalystAdapterOpts, type TraceToolGroupName, type VerifierAdapterOpts, assertNoJudgeVerdict, behavioralAnalyst, buildTraceToolsForGroup, coerceJson, coerceToFindingRows, createJudgeAdapter, createRunCriticAdapter, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createVerifierAdapter, deriveEfficiencyFindings, isJudgeVerdict, isTraceObservable, liftSeverity, stripCodeFences, structureFindings };
@@ -14,7 +14,7 @@ import {
14
14
  diffFindings,
15
15
  emitSkillUsageFindings,
16
16
  runSemanticConceptJudge
17
- } from "../chunk-7W4SM7FD.js";
17
+ } from "../chunk-5LVWPNS5.js";
18
18
  import {
19
19
  ANALYST_SEVERITIES,
20
20
  AnalystRegistry,
@@ -41,7 +41,7 @@ import {
41
41
  renderPriorFindings,
42
42
  stripCodeFences,
43
43
  structureFindings
44
- } from "../chunk-WYIHD6EB.js";
44
+ } from "../chunk-CF67I6QY.js";
45
45
  import "../chunk-IHDHUN2X.js";
46
46
  import {
47
47
  analyzeTraces
@@ -269,6 +269,11 @@ function liftJudgeScore(analyst_id, area, s) {
269
269
  severity,
270
270
  confidence: 0.8,
271
271
  evidence_refs: s.evidence ? [{ kind: "artifact", uri: "inline:evidence", excerpt: s.evidence }] : [],
272
+ // Provenance: this finding IS a judge verdict (an acceptance score), not an
273
+ // observation of behavior. The steer firewall (assertNoJudgeVerdict) rejects
274
+ // it from steering — even when it cites an artifact above — because letting a
275
+ // verdict steer the next attempt is the held-out judge leaking into the loop.
276
+ derived_from_judge: true,
272
277
  metadata: { judge_name: s.judgeName, dimension: s.dimension, score_10: score10 }
273
278
  });
274
279
  }
@@ -323,6 +328,28 @@ function createSemanticConceptJudgeAdapter(opts = {}) {
323
328
  }
324
329
  };
325
330
  }
331
+
332
+ // src/analyst/steer-firewall.ts
333
+ var OBSERVABLE_KINDS = /* @__PURE__ */ new Set([
334
+ "span",
335
+ "event",
336
+ "artifact"
337
+ ]);
338
+ function isTraceObservable(finding) {
339
+ return finding.evidence_refs.some((ref) => OBSERVABLE_KINDS.has(ref.kind));
340
+ }
341
+ function isJudgeVerdict(finding) {
342
+ return finding.derived_from_judge === true;
343
+ }
344
+ function assertNoJudgeVerdict(findings, context = "steer") {
345
+ const leaks = findings.filter(isJudgeVerdict);
346
+ if (leaks.length > 0) {
347
+ throw new Error(
348
+ `${context}: a judge verdict cannot be admitted as steering input \u2014 that is the held-out judge leaking into the loop. Offending judge-derived findings: [${leaks.map((f) => f.finding_id).join(", ")}]. Steering consumes observations of behavior, never acceptance verdicts.`
349
+ );
350
+ }
351
+ return findings;
352
+ }
326
353
  export {
327
354
  ANALYST_SEVERITIES,
328
355
  AnalystRegistry,
@@ -340,6 +367,7 @@ export {
340
367
  RawAnalystFindingSchema,
341
368
  SKILL_USAGE_ANALYST,
342
369
  SkillUsageAnalyst,
370
+ assertNoJudgeVerdict,
343
371
  behavioralAnalyst,
344
372
  buildDefaultAnalystRegistry,
345
373
  buildSkillUsageReport,
@@ -359,6 +387,8 @@ export {
359
387
  deriveEfficiencyFindings,
360
388
  diffFindings,
361
389
  emitSkillUsageFindings,
390
+ isJudgeVerdict,
391
+ isTraceObservable,
362
392
  liftSeverity,
363
393
  makeFinding,
364
394
  parseFindingSubject,
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/analyst/adapters.ts"],"sourcesContent":["/**\n * Adapter factories — lift each existing agent-eval primitive into the\n * Analyst contract without re-implementing it.\n *\n * Five primitives, five factories. Each one:\n * - Builds an Analyst with a stable id (caller chooses; defaults\n * given), a sensible default `inputKind`, a version derived from\n * the wrapped primitive's version + an adapter revision, and an\n * `analyze()` that calls the primitive and lifts its output to\n * AnalystFinding[] using `makeFinding()`.\n * - Maps severities: the existing `Severity` ('critical' | 'major' |\n * 'minor' | 'info') projects onto AnalystSeverity ('critical' |\n * 'high' | 'medium' | 'low' | 'info'); 'major' → 'high', 'minor' →\n * 'medium'. Domain analysts that want finer-grained mapping override.\n *\n * Adapters never own state. Calling the same factory twice with the\n * same primitive instance is safe.\n */\n\nimport type { AxAIService } from '@ax-llm/ax'\nimport type {\n Finding as LayerFinding,\n Severity as LayerSeverity,\n MultiLayerVerifier,\n VerifyOptions,\n} from '../multi-layer-verifier'\nimport { RunCritic, type RunTrace } from '../run-critic'\nimport {\n runSemanticConceptJudge,\n SEMANTIC_CONCEPT_JUDGE_VERSION,\n type SemanticConceptJudgeInput,\n type SemanticConceptJudgeOptions,\n} from '../semantic-concept-judge'\nimport { type AnalyzeTracesOptions, analyzeTraces } from '../trace-analyst/analyst'\nimport type { TraceAnalysisStore } from '../trace-analyst/store'\nimport type { JudgeFn, JudgeInput, JudgeScore, TCloud } from '../types'\nimport type { Analyst, AnalystFinding, AnalystSeverity } from './types'\nimport { makeFinding } from './types'\n\nconst ADAPTER_REV = '1'\n\n// ── Severity bridges ───────────────────────────────────────────────\n\nexport function liftSeverity(s: LayerSeverity): AnalystSeverity {\n switch (s) {\n case 'critical':\n return 'critical'\n case 'major':\n return 'high'\n case 'minor':\n 
return 'medium'\n case 'info':\n return 'info'\n }\n}\n\n// ── 1. analyzeTraces → Analyst ─────────────────────────────────────\n\nexport interface TraceAnalystAdapterOpts {\n id?: string\n area?: string\n /** The natural-language question(s) put to the analyst. One finding per question. */\n questions: string[]\n /** Caller-provided AxAI service — same one trace-analyst.ts expects. */\n ai: AxAIService\n model?: string\n /** Forwarded to analyzeTraces. */\n extra?: Omit<AnalyzeTracesOptions, 'source' | 'ai' | 'model'>\n}\n\n/**\n * @deprecated Prefer `createTraceAnalystKind` + one of the failure /\n * improvement kinds from `./kinds`. This adapter wraps the legacy\n * `analyzeTraces` flow whose output is `findings:string[]` — every\n * bullet gets flat-defaulted severity `medium` / confidence `0.6`,\n * which loses the per-finding grading kinds provide via Ax structured\n * output + Zod validation. Kept for one minor while consumers migrate.\n */\nexport function createTraceAnalystAdapter(\n opts: TraceAnalystAdapterOpts,\n): Analyst<TraceAnalysisStore> {\n const id = opts.id ?? 'trace-analyst'\n const area = opts.area ?? 'agent-reasoning'\n return {\n id,\n description:\n 'Runs the agent-eval trace analyst over an OTLP trace store and lifts its bulleted findings.',\n inputKind: 'trace-store',\n cost: { kind: 'llm', models: opts.model ? [opts.model] : undefined },\n version: `trace-analyst-${ADAPTER_REV}`,\n async analyze(store, ctx) {\n const out: AnalystFinding[] = []\n for (const question of opts.questions) {\n if (ctx.signal?.aborted) break\n const result = await analyzeTraces(\n { question },\n { source: store, ai: opts.ai, model: opts.model, ...opts.extra },\n )\n const subject = ctx.tags?.subject ?? question.slice(0, 60)\n // The responder produces a list of bullet strings. 
Each becomes\n // one finding; the prose answer is attached as rationale on the\n // first (so renderers that show only top-N still get context).\n if (result.findings.length === 0) {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject,\n claim: result.answer.slice(0, 200),\n rationale: result.answer,\n severity: 'info',\n confidence: 0.5,\n evidence_refs: [],\n metadata: {\n actor_prompt_version: result.actorPromptVersion,\n turns: result.turnCount,\n },\n }),\n )\n continue\n }\n result.findings.forEach((claim, i) => {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject,\n claim,\n rationale: i === 0 ? result.answer : undefined,\n severity: 'medium',\n confidence: 0.6,\n evidence_refs: [],\n metadata: { question, turns: result.turnCount, finding_index: i },\n }),\n )\n })\n }\n return out\n },\n }\n}\n\n// ── 2. MultiLayerVerifier → Analyst ─────────────────────────────────\n\nexport interface VerifierAdapterOpts<Env> {\n id?: string\n area?: string\n verifier: MultiLayerVerifier<Env>\n /**\n * The verifier expects an `env` per run. Adapters take it from\n * `AnalystRunInputs.custom[<id>]` via the registry's 'custom' routing.\n */\n options?: Omit<VerifyOptions<Env>, 'env'>\n}\n\nexport function createVerifierAdapter<Env>(opts: VerifierAdapterOpts<Env>): Analyst<Env> {\n const id = opts.id ?? 'multi-layer-verifier'\n const area = opts.area ?? 
'verification'\n return {\n id,\n description:\n \"Runs a MultiLayerVerifier and lifts each layer's findings into the analyst envelope.\",\n inputKind: 'custom',\n cost: { kind: 'deterministic' },\n version: `verifier-${ADAPTER_REV}`,\n async analyze(env, ctx) {\n const report = await opts.verifier.run({ env, ...opts.options })\n const out: AnalystFinding[] = []\n for (const layer of report.layers) {\n for (const finding of layer.findings) {\n out.push(liftLayerFinding(id, area, layer.layer, finding))\n }\n // Layer-level signal: a failed/error layer is itself a finding\n // even if it didn't emit per-finding rows.\n if (layer.status === 'fail' || layer.status === 'error' || layer.status === 'timeout') {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: layer.layer,\n claim: `layer \"${layer.layer}\" ${layer.status}: ${layer.reason ?? 'no reason given'}`,\n severity:\n layer.status === 'error' ? 'high' : layer.status === 'timeout' ? 'medium' : 'high',\n confidence: 1,\n evidence_refs: [],\n metadata: {\n layer_status: layer.status,\n duration_ms: layer.durationMs,\n score: layer.score,\n diagnostics: layer.diagnostics,\n },\n }),\n )\n }\n }\n ctx.log?.('verifier complete', {\n layers: report.layers.length,\n blended: report.blendedScore,\n all_pass: report.allPass,\n })\n return out\n },\n }\n}\n\nfunction liftLayerFinding(\n analyst_id: string,\n area: string,\n layer: string,\n f: LayerFinding,\n): AnalystFinding {\n return makeFinding({\n analyst_id,\n area,\n subject: f.layer ?? layer,\n claim: f.message,\n severity: liftSeverity(f.severity),\n confidence: 0.85,\n evidence_refs: f.evidence\n ? [{ kind: 'artifact', uri: 'inline:evidence', excerpt: f.evidence }]\n : [],\n metadata: f.detail,\n })\n}\n\n// ── 3. RunCritic → Analyst ──────────────────────────────────────────\n\nexport interface RunCriticAdapterOpts {\n id?: string\n area?: string\n critic?: RunCritic\n /** Optional threshold below which a dimension is reported as a finding. 
Default 0.5. */\n threshold?: number\n}\n\nexport function createRunCriticAdapter(opts: RunCriticAdapterOpts = {}): Analyst<RunTrace> {\n const id = opts.id ?? 'run-critic'\n const area = opts.area ?? 'run-quality'\n const critic = opts.critic ?? new RunCritic()\n const threshold = opts.threshold ?? 0.5\n return {\n id,\n description:\n 'Scores a single run across success / grounding / drift / tool-quality and surfaces below-threshold dimensions.',\n inputKind: 'custom',\n cost: { kind: 'deterministic' },\n version: `run-critic-${ADAPTER_REV}`,\n async analyze(trace) {\n const score = critic.scoreTrace(trace)\n const out: AnalystFinding[] = []\n const dims: Array<[keyof typeof score, AnalystSeverity, string]> = [\n ['success', 'critical', 'run did not complete successfully'],\n ['goalProgress', 'high', 'goal progress is low'],\n ['repoGroundedness', 'high', 'output is poorly grounded in the repository'],\n ['toolUseQuality', 'medium', 'tool use quality is low'],\n ['patchQuality', 'medium', 'no real patch/edit evidence'],\n ['testReality', 'high', 'no real test/build evidence'],\n ['finalGate', 'critical', 'final gate is blocking'],\n ]\n for (const [dim, sev, msg] of dims) {\n const value = score[dim] as number\n if (typeof value === 'number' && value < threshold) {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: dim,\n claim: msg,\n rationale: `${dim}=${value.toFixed(2)} below threshold ${threshold}`,\n severity: sev,\n confidence: 1,\n evidence_refs: [],\n metadata: { dimension: dim, value, threshold, run_id: trace.run.runId },\n }),\n )\n }\n }\n // Drift penalty is high → surface as a finding (inverse threshold).\n if (score.driftPenalty > 1 - threshold) {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: 'drift',\n claim: 'agent output drifted from repository signal',\n rationale: `driftPenalty=${score.driftPenalty.toFixed(2)}`,\n severity: 'medium',\n confidence: 0.9,\n evidence_refs: [],\n metadata: { drift_penalty: 
score.driftPenalty, notes: score.notes },\n }),\n )\n }\n return out\n },\n }\n}\n\n// ── 4. JudgeFn → Analyst ────────────────────────────────────────────\n\nexport interface JudgeAdapterOpts {\n id?: string\n area?: string\n judge: JudgeFn\n /** TCloud handle the JudgeFn calls. */\n tcloud: TCloud\n /** Optional cost classification — most judges call an LLM. */\n cost?: Analyst['cost']\n /** Optional threshold below which a JudgeScore becomes a finding. Default 6 (on 0-10 scale). */\n threshold?: number\n}\n\nexport function createJudgeAdapter(opts: JudgeAdapterOpts): Analyst<JudgeInput> {\n const id = opts.id ?? 'judge'\n const area = opts.area ?? 'judge'\n const threshold = opts.threshold ?? 6\n return {\n id,\n description:\n 'Wraps an agent-eval JudgeFn into an analyst; below-threshold dimensions surface as findings.',\n inputKind: 'judge-input',\n cost: opts.cost ?? { kind: 'llm' },\n version: `judge-${ADAPTER_REV}`,\n async analyze(input) {\n const scores = await opts.judge(opts.tcloud, input)\n return scores\n .filter((s) => normalize10(s.score) < threshold)\n .map((s) => liftJudgeScore(id, area, s))\n },\n }\n}\n\nfunction normalize10(s: number): number {\n // JudgeScore convention is 0-10 but some judges emit 0-1. Coerce to 0-10.\n return s <= 1 ? s * 10 : s\n}\n\nfunction liftJudgeScore(analyst_id: string, area: string, s: JudgeScore): AnalystFinding {\n const score10 = normalize10(s.score)\n const severity: AnalystSeverity =\n score10 < 3 ? 'critical' : score10 < 5 ? 'high' : score10 < 7 ? 'medium' : 'low'\n return makeFinding({\n analyst_id,\n area,\n subject: s.dimension,\n claim: `${s.judgeName}/${s.dimension} scored ${score10.toFixed(1)}/10`,\n rationale: s.reasoning,\n severity,\n confidence: 0.8,\n evidence_refs: s.evidence\n ? [{ kind: 'artifact', uri: 'inline:evidence', excerpt: s.evidence }]\n : [],\n metadata: { judge_name: s.judgeName, dimension: s.dimension, score_10: score10 },\n })\n}\n\n// ── 5. 
SemanticConceptJudge → Analyst ──────────────────────────────\n\nexport interface SemanticConceptJudgeAdapterOpts {\n id?: string\n area?: string\n options?: SemanticConceptJudgeOptions\n}\n\nexport function createSemanticConceptJudgeAdapter(\n opts: SemanticConceptJudgeAdapterOpts = {},\n): Analyst<SemanticConceptJudgeInput> {\n const id = opts.id ?? 'semantic-concept-judge'\n const area = opts.area ?? 'concept-coverage'\n return {\n id,\n description:\n 'Runs the semantic-concept judge and surfaces missing / weak concepts as findings.',\n inputKind: 'custom',\n cost: { kind: 'llm', models: opts.options?.model ? [opts.options.model] : undefined },\n version: `${SEMANTIC_CONCEPT_JUDGE_VERSION}-adapter-${ADAPTER_REV}`,\n async analyze(input) {\n const result = await runSemanticConceptJudge(input, opts.options)\n if (!result.available) {\n return [\n makeFinding({\n analyst_id: id,\n area,\n claim: 'semantic-concept judge unavailable',\n rationale: result.error,\n severity: 'info',\n confidence: 1,\n evidence_refs: [],\n metadata: { reason: result.error },\n }),\n ]\n }\n const out: AnalystFinding[] = []\n for (const f of result.findings) {\n // Only surface gaps: missing concepts or low scores. Concepts at\n // 7+/10 with present=true are not findings — they're successes.\n if (f.present && f.score >= 7) continue\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: f.concept,\n claim: f.present\n ? `concept \"${f.concept}\" is weak (${f.score}/10)`\n : `concept \"${f.concept}\" is missing`,\n rationale: f.evidence,\n severity: liftSeverity(f.severity),\n confidence: 0.85,\n evidence_refs: [{ kind: 'artifact', uri: 'inline:evidence', excerpt: f.evidence }],\n metadata: {\n concept: f.concept,\n present: f.present,\n score_10: f.score,\n cost_usd: result.costUsd ?? 
undefined,\n },\n }),\n )\n }\n return out\n },\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAuCA,IAAM,cAAc;AAIb,SAAS,aAAa,GAAmC;AAC9D,UAAQ,GAAG;AAAA,IACT,KAAK;AACH,aAAO;AAAA,IACT,KAAK;AACH,aAAO;AAAA,IACT,KAAK;AACH,aAAO;AAAA,IACT,KAAK;AACH,aAAO;AAAA,EACX;AACF;AAwBO,SAAS,0BACd,MAC6B;AAC7B,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,OAAO,QAAQ,KAAK,QAAQ,CAAC,KAAK,KAAK,IAAI,OAAU;AAAA,IACnE,SAAS,iBAAiB,WAAW;AAAA,IACrC,MAAM,QAAQ,OAAO,KAAK;AACxB,YAAM,MAAwB,CAAC;AAC/B,iBAAW,YAAY,KAAK,WAAW;AACrC,YAAI,IAAI,QAAQ,QAAS;AACzB,cAAM,SAAS,MAAM;AAAA,UACnB,EAAE,SAAS;AAAA,UACX,EAAE,QAAQ,OAAO,IAAI,KAAK,IAAI,OAAO,KAAK,OAAO,GAAG,KAAK,MAAM;AAAA,QACjE;AACA,cAAM,UAAU,IAAI,MAAM,WAAW,SAAS,MAAM,GAAG,EAAE;AAIzD,YAAI,OAAO,SAAS,WAAW,GAAG;AAChC,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA;AAAA,cACA,OAAO,OAAO,OAAO,MAAM,GAAG,GAAG;AAAA,cACjC,WAAW,OAAO;AAAA,cAClB,UAAU;AAAA,cACV,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU;AAAA,gBACR,sBAAsB,OAAO;AAAA,gBAC7B,OAAO,OAAO;AAAA,cAChB;AAAA,YACF,CAAC;AAAA,UACH;AACA;AAAA,QACF;AACA,eAAO,SAAS,QAAQ,CAAC,OAAO,MAAM;AACpC,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA;AAAA,cACA;AAAA,cACA,WAAW,MAAM,IAAI,OAAO,SAAS;AAAA,cACrC,UAAU;AAAA,cACV,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU,EAAE,UAAU,OAAO,OAAO,WAAW,eAAe,EAAE;AAAA,YAClE,CAAC;AAAA,UACH;AAAA,QACF,CAAC;AAAA,MACH;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAeO,SAAS,sBAA2B,MAA8C;AACvF,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,gBAAgB;AAAA,IAC9B,SAAS,YAAY,WAAW;AAAA,IAChC,MAAM,QAAQ,KAAK,KAAK;AACtB,YAAM,SAAS,MAAM,KAAK,SAAS,IAAI,EAAE,KAAK,GAAG,KAAK,QAAQ,CAAC;AAC/D,YAAM,MAAwB,CAAC;AAC/B,iBAAW,SAAS,OAAO,QAAQ;AACjC,mBAAW,WAAW,MAAM,UAAU;AACpC,cAAI,KAAK,iBAAiB,IAAI,MAAM,MAAM,OAAO,OAAO,CAAC;AAAA,QAC3D;AAGA,YAAI,MAAM,WAAW,UAAU,MAAM,WAAW,WAAW,MAAM,WAAW,WAAW;AACrF,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA,SAAS,MAAM;AAAA,cACf,OAA
O,UAAU,MAAM,KAAK,KAAK,MAAM,MAAM,KAAK,MAAM,UAAU,iBAAiB;AAAA,cACnF,UACE,MAAM,WAAW,UAAU,SAAS,MAAM,WAAW,YAAY,WAAW;AAAA,cAC9E,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU;AAAA,gBACR,cAAc,MAAM;AAAA,gBACpB,aAAa,MAAM;AAAA,gBACnB,OAAO,MAAM;AAAA,gBACb,aAAa,MAAM;AAAA,cACrB;AAAA,YACF,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF;AACA,UAAI,MAAM,qBAAqB;AAAA,QAC7B,QAAQ,OAAO,OAAO;AAAA,QACtB,SAAS,OAAO;AAAA,QAChB,UAAU,OAAO;AAAA,MACnB,CAAC;AACD,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAEA,SAAS,iBACP,YACA,MACA,OACA,GACgB;AAChB,SAAO,YAAY;AAAA,IACjB;AAAA,IACA;AAAA,IACA,SAAS,EAAE,SAAS;AAAA,IACpB,OAAO,EAAE;AAAA,IACT,UAAU,aAAa,EAAE,QAAQ;AAAA,IACjC,YAAY;AAAA,IACZ,eAAe,EAAE,WACb,CAAC,EAAE,MAAM,YAAY,KAAK,mBAAmB,SAAS,EAAE,SAAS,CAAC,IAClE,CAAC;AAAA,IACL,UAAU,EAAE;AAAA,EACd,CAAC;AACH;AAYO,SAAS,uBAAuB,OAA6B,CAAC,GAAsB;AACzF,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,SAAS,KAAK,UAAU,IAAI,UAAU;AAC5C,QAAM,YAAY,KAAK,aAAa;AACpC,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,gBAAgB;AAAA,IAC9B,SAAS,cAAc,WAAW;AAAA,IAClC,MAAM,QAAQ,OAAO;AACnB,YAAM,QAAQ,OAAO,WAAW,KAAK;AACrC,YAAM,MAAwB,CAAC;AAC/B,YAAM,OAA6D;AAAA,QACjE,CAAC,WAAW,YAAY,mCAAmC;AAAA,QAC3D,CAAC,gBAAgB,QAAQ,sBAAsB;AAAA,QAC/C,CAAC,oBAAoB,QAAQ,6CAA6C;AAAA,QAC1E,CAAC,kBAAkB,UAAU,yBAAyB;AAAA,QACtD,CAAC,gBAAgB,UAAU,6BAA6B;AAAA,QACxD,CAAC,eAAe,QAAQ,6BAA6B;AAAA,QACrD,CAAC,aAAa,YAAY,wBAAwB;AAAA,MACpD;AACA,iBAAW,CAAC,KAAK,KAAK,GAAG,KAAK,MAAM;AAClC,cAAM,QAAQ,MAAM,GAAG;AACvB,YAAI,OAAO,UAAU,YAAY,QAAQ,WAAW;AAClD,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA,SAAS;AAAA,cACT,OAAO;AAAA,cACP,WAAW,GAAG,GAAG,IAAI,MAAM,QAAQ,CAAC,CAAC,oBAAoB,SAAS;AAAA,cAClE,UAAU;AAAA,cACV,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU,EAAE,WAAW,KAAK,OAAO,WAAW,QAAQ,MAAM,IAAI,MAAM;AAAA,YACxE,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF;AAEA,UAAI,MAAM,eAAe,IAAI,WAAW;AACtC,YAAI;AAAA,UACF,YAAY;AAAA,YACV,YAAY;AAAA,YACZ;AAAA,YACA,SAAS;AAAA,YACT,OAAO;AAAA,YACP,WAAW,gBAAgB,MAAM,aAAa,QAAQ,CAAC,CAAC;AAAA,YACxD,UAAU;AAAA,YACV,YAAY;AAAA,YACZ,eAAe,CAAC;AAAA,YAChB,UAAU,EAAE,eAAe,MAAM,cAAc,OAAO,MAAM
,MAAM;AAAA,UACpE,CAAC;AAAA,QACH;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAgBO,SAAS,mBAAmB,MAA6C;AAC9E,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,YAAY,KAAK,aAAa;AACpC,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,KAAK,QAAQ,EAAE,MAAM,MAAM;AAAA,IACjC,SAAS,SAAS,WAAW;AAAA,IAC7B,MAAM,QAAQ,OAAO;AACnB,YAAM,SAAS,MAAM,KAAK,MAAM,KAAK,QAAQ,KAAK;AAClD,aAAO,OACJ,OAAO,CAAC,MAAM,YAAY,EAAE,KAAK,IAAI,SAAS,EAC9C,IAAI,CAAC,MAAM,eAAe,IAAI,MAAM,CAAC,CAAC;AAAA,IAC3C;AAAA,EACF;AACF;AAEA,SAAS,YAAY,GAAmB;AAEtC,SAAO,KAAK,IAAI,IAAI,KAAK;AAC3B;AAEA,SAAS,eAAe,YAAoB,MAAc,GAA+B;AACvF,QAAM,UAAU,YAAY,EAAE,KAAK;AACnC,QAAM,WACJ,UAAU,IAAI,aAAa,UAAU,IAAI,SAAS,UAAU,IAAI,WAAW;AAC7E,SAAO,YAAY;AAAA,IACjB;AAAA,IACA;AAAA,IACA,SAAS,EAAE;AAAA,IACX,OAAO,GAAG,EAAE,SAAS,IAAI,EAAE,SAAS,WAAW,QAAQ,QAAQ,CAAC,CAAC;AAAA,IACjE,WAAW,EAAE;AAAA,IACb;AAAA,IACA,YAAY;AAAA,IACZ,eAAe,EAAE,WACb,CAAC,EAAE,MAAM,YAAY,KAAK,mBAAmB,SAAS,EAAE,SAAS,CAAC,IAClE,CAAC;AAAA,IACL,UAAU,EAAE,YAAY,EAAE,WAAW,WAAW,EAAE,WAAW,UAAU,QAAQ;AAAA,EACjF,CAAC;AACH;AAUO,SAAS,kCACd,OAAwC,CAAC,GACL;AACpC,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,OAAO,QAAQ,KAAK,SAAS,QAAQ,CAAC,KAAK,QAAQ,KAAK,IAAI,OAAU;AAAA,IACpF,SAAS,GAAG,8BAA8B,YAAY,WAAW;AAAA,IACjE,MAAM,QAAQ,OAAO;AACnB,YAAM,SAAS,MAAM,wBAAwB,OAAO,KAAK,OAAO;AAChE,UAAI,CAAC,OAAO,WAAW;AACrB,eAAO;AAAA,UACL,YAAY;AAAA,YACV,YAAY;AAAA,YACZ;AAAA,YACA,OAAO;AAAA,YACP,WAAW,OAAO;AAAA,YAClB,UAAU;AAAA,YACV,YAAY;AAAA,YACZ,eAAe,CAAC;AAAA,YAChB,UAAU,EAAE,QAAQ,OAAO,MAAM;AAAA,UACnC,CAAC;AAAA,QACH;AAAA,MACF;AACA,YAAM,MAAwB,CAAC;AAC/B,iBAAW,KAAK,OAAO,UAAU;AAG/B,YAAI,EAAE,WAAW,EAAE,SAAS,EAAG;AAC/B,YAAI;AAAA,UACF,YAAY;AAAA,YACV,YAAY;AAAA,YACZ;AAAA,YACA,SAAS,EAAE;AAAA,YACX,OAAO,EAAE,UACL,YAAY,EAAE,OAAO,cAAc,EAAE,KAAK,SAC1C,YAAY,EAAE,OAAO;AAAA,YACzB,WAAW,EAAE;AAAA,YACb,UAAU,aAAa,EAAE,QAAQ;AAAA,YACjC,YAAY;AAAA,YACZ,eAAe,CAAC,EAAE,MAAM,YAAY,KAAK,mBAAmB,SAAS,EAAE,SAAS,CAAC;AAAA,YACjF,UAAU;AAAA,cACR,SAAS,EAAE;AAAA,cACX,SAAS,EAAE;AA
AA,cACX,UAAU,EAAE;AAAA,cACZ,UAAU,OAAO,WAAW;AAAA,YAC9B;AAAA,UACF,CAAC;AAAA,QACH;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;","names":[]}
1
+ {"version":3,"sources":["../../src/analyst/adapters.ts","../../src/analyst/steer-firewall.ts"],"sourcesContent":["/**\n * Adapter factories — lift each existing agent-eval primitive into the\n * Analyst contract without re-implementing it.\n *\n * Five primitives, five factories. Each one:\n * - Builds an Analyst with a stable id (caller chooses; defaults\n * given), a sensible default `inputKind`, a version derived from\n * the wrapped primitive's version + an adapter revision, and an\n * `analyze()` that calls the primitive and lifts its output to\n * AnalystFinding[] using `makeFinding()`.\n * - Maps severities: the existing `Severity` ('critical' | 'major' |\n * 'minor' | 'info') projects onto AnalystSeverity ('critical' |\n * 'high' | 'medium' | 'low' | 'info'); 'major' → 'high', 'minor' →\n * 'medium'. Domain analysts that want finer-grained mapping override.\n *\n * Adapters never own state. Calling the same factory twice with the\n * same primitive instance is safe.\n */\n\nimport type { AxAIService } from '@ax-llm/ax'\nimport type {\n Finding as LayerFinding,\n Severity as LayerSeverity,\n MultiLayerVerifier,\n VerifyOptions,\n} from '../multi-layer-verifier'\nimport { RunCritic, type RunTrace } from '../run-critic'\nimport {\n runSemanticConceptJudge,\n SEMANTIC_CONCEPT_JUDGE_VERSION,\n type SemanticConceptJudgeInput,\n type SemanticConceptJudgeOptions,\n} from '../semantic-concept-judge'\nimport { type AnalyzeTracesOptions, analyzeTraces } from '../trace-analyst/analyst'\nimport type { TraceAnalysisStore } from '../trace-analyst/store'\nimport type { JudgeFn, JudgeInput, JudgeScore, TCloud } from '../types'\nimport type { Analyst, AnalystFinding, AnalystSeverity } from './types'\nimport { makeFinding } from './types'\n\nconst ADAPTER_REV = '1'\n\n// ── Severity bridges ───────────────────────────────────────────────\n\nexport function liftSeverity(s: LayerSeverity): AnalystSeverity {\n switch (s) {\n case 'critical':\n return 'critical'\n case 
'major':\n return 'high'\n case 'minor':\n return 'medium'\n case 'info':\n return 'info'\n }\n}\n\n// ── 1. analyzeTraces → Analyst ─────────────────────────────────────\n\nexport interface TraceAnalystAdapterOpts {\n id?: string\n area?: string\n /** The natural-language question(s) put to the analyst. One finding per question. */\n questions: string[]\n /** Caller-provided AxAI service — same one trace-analyst.ts expects. */\n ai: AxAIService\n model?: string\n /** Forwarded to analyzeTraces. */\n extra?: Omit<AnalyzeTracesOptions, 'source' | 'ai' | 'model'>\n}\n\n/**\n * @deprecated Prefer `createTraceAnalystKind` + one of the failure /\n * improvement kinds from `./kinds`. This adapter wraps the legacy\n * `analyzeTraces` flow whose output is `findings:string[]` — every\n * bullet gets flat-defaulted severity `medium` / confidence `0.6`,\n * which loses the per-finding grading kinds provide via Ax structured\n * output + Zod validation. Kept for one minor while consumers migrate.\n */\nexport function createTraceAnalystAdapter(\n opts: TraceAnalystAdapterOpts,\n): Analyst<TraceAnalysisStore> {\n const id = opts.id ?? 'trace-analyst'\n const area = opts.area ?? 'agent-reasoning'\n return {\n id,\n description:\n 'Runs the agent-eval trace analyst over an OTLP trace store and lifts its bulleted findings.',\n inputKind: 'trace-store',\n cost: { kind: 'llm', models: opts.model ? [opts.model] : undefined },\n version: `trace-analyst-${ADAPTER_REV}`,\n async analyze(store, ctx) {\n const out: AnalystFinding[] = []\n for (const question of opts.questions) {\n if (ctx.signal?.aborted) break\n const result = await analyzeTraces(\n { question },\n { source: store, ai: opts.ai, model: opts.model, ...opts.extra },\n )\n const subject = ctx.tags?.subject ?? question.slice(0, 60)\n // The responder produces a list of bullet strings. 
Each becomes\n // one finding; the prose answer is attached as rationale on the\n // first (so renderers that show only top-N still get context).\n if (result.findings.length === 0) {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject,\n claim: result.answer.slice(0, 200),\n rationale: result.answer,\n severity: 'info',\n confidence: 0.5,\n evidence_refs: [],\n metadata: {\n actor_prompt_version: result.actorPromptVersion,\n turns: result.turnCount,\n },\n }),\n )\n continue\n }\n result.findings.forEach((claim, i) => {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject,\n claim,\n rationale: i === 0 ? result.answer : undefined,\n severity: 'medium',\n confidence: 0.6,\n evidence_refs: [],\n metadata: { question, turns: result.turnCount, finding_index: i },\n }),\n )\n })\n }\n return out\n },\n }\n}\n\n// ── 2. MultiLayerVerifier → Analyst ─────────────────────────────────\n\nexport interface VerifierAdapterOpts<Env> {\n id?: string\n area?: string\n verifier: MultiLayerVerifier<Env>\n /**\n * The verifier expects an `env` per run. Adapters take it from\n * `AnalystRunInputs.custom[<id>]` via the registry's 'custom' routing.\n */\n options?: Omit<VerifyOptions<Env>, 'env'>\n}\n\nexport function createVerifierAdapter<Env>(opts: VerifierAdapterOpts<Env>): Analyst<Env> {\n const id = opts.id ?? 'multi-layer-verifier'\n const area = opts.area ?? 
'verification'\n return {\n id,\n description:\n \"Runs a MultiLayerVerifier and lifts each layer's findings into the analyst envelope.\",\n inputKind: 'custom',\n cost: { kind: 'deterministic' },\n version: `verifier-${ADAPTER_REV}`,\n async analyze(env, ctx) {\n const report = await opts.verifier.run({ env, ...opts.options })\n const out: AnalystFinding[] = []\n for (const layer of report.layers) {\n for (const finding of layer.findings) {\n out.push(liftLayerFinding(id, area, layer.layer, finding))\n }\n // Layer-level signal: a failed/error layer is itself a finding\n // even if it didn't emit per-finding rows.\n if (layer.status === 'fail' || layer.status === 'error' || layer.status === 'timeout') {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: layer.layer,\n claim: `layer \"${layer.layer}\" ${layer.status}: ${layer.reason ?? 'no reason given'}`,\n severity:\n layer.status === 'error' ? 'high' : layer.status === 'timeout' ? 'medium' : 'high',\n confidence: 1,\n evidence_refs: [],\n metadata: {\n layer_status: layer.status,\n duration_ms: layer.durationMs,\n score: layer.score,\n diagnostics: layer.diagnostics,\n },\n }),\n )\n }\n }\n ctx.log?.('verifier complete', {\n layers: report.layers.length,\n blended: report.blendedScore,\n all_pass: report.allPass,\n })\n return out\n },\n }\n}\n\nfunction liftLayerFinding(\n analyst_id: string,\n area: string,\n layer: string,\n f: LayerFinding,\n): AnalystFinding {\n return makeFinding({\n analyst_id,\n area,\n subject: f.layer ?? layer,\n claim: f.message,\n severity: liftSeverity(f.severity),\n confidence: 0.85,\n evidence_refs: f.evidence\n ? [{ kind: 'artifact', uri: 'inline:evidence', excerpt: f.evidence }]\n : [],\n metadata: f.detail,\n })\n}\n\n// ── 3. RunCritic → Analyst ──────────────────────────────────────────\n\nexport interface RunCriticAdapterOpts {\n id?: string\n area?: string\n critic?: RunCritic\n /** Optional threshold below which a dimension is reported as a finding. 
Default 0.5. */\n threshold?: number\n}\n\nexport function createRunCriticAdapter(opts: RunCriticAdapterOpts = {}): Analyst<RunTrace> {\n const id = opts.id ?? 'run-critic'\n const area = opts.area ?? 'run-quality'\n const critic = opts.critic ?? new RunCritic()\n const threshold = opts.threshold ?? 0.5\n return {\n id,\n description:\n 'Scores a single run across success / grounding / drift / tool-quality and surfaces below-threshold dimensions.',\n inputKind: 'custom',\n cost: { kind: 'deterministic' },\n version: `run-critic-${ADAPTER_REV}`,\n async analyze(trace) {\n const score = critic.scoreTrace(trace)\n const out: AnalystFinding[] = []\n const dims: Array<[keyof typeof score, AnalystSeverity, string]> = [\n ['success', 'critical', 'run did not complete successfully'],\n ['goalProgress', 'high', 'goal progress is low'],\n ['repoGroundedness', 'high', 'output is poorly grounded in the repository'],\n ['toolUseQuality', 'medium', 'tool use quality is low'],\n ['patchQuality', 'medium', 'no real patch/edit evidence'],\n ['testReality', 'high', 'no real test/build evidence'],\n ['finalGate', 'critical', 'final gate is blocking'],\n ]\n for (const [dim, sev, msg] of dims) {\n const value = score[dim] as number\n if (typeof value === 'number' && value < threshold) {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: dim,\n claim: msg,\n rationale: `${dim}=${value.toFixed(2)} below threshold ${threshold}`,\n severity: sev,\n confidence: 1,\n evidence_refs: [],\n metadata: { dimension: dim, value, threshold, run_id: trace.run.runId },\n }),\n )\n }\n }\n // Drift penalty is high → surface as a finding (inverse threshold).\n if (score.driftPenalty > 1 - threshold) {\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: 'drift',\n claim: 'agent output drifted from repository signal',\n rationale: `driftPenalty=${score.driftPenalty.toFixed(2)}`,\n severity: 'medium',\n confidence: 0.9,\n evidence_refs: [],\n metadata: { drift_penalty: 
score.driftPenalty, notes: score.notes },\n }),\n )\n }\n return out\n },\n }\n}\n\n// ── 4. JudgeFn → Analyst ────────────────────────────────────────────\n\nexport interface JudgeAdapterOpts {\n id?: string\n area?: string\n judge: JudgeFn\n /** TCloud handle the JudgeFn calls. */\n tcloud: TCloud\n /** Optional cost classification — most judges call an LLM. */\n cost?: Analyst['cost']\n /** Optional threshold below which a JudgeScore becomes a finding. Default 6 (on 0-10 scale). */\n threshold?: number\n}\n\nexport function createJudgeAdapter(opts: JudgeAdapterOpts): Analyst<JudgeInput> {\n const id = opts.id ?? 'judge'\n const area = opts.area ?? 'judge'\n const threshold = opts.threshold ?? 6\n return {\n id,\n description:\n 'Wraps an agent-eval JudgeFn into an analyst; below-threshold dimensions surface as findings.',\n inputKind: 'judge-input',\n cost: opts.cost ?? { kind: 'llm' },\n version: `judge-${ADAPTER_REV}`,\n async analyze(input) {\n const scores = await opts.judge(opts.tcloud, input)\n return scores\n .filter((s) => normalize10(s.score) < threshold)\n .map((s) => liftJudgeScore(id, area, s))\n },\n }\n}\n\nfunction normalize10(s: number): number {\n // JudgeScore convention is 0-10 but some judges emit 0-1. Coerce to 0-10.\n return s <= 1 ? s * 10 : s\n}\n\nfunction liftJudgeScore(analyst_id: string, area: string, s: JudgeScore): AnalystFinding {\n const score10 = normalize10(s.score)\n const severity: AnalystSeverity =\n score10 < 3 ? 'critical' : score10 < 5 ? 'high' : score10 < 7 ? 'medium' : 'low'\n return makeFinding({\n analyst_id,\n area,\n subject: s.dimension,\n claim: `${s.judgeName}/${s.dimension} scored ${score10.toFixed(1)}/10`,\n rationale: s.reasoning,\n severity,\n confidence: 0.8,\n evidence_refs: s.evidence\n ? [{ kind: 'artifact', uri: 'inline:evidence', excerpt: s.evidence }]\n : [],\n // Provenance: this finding IS a judge verdict (an acceptance score), not an\n // observation of behavior. 
The steer firewall (assertNoJudgeVerdict) rejects\n // it from steering — even when it cites an artifact above — because letting a\n // verdict steer the next attempt is the held-out judge leaking into the loop.\n derived_from_judge: true,\n metadata: { judge_name: s.judgeName, dimension: s.dimension, score_10: score10 },\n })\n}\n\n// ── 5. SemanticConceptJudge → Analyst ──────────────────────────────\n\nexport interface SemanticConceptJudgeAdapterOpts {\n id?: string\n area?: string\n options?: SemanticConceptJudgeOptions\n}\n\nexport function createSemanticConceptJudgeAdapter(\n opts: SemanticConceptJudgeAdapterOpts = {},\n): Analyst<SemanticConceptJudgeInput> {\n const id = opts.id ?? 'semantic-concept-judge'\n const area = opts.area ?? 'concept-coverage'\n return {\n id,\n description:\n 'Runs the semantic-concept judge and surfaces missing / weak concepts as findings.',\n inputKind: 'custom',\n cost: { kind: 'llm', models: opts.options?.model ? [opts.options.model] : undefined },\n version: `${SEMANTIC_CONCEPT_JUDGE_VERSION}-adapter-${ADAPTER_REV}`,\n async analyze(input) {\n const result = await runSemanticConceptJudge(input, opts.options)\n if (!result.available) {\n return [\n makeFinding({\n analyst_id: id,\n area,\n claim: 'semantic-concept judge unavailable',\n rationale: result.error,\n severity: 'info',\n confidence: 1,\n evidence_refs: [],\n metadata: { reason: result.error },\n }),\n ]\n }\n const out: AnalystFinding[] = []\n for (const f of result.findings) {\n // Only surface gaps: missing concepts or low scores. Concepts at\n // 7+/10 with present=true are not findings — they're successes.\n if (f.present && f.score >= 7) continue\n out.push(\n makeFinding({\n analyst_id: id,\n area,\n subject: f.concept,\n claim: f.present\n ? 
`concept \"${f.concept}\" is weak (${f.score}/10)`\n : `concept \"${f.concept}\" is missing`,\n rationale: f.evidence,\n severity: liftSeverity(f.severity),\n confidence: 0.85,\n evidence_refs: [{ kind: 'artifact', uri: 'inline:evidence', excerpt: f.evidence }],\n metadata: {\n concept: f.concept,\n present: f.present,\n score_10: f.score,\n cost_usd: result.costUsd ?? undefined,\n },\n }),\n )\n }\n return out\n },\n }\n}\n","// The realness-oracle firewall (docs/learning-flywheel.md, \"The steer is f(trace)\").\n//\n// A realness/authenticity signal has TWO legitimate roles that must stay\n// separated by a firewall:\n// (a) anchor judge J — write-only: scores the chosen output, gates promotion,\n// NEVER seen by the worker/optimizer mid-run (else the loop games it).\n// (b) steer f(trace) — an analyst observes the agent's OWN behavior in the\n// trace (\"imported a stub\", \"used a non-crypto PRNG where encryption was\n// required\") and steers the next attempt. Legitimate, because it is derived\n// from OBSERVABLE BEHAVIOR, not from J's held-out verdict.\n//\n// The correct discriminator is PROVENANCE, not evidence presence. A judge verdict\n// lifted into a finding (createJudgeAdapter → liftJudgeScore) is a verdict even\n// when it cites an artifact; an evidence-less trace-analyst bullet is an\n// observation even though it cites nothing. So the firewall keys on\n// `AnalystFinding.derived_from_judge` (set at the judge lift site), NOT on whether\n// evidence_refs is populated. The instant a verdict steers the next attempt it is\n// a back-channel for J and the loop Goodharts realness exactly as it would\n// Goodhart pass-rate.\n\nimport type { AnalystFinding, EvidenceRef } from './types'\n\n/** Evidence grounded in the agent's OWN execution: OTLP trace elements\n * (`span`/`event`) or the artifact it produced (`artifact`). 
*/\nconst OBSERVABLE_KINDS: ReadonlySet<EvidenceRef['kind']> = new Set<EvidenceRef['kind']>([\n 'span',\n 'event',\n 'artifact',\n])\n\n/** DESCRIPTIVE predicate: does the finding cite at least one observable\n * (span/event/artifact) evidence ref. Useful for ranking evidence quality or\n * rendering — it is NOT the steer gate. Evidence presence is the WRONG\n * discriminator for steering: a legitimate trace-analyst observation may cite\n * nothing (it would be wrongly rejected), and a judge verdict may cite an\n * artifact (it would be wrongly admitted). Use `assertNoJudgeVerdict` to gate\n * steering; use this only where \"is this grounded in observable evidence\" is the\n * literal question. */\nexport function isTraceObservable(finding: AnalystFinding): boolean {\n return finding.evidence_refs.some((ref) => OBSERVABLE_KINDS.has(ref.kind))\n}\n\n/** True iff the finding is a JUDGE VERDICT (an acceptance score lifted into a\n * finding), identified by provenance set at the lift site — independent of\n * whatever evidence it cites. */\nexport function isJudgeVerdict(finding: AnalystFinding): boolean {\n return finding.derived_from_judge === true\n}\n\n/**\n * THE steer firewall. Fail-loud guard for any path that admits analyst findings\n * as STEERING input (the `f(trace)` role): rejects — naming the offenders — any\n * finding whose provenance is a judge verdict, rather than let `J` leak into the\n * loop. Returns the findings unchanged for chaining.\n *\n * Call this at the chokepoint where a detector that ALSO scores/gates has its\n * findings turned into a steer (the judge-and-steer dual-role case). It keys on\n * provenance, so it correctly admits evidence-less trace-analyst observations and\n * correctly rejects an artifact-citing judge verdict — the cases an evidence\n * check gets backwards.\n *\n * It is necessary, not sufficient: it stops PROVENANCE-tagged verdicts. 
A judge\n * whose output is laundered through a hand-built finding with no provenance flag\n * is out of its reach — provenance must be honestly set at every judge→finding\n * lift (today: createJudgeAdapter). That is why the integrity rule lives at the\n * lift site, and why ProposeContext.judgeScores?: never is the complementary\n * compile-time tripwire on the obvious direct channel.\n */\nexport function assertNoJudgeVerdict(\n findings: ReadonlyArray<AnalystFinding>,\n context = 'steer',\n): ReadonlyArray<AnalystFinding> {\n const leaks = findings.filter(isJudgeVerdict)\n if (leaks.length > 0) {\n throw new Error(\n `${context}: a judge verdict cannot be admitted as steering input — that is the ` +\n `held-out judge leaking into the loop. Offending judge-derived findings: [${leaks\n .map((f) => f.finding_id)\n .join(', ')}]. Steering consumes observations of behavior, never acceptance verdicts.`,\n )\n }\n return findings\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAuCA,IAAM,cAAc;AAIb,SAAS,aAAa,GAAmC;AAC9D,UAAQ,GAAG;AAAA,IACT,KAAK;AACH,aAAO;AAAA,IACT,KAAK;AACH,aAAO;AAAA,IACT,KAAK;AACH,aAAO;AAAA,IACT,KAAK;AACH,aAAO;AAAA,EACX;AACF;AAwBO,SAAS,0BACd,MAC6B;AAC7B,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,OAAO,QAAQ,KAAK,QAAQ,CAAC,KAAK,KAAK,IAAI,OAAU;AAAA,IACnE,SAAS,iBAAiB,WAAW;AAAA,IACrC,MAAM,QAAQ,OAAO,KAAK;AACxB,YAAM,MAAwB,CAAC;AAC/B,iBAAW,YAAY,KAAK,WAAW;AACrC,YAAI,IAAI,QAAQ,QAAS;AACzB,cAAM,SAAS,MAAM;AAAA,UACnB,EAAE,SAAS;AAAA,UACX,EAAE,QAAQ,OAAO,IAAI,KAAK,IAAI,OAAO,KAAK,OAAO,GAAG,KAAK,MAAM;AAAA,QACjE;AACA,cAAM,UAAU,IAAI,MAAM,WAAW,SAAS,MAAM,GAAG,EAAE;AAIzD,YAAI,OAAO,SAAS,WAAW,GAAG;AAChC,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA;AAAA,cACA,OAAO,OAAO,OAAO,MAAM,GAAG,GAAG;AAAA,cACjC,WAAW,OAAO;AAAA,cAClB,UAAU;AAAA,cACV,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU;AAAA,gBACR,sBAAsB,OAAO;AAAA,gBAC7B,OAAO,OAAO;AAAA,cAChB;AAAA,YACF,CAAC;AAAA,UACH;AACA;AAAA,QACF;
AACA,eAAO,SAAS,QAAQ,CAAC,OAAO,MAAM;AACpC,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA;AAAA,cACA;AAAA,cACA,WAAW,MAAM,IAAI,OAAO,SAAS;AAAA,cACrC,UAAU;AAAA,cACV,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU,EAAE,UAAU,OAAO,OAAO,WAAW,eAAe,EAAE;AAAA,YAClE,CAAC;AAAA,UACH;AAAA,QACF,CAAC;AAAA,MACH;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAeO,SAAS,sBAA2B,MAA8C;AACvF,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,gBAAgB;AAAA,IAC9B,SAAS,YAAY,WAAW;AAAA,IAChC,MAAM,QAAQ,KAAK,KAAK;AACtB,YAAM,SAAS,MAAM,KAAK,SAAS,IAAI,EAAE,KAAK,GAAG,KAAK,QAAQ,CAAC;AAC/D,YAAM,MAAwB,CAAC;AAC/B,iBAAW,SAAS,OAAO,QAAQ;AACjC,mBAAW,WAAW,MAAM,UAAU;AACpC,cAAI,KAAK,iBAAiB,IAAI,MAAM,MAAM,OAAO,OAAO,CAAC;AAAA,QAC3D;AAGA,YAAI,MAAM,WAAW,UAAU,MAAM,WAAW,WAAW,MAAM,WAAW,WAAW;AACrF,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA,SAAS,MAAM;AAAA,cACf,OAAO,UAAU,MAAM,KAAK,KAAK,MAAM,MAAM,KAAK,MAAM,UAAU,iBAAiB;AAAA,cACnF,UACE,MAAM,WAAW,UAAU,SAAS,MAAM,WAAW,YAAY,WAAW;AAAA,cAC9E,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU;AAAA,gBACR,cAAc,MAAM;AAAA,gBACpB,aAAa,MAAM;AAAA,gBACnB,OAAO,MAAM;AAAA,gBACb,aAAa,MAAM;AAAA,cACrB;AAAA,YACF,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF;AACA,UAAI,MAAM,qBAAqB;AAAA,QAC7B,QAAQ,OAAO,OAAO;AAAA,QACtB,SAAS,OAAO;AAAA,QAChB,UAAU,OAAO;AAAA,MACnB,CAAC;AACD,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAEA,SAAS,iBACP,YACA,MACA,OACA,GACgB;AAChB,SAAO,YAAY;AAAA,IACjB;AAAA,IACA;AAAA,IACA,SAAS,EAAE,SAAS;AAAA,IACpB,OAAO,EAAE;AAAA,IACT,UAAU,aAAa,EAAE,QAAQ;AAAA,IACjC,YAAY;AAAA,IACZ,eAAe,EAAE,WACb,CAAC,EAAE,MAAM,YAAY,KAAK,mBAAmB,SAAS,EAAE,SAAS,CAAC,IAClE,CAAC;AAAA,IACL,UAAU,EAAE;AAAA,EACd,CAAC;AACH;AAYO,SAAS,uBAAuB,OAA6B,CAAC,GAAsB;AACzF,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,SAAS,KAAK,UAAU,IAAI,UAAU;AAC5C,QAAM,YAAY,KAAK,aAAa;AACpC,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,gBAAgB;AAAA,IAC9B,SAAS,cAAc,WAAW;AAAA,IAClC,MAAM,QAAQ,OAAO;AACnB,YAAM,QAAQ,OAAO,WAAW,KAAK;AACrC,YAAM,MAAwB,CAAC;AAC/B,YAAM,OAA6D;AAAA,QACjE,CAAC,WAA
W,YAAY,mCAAmC;AAAA,QAC3D,CAAC,gBAAgB,QAAQ,sBAAsB;AAAA,QAC/C,CAAC,oBAAoB,QAAQ,6CAA6C;AAAA,QAC1E,CAAC,kBAAkB,UAAU,yBAAyB;AAAA,QACtD,CAAC,gBAAgB,UAAU,6BAA6B;AAAA,QACxD,CAAC,eAAe,QAAQ,6BAA6B;AAAA,QACrD,CAAC,aAAa,YAAY,wBAAwB;AAAA,MACpD;AACA,iBAAW,CAAC,KAAK,KAAK,GAAG,KAAK,MAAM;AAClC,cAAM,QAAQ,MAAM,GAAG;AACvB,YAAI,OAAO,UAAU,YAAY,QAAQ,WAAW;AAClD,cAAI;AAAA,YACF,YAAY;AAAA,cACV,YAAY;AAAA,cACZ;AAAA,cACA,SAAS;AAAA,cACT,OAAO;AAAA,cACP,WAAW,GAAG,GAAG,IAAI,MAAM,QAAQ,CAAC,CAAC,oBAAoB,SAAS;AAAA,cAClE,UAAU;AAAA,cACV,YAAY;AAAA,cACZ,eAAe,CAAC;AAAA,cAChB,UAAU,EAAE,WAAW,KAAK,OAAO,WAAW,QAAQ,MAAM,IAAI,MAAM;AAAA,YACxE,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF;AAEA,UAAI,MAAM,eAAe,IAAI,WAAW;AACtC,YAAI;AAAA,UACF,YAAY;AAAA,YACV,YAAY;AAAA,YACZ;AAAA,YACA,SAAS;AAAA,YACT,OAAO;AAAA,YACP,WAAW,gBAAgB,MAAM,aAAa,QAAQ,CAAC,CAAC;AAAA,YACxD,UAAU;AAAA,YACV,YAAY;AAAA,YACZ,eAAe,CAAC;AAAA,YAChB,UAAU,EAAE,eAAe,MAAM,cAAc,OAAO,MAAM,MAAM;AAAA,UACpE,CAAC;AAAA,QACH;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAgBO,SAAS,mBAAmB,MAA6C;AAC9E,QAAM,KAAK,KAAK,MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,YAAY,KAAK,aAAa;AACpC,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,KAAK,QAAQ,EAAE,MAAM,MAAM;AAAA,IACjC,SAAS,SAAS,WAAW;AAAA,IAC7B,MAAM,QAAQ,OAAO;AACnB,YAAM,SAAS,MAAM,KAAK,MAAM,KAAK,QAAQ,KAAK;AAClD,aAAO,OACJ,OAAO,CAAC,MAAM,YAAY,EAAE,KAAK,IAAI,SAAS,EAC9C,IAAI,CAAC,MAAM,eAAe,IAAI,MAAM,CAAC,CAAC;AAAA,IAC3C;AAAA,EACF;AACF;AAEA,SAAS,YAAY,GAAmB;AAEtC,SAAO,KAAK,IAAI,IAAI,KAAK;AAC3B;AAEA,SAAS,eAAe,YAAoB,MAAc,GAA+B;AACvF,QAAM,UAAU,YAAY,EAAE,KAAK;AACnC,QAAM,WACJ,UAAU,IAAI,aAAa,UAAU,IAAI,SAAS,UAAU,IAAI,WAAW;AAC7E,SAAO,YAAY;AAAA,IACjB;AAAA,IACA;AAAA,IACA,SAAS,EAAE;AAAA,IACX,OAAO,GAAG,EAAE,SAAS,IAAI,EAAE,SAAS,WAAW,QAAQ,QAAQ,CAAC,CAAC;AAAA,IACjE,WAAW,EAAE;AAAA,IACb;AAAA,IACA,YAAY;AAAA,IACZ,eAAe,EAAE,WACb,CAAC,EAAE,MAAM,YAAY,KAAK,mBAAmB,SAAS,EAAE,SAAS,CAAC,IAClE,CAAC;AAAA;AAAA;AAAA;AAAA;AAAA,IAKL,oBAAoB;AAAA,IACpB,UAAU,EAAE,YAAY,EAAE,WAAW,WAAW,EAAE,WAAW,UAAU,QAAQ;AAAA,EACjF,CAAC;AACH;AAUO,SAAS,kCACd,OAAwC,CAAC,GACL;AACpC,QAAM,KAAK,KAAK,
MAAM;AACtB,QAAM,OAAO,KAAK,QAAQ;AAC1B,SAAO;AAAA,IACL;AAAA,IACA,aACE;AAAA,IACF,WAAW;AAAA,IACX,MAAM,EAAE,MAAM,OAAO,QAAQ,KAAK,SAAS,QAAQ,CAAC,KAAK,QAAQ,KAAK,IAAI,OAAU;AAAA,IACpF,SAAS,GAAG,8BAA8B,YAAY,WAAW;AAAA,IACjE,MAAM,QAAQ,OAAO;AACnB,YAAM,SAAS,MAAM,wBAAwB,OAAO,KAAK,OAAO;AAChE,UAAI,CAAC,OAAO,WAAW;AACrB,eAAO;AAAA,UACL,YAAY;AAAA,YACV,YAAY;AAAA,YACZ;AAAA,YACA,OAAO;AAAA,YACP,WAAW,OAAO;AAAA,YAClB,UAAU;AAAA,YACV,YAAY;AAAA,YACZ,eAAe,CAAC;AAAA,YAChB,UAAU,EAAE,QAAQ,OAAO,MAAM;AAAA,UACnC,CAAC;AAAA,QACH;AAAA,MACF;AACA,YAAM,MAAwB,CAAC;AAC/B,iBAAW,KAAK,OAAO,UAAU;AAG/B,YAAI,EAAE,WAAW,EAAE,SAAS,EAAG;AAC/B,YAAI;AAAA,UACF,YAAY;AAAA,YACV,YAAY;AAAA,YACZ;AAAA,YACA,SAAS,EAAE;AAAA,YACX,OAAO,EAAE,UACL,YAAY,EAAE,OAAO,cAAc,EAAE,KAAK,SAC1C,YAAY,EAAE,OAAO;AAAA,YACzB,WAAW,EAAE;AAAA,YACb,UAAU,aAAa,EAAE,QAAQ;AAAA,YACjC,YAAY;AAAA,YACZ,eAAe,CAAC,EAAE,MAAM,YAAY,KAAK,mBAAmB,SAAS,EAAE,SAAS,CAAC;AAAA,YACjF,UAAU;AAAA,cACR,SAAS,EAAE;AAAA,cACX,SAAS,EAAE;AAAA,cACX,UAAU,EAAE;AAAA,cACZ,UAAU,OAAO,WAAW;AAAA,YAC9B;AAAA,UACF,CAAC;AAAA,QACH;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;;;AClZA,IAAM,mBAAqD,oBAAI,IAAyB;AAAA,EACtF;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAUM,SAAS,kBAAkB,SAAkC;AAClE,SAAO,QAAQ,cAAc,KAAK,CAAC,QAAQ,iBAAiB,IAAI,IAAI,IAAI,CAAC;AAC3E;AAKO,SAAS,eAAe,SAAkC;AAC/D,SAAO,QAAQ,uBAAuB;AACxC;AAqBO,SAAS,qBACd,UACA,UAAU,SACqB;AAC/B,QAAM,QAAQ,SAAS,OAAO,cAAc;AAC5C,MAAI,MAAM,SAAS,GAAG;AACpB,UAAM,IAAI;AAAA,MACR,GAAG,OAAO,sJACoE,MACzE,IAAI,CAAC,MAAM,EAAE,UAAU,EACvB,KAAK,IAAI,CAAC;AAAA,IACjB;AAAA,EACF;AACA,SAAO;AACT;","names":[]}
@@ -63,6 +63,13 @@ interface AuthenticityResult {
63
63
  usesRealImpl: boolean;
64
64
  realInfra: boolean;
65
65
  wired: boolean;
66
+ /** The required artifact is actually referenced/imported by other (non-artifact)
67
+ * files — i.e. wired into the rest of the system, not dead code. Domain-agnostic:
68
+ * a deliverable nothing else uses is suspect in any vertical. */
69
+ artifactReferenced: boolean;
70
+ /** Convenience: the artifact is connected to the running system, via either the
71
+ * domain wiring signal OR a structural reference. */
72
+ artifactWired: boolean;
66
73
  fakeShim: boolean;
67
74
  /** mock/stub markers per 1000 LOC, capped at 100. */
68
75
  mockDensity: number;
@@ -81,6 +88,7 @@ interface RealnessGate {
81
88
  declare function gateRealness(r: AuthenticityResult, opts?: {
82
89
  floor?: number;
83
90
  requireArtifact?: boolean;
91
+ requireArtifactWired?: boolean;
84
92
  }): RealnessGate;
85
93
  interface AuthenticityNuance {
86
94
  /** 0 (nothing mocked) … 100 (entirely mocked). */
@@ -104,5 +112,50 @@ declare function scoreAuthenticityNuance(files: readonly ProducedFile[], complet
104
112
  intent?: string;
105
113
  prioritize?: RegExp;
106
114
  }): Promise<AuthenticityNuance>;
115
+ interface RealnessJudgment {
116
+ /** 0 (facade/simulator) … 100 (real implementation on the intended infra). */
117
+ isReal: number;
118
+ rationale: string;
119
+ }
120
+ /**
121
+ * Ask an LLM to rate realness DIRECTLY on a 0-100 scale — the axis that matched
122
+ * human blind-labels in validation (F1 0.80→0.88 on the gray band; a fakePct/
123
+ * hollowness proxy over-penalized "real core + stubbed periphery" partials, and a
124
+ * weak judge model over-flagged — use a strong one). Domain-agnostic skeleton; the
125
+ * consumer supplies `intent` (what the deliverable should be) and `rubric` (domain
126
+ * specifics of real-vs-fake). Fail-closed: a bad response reads as fully fake.
127
+ */
128
+ declare function judgeRealnessLlm(files: readonly ProducedFile[], complete: CompleteFn, opts?: {
129
+ intent?: string;
130
+ rubric?: string;
131
+ prioritize?: RegExp;
132
+ }): Promise<RealnessJudgment>;
133
+ type RealnessBand = 'clean-real' | 'clean-fake' | 'gray';
134
+ interface BlendedRealness extends AuthenticityResult {
135
+ /** Final realness after (only-when-needed) LLM adjudication, 0…100. */
136
+ blendedRealness: number;
137
+ band: RealnessBand;
138
+ /** True iff the LLM judge was actually consulted (gray band only). */
139
+ consultedLlm: boolean;
140
+ /** Present iff the LLM was consulted. */
141
+ judgment?: RealnessJudgment;
142
+ }
143
+ /**
144
+ * Score realness using the cheapest sufficient signal: trust the deterministic
145
+ * scorer on the CLEAN extremes (obvious fakes / obviously-real-and-wired), and only
146
+ * spend an LLM call on the GRAY band — cells that look real structurally but carry
147
+ * fakeness markers (a fake shim, an unwired/dead artifact, high mock density) or land
148
+ * mid-range. This caps LLM cost at the fraction of cells static analysis can't
149
+ * resolve, which matters at multi-vertical / multi-partner scale.
150
+ *
151
+ * Domain-agnostic: the gray-band TRIGGER is structural; the LLM judges via the
152
+ * consumer-supplied `intent`. Fail-closed (a bad LLM response reads as fully fake).
153
+ */
154
+ declare function scoreRealnessBlended(files: readonly ProducedFile[], signals: AuthenticitySignals, complete: CompleteFn, opts?: {
155
+ intent?: string;
156
+ rubric?: string;
157
+ grayBand?: [number, number];
158
+ mockGrayThreshold?: number;
159
+ }): Promise<BlendedRealness>;
107
160
 
108
- export { type AuthenticityNuance, type AuthenticityResult, type AuthenticitySignals, type CompleteFn, type ProducedFile, type RealnessGate, gateRealness, scoreAuthenticity, scoreAuthenticityNuance };
161
+ export { type AuthenticityNuance, type AuthenticityResult, type AuthenticitySignals, type BlendedRealness, type CompleteFn, type ProducedFile, type RealnessBand, type RealnessGate, type RealnessJudgment, gateRealness, judgeRealnessLlm, scoreAuthenticity, scoreAuthenticityNuance, scoreRealnessBlended };
@@ -5,6 +5,33 @@ var DEFAULT_MOCK = /\bmock|\bfake|\bdummy|\bstub\b|simulat|hardcoded|placeholder
5
5
  function basename(p) {
6
6
  return p.split("/").pop() ?? p;
7
7
  }
8
/**
 * Escape every regular-expression metacharacter in `s` so the result can be
 * embedded in a `RegExp` source and match the input literally.
 *
 * @param {string} s - Raw text to escape.
 * @returns {string} `s` with each metacharacter prefixed by a backslash.
 */
function escapeRe(s) {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metacharacters, (ch) => `\\${ch}`);
}
11
/**
 * Collect the identifiers introduced by type-like declarations in `content`.
 *
 * A declaration is one of the keywords `contract`, `library`, `interface`,
 * `abstract contract`, `class`, `enum`, `struct`, `module` or `package`
 * followed by an identifier. Identifiers shorter than four characters are
 * ignored, and each identifier is reported once, in first-seen order.
 *
 * @param {string} content - Source text to scan.
 * @returns {string[]} Unique declared identifiers of length >= 4.
 */
function declaredNames(content) {
  const declaration =
    /\b(?:contract|library|interface|abstract\s+contract|class|enum|struct|module|package)\s+([A-Za-z_]\w*)/g;
  const found = new Set();
  for (const [, identifier] of String(content).matchAll(declaration)) {
    if (identifier && identifier.length >= 4) found.add(identifier);
  }
  return Array.from(found);
}
21
/**
 * Report whether at least one required artifact is referenced by at least one
 * of the other (non-artifact) files.
 *
 * A file counts as referencing an artifact when its content contains the
 * artifact's basename, contains the artifact's path without its final
 * extension, or mentions (as a whole word) any identifier the artifact
 * declares.
 *
 * @param {ReadonlyArray<{path: string, content?: string}>} required - The required artifact files.
 * @param {ReadonlyArray<{path: string, content?: string}>} others - Every other produced file.
 * @returns {boolean} True when some artifact is referenced; false when either list is empty.
 */
function isArtifactReferenced(required, others) {
  if (!required.length || !others.length) return false;
  return required.some((rf) => {
    const base = basename(rf.path);
    const stem = rf.path.replace(/\.[^.]+$/, "");
    // An empty needle is contained in every string, so a path with an empty
    // basename (trailing "/") or an empty stem (a top-level dotfile) would make
    // every non-empty file look like a reference. Only search for real text.
    const needles = [base, stem].filter((needle) => needle.length > 0);
    // Compile the word-boundary patterns once per artifact instead of once per
    // (artifact, other file, name) combination. They carry no `g` flag, so
    // `.test` keeps no state between files.
    const namePatterns = declaredNames(rf.content ?? "").map(
      (name) => new RegExp(`\\b${escapeRe(name)}\\b`)
    );
    return others.some((other) => {
      const text = other.content ?? "";
      if (!text) return false;
      if (needles.some((needle) => text.includes(needle))) return true;
      return namePatterns.some((pattern) => pattern.test(text));
    });
  });
}
8
35
  function scoreAuthenticity(files, signals) {
9
36
  const w = {
10
37
  artifact: signals.weights?.artifact ?? 40,
@@ -24,6 +51,8 @@ function scoreAuthenticity(files, signals) {
24
51
  const usesRealImpl = signals.realImpl.test(signals.requiredArtifact ? requiredText : allText);
25
52
  const realInfra = signals.realInfra.test(allText);
26
53
  const wired = signals.wiring ? signals.wiring.test(otherText || allText) : false;
54
+ const artifactReferenced = isArtifactReferenced(required, others);
55
+ const artifactWired = wired || artifactReferenced;
27
56
  const fakeShim = files.some(
28
57
  (f) => signals.fakeShim.test(basename(f.path)) || signals.fakeShim.test(f.content ?? "")
29
58
  );
@@ -32,6 +61,7 @@ function scoreAuthenticity(files, signals) {
32
61
  ) ?? []).length;
33
62
  const loc = Math.max(1, allText.split("\n").length);
34
63
  const mockDensity = Math.min(100, Math.round(mockHits / loc * 1e3));
64
+ const decorativeArtifact = requiredArtifactPresent && usesRealImpl && !artifactWired;
35
65
  let realness = 0;
36
66
  if (requiredArtifactPresent) realness += w.artifact;
37
67
  if (usesRealImpl) realness += w.impl;
@@ -56,6 +86,10 @@ function scoreAuthenticity(files, signals) {
56
86
  flags.push(`HIGH_MOCK_DENSITY: ${mockDensity} mock/stub markers per 1000 LOC`);
57
87
  if (signals.wiring && requiredArtifactPresent && !wired)
58
88
  flags.push("NOT_WIRED: artifact exists but is never used by the client");
89
+ if (decorativeArtifact)
90
+ flags.push(
91
+ "DEAD_ARTIFACT: required artifact is not referenced/imported anywhere \u2014 decorative or dead code"
92
+ );
59
93
  return {
60
94
  realness,
61
95
  requiredArtifactPresent,
@@ -63,6 +97,8 @@ function scoreAuthenticity(files, signals) {
63
97
  usesRealImpl,
64
98
  realInfra,
65
99
  wired,
100
+ artifactReferenced,
101
+ artifactWired,
66
102
  fakeShim,
67
103
  mockDensity,
68
104
  flags
@@ -76,6 +112,9 @@ function gateRealness(r, opts = {}) {
76
112
  if (r.fakeShim && !r.usesRealImpl) {
77
113
  return { gated: true, reason: "fake shim with no real implementation" };
78
114
  }
115
+ if (opts.requireArtifactWired && r.requiredArtifactPresent && r.usesRealImpl && !r.artifactWired) {
116
+ return { gated: true, reason: "required artifact present but never wired into the system" };
117
+ }
79
118
  if (r.realness < floor)
80
119
  return { gated: true, reason: `realness ${r.realness} below floor ${floor}` };
81
120
  return { gated: false };
@@ -120,9 +159,57 @@ ${fileDigest(files, { prioritize: opts.prioritize })}`;
120
159
  };
121
160
  }
122
161
  }
162
/**
 * Ask an LLM to rate, on a 0-100 scale, how real the produced build is.
 *
 * The system prompt carries the scoring rules plus the optional domain
 * `rubric`; the user prompt carries the optional `intent` followed by a digest
 * of the produced files. The first `{ ... }` span of the reply is parsed as
 * JSON with the shape `{"isReal": number, "why": string}`.
 *
 * Fail-closed: a reply with no JSON object, a JSON parse failure, or a
 * rejected/throwing `complete` call all yield `isReal: 0` with the reason in
 * `rationale`. The file digest is built before the `try`, so an error raised
 * there is not converted and rejects the returned promise.
 *
 * @param {ReadonlyArray<object>} files - Produced files to summarise for the judge.
 * @param {(system: string, user: string) => Promise<string>} complete - LLM completion callback.
 * @param {{intent?: string, rubric?: string, prioritize?: RegExp}} [opts] - Prompt options.
 * @returns {Promise<{isReal: number, rationale: string}>} The judge's verdict.
 */
async function judgeRealnessLlm(files, complete, opts = {}) {
  const rubricClause = opts.rubric ? `Domain rubric: ${opts.rubric} ` : "";
  const system = [
    "You are a skeptical auditor. Rate how REAL an agent's build is vs the intended deliverable, 0-100. A genuine implementation of the HARD part on the intended infrastructure is SUBSTANTIALLY REAL (>=50) even if peripheral layers are stubbed; a pure simulator / facade / branded-type stand-in / no-op-stubbed dependency with no real implementation is FAKE (<=25). Judge the core on its merits and note the runtime. ",
    rubricClause,
    'Respond with ONLY JSON: {"isReal":0-100,"why":"one sentence"}.'
  ].join("");
  const intentHeader = opts.intent ? `Intended deliverable: ${opts.intent}\n\n` : "";
  const digest = fileDigest(files, { prioritize: opts.prioritize });
  const user = `${intentHeader}Produced files:\n${digest}`;
  try {
    const raw = await complete(system, user);
    // Greedy on purpose: spans from the first "{" to the last "}" in the reply.
    const jsonMatch = raw.match(/\{[\s\S]*\}/);
    if (!jsonMatch) {
      return { isReal: 0, rationale: "unparseable judge response" };
    }
    const { isReal, why } = JSON.parse(jsonMatch[0]);
    // clampPct is defined elsewhere in this module; presumably it coerces the
    // value into the 0-100 range (confirm against its definition).
    const score = clampPct(isReal);
    const rationale = typeof why === "string" ? why : "";
    return { isReal: score, rationale };
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    return { isReal: 0, rationale: `judge error: ${detail}` };
  }
}
184
/**
 * Score realness with the deterministic scorer and consult the LLM judge only
 * for results that land in the gray band.
 *
 * A result is gray when either
 *  - the required artifact is present and uses a real implementation, yet a
 *    fakeness marker is set (fake shim, not wired, or mock density at or above
 *    `mockGrayThreshold`), or
 *  - its deterministic realness lies inside `grayBand` (inclusive).
 * Anything else is "clean-fake" (realness below the band) or "clean-real", and
 * is returned with the deterministic score as `blendedRealness`.
 *
 * For gray results the final score is 25% deterministic realness plus 75% of
 * the judge's `isReal`, rounded and clamped to 0-100.
 *
 * @param {ReadonlyArray<object>} files - Produced files.
 * @param {object} signals - Authenticity signals passed to `scoreAuthenticity`.
 * @param {(system: string, user: string) => Promise<string>} complete - LLM completion callback.
 * @param {{intent?: string, rubric?: string, grayBand?: [number, number], mockGrayThreshold?: number}} [opts]
 *   `grayBand` defaults to `[30, 70]`; `mockGrayThreshold` defaults to `8`.
 * @returns {Promise<object>} The deterministic result plus `blendedRealness`,
 *   `band`, `consultedLlm`, and (gray band only) `judgment`.
 */
async function scoreRealnessBlended(files, signals, complete, opts = {}) {
  const deterministic = scoreAuthenticity(files, signals);
  const [grayFloor, grayCeiling] = opts.grayBand ?? [30, 70];
  const mockGrayThreshold = opts.mockGrayThreshold ?? 8;
  const { realness } = deterministic;

  const looksReal = deterministic.requiredArtifactPresent && deterministic.usesRealImpl;
  // NOTE(review): this keys on `wired` (the domain wiring signal), not on
  // `artifactWired`. scoreAuthenticity sets `wired` to false whenever no wiring
  // signal is configured, so in that setup every real-looking result is sent to
  // the LLM. Confirm that is intended.
  const hasFakenessMarker =
    deterministic.fakeShim ||
    !deterministic.wired ||
    deterministic.mockDensity >= mockGrayThreshold;
  const withinGrayRange = realness >= grayFloor && realness <= grayCeiling;

  if (!((looksReal && hasFakenessMarker) || withinGrayRange)) {
    const band = realness < grayFloor ? "clean-fake" : "clean-real";
    return { ...deterministic, blendedRealness: realness, band, consultedLlm: false };
  }

  const band = "gray";
  const judgment = await judgeRealnessLlm(files, complete, {
    intent: opts.intent,
    rubric: opts.rubric,
    prioritize: signals.requiredArtifact
  });
  const weighted = Math.round(0.25 * deterministic.realness + 0.75 * judgment.isReal);
  const blendedRealness = Math.max(0, Math.min(100, weighted));
  return { ...deterministic, blendedRealness, band, consultedLlm: true, judgment };
}
123
208
  export {
124
209
  gateRealness,
210
+ judgeRealnessLlm,
125
211
  scoreAuthenticity,
126
- scoreAuthenticityNuance
212
+ scoreAuthenticityNuance,
213
+ scoreRealnessBlended
127
214
  };
128
215
  //# sourceMappingURL=index.js.map