@tangle-network/agent-eval 0.76.0 → 0.77.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Authenticity — "is this real, or convincing BS?"
|
|
3
|
+
*
|
|
4
|
+
* Pass/build-style scoring rewards anything that compiles and renders, so an
|
|
5
|
+
* agent can ship a polished frontend with a FAKE in-browser engine and zero of
|
|
6
|
+
* the required on-chain/contract work, and outscore a half-finished real
|
|
7
|
+
* implementation. This module scores what buildability does not: did the agent
|
|
8
|
+
* actually build the intended thing on the intended infra, or fake it.
|
|
9
|
+
*
|
|
10
|
+
* Two layers:
|
|
11
|
+
* - DETERMINISTIC `scoreAuthenticity` — calibrated by construction (no LLM,
|
|
12
|
+
* trustworthy today). Structural signals over the produced files, driven by
|
|
13
|
+
* a domain `AuthenticitySignals` config: required artifact present, real
|
|
14
|
+
* implementation of the hard part, real infra calls, wiring, fake-shim
|
|
15
|
+
* detection, mock/stub density.
|
|
16
|
+
* - LLM NUANCE `scoreAuthenticityNuance` — mocked% / fake% / unique% for the
|
|
17
|
+
* "looks real but is hollow" cases structure can't see.
|
|
18
|
+
*
|
|
19
|
+
* `gateRealness` is the anti-Goodhart gate: a submission missing the required
|
|
20
|
+
* artifact (or faking it) is capped and cannot rank high regardless of how
|
|
21
|
+
* buildable it is. Domain-agnostic; ships a Solidity/Fhenix preset.
|
|
22
|
+
*
|
|
23
|
+
* Input is the produced-state currency: `{ path, content }[]` — exactly what
|
|
24
|
+
* `extractProducedState(...).artifacts` yields, so any consumer can feed a run's
|
|
25
|
+
* produced state straight in.
|
|
26
|
+
*/
|
|
27
|
+
/**
 * One entry of a run's produced state: a file path plus, optionally, its text.
 */
interface ProducedFile {
  /** Path of the produced file; path-based signal patterns are tested against it. */
  path: string;
  /** Text of the file. May be omitted; the scorer then reads it as an empty string. */
  content?: string;
}
|
|
31
|
+
/**
 * Domain configuration for the deterministic authenticity scan: the patterns
 * that distinguish a real implementation from a convincing stand-in.
 */
interface AuthenticitySignals {
  /** Human-readable domain label (e.g. 'fhenix-fhe'); echoed in the missing-artifact flag. */
  label: string;
  /**
   * Path pattern for a file the task REQUIRES (e.g. /\.sol$/ for an on-chain
   * task). When omitted, the required artifact counts as present.
   */
  requiredArtifact?: RegExp;
  /** Path pattern for vendored/3rd-party files; matching paths never count as the required artifact. */
  vendored?: RegExp;
  /**
   * Real implementation of the hard part (e.g. Fhenix encrypted types + FHE.*
   * ops). Tested against the content of the required-artifact files when
   * `requiredArtifact` is set, otherwise against all content. It is a plain
   * content match, so comments and strings only fail to count if the regex
   * is written tightly.
   */
  realImpl: RegExp;
  /** Real use of the intended client infra (e.g. cofhejs.encrypt() calls); tested against all content. */
  realInfra: RegExp;
  /**
   * Evidence the artifact is actually wired/used (e.g. contract writes).
   * Tested against the content of the files that are not required artifacts
   * (all content when that text is empty). When omitted, `wired` is false.
   */
  wiring?: RegExp;
  /**
   * A fake shim standing in for the real thing. Tested against each file's
   * basename and against its content; a match on either one counts.
   */
  fakeShim: RegExp;
  /** Mock/stub/TODO markers counted for mock density. Defaults to a generic set. */
  mock?: RegExp;
  /** Score weights; defaults are artifact 40, impl 25, infra 20, wiring 15. */
  weights?: { artifact?: number; impl?: number; infra?: number; wiring?: number };
}
|
|
58
|
+
/** Outcome of the deterministic authenticity scan. */
interface AuthenticityResult {
  /** Deterministic realness score, clamped to 0 (BS) … 100 (real on real infra). */
  realness: number;
  /** True when a required artifact was found, or when none was configured. */
  requiredArtifactPresent: boolean;
  /** Number of non-vendored files whose path matched the required-artifact pattern. */
  requiredArtifactCount: number;
  /** True when the real-implementation pattern matched. */
  usesRealImpl: boolean;
  /** True when the real-infra pattern matched somewhere in the produced content. */
  realInfra: boolean;
  /** True when the wiring pattern matched; always false if no wiring pattern was given. */
  wired: boolean;
  /** True when any file's basename or content matched the fake-shim pattern. */
  fakeShim: boolean;
  /** Mock/stub markers per 1000 LOC, capped at 100. */
  mockDensity: number;
  /** Human-readable BS flags: what is missing or faked. */
  flags: string[];
}
|
|
72
|
+
/**
 * Deterministic authenticity scan over produced files. No LLM and no IO: the
 * result is computed purely from the given files and signal patterns.
 *
 * @param files - Produced files to inspect.
 * @param signals - Domain patterns and optional score weights.
 * @returns The realness score, the individual signal readings, and the flags raised.
 */
declare function scoreAuthenticity(
  files: readonly ProducedFile[],
  signals: AuthenticitySignals,
): AuthenticityResult;
|
|
75
|
+
/** Verdict of the realness gate. */
interface RealnessGate {
  /** True when the submission is gated (held back). */
  gated: boolean;
  /** Why the submission was gated; absent when it was not. */
  reason?: string;
}
|
|
79
|
+
/**
 * Anti-Goodhart gate: a submission that lacks the required artifact, fakes
 * it, or scores below the realness floor is gated, regardless of buildability.
 *
 * @param r - Result of the deterministic authenticity scan.
 * @param opts - Optional settings: `floor` is the minimum acceptable realness,
 *   `requireArtifact` controls whether a missing required artifact gates.
 * @returns Whether the submission is gated and, if so, why.
 */
declare function gateRealness(
  r: AuthenticityResult,
  opts?: {
    floor?: number;
    requireArtifact?: boolean;
  },
): RealnessGate;
|
|
85
|
+
/** LLM-judged read on how mocked, hollow and distinctive the produced work is. */
interface AuthenticityNuance {
  /** Share that is mocked/stubbed: 0 (nothing mocked) … 100 (entirely mocked). */
  mockedPct: number;
  /** How hollow it is: 0 (genuine) … 100 (a hollow facade / cargo-culted). */
  fakePct: number;
  /** Distinctiveness: 0 (boilerplate/template clone) … 100 (distinctive real work). */
  uniquePct: number;
  /** The judge's short verdict, or a description of why judging failed. */
  verdict: string;
}
|
|
94
|
+
/**
 * Minimal completion function: takes a system prompt and a user prompt and
 * resolves to the model's raw text reply. Callers inject their own model
 * caller (router/tcloud), which keeps this module free of any specific LLM client.
 */
type CompleteFn = (
  system: string,
  user: string,
) => Promise<string>;
|
|
97
|
+
/**
 * LLM nuance scoring for the "looks real but is hollow" axis that structural
 * signals miss. The injected `complete` caller is asked for a JSON verdict,
 * which is returned as mocked / fake / unique percentages plus a sentence.
 *
 * Fail-soft: when the reply contains no JSON object, the JSON does not parse,
 * or `complete` throws, the result is the worst-case read (mockedPct 100,
 * fakePct 100, uniquePct 0) with the failure described in `verdict`.
 *
 * @param files - Produced files to show the judge.
 * @param complete - Model caller used to obtain the verdict.
 * @param opts - Optional settings: `intent` describes the intended deliverable,
 *   `prioritize` is a path pattern for files to list first in the digest.
 */
declare function scoreAuthenticityNuance(
  files: readonly ProducedFile[],
  complete: CompleteFn,
  opts?: {
    intent?: string;
    prioritize?: RegExp;
  },
): Promise<AuthenticityNuance>;
|
|
107
|
+
|
|
108
|
+
export { type AuthenticityNuance, type AuthenticityResult, type AuthenticitySignals, type CompleteFn, type ProducedFile, type RealnessGate, gateRealness, scoreAuthenticity, scoreAuthenticityNuance };
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import "../chunk-PZ5AY32C.js";
|
|
2
|
+
|
|
3
|
+
// src/authenticity/index.ts
|
|
4
|
+
// Generic mock/stub/TODO markers, used when `signals.mock` is not supplied.
var DEFAULT_MOCK = /\bmock|\bfake|\bdummy|\bstub\b|simulat|hardcoded|placeholder|TODO|not\s+implemented|FIXME/i;

/** Returns the last "/"-separated segment of `p` (the whole string when it contains no "/"). */
function basename(p) {
  return p.split("/").pop() ?? p;
}

/**
 * Deterministic authenticity scan of produced files. No LLM, no IO.
 *
 * Score: starts at 0, adds the artifact / impl / infra / wiring weights for
 * each signal that is satisfied, subtracts 25 for a fake shim and up to 20
 * for mock density, then clamps to 0…100.
 *
 * @param files - Produced files (`{ path, content? }`); missing content reads as "".
 * @param signals - Domain patterns and optional weights.
 * @returns The realness score, each signal reading, and human-readable flags.
 */
function scoreAuthenticity(files, signals) {
  // `RegExp.prototype.test` advances `lastIndex` on global/sticky patterns, so a
  // caller-supplied /…/g would otherwise give different answers from one file
  // (or one call) to the next. Resetting around every test keeps the scan
  // repeatable: same files in, same score out.
  const matches = (re, text) => {
    const stateful = re.global || re.sticky;
    if (stateful) re.lastIndex = 0;
    const found = re.test(text);
    if (stateful) re.lastIndex = 0;
    return found;
  };

  const w = {
    artifact: signals.weights?.artifact ?? 40,
    impl: signals.weights?.impl ?? 25,
    infra: signals.weights?.infra ?? 20,
    wiring: signals.weights?.wiring ?? 15
  };
  const mockRe = signals.mock ?? DEFAULT_MOCK;
  const requiredRe = signals.requiredArtifact;
  const vendoredRe = signals.vendored;

  // Required artifacts: path matches the required pattern and is not vendored.
  const required = requiredRe
    ? files.filter(
        (f) => matches(requiredRe, f.path) && !(vendoredRe ? matches(vendoredRe, f.path) : false)
      )
    : [];
  // Set lookup instead of `required.includes` keeps the split linear.
  const requiredSet = new Set(required);
  const others = requiredRe ? files.filter((f) => !requiredSet.has(f)) : files;

  const requiredText = required.map((f) => f.content ?? "").join("\n");
  const otherText = others.map((f) => f.content ?? "").join("\n");
  const allText = files.map((f) => f.content ?? "").join("\n");

  const requiredArtifactPresent = requiredRe ? required.length > 0 : true;
  // Real impl is looked for in the required artifact when there is one, else anywhere.
  const usesRealImpl = matches(signals.realImpl, requiredRe ? requiredText : allText);
  const realInfra = matches(signals.realInfra, allText);
  // Wiring is looked for outside the required artifact; if that text is empty, in everything.
  const wired = signals.wiring ? matches(signals.wiring, otherText || allText) : false;
  const fakeShim = files.some(
    (f) => matches(signals.fakeShim, basename(f.path)) || matches(signals.fakeShim, f.content ?? "")
  );

  // Count every marker occurrence: needs a global copy of the mock pattern.
  const countingFlags = mockRe.flags.includes("g") ? mockRe.flags : `${mockRe.flags}g`;
  const mockHits = (allText.match(new RegExp(mockRe.source, countingFlags)) ?? []).length;
  const loc = Math.max(1, allText.split("\n").length);
  const mockDensity = Math.min(100, Math.round(mockHits / loc * 1e3));

  let realness = 0;
  if (requiredArtifactPresent) realness += w.artifact;
  if (usesRealImpl) realness += w.impl;
  if (realInfra) realness += w.infra;
  if (wired) realness += w.wiring;
  if (fakeShim) realness -= 25;
  realness -= Math.min(20, mockDensity);
  realness = Math.max(0, Math.min(100, realness));

  const flags = [];
  if (requiredRe && !requiredArtifactPresent) {
    flags.push(
      `NO_REQUIRED_ARTIFACT: task needs ${signals.label} artifact (${signals.requiredArtifact}); none produced`
    );
  }
  if (requiredArtifactPresent && requiredRe && !usesRealImpl) {
    flags.push("ARTIFACT_NO_REAL_IMPL: required artifact exists but lacks the real implementation");
  }
  if (fakeShim) flags.push("FAKE_SHIM: ships a client-side stand-in simulating the real infra");
  // NOTE(review): this flag only fires when the required artifact is ALSO
  // missing, so it never fires when no `requiredArtifact` is configured.
  // Kept as-is; confirm that is the intended condition.
  if (!realInfra && !requiredArtifactPresent)
    flags.push("NO_REAL_INFRA: no real infra calls \u2014 cosmetic at best");
  if (mockDensity >= 8)
    flags.push(`HIGH_MOCK_DENSITY: ${mockDensity} mock/stub markers per 1000 LOC`);
  if (signals.wiring && requiredArtifactPresent && !wired)
    flags.push("NOT_WIRED: artifact exists but is never used by the client");

  return {
    realness,
    requiredArtifactPresent,
    requiredArtifactCount: required.length,
    usesRealImpl,
    realInfra,
    wired,
    fakeShim,
    mockDensity,
    flags
  };
}
|
|
71
|
+
/**
 * Anti-Goodhart gate over a deterministic authenticity result.
 *
 * Checks, in order: required artifact missing (unless `opts.requireArtifact`
 * is false), fake shim without a real implementation, realness below
 * `opts.floor` (default 30). The first check that trips decides the reason.
 *
 * @param r - Result produced by the authenticity scan.
 * @param opts - Optional `floor` and `requireArtifact` overrides.
 * @returns `{ gated: true, reason }` when a check trips, otherwise `{ gated: false }`.
 */
function gateRealness(r, opts = {}) {
  const minimumRealness = opts.floor ?? 30;
  const artifactRequired = opts.requireArtifact ?? true;

  let reason;
  if (artifactRequired && !r.requiredArtifactPresent) {
    reason = "required artifact missing";
  } else if (r.fakeShim && !r.usesRealImpl) {
    reason = "fake shim with no real implementation";
  } else if (r.realness < minimumRealness) {
    reason = `realness ${r.realness} below floor ${minimumRealness}`;
  }

  return reason === undefined ? { gated: false } : { gated: true, reason };
}
|
|
83
|
+
/**
 * Builds the text digest of produced files that is shown to the judge: at
 * most `maxFiles` files (default 14), each as a `// path` header followed by
 * the first `perFile` characters (default 1200) of its content.
 *
 * Files whose path matches `opts.prioritize` are listed first so a truncated
 * digest does not drop them; relative order is otherwise preserved.
 */
function fileDigest(files, opts = {}) {
  const maxFiles = opts.maxFiles ?? 14;
  const perFile = opts.perFile ?? 1200;
  const prioritize = opts.prioritize;

  let ordered = files;
  if (prioritize) {
    // Evaluate the pattern exactly once per file, resetting `lastIndex` so a
    // global/sticky regex cannot give inconsistent answers (a comparator that
    // re-tests a stateful regex makes the resulting order unpredictable).
    const isPriority = (f) => {
      const stateful = prioritize.global || prioritize.sticky;
      if (stateful) prioritize.lastIndex = 0;
      const found = prioritize.test(f.path);
      if (stateful) prioritize.lastIndex = 0;
      return found;
    };
    const leading = [];
    const trailing = [];
    for (const f of files) (isPriority(f) ? leading : trailing).push(f);
    ordered = [...leading, ...trailing];
  }

  return ordered
    .slice(0, maxFiles)
    .map((f) => `// ${f.path}\n${(f.content ?? "").slice(0, perFile)}`)
    .join("\n\n");
}

/**
 * Coerces a judge-supplied value to an integer percentage in 0…100.
 *
 * Numbers and non-blank numeric strings are rounded and clamped. Anything
 * else (missing, null, blank, non-numeric, non-finite) yields `fallback`, so
 * the caller decides which end of the scale an unusable value falls on.
 */
function clampPct(v, fallback = 0) {
  let n = NaN;
  if (typeof v === "number") n = v;
  else if (typeof v === "string" && v.trim() !== "") n = Number(v);
  return Number.isFinite(n) ? Math.max(0, Math.min(100, Math.round(n))) : fallback;
}

/**
 * LLM nuance scoring: asks the injected `complete` caller to judge how
 * mocked, hollow and distinctive the produced files are.
 *
 * Fail-soft, never a false pass: a reply without a JSON object, JSON that
 * does not parse, or a throwing `complete` yields the worst-case read
 * (mockedPct 100, fakePct 100, uniquePct 0). A parseable reply with a missing
 * or non-numeric field gets the worst-case value for that field.
 *
 * @param files - Produced files to show the judge.
 * @param complete - `(system, user) => Promise<string>` model caller.
 * @param opts - Optional `intent` (intended deliverable) and `prioritize`
 *   (path pattern for files to list first in the digest).
 * @returns Percentages plus the judge's verdict, or a failure description.
 */
async function scoreAuthenticityNuance(files, complete, opts = {}) {
  const worstCase = (verdict) => ({ mockedPct: 100, fakePct: 100, uniquePct: 0, verdict });

  const system = 'You audit whether an agent BUILT THE REAL THING or faked it. Be skeptical: a pretty UI, cosmetic labels, simulated/in-memory stand-ins for real infra, and cargo-culted imports do NOT count as real. Respond with ONLY JSON: {"mockedPct":0-100,"fakePct":0-100,"uniquePct":0-100,"verdict":"one sentence"}. mockedPct = how much is mocked/stubbed; fakePct = how hollow/facade it is; uniquePct = how distinctive vs boilerplate.';
  const user =
    (opts.intent ? `Intended deliverable: ${opts.intent}\n\n` : "") +
    `Produced files:\n${fileDigest(files, { prioritize: opts.prioritize })}`;

  try {
    const raw = await complete(system, user);
    // Take everything from the first "{" to the last "}" so surrounding chatter is ignored.
    const m = raw.match(/\{[\s\S]*\}/);
    if (!m) return worstCase("unparseable judge response");
    const j = JSON.parse(m[0]);
    return {
      // For mocked/fake, 100 is the bad end; for unique, 0 is the bad end.
      mockedPct: clampPct(j.mockedPct, 100),
      fakePct: clampPct(j.fakePct, 100),
      uniquePct: clampPct(j.uniquePct, 0),
      verdict: typeof j.verdict === "string" ? j.verdict : ""
    };
  } catch (err) {
    return worstCase(`judge error: ${err instanceof Error ? err.message : String(err)}`);
  }
}
|
|
123
|
+
export {
|
|
124
|
+
gateRealness,
|
|
125
|
+
scoreAuthenticity,
|
|
126
|
+
scoreAuthenticityNuance
|
|
127
|
+
};
|
|
128
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/authenticity/index.ts"],"sourcesContent":["/**\n * Authenticity — \"is this real, or convincing BS?\"\n *\n * Pass/build-style scoring rewards anything that compiles and renders, so an\n * agent can ship a polished frontend with a FAKE in-browser engine and zero of\n * the required on-chain/contract work, and outscore a half-finished real\n * implementation. This module scores what buildability does not: did the agent\n * actually build the intended thing on the intended infra, or fake it.\n *\n * Two layers:\n * - DETERMINISTIC `scoreAuthenticity` — calibrated by construction (no LLM,\n * trustworthy today). Structural signals over the produced files, driven by\n * a domain `AuthenticitySignals` config: required artifact present, real\n * implementation of the hard part, real infra calls, wiring, fake-shim\n * detection, mock/stub density.\n * - LLM NUANCE `scoreAuthenticityNuance` — mocked% / fake% / unique% for the\n * \"looks real but is hollow\" cases structure can't see.\n *\n * `gateRealness` is the anti-Goodhart gate: a submission missing the required\n * artifact (or faking it) is capped and cannot rank high regardless of how\n * buildable it is. Domain-agnostic; ships a Solidity/Fhenix preset.\n *\n * Input is the produced-state currency: `{ path, content }[]` — exactly what\n * `extractProducedState(...).artifacts` yields, so any consumer can feed a run's\n * produced state straight in.\n */\n\nexport interface ProducedFile {\n path: string\n content?: string\n}\n\nexport interface AuthenticitySignals {\n /** Human label for the domain (e.g. 'fhenix-fhe'). */\n label: string\n /** A file the task REQUIRES (e.g. /\\.sol$/ for an on-chain task). */\n requiredArtifact?: RegExp\n /** Vendored/3rd-party paths to exclude from required-artifact detection. */\n vendored?: RegExp\n /** Real implementation of the hard part, inside the required artifact\n * (e.g. Fhenix encrypted types + FHE.* ops). 
Matched against content, so it\n * fails on comments/strings only if the regex is written tightly. */\n realImpl: RegExp\n /** Real use of the intended client infra (e.g. cofhejs.encrypt() calls). */\n realInfra: RegExp\n /** Evidence the artifact is actually wired/used (e.g. contract writes). */\n wiring?: RegExp\n /** A fake shim standing in for the real thing — matched on file path AND body. */\n fakeShim: RegExp\n /** Mock/stub/TODO markers. Defaults to a generic set. */\n mock?: RegExp\n /** Score weights (default 40/25/20/15). */\n weights?: { artifact?: number; impl?: number; infra?: number; wiring?: number }\n}\n\nexport interface AuthenticityResult {\n /** Deterministic realness, 0 (BS) … 100 (real on real infra). */\n realness: number\n requiredArtifactPresent: boolean\n requiredArtifactCount: number\n usesRealImpl: boolean\n realInfra: boolean\n wired: boolean\n fakeShim: boolean\n /** mock/stub markers per 1000 LOC, capped at 100. */\n mockDensity: number\n /** Human-readable BS flags — what's missing or faked. */\n flags: string[]\n}\n\nconst DEFAULT_MOCK =\n /\\bmock|\\bfake|\\bdummy|\\bstub\\b|simulat|hardcoded|placeholder|TODO|not\\s+implemented|FIXME/i\n\nfunction basename(p: string): string {\n return p.split('/').pop() ?? p\n}\n\n/** Deterministic authenticity scan of produced files. Pure — same files in,\n * same score out. No LLM, no IO. */\nexport function scoreAuthenticity(\n files: readonly ProducedFile[],\n signals: AuthenticitySignals,\n): AuthenticityResult {\n const w = {\n artifact: signals.weights?.artifact ?? 40,\n impl: signals.weights?.impl ?? 25,\n infra: signals.weights?.infra ?? 20,\n wiring: signals.weights?.wiring ?? 15,\n }\n const mockRe = signals.mock ?? DEFAULT_MOCK\n\n const required = signals.requiredArtifact\n ? files.filter(\n (f) => signals.requiredArtifact!.test(f.path) && !(signals.vendored?.test(f.path) ?? false),\n )\n : []\n const others = signals.requiredArtifact ? 
files.filter((f) => !required.includes(f)) : files\n\n const requiredText = required.map((f) => f.content ?? '').join('\\n')\n const otherText = others.map((f) => f.content ?? '').join('\\n')\n const allText = files.map((f) => f.content ?? '').join('\\n')\n\n const requiredArtifactPresent = signals.requiredArtifact ? required.length > 0 : true\n // Real impl looked for in the required artifact when there is one, else anywhere.\n const usesRealImpl = signals.realImpl.test(signals.requiredArtifact ? requiredText : allText)\n const realInfra = signals.realInfra.test(allText)\n const wired = signals.wiring ? signals.wiring.test(otherText || allText) : false\n const fakeShim = files.some(\n (f) => signals.fakeShim.test(basename(f.path)) || signals.fakeShim.test(f.content ?? ''),\n )\n\n const mockHits = (\n allText.match(\n new RegExp(mockRe.source, mockRe.flags.includes('g') ? mockRe.flags : `${mockRe.flags}g`),\n ) ?? []\n ).length\n const loc = Math.max(1, allText.split('\\n').length)\n const mockDensity = Math.min(100, Math.round((mockHits / loc) * 1000))\n\n let realness = 0\n if (requiredArtifactPresent) realness += w.artifact\n if (usesRealImpl) realness += w.impl\n if (realInfra) realness += w.infra\n if (wired) realness += w.wiring\n if (fakeShim) realness -= 25\n realness -= Math.min(20, mockDensity)\n realness = Math.max(0, Math.min(100, realness))\n\n const flags: string[] = []\n if (signals.requiredArtifact && !requiredArtifactPresent) {\n flags.push(\n `NO_REQUIRED_ARTIFACT: task needs ${signals.label} artifact (${signals.requiredArtifact}); none produced`,\n )\n }\n if (requiredArtifactPresent && signals.requiredArtifact && !usesRealImpl) {\n flags.push('ARTIFACT_NO_REAL_IMPL: required artifact exists but lacks the real implementation')\n }\n if (fakeShim) flags.push('FAKE_SHIM: ships a client-side stand-in simulating the real infra')\n if (!realInfra && !requiredArtifactPresent)\n flags.push('NO_REAL_INFRA: no real infra calls — cosmetic at best')\n if 
(mockDensity >= 8)\n flags.push(`HIGH_MOCK_DENSITY: ${mockDensity} mock/stub markers per 1000 LOC`)\n if (signals.wiring && requiredArtifactPresent && !wired)\n flags.push('NOT_WIRED: artifact exists but is never used by the client')\n\n return {\n realness,\n requiredArtifactPresent,\n requiredArtifactCount: required.length,\n usesRealImpl,\n realInfra,\n wired,\n fakeShim,\n mockDensity,\n flags,\n }\n}\n\nexport interface RealnessGate {\n gated: boolean\n reason?: string\n}\n\n/** Anti-Goodhart gate: a required-artifact-missing or faked submission is\n * capped and cannot rank high regardless of buildability. */\nexport function gateRealness(\n r: AuthenticityResult,\n opts: { floor?: number; requireArtifact?: boolean } = {},\n): RealnessGate {\n const floor = opts.floor ?? 30\n if ((opts.requireArtifact ?? true) && !r.requiredArtifactPresent) {\n return { gated: true, reason: 'required artifact missing' }\n }\n if (r.fakeShim && !r.usesRealImpl) {\n return { gated: true, reason: 'fake shim with no real implementation' }\n }\n if (r.realness < floor)\n return { gated: true, reason: `realness ${r.realness} below floor ${floor}` }\n return { gated: false }\n}\n\n// ── LLM nuance layer ─────────────────────────────────────────────────────────\n\nexport interface AuthenticityNuance {\n /** 0 (nothing mocked) … 100 (entirely mocked). */\n mockedPct: number\n /** 0 (genuine) … 100 (a hollow facade / cargo-culted). */\n fakePct: number\n /** 0 (boilerplate/template clone) … 100 (distinctive real work). */\n uniquePct: number\n verdict: string\n}\n\n/** A minimal completion fn — inject your model caller (router/tcloud). Keeps\n * this module free of any specific LLM client. */\nexport type CompleteFn = (system: string, user: string) => Promise<string>\n\nfunction fileDigest(\n files: readonly ProducedFile[],\n opts: { maxFiles?: number; perFile?: number; prioritize?: RegExp } = {},\n): string {\n const maxFiles = opts.maxFiles ?? 14\n const perFile = opts.perFile ?? 
1200\n // Lead with the required-artifact files (e.g. .sol) so a truncated digest\n // never hides the very thing the judge must assess.\n const ordered = opts.prioritize\n ? [...files].sort(\n (a, b) => Number(opts.prioritize!.test(b.path)) - Number(opts.prioritize!.test(a.path)),\n )\n : files\n return ordered\n .slice(0, maxFiles)\n .map((f) => `// ${f.path}\\n${(f.content ?? '').slice(0, perFile)}`)\n .join('\\n\\n')\n}\n\nfunction clampPct(v: unknown): number {\n const n = typeof v === 'number' ? v : Number(v)\n return Number.isFinite(n) ? Math.max(0, Math.min(100, Math.round(n))) : 0\n}\n\n/**\n * LLM nuance scoring — judges the \"looks real but is hollow\" axis structure\n * misses. Inject a `complete` caller; returns mocked/fake/unique % + a verdict.\n * Fail-soft: a bad/unparseable response yields a worst-case (fully-fake) read,\n * never a false pass.\n */\nexport async function scoreAuthenticityNuance(\n files: readonly ProducedFile[],\n complete: CompleteFn,\n opts: { intent?: string; prioritize?: RegExp } = {},\n): Promise<AuthenticityNuance> {\n const system =\n 'You audit whether an agent BUILT THE REAL THING or faked it. Be skeptical: ' +\n 'a pretty UI, cosmetic labels, simulated/in-memory stand-ins for real infra, ' +\n 'and cargo-culted imports do NOT count as real. Respond with ONLY JSON: ' +\n '{\"mockedPct\":0-100,\"fakePct\":0-100,\"uniquePct\":0-100,\"verdict\":\"one sentence\"}. ' +\n 'mockedPct = how much is mocked/stubbed; fakePct = how hollow/facade it is; ' +\n 'uniquePct = how distinctive vs boilerplate.'\n const user =\n (opts.intent ? 
`Intended deliverable: ${opts.intent}\\n\\n` : '') +\n `Produced files:\\n${fileDigest(files, { prioritize: opts.prioritize })}`\n try {\n const raw = await complete(system, user)\n const m = raw.match(/\\{[\\s\\S]*\\}/)\n if (!m)\n return { mockedPct: 100, fakePct: 100, uniquePct: 0, verdict: 'unparseable judge response' }\n const j = JSON.parse(m[0]) as Record<string, unknown>\n return {\n mockedPct: clampPct(j.mockedPct),\n fakePct: clampPct(j.fakePct),\n uniquePct: clampPct(j.uniquePct),\n verdict: typeof j.verdict === 'string' ? j.verdict : '',\n }\n } catch (err) {\n return {\n mockedPct: 100,\n fakePct: 100,\n uniquePct: 0,\n verdict: `judge error: ${err instanceof Error ? err.message : String(err)}`,\n }\n }\n}\n\n// Domain `AuthenticitySignals` (e.g. a Solidity/Fhenix preset) live in the\n// CONSUMER, not the substrate — this module stays domain-agnostic.\n"],"mappings":";;;AAsEA,IAAM,eACJ;AAEF,SAAS,SAAS,GAAmB;AACnC,SAAO,EAAE,MAAM,GAAG,EAAE,IAAI,KAAK;AAC/B;AAIO,SAAS,kBACd,OACA,SACoB;AACpB,QAAM,IAAI;AAAA,IACR,UAAU,QAAQ,SAAS,YAAY;AAAA,IACvC,MAAM,QAAQ,SAAS,QAAQ;AAAA,IAC/B,OAAO,QAAQ,SAAS,SAAS;AAAA,IACjC,QAAQ,QAAQ,SAAS,UAAU;AAAA,EACrC;AACA,QAAM,SAAS,QAAQ,QAAQ;AAE/B,QAAM,WAAW,QAAQ,mBACrB,MAAM;AAAA,IACJ,CAAC,MAAM,QAAQ,iBAAkB,KAAK,EAAE,IAAI,KAAK,EAAE,QAAQ,UAAU,KAAK,EAAE,IAAI,KAAK;AAAA,EACvF,IACA,CAAC;AACL,QAAM,SAAS,QAAQ,mBAAmB,MAAM,OAAO,CAAC,MAAM,CAAC,SAAS,SAAS,CAAC,CAAC,IAAI;AAEvF,QAAM,eAAe,SAAS,IAAI,CAAC,MAAM,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AACnE,QAAM,YAAY,OAAO,IAAI,CAAC,MAAM,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AAC9D,QAAM,UAAU,MAAM,IAAI,CAAC,MAAM,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AAE3D,QAAM,0BAA0B,QAAQ,mBAAmB,SAAS,SAAS,IAAI;AAEjF,QAAM,eAAe,QAAQ,SAAS,KAAK,QAAQ,mBAAmB,eAAe,OAAO;AAC5F,QAAM,YAAY,QAAQ,UAAU,KAAK,OAAO;AAChD,QAAM,QAAQ,QAAQ,SAAS,QAAQ,OAAO,KAAK,aAAa,OAAO,IAAI;AAC3E,QAAM,WAAW,MAAM;AAAA,IACrB,CAAC,MAAM,QAAQ,SAAS,KAAK,SAAS,EAAE,IAAI,CAAC,KAAK,QAAQ,SAAS,KAAK,EAAE,WAAW,EAAE;AAAA,EACzF;AAEA,QAAM,YACJ,QAAQ;AAAA,IACN,IAAI,OAAO,OAAO,QAAQ,OAAO,MAAM,SAAS,GAAG,IAAI,OAAO,QAAQ,GAAG
,OAAO,KAAK,GAAG;AAAA,EAC1F,KAAK,CAAC,GACN;AACF,QAAM,MAAM,KAAK,IAAI,GAAG,QAAQ,MAAM,IAAI,EAAE,MAAM;AAClD,QAAM,cAAc,KAAK,IAAI,KAAK,KAAK,MAAO,WAAW,MAAO,GAAI,CAAC;AAErE,MAAI,WAAW;AACf,MAAI,wBAAyB,aAAY,EAAE;AAC3C,MAAI,aAAc,aAAY,EAAE;AAChC,MAAI,UAAW,aAAY,EAAE;AAC7B,MAAI,MAAO,aAAY,EAAE;AACzB,MAAI,SAAU,aAAY;AAC1B,cAAY,KAAK,IAAI,IAAI,WAAW;AACpC,aAAW,KAAK,IAAI,GAAG,KAAK,IAAI,KAAK,QAAQ,CAAC;AAE9C,QAAM,QAAkB,CAAC;AACzB,MAAI,QAAQ,oBAAoB,CAAC,yBAAyB;AACxD,UAAM;AAAA,MACJ,oCAAoC,QAAQ,KAAK,cAAc,QAAQ,gBAAgB;AAAA,IACzF;AAAA,EACF;AACA,MAAI,2BAA2B,QAAQ,oBAAoB,CAAC,cAAc;AACxE,UAAM,KAAK,mFAAmF;AAAA,EAChG;AACA,MAAI,SAAU,OAAM,KAAK,mEAAmE;AAC5F,MAAI,CAAC,aAAa,CAAC;AACjB,UAAM,KAAK,4DAAuD;AACpE,MAAI,eAAe;AACjB,UAAM,KAAK,sBAAsB,WAAW,iCAAiC;AAC/E,MAAI,QAAQ,UAAU,2BAA2B,CAAC;AAChD,UAAM,KAAK,4DAA4D;AAEzE,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,uBAAuB,SAAS;AAAA,IAChC;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AASO,SAAS,aACd,GACA,OAAsD,CAAC,GACzC;AACd,QAAM,QAAQ,KAAK,SAAS;AAC5B,OAAK,KAAK,mBAAmB,SAAS,CAAC,EAAE,yBAAyB;AAChE,WAAO,EAAE,OAAO,MAAM,QAAQ,4BAA4B;AAAA,EAC5D;AACA,MAAI,EAAE,YAAY,CAAC,EAAE,cAAc;AACjC,WAAO,EAAE,OAAO,MAAM,QAAQ,wCAAwC;AAAA,EACxE;AACA,MAAI,EAAE,WAAW;AACf,WAAO,EAAE,OAAO,MAAM,QAAQ,YAAY,EAAE,QAAQ,gBAAgB,KAAK,GAAG;AAC9E,SAAO,EAAE,OAAO,MAAM;AACxB;AAkBA,SAAS,WACP,OACA,OAAqE,CAAC,GAC9D;AACR,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,UAAU,KAAK,WAAW;AAGhC,QAAM,UAAU,KAAK,aACjB,CAAC,GAAG,KAAK,EAAE;AAAA,IACT,CAAC,GAAG,MAAM,OAAO,KAAK,WAAY,KAAK,EAAE,IAAI,CAAC,IAAI,OAAO,KAAK,WAAY,KAAK,EAAE,IAAI,CAAC;AAAA,EACxF,IACA;AACJ,SAAO,QACJ,MAAM,GAAG,QAAQ,EACjB,IAAI,CAAC,MAAM,MAAM,EAAE,IAAI;AAAA,GAAM,EAAE,WAAW,IAAI,MAAM,GAAG,OAAO,CAAC,EAAE,EACjE,KAAK,MAAM;AAChB;AAEA,SAAS,SAAS,GAAoB;AACpC,QAAM,IAAI,OAAO,MAAM,WAAW,IAAI,OAAO,CAAC;AAC9C,SAAO,OAAO,SAAS,CAAC,IAAI,KAAK,IAAI,GAAG,KAAK,IAAI,KAAK,KAAK,MAAM,CAAC,CAAC,CAAC,IAAI;AAC1E;AAQA,eAAsB,wBACpB,OACA,UACA,OAAiD,CAAC,GACrB;AAC7B,QAAM,SACJ;AAMF,QAAM,QACH,KAAK,SAAS,yBAAyB,KAAK,MAAM;AAAA;AAAA,IAAS,MAC5D;AAAA,EAAoB,WAAW,OAAO,EAAE,YAAY,KAAK,WAAW,CAAC,CAAC;AACxE,MAAI;A
ACF,UAAM,MAAM,MAAM,SAAS,QAAQ,IAAI;AACvC,UAAM,IAAI,IAAI,MAAM,aAAa;AACjC,QAAI,CAAC;AACH,aAAO,EAAE,WAAW,KAAK,SAAS,KAAK,WAAW,GAAG,SAAS,6BAA6B;AAC7F,UAAM,IAAI,KAAK,MAAM,EAAE,CAAC,CAAC;AACzB,WAAO;AAAA,MACL,WAAW,SAAS,EAAE,SAAS;AAAA,MAC/B,SAAS,SAAS,EAAE,OAAO;AAAA,MAC3B,WAAW,SAAS,EAAE,SAAS;AAAA,MAC/B,SAAS,OAAO,EAAE,YAAY,WAAW,EAAE,UAAU;AAAA,IACvD;AAAA,EACF,SAAS,KAAK;AACZ,WAAO;AAAA,MACL,WAAW;AAAA,MACX,SAAS;AAAA,MACT,WAAW;AAAA,MACX,SAAS,gBAAgB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,IAC3E;AAAA,EACF;AACF;","names":[]}
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.76.0",
|
|
5
|
+
"version": "0.77.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.76.0",
|
|
3
|
+
"version": "0.77.0",
|
|
4
4
|
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -114,6 +114,11 @@
|
|
|
114
114
|
"import": "./dist/storyboard/index.js",
|
|
115
115
|
"default": "./dist/storyboard/index.js"
|
|
116
116
|
},
|
|
117
|
+
"./authenticity": {
|
|
118
|
+
"types": "./dist/authenticity/index.d.ts",
|
|
119
|
+
"import": "./dist/authenticity/index.js",
|
|
120
|
+
"default": "./dist/authenticity/index.js"
|
|
121
|
+
},
|
|
117
122
|
"./workflow": {
|
|
118
123
|
"types": "./dist/workflow/index.d.ts",
|
|
119
124
|
"import": "./dist/workflow/index.js",
|