@tangle-network/agent-eval 0.61.0 → 0.63.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +48 -8
- package/dist/adapters/http.d.ts +4 -1
- package/dist/adapters/langchain.d.ts +4 -1
- package/dist/adapters/otel.d.ts +4 -4
- package/dist/{agent-profile-9J9hxdm2.d.ts → agent-profile-DzcPHR1Z.d.ts} +1 -1
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/campaign/index.d.ts +388 -11
- package/dist/campaign/index.js +597 -12
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-GMXHLSLL.js → chunk-4ODZXQV2.js} +81 -98
- package/dist/chunk-4ODZXQV2.js.map +1 -0
- package/dist/{chunk-OLULBECP.js → chunk-7TPYV2ER.js} +27 -1
- package/dist/chunk-7TPYV2ER.js.map +1 -0
- package/dist/chunk-E22YUOAL.js +111 -0
- package/dist/chunk-E22YUOAL.js.map +1 -0
- package/dist/{chunk-SUGME4OT.js → chunk-Z7ZU7IYZ.js} +209 -85
- package/dist/chunk-Z7ZU7IYZ.js.map +1 -0
- package/dist/contract/index.d.ts +9 -9
- package/dist/contract/index.js +4 -3
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-Bf8owbuG.d.ts → control-DxvZeV5X.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/hosted/index.d.ts +4 -4
- package/dist/{index-Bvk35ils.d.ts → index-DsnOpCO6.d.ts} +1 -1
- package/dist/{index-D9dwa00f.d.ts → index-GISRh500.d.ts} +2 -2
- package/dist/index.d.ts +98 -14
- package/dist/index.js +331 -128
- package/dist/index.js.map +1 -1
- package/dist/meta-eval/index.d.ts +2 -2
- package/dist/multishot/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/{provenance-D0WeCXt1.d.ts → provenance-cUnovpWV.d.ts} +42 -11
- package/dist/{registry-qmbYT3Eo.d.ts → registry-DPly4_hZ.d.ts} +1 -1
- package/dist/{release-report-DszkgvJ3.d.ts → release-report-DGoeObZT.d.ts} +2 -2
- package/dist/reporting.d.ts +4 -4
- package/dist/{researcher-BaVsy0sW.d.ts → researcher-WJvIpX3L.d.ts} +2 -2
- package/dist/rl.d.ts +6 -6
- package/dist/{rubric-predictive-validity-DgBHWsh7.d.ts → rubric-predictive-validity-D_4BSXGV.d.ts} +1 -1
- package/dist/{run-campaign-HXPJAUZ3.js → run-campaign-5J3ED2UJ.js} +3 -2
- package/dist/{run-record-DgUVo5pw.d.ts → run-record-BgTFzO2r.d.ts} +1 -1
- package/dist/{summary-report-BQvXpvaR.d.ts → summary-report-ByiOUrHj.d.ts} +1 -1
- package/dist/{types-Beb6KPqZ.d.ts → types-c2R2kfmv.d.ts} +45 -12
- package/package.json +1 -1
- package/dist/chunk-GMXHLSLL.js.map +0 -1
- package/dist/chunk-OLULBECP.js.map +0 -1
- package/dist/chunk-SUGME4OT.js.map +0 -1
- /package/dist/{run-campaign-HXPJAUZ3.js.map → run-campaign-5J3ED2UJ.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -9,22 +9,29 @@ import {
|
|
|
9
9
|
runBehavioralCanaries
|
|
10
10
|
} from "./chunk-SHTXZ4O2.js";
|
|
11
11
|
import {
|
|
12
|
-
BackendIntegrityError,
|
|
13
12
|
DEFAULT_MUTATION_PRIMITIVES,
|
|
14
13
|
DEFAULT_RED_TEAM_CORPUS,
|
|
15
14
|
Dataset,
|
|
16
15
|
HoldoutLockedError,
|
|
17
|
-
assertRealBackend,
|
|
18
16
|
buildReflectionPrompt,
|
|
17
|
+
crowdingDistance,
|
|
18
|
+
dominates,
|
|
19
19
|
hashScenarios,
|
|
20
|
+
paretoFrontier,
|
|
21
|
+
paretoFrontierWithCrowding,
|
|
20
22
|
parseReflectionResponse,
|
|
21
23
|
redTeamDataset,
|
|
22
24
|
redTeamReport,
|
|
23
25
|
runCanaries,
|
|
26
|
+
scalarScore,
|
|
24
27
|
scoreRedTeamOutput,
|
|
25
|
-
summarizeBackendIntegrity,
|
|
26
28
|
toolNamesForRun
|
|
27
|
-
} from "./chunk-
|
|
29
|
+
} from "./chunk-4ODZXQV2.js";
|
|
30
|
+
import {
|
|
31
|
+
BackendIntegrityError,
|
|
32
|
+
assertRealBackend,
|
|
33
|
+
summarizeBackendIntegrity
|
|
34
|
+
} from "./chunk-E22YUOAL.js";
|
|
28
35
|
import {
|
|
29
36
|
BENCHMARK_SPLIT_SEED,
|
|
30
37
|
benchmarks_exports,
|
|
@@ -723,8 +730,8 @@ function createVerifierAdapter(opts) {
|
|
|
723
730
|
const report = await opts.verifier.run({ env, ...opts.options });
|
|
724
731
|
const out = [];
|
|
725
732
|
for (const layer of report.layers) {
|
|
726
|
-
for (const
|
|
727
|
-
out.push(liftLayerFinding(id, area, layer.layer,
|
|
733
|
+
for (const finding2 of layer.findings) {
|
|
734
|
+
out.push(liftLayerFinding(id, area, layer.layer, finding2));
|
|
728
735
|
}
|
|
729
736
|
if (layer.status === "fail" || layer.status === "error" || layer.status === "timeout") {
|
|
730
737
|
out.push(
|
|
@@ -1754,6 +1761,281 @@ var DEFAULT_TRACE_ANALYST_KINDS = [
|
|
|
1754
1761
|
IMPROVEMENT_KIND_SPEC
|
|
1755
1762
|
];
|
|
1756
1763
|
|
|
1764
|
+
// src/analyst/kinds/skill-usage.ts
|
|
1765
|
+
import { existsSync as existsSync3, readdirSync, readFileSync as readFileSync2, statSync } from "fs";
|
|
1766
|
+
import { join } from "path";
|
|
1767
|
+
/** Line count above which a SKILL.md without a references/ directory is reported as bloated. */
var BLOAT_LINE_THRESHOLD = 300;
/**
 * Matches Tangle-internal identifiers inside a skill body (global, case-insensitive).
 *
 * The `@tangle-network/` and `~/company` alternatives deliberately carry no leading `\b`:
 * `@` and `~` are non-word characters, so a leading word-boundary assertion would only
 * succeed when a word character sits directly in front of them, and ordinary occurrences
 * (after a space, quote, or at line start) would never be counted.
 */
var TANGLE_PRIVATE_RE = /\b(cli-bridge|tangletools|ops-board|drew-gtr-pro|tangle\.tools|gtm-agent)\b|\bkimi\b|\btcloud\b|@tangle-network\/|~\/company\b/gi;
/** Matches a "trigger" / "triggers" label followed by optional whitespace and `:` or `-`. */
var TRIGGER_RE = /triggers?\s*[:-]/i;
|
|
1770
|
+
/**
 * Lists the immediate children of `root` that contain a `SKILL.md` file.
 *
 * Directory entries and symbolic links are both considered, so linked-in skills are
 * included. A root that is missing, is not a directory, or cannot be read yields an
 * empty list instead of throwing.
 *
 * @param {string} root - Directory whose children are candidate skill folders.
 * @returns {{ name: string, path: string }[]} Entry name and path to its SKILL.md.
 */
function listSkillDirs(root) {
  let entries;
  try {
    entries = readdirSync(root, { withFileTypes: true });
  } catch {
    // Best effort: an absent, non-directory, or unreadable root simply contributes no skills.
    return [];
  }
  const out = [];
  for (const entry of entries) {
    if (!entry.isDirectory() && !entry.isSymbolicLink()) continue;
    const skillMd = join(root, entry.name, "SKILL.md");
    if (existsSync3(skillMd)) out.push({ name: entry.name, path: skillMd });
  }
  return out;
}
|
|
1780
|
+
/**
 * Collects paths of entries named `*.jsonl` under `dir`, descending into subdirectories.
 *
 * Entries of each directory are visited in name order so that, when `cap` truncates the
 * walk, the same files are selected on every run regardless of the order in which the
 * file system enumerates them. Directories that cannot be read are skipped.
 *
 * @param {string} dir - Directory to walk; a missing directory yields `[]`.
 * @param {number} cap - Maximum number of files to return; values <= 0 mean no limit.
 * @returns {string[]} Paths of the matching files.
 */
function walkJsonl(dir, cap) {
  if (!existsSync3(dir)) return [];
  const files = [];
  const stack = [dir];
  while (stack.length > 0) {
    const cur = stack.pop();
    let entries;
    try {
      entries = readdirSync(cur, { withFileTypes: true });
    } catch {
      // Unreadable (or vanished) directory: skip it and keep walking.
      continue;
    }
    entries.sort((a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0));
    for (const e of entries) {
      const full = join(cur, e.name);
      if (e.isDirectory()) {
        stack.push(full);
      } else if (e.name.endsWith(".jsonl")) {
        files.push(full);
        if (cap > 0 && files.length >= cap) return files;
      }
    }
  }
  return files;
}
|
|
1803
|
+
/**
 * Extracts the `description:` value from a leading `---` front-matter block.
 *
 * Accepts LF or CRLF line endings and an optional leading byte-order mark, so files
 * saved on Windows are parsed the same way as LF files.
 *
 * @param {string} body - Full text of a SKILL.md file.
 * @returns {string} Text following the first `description:` in the front matter, or ""
 *   when there is no front-matter block or no description in it.
 */
function frontmatterDescription(body) {
  const fm = /^\uFEFF?---\r?\n([\s\S]*?)\r?\n---/.exec(body);
  const block = fm?.[1] ?? "";
  // `.` never matches a line terminator, so the capture stops before any trailing "\r".
  const m = /description:\s*(.+)/i.exec(block);
  return m?.[1] ?? "";
}
|
|
1809
|
+
/**
 * Counts on-disk artifacts attributed to a skill.
 *
 * For every root, `<root>/.evolve/<name>` and `<root>/<alias>` are candidates. A candidate
 * directory contributes its number of entries; any other existing path contributes 1.
 * Candidate paths are de-duplicated first, so an alias that resolves to the default
 * `.evolve/<name>` location (or a root listed twice) is not counted twice.
 *
 * @param {string[]} roots - Directories to search for artifacts.
 * @param {string} name - Skill name; used as the folder name under `.evolve`.
 * @param {string[]} aliases - Extra root-relative paths that also hold this skill's artifacts.
 * @returns {number} Total artifact count across all candidates.
 */
function countArtifacts(roots, name, aliases) {
  const candidates = new Set();
  for (const root of roots) {
    candidates.add(join(root, ".evolve", name));
    for (const alias of aliases) candidates.add(join(root, alias));
  }
  let n = 0;
  for (const candidate of candidates) {
    try {
      n += statSync(candidate).isDirectory() ? readdirSync(candidate).length : 1;
    } catch {
      // Best effort: a missing or unreadable candidate contributes nothing.
    }
  }
  return n;
}
|
|
1824
|
+
/**
 * Builds the per-skill usage report consumed by the skill-usage analyst.
 *
 * Signals gathered for each skill found under `config.skillRoots`:
 *  - direct invocations: `"skill": "<name>"` occurrences in transcript files (a
 *    `prefix:name` value is attributed to the part after the last colon);
 *  - slash invocations: `<command-name>/name</command-name>` occurrences in transcripts;
 *  - inbound references: other skills whose SKILL.md mentions `/name` or `[[name]]`;
 *  - artifact count, Tangle-private reference count, and structural flags read from disk.
 *
 * Skill bodies are keyed by file path so two skills with the same folder name in
 * different roots keep their own content, and skill names are regex-escaped before being
 * used in the inbound-reference pattern so names containing regex metacharacters cannot
 * throw or mis-match.
 *
 * @param {object} config
 * @param {{ root: string, kind: string }[]} config.skillRoots - Skill directories and their kind.
 * @param {string[]} config.transcriptDirs - Directories walked for `.jsonl` transcripts.
 * @param {number} [config.maxTranscriptsPerDir] - Per-directory transcript cap; unset/0 = no cap.
 * @param {string[]} [config.artifactRoots] - Roots searched for per-skill artifacts.
 * @param {Object<string, string[]>} [config.artifactAliases] - Extra artifact paths per skill name.
 * @returns {{ generatedFromTraces: number, records: object[] }} Transcript count and one record per skill.
 */
function buildSkillUsageReport(config) {
  const skills = config.skillRoots.flatMap(
    ({ root, kind }) => listSkillDirs(root).map((s) => ({ ...s, kind }))
  );
  const names = skills.map((s) => s.name);
  const direct = new Map(names.map((n) => [n, 0]));
  const slash = new Map(names.map((n) => [n, 0]));
  const skillRe = /"skill"\s*:\s*"([a-z0-9_:-]+)"/g;
  const cmdRe = /<command-name>\/?([a-z0-9_:-]+)<\/command-name>/g;
  let transcripts = 0;
  for (const dir of config.transcriptDirs) {
    for (const file of walkJsonl(dir, config.maxTranscriptsPerDir ?? 0)) {
      transcripts += 1;
      let data;
      try {
        data = readFileSync2(file, "utf8");
      } catch {
        // Unreadable transcript: still counted as discovered, but contributes no matches.
        continue;
      }
      for (const m of data.matchAll(skillRe)) {
        const g = m[1];
        if (!g) continue;
        // Namespaced values ("plugin:skill") are attributed to the trailing segment.
        const n = g.split(":").pop() ?? g;
        const prev = direct.get(n);
        if (prev !== void 0) direct.set(n, prev + 1);
      }
      for (const m of data.matchAll(cmdRe)) {
        const g = m[1];
        if (g === void 0) continue;
        const prev = slash.get(g);
        if (prev !== void 0) slash.set(g, prev + 1);
      }
    }
  }
  // Keyed by path (not name) so same-named skills in different roots do not overwrite each other.
  const bodies = /* @__PURE__ */ new Map();
  for (const s of skills) {
    try {
      bodies.set(s.path, readFileSync2(s.path, "utf8"));
    } catch {
      bodies.set(s.path, "");
    }
  }
  const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const inbound = new Map(names.map((n) => [n, 0]));
  // Iterate unique names so a duplicated name is not tallied once per occurrence.
  for (const target of inbound.keys()) {
    const literal = escapeRegExp(target);
    const ref = new RegExp(`/${literal}\\b|\\[\\[${literal}\\]\\]`);
    let count = 0;
    for (const s of skills) {
      if (s.name === target) continue;
      if (ref.test(bodies.get(s.path) ?? "")) count += 1;
    }
    inbound.set(target, count);
  }
  const records = skills.map((s) => {
    const body = bodies.get(s.path) ?? "";
    const dir = s.path.replace(/\/SKILL\.md$/, "");
    return {
      name: s.name,
      kind: s.kind,
      path: s.path,
      lines: body ? body.split("\n").length : 0,
      directInvocations: direct.get(s.name) ?? 0,
      slashInvocations: slash.get(s.name) ?? 0,
      inboundRefs: inbound.get(s.name) ?? 0,
      artifactCount: countArtifacts(
        config.artifactRoots ?? [],
        s.name,
        config.artifactAliases?.[s.name] ?? []
      ),
      tanglePrivateRefs: (body.match(TANGLE_PRIVATE_RE) ?? []).length,
      hasReferencesDir: existsSync3(join(dir, "references")),
      hasEvalsDir: existsSync3(join(dir, "evals")),
      logsRuns: body.includes("skill-runs.jsonl"),
      // Prefer the front-matter description; fall back to the first 600 characters of the body.
      hasTriggerPhrases: TRIGGER_RE.test(frontmatterDescription(body) || body.slice(0, 600))
    };
  });
  return { generatedFromTraces: transcripts, records };
}
|
|
1899
|
+
/** Identifier stamped on every finding produced by the skill-usage analyst. */
var ANALYST_ID = "skill-usage";
/**
 * Assembles one finding object for the skill-usage analyst.
 *
 * The finding id is computed from the analyst id, area, subject and claim. The evidence
 * is a single artifact reference pointing at `evidenceUri`.
 *
 * @param {string} area - Finding area/category.
 * @param {string} subject - What the finding is about (callers in this file pass the skill name).
 * @param {string} claim - Human-readable statement of the finding.
 * @param {string} severity - Severity label.
 * @param {number} confidence - Confidence value attached to the finding.
 * @param {string} producedAt - Timestamp recorded as `produced_at`.
 * @param {string} recommended - Recommended action text.
 * @param {string} evidenceUri - URI used for the single artifact evidence reference.
 * @param {string} [rationale] - Optional explanation; left undefined when omitted.
 * @returns {object} The finding record.
 */
function finding(area, subject, claim, severity, confidence, producedAt, recommended, evidenceUri, rationale) {
  const findingId = computeFindingId({ analyst_id: ANALYST_ID, area, subject, claim });
  const evidenceRefs = [{ kind: "artifact", uri: evidenceUri }];
  return {
    schema_version: "1.0.0",
    finding_id: findingId,
    analyst_id: ANALYST_ID,
    produced_at: producedAt,
    severity,
    area,
    claim,
    rationale,
    evidence_refs: evidenceRefs,
    recommended_action: recommended,
    confidence,
    subject
  };
}
|
|
1916
|
+
/**
 * Turns a skill-usage report into a flat list of findings.
 *
 * Each record is checked, in order, for: zero usage on every signal (or usage that is
 * visible only through inbound references/artifacts), weak discoverability, Tangle-private
 * references in a public skill, an oversized SKILL.md with no references/ directory,
 * a missing evals/ directory, and no mention of skill-runs.jsonl.
 *
 * @param {{ records: object[] }} report - Report whose `records` are inspected.
 * @param {string} producedAt - Timestamp stamped on every emitted finding.
 * @returns {object[]} Findings in record order, then in check order within a record.
 */
function emitSkillUsageFindings(report, producedAt) {
  const findings = [];
  for (const record of report.records) {
    // Every finding for this record shares subject (skill name) and evidence (SKILL.md path).
    const emit = (area, claim, severity, confidence, recommended, rationale) => {
      findings.push(
        finding(area, record.name, claim, severity, confidence, producedAt, recommended, record.path, rationale)
      );
    };
    const directTotal = record.directInvocations + record.slashInvocations;
    const trueUsage = directTotal + record.inboundRefs + record.artifactCount;

    if (trueUsage === 0) {
      emit(
        "skill-usage",
        `Skill '${record.name}' has zero usage across all signals (direct, slash, inbound-refs, artifacts)`,
        "high",
        0.6,
        "Confirm the skill covers a real recurring job; if not, deprecate. Zero true usage is the only deterministic deprecation candidate.",
        "No Skill-tool call, no slash invocation, no sibling dispatches to it, and no on-disk artifacts."
      );
    } else if (directTotal === 0 && record.inboundRefs + record.artifactCount > 0) {
      emit(
        "skill-usage",
        `Skill '${record.name}' shows 0 direct invocations but is used via orchestration/artifacts (inbound=${record.inboundRefs}, artifacts=${record.artifactCount})`,
        "info",
        0.8,
        "Do NOT treat as unused \u2014 usage is real but logged under parent skills or on disk. Strengthen direct-invocation discovery only if direct use is desired.",
        "The Skill-tool counter undercounts orchestrated/chained leaf skills."
      );
    }

    const rarelyInvoked = directTotal <= 2;
    if (rarelyInvoked && !record.hasTriggerPhrases) {
      emit(
        "discoverability",
        `Skill '${record.name}' is rarely invoked directly and its description has no explicit trigger phrases`,
        "medium",
        0.7,
        "Add a `Triggers:` clause with verbatim user phrases to the frontmatter description so the model auto-invokes it."
      );
    }

    if (record.kind === "public" && record.tanglePrivateRefs > 0) {
      emit(
        "safety",
        `Public skill '${record.name}' carries ${record.tanglePrivateRefs} Tangle-private reference(s)`,
        "high",
        0.75,
        "Sanitize incidental internal refs (cli-bridge/kimi/tcloud/~company/private repos) or relocate to a private repo. Verify @tangle-network/* refs are to PUBLISHED packages before treating as a leak."
      );
    }

    if (record.lines > BLOAT_LINE_THRESHOLD && !record.hasReferencesDir) {
      emit(
        "maintainability",
        `Skill '${record.name}' is ${record.lines} lines with no references/ split (progressive disclosure)`,
        "medium",
        0.8,
        `Split detail into references/ loaded on demand; keep SKILL.md a short overview. ${record.lines} lines load into every session's context budget.`
      );
    }

    if (!record.hasEvalsDir) {
      emit(
        "data-quality",
        `Skill '${record.name}' ships no evals/`,
        "low",
        0.6,
        "Add evals/evals.json with >=3 scenarios proving the skill beats baseline; gives regression coverage."
      );
    }

    if (!record.logsRuns) {
      emit(
        "observability",
        `Skill '${record.name}' never appends to .evolve/skill-runs.jsonl`,
        "low",
        0.55,
        "Append one run line to .evolve/skill-runs.jsonl on completion, or declare it a non-logging leaf, so the self-improvement loop can see it ran."
      );
    }
  }
  return findings;
}
|
|
2023
|
+
/**
 * Analyst that converts a pre-built skill-usage report into findings.
 * It performs no model calls: `cost` declares it deterministic with a zero estimated cost.
 */
var SkillUsageAnalyst = class {
  id = ANALYST_ID;
  description = "Deterministic multi-signal skill-usage analysis: flags dead skills, measurement-invisible (orchestrated) usage, discovery gaps, public-repo leaks, bloat, missing evals, and missing run-logging.";
  inputKind = "custom";
  cost = { kind: "deterministic", est_usd_per_run: 0 };
  version = "1.0.0";
  /**
   * Emits findings for `input`.
   *
   * @param {{ records: object[], generatedFromTraces: number }} input - Skill-usage report.
   * @param {object} ctx - Analysis context; `ctx.tags.producedAt` overrides the timestamp
   *   (defaulting to the current time) and `ctx.log`, when present, receives a summary line.
   * @returns {Promise<object[]>} Findings produced by `emitSkillUsageFindings`.
   */
  async analyze(input, ctx) {
    const taggedAt = ctx.tags?.producedAt;
    const producedAt = taggedAt ?? (/* @__PURE__ */ new Date()).toISOString();
    ctx.log?.(`skill-usage: ${input.records.length} skills over ${input.generatedFromTraces} transcripts`);
    return emitSkillUsageFindings(input, producedAt);
  }
};
/** Shared ready-to-use instance of the skill-usage analyst. */
var SKILL_USAGE_ANALYST = new SkillUsageAnalyst();
|
|
2038
|
+
|
|
1757
2039
|
// src/analyst/registry.ts
|
|
1758
2040
|
import { randomUUID } from "crypto";
|
|
1759
2041
|
var AnalystRegistry = class {
|
|
@@ -2188,12 +2470,12 @@ function ghCliClient(opts = {}) {
|
|
|
2188
2470
|
await exec("git", ["branch", "-D", input.branchName], { cwd });
|
|
2189
2471
|
await run("git", ["checkout", "-b", input.branchName]);
|
|
2190
2472
|
const { mkdir, writeFile } = await import("fs/promises");
|
|
2191
|
-
const { dirname: dirname4, join:
|
|
2473
|
+
const { dirname: dirname4, join: join5, resolve } = await import("path");
|
|
2192
2474
|
for (const change of input.fileChanges) {
|
|
2193
2475
|
const abs = resolve(cwd, change.path);
|
|
2194
2476
|
await mkdir(dirname4(abs), { recursive: true });
|
|
2195
2477
|
await writeFile(abs, change.contents, "utf8");
|
|
2196
|
-
await run("git", ["add",
|
|
2478
|
+
await run("git", ["add", join5(change.path)]);
|
|
2197
2479
|
}
|
|
2198
2480
|
const env = {};
|
|
2199
2481
|
if (input.authorName) env.GIT_AUTHOR_NAME = input.authorName;
|
|
@@ -3224,10 +3506,10 @@ var FileSystemFeedbackTrajectoryStore = class {
|
|
|
3224
3506
|
}
|
|
3225
3507
|
async append(record) {
|
|
3226
3508
|
const { appendFile, mkdir } = await import("fs/promises");
|
|
3227
|
-
const { join:
|
|
3509
|
+
const { join: join5 } = await import("path");
|
|
3228
3510
|
await mkdir(this.dir, { recursive: true });
|
|
3229
3511
|
await appendFile(
|
|
3230
|
-
|
|
3512
|
+
join5(this.dir, "feedback-trajectories.ndjson"),
|
|
3231
3513
|
`${JSON.stringify(record)}
|
|
3232
3514
|
`,
|
|
3233
3515
|
"utf8"
|
|
@@ -3236,8 +3518,8 @@ var FileSystemFeedbackTrajectoryStore = class {
|
|
|
3236
3518
|
async load() {
|
|
3237
3519
|
if (this.loaded) return;
|
|
3238
3520
|
const { readFile } = await import("fs/promises");
|
|
3239
|
-
const { join:
|
|
3240
|
-
const file =
|
|
3521
|
+
const { join: join5 } = await import("path");
|
|
3522
|
+
const file = join5(this.dir, "feedback-trajectories.ndjson");
|
|
3241
3523
|
try {
|
|
3242
3524
|
const raw = await readFile(file, "utf8");
|
|
3243
3525
|
for (const line of raw.split("\n")) {
|
|
@@ -5262,89 +5544,6 @@ var FileSystemExperimentStore = class {
|
|
|
5262
5544
|
}
|
|
5263
5545
|
};
|
|
5264
5546
|
|
|
5265
|
-
// src/pareto.ts
|
|
5266
|
-
/**
 * Pareto dominance test: `a` dominates `b` when it is no worse on every objective and
 * strictly better on at least one.
 *
 * @param {*} a - Candidate tested as the dominator.
 * @param {*} b - Candidate tested as the dominated one.
 * @param {{ direction: string, value: Function }[]} objectives - Objectives; any direction
 *   other than "maximize" is treated as minimise.
 * @returns {boolean} False as soon as either candidate has a non-finite value on an
 *   objective, or when no objective shows a strict improvement.
 */
function dominates(a, b, objectives) {
  let sawStrictGain = false;
  for (const objective of objectives) {
    const av = objective.value(a);
    const bv = objective.value(b);
    if (!(Number.isFinite(av) && Number.isFinite(bv))) return false;
    // Orient the pair so that "greater" always means "better for a".
    const [mine, theirs] = objective.direction === "maximize" ? [av, bv] : [bv, av];
    if (mine < theirs) return false;
    if (mine > theirs) sawStrictGain = true;
  }
  return sawStrictGain;
}
|
|
5279
|
-
/**
 * Splits candidates into the Pareto frontier and the dominated set.
 *
 * Candidates with a non-finite value on any objective are dropped before comparison and
 * appear in neither list.
 *
 * @param {Array} candidates - Items to compare.
 * @param {{ direction: string, value: Function }[]} objectives - At least one objective.
 * @returns {{ frontier: Array, dominated: Array, dominanceMap: Array }} `dominanceMap` has
 *   one entry per frontier member listing the dominated candidates it dominates.
 * @throws {Error} When `objectives` is empty.
 */
function paretoFrontier(candidates, objectives) {
  if (objectives.length === 0) {
    throw new Error("paretoFrontier: at least 1 objective required");
  }
  const isFullyFinite = (c) => objectives.every((o) => Number.isFinite(o.value(c)));
  const valid = candidates.filter(isFullyFinite);
  const frontier = [];
  const dominated = [];
  for (const candidate of valid) {
    const beaten = valid.some((rival) => rival !== candidate && dominates(rival, candidate, objectives));
    (beaten ? dominated : frontier).push(candidate);
  }
  const dominanceMap = frontier.map((dominator) => {
    const losers = dominated.filter((x) => dominates(dominator, x, objectives));
    return { dominator, dominated: losers };
  });
  return { frontier, dominated, dominanceMap };
}
|
|
5297
|
-
/**
 * Collapses several objectives into one weighted score per candidate.
 *
 * Each objective value is min-max normalised over the finite values seen across all
 * candidates (a constant objective uses a range of 1), flipped for non-"maximize"
 * objectives, and weighted by `weights[name] / totalWeight` (missing weights default to 1).
 * Non-finite values contribute nothing to a candidate's score.
 *
 * When the weights sum to zero every objective gets weight 0, so scores are 0 rather than
 * NaN. Ranges are computed with a plain loop rather than spreading into
 * `Math.min`/`Math.max`, which can exceed the engine's argument limit on large inputs.
 *
 * @param {Array} candidates - Items to score.
 * @param {{ name: string, direction: string, value: Function }[]} objectives
 * @param {{ weights?: Object<string, number> }} [options]
 * @returns {{ candidate: *, score: number }[]} One entry per candidate, in input order.
 */
function scalarScore(candidates, objectives, options = {}) {
  if (candidates.length === 0) return [];
  const weights = options.weights ?? {};
  const totalWeight = objectives.reduce((s, o) => s + (weights[o.name] ?? 1), 0);
  const ranges = objectives.map((obj) => {
    let min = Infinity;
    let max = -Infinity;
    for (const c of candidates) {
      const v = obj.value(c);
      if (!Number.isFinite(v)) continue;
      if (v < min) min = v;
      if (v > max) max = v;
    }
    // No finite value at all: fall back to the unit range.
    if (min === Infinity) return { min: 0, max: 1 };
    return { min, max: max === min ? min + 1 : max };
  });
  return candidates.map((c) => {
    let score = 0;
    objectives.forEach((obj, i) => {
      const v = obj.value(c);
      if (!Number.isFinite(v)) return;
      const { min, max } = ranges[i];
      const normalised = (v - min) / (max - min);
      const directional = obj.direction === "maximize" ? normalised : 1 - normalised;
      // Guard the division: a zero weight sum would otherwise yield NaN (0 / 0).
      const weight = totalWeight === 0 ? 0 : (weights[obj.name] ?? 1) / totalWeight;
      score += directional * weight;
    });
    return { candidate: c, score };
  });
}
|
|
5322
|
-
/**
 * Computes a per-candidate crowding distance across the given objectives.
 *
 * For each objective the candidates are sorted by value; the two extremes receive
 * `Infinity`, and every other candidate accumulates the gap between its neighbours
 * divided by the objective's value range (a zero range is treated as 1). A candidate that
 * is already at `Infinity` is left unchanged.
 *
 * @param {Array} candidates - Items to measure; an empty list yields `[]`.
 * @param {{ value: Function }[]} objectives - Objectives supplying the numeric values.
 * @returns {{ candidate: *, distance: number }[]} One entry per candidate, in input order.
 */
function crowdingDistance(candidates, objectives) {
  // Nothing to rank; also avoids evaluating objectives on `undefined` below.
  if (candidates.length === 0) return [];
  const distances = new Map(candidates.map((c) => [c, 0]));
  for (const obj of objectives) {
    const sorted = [...candidates].sort((a, b) => obj.value(a) - obj.value(b));
    const first = sorted[0];
    const last = sorted[sorted.length - 1];
    const range = obj.value(last) - obj.value(first) || 1;
    distances.set(first, Infinity);
    distances.set(last, Infinity);
    for (let i = 1; i < sorted.length - 1; i++) {
      const current = distances.get(sorted[i]);
      if (current === Infinity) continue;
      const prev = obj.value(sorted[i - 1]);
      const next = obj.value(sorted[i + 1]);
      distances.set(sorted[i], current + (next - prev) / range);
    }
  }
  return candidates.map((c) => ({ candidate: c, distance: distances.get(c) ?? 0 }));
}
|
|
5341
|
-
/**
 * Returns the Pareto frontier ordered by crowding distance, largest distance first.
 *
 * @param {Array} candidates - Items to compare.
 * @param {Array} objectives - Objectives passed to both the frontier and distance steps.
 * @returns {{ candidate: *, distance: number }[]} Frontier members with their crowding
 *   distance; empty when the frontier is empty.
 */
function paretoFrontierWithCrowding(candidates, objectives) {
  const frontierMembers = paretoFrontier(candidates, objectives).frontier;
  if (frontierMembers.length === 0) {
    return [];
  }
  const ranked = crowdingDistance(frontierMembers, objectives);
  ranked.sort((left, right) => right.distance - left.distance);
  return ranked;
}
|
|
5347
|
-
|
|
5348
5547
|
// src/harness-optimizer.ts
|
|
5349
5548
|
var DEFAULT_HARNESS_OBJECTIVES = [
|
|
5350
5549
|
{ name: "aggregate", direction: "maximize", value: (r) => r.aggregateMean },
|
|
@@ -6048,8 +6247,8 @@ function assertNonNegative(n, name) {
|
|
|
6048
6247
|
}
|
|
6049
6248
|
|
|
6050
6249
|
// src/muffled-gate-scanner.ts
|
|
6051
|
-
import { existsSync as
|
|
6052
|
-
import { join } from "path";
|
|
6250
|
+
import { existsSync as existsSync4, readdirSync as readdirSync2, readFileSync as readFileSync3, statSync as statSync2 } from "fs";
|
|
6251
|
+
import { join as join2 } from "path";
|
|
6053
6252
|
function codeOf(line) {
|
|
6054
6253
|
return line.replace(/\/\/.*$/, "").replace(/^\s*\*.*$/, "");
|
|
6055
6254
|
}
|
|
@@ -6161,14 +6360,14 @@ var UNIVERSAL_FINDERS = [findConstructorCwdDropped];
|
|
|
6161
6360
|
function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
|
|
6162
6361
|
const matches = [];
|
|
6163
6362
|
const walk = (rel) => {
|
|
6164
|
-
const abs =
|
|
6165
|
-
if (!
|
|
6166
|
-
for (const entry of
|
|
6167
|
-
const sub =
|
|
6168
|
-
const subAbs =
|
|
6363
|
+
const abs = join2(repoRoot, rel);
|
|
6364
|
+
if (!existsSync4(abs)) return;
|
|
6365
|
+
for (const entry of readdirSync2(abs)) {
|
|
6366
|
+
const sub = join2(rel, entry);
|
|
6367
|
+
const subAbs = join2(repoRoot, sub);
|
|
6169
6368
|
let st;
|
|
6170
6369
|
try {
|
|
6171
|
-
st =
|
|
6370
|
+
st = statSync2(subAbs);
|
|
6172
6371
|
} catch {
|
|
6173
6372
|
continue;
|
|
6174
6373
|
}
|
|
@@ -6181,7 +6380,7 @@ function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
|
|
|
6181
6380
|
continue;
|
|
6182
6381
|
let text;
|
|
6183
6382
|
try {
|
|
6184
|
-
text =
|
|
6383
|
+
text = readFileSync3(subAbs, "utf8");
|
|
6185
6384
|
} catch {
|
|
6186
6385
|
continue;
|
|
6187
6386
|
}
|
|
@@ -6196,9 +6395,9 @@ function scanForMuffledGates(opts) {
|
|
|
6196
6395
|
const findings = [];
|
|
6197
6396
|
const scanned = /* @__PURE__ */ new Set();
|
|
6198
6397
|
for (const file of opts.scanFiles) {
|
|
6199
|
-
const abs =
|
|
6200
|
-
if (!
|
|
6201
|
-
const text =
|
|
6398
|
+
const abs = join2(opts.repoRoot, file);
|
|
6399
|
+
if (!existsSync4(abs)) continue;
|
|
6400
|
+
const text = readFileSync3(abs, "utf8");
|
|
6202
6401
|
for (const find of opts.finders) findings.push(...find(file, text));
|
|
6203
6402
|
scanned.add(file);
|
|
6204
6403
|
}
|
|
@@ -6211,9 +6410,9 @@ function scanForMuffledGates(opts) {
|
|
|
6211
6410
|
);
|
|
6212
6411
|
for (const file of importers) {
|
|
6213
6412
|
if (scanned.has(file)) continue;
|
|
6214
|
-
const abs =
|
|
6215
|
-
if (!
|
|
6216
|
-
const text =
|
|
6413
|
+
const abs = join2(opts.repoRoot, file);
|
|
6414
|
+
if (!existsSync4(abs)) continue;
|
|
6415
|
+
const text = readFileSync3(abs, "utf8");
|
|
6217
6416
|
for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
|
|
6218
6417
|
}
|
|
6219
6418
|
}
|
|
@@ -6363,7 +6562,7 @@ function isObject(v) {
|
|
|
6363
6562
|
}
|
|
6364
6563
|
|
|
6365
6564
|
// src/scorecard.ts
|
|
6366
|
-
import { appendFileSync as appendFileSync2, existsSync as
|
|
6565
|
+
import { appendFileSync as appendFileSync2, existsSync as existsSync5, mkdirSync as mkdirSync2, readFileSync as readFileSync4 } from "fs";
|
|
6367
6566
|
import { dirname as dirname2 } from "path";
|
|
6368
6567
|
function median(xs) {
|
|
6369
6568
|
if (xs.length === 0) return 0;
|
|
@@ -6439,10 +6638,10 @@ function recordRunsToScorecard(logPath, runs, opts) {
|
|
|
6439
6638
|
return lines;
|
|
6440
6639
|
}
|
|
6441
6640
|
function loadScorecard(logPath) {
|
|
6442
|
-
if (!
|
|
6641
|
+
if (!existsSync5(logPath)) return { cells: [], profiles: {} };
|
|
6443
6642
|
const cells = /* @__PURE__ */ new Map();
|
|
6444
6643
|
const profiles = {};
|
|
6445
|
-
for (const raw of
|
|
6644
|
+
for (const raw of readFileSync4(logPath, "utf8").split("\n")) {
|
|
6446
6645
|
const line = raw.trim();
|
|
6447
6646
|
if (!line) continue;
|
|
6448
6647
|
let parsed;
|
|
@@ -7319,7 +7518,7 @@ async function commitBisect(options) {
|
|
|
7319
7518
|
}
|
|
7320
7519
|
async function promptBisect(options) {
|
|
7321
7520
|
const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
|
|
7322
|
-
const
|
|
7521
|
+
const join5 = (paragraphs) => paragraphs.join("\n\n");
|
|
7323
7522
|
const goodParas = split(options.good);
|
|
7324
7523
|
const badParas = split(options.bad);
|
|
7325
7524
|
if (goodParas.length !== badParas.length) {
|
|
@@ -7339,7 +7538,7 @@ async function promptBisect(options) {
|
|
|
7339
7538
|
const result = await bisect({
|
|
7340
7539
|
good: goodMask,
|
|
7341
7540
|
bad: badMask,
|
|
7342
|
-
runEval: (mask) => options.runEval(
|
|
7541
|
+
runEval: (mask) => options.runEval(join5(paragraphsFor(mask))),
|
|
7343
7542
|
maxIterations: options.maxIterations ?? n + 5,
|
|
7344
7543
|
halfway: (g, b) => {
|
|
7345
7544
|
for (let i = 0; i < g.length; i++) {
|
|
@@ -7370,12 +7569,12 @@ async function promptBisect(options) {
|
|
|
7370
7569
|
}
|
|
7371
7570
|
}
|
|
7372
7571
|
const materializedPath = result.path.map((s) => ({
|
|
7373
|
-
state:
|
|
7572
|
+
state: join5(paragraphsFor(s.state)),
|
|
7374
7573
|
score: s.score,
|
|
7375
7574
|
pass: s.pass
|
|
7376
7575
|
}));
|
|
7377
7576
|
return {
|
|
7378
|
-
culprit:
|
|
7577
|
+
culprit: join5(paragraphsFor(culprit)),
|
|
7379
7578
|
path: materializedPath,
|
|
7380
7579
|
converged: result.converged,
|
|
7381
7580
|
inputInconsistent: result.inputInconsistent,
|
|
@@ -7869,8 +8068,8 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
|
|
|
7869
8068
|
|
|
7870
8069
|
// src/command-runner.ts
|
|
7871
8070
|
import { spawnSync } from "child_process";
|
|
7872
|
-
import { existsSync as
|
|
7873
|
-
import { join as
|
|
8071
|
+
import { existsSync as existsSync6, readdirSync as readdirSync3, readFileSync as readFileSync5, statSync as statSync3 } from "fs";
|
|
8072
|
+
import { join as join3 } from "path";
|
|
7874
8073
|
var localCommandRunner = {
|
|
7875
8074
|
name: "local",
|
|
7876
8075
|
async run(input) {
|
|
@@ -7898,11 +8097,11 @@ var localCommandRunner = {
|
|
|
7898
8097
|
return r.status === 0 && (r.stdout ?? "").trim().length > 0;
|
|
7899
8098
|
},
|
|
7900
8099
|
async fileExists(path) {
|
|
7901
|
-
return
|
|
8100
|
+
return existsSync6(path);
|
|
7902
8101
|
},
|
|
7903
8102
|
async readFile(path) {
|
|
7904
8103
|
try {
|
|
7905
|
-
return
|
|
8104
|
+
return readFileSync5(path, "utf8");
|
|
7906
8105
|
} catch {
|
|
7907
8106
|
return null;
|
|
7908
8107
|
}
|
|
@@ -7910,14 +8109,14 @@ var localCommandRunner = {
|
|
|
7910
8109
|
async readDir(path) {
|
|
7911
8110
|
let entries;
|
|
7912
8111
|
try {
|
|
7913
|
-
entries =
|
|
8112
|
+
entries = readdirSync3(path);
|
|
7914
8113
|
} catch {
|
|
7915
8114
|
return [];
|
|
7916
8115
|
}
|
|
7917
8116
|
const out = [];
|
|
7918
8117
|
for (const name of entries) {
|
|
7919
8118
|
try {
|
|
7920
|
-
const st =
|
|
8119
|
+
const st = statSync3(join3(path, name));
|
|
7921
8120
|
out.push({
|
|
7922
8121
|
name,
|
|
7923
8122
|
isDirectory: st.isDirectory(),
|
|
@@ -8834,7 +9033,7 @@ function multiToolchainLayer(config) {
|
|
|
8834
9033
|
}
|
|
8835
9034
|
|
|
8836
9035
|
// src/reference-replay.ts
|
|
8837
|
-
import { appendFileSync as appendFileSync3, existsSync as
|
|
9036
|
+
import { appendFileSync as appendFileSync3, existsSync as existsSync7, mkdirSync as mkdirSync3, readFileSync as readFileSync6 } from "fs";
|
|
8838
9037
|
import { dirname as dirname3 } from "path";
|
|
8839
9038
|
var DEFAULT_MATCH_THRESHOLD = 0.55;
|
|
8840
9039
|
var ALL_SPLITS = ["train", "dev", "test", "holdout"];
|
|
@@ -8960,7 +9159,7 @@ function jsonlReferenceReplayStore(path) {
|
|
|
8960
9159
|
},
|
|
8961
9160
|
async list() {
|
|
8962
9161
|
return lock.runExclusive(() => {
|
|
8963
|
-
if (!
|
|
9162
|
+
if (!existsSync7(path)) return [];
|
|
8964
9163
|
return readJsonl(path);
|
|
8965
9164
|
});
|
|
8966
9165
|
}
|
|
@@ -9303,7 +9502,7 @@ function throwIfAborted(signal) {
|
|
|
9303
9502
|
throw new Error(signal.reason ? String(signal.reason) : "reference replay aborted");
|
|
9304
9503
|
}
|
|
9305
9504
|
function readJsonl(path) {
|
|
9306
|
-
const raw =
|
|
9505
|
+
const raw = readFileSync6(path, "utf8");
|
|
9307
9506
|
const out = [];
|
|
9308
9507
|
for (const line of raw.split("\n")) {
|
|
9309
9508
|
const trimmed = line.trim();
|
|
@@ -9460,7 +9659,7 @@ function createDefaultReviewer(options) {
|
|
|
9460
9659
|
|
|
9461
9660
|
// src/discover-personas.ts
|
|
9462
9661
|
import { promises as fs } from "fs";
|
|
9463
|
-
import { basename, extname, join as
|
|
9662
|
+
import { basename, extname, join as join4 } from "path";
|
|
9464
9663
|
var DEFAULT_PATTERN = /^\d{2}-.+\.(yaml|yml|json|md)$/;
|
|
9465
9664
|
async function discoverPersonas(dir, opts = {}) {
|
|
9466
9665
|
const pattern = opts.pattern ?? DEFAULT_PATTERN;
|
|
@@ -9478,7 +9677,7 @@ async function discoverPersonas(dir, opts = {}) {
|
|
|
9478
9677
|
}
|
|
9479
9678
|
const out = [];
|
|
9480
9679
|
for (const entry of entries) {
|
|
9481
|
-
const full =
|
|
9680
|
+
const full = join4(d, entry.name);
|
|
9482
9681
|
if (entry.isDir) {
|
|
9483
9682
|
if (opts.recursive) out.push(...await walk(full));
|
|
9484
9683
|
continue;
|
|
@@ -10322,9 +10521,11 @@ export {
|
|
|
10322
10521
|
RunIntegrityError,
|
|
10323
10522
|
RunRecordValidationError,
|
|
10324
10523
|
SEMANTIC_CONCEPT_JUDGE_VERSION,
|
|
10524
|
+
SKILL_USAGE_ANALYST,
|
|
10325
10525
|
SandboxHarness,
|
|
10326
10526
|
ScenarioRegistry,
|
|
10327
10527
|
SingleBackendError,
|
|
10528
|
+
SkillUsageAnalyst,
|
|
10328
10529
|
SpanNotFoundError,
|
|
10329
10530
|
SubprocessSandboxDriver,
|
|
10330
10531
|
TRACE_ANALYST_ACTOR_DESCRIPTION,
|
|
@@ -10375,6 +10576,7 @@ export {
|
|
|
10375
10576
|
buildReflectionPrompt,
|
|
10376
10577
|
buildReviewerPrompt,
|
|
10377
10578
|
buildSandboxAgentProfileCell,
|
|
10579
|
+
buildSkillUsageReport,
|
|
10378
10580
|
buildTraceAnalystTools,
|
|
10379
10581
|
buildTraceInsightContext,
|
|
10380
10582
|
buildTraceInsightPrompt,
|
|
@@ -10455,6 +10657,7 @@ export {
|
|
|
10455
10657
|
distillPlaybook,
|
|
10456
10658
|
domainEvidencePattern,
|
|
10457
10659
|
dominates,
|
|
10660
|
+
emitSkillUsageFindings,
|
|
10458
10661
|
estimateCost,
|
|
10459
10662
|
estimateTokens,
|
|
10460
10663
|
euAiActReport,
|