@tangle-network/agent-eval 0.61.0 → 0.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -8
- package/dist/adapters/http.d.ts +4 -1
- package/dist/adapters/langchain.d.ts +4 -1
- package/dist/adapters/otel.d.ts +4 -4
- package/dist/{agent-profile-9J9hxdm2.d.ts → agent-profile-DzcPHR1Z.d.ts} +1 -1
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/campaign/index.d.ts +8 -8
- package/dist/campaign/index.js +4 -3
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-OLULBECP.js → chunk-7TPYV2ER.js} +27 -1
- package/dist/chunk-7TPYV2ER.js.map +1 -0
- package/dist/{chunk-SUGME4OT.js → chunk-CV2BS2OV.js} +7 -5
- package/dist/chunk-CV2BS2OV.js.map +1 -0
- package/dist/chunk-E22YUOAL.js +111 -0
- package/dist/chunk-E22YUOAL.js.map +1 -0
- package/dist/{chunk-GMXHLSLL.js → chunk-SS2SOBBT.js} +1 -106
- package/dist/chunk-SS2SOBBT.js.map +1 -0
- package/dist/contract/index.d.ts +9 -9
- package/dist/contract/index.js +4 -3
- package/dist/contract/index.js.map +1 -1
- package/dist/{control-Bf8owbuG.d.ts → control-DxvZeV5X.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/hosted/index.d.ts +4 -4
- package/dist/{index-Bvk35ils.d.ts → index-DsnOpCO6.d.ts} +1 -1
- package/dist/{index-D9dwa00f.d.ts → index-DxfmYUjC.d.ts} +2 -2
- package/dist/index.d.ts +98 -14
- package/dist/index.js +324 -45
- package/dist/index.js.map +1 -1
- package/dist/meta-eval/index.d.ts +2 -2
- package/dist/multishot/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/{provenance-D0WeCXt1.d.ts → provenance-CYBV9Ox6.d.ts} +14 -3
- package/dist/{registry-qmbYT3Eo.d.ts → registry-DPly4_hZ.d.ts} +1 -1
- package/dist/{release-report-DszkgvJ3.d.ts → release-report-DGoeObZT.d.ts} +2 -2
- package/dist/reporting.d.ts +4 -4
- package/dist/{researcher-BaVsy0sW.d.ts → researcher-WJvIpX3L.d.ts} +2 -2
- package/dist/rl.d.ts +6 -6
- package/dist/{rubric-predictive-validity-DgBHWsh7.d.ts → rubric-predictive-validity-D_4BSXGV.d.ts} +1 -1
- package/dist/{run-campaign-HXPJAUZ3.js → run-campaign-5J3ED2UJ.js} +3 -2
- package/dist/{run-record-DgUVo5pw.d.ts → run-record-BgTFzO2r.d.ts} +1 -1
- package/dist/{summary-report-BQvXpvaR.d.ts → summary-report-ByiOUrHj.d.ts} +1 -1
- package/dist/{types-Beb6KPqZ.d.ts → types-DH22o8hM.d.ts} +15 -11
- package/package.json +1 -1
- package/dist/chunk-GMXHLSLL.js.map +0 -1
- package/dist/chunk-OLULBECP.js.map +0 -1
- package/dist/chunk-SUGME4OT.js.map +0 -1
- /package/dist/{run-campaign-HXPJAUZ3.js.map → run-campaign-5J3ED2UJ.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -9,12 +9,10 @@ import {
|
|
|
9
9
|
runBehavioralCanaries
|
|
10
10
|
} from "./chunk-SHTXZ4O2.js";
|
|
11
11
|
import {
|
|
12
|
-
BackendIntegrityError,
|
|
13
12
|
DEFAULT_MUTATION_PRIMITIVES,
|
|
14
13
|
DEFAULT_RED_TEAM_CORPUS,
|
|
15
14
|
Dataset,
|
|
16
15
|
HoldoutLockedError,
|
|
17
|
-
assertRealBackend,
|
|
18
16
|
buildReflectionPrompt,
|
|
19
17
|
hashScenarios,
|
|
20
18
|
parseReflectionResponse,
|
|
@@ -22,9 +20,13 @@ import {
|
|
|
22
20
|
redTeamReport,
|
|
23
21
|
runCanaries,
|
|
24
22
|
scoreRedTeamOutput,
|
|
25
|
-
summarizeBackendIntegrity,
|
|
26
23
|
toolNamesForRun
|
|
27
|
-
} from "./chunk-
|
|
24
|
+
} from "./chunk-SS2SOBBT.js";
|
|
25
|
+
import {
|
|
26
|
+
BackendIntegrityError,
|
|
27
|
+
assertRealBackend,
|
|
28
|
+
summarizeBackendIntegrity
|
|
29
|
+
} from "./chunk-E22YUOAL.js";
|
|
28
30
|
import {
|
|
29
31
|
BENCHMARK_SPLIT_SEED,
|
|
30
32
|
benchmarks_exports,
|
|
@@ -723,8 +725,8 @@ function createVerifierAdapter(opts) {
|
|
|
723
725
|
const report = await opts.verifier.run({ env, ...opts.options });
|
|
724
726
|
const out = [];
|
|
725
727
|
for (const layer of report.layers) {
|
|
726
|
-
for (const
|
|
727
|
-
out.push(liftLayerFinding(id, area, layer.layer,
|
|
728
|
+
for (const finding2 of layer.findings) {
|
|
729
|
+
out.push(liftLayerFinding(id, area, layer.layer, finding2));
|
|
728
730
|
}
|
|
729
731
|
if (layer.status === "fail" || layer.status === "error" || layer.status === "timeout") {
|
|
730
732
|
out.push(
|
|
@@ -1754,6 +1756,279 @@ var DEFAULT_TRACE_ANALYST_KINDS = [
|
|
|
1754
1756
|
IMPROVEMENT_KIND_SPEC
|
|
1755
1757
|
];
|
|
1756
1758
|
|
|
1759
|
+
// src/analyst/kinds/skill-usage.ts
|
|
1760
|
+
import { existsSync as existsSync3, readdirSync, readFileSync as readFileSync2, statSync } from "fs";
|
|
1761
|
+
import { join } from "path";
|
|
1762
|
+
// Line count above which a SKILL.md is reported as bloated (only when it has no references/ directory).
var BLOAT_LINE_THRESHOLD = 300;
// Matches references considered internal-only. Word-like tokens are wrapped in \b…\b.
// The `@tangle-network/` and `~/company` alternatives start with a non-word character, so they
// must NOT have a leading \b: a \b before `@` or `~` only matches when a word character precedes
// it, which would miss the common cases (after whitespace, quotes, or at start of line).
var TANGLE_PRIVATE_RE = /\b(cli-bridge|tangletools|ops-board|drew-gtr-pro|tangle\.tools|gtm-agent|kimi|tcloud)\b|@tangle-network\/|~\/company\b/gi;
// Matches an explicit "trigger:" / "triggers:" / "triggers -" marker (case-insensitive).
var TRIGGER_RE = /triggers?\s*[:\-]/i;
|
|
1765
|
+
/**
 * Lists the immediate children of `root` that look like skills.
 *
 * A child qualifies when it is a directory or a symbolic link and contains a
 * `SKILL.md` file.
 *
 * @param {string} root - Directory to inspect.
 * @returns {{ name: string, path: string }[]} One entry per skill: the child's
 *   name and the path to its SKILL.md. Empty when `root` is missing, is not a
 *   directory, or cannot be read.
 */
function listSkillDirs(root) {
  if (!existsSync3(root)) return [];
  let entries;
  try {
    entries = readdirSync(root, { withFileTypes: true });
  } catch {
    // `root` exists but is not a readable directory (e.g. a plain file or a
    // permission error). Report "no skills" instead of aborting the whole
    // report, matching how walkJsonl skips unreadable directories.
    return [];
  }
  const out = [];
  for (const entry of entries) {
    if (!entry.isDirectory() && !entry.isSymbolicLink()) continue;
    const skillMd = join(root, entry.name, "SKILL.md");
    if (existsSync3(skillMd)) out.push({ name: entry.name, path: skillMd });
  }
  return out;
}
|
|
1775
|
+
/**
 * Recursively collects paths of `*.jsonl` files under `dir`.
 *
 * Directory entries are visited in name order so that, when `cap` truncates
 * the walk, the same files are selected on every run regardless of the order
 * in which the filesystem happens to enumerate them.
 *
 * @param {string} dir - Directory to walk.
 * @param {number} cap - Maximum number of files to return; values <= 0 mean no limit.
 * @returns {string[]} Paths of the files found. Empty when `dir` does not exist.
 *   Directories that cannot be read are skipped.
 */
function walkJsonl(dir, cap) {
  if (!existsSync3(dir)) return [];
  const files = [];
  const stack = [dir];
  while (stack.length > 0) {
    const cur = stack.pop();
    let entries;
    try {
      entries = readdirSync(cur, { withFileTypes: true });
    } catch {
      // Unreadable (or vanished) directory: skip it and keep walking.
      continue;
    }
    // readdir order is not guaranteed; sort for a reproducible capped sample.
    entries.sort((a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0));
    for (const e of entries) {
      const full = join(cur, e.name);
      if (e.isDirectory()) {
        stack.push(full);
      } else if (e.name.endsWith(".jsonl")) {
        files.push(full);
        if (cap > 0 && files.length >= cap) return files;
      }
    }
  }
  return files;
}
|
|
1798
|
+
/**
 * Extracts the `description` value from a leading `---`-delimited frontmatter block.
 *
 * Handles both LF and CRLF line endings. When the value is a YAML block-scalar
 * indicator (`>` or `|`, optionally with chomping/indent modifiers), the
 * indented continuation lines are joined with single spaces and returned
 * instead of the bare indicator.
 *
 * @param {string} body - Full text of a SKILL.md file.
 * @returns {string} The description text, or "" when there is no frontmatter,
 *   no description key, or an empty block scalar.
 */
function frontmatterDescription(body) {
  const fm = /^---\r?\n([\s\S]*?)\r?\n---/.exec(body);
  const block = fm?.[1] ?? "";
  const m = /description:\s*(.+)/i.exec(block);
  if (!m) return "";
  const value = m[1];
  // Plain single-line value: return it unchanged.
  if (!/^[>|][+-]?\d?\s*$/.test(value)) return value;
  // Block scalar: the real text lives on the following indented lines.
  const following = block.slice(m.index + m[0].length).split(/\r?\n/).slice(1);
  const parts = [];
  for (const line of following) {
    if (line.trim() === "") continue;
    // The first non-indented line starts the next key and ends the scalar.
    if (!/^\s/.test(line)) break;
    parts.push(line.trim());
  }
  return parts.join(" ");
}
|
|
1804
|
+
/**
 * Counts on-disk artifacts attributed to a skill.
 *
 * For every root, the candidates are `<root>/.evolve/<name>` plus
 * `<root>/<alias>` for each alias. An existing directory contributes its number
 * of entries; any other existing path contributes 1. Each distinct candidate
 * path is counted at most once, so an alias that resolves to the same path as
 * the default `.evolve/<name>` location does not inflate the total.
 *
 * @param {string[]} roots - Directories to look under.
 * @param {string} name - Skill name (used for the `.evolve/<name>` candidate).
 * @param {string[]} aliases - Extra root-relative paths to count for this skill.
 * @returns {number} Total artifact count across all roots.
 */
function countArtifacts(roots, name, aliases) {
  const seen = new Set();
  let n = 0;
  for (const root of roots) {
    const candidates = [join(root, ".evolve", name), ...aliases.map((a) => join(root, a))];
    for (const candidate of candidates) {
      if (seen.has(candidate)) continue;
      seen.add(candidate);
      if (!existsSync3(candidate)) continue;
      try {
        n += statSync(candidate).isDirectory() ? readdirSync(candidate).length : 1;
      } catch {
        // Best effort: a path that disappears or is unreadable between the
        // existence check and the stat/readdir simply contributes nothing.
      }
    }
  }
  return n;
}
|
|
1819
|
+
/**
 * Builds a per-skill usage report from skill directories, transcript files and
 * on-disk artifacts.
 *
 * Signals gathered for each skill:
 *  - directInvocations: `"skill": "<name>"` occurrences in transcripts (only the
 *    segment after the last `:` of the captured value is used as the name);
 *  - slashInvocations: `<command-name>/<name></command-name>` occurrences;
 *  - inboundRefs: number of OTHER skills whose SKILL.md mentions `/<name>` or `[[<name>]]`;
 *  - artifactCount: see countArtifacts;
 *  - static facts read from the SKILL.md body and its directory.
 *
 * @param {object} config
 * @param {{ root: string, kind: string }[]} config.skillRoots - Skill root directories and the kind to tag their skills with.
 * @param {string[]} config.transcriptDirs - Directories searched recursively for `.jsonl` transcripts.
 * @param {number} [config.maxTranscriptsPerDir] - Per-directory cap on transcripts; omitted or 0 means unlimited.
 * @param {string[]} [config.artifactRoots] - Roots passed to countArtifacts.
 * @param {Record<string, string[]>} [config.artifactAliases] - Extra artifact paths per skill name.
 * @returns {{ generatedFromTraces: number, records: object[] }} The number of
 *   transcript files discovered and one record per skill.
 */
function buildSkillUsageReport(config) {
  const skills = config.skillRoots.flatMap(
    ({ root, kind }) => listSkillDirs(root).map((s) => ({ ...s, kind }))
  );
  const names = skills.map((s) => s.name);
  const direct = new Map(names.map((n) => [n, 0]));
  const slash = new Map(names.map((n) => [n, 0]));
  const skillRe = /"skill"\s*:\s*"([a-z0-9_:-]+)"/g;
  const cmdRe = /<command-name>\/?([a-z0-9_:-]+)<\/command-name>/g;
  let transcripts = 0;
  for (const dir of config.transcriptDirs) {
    for (const file of walkJsonl(dir, config.maxTranscriptsPerDir ?? 0)) {
      transcripts += 1;
      let data;
      try {
        data = readFileSync2(file, "utf8");
      } catch {
        // Unreadable transcript: it stays in the discovered count but yields no signals.
        continue;
      }
      for (const m of data.matchAll(skillRe)) {
        const g = m[1];
        if (!g) continue;
        // Namespaced ids such as "plugin:skill" are attributed to the last segment.
        const n = g.split(":").pop() ?? g;
        const prev = direct.get(n);
        if (prev !== void 0) direct.set(n, prev + 1);
      }
      for (const m of data.matchAll(cmdRe)) {
        const g = m[1];
        if (g === void 0) continue;
        const prev = slash.get(g);
        if (prev !== void 0) slash.set(g, prev + 1);
      }
    }
  }
  // Keyed by path (not name) so two roots that both ship a skill with the same
  // name do not overwrite each other's body.
  const bodies = new Map();
  for (const s of skills) {
    try {
      bodies.set(s.path, readFileSync2(s.path, "utf8"));
    } catch {
      bodies.set(s.path, "");
    }
  }
  const inbound = new Map(names.map((n) => [n, 0]));
  // Iterate unique names so a duplicated name is not counted twice.
  for (const target of inbound.keys()) {
    // Skill names come from directory names, so escape regex metacharacters
    // before interpolating (a name like "c++" would otherwise throw).
    const escaped = target.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // `(?![\w-])` instead of `\b`: "/foo" must not match inside "/foo-bar",
    // just as it already did not match inside "/foo_bar".
    const ref = new RegExp(`/${escaped}(?![\\w-])|\\[\\[${escaped}\\]\\]`);
    let count = 0;
    for (const s of skills) {
      if (s.name === target) continue;
      if (ref.test(bodies.get(s.path) ?? "")) count += 1;
    }
    inbound.set(target, count);
  }
  const records = skills.map((s) => {
    const body = bodies.get(s.path) ?? "";
    // Parent directory of SKILL.md; join(.., "..") works with either path separator.
    const dir = join(s.path, "..");
    return {
      name: s.name,
      kind: s.kind,
      path: s.path,
      lines: body ? body.split("\n").length : 0,
      directInvocations: direct.get(s.name) ?? 0,
      slashInvocations: slash.get(s.name) ?? 0,
      inboundRefs: inbound.get(s.name) ?? 0,
      artifactCount: countArtifacts(
        config.artifactRoots ?? [],
        s.name,
        config.artifactAliases?.[s.name] ?? []
      ),
      tanglePrivateRefs: (body.match(TANGLE_PRIVATE_RE) ?? []).length,
      hasReferencesDir: existsSync3(join(dir, "references")),
      hasEvalsDir: existsSync3(join(dir, "evals")),
      logsRuns: body.includes("skill-runs.jsonl"),
      // Prefer the frontmatter description; fall back to the first 600 characters of the body.
      hasTriggerPhrases: TRIGGER_RE.test(frontmatterDescription(body) || body.slice(0, 600))
    };
  });
  return { generatedFromTraces: transcripts, records };
}
|
|
1894
|
+
// Identifier stamped on every finding produced by the skill-usage analyst.
var ANALYST_ID = "skill-usage";
/**
 * Assembles one finding record for the skill-usage analyst.
 *
 * The finding id is derived via computeFindingId from the analyst id, area,
 * subject and claim. The single evidence reference is an "artifact" pointing
 * at `evidenceUri`.
 *
 * @param {string} area
 * @param {string} subject
 * @param {string} claim
 * @param {string} severity
 * @param {number} confidence
 * @param {string} producedAt
 * @param {string} recommended - Stored as `recommended_action`.
 * @param {string} evidenceUri
 * @param {string} [rationale] - Left undefined when the caller omits it.
 * @returns {object} The finding record.
 */
function finding(area, subject, claim, severity, confidence, producedAt, recommended, evidenceUri, rationale) {
  const findingId = computeFindingId({ analyst_id: ANALYST_ID, area, subject, claim });
  const evidence = { kind: "artifact", uri: evidenceUri };
  const record = {
    schema_version: "1.0.0",
    finding_id: findingId,
    analyst_id: ANALYST_ID,
    produced_at: producedAt,
    severity,
    area,
    claim,
    rationale,
    evidence_refs: [evidence],
    recommended_action: recommended,
    confidence,
    subject
  };
  return record;
}
|
|
1911
|
+
/**
 * Turns a skill-usage report into a flat list of findings.
 *
 * For every record, in this order:
 *  1. usage: "zero usage" (high) when no signal fired at all, otherwise an
 *     informational note when the skill is only used indirectly;
 *  2. discoverability: at most 2 direct+slash invocations and no trigger phrases;
 *  3. safety: a "public" skill containing Tangle-private references;
 *  4. maintainability: more than BLOAT_LINE_THRESHOLD lines and no references/ directory;
 *  5. data-quality: no evals/ directory;
 *  6. observability: the body never mentions skill-runs.jsonl.
 *
 * @param {{ records: object[] }} report - Report as produced by buildSkillUsageReport.
 * @param {string} producedAt - Timestamp stamped on every finding.
 * @returns {object[]} Findings, grouped by record in report order.
 */
function emitSkillUsageFindings(report, producedAt) {
  const out = [];
  // Forwards its arguments untouched so optional trailing ones stay omitted.
  const emit = (...args) => {
    out.push(finding(...args));
  };
  for (const r of report.records) {
    const directTotal = r.directInvocations + r.slashInvocations;
    const trueUsage = directTotal + r.inboundRefs + r.artifactCount;
    const usedOnlyIndirectly = directTotal === 0 && r.inboundRefs + r.artifactCount > 0;

    if (trueUsage === 0) {
      emit(
        "skill-usage",
        r.name,
        `Skill '${r.name}' has zero usage across all signals (direct, slash, inbound-refs, artifacts)`,
        "high",
        0.6,
        producedAt,
        "Confirm the skill covers a real recurring job; if not, deprecate. Zero true usage is the only deterministic deprecation candidate.",
        r.path,
        "No Skill-tool call, no slash invocation, no sibling dispatches to it, and no on-disk artifacts."
      );
    } else if (usedOnlyIndirectly) {
      emit(
        "skill-usage",
        r.name,
        `Skill '${r.name}' shows 0 direct invocations but is used via orchestration/artifacts (inbound=${r.inboundRefs}, artifacts=${r.artifactCount})`,
        "info",
        0.8,
        producedAt,
        "Do NOT treat as unused \u2014 usage is real but logged under parent skills or on disk. Strengthen direct-invocation discovery only if direct use is desired.",
        r.path,
        "The Skill-tool counter undercounts orchestrated/chained leaf skills."
      );
    }

    const rarelyInvoked = directTotal <= 2;
    if (rarelyInvoked && !r.hasTriggerPhrases) {
      emit(
        "discoverability",
        r.name,
        `Skill '${r.name}' is rarely invoked directly and its description has no explicit trigger phrases`,
        "medium",
        0.7,
        producedAt,
        "Add a `Triggers:` clause with verbatim user phrases to the frontmatter description so the model auto-invokes it.",
        r.path
      );
    }

    const leaksPrivateRefs = r.kind === "public" && r.tanglePrivateRefs > 0;
    if (leaksPrivateRefs) {
      emit(
        "safety",
        r.name,
        `Public skill '${r.name}' carries ${r.tanglePrivateRefs} Tangle-private reference(s)`,
        "high",
        0.75,
        producedAt,
        "Sanitize incidental internal refs (cli-bridge/kimi/tcloud/~company/private repos) or relocate to a private repo. Verify @tangle-network/* refs are to PUBLISHED packages before treating as a leak.",
        r.path
      );
    }

    const bloated = r.lines > BLOAT_LINE_THRESHOLD && !r.hasReferencesDir;
    if (bloated) {
      emit(
        "maintainability",
        r.name,
        `Skill '${r.name}' is ${r.lines} lines with no references/ split (progressive disclosure)`,
        "medium",
        0.8,
        producedAt,
        `Split detail into references/ loaded on demand; keep SKILL.md a short overview. ${r.lines} lines load into every session's context budget.`,
        r.path
      );
    }

    if (!r.hasEvalsDir) {
      emit(
        "data-quality",
        r.name,
        `Skill '${r.name}' ships no evals/`,
        "low",
        0.6,
        producedAt,
        "Add evals/evals.json with >=3 scenarios proving the skill beats baseline; gives regression coverage.",
        r.path
      );
    }

    if (!r.logsRuns) {
      emit(
        "observability",
        r.name,
        `Skill '${r.name}' never appends to .evolve/skill-runs.jsonl`,
        "low",
        0.55,
        producedAt,
        "Append one run line to .evolve/skill-runs.jsonl on completion, or declare it a non-logging leaf, so the self-improvement loop can see it ran.",
        r.path
      );
    }
  }
  return out;
}
|
|
2018
|
+
/**
 * Analyst wrapper around emitSkillUsageFindings. It does no I/O of its own:
 * the input is an already-built skill-usage report.
 */
var SkillUsageAnalyst = class {
  id = ANALYST_ID;
  description = "Deterministic multi-signal skill-usage analysis: flags dead skills, measurement-invisible (orchestrated) usage, discovery gaps, public-repo leaks, bloat, missing evals, and missing run-logging.";
  inputKind = "custom";
  cost = { kind: "deterministic", est_usd_per_run: 0 };
  version = "1.0.0";
  /**
   * Produces findings for a skill-usage report.
   *
   * @param {{ records: object[], generatedFromTraces: number }} input - Skill-usage report.
   * @param {{ tags?: { producedAt?: string }, log?: (msg: string) => void }} ctx -
   *   `tags.producedAt` overrides the timestamp (defaults to the current time in
   *   ISO form); `log`, when present, receives a one-line summary.
   * @returns {Promise<object[]>} The emitted findings.
   */
  async analyze(input, ctx) {
    const taggedTimestamp = ctx.tags?.producedAt;
    const producedAt = taggedTimestamp ?? new Date().toISOString();
    ctx.log?.(`skill-usage: ${input.records.length} skills over ${input.generatedFromTraces} transcripts`);
    const findings = emitSkillUsageFindings(input, producedAt);
    return findings;
  }
};
// Shared ready-to-use instance.
var SKILL_USAGE_ANALYST = new SkillUsageAnalyst();
|
|
2031
|
+
|
|
1757
2032
|
// src/analyst/registry.ts
|
|
1758
2033
|
import { randomUUID } from "crypto";
|
|
1759
2034
|
var AnalystRegistry = class {
|
|
@@ -2188,12 +2463,12 @@ function ghCliClient(opts = {}) {
|
|
|
2188
2463
|
await exec("git", ["branch", "-D", input.branchName], { cwd });
|
|
2189
2464
|
await run("git", ["checkout", "-b", input.branchName]);
|
|
2190
2465
|
const { mkdir, writeFile } = await import("fs/promises");
|
|
2191
|
-
const { dirname: dirname4, join:
|
|
2466
|
+
const { dirname: dirname4, join: join5, resolve } = await import("path");
|
|
2192
2467
|
for (const change of input.fileChanges) {
|
|
2193
2468
|
const abs = resolve(cwd, change.path);
|
|
2194
2469
|
await mkdir(dirname4(abs), { recursive: true });
|
|
2195
2470
|
await writeFile(abs, change.contents, "utf8");
|
|
2196
|
-
await run("git", ["add",
|
|
2471
|
+
await run("git", ["add", join5(change.path)]);
|
|
2197
2472
|
}
|
|
2198
2473
|
const env = {};
|
|
2199
2474
|
if (input.authorName) env.GIT_AUTHOR_NAME = input.authorName;
|
|
@@ -3224,10 +3499,10 @@ var FileSystemFeedbackTrajectoryStore = class {
|
|
|
3224
3499
|
}
|
|
3225
3500
|
async append(record) {
|
|
3226
3501
|
const { appendFile, mkdir } = await import("fs/promises");
|
|
3227
|
-
const { join:
|
|
3502
|
+
const { join: join5 } = await import("path");
|
|
3228
3503
|
await mkdir(this.dir, { recursive: true });
|
|
3229
3504
|
await appendFile(
|
|
3230
|
-
|
|
3505
|
+
join5(this.dir, "feedback-trajectories.ndjson"),
|
|
3231
3506
|
`${JSON.stringify(record)}
|
|
3232
3507
|
`,
|
|
3233
3508
|
"utf8"
|
|
@@ -3236,8 +3511,8 @@ var FileSystemFeedbackTrajectoryStore = class {
|
|
|
3236
3511
|
async load() {
|
|
3237
3512
|
if (this.loaded) return;
|
|
3238
3513
|
const { readFile } = await import("fs/promises");
|
|
3239
|
-
const { join:
|
|
3240
|
-
const file =
|
|
3514
|
+
const { join: join5 } = await import("path");
|
|
3515
|
+
const file = join5(this.dir, "feedback-trajectories.ndjson");
|
|
3241
3516
|
try {
|
|
3242
3517
|
const raw = await readFile(file, "utf8");
|
|
3243
3518
|
for (const line of raw.split("\n")) {
|
|
@@ -6048,8 +6323,8 @@ function assertNonNegative(n, name) {
|
|
|
6048
6323
|
}
|
|
6049
6324
|
|
|
6050
6325
|
// src/muffled-gate-scanner.ts
|
|
6051
|
-
import { existsSync as
|
|
6052
|
-
import { join } from "path";
|
|
6326
|
+
import { existsSync as existsSync4, readdirSync as readdirSync2, readFileSync as readFileSync3, statSync as statSync2 } from "fs";
|
|
6327
|
+
import { join as join2 } from "path";
|
|
6053
6328
|
function codeOf(line) {
|
|
6054
6329
|
return line.replace(/\/\/.*$/, "").replace(/^\s*\*.*$/, "");
|
|
6055
6330
|
}
|
|
@@ -6161,14 +6436,14 @@ var UNIVERSAL_FINDERS = [findConstructorCwdDropped];
|
|
|
6161
6436
|
function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
|
|
6162
6437
|
const matches = [];
|
|
6163
6438
|
const walk = (rel) => {
|
|
6164
|
-
const abs =
|
|
6165
|
-
if (!
|
|
6166
|
-
for (const entry of
|
|
6167
|
-
const sub =
|
|
6168
|
-
const subAbs =
|
|
6439
|
+
const abs = join2(repoRoot, rel);
|
|
6440
|
+
if (!existsSync4(abs)) return;
|
|
6441
|
+
for (const entry of readdirSync2(abs)) {
|
|
6442
|
+
const sub = join2(rel, entry);
|
|
6443
|
+
const subAbs = join2(repoRoot, sub);
|
|
6169
6444
|
let st;
|
|
6170
6445
|
try {
|
|
6171
|
-
st =
|
|
6446
|
+
st = statSync2(subAbs);
|
|
6172
6447
|
} catch {
|
|
6173
6448
|
continue;
|
|
6174
6449
|
}
|
|
@@ -6181,7 +6456,7 @@ function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
|
|
|
6181
6456
|
continue;
|
|
6182
6457
|
let text;
|
|
6183
6458
|
try {
|
|
6184
|
-
text =
|
|
6459
|
+
text = readFileSync3(subAbs, "utf8");
|
|
6185
6460
|
} catch {
|
|
6186
6461
|
continue;
|
|
6187
6462
|
}
|
|
@@ -6196,9 +6471,9 @@ function scanForMuffledGates(opts) {
|
|
|
6196
6471
|
const findings = [];
|
|
6197
6472
|
const scanned = /* @__PURE__ */ new Set();
|
|
6198
6473
|
for (const file of opts.scanFiles) {
|
|
6199
|
-
const abs =
|
|
6200
|
-
if (!
|
|
6201
|
-
const text =
|
|
6474
|
+
const abs = join2(opts.repoRoot, file);
|
|
6475
|
+
if (!existsSync4(abs)) continue;
|
|
6476
|
+
const text = readFileSync3(abs, "utf8");
|
|
6202
6477
|
for (const find of opts.finders) findings.push(...find(file, text));
|
|
6203
6478
|
scanned.add(file);
|
|
6204
6479
|
}
|
|
@@ -6211,9 +6486,9 @@ function scanForMuffledGates(opts) {
|
|
|
6211
6486
|
);
|
|
6212
6487
|
for (const file of importers) {
|
|
6213
6488
|
if (scanned.has(file)) continue;
|
|
6214
|
-
const abs =
|
|
6215
|
-
if (!
|
|
6216
|
-
const text =
|
|
6489
|
+
const abs = join2(opts.repoRoot, file);
|
|
6490
|
+
if (!existsSync4(abs)) continue;
|
|
6491
|
+
const text = readFileSync3(abs, "utf8");
|
|
6217
6492
|
for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
|
|
6218
6493
|
}
|
|
6219
6494
|
}
|
|
@@ -6363,7 +6638,7 @@ function isObject(v) {
|
|
|
6363
6638
|
}
|
|
6364
6639
|
|
|
6365
6640
|
// src/scorecard.ts
|
|
6366
|
-
import { appendFileSync as appendFileSync2, existsSync as
|
|
6641
|
+
import { appendFileSync as appendFileSync2, existsSync as existsSync5, mkdirSync as mkdirSync2, readFileSync as readFileSync4 } from "fs";
|
|
6367
6642
|
import { dirname as dirname2 } from "path";
|
|
6368
6643
|
function median(xs) {
|
|
6369
6644
|
if (xs.length === 0) return 0;
|
|
@@ -6439,10 +6714,10 @@ function recordRunsToScorecard(logPath, runs, opts) {
|
|
|
6439
6714
|
return lines;
|
|
6440
6715
|
}
|
|
6441
6716
|
function loadScorecard(logPath) {
|
|
6442
|
-
if (!
|
|
6717
|
+
if (!existsSync5(logPath)) return { cells: [], profiles: {} };
|
|
6443
6718
|
const cells = /* @__PURE__ */ new Map();
|
|
6444
6719
|
const profiles = {};
|
|
6445
|
-
for (const raw of
|
|
6720
|
+
for (const raw of readFileSync4(logPath, "utf8").split("\n")) {
|
|
6446
6721
|
const line = raw.trim();
|
|
6447
6722
|
if (!line) continue;
|
|
6448
6723
|
let parsed;
|
|
@@ -7319,7 +7594,7 @@ async function commitBisect(options) {
|
|
|
7319
7594
|
}
|
|
7320
7595
|
async function promptBisect(options) {
|
|
7321
7596
|
const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
|
|
7322
|
-
const
|
|
7597
|
+
const join5 = (paragraphs) => paragraphs.join("\n\n");
|
|
7323
7598
|
const goodParas = split(options.good);
|
|
7324
7599
|
const badParas = split(options.bad);
|
|
7325
7600
|
if (goodParas.length !== badParas.length) {
|
|
@@ -7339,7 +7614,7 @@ async function promptBisect(options) {
|
|
|
7339
7614
|
const result = await bisect({
|
|
7340
7615
|
good: goodMask,
|
|
7341
7616
|
bad: badMask,
|
|
7342
|
-
runEval: (mask) => options.runEval(
|
|
7617
|
+
runEval: (mask) => options.runEval(join5(paragraphsFor(mask))),
|
|
7343
7618
|
maxIterations: options.maxIterations ?? n + 5,
|
|
7344
7619
|
halfway: (g, b) => {
|
|
7345
7620
|
for (let i = 0; i < g.length; i++) {
|
|
@@ -7370,12 +7645,12 @@ async function promptBisect(options) {
|
|
|
7370
7645
|
}
|
|
7371
7646
|
}
|
|
7372
7647
|
const materializedPath = result.path.map((s) => ({
|
|
7373
|
-
state:
|
|
7648
|
+
state: join5(paragraphsFor(s.state)),
|
|
7374
7649
|
score: s.score,
|
|
7375
7650
|
pass: s.pass
|
|
7376
7651
|
}));
|
|
7377
7652
|
return {
|
|
7378
|
-
culprit:
|
|
7653
|
+
culprit: join5(paragraphsFor(culprit)),
|
|
7379
7654
|
path: materializedPath,
|
|
7380
7655
|
converged: result.converged,
|
|
7381
7656
|
inputInconsistent: result.inputInconsistent,
|
|
@@ -7869,8 +8144,8 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
|
|
|
7869
8144
|
|
|
7870
8145
|
// src/command-runner.ts
|
|
7871
8146
|
import { spawnSync } from "child_process";
|
|
7872
|
-
import { existsSync as
|
|
7873
|
-
import { join as
|
|
8147
|
+
import { existsSync as existsSync6, readdirSync as readdirSync3, readFileSync as readFileSync5, statSync as statSync3 } from "fs";
|
|
8148
|
+
import { join as join3 } from "path";
|
|
7874
8149
|
var localCommandRunner = {
|
|
7875
8150
|
name: "local",
|
|
7876
8151
|
async run(input) {
|
|
@@ -7898,11 +8173,11 @@ var localCommandRunner = {
|
|
|
7898
8173
|
return r.status === 0 && (r.stdout ?? "").trim().length > 0;
|
|
7899
8174
|
},
|
|
7900
8175
|
async fileExists(path) {
|
|
7901
|
-
return
|
|
8176
|
+
return existsSync6(path);
|
|
7902
8177
|
},
|
|
7903
8178
|
async readFile(path) {
|
|
7904
8179
|
try {
|
|
7905
|
-
return
|
|
8180
|
+
return readFileSync5(path, "utf8");
|
|
7906
8181
|
} catch {
|
|
7907
8182
|
return null;
|
|
7908
8183
|
}
|
|
@@ -7910,14 +8185,14 @@ var localCommandRunner = {
|
|
|
7910
8185
|
async readDir(path) {
|
|
7911
8186
|
let entries;
|
|
7912
8187
|
try {
|
|
7913
|
-
entries =
|
|
8188
|
+
entries = readdirSync3(path);
|
|
7914
8189
|
} catch {
|
|
7915
8190
|
return [];
|
|
7916
8191
|
}
|
|
7917
8192
|
const out = [];
|
|
7918
8193
|
for (const name of entries) {
|
|
7919
8194
|
try {
|
|
7920
|
-
const st =
|
|
8195
|
+
const st = statSync3(join3(path, name));
|
|
7921
8196
|
out.push({
|
|
7922
8197
|
name,
|
|
7923
8198
|
isDirectory: st.isDirectory(),
|
|
@@ -8834,7 +9109,7 @@ function multiToolchainLayer(config) {
|
|
|
8834
9109
|
}
|
|
8835
9110
|
|
|
8836
9111
|
// src/reference-replay.ts
|
|
8837
|
-
import { appendFileSync as appendFileSync3, existsSync as
|
|
9112
|
+
import { appendFileSync as appendFileSync3, existsSync as existsSync7, mkdirSync as mkdirSync3, readFileSync as readFileSync6 } from "fs";
|
|
8838
9113
|
import { dirname as dirname3 } from "path";
|
|
8839
9114
|
var DEFAULT_MATCH_THRESHOLD = 0.55;
|
|
8840
9115
|
var ALL_SPLITS = ["train", "dev", "test", "holdout"];
|
|
@@ -8960,7 +9235,7 @@ function jsonlReferenceReplayStore(path) {
|
|
|
8960
9235
|
},
|
|
8961
9236
|
async list() {
|
|
8962
9237
|
return lock.runExclusive(() => {
|
|
8963
|
-
if (!
|
|
9238
|
+
if (!existsSync7(path)) return [];
|
|
8964
9239
|
return readJsonl(path);
|
|
8965
9240
|
});
|
|
8966
9241
|
}
|
|
@@ -9303,7 +9578,7 @@ function throwIfAborted(signal) {
|
|
|
9303
9578
|
throw new Error(signal.reason ? String(signal.reason) : "reference replay aborted");
|
|
9304
9579
|
}
|
|
9305
9580
|
function readJsonl(path) {
|
|
9306
|
-
const raw =
|
|
9581
|
+
const raw = readFileSync6(path, "utf8");
|
|
9307
9582
|
const out = [];
|
|
9308
9583
|
for (const line of raw.split("\n")) {
|
|
9309
9584
|
const trimmed = line.trim();
|
|
@@ -9460,7 +9735,7 @@ function createDefaultReviewer(options) {
|
|
|
9460
9735
|
|
|
9461
9736
|
// src/discover-personas.ts
|
|
9462
9737
|
import { promises as fs } from "fs";
|
|
9463
|
-
import { basename, extname, join as
|
|
9738
|
+
import { basename, extname, join as join4 } from "path";
|
|
9464
9739
|
var DEFAULT_PATTERN = /^\d{2}-.+\.(yaml|yml|json|md)$/;
|
|
9465
9740
|
async function discoverPersonas(dir, opts = {}) {
|
|
9466
9741
|
const pattern = opts.pattern ?? DEFAULT_PATTERN;
|
|
@@ -9478,7 +9753,7 @@ async function discoverPersonas(dir, opts = {}) {
|
|
|
9478
9753
|
}
|
|
9479
9754
|
const out = [];
|
|
9480
9755
|
for (const entry of entries) {
|
|
9481
|
-
const full =
|
|
9756
|
+
const full = join4(d, entry.name);
|
|
9482
9757
|
if (entry.isDir) {
|
|
9483
9758
|
if (opts.recursive) out.push(...await walk(full));
|
|
9484
9759
|
continue;
|
|
@@ -10322,9 +10597,11 @@ export {
|
|
|
10322
10597
|
RunIntegrityError,
|
|
10323
10598
|
RunRecordValidationError,
|
|
10324
10599
|
SEMANTIC_CONCEPT_JUDGE_VERSION,
|
|
10600
|
+
SKILL_USAGE_ANALYST,
|
|
10325
10601
|
SandboxHarness,
|
|
10326
10602
|
ScenarioRegistry,
|
|
10327
10603
|
SingleBackendError,
|
|
10604
|
+
SkillUsageAnalyst,
|
|
10328
10605
|
SpanNotFoundError,
|
|
10329
10606
|
SubprocessSandboxDriver,
|
|
10330
10607
|
TRACE_ANALYST_ACTOR_DESCRIPTION,
|
|
@@ -10375,6 +10652,7 @@ export {
|
|
|
10375
10652
|
buildReflectionPrompt,
|
|
10376
10653
|
buildReviewerPrompt,
|
|
10377
10654
|
buildSandboxAgentProfileCell,
|
|
10655
|
+
buildSkillUsageReport,
|
|
10378
10656
|
buildTraceAnalystTools,
|
|
10379
10657
|
buildTraceInsightContext,
|
|
10380
10658
|
buildTraceInsightPrompt,
|
|
@@ -10455,6 +10733,7 @@ export {
|
|
|
10455
10733
|
distillPlaybook,
|
|
10456
10734
|
domainEvidencePattern,
|
|
10457
10735
|
dominates,
|
|
10736
|
+
emitSkillUsageFindings,
|
|
10458
10737
|
estimateCost,
|
|
10459
10738
|
estimateTokens,
|
|
10460
10739
|
euAiActReport,
|