@metaharness/weight-eft 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +147 -0
- package/dist/cli.d.ts +14 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +187 -0
- package/dist/cli.js.map +1 -0
- package/dist/eval.d.ts +50 -0
- package/dist/eval.d.ts.map +1 -0
- package/dist/eval.js +96 -0
- package/dist/eval.js.map +1 -0
- package/dist/export.d.ts +28 -0
- package/dist/export.d.ts.map +1 -0
- package/dist/export.js +249 -0
- package/dist/export.js.map +1 -0
- package/dist/genome.d.ts +38 -0
- package/dist/genome.d.ts.map +1 -0
- package/dist/genome.js +75 -0
- package/dist/genome.js.map +1 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +13 -0
- package/dist/index.js.map +1 -0
- package/dist/reward-hack.d.ts +17 -0
- package/dist/reward-hack.d.ts.map +1 -0
- package/dist/reward-hack.js +105 -0
- package/dist/reward-hack.js.map +1 -0
- package/dist/train.d.ts +112 -0
- package/dist/train.d.ts.map +1 -0
- package/dist/train.js +166 -0
- package/dist/train.js.map +1 -0
- package/dist/types.d.ts +144 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +20 -0
- package/dist/types.js.map +1 -0
- package/package.json +64 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 RuvNet (https://ruv.io)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# @metaharness/weight-eft
|
|
2
|
+
|
|
3
|
+
**Evolutionary fine-tuning** — the bridge from Darwin's gradient-FREE policy
|
|
4
|
+
evolution (*freeze the model, evolve the harness*) to **gradient / weight**
|
|
5
|
+
self-learning on the **open cheap tier**.
|
|
6
|
+
|
|
7
|
+
## Thesis (honest, bounded)
|
|
8
|
+
|
|
9
|
+
We attack the **cost-Pareto axis, not the frontier ceiling.**
|
|
10
|
+
|
|
11
|
+
The metaharness cascade runs a cheap open model first (GLM / Qwen / DeepSeek)
|
|
12
|
+
and **escalates to a frontier model** (Opus / GPT) only on the hard tail. Each
|
|
13
|
+
escalation costs ~$0.50. `weight-eft` **distills the harness's archival
|
|
14
|
+
success into the cheap tier via LoRA** so the cheap model resolves more issues
|
|
15
|
+
on its own → **the cascade escalates less often** → **$/resolved drops.**
|
|
16
|
+
|
|
17
|
+
A 7-14B local-GPU tune **will not crack the hard tail** — that's a frontier
|
|
18
|
+
reasoning ceiling (clean-eval ~37.3%, ADR-177 §53). The win is **fewer
|
|
19
|
+
escalations**, and the telemetry stays honest about that. The eval metric is
|
|
20
|
+
**escalation-rate-reduction + cost/resolved**, *not* hard-tail cracking.
|
|
21
|
+
|
|
22
|
+
## The data recipe (on/off-policy)
|
|
23
|
+
|
|
24
|
+
| Set | Contents | Why |
|
|
25
|
+
|-----|----------|-----|
|
|
26
|
+
| **SFT** | **ALL** gold-resolved trajectories — cheap-OWN *and* frontier-escalation | SFT (max-likelihood) is off-policy-stable, so a frontier success on an issue the cheap model couldn't solve is **off-policy-safe DISTILLATION**. |
|
|
27
|
+
| **DPO** | **ON-POLICY cheap-vs-cheap pairs ONLY** — `chosen` = a resolved sample, `rejected` = an empty/failed sample by the **same cheap model on the same instance** (BoN-derived) | A frontier-chosen-vs-cheap-rejected pair is **off-policy and unstable** (the reference policy never produced the chosen completion). That signal goes to SFT instead. |
|
|
28
|
+
|
|
29
|
+
### Output formats (canonical / portable)
|
|
30
|
+
|
|
31
|
+
Exported files use **standard** schemas (portable to TRL / axolotl / unsloth /
|
|
32
|
+
ruvllm-MicroLoRA), never a custom format. A thin runner-adapter at the training
|
|
33
|
+
boundary maps standard → whatever the runner ingests.
|
|
34
|
+
|
|
35
|
+
- **SFT** — OpenAI chat JSONL:
|
|
36
|
+
`{"messages":[{role:system},{role:user},{role:assistant,tool_calls:[…]},{role:tool,…},…,{role:assistant}]}`.
|
|
37
|
+
**`tool_calls` are preserved** — the ReAct loop is **not** flattened to plain
|
|
38
|
+
text; the model learns real tool-use trajectories.
|
|
39
|
+
- **DPO** — TRL/HF conversational preference:
|
|
40
|
+
`{"prompt":[system+issue], "chosen":[resolved trajectory], "rejected":[failed trajectory]}`.
|
|
41
|
+
ReAct diverges from the first action, so `prompt` is the shared system+issue
|
|
42
|
+
and chosen/rejected are full trajectories from there.
|
|
43
|
+
|
|
44
|
+
## The guards
|
|
45
|
+
|
|
46
|
+
1. **Contamination guard (the headline correctness property).** Strict
|
|
47
|
+
**train/eval instance-ID disjointness.** The exporter excludes any
|
|
48
|
+
trajectory whose `instance_id` is in the caller's `evalHoldout`, and
|
|
49
|
+
`assertTrainEvalDisjoint` **throws** on any overlap. *Training on eval
|
|
50
|
+
instances is fake lift — the exact contamination we debunk elsewhere.*
|
|
51
|
+
2. **Reward-hacking filter** (Ornith-1.0 borrow). A **deterministic monitor**
|
|
52
|
+
drops any "success" that read a withheld gold/test path, modified the
|
|
53
|
+
verification harness, or escaped the sandbox. An archived reward-hack would
|
|
54
|
+
teach the model to reward-hack — this is the **training-data analog of the
|
|
55
|
+
conformance firewall**, separate from and *in addition to* the disjointness
|
|
56
|
+
guard.
|
|
57
|
+
3. **Long-context filter.** SWE/ReAct trajectories can blow past a 7-14B
|
|
58
|
+
context window (~32k). Over-budget trajectories are **dropped (or truncated
|
|
59
|
+
with `--truncate`) and REPORTED** — never silently lost.
|
|
60
|
+
|
|
61
|
+
Every drop is surfaced in the export report (`droppedRewardHacked`,
|
|
62
|
+
`excludedByHoldout`, `droppedOverLength`, `truncatedOverLength`).
|
|
63
|
+
|
|
64
|
+
## The `weightAdapter` genome gene (prune-the-overfitter safety net)
|
|
65
|
+
|
|
66
|
+
A LoRA tune can overfit. Rather than trust it blindly, the adapter is a **gene**
|
|
67
|
+
in the Darwin genome (`packages/darwin-mode/bench/swebench/evolve-config.mjs`):
|
|
68
|
+
|
|
69
|
+
- `weightAdapter: null` = **BASE** (no adapter) — the default and the control.
|
|
70
|
+
A genome that never opts in is **byte-identical (by key) to a pre-gene
|
|
71
|
+
genome.**
|
|
72
|
+
- `weightAdapter: 'sft'` = SFT-distilled adapter.
|
|
73
|
+
- `weightAdapter: 'sft-dpo'` = SFT then on-policy DPO.
|
|
74
|
+
|
|
75
|
+
Base competes against the tuned variants under the **same conformant fitness**,
|
|
76
|
+
so **evolution prunes an adapter that doesn't actually lift held-out resolve.**
|
|
77
|
+
The gene is inert until an adapter is trained (a GPU job) — it only *names* an
|
|
78
|
+
adapter; it does not create one.
|
|
79
|
+
|
|
80
|
+
## The training runner (GPU-gated)
|
|
81
|
+
|
|
82
|
+
`weight-eft train` is **$0 by default** — it emits a training **plan** (config +
|
|
83
|
+
the exact `ruvllm microlora …` command). A **real** run requires **BOTH** an
|
|
84
|
+
explicit `--train` flag **AND** a detected GPU / endpoint; otherwise it dry-runs
|
|
85
|
+
or refuses. Target is **7-14B** (Qwen2.5-Coder-7B / GLM-4-9B class) — *not* 32B
|
|
86
|
+
(§59: 32B q4 spills a 16GB GPU). Stages: SFT first, then optional on-policy DPO
|
|
87
|
+
from the SFT checkpoint.
|
|
88
|
+
|
|
89
|
+
## CLI
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
# Status / recipe summary
|
|
93
|
+
weight-eft status
|
|
94
|
+
metaharness weight-eft status # via the umbrella CLI
|
|
95
|
+
|
|
96
|
+
# Export training sets ($0). evalHoldout enforces the contamination guard.
|
|
97
|
+
weight-eft export --archive archive.json --eval-holdout holdout.json --out-dir ./out
|
|
98
|
+
|
|
99
|
+
# Emit the training plan ($0 dry-run). Add --train on a GPU host to run.
|
|
100
|
+
weight-eft train --base Qwen/Qwen2.5-Coder-7B-Instruct --params-b 7 \
|
|
101
|
+
--sft ./out/sft.jsonl --dpo ./out/dpo.jsonl --adapter glm5.2
|
|
102
|
+
|
|
103
|
+
# Measure the cost-Pareto delta (base vs adapter cascade runs).
|
|
104
|
+
weight-eft eval --base-outcomes base.json --adapter-outcomes adapter.json
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### The exact (later, GPU) command to train + eval
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
# 1) Export ($0) — disjoint train/eval, reward-hack-filtered, long-context-filtered
|
|
111
|
+
weight-eft export --archive darwin-archive.json --eval-holdout clean-eval-ids.json --out-dir ./eft
|
|
112
|
+
|
|
113
|
+
# 2) Train (GPU host) — SFT then on-policy DPO. ruvllm/MicroLoRA executes plan.command.
|
|
114
|
+
weight-eft train --base Qwen/Qwen2.5-Coder-7B-Instruct --params-b 7 \
|
|
115
|
+
--sft ./eft/sft.jsonl --dpo ./eft/dpo.jsonl --adapter glm5.2 --train
|
|
116
|
+
# (refuses unless WEIGHT_EFT_BASE_URL / CUDA_VISIBLE_DEVICES is set)
|
|
117
|
+
|
|
118
|
+
# 3) Run the conformant cascade twice (base vs glm5.2-sft-dpo adapter) on the
|
|
119
|
+
# HELD-OUT clean set via the existing darwin eval path, collect per-instance
|
|
120
|
+
# CascadeOutcome[] for each, then:
|
|
121
|
+
weight-eft eval --base-outcomes base-outcomes.json --adapter-outcomes adapter-outcomes.json
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Input contract
|
|
125
|
+
|
|
126
|
+
The exporter codes against `DarwinTrajectory[]` (see `src/types.ts`) —
|
|
127
|
+
reconstructable from Firestore `darwin_runs` + the local prediction/trajectory
|
|
128
|
+
artifacts (`predictions-*.jsonl` rows carry `instance_id` + `model_patch`; the
|
|
129
|
+
agentic loop carries the `messages` array with `tool_calls`, see
|
|
130
|
+
`darwin-mode/bench/swebench/solve-agentic.mjs`). A tiny mock fixture archive
|
|
131
|
+
lives in `__tests__/fixtures/`.
|
|
132
|
+
|
|
133
|
+
## Status (honest)
|
|
134
|
+
|
|
135
|
+
- **Runnable, $0:** exporter (with all three guards), training-plan emission,
|
|
136
|
+
cost-Pareto eval folding, the `weightAdapter` gene (wired into darwin's
|
|
137
|
+
evolve-config genome + the umbrella `metaharness weight-eft` CLI).
|
|
138
|
+
- **Scaffolded, GPU-gated:** the actual LoRA training (`spawn(plan.command)` on
|
|
139
|
+
a GPU host implementing the ruvllm/MicroLoRA seam). No training run, no GPU
|
|
140
|
+
job, no paid model call has been executed.
|
|
141
|
+
|
|
142
|
+
See **ADR-198** for the full rationale, the SFT-distill / on-policy-DPO recipe,
|
|
143
|
+
the disjointness invariant, and the self-scaffolding RL roadmap (Ornith-1.0).
|
|
144
|
+
|
|
145
|
+
## License
|
|
146
|
+
|
|
147
|
+
MIT
|
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/** Outcome of one CLI subcommand, consumed by the bin entry in `cli.js`. */
export interface CliResult {
|
|
3
|
+
/** Process exit code; the handlers in `cli.js` return 0 on success and 2 for usage errors or an unknown subcommand. */
code: number;
|
|
4
|
+
/** Output lines; the bin entry prints each one with `console.log` before exiting. */
lines: string[];
|
|
5
|
+
}
|
|
6
|
+
/** `weight-eft export --archive a.json --eval-holdout h.json --out-dir ./out` */
|
|
7
|
+
export declare function exportCmd(args: string[]): Promise<CliResult>;
|
|
8
|
+
/** `weight-eft train --base <id> --params-b N --sft sft.jsonl [--dpo dpo.jsonl] --adapter <prefix> [--train]` */
|
|
9
|
+
export declare function trainCmd(args: string[]): Promise<CliResult>;
|
|
10
|
+
/** `weight-eft eval --base-outcomes b.json --adapter-outcomes a.json` */
|
|
11
|
+
export declare function evalCmd(args: string[]): Promise<CliResult>;
|
|
12
|
+
/** Static thesis / data-recipe / guard summary; also what `dispatch` returns for no subcommand, `--help` and `-h`. */
export declare function statusCmd(): CliResult;
|
|
13
|
+
/** Routes `export` / `train` / `eval` / `status` to its handler; any other subcommand name resolves to exit code 2. */
export declare function dispatch(sub: string | undefined, args: string[]): Promise<CliResult>;
|
|
14
|
+
//# sourceMappingURL=cli.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AA2BA,MAAM,WAAW,SAAS;IACxB,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,EAAE,CAAC;CACjB;AAUD,iFAAiF;AACjF,wBAAsB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,SAAS,CAAC,CAwClE;AAED,iHAAiH;AACjH,wBAAsB,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,SAAS,CAAC,CAwCjE;AAED,yEAAyE;AACzE,wBAAsB,OAAO,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,SAAS,CAAC,CAoBhE;AAED,wBAAgB,SAAS,IAAI,SAAS,CA2BrC;AAED,wBAAsB,QAAQ,CAAC,GAAG,EAAE,MAAM,GAAG,SAAS,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,SAAS,CAAC,CAgB1F"}
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// SPDX-License-Identifier: MIT
|
|
3
|
+
//
|
|
4
|
+
// weight-eft CLI — export | train | eval | status.
|
|
5
|
+
//
|
|
6
|
+
// Also the delegation target for `metaharness weight-eft <...>` (the umbrella
|
|
7
|
+
// package forwards here). Everything is $0 by default: `export` reads an
|
|
8
|
+
// archive JSON and writes SFT/DPO JSONL; `train` is a dry-run plan unless
|
|
9
|
+
// --train AND a GPU/endpoint are present; `eval` folds outcome JSON into the
|
|
10
|
+
// cost-Pareto delta; `status` prints the recipe + guard summary.
|
|
11
|
+
import { realpathSync } from 'node:fs';
import { mkdir, readFile, writeFile } from 'node:fs/promises';
import { resolve } from 'node:path';
import { pathToFileURL } from 'node:url';
import { exportTrainingData, sftToJsonl, dpoToJsonl } from './export.js';
import { sftConfig, buildPlan, runTraining, twoStagePlan, defaultDetectGpu, } from './train.js';
import { costParetoDelta } from './eval.js';
|
|
16
|
+
/**
 * Look up the value that follows an option token in an argv-style array.
 *
 * @param {string[]} args CLI tokens (everything after the subcommand).
 * @param {string} name Option token to search for, e.g. `'--archive'`.
 * @returns {string | undefined} The token right after the first occurrence of
 *   `name`, or `undefined` when the option is absent, is the last token, or is
 *   immediately followed by another `--option` (its value was omitted).
 */
function flag(args, name) {
    const at = args.indexOf(name);
    if (at < 0) {
        return undefined;
    }
    const value = args[at + 1];
    // A following `--something` is the next option, not this option's value.
    // Without this check `--archive --truncate` would yield the path "--truncate".
    // Single-dash tokens (e.g. `-1`) are still accepted as values.
    if (typeof value !== 'string' || value.startsWith('--')) {
        return undefined;
    }
    return value;
}
|
|
20
|
+
/**
 * Report whether a bare switch is present among the CLI tokens.
 *
 * @param {string[]} args CLI tokens (everything after the subcommand).
 * @param {string} name Switch to look for, e.g. `'--train'`.
 * @returns {boolean} `true` when some token equals `name` exactly.
 */
function has(args, name) {
    return args.some((token) => token === name);
}
|
|
23
|
+
/**
 * `weight-eft export --archive a.json --eval-holdout h.json --out-dir ./out`
 *
 * Reads the archive JSON (and the eval-holdout JSON when given), hands both to
 * `exportTrainingData`, then writes `sft.jsonl`, `dpo.jsonl` and
 * `export-report.json` into the output directory, creating it if needed.
 *
 * @param {string[]} args CLI tokens after the `export` subcommand.
 * @returns {Promise<{code: number, lines: string[]}>} Code 0 with a summary of
 *   the export report, or code 2 with a usage / validation message when
 *   `--archive` is missing or `--max-tokens` is not a positive number.
 *   The promise rejects if a file cannot be read, its JSON is invalid, the
 *   exporter throws, or an output file cannot be written.
 */
export async function exportCmd(args) {
    const archivePath = flag(args, '--archive');
    const holdoutPath = flag(args, '--eval-holdout');
    const outDir = flag(args, '--out-dir') ?? '.';
    const maxTokensRaw = flag(args, '--max-tokens');
    const truncate = has(args, '--truncate');
    if (!archivePath) {
        return { code: 2, lines: ['Usage: weight-eft export --archive <archive.json> --eval-holdout <holdout.json> [--out-dir <dir>] [--max-tokens N] [--truncate]'] };
    }
    // Validate the token budget up front: `Number('abc')` is NaN, and passing
    // NaN on would hand the exporter a meaningless budget without any error.
    let maxTokens;
    if (maxTokensRaw) {
        maxTokens = Number(maxTokensRaw);
        if (!Number.isFinite(maxTokens) || maxTokens <= 0) {
            return { code: 2, lines: [`Invalid --max-tokens value: ${maxTokensRaw} (expected a positive number)`] };
        }
    }
    const archive = JSON.parse(await readFile(resolve(archivePath), 'utf8'));
    // No holdout file means an empty holdout list is passed to the exporter.
    const evalHoldout = holdoutPath
        ? JSON.parse(await readFile(resolve(holdoutPath), 'utf8'))
        : [];
    const result = exportTrainingData(archive, {
        evalHoldout,
        maxTokens,
        truncateOverLength: truncate,
    });
    const outDirAbs = resolve(outDir);
    await mkdir(outDirAbs, { recursive: true });
    const sftPath = resolve(outDirAbs, 'sft.jsonl');
    const dpoPath = resolve(outDirAbs, 'dpo.jsonl');
    const reportPath = resolve(outDirAbs, 'export-report.json');
    await writeFile(sftPath, sftToJsonl(result.sft));
    await writeFile(dpoPath, dpoToJsonl(result.dpo));
    await writeFile(reportPath, JSON.stringify(result.report, null, 2));
    const r = result.report;
    const lines = [
        `weight-eft export (contamination guard ENFORCED):`,
        ` trajectories in: ${r.totalTrajectories}`,
        ` excluded (holdout): ${r.excludedByHoldout} (train/eval disjointness)`,
        ` dropped over-len: ${r.droppedOverLength} truncated: ${r.truncatedOverLength}`,
        ` reward-hack drops: ${r.droppedRewardHacked} (gold-read / verification-tamper / sandbox-escape)`,
        ` SFT rows: ${r.sftRows} → ${sftPath}`,
        ` DPO rows: ${r.dpoRows} → ${dpoPath} (on-policy cheap-vs-cheap only)`,
        ` report: ${reportPath}`,
    ];
    if (!holdoutPath) {
        // Keep the summary honest: with an empty holdout list there was no
        // eval set to keep the training rows disjoint from.
        lines.push(` WARNING: no --eval-holdout given — the holdout list was empty, so no train/eval disjointness was checked`);
    }
    return { code: 0, lines };
}
|
|
65
|
+
/**
 * `weight-eft train --base <id> --params-b N --sft sft.jsonl [--dpo dpo.jsonl] --adapter <prefix> [--train]`
 *
 * Always emits the training plan(s): SFT only, or SFT + DPO when `--dpo` is
 * given. `runTraining` is called only when `--train` is present; its reported
 * status (and reason, if any) is appended to the output.
 *
 * @param {string[]} args CLI tokens after the `train` subcommand.
 * @returns {Promise<{code: number, lines: string[]}>} Code 0 with the plan
 *   (and run status lines), or code 2 with a usage / validation message when
 *   `--sft` is missing or `--params-b` is not a positive number.
 */
export async function trainCmd(args) {
    const usage = 'Usage: weight-eft train --base <id> --params-b N --sft <sft.jsonl> [--dpo <dpo.jsonl>] --adapter <prefix> [--train]';
    const baseId = flag(args, '--base') ?? 'Qwen/Qwen2.5-Coder-7B-Instruct';
    const paramsBRaw = flag(args, '--params-b') ?? '7';
    const sftData = flag(args, '--sft');
    const dpoData = flag(args, '--dpo');
    const adapter = flag(args, '--adapter') ?? 'cheap-tier';
    const doTrain = has(args, '--train');
    if (!sftData) {
        return { code: 2, lines: [usage] };
    }
    // `Number('abc')` is NaN; reject it here instead of baking NaN into the plan.
    const paramsB = Number(paramsBRaw);
    if (!Number.isFinite(paramsB) || paramsB <= 0) {
        return { code: 2, lines: [`Invalid --params-b value: ${paramsBRaw} (expected a positive number)`, usage] };
    }
    const base = { id: baseId, paramsB };
    const lines = [];
    const gpu = defaultDetectGpu();
    lines.push(`weight-eft train (${doTrain ? '--train requested' : 'DRY-RUN — emit plan only'}):`);
    lines.push(` GPU/endpoint: ${gpu.available ? 'available — ' : 'NOT available — '}${gpu.detail}`);
    if (dpoData) {
        // Two-stage recipe: print both plans first, then (optionally) run them in order.
        const plans = twoStagePlan(base, sftData, dpoData, adapter);
        for (const stage of [plans.sft, plans.dpo]) {
            lines.push(` PLAN ${stage.summary}`);
            lines.push(` ${stage.command}`);
        }
        if (doTrain) {
            const sftRun = runTraining(plans.sft.config, { train: true });
            lines.push(` SFT: ${sftRun.status}${sftRun.reason ? ' — ' + sftRun.reason : ''}`);
            // NOTE(review): DPO is started regardless of the SFT run's status —
            // confirm runTraining copes when the SFT stage did not produce a checkpoint.
            const dpoRun = runTraining(plans.dpo.config, { train: true });
            lines.push(` DPO: ${dpoRun.status}${dpoRun.reason ? ' — ' + dpoRun.reason : ''}`);
        }
    }
    else {
        const plan = buildPlan(sftConfig(base, sftData, `${adapter}-sft`));
        lines.push(` PLAN ${plan.summary}`);
        lines.push(` ${plan.command}`);
        if (doTrain) {
            const run = runTraining(plan.config, { train: true });
            lines.push(` SFT: ${run.status}${run.reason ? ' — ' + run.reason : ''}`);
        }
    }
    return { code: 0, lines };
}
|
|
105
|
+
/**
 * `weight-eft eval --base-outcomes b.json --adapter-outcomes a.json`
 *
 * Loads the two outcome files, folds them through `costParetoDelta`, and
 * formats the resulting delta as printable lines.
 *
 * @param {string[]} args CLI tokens after the `eval` subcommand.
 * @returns {Promise<{code: number, lines: string[]}>} Code 0 with the delta
 *   report, or code 2 with the usage line when either path is missing.
 */
export async function evalCmd(args) {
    const basePath = flag(args, '--base-outcomes');
    const adapterPath = flag(args, '--adapter-outcomes');
    if (!(basePath && adapterPath)) {
        return { code: 2, lines: ['Usage: weight-eft eval --base-outcomes <base.json> --adapter-outcomes <adapter.json>'] };
    }
    // Read sequentially, base first, so a failure on the base file surfaces first.
    const loadJson = async (filePath) => JSON.parse(await readFile(resolve(filePath), 'utf8'));
    const baseOutcomes = await loadJson(basePath);
    const adapterOutcomes = await loadJson(adapterPath);
    const delta = costParetoDelta(baseOutcomes, adapterOutcomes);
    // Fractions are rendered as percentage points with one decimal.
    const asPoints = (fraction) => `${(fraction * 100).toFixed(1)}pp`;
    const report = [];
    report.push(`weight-eft eval (cost-Pareto, NOT hard-tail):`);
    report.push(` cheap-resolve lift: ${asPoints(delta.cheapResolveLift)}`);
    report.push(` escalation reduction: ${asPoints(delta.escalationRateReduction)} (the cost-Pareto win)`);
    report.push(` cost/resolved reduction: $${delta.costPerResolvedReduction.toFixed(4)}`);
    report.push(` resolve-rate delta: ${asPoints(delta.resolveRateDelta)} (expected ≈0 — ceiling unmoved)`);
    report.push(` verdict: ${delta.verdict}`);
    return { code: 0, lines: report };
}
|
|
127
|
+
/**
 * Build the static status / usage summary: thesis, data recipe, guards,
 * training gate and the list of subcommands.
 *
 * The GUARDS section includes the reward-hack filter alongside the other
 * guards, since `export` counts and reports reward-hack drops.
 *
 * @returns {{code: number, lines: string[]}} Always code 0 plus the summary lines.
 */
export function statusCmd() {
    return {
        code: 0,
        lines: [
            `@metaharness/weight-eft — evolutionary fine-tuning (ADR-198)`,
            ``,
            `THESIS: distill the archive into the open cheap tier (GLM/Qwen) via LoRA so`,
            `the cost-cascade escalates to a frontier model LESS often. Cost-Pareto axis,`,
            `NOT the frontier ceiling — a 7-14B tune will not crack the hard tail.`,
            ``,
            `DATA RECIPE:`,
            ` SFT = ALL gold-resolved trajectories (cheap-own AND frontier-escalation;`,
            ` frontier successes for off-policy-safe DISTILLATION).`,
            ` DPO = ON-POLICY cheap-vs-cheap pairs ONLY (chosen=resolved, rejected=failed,`,
            ` same model + same instance). No frontier-chosen pairs (off-policy).`,
            ``,
            `GUARDS:`,
            ` - Contamination: strict train/eval instance-ID disjointness (throws on overlap).`,
            ` - Reward-hack filter: drops gold-read / verification-tamper / sandbox-escape "successes".`,
            ` - Long-context: drop/truncate over-budget trajectories, always reported.`,
            ` - Tool-call fidelity: tool_calls preserved (never stringified).`,
            ``,
            `TRAIN: GPU-gated. Dry-run (plan) unless --train AND a GPU/endpoint is detected.`,
            ` Target 7-14B (Qwen2.5-Coder-7B / GLM-4-9B), NOT 32B.`,
            ``,
            `Subcommands: export | train | eval | status`,
        ],
    };
}
|
|
155
|
+
/**
 * Route a subcommand name to its handler.
 *
 * @param {string | undefined} sub Subcommand token; `undefined`, `status`,
 *   `--help` and `-h` all produce the status summary.
 * @param {string[]} args Remaining CLI tokens, forwarded to the handler.
 * @returns {Promise<{code: number, lines: string[]}>} The handler's result, or
 *   code 2 with an "Unknown subcommand" message for anything unrecognised.
 */
export async function dispatch(sub, args) {
    if (sub === 'export') {
        return exportCmd(args);
    }
    if (sub === 'train') {
        return trainCmd(args);
    }
    if (sub === 'eval') {
        return evalCmd(args);
    }
    const wantsStatus = sub === 'status' || sub === undefined || sub === '--help' || sub === '-h';
    if (wantsStatus) {
        return statusCmd();
    }
    return { code: 2, lines: [`Unknown subcommand: ${sub}`, `Run 'weight-eft status' for usage.`] };
}
|
|
172
|
+
// Direct CLI entry (when invoked as the `weight-eft` bin).
//
// Comparing `import.meta.url` with a hand-built `file://${process.argv[1]}`
// misses three common cases: launch through the npm `.bin` symlink (argv[1] is
// the link, import.meta.url the resolved target), Windows paths, and paths
// containing characters that must be percent-encoded in a URL. So argv[1] is
// resolved with realpathSync and converted with pathToFileURL instead.
function isDirectInvocation() {
    const entry = process.argv[1];
    if (!entry) {
        return false;
    }
    try {
        return import.meta.url === pathToFileURL(realpathSync(entry)).href;
    }
    catch {
        // argv[1] could not be resolved on disk; fall back to the plain comparison.
        return import.meta.url === `file://${entry}`;
    }
}
const isMain = isDirectInvocation();
if (isMain) {
    const [, , sub, ...rest] = process.argv;
    dispatch(sub, rest)
        .then((r) => {
        // Print the handler's output, then exit with its code.
        for (const l of r.lines)
            console.log(l);
        process.exit(r.code);
    })
        .catch((err) => {
        // A rejected handler (unreadable file, bad JSON, exporter throw) exits 1.
        console.error(err instanceof Error ? err.message : String(err));
        process.exit(1);
    });
}
|
|
187
|
+
//# sourceMappingURL=cli.js.map
|
package/dist/cli.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,+BAA+B;AAC/B,EAAE;AACF,mDAAmD;AACnD,EAAE;AACF,8EAA8E;AAC9E,yEAAyE;AACzE,0EAA0E;AAC1E,6EAA6E;AAC7E,iEAAiE;AAEjE,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,kBAAkB,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzE,OAAO,EACL,SAAS,EAET,SAAS,EACT,WAAW,EACX,YAAY,EACZ,gBAAgB,GAEjB,MAAM,YAAY,CAAC;AACpB,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAS5C,SAAS,IAAI,CAAC,IAAc,EAAE,IAAY;IACxC,MAAM,CAAC,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IAC7B,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;AAC1C,CAAC;AACD,SAAS,GAAG,CAAC,IAAc,EAAE,IAAY;IACvC,OAAO,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;AAC7B,CAAC;AAED,iFAAiF;AACjF,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,IAAc;IAC5C,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;IAC5C,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,EAAE,gBAAgB,CAAC,CAAC;IACjD,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,EAAE,WAAW,CAAC,IAAI,GAAG,CAAC;IAC9C,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,EAAE,cAAc,CAAC,CAAC;IAC7C,MAAM,QAAQ,GAAG,GAAG,CAAC,IAAI,EAAE,YAAY,CAAC,CAAC;IACzC,IAAI,CAAC,WAAW,EAAE,CAAC;QACjB,OAAO,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,iIAAiI,CAAC,EAAE,CAAC;IACjK,CAAC;IACD,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,QAAQ,CAAC,OAAO,CAAC,WAAW,CAAC,EAAE,MAAM,CAAC,CAAuB,CAAC;IAC/F,MAAM,WAAW,GAAa,WAAW;QACvC,CAAC,CAAE,IAAI,CAAC,KAAK,CAAC,MAAM,QAAQ,CAAC,OAAO,CAAC,WAAW,CAAC,EAAE,MAAM,CAAC,CAAc;QACxE,CAAC,CAAC,EAAE,CAAC;IACP,MAAM,MAAM,GAAG,kBAAkB,CAAC,OAAO,EAAE;QACzC,WAAW;QACX,SAAS,EAAE,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS;QACpD,kBAAkB,EAAE,QAAQ;KAC7B,CAAC,CAAC;IACH,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IAClC,MAAM,KAAK,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC5C,MAAM,OAAO,GAAG,OAAO,CAAC,SAAS,EAAE,WAAW,CAAC,CAAC;IAChD,MAAM,OAAO,GAAG,OAAO,CAAC,SAAS,EAAE,WAAW,CAAC,CAAC;IAChD,MAAM,UAAU,GAAG,OAAO,CAAC,SAAS,EAAE,oBAAoB,CAAC,CAAC;IAC5D,MAAM,SAAS,CAAC,OAAO,EAAE,UAAU,CAAC,MAAM,CAAC,GAAG,CA
AC,CAAC,CAAC;IACjD,MAAM,SAAS,CAAC,OAAO,EAAE,UAAU,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;IACjD,MAAM,SAAS,CAAC,UAAU,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IACpE,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;IACxB,OAAO;QACL,IAAI,EAAE,CAAC;QACP,KAAK,EAAE;YACL,mDAAmD;YACnD,wBAAwB,CAAC,CAAC,iBAAiB,EAAE;YAC7C,yBAAyB,CAAC,CAAC,iBAAiB,8BAA8B;YAC1E,wBAAwB,CAAC,CAAC,iBAAiB,kBAAkB,CAAC,CAAC,mBAAmB,EAAE;YACpF,wBAAwB,CAAC,CAAC,mBAAmB,wDAAwD;YACrG,wBAAwB,CAAC,CAAC,OAAO,MAAM,OAAO,EAAE;YAChD,wBAAwB,CAAC,CAAC,OAAO,MAAM,OAAO,oCAAoC;YAClF,wBAAwB,UAAU,EAAE;SACrC;KACF,CAAC;AACJ,CAAC;AAED,iHAAiH;AACjH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAAC,IAAc;IAC3C,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,EAAE,QAAQ,CAAC,IAAI,gCAAgC,CAAC;IACxE,MAAM,OAAO,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE,YAAY,CAAC,IAAI,GAAG,CAAC,CAAC;IACxD,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACpC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACpC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,WAAW,CAAC,IAAI,YAAY,CAAC;IACxD,MAAM,OAAO,GAAG,GAAG,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC;IACrC,MAAM,IAAI,GAAkB,EAAE,EAAE,EAAE,MAAM,EAAE,OAAO,EAAE,CAAC;IACpD,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,IAAI,CAAC,OAAO,EAAE,CAAC;QACb,OAAO,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,qHAAqH,CAAC,EAAE,CAAC;IACrJ,CAAC;IAED,MAAM,GAAG,GAAG,gBAAgB,EAAE,CAAC;IAC/B,KAAK,CAAC,IAAI,CAAC,qBAAqB,OAAO,CAAC,CAAC,CAAC,mBAAmB,CAAC,CAAC,CAAC,0BAA0B,IAAI,CAAC,CAAC;IAChG,KAAK,CAAC,IAAI,CAAC,mBAAmB,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,kBAAkB,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,CAAC;IAElG,IAAI,OAAO,EAAE,CAAC;QACZ,MAAM,KAAK,GAAG,YAAY,CAAC,IAAI,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;QAC5D,KAAK,MAAM,KAAK,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,KAAK,CAAC,GAAG,CAAC,EAAE,CAAC;YAC3C,KAAK,CAAC,IAAI,CAAC,UAAU,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;YACtC,KAAK,CAAC,IAAI,CAAC,UAAU,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QACxC,CAAC;QACD,IAAI,OAAO,EAAE,CAAC;YACZ,MAAM,MAAM,GAAG,WAAW,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;YAC9D,KAAK,CAAC,IAAI,CAAC,UAAU,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,CAA
C,CAAC,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACnF,MAAM,MAAM,GAAG,WAAW,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;YAC9D,KAAK,CAAC,IAAI,CAAC,UAAU,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACrF,CAAC;IACH,CAAC;SAAM,CAAC;QACN,MAAM,IAAI,GAAG,SAAS,CAAC,SAAS,CAAC,IAAI,EAAE,OAAO,EAAE,GAAG,OAAO,MAAM,CAAC,CAAC,CAAC;QACnE,KAAK,CAAC,IAAI,CAAC,UAAU,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC;QACrC,KAAK,CAAC,IAAI,CAAC,UAAU,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC;QACrC,IAAI,OAAO,EAAE,CAAC;YACZ,MAAM,GAAG,GAAG,WAAW,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;YACtD,KAAK,CAAC,IAAI,CAAC,UAAU,GAAG,CAAC,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QAC5E,CAAC;IACH,CAAC;IACD,OAAO,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC;AAC5B,CAAC;AAED,yEAAyE;AACzE,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,IAAc;IAC1C,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,EAAE,iBAAiB,CAAC,CAAC;IAC/C,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,EAAE,oBAAoB,CAAC,CAAC;IACrD,IAAI,CAAC,QAAQ,IAAI,CAAC,WAAW,EAAE,CAAC;QAC9B,OAAO,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,sFAAsF,CAAC,EAAE,CAAC;IACtH,CAAC;IACD,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,QAAQ,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAC,CAAqB,CAAC;IAC/F,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,QAAQ,CAAC,OAAO,CAAC,WAAW,CAAC,EAAE,MAAM,CAAC,CAAqB,CAAC;IACrG,MAAM,CAAC,GAAG,eAAe,CAAC,YAAY,EAAE,eAAe,CAAC,CAAC;IACzD,OAAO;QACL,IAAI,EAAE,CAAC;QACP,KAAK,EAAE;YACL,+CAA+C;YAC/C,8BAA8B,CAAC,CAAC,CAAC,gBAAgB,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI;YACvE,8BAA8B,CAAC,CAAC,CAAC,uBAAuB,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,2BAA2B;YACrG,+BAA+B,CAAC,CAAC,wBAAwB,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE;YACtE,8BAA8B,CAAC,CAAC,CAAC,gBAAgB,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,qCAAqC;YACxG,cAAc,CAAC,CAAC,OAAO,EAAE;SAC1B;KACF,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,SAAS;IACvB,OAAO;QACL,IAAI,EAAE,CAAC;QACP,KAAK,EAAE;YACL,8DAA8D;YAC9D,EAAE;YACF,6EAA6E;YAC7E,8EAA8E;YAC9E,uEAAuE;YACvE,EAAE;YACF
,cAAc;YACd,4EAA4E;YAC5E,+DAA+D;YAC/D,gFAAgF;YAChF,6EAA6E;YAC7E,EAAE;YACF,SAAS;YACT,oFAAoF;YACpF,4EAA4E;YAC5E,mEAAmE;YACnE,EAAE;YACF,iFAAiF;YACjF,6DAA6D;YAC7D,EAAE;YACF,6CAA6C;SAC9C;KACF,CAAC;AACJ,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,QAAQ,CAAC,GAAuB,EAAE,IAAc;IACpE,QAAQ,GAAG,EAAE,CAAC;QACZ,KAAK,QAAQ;YACX,OAAO,SAAS,CAAC,IAAI,CAAC,CAAC;QACzB,KAAK,OAAO;YACV,OAAO,QAAQ,CAAC,IAAI,CAAC,CAAC;QACxB,KAAK,MAAM;YACT,OAAO,OAAO,CAAC,IAAI,CAAC,CAAC;QACvB,KAAK,QAAQ,CAAC;QACd,KAAK,SAAS,CAAC;QACf,KAAK,QAAQ,CAAC;QACd,KAAK,IAAI;YACP,OAAO,SAAS,EAAE,CAAC;QACrB;YACE,OAAO,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,uBAAuB,GAAG,EAAE,EAAE,oCAAoC,CAAC,EAAE,CAAC;IACpG,CAAC;AACH,CAAC;AAED,2DAA2D;AAC3D,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,KAAK,UAAU,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC;AAC/D,IAAI,MAAM,EAAE,CAAC;IACX,MAAM,CAAC,EAAE,AAAD,EAAG,GAAG,EAAE,GAAG,IAAI,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC;IACxC,QAAQ,CAAC,GAAG,EAAE,IAAI,CAAC;SAChB,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE;QACV,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,KAAK;YAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QACxC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;IACvB,CAAC,CAAC;SACD,KAAK,CAAC,CAAC,GAAY,EAAE,EAAE;QACtB,OAAO,CAAC,KAAK,CAAC,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;QAChE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC,CAAC,CAAC;AACP,CAAC"}
|
package/dist/eval.d.ts
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/** Per-instance outcome from a conformant cascade eval run on the held-out set. */
|
|
2
|
+
export interface CascadeOutcome {
|
|
3
|
+
instance_id: string;
|
|
4
|
+
/** Did the cheap tier resolve it WITHOUT escalating? */
|
|
5
|
+
cheapResolved: boolean;
|
|
6
|
+
/** Did the run escalate to the frontier tier? */
|
|
7
|
+
escalated: boolean;
|
|
8
|
+
/** Final resolved status (cheap OR post-escalation), from gold eval. */
|
|
9
|
+
resolved: boolean;
|
|
10
|
+
/** Measured $ for this instance (cheap + any escalation). */
|
|
11
|
+
costUsd: number;
|
|
12
|
+
}
|
|
13
|
+
/** The cost-Pareto summary for one eval run (base OR adapter). */
|
|
14
|
+
export interface CascadeSummary {
|
|
15
|
+
n: number;
|
|
16
|
+
/** Fraction the CHEAP tier resolved alone. */
|
|
17
|
+
cheapResolveRate: number;
|
|
18
|
+
/** Fraction of runs that escalated to the frontier tier. */
|
|
19
|
+
escalationRate: number;
|
|
20
|
+
/** Final resolve rate (the headline, expected ~unchanged — ceiling unmoved). */
|
|
21
|
+
resolveRate: number;
|
|
22
|
+
/** Mean $ per RESOLVED instance (the cost-Pareto figure of merit). */
|
|
23
|
+
costPerResolved: number;
|
|
24
|
+
/** Total $ across the run. */
|
|
25
|
+
totalCostUsd: number;
|
|
26
|
+
}
|
|
27
|
+
/** The delta between an adapter run and the base run — the cost-Pareto win. */
|
|
28
|
+
export interface CostParetoDelta {
|
|
29
|
+
base: CascadeSummary;
|
|
30
|
+
adapter: CascadeSummary;
|
|
31
|
+
/** adapter.cheapResolveRate − base.cheapResolveRate (want > 0). */
|
|
32
|
+
cheapResolveLift: number;
|
|
33
|
+
/** base.escalationRate − adapter.escalationRate (want > 0 — FEWER escalations). */
|
|
34
|
+
escalationRateReduction: number;
|
|
35
|
+
/** base.costPerResolved − adapter.costPerResolved (want > 0 — cheaper). */
|
|
36
|
+
costPerResolvedReduction: number;
|
|
37
|
+
/** adapter.resolveRate − base.resolveRate (expected ≈ 0 — ceiling unmoved). */
|
|
38
|
+
resolveRateDelta: number;
|
|
39
|
+
/** Honest verdict string for telemetry. */
|
|
40
|
+
verdict: string;
|
|
41
|
+
}
|
|
42
|
+
/** Fold per-instance cascade outcomes into the cost-Pareto summary. */
|
|
43
|
+
export declare function summarizeCascade(outcomes: CascadeOutcome[]): CascadeSummary;
|
|
44
|
+
/**
|
|
45
|
+
* Compute the cost-Pareto delta between an adapter run and the base run.
|
|
46
|
+
* Stays HONEST: the headline is escalation-rate-reduction + cost/resolved, and
|
|
47
|
+
* the verdict flags when the resolve ceiling is (as expected) unmoved.
|
|
48
|
+
*/
|
|
49
|
+
export declare function costParetoDelta(baseOutcomes: CascadeOutcome[], adapterOutcomes: CascadeOutcome[]): CostParetoDelta;
|
|
50
|
+
//# sourceMappingURL=eval.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval.d.ts","sourceRoot":"","sources":["../src/eval.ts"],"names":[],"mappings":"AAmBA,mFAAmF;AACnF,MAAM,WAAW,cAAc;IAC7B,WAAW,EAAE,MAAM,CAAC;IACpB,wDAAwD;IACxD,aAAa,EAAE,OAAO,CAAC;IACvB,iDAAiD;IACjD,SAAS,EAAE,OAAO,CAAC;IACnB,wEAAwE;IACxE,QAAQ,EAAE,OAAO,CAAC;IAClB,6DAA6D;IAC7D,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,kEAAkE;AAClE,MAAM,WAAW,cAAc;IAC7B,CAAC,EAAE,MAAM,CAAC;IACV,8CAA8C;IAC9C,gBAAgB,EAAE,MAAM,CAAC;IACzB,4DAA4D;IAC5D,cAAc,EAAE,MAAM,CAAC;IACvB,gFAAgF;IAChF,WAAW,EAAE,MAAM,CAAC;IACpB,sEAAsE;IACtE,eAAe,EAAE,MAAM,CAAC;IACxB,8BAA8B;IAC9B,YAAY,EAAE,MAAM,CAAC;CACtB;AAED,+EAA+E;AAC/E,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,cAAc,CAAC;IACrB,OAAO,EAAE,cAAc,CAAC;IACxB,mEAAmE;IACnE,gBAAgB,EAAE,MAAM,CAAC;IACzB,mFAAmF;IACnF,uBAAuB,EAAE,MAAM,CAAC;IAChC,2EAA2E;IAC3E,wBAAwB,EAAE,MAAM,CAAC;IACjC,+EAA+E;IAC/E,gBAAgB,EAAE,MAAM,CAAC;IACzB,2CAA2C;IAC3C,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,uEAAuE;AACvE,wBAAgB,gBAAgB,CAAC,QAAQ,EAAE,cAAc,EAAE,GAAG,cAAc,CA8B3E;AAED;;;;GAIG;AACH,wBAAgB,eAAe,CAC7B,YAAY,EAAE,cAAc,EAAE,EAC9B,eAAe,EAAE,cAAc,EAAE,GAChC,eAAe,CAmCjB"}
|
package/dist/eval.js
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
// SPDX-License-Identifier: MIT
|
|
2
|
+
//
|
|
3
|
+
// eval.ts — the eval-harness hook.
|
|
4
|
+
//
|
|
5
|
+
// Given a tuned adapter, measure the COST-PARETO win — NOT hard-tail cracking.
|
|
6
|
+
// The metric that matters is:
|
|
7
|
+
// 1. cheap-tier-ALONE resolve on the HELD-OUT clean set, and
|
|
8
|
+
// 2. the cascade's ESCALATION-RATE delta (fewer $0.50 frontier escalations).
|
|
9
|
+
//
|
|
10
|
+
// A LoRA tune on 7-14B will NOT crack the hard tail (frontier reasoning
|
|
11
|
+
// ceiling, clean-eval 37.3% + §53). The honest win is: the cheap tier resolves
|
|
12
|
+
// more on its own → the cascade escalates less → $/resolved drops. This module
|
|
13
|
+
// computes that delta from two eval runs (base vs adapter) and stays honest
|
|
14
|
+
// about what it measures.
|
|
15
|
+
//
|
|
16
|
+
// The eval RUN itself (conformant swebench gold eval) is the existing darwin
|
|
17
|
+
// path; this hook consumes its per-instance outcomes and folds them into the
|
|
18
|
+
// cost-Pareto metric. No eval is executed here ($0).
|
|
19
|
+
/**
 * Aggregate per-instance cascade outcomes into one cost-Pareto summary.
 *
 * Rates are fractions of the run size `n`. `costPerResolved` is the total
 * spend of the run divided by the number of resolved instances. An empty
 * run yields all-zero fields, and a run that resolved nothing reports a
 * `costPerResolved` of 0.
 *
 * @param outcomes per-instance outcomes of one eval run (base OR adapter)
 * @returns the summary: n, cheapResolveRate, escalationRate, resolveRate,
 *   costPerResolved, totalCostUsd
 */
export function summarizeCascade(outcomes) {
    const n = outcomes.length;
    const tally = { cheap: 0, escalated: 0, resolved: 0, cost: 0 };
    // Accumulate in input order so the floating-point cost total is stable.
    for (const outcome of outcomes) {
        tally.cheap += outcome.cheapResolved ? 1 : 0;
        tally.escalated += outcome.escalated ? 1 : 0;
        tally.resolved += outcome.resolved ? 1 : 0;
        tally.cost += outcome.costUsd;
    }
    // An empty run has no meaningful denominator: report 0 instead of NaN.
    const rate = (count) => (n === 0 ? 0 : count / n);
    return {
        n,
        cheapResolveRate: rate(tally.cheap),
        escalationRate: rate(tally.escalated),
        resolveRate: rate(tally.resolved),
        costPerResolved: tally.resolved === 0 ? 0 : tally.cost / tally.resolved,
        totalCostUsd: tally.cost,
    };
}
|
|
54
|
+
/**
 * Compute the cost-Pareto delta between an adapter run and the base run.
 * Stays HONEST: the headline is escalation-rate-reduction + cost/resolved, and
 * the verdict flags when the resolve ceiling is (as expected) unmoved.
 *
 * The numeric fields are plain differences of the two summaries. The verdict
 * string additionally refuses to claim a win from degenerate data:
 *  - if either run is empty there is nothing to compare, so it reports
 *    NO COST WIN instead of treating the all-zero summary as an improvement;
 *  - summarizeCascade reports costPerResolved as 0 when a run resolved
 *    nothing, so the cost comparison only counts as "cheaper" when BOTH runs
 *    resolved at least one instance;
 *  - the PARTIAL message prints a regression with a "+" sign instead of a
 *    doubled sign.
 *
 * @param baseOutcomes per-instance outcomes of the base (untuned) run
 * @param adapterOutcomes per-instance outcomes of the adapter (tuned) run
 * @returns both summaries, the four deltas (positive reductions/lift are
 *   good), and a human-readable verdict for telemetry
 */
export function costParetoDelta(baseOutcomes, adapterOutcomes) {
    const base = summarizeCascade(baseOutcomes);
    const adapter = summarizeCascade(adapterOutcomes);
    const cheapResolveLift = adapter.cheapResolveRate - base.cheapResolveRate;
    const escalationRateReduction = base.escalationRate - adapter.escalationRate;
    const costPerResolvedReduction = base.costPerResolved - adapter.costPerResolved;
    const resolveRateDelta = adapter.resolveRate - base.resolveRate;

    // An empty run summarizes to all zeros; comparing against it is meaningless.
    const hasData = base.n > 0 && adapter.n > 0;
    // costPerResolved is a 0 placeholder (not a real price) when nothing resolved.
    const costComparable = base.resolveRate > 0 && adapter.resolveRate > 0;
    const fewer = escalationRateReduction > 0;
    const cheaper = costComparable && costPerResolvedReduction > 0;
    // More than 2pp of movement in the final resolve rate is worth a second look.
    const ceilingMoved = Math.abs(resolveRateDelta) > 0.02;

    // A reduction prints as "−x"; a negative reduction (a regression) prints as "+x".
    const sign = (reduction) => (reduction < 0 ? '+' : '−');
    const magnitude = (reduction, digits) => Math.abs(reduction).toFixed(digits);

    let verdict;
    if (!hasData) {
        verdict =
            `NO COST WIN: insufficient data (base n=${base.n}, adapter n=${adapter.n}) — ` +
                `an empty run cannot demonstrate a cost win.`;
    }
    else if (fewer && cheaper) {
        verdict =
            `COST-PARETO WIN: ${(escalationRateReduction * 100).toFixed(1)}pp fewer escalations, ` +
                `$${costPerResolvedReduction.toFixed(3)} cheaper per resolved` +
                (ceilingMoved
                    ? ` (note: resolve rate also moved ${(resolveRateDelta * 100).toFixed(1)}pp — investigate)`
                    : ` (resolve ceiling unmoved, as expected — this tunes COST, not the frontier ceiling)`);
    }
    else if (fewer || cheaper) {
        const escalationPart = `escalation${sign(escalationRateReduction)}${magnitude(escalationRateReduction * 100, 1)}pp`;
        const costPart = costComparable
            ? `cost${sign(costPerResolvedReduction)}$${magnitude(costPerResolvedReduction, 3)}/resolved`
            : `cost n/a (a run resolved nothing)`;
        verdict = `PARTIAL: ${escalationPart}, ${costPart}`;
    }
    else {
        verdict =
            `NO COST WIN: adapter did not reduce escalations or cost — the prune-the-overfitter ` +
                `gene should select BASE over this adapter.`;
    }
    return {
        base,
        adapter,
        cheapResolveLift,
        escalationRateReduction,
        costPerResolvedReduction,
        resolveRateDelta,
        verdict,
    };
}
|
|
96
|
+
//# sourceMappingURL=eval.js.map
|
package/dist/eval.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval.js","sourceRoot":"","sources":["../src/eval.ts"],"names":[],"mappings":"AAAA,+BAA+B;AAC/B,EAAE;AACF,mCAAmC;AACnC,EAAE;AACF,+EAA+E;AAC/E,8BAA8B;AAC9B,+DAA+D;AAC/D,+EAA+E;AAC/E,EAAE;AACF,wEAAwE;AACxE,+EAA+E;AAC/E,+EAA+E;AAC/E,4EAA4E;AAC5E,0BAA0B;AAC1B,EAAE;AACF,6EAA6E;AAC7E,6EAA6E;AAC7E,qDAAqD;AA8CrD,uEAAuE;AACvE,MAAM,UAAU,gBAAgB,CAAC,QAA0B;IACzD,MAAM,CAAC,GAAG,QAAQ,CAAC,MAAM,CAAC;IAC1B,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QACZ,OAAO;YACL,CAAC,EAAE,CAAC;YACJ,gBAAgB,EAAE,CAAC;YACnB,cAAc,EAAE,CAAC;YACjB,WAAW,EAAE,CAAC;YACd,eAAe,EAAE,CAAC;YAClB,YAAY,EAAE,CAAC;SAChB,CAAC;IACJ,CAAC;IACD,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,IAAI,QAAQ,GAAG,CAAC,CAAC;IACjB,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;QACzB,IAAI,CAAC,CAAC,aAAa;YAAE,KAAK,EAAE,CAAC;QAC7B,IAAI,CAAC,CAAC,SAAS;YAAE,SAAS,EAAE,CAAC;QAC7B,IAAI,CAAC,CAAC,QAAQ;YAAE,QAAQ,EAAE,CAAC;QAC3B,SAAS,IAAI,CAAC,CAAC,OAAO,CAAC;IACzB,CAAC;IACD,OAAO;QACL,CAAC;QACD,gBAAgB,EAAE,KAAK,GAAG,CAAC;QAC3B,cAAc,EAAE,SAAS,GAAG,CAAC;QAC7B,WAAW,EAAE,QAAQ,GAAG,CAAC;QACzB,eAAe,EAAE,QAAQ,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;QACxD,YAAY,EAAE,SAAS;KACxB,CAAC;AACJ,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,eAAe,CAC7B,YAA8B,EAC9B,eAAiC;IAEjC,MAAM,IAAI,GAAG,gBAAgB,CAAC,YAAY,CAAC,CAAC;IAC5C,MAAM,OAAO,GAAG,gBAAgB,CAAC,eAAe,CAAC,CAAC;IAClD,MAAM,gBAAgB,GAAG,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC,gBAAgB,CAAC;IAC1E,MAAM,uBAAuB,GAAG,IAAI,CAAC,cAAc,GAAG,OAAO,CAAC,cAAc,CAAC;IAC7E,MAAM,wBAAwB,GAAG,IAAI,CAAC,eAAe,GAAG,OAAO,CAAC,eAAe,CAAC;IAChF,MAAM,gBAAgB,GAAG,OAAO,CAAC,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC;IAEhE,MAAM,KAAK,GAAG,uBAAuB,GAAG,CAAC,CAAC;IAC1C,MAAM,OAAO,GAAG,wBAAwB,GAAG,CAAC,CAAC;IAC7C,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,GAAG,IAAI,CAAC;IACvD,IAAI,OAAe,CAAC;IACpB,IAAI,KAAK,IAAI,OAAO,EAAE,CAAC;QACrB,OAAO;YACL,oBAAoB,CAAC,uBAAuB,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,wBAAwB;gBACtF,IAAI,wBAAwB,CAAC,OAAO,CAAC,CAAC,CAAC,uBAAuB;gBAC9D,CAAC,YAAY;oBACX,CAAC,CAAC,mCAAmC,CAAC,gBAAgB,GAAG,GAAG,
CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,mBAAmB;oBAC3F,CAAC,CAAC,qFAAqF,CAAC,CAAC;IAC/F,CAAC;SAAM,IAAI,KAAK,IAAI,OAAO,EAAE,CAAC;QAC5B,OAAO,GAAG,uBAAuB,CAAC,uBAAuB,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,aAAa,wBAAwB,CAAC,OAAO,CAAC,CAAC,CAAC,WAAW,CAAC;IACzI,CAAC;SAAM,CAAC;QACN,OAAO;YACL,qFAAqF;gBACrF,4CAA4C,CAAC;IACjD,CAAC;IACD,OAAO;QACL,IAAI;QACJ,OAAO;QACP,gBAAgB;QAChB,uBAAuB;QACvB,wBAAwB;QACxB,gBAAgB;QAChB,OAAO;KACR,CAAC;AACJ,CAAC"}
|
package/dist/export.d.ts
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import type { ChatMessage, DarwinTrajectory, DpoRow, ExportOptions, ExportResult, SftRow } from './types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Rough token estimate for a message list. We deliberately do NOT pull in a
|
|
4
|
+
* tokenizer dependency (the package is dependency-free, like darwin-mode). The
|
|
5
|
+
* ~4-chars-per-token heuristic is conservative and stable; tool_calls
|
|
6
|
+
* arguments count too (they cost context). This is a budget gate, not a billing
|
|
7
|
+
* figure — over-estimating is the safe direction.
|
|
8
|
+
*/
|
|
9
|
+
export declare function estimateTokens(messages: ChatMessage[]): number;
|
|
10
|
+
/**
|
|
11
|
+
* THE CONTAMINATION GUARD. Throws if any trajectory's instance_id appears in
|
|
12
|
+
* the eval holdout. Call it the first thing the exporter does — fail loud, not
|
|
13
|
+
* silently filter, when train/eval disjointness is violated.
|
|
14
|
+
*/
|
|
15
|
+
export declare function assertTrainEvalDisjoint(trajectories: DarwinTrajectory[], evalHoldout: Iterable<string>): void;
|
|
16
|
+
/**
|
|
17
|
+
* Build the SFT and DPO sets from a Darwin trajectory archive.
|
|
18
|
+
*
|
|
19
|
+
* @param trajectories the input archive (holdout already excluded, OR raw —
|
|
20
|
+
* the exporter excludes holdout members itself, but ASSERTS disjointness on
|
|
21
|
+
* what remains so a programming error can't slip eval data through).
|
|
22
|
+
*/
|
|
23
|
+
export declare function exportTrainingData(trajectories: DarwinTrajectory[], options: ExportOptions): ExportResult;
|
|
24
|
+
/** Serialize SFT rows to JSONL (one row per line). */
|
|
25
|
+
export declare function sftToJsonl(rows: SftRow[]): string;
|
|
26
|
+
/** Serialize DPO rows to JSONL (one row per line). */
|
|
27
|
+
export declare function dpoToJsonl(rows: DpoRow[]): string;
|
|
28
|
+
//# sourceMappingURL=export.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"export.d.ts","sourceRoot":"","sources":["../src/export.ts"],"names":[],"mappings":"AA6BA,OAAO,KAAK,EACV,WAAW,EACX,gBAAgB,EAChB,MAAM,EACN,aAAa,EAEb,YAAY,EACZ,MAAM,EACP,MAAM,YAAY,CAAC;AAKpB;;;;;;GAMG;AACH,wBAAgB,cAAc,CAAC,QAAQ,EAAE,WAAW,EAAE,GAAG,MAAM,CAa9D;AAyCD;;;;GAIG;AACH,wBAAgB,uBAAuB,CACrC,YAAY,EAAE,gBAAgB,EAAE,EAChC,WAAW,EAAE,QAAQ,CAAC,MAAM,CAAC,GAC5B,IAAI,CAcN;AAED;;;;;;GAMG;AACH,wBAAgB,kBAAkB,CAChC,YAAY,EAAE,gBAAgB,EAAE,EAChC,OAAO,EAAE,aAAa,GACrB,YAAY,CAgId;AAED,sDAAsD;AACtD,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,MAAM,CAEjD;AAED,sDAAsD;AACtD,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,MAAM,CAEjD"}
|