@kernel.chat/kbot 4.0.1 → 4.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/futures/debate/index.d.ts +7 -0
- package/dist/futures/debate/index.js +6 -0
- package/dist/futures/debate/runner.d.ts +34 -0
- package/dist/futures/debate/runner.js +140 -0
- package/dist/futures/debate/synthesis.d.ts +25 -0
- package/dist/futures/debate/synthesis.js +81 -0
- package/dist/futures/debate/types.d.ts +72 -0
- package/dist/futures/debate/types.js +12 -0
- package/dist/futures/forecast/index.d.ts +5 -0
- package/dist/futures/forecast/index.js +5 -0
- package/dist/futures/forecast/projection.d.ts +31 -0
- package/dist/futures/forecast/projection.js +177 -0
- package/dist/futures/forecast/synthesize.d.ts +19 -0
- package/dist/futures/forecast/synthesize.js +89 -0
- package/dist/futures/forecast/types.d.ts +59 -0
- package/dist/futures/forecast/types.js +15 -0
- package/dist/futures/harness/critic-evaluator.d.ts +39 -0
- package/dist/futures/harness/critic-evaluator.js +131 -0
- package/dist/futures/harness/evolution-loop.d.ts +41 -0
- package/dist/futures/harness/evolution-loop.js +168 -0
- package/dist/futures/harness/index.d.ts +16 -0
- package/dist/futures/harness/index.js +13 -0
- package/dist/futures/harness/meta-evolution.d.ts +32 -0
- package/dist/futures/harness/meta-evolution.js +52 -0
- package/dist/futures/harness/noop-evolution.d.ts +23 -0
- package/dist/futures/harness/noop-evolution.js +29 -0
- package/dist/futures/harness/persistence.d.ts +30 -0
- package/dist/futures/harness/persistence.js +99 -0
- package/dist/futures/harness/types.d.ts +147 -0
- package/dist/futures/harness/types.js +18 -0
- package/dist/futures/index.d.ts +16 -0
- package/dist/futures/index.js +22 -0
- package/dist/futures/latent-state/envelope.d.ts +39 -0
- package/dist/futures/latent-state/envelope.js +178 -0
- package/dist/futures/latent-state/index.d.ts +5 -0
- package/dist/futures/latent-state/index.js +3 -0
- package/dist/futures/latent-state/types.d.ts +47 -0
- package/dist/futures/latent-state/types.js +13 -0
- package/dist/futures/persona/check.d.ts +45 -0
- package/dist/futures/persona/check.js +205 -0
- package/dist/futures/persona/index.d.ts +5 -0
- package/dist/futures/persona/index.js +5 -0
- package/dist/futures/persona/registry.d.ts +22 -0
- package/dist/futures/persona/registry.js +124 -0
- package/dist/futures/persona/types.d.ts +68 -0
- package/dist/futures/persona/types.js +28 -0
- package/dist/futures/skill-graph/graph.d.ts +31 -0
- package/dist/futures/skill-graph/graph.js +151 -0
- package/dist/futures/skill-graph/index.d.ts +13 -0
- package/dist/futures/skill-graph/index.js +10 -0
- package/dist/futures/skill-graph/synthesis.d.ts +20 -0
- package/dist/futures/skill-graph/synthesis.js +83 -0
- package/dist/futures/skill-graph/types.d.ts +53 -0
- package/dist/futures/skill-graph/types.js +19 -0
- package/dist/tools/forecast-summary.d.ts +25 -0
- package/dist/tools/forecast-summary.js +204 -0
- package/dist/tools/swarm-2026-04.js +2 -0
- package/package.json +1 -1
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
// futures/forecast/synthesize — fan a list of Signals into Forecasts and
|
|
2
|
+
// produce human-readable summaries. Pure formatting; no IO.
|
|
3
|
+
import { bestProjection, clampHorizon, signalHistory } from './projection.js';
|
|
4
|
+
/**
 * Build one forecast per signal for the requested horizon.
 *
 * A signal is skipped when `clampHorizon(horizon, signalHistory(signal))`
 * returns a falsy value; every other signal is handed to `bestProjection`.
 * The resulting forecasts are ordered by the magnitude of `trend.slope`,
 * largest first, so the fastest-moving signals lead the list.
 *
 * @param signals Iterable of signals to project.
 * @param horizon Horizon key, forwarded unchanged to the projection helpers.
 * @returns Forecasts sorted by |trend.slope| descending.
 */
export function synthesizeForecasts(signals, horizon) {
    const bySlopeMagnitudeDesc = (left, right) => Math.abs(right.trend.slope) - Math.abs(left.trend.slope);
    const forecasts = [];
    for (const signal of signals) {
        const history = signalHistory(signal);
        if (clampHorizon(horizon, history)) {
            forecasts.push(bestProjection(signal, horizon));
        }
    }
    forecasts.sort(bySlopeMagnitudeDesc);
    return forecasts;
}
|
|
20
|
+
/**
 * Render a number compactly for display.
 *
 * Tiers (by magnitude): < 1 → two decimals, < 10 → one decimal,
 * < 1000 → integer, < 1e6 → thousands with a `k` suffix (a trailing `.0`
 * is dropped, e.g. `2k`), otherwise millions with an `M` suffix.
 * Non-finite input renders as an em dash.
 *
 * The tier is chosen from the *rounded* value: when rounding pushes a
 * number up to the next tier's floor (e.g. 999_999 → "1000.0k") the next
 * tier is used instead ("1.0M"). A negative value that rounds to zero is
 * shown without a sign ("0.00" rather than "-0.00").
 *
 * @param n Value to format.
 * @returns The formatted string.
 */
function formatNumber(n) {
    if (!Number.isFinite(n)) {
        return '—';
    }
    const abs = Math.abs(n);
    // toFixed formats the magnitude and prepends '-' itself, so formatting
    // `abs` and adding the sign by hand yields the same digits as n.toFixed.
    const sign = n < 0 ? '-' : '';
    if (abs < 1) {
        const text = abs.toFixed(2);
        if (Number(text) === 0) {
            return text; // avoid "-0.00"
        }
        if (Number(text) < 1) {
            return sign + text;
        }
    }
    if (abs < 10) {
        const text = abs.toFixed(1);
        if (Number(text) < 10) {
            return sign + text;
        }
    }
    if (abs < 1_000) {
        const text = abs.toFixed(0);
        if (Number(text) < 1_000) {
            return sign + text;
        }
    }
    if (abs < 1_000_000) {
        const text = (abs / 1_000).toFixed(1);
        if (Number(text) < 1_000) {
            return `${sign}${text}k`.replace('.0k', 'k');
        }
    }
    return `${sign}${(abs / 1_000_000).toFixed(1)}M`;
}
|
|
34
|
+
// Display phrase for each supported horizon key ('1d', '7d', '30d', '90d').
const HORIZON_LABEL = Object.fromEntries([1, 7, 30, 90].map((days) => [
    `${days}d`,
    `in ${days} ${days === 1 ? 'day' : 'days'}`,
]));
// Glyph shown in front of a forecast line, keyed by trend direction.
const ARROW = {
    up: '📈',
    down: '📉',
    flat: '➖',
};
|
|
45
|
+
/**
 * Pick the direction glyph for a trend.
 *
 * A trend of kind 'flat' always gets the flat glyph. Otherwise the sign of
 * the slope decides; a slope that is neither positive nor negative
 * (zero or NaN) also gets the flat glyph.
 *
 * @param slope Trend slope.
 * @param kind Trend kind.
 * @returns One of the ARROW glyphs.
 */
function arrowFor(slope, kind) {
    const rising = slope > 0;
    const falling = slope < 0;
    if (kind !== 'flat' && (rising || falling)) {
        return rising ? ARROW.up : ARROW.down;
    }
    return ARROW.flat;
}
|
|
54
|
+
/**
 * Markdown one-liner for a forecast.
 * Example: `📈 npm downloads → 14.2k (in 30 days, linear, r²=0.78, ±890)`
 *
 * The interval is half the distance between the bounds; the distance is
 * taken as an absolute value so swapped bounds never print a negative
 * width, and a non-finite width prints as `±?`. A horizon with no entry in
 * HORIZON_LABEL falls back to `in <horizon>` instead of printing
 * "undefined", and a missing or non-finite r² prints as `r²=?` instead of
 * throwing.
 *
 * @param f Forecast to render.
 * @returns The formatted line.
 */
export function formatForecast(f) {
    const { trend } = f;
    const arrow = arrowFor(trend.slope, trend.kind);
    const point = formatNumber(f.pointEstimate);
    const halfWidth = Math.abs(f.upperBound - f.lowerBound) / 2;
    const interval = Number.isFinite(halfWidth) ? `±${formatNumber(halfWidth)}` : '±?';
    const r2Text = Number.isFinite(trend.r2) ? trend.r2.toFixed(2) : '?';
    const fit = trend.kind === 'flat' ? 'flat' : `${trend.kind}, r²=${r2Text}`;
    const when = HORIZON_LABEL[f.horizon] ?? `in ${f.horizon}`;
    return `${arrow} ${f.signal} → ${point} (${when}, ${fit}, ${interval})`;
}
|
|
66
|
+
/**
 * Produce a short paragraph naming the first three forecasts in the list.
 *
 * The list is used in the order given; it is not re-sorted here. Pass the
 * output of `synthesizeForecasts` (sorted by absolute slope) to get the
 * top movers. An empty list yields a polite no-data sentence.
 *
 * Direction wording: a 'flat' trend, or any trend whose slope is neither
 * positive nor negative (zero or NaN), is "holding flat"; otherwise the
 * sign of the slope decides. A horizon with no entry in HORIZON_LABEL is
 * rendered as `in <horizon>`.
 *
 * @param forecasts Forecasts, most important first.
 * @returns A one-sentence summary.
 */
export function narrative(forecasts) {
    if (forecasts.length === 0) {
        return 'No forecasts available — not enough signal history to project.';
    }
    const phrases = forecasts.slice(0, 3).map((f) => {
        const { kind, slope } = f.trend;
        let direction = 'is holding flat';
        if (kind !== 'flat') {
            if (slope > 0) {
                direction = 'is trending up';
            }
            else if (slope < 0) {
                direction = 'is trending down';
            }
        }
        const point = formatNumber(f.pointEstimate);
        const conf = `${(f.confidence * 100).toFixed(0)}% conf`;
        const when = HORIZON_LABEL[f.horizon] ?? `in ${f.horizon}`;
        return `${f.signal} ${direction} toward ${point} ${when} (${conf})`;
    });
    const head = phrases.length === 1
        ? phrases[0]
        : `${phrases.slice(0, -1).join('; ')}; and ${phrases[phrases.length - 1]}`;
    return `Top movers: ${head}.`;
}
|
|
89
|
+
//# sourceMappingURL=synthesize.js.map
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
 * A named time series of scalar observations.
 *
 * Each point pairs a timestamp `ts` (milliseconds since the epoch, the same
 * unit `Date.now()` returns) with a numeric `value` such as a count or a
 * percentage. Points do not have to be pre-sorted; projection.ts is
 * documented to sort them internally.
 */
export interface Signal {
    name: string;
    values: {
        ts: number;
        value: number;
    }[];
}
|
|
13
|
+
/**
 * Shape of the fit chosen for a Signal.
 *
 * `kind` selects the model:
 * - 'linear'      : value ~ a + b*t       (slope = b, in value/ms)
 * - 'exponential' : value ~ exp(a + b*t)  (slope = b, in log(value)/ms)
 * - 'flat'        : low-variance fallback; slope == 0 and r2 == 0
 *
 * `r2` is the coefficient of determination in the space that was fit:
 * raw values for a linear fit, log(values) for an exponential fit.
 * Slopes of different kinds therefore carry different units.
 */
export interface Trend {
    kind: 'linear' | 'exponential' | 'flat';
    slope: number;
    r2: number;
}
|
|
27
|
+
/**
 * How far ahead to project, measured from the last observed timestamp.
 * The four keys span daily ('1d') through quarterly ('90d') outlooks.
 */
export type Horizon = '1d' | '7d' | '30d' | '90d';
|
|
32
|
+
/**
 * One forecast: a single signal projected to a single horizon.
 *
 * - `signal`        name of the Signal that was projected
 * - `horizon`       horizon the projection targets
 * - `trend`         fit that produced the projection
 * - `pointEstimate` the model's expected value at the end of the horizon
 * - `lowerBound` / `upperBound`
 *                   roughly a 95% interval (point ± 2 × residual stddev,
 *                   exp-transformed for exponential fits)
 * - `confidence`    score in 0..1 derived from r2 and history length;
 *                   higher means the projection deserves more trust
 * - `method`        short human-readable tag such as "linear-lsq",
 *                   "exp-loglin" or "flat-mean"
 */
export interface Forecast {
    signal: string;
    horizon: Horizon;
    trend: Trend;
    pointEstimate: number;
    lowerBound: number;
    upperBound: number;
    confidence: number;
    method: string;
}
|
|
53
|
+
/**
 * Length of each horizon in milliseconds, keyed by the Horizon literal.
 * It is a plain constant record rather than an enum so that callers can
 * keep passing string literals; projection.ts and synthesize.ts are the
 * intended consumers.
 */
export declare const HORIZON_MS: Record<Horizon, number>;
|
|
59
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
// futures/forecast — type definitions for projecting growth/research signals
|
|
2
|
+
// forward in time. Types plus one runtime constant (HORIZON_MS); no IO. Consumers (synthesize.ts,
|
|
3
|
+
// projection.ts) build Forecasts from raw Signal arrays.
|
|
4
|
+
// Milliseconds in one day; every horizon below is a whole number of days.
const MS_PER_DAY = 24 * 60 * 60 * 1000;
/**
 * Length of each horizon in milliseconds, keyed by the horizon literal.
 * Kept as a plain object (not an enum) so callers can go on using the
 * string keys directly; projection.ts and synthesize.ts are the intended
 * consumers.
 */
export const HORIZON_MS = {
    '1d': MS_PER_DAY,
    '7d': 7 * MS_PER_DAY,
    '30d': 30 * MS_PER_DAY,
    '90d': 90 * MS_PER_DAY,
};
|
|
15
|
+
//# sourceMappingURL=types.js.map
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CriticEvaluator — adapts `critic-gate.ts` to the harness `Evaluator`
|
|
3
|
+
* interface from `types.ts`.
|
|
4
|
+
*
|
|
5
|
+
* critic-gate.ts gates a single tool result. The harness Evaluator grades
|
|
6
|
+
* an entire trace against a Task's acceptance criteria. This adapter
|
|
7
|
+
* walks the trace's tool steps, runs each through `gateToolResult`, and
|
|
8
|
+
* aggregates the per-step verdicts into a single EvaluationReport:
|
|
9
|
+
*
|
|
10
|
+
* - `pass` = every acceptance criterion satisfied AND no critic rejected
|
|
11
|
+
* a tool step
|
|
12
|
+
* - `score` = (criteriaPassRate * 0.7) + (toolAcceptRate * 0.3),
|
|
13
|
+
* efficiency-tiebroken by total step time
|
|
14
|
+
* - failureModes derived from critic verdicts' `failure_class` (RF-NN-*)
|
|
15
|
+
*
|
|
16
|
+
* Acceptance criteria are matched via case-insensitive substring against
|
|
17
|
+
* the trace's flattened `output | error | finalState | action` text.
|
|
18
|
+
* That keeps the adapter dependency-free; richer matchers can subclass.
|
|
19
|
+
*
|
|
20
|
+
* critic-gate.ts is NOT modified — this is a pure consumer.
|
|
21
|
+
*/
|
|
22
|
+
import type { EvaluationReport, Evaluator, ExecutionTrace, Task } from './types.js';
|
|
23
|
+
import type { CriticVerdict, GateOpts } from '../../critic-gate.js';
|
|
24
|
+
/** Construction options for CriticEvaluator. */
export interface CriticEvaluatorOpts {
    /**
     * Options passed through to `gateToolResult` as its last argument
     * (strictness, provider, llmClient stub). Not used when `gateFn` is set.
     */
    gate?: GateOpts;
    /**
     * Replacement for `gateToolResult`. When provided it is called once per
     * tool step as `gateFn(tool, args, result)` and may answer synchronously
     * or with a promise, which lets tests skip the critic-gate provider
     * plumbing entirely.
     */
    gateFn?: (tool: string, args: Record<string, unknown>, result: unknown) => CriticVerdict | Promise<CriticVerdict>;
}
|
|
33
|
+
/**
 * Evaluator that grades a whole trace: acceptance criteria are checked
 * against the flattened trace text and every tool step is run through the
 * critic gate.
 */
export declare class CriticEvaluator implements Evaluator {
    private readonly opts;
    /** @param opts Gate configuration; defaults to an empty options object. */
    constructor(opts?: CriticEvaluatorOpts);
    /** Grade `trace` against `task` and resolve to the aggregated report. */
    evaluate(trace: ExecutionTrace, task: Task): Promise<EvaluationReport>;
}
|
|
38
|
+
/** Factory form of `new CriticEvaluator(opts)`, typed as the Evaluator interface. */
export declare function createCriticEvaluator(opts?: CriticEvaluatorOpts): Evaluator;
|
|
39
|
+
//# sourceMappingURL=critic-evaluator.d.ts.map
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CriticEvaluator — adapts `critic-gate.ts` to the harness `Evaluator`
|
|
3
|
+
* interface from `types.ts`.
|
|
4
|
+
*
|
|
5
|
+
* critic-gate.ts gates a single tool result. The harness Evaluator grades
|
|
6
|
+
* an entire trace against a Task's acceptance criteria. This adapter
|
|
7
|
+
* walks the trace's tool steps, runs each through `gateToolResult`, and
|
|
8
|
+
* aggregates the per-step verdicts into a single EvaluationReport:
|
|
9
|
+
*
|
|
10
|
+
* - `pass` = every acceptance criterion satisfied AND no critic rejected
|
|
11
|
+
* a tool step
|
|
12
|
+
* - `score` = (criteriaPassRate * 0.7) + (toolAcceptRate * 0.3),
|
|
13
|
+
* efficiency-tiebroken by total step time
|
|
14
|
+
* - failureModes derived from critic verdicts' `failure_class` (RF-NN-*)
|
|
15
|
+
*
|
|
16
|
+
* Acceptance criteria are matched via case-insensitive substring against
|
|
17
|
+
* the trace's flattened `output | error | finalState | action` text.
|
|
18
|
+
* That keeps the adapter dependency-free; richer matchers can subclass.
|
|
19
|
+
*
|
|
20
|
+
* critic-gate.ts is NOT modified — this is a pure consumer.
|
|
21
|
+
*/
|
|
22
|
+
import { gateToolResult } from '../../critic-gate.js';
|
|
23
|
+
// RF taxonomy class → harness FailureMode kind. Classes not listed here
// are reported as 'other'.
const RF_FAILURE_KINDS = new Map([
    ['RF-01-fabricated-evidence', 'hallucinated-state'],
    ['RF-10-simulation-role-confusion', 'hallucinated-state'],
    ['RF-02-metric-interpretation', 'misinterpreted-state'],
    ['RF-03-confused-provenance', 'misinterpreted-state'],
    ['RF-04-temporal-misordering', 'misinterpreted-state'],
    ['RF-12-repetition-failure-to-resume', 'reasoning-loop'],
    ['RF-08-evidential-insufficiency', 'missing-capability'],
    ['RF-11-excessive-speculation', 'missing-capability'],
    ['RF-16-arithmetic-error', 'incorrect-tool-usage'],
    ['RF-14-invalid-inference-pattern', 'incorrect-tool-usage'],
    ['RF-15-internal-contradiction', 'incorrect-tool-usage'],
]);
/**
 * Translate a critic `failure_class` into a harness FailureMode kind.
 *
 * @param rf RF taxonomy class reported by the critic.
 * @returns The matching kind, or 'other' for anything unrecognised.
 */
function rfToFailureKind(rf) {
    const kind = RF_FAILURE_KINDS.get(rf);
    return kind === undefined ? 'other' : kind;
}
|
|
46
|
+
/**
 * Collapse a trace into one newline-separated string for substring
 * matching.
 *
 * For every step the truthy `action`, `output` and `error` values are
 * appended in that order, followed by the JSON serialisation of
 * `trace.finalState`. A final state that cannot be serialised is left out.
 *
 * @param trace Execution trace to flatten.
 * @returns The joined text.
 */
function flattenTrace(trace) {
    const fragments = [];
    for (const step of trace.steps) {
        for (const text of [step.action, step.output, step.error]) {
            if (text) {
                fragments.push(text);
            }
        }
    }
    try {
        fragments.push(JSON.stringify(trace.finalState));
    }
    catch {
        // finalState could not be serialised (e.g. it is cyclic); skip it.
    }
    return fragments.join('\n');
}
|
|
64
|
+
/**
 * Evaluator that grades an entire trace against a task.
 *
 * - Each acceptance criterion passes when its lower-cased text occurs in
 *   the lower-cased flattened trace.
 * - Each step with `phase === 'tool'` is sent through the critic
 *   (`opts.gateFn` when set, otherwise `gateToolResult` with `opts.gate`);
 *   every rejection becomes one entry in `failureModes`.
 * - `score` is `criteriaPassRate * 0.7 + toolAcceptRate * 0.3` plus an
 *   efficiency bonus of at most 0.05, clamped to 0..1.
 * - `pass` requires every criterion to pass and no critic rejection.
 */
export class CriticEvaluator {
    opts;
    constructor(opts = {}) {
        this.opts = opts;
    }
    /**
     * @param trace Execution trace to grade.
     * @param task Task supplying `id` and the `acceptance` criteria.
     * @returns The aggregated evaluation report.
     */
    async evaluate(trace, task) {
        // Acceptance criteria: case-insensitive substring search.
        const searchable = flattenTrace(trace).toLowerCase();
        const criteriaResults = task.acceptance.map((criterion) => {
            if (searchable.includes(criterion.toLowerCase())) {
                return { criterion, passed: true, evidence: 'substring match in trace' };
            }
            return { criterion, passed: false, evidence: 'no match in flattened trace' };
        });
        const metCount = criteriaResults.filter((entry) => entry.passed).length;
        // With no criteria at all the rate counts as fully satisfied.
        const criteriaPassRate = criteriaResults.length === 0 ? 1 : metCount / criteriaResults.length;
        // Critic pass over the tool steps, strictly one after another.
        const toolSteps = trace.steps.filter((step) => step.phase === 'tool');
        const failureModes = [];
        let accepted = 0;
        for (const step of toolSteps) {
            // The critic sees the step output, else its error, else ''.
            const result = step.output ?? step.error ?? '';
            let verdict;
            if (this.opts.gateFn) {
                verdict = await this.opts.gateFn(step.action, {}, result);
            }
            else {
                verdict = await gateToolResult(step.action, {}, result, this.opts.gate);
            }
            if (verdict.accept) {
                accepted += 1;
                continue;
            }
            const failureClass = verdict.failure_class;
            failureModes.push(failureClass
                ? {
                    kind: rfToFailureKind(failureClass),
                    detail: `${failureClass}: ${verdict.reason || 'critic rejected'}`,
                }
                : {
                    kind: 'other',
                    detail: verdict.reason || 'critic rejected without failure class',
                });
        }
        const toolAcceptRate = toolSteps.length === 0 ? 1 : accepted / toolSteps.length;
        const baseScore = criteriaPassRate * 0.7 + toolAcceptRate * 0.3;
        // Efficiency bonus: the full 0.05 when no time was recorded,
        // shrinking as llmTimeMs + toolTimeMs grows.
        // NOTE(review): the bonus is added before the clamp to 1, so a run
        // with baseScore >= 0.95 can reach the same 1.0 as a perfect run.
        const totalMs = trace.llmTimeMs + trace.toolTimeMs;
        let efficiency = 0.05;
        if (totalMs > 0) {
            efficiency = Math.min(0.05, 1000 / (totalMs + 1000) * 0.05);
        }
        const score = Math.max(0, Math.min(1, baseScore + efficiency));
        const allCriteriaPass = criteriaResults.every((entry) => entry.passed);
        const pass = allCriteriaPass && failureModes.length === 0;
        const notes = pass
            ? 'all criteria passed; critic accepted every tool step'
            : `pass=${pass} criteria=${criteriaPassRate.toFixed(2)} tools=${toolAcceptRate.toFixed(2)}`;
        return {
            taskId: task.id,
            harnessId: trace.harnessId,
            pass,
            score,
            criteriaResults,
            failureModes,
            notes,
        };
    }
}
|
|
128
|
+
/**
 * Factory equivalent of `new CriticEvaluator(opts)`.
 *
 * @param opts Evaluator options; defaults to an empty object.
 * @returns A new CriticEvaluator instance.
 */
export function createCriticEvaluator(opts = {}) {
    const evaluator = new CriticEvaluator(opts);
    return evaluator;
}
|
|
131
|
+
//# sourceMappingURL=critic-evaluator.js.map
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Harness Evolution Loop — inner loop (Algorithm 1 from Sylph).
|
|
3
|
+
*
|
|
4
|
+
* for i in 1..maxIterations:
|
|
5
|
+
* trace = Worker.execute(task, harness)
|
|
6
|
+
* report = Evaluator.evaluate(trace, task)
|
|
7
|
+
* record = { iteration, harness, trace, report, verdict }
|
|
8
|
+
* history.push(record)
|
|
9
|
+
* if report.score > bestScore: best = harness
|
|
10
|
+
* if earlyStopScore reached on consecutive iterations: stop
|
|
11
|
+
* if regression > revertThreshold: revert harness to best
|
|
12
|
+
* harness = EvolutionAgent.evolve(history, best)
|
|
13
|
+
*
|
|
14
|
+
* Pure orchestration — Worker / Evaluator / EvolutionAgent are injected,
|
|
15
|
+
* which makes the whole loop deterministic and testable with stub
|
|
16
|
+
* implementations. No LLM calls happen here directly.
|
|
17
|
+
*/
|
|
18
|
+
import type { EvolutionProtocol, EvolutionRecord, EvolutionResult, Task } from './types.js';
|
|
19
|
+
/** Optional observation and persistence hooks for `runEvolutionLoop`. */
export interface RunOptions {
    /** Invoked (and awaited) after each record has been added to the in-memory history. */
    onRecord?: (record: EvolutionRecord) => void | Promise<void>;
    /** State directory; when set, `appendTrace` is called with it for every record. */
    persistDir?: string;
    /** Set to false to skip filesystem persistence even when `persistDir` is set. Default true. */
    persist?: boolean;
    /**
     * Number of consecutive iterations that must reach `earlyStopScore`
     * before the loop stops. Defaults to 1, i.e. the first hit ends the loop.
     */
    earlyStopStreak?: number;
}
|
|
33
|
+
/**
 * Run the inner Harness Evolution Loop for one task.
 *
 * Exceptions thrown by the Worker or the Evaluator do not reject the
 * returned promise: a failure trace / auto-fail report is recorded and the
 * loop carries on, so the result is always an `EvolutionResult`. (Scoring
 * failures is the Evaluator's job, not the loop's.)
 */
export declare function runEvolutionLoop(protocol: EvolutionProtocol, task: Task, options?: RunOptions): Promise<EvolutionResult>;
|
|
41
|
+
//# sourceMappingURL=evolution-loop.d.ts.map
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Harness Evolution Loop — inner loop (Algorithm 1 from Sylph).
|
|
3
|
+
*
|
|
4
|
+
* for i in 1..maxIterations:
|
|
5
|
+
* trace = Worker.execute(task, harness)
|
|
6
|
+
* report = Evaluator.evaluate(trace, task)
|
|
7
|
+
* record = { iteration, harness, trace, report, verdict }
|
|
8
|
+
* history.push(record)
|
|
9
|
+
* if report.score > bestScore: best = harness
|
|
10
|
+
* if earlyStopScore reached on consecutive iterations: stop
|
|
11
|
+
* if regression > revertThreshold: revert harness to best
|
|
12
|
+
* harness = EvolutionAgent.evolve(history, best)
|
|
13
|
+
*
|
|
14
|
+
* Pure orchestration — Worker / Evaluator / EvolutionAgent are injected,
|
|
15
|
+
* which makes the whole loop deterministic and testable with stub
|
|
16
|
+
* implementations. No LLM calls happen here directly.
|
|
17
|
+
*/
|
|
18
|
+
import { appendTrace } from './persistence.js';
|
|
19
|
+
/**
 * Classify the current score relative to the previous iteration's score.
 *
 * @param prev Score of the previous iteration (-Infinity before the first).
 * @param current Score of the current iteration.
 * @param revertThreshold Accepted for call-site compatibility. It does not
 *   influence the verdict: any drop, of whatever size, is 'regressed'.
 *   (The original code tested the threshold but returned 'regressed' on
 *   both sides of that test.)
 * @returns 'improved' when current > prev, 'no-op' when they are equal,
 *   otherwise 'regressed' (this includes a NaN score).
 */
function compareVerdict(prev, current, revertThreshold) {
    if (current > prev) {
        return 'improved';
    }
    if (current === prev) {
        return 'no-op';
    }
    return 'regressed';
}
|
|
30
|
+
/** Describe a thrown value as text: the message for Errors, String(value) otherwise. */
function describeError(err) {
    return err instanceof Error ? err.message : String(err);
}
/** Stand-in trace recorded when `worker.execute` throws, so the evaluator can still grade. */
function workerFailureTrace(task, harness, err) {
    return {
        taskId: task.id,
        harnessId: harness.id,
        steps: [
            {
                index: 0,
                phase: 'observe',
                action: 'worker-error',
                error: describeError(err),
                durationMs: 0,
            },
        ],
        finalState: {},
        llmTimeMs: 0,
        toolTimeMs: 0,
    };
}
/** Auto-fail report (score 0, every criterion failed) recorded when `evaluator.evaluate` throws. */
function evaluatorFailureReport(task, harness, err) {
    return {
        taskId: task.id,
        harnessId: harness.id,
        pass: false,
        score: 0,
        criteriaResults: task.acceptance.map((criterion) => ({
            criterion,
            passed: false,
            evidence: 'evaluator-error',
        })),
        failureModes: [
            {
                kind: 'other',
                detail: describeError(err),
            },
        ],
        notes: 'evaluator threw; auto-fail',
    };
}
/**
 * Run the inner Harness Evolution Loop against a single task.
 *
 * Each iteration executes the task with the current harness, evaluates the
 * trace, appends a record to the history, notifies `options.onRecord`,
 * optionally persists the record, tracks the best-scoring harness, checks
 * the early-stop streak and finally asks the evolution agent for the next
 * harness (skipped on the last iteration).
 *
 * Exceptions from the Worker, the Evaluator, `appendTrace` and the
 * evolution agent are absorbed: a failure trace / auto-fail report is
 * recorded, persistence is best-effort, and a failed evolve falls back to
 * the best harness. An exception thrown by `options.onRecord` is not
 * caught and rejects the returned promise.
 *
 * NOTE(review): `hyperparams.revertThreshold` is only forwarded to
 * `compareVerdict`. The original also assigned `harness = bestHarness` on
 * a large regression, but that value was always replaced by the evolve
 * result (or its fallback) before being used, so the store had no effect
 * and has been removed. The next harness is always derived from
 * `bestHarness`.
 *
 * @param protocol Supplies worker, evaluator, evolution agent, the initial
 *   harness and the hyperparameters (maxIterations, earlyStopScore,
 *   revertThreshold).
 * @param task Task to optimise against.
 * @param options Optional hooks and persistence settings.
 * @returns Task id, best harness, best score (0 if no score ever beat
 *   -Infinity) and the full history.
 */
export async function runEvolutionLoop(protocol, task, options = {}) {
    const { worker, evaluator, evolution, initialHarness, hyperparams } = protocol;
    // `| 0` truncates to an int32 and maps NaN/undefined to 0; at least one
    // iteration always runs.
    const maxIterations = Math.max(1, hyperparams.maxIterations | 0);
    const { earlyStopScore, revertThreshold } = hyperparams;
    const earlyStopStreak = Math.max(1, options.earlyStopStreak ?? 1);
    const shouldPersist = options.persist !== false && !!options.persistDir;
    const history = [];
    let harness = initialHarness;
    let bestHarness = initialHarness;
    let bestScore = -Infinity;
    let prevScore = -Infinity;
    let earlyHits = 0;
    for (let iteration = 1; iteration <= maxIterations; iteration++) {
        let trace;
        try {
            trace = await worker.execute(task, harness);
        }
        catch (err) {
            trace = workerFailureTrace(task, harness, err);
        }
        let report;
        try {
            report = await evaluator.evaluate(trace, task);
        }
        catch (err) {
            report = evaluatorFailureReport(task, harness, err);
        }
        const verdict = compareVerdict(prevScore, report.score, revertThreshold);
        const record = {
            iteration,
            harness,
            trace,
            report,
            verdict,
        };
        history.push(record);
        if (options.onRecord) {
            await options.onRecord(record);
        }
        if (shouldPersist) {
            try {
                await appendTrace(task.id, record, options.persistDir);
            }
            catch {
                // Persistence is best-effort; a disk failure must not stop evolution.
            }
        }
        // Track the best harness seen so far.
        if (report.score > bestScore) {
            bestScore = report.score;
            bestHarness = harness;
        }
        // Early stop once the score has reached the target on enough
        // consecutive iterations; any miss resets the streak.
        if (earlyStopScore !== undefined && report.score >= earlyStopScore) {
            earlyHits++;
            if (earlyHits >= earlyStopStreak) {
                break;
            }
        }
        else {
            earlyHits = 0;
        }
        prevScore = report.score;
        // No point evolving after the last iteration: the new harness would
        // never be executed.
        if (iteration === maxIterations) {
            break;
        }
        try {
            harness = await evolution.evolve(history, bestHarness);
        }
        catch {
            // The evolution agent failed: fall back to the best harness and keep going.
            harness = bestHarness;
        }
    }
    return {
        taskId: task.id,
        bestHarness,
        bestScore: bestScore === -Infinity ? 0 : bestScore,
        history,
    };
}
|
|
168
|
+
//# sourceMappingURL=evolution-loop.js.map
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Harness Evolution Loop — public surface.
|
|
3
|
+
*
|
|
4
|
+
* See `./README.md` for the high-level overview and `./types.ts` for the
|
|
5
|
+
* contract every other file in this directory targets.
|
|
6
|
+
*/
|
|
7
|
+
export * from './types.js';
|
|
8
|
+
export { runEvolutionLoop } from './evolution-loop.js';
|
|
9
|
+
export type { RunOptions } from './evolution-loop.js';
|
|
10
|
+
export { runMetaEvolution } from './meta-evolution.js';
|
|
11
|
+
export type { MetaOptions } from './meta-evolution.js';
|
|
12
|
+
export { CriticEvaluator, createCriticEvaluator, } from './critic-evaluator.js';
|
|
13
|
+
export type { CriticEvaluatorOpts } from './critic-evaluator.js';
|
|
14
|
+
export { NoopEvolutionAgent, createNoopEvolutionAgent, } from './noop-evolution.js';
|
|
15
|
+
export { appendTrace, readHistory, pruneOlderThan, defaultStateDir, } from './persistence.js';
|
|
16
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Harness Evolution Loop — public surface.
|
|
3
|
+
*
|
|
4
|
+
* See `./README.md` for the high-level overview and `./types.ts` for the
|
|
5
|
+
* contract every other file in this directory targets.
|
|
6
|
+
*/
|
|
7
|
+
export * from './types.js';
|
|
8
|
+
export { runEvolutionLoop } from './evolution-loop.js';
|
|
9
|
+
export { runMetaEvolution } from './meta-evolution.js';
|
|
10
|
+
export { CriticEvaluator, createCriticEvaluator, } from './critic-evaluator.js';
|
|
11
|
+
export { NoopEvolutionAgent, createNoopEvolutionAgent, } from './noop-evolution.js';
|
|
12
|
+
export { appendTrace, readHistory, pruneOlderThan, defaultStateDir, } from './persistence.js';
|
|
13
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Harness Meta-Evolution — outer loop (Algorithm 2 from Sylph).
|
|
3
|
+
*
|
|
4
|
+
* The inner loop optimizes one harness against one task. The outer loop
|
|
5
|
+
* runs the inner loop across a portfolio of tasks, aggregating per-task
|
|
6
|
+
* results and selecting the best protocol overall. Currently the
|
|
7
|
+
* "selection" step is averaging — when a real MetaEvolutionAgent ships,
|
|
8
|
+
* it'll consume the perTask EvolutionResult[] and propose protocol
|
|
9
|
+
* mutations.
|
|
10
|
+
*
|
|
11
|
+
* Pure orchestration. Tasks are run sequentially to keep the trace
|
|
12
|
+
* ordering deterministic; parallelism is a future concern.
|
|
13
|
+
*/
|
|
14
|
+
import type { EvolutionProtocol, EvolutionResult, MetaResult, Task } from './types.js';
|
|
15
|
+
import { type RunOptions } from './evolution-loop.js';
|
|
16
|
+
/** Options for `runMetaEvolution`; also accepts every inner-loop RunOptions field. */
export interface MetaOptions extends RunOptions {
    /** Invoked (and awaited) after each task's inner loop has finished. */
    onTaskComplete?: (result: EvolutionResult) => void | Promise<void>;
    /**
     * When `true`, stop the outer loop after the first task whose best
     * score is below `failBelow`. Has no effect unless `failBelow` is also
     * set. Default false: the whole portfolio is always run.
     */
    abortOnFailure?: boolean;
    /** Score floor used by `abortOnFailure`. */
    failBelow?: number;
}
|
|
26
|
+
/**
 * Run the inner Evolution Loop over a portfolio of tasks.
 *
 * Resolves to the best protocol (for now always the protocol that was
 * passed in, since no MetaEvolutionAgent exists yet), the per-task results
 * and the aggregate score.
 */
export declare function runMetaEvolution(protocol: EvolutionProtocol, tasks: Task[], options?: MetaOptions): Promise<MetaResult>;
|
|
32
|
+
//# sourceMappingURL=meta-evolution.d.ts.map
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Harness Meta-Evolution — outer loop (Algorithm 2 from Sylph).
|
|
3
|
+
*
|
|
4
|
+
* The inner loop optimizes one harness against one task. The outer loop
|
|
5
|
+
* runs the inner loop across a portfolio of tasks, aggregating per-task
|
|
6
|
+
* results and selecting the best protocol overall. Currently the
|
|
7
|
+
* "selection" step is averaging — when a real MetaEvolutionAgent ships,
|
|
8
|
+
* it'll consume the perTask EvolutionResult[] and propose protocol
|
|
9
|
+
* mutations.
|
|
10
|
+
*
|
|
11
|
+
* Pure orchestration. Tasks are run sequentially to keep the trace
|
|
12
|
+
* ordering deterministic; parallelism is a future concern.
|
|
13
|
+
*/
|
|
14
|
+
import { runEvolutionLoop } from './evolution-loop.js';
|
|
15
|
+
/**
 * Run the inner Evolution Loop over a portfolio of tasks, one task at a
 * time and in the order given.
 *
 * `bestProtocol` is always the protocol that was passed in (there is no
 * MetaEvolutionAgent yet). `bestMetaScore` is the mean of the per-task
 * best scores over the tasks that actually ran, or 0 when none did. With
 * `options.abortOnFailure` and `options.failBelow` both set, the loop
 * stops after the first task whose best score is below `failBelow`.
 *
 * @param protocol Protocol handed to every inner loop.
 * @param tasks Tasks to run.
 * @param options Forwarded to `runEvolutionLoop`; also carries the
 *   meta-level hooks.
 * @returns The best protocol, the aggregate score and per-task results.
 */
export async function runMetaEvolution(protocol, tasks, options = {}) {
    const perTask = [];
    if (tasks.length === 0) {
        return { bestProtocol: protocol, bestMetaScore: 0, perTask };
    }
    let total = 0;
    for (const task of tasks) {
        const outcome = await runEvolutionLoop(protocol, task, options);
        perTask.push(outcome);
        total += outcome.bestScore;
        if (options.onTaskComplete) {
            await options.onTaskComplete(outcome);
        }
        const belowFloor = options.failBelow !== undefined && outcome.bestScore < options.failBelow;
        if (options.abortOnFailure && belowFloor) {
            break;
        }
    }
    // Aggregate: plain mean of the per-task best scores.
    const bestMetaScore = perTask.length === 0 ? 0 : total / perTask.length;
    return { bestProtocol: protocol, bestMetaScore, perTask };
}
|
|
52
|
+
//# sourceMappingURL=meta-evolution.js.map
|