@kernel.chat/kbot 4.0.1 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/dist/futures/debate/index.d.ts +7 -0
  2. package/dist/futures/debate/index.js +6 -0
  3. package/dist/futures/debate/runner.d.ts +34 -0
  4. package/dist/futures/debate/runner.js +140 -0
  5. package/dist/futures/debate/synthesis.d.ts +25 -0
  6. package/dist/futures/debate/synthesis.js +81 -0
  7. package/dist/futures/debate/types.d.ts +72 -0
  8. package/dist/futures/debate/types.js +12 -0
  9. package/dist/futures/forecast/index.d.ts +5 -0
  10. package/dist/futures/forecast/index.js +5 -0
  11. package/dist/futures/forecast/projection.d.ts +31 -0
  12. package/dist/futures/forecast/projection.js +177 -0
  13. package/dist/futures/forecast/synthesize.d.ts +19 -0
  14. package/dist/futures/forecast/synthesize.js +89 -0
  15. package/dist/futures/forecast/types.d.ts +59 -0
  16. package/dist/futures/forecast/types.js +15 -0
  17. package/dist/futures/harness/critic-evaluator.d.ts +39 -0
  18. package/dist/futures/harness/critic-evaluator.js +131 -0
  19. package/dist/futures/harness/evolution-loop.d.ts +41 -0
  20. package/dist/futures/harness/evolution-loop.js +168 -0
  21. package/dist/futures/harness/index.d.ts +16 -0
  22. package/dist/futures/harness/index.js +13 -0
  23. package/dist/futures/harness/meta-evolution.d.ts +32 -0
  24. package/dist/futures/harness/meta-evolution.js +52 -0
  25. package/dist/futures/harness/noop-evolution.d.ts +23 -0
  26. package/dist/futures/harness/noop-evolution.js +29 -0
  27. package/dist/futures/harness/persistence.d.ts +30 -0
  28. package/dist/futures/harness/persistence.js +99 -0
  29. package/dist/futures/harness/types.d.ts +147 -0
  30. package/dist/futures/harness/types.js +18 -0
  31. package/dist/futures/index.d.ts +16 -0
  32. package/dist/futures/index.js +22 -0
  33. package/dist/futures/latent-state/envelope.d.ts +39 -0
  34. package/dist/futures/latent-state/envelope.js +178 -0
  35. package/dist/futures/latent-state/index.d.ts +5 -0
  36. package/dist/futures/latent-state/index.js +3 -0
  37. package/dist/futures/latent-state/types.d.ts +47 -0
  38. package/dist/futures/latent-state/types.js +13 -0
  39. package/dist/futures/persona/check.d.ts +45 -0
  40. package/dist/futures/persona/check.js +205 -0
  41. package/dist/futures/persona/index.d.ts +5 -0
  42. package/dist/futures/persona/index.js +5 -0
  43. package/dist/futures/persona/registry.d.ts +22 -0
  44. package/dist/futures/persona/registry.js +124 -0
  45. package/dist/futures/persona/types.d.ts +68 -0
  46. package/dist/futures/persona/types.js +28 -0
  47. package/dist/futures/skill-graph/graph.d.ts +31 -0
  48. package/dist/futures/skill-graph/graph.js +151 -0
  49. package/dist/futures/skill-graph/index.d.ts +13 -0
  50. package/dist/futures/skill-graph/index.js +10 -0
  51. package/dist/futures/skill-graph/synthesis.d.ts +20 -0
  52. package/dist/futures/skill-graph/synthesis.js +83 -0
  53. package/dist/futures/skill-graph/types.d.ts +53 -0
  54. package/dist/futures/skill-graph/types.js +19 -0
  55. package/dist/tools/forecast-summary.d.ts +25 -0
  56. package/dist/tools/forecast-summary.js +204 -0
  57. package/dist/tools/swarm-2026-04.js +2 -0
  58. package/package.json +1 -1
@@ -0,0 +1,89 @@
1
+ // futures/forecast/synthesize — fan a list of Signals into Forecasts and
2
+ // produce human-readable summaries. Pure formatting; no IO.
3
+ import { bestProjection, clampHorizon, signalHistory } from './projection.js';
4
/**
 * Build one forecast per signal for the requested horizon.
 *
 * A signal is left out when `clampHorizon(horizon, signalHistory(signal))`
 * is falsy. The surviving forecasts are ordered by |trend.slope|, largest
 * first, so the fastest-moving signals lead the list.
 *
 * @param {Iterable<object>} signals - Signals to project.
 * @param {string} horizon - Horizon key handed to the projection helpers.
 * @returns {Array<object>} Forecasts, steepest absolute slope first.
 */
export function synthesizeForecasts(signals, horizon) {
    const forecasts = [];
    for (const signal of signals) {
        if (clampHorizon(horizon, signalHistory(signal))) {
            forecasts.push(bestProjection(signal, horizon));
        }
    }
    const steepness = (forecast) => Math.abs(forecast.trend.slope);
    // Array#sort sorts in place and returns the same array.
    return forecasts.sort((left, right) => steepness(right) - steepness(left));
}
20
/**
 * Compact, human-readable rendering of a number.
 *
 * - non-finite input            -> '—'
 * - |n| >= 1,000,000            -> one decimal with an 'M' suffix
 * - |n| >= 1,000                -> one decimal with a 'k' suffix ('.0' dropped)
 * - |n| >= 10 / >= 1 / below 1  -> 0 / 1 / 2 decimals respectively
 *
 * A value just under one million whose thousands figure rounds up to
 * 1000.0 is promoted to the 'M' form, so the output is '1.0M' and never
 * '1000k'.
 *
 * @param {number} n - Value to format.
 * @returns {string} Formatted text.
 */
function formatNumber(n) {
    if (!Number.isFinite(n))
        return '—';
    const abs = Math.abs(n);
    const millions = () => `${(n / 1_000_000).toFixed(1)}M`;
    if (abs >= 1_000_000)
        return millions();
    if (abs >= 1_000) {
        const thousands = (n / 1_000).toFixed(1);
        // Rounding can carry across the unit boundary (999,999 -> "1000.0").
        if (Math.abs(Number(thousands)) >= 1_000)
            return millions();
        return `${thousands}k`.replace('.0k', 'k');
    }
    if (abs >= 10)
        return n.toFixed(0);
    if (abs >= 1)
        return n.toFixed(1);
    return n.toFixed(2);
}
34
/** Phrase used in forecast text for each supported horizon key. */
const HORIZON_LABEL = {
    '1d': 'in 1 day',
    '7d': 'in 7 days',
    '30d': 'in 30 days',
    '90d': 'in 90 days',
};
/** Emoji marker for each trend direction. */
const ARROW = { up: '📈', down: '📉', flat: '➖' };
/**
 * Choose the direction marker for a trend.
 *
 * A 'flat' kind always gets the flat marker. Otherwise the sign of the
 * slope decides, and a slope that is neither positive nor negative
 * (zero, NaN) also falls back to the flat marker.
 *
 * @param {number} slope - Fitted slope.
 * @param {string} kind - Trend kind ('flat' short-circuits).
 * @returns {string} One of the ARROW values.
 */
function arrowFor(slope, kind) {
    const moving = kind !== 'flat';
    if (moving && slope > 0) {
        return ARROW.up;
    }
    if (moving && slope < 0) {
        return ARROW.down;
    }
    return ARROW.flat;
}
54
/**
 * Render a forecast as a single Markdown line.
 *
 * Shape: `<arrow> <signal> → <point> (<horizon label>, <fit>, <interval>)`
 * where <fit> is 'flat' for flat trends and `<kind>, r²=<r2>` otherwise,
 * and <interval> is half the bound width ('±?' when that is not finite).
 *
 * Example: `📈 npm downloads → 14.2k (in 30 days, linear, r²=0.78, ±890)`
 *
 * @param {object} f - Forecast to format.
 * @returns {string} The formatted line.
 */
export function formatForecast(f) {
    const { trend, signal, horizon, pointEstimate, lowerBound, upperBound } = f;
    const marker = arrowFor(trend.slope, trend.kind);
    const estimate = formatNumber(pointEstimate);
    const spread = (upperBound - lowerBound) / 2;
    const spreadText = Number.isFinite(spread) ? `±${formatNumber(spread)}` : '±?';
    let fitText = 'flat';
    if (trend.kind !== 'flat') {
        fitText = `${trend.kind}, r²=${trend.r2.toFixed(2)}`;
    }
    return `${marker} ${signal} → ${estimate} (${HORIZON_LABEL[horizon]}, ${fitText}, ${spreadText})`;
}
66
/**
 * Summarise a list of forecasts as one short sentence naming the three
 * with the largest absolute slope.
 *
 * The input is ranked on a copy before the top three are taken, so the
 * result does not depend on the caller having pre-sorted the list and the
 * caller's array is never reordered. The sort is stable, so input that is
 * already ordered by |slope| keeps its order.
 *
 * @param {Array<object>} forecasts - Forecasts to describe.
 * @returns {string} 'Top movers: ….' or a no-data sentence when the list is empty.
 */
export function narrative(forecasts) {
    if (forecasts.length === 0) {
        return 'No forecasts available — not enough signal history to project.';
    }
    const ranked = [...forecasts].sort((a, b) => Math.abs(b.trend.slope) - Math.abs(a.trend.slope));
    const phrases = ranked.slice(0, 3).map((f) => {
        let direction = 'is trending down';
        if (f.trend.kind === 'flat') {
            direction = 'is holding flat';
        }
        else if (f.trend.slope > 0) {
            direction = 'is trending up';
        }
        const point = formatNumber(f.pointEstimate);
        const conf = `${(f.confidence * 100).toFixed(0)}% conf`;
        return `${f.signal} ${direction} toward ${point} ${HORIZON_LABEL[f.horizon]} (${conf})`;
    });
    // One phrase stands alone; otherwise join with '; ' and put '; and ' before the last.
    const head = phrases.length === 1
        ? phrases[0]
        : `${phrases.slice(0, -1).join('; ')}; and ${phrases[phrases.length - 1]}`;
    return `Top movers: ${head}.`;
}
89
+ //# sourceMappingURL=synthesize.js.map
@@ -0,0 +1,59 @@
1
+ /**
2
+ * A time series. Timestamps are ms since epoch (matches Date.now()).
3
+ * Values are arbitrary scalars (counts, percentages, etc.).
4
+ * Order is not required; projection.ts sorts internally.
5
+ */
6
+ export interface Signal {
7
+ name: string;
8
+ values: Array<{
9
+ ts: number;
10
+ value: number;
11
+ }>;
12
+ }
13
+ /**
14
+ * Trend describes the shape of the fit chosen for a Signal.
15
+ * - 'linear' : value ~ a + b*t (slope = b in value/ms)
16
+ * - 'exponential' : value ~ exp(a + b*t) (slope = b in log(value)/ms)
17
+ * - 'flat' : low-variance fallback; slope == 0, r2 == 0
18
+ *
19
+ * r2 is the coefficient of determination on whatever space was fit
20
+ * (raw values for linear, log(values) for exponential).
21
+ */
22
+ export interface Trend {
23
+ kind: 'linear' | 'exponential' | 'flat';
24
+ slope: number;
25
+ r2: number;
26
+ }
27
+ /**
28
+ * Projection horizon. Always relative to the last observed timestamp.
29
+ * 1d/7d/30d/90d cover the practical surface (daily through quarterly).
30
+ */
31
+ export type Horizon = '1d' | '7d' | '30d' | '90d';
32
+ /**
33
+ * A single forecast for a single signal at a single horizon.
34
+ *
35
+ * - pointEstimate is the model's expected value at horizon end
36
+ * - lowerBound / upperBound is roughly a 95% interval
37
+ * (point ± 2 * residual stddev, exp-transformed for exponential fits)
38
+ * - confidence is a 0..1 score derived from r2 and history length;
39
+ * higher means "trust this projection more"
40
+ * - method is a short human-readable tag, e.g. "linear-lsq", "exp-loglin",
41
+ * "flat-mean"
42
+ */
43
+ export interface Forecast {
44
+ signal: string;
45
+ horizon: Horizon;
46
+ trend: Trend;
47
+ pointEstimate: number;
48
+ lowerBound: number;
49
+ upperBound: number;
50
+ confidence: number;
51
+ method: string;
52
+ }
53
+ /**
54
+ * Number of milliseconds in each horizon. Exported as a const for use by
55
+ * projection.ts and synthesize.ts; not exported as a value-typed enum so
56
+ * callers can keep using string literals.
57
+ */
58
+ export declare const HORIZON_MS: Record<Horizon, number>;
59
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1,15 @@
1
+ // futures/forecast — type definitions for projecting growth/research signals
2
+ // forward in time. Pure types: no runtime, no IO. Consumers (synthesize.ts,
3
+ // projection.ts) build Forecasts from raw Signal arrays.
4
/** Milliseconds in one day: 24 h × 60 min × 60 s × 1000 ms. */
const MS_PER_DAY = 24 * 60 * 60 * 1000;
/**
 * Length of each horizon in milliseconds, keyed by the horizon's string
 * literal ('1d', '7d', '30d', '90d'). A plain object is used so callers
 * can keep indexing with string literals.
 */
export const HORIZON_MS = {
    '1d': MS_PER_DAY,
    '7d': 7 * MS_PER_DAY,
    '30d': 30 * MS_PER_DAY,
    '90d': 90 * MS_PER_DAY,
};
15
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1,39 @@
1
+ /**
2
+ * CriticEvaluator — adapts `critic-gate.ts` to the harness `Evaluator`
3
+ * interface from `types.ts`.
4
+ *
5
+ * critic-gate.ts gates a single tool result. The harness Evaluator grades
6
+ * an entire trace against a Task's acceptance criteria. This adapter
7
+ * walks the trace's tool steps, runs each through `gateToolResult`, and
8
+ * aggregates the per-step verdicts into a single EvaluationReport:
9
+ *
10
+ * - `pass` = every acceptance criterion satisfied AND no critic rejected
11
+ * a tool step
12
+ * - `score` = (criteriaPassRate * 0.7) + (toolAcceptRate * 0.3),
13
+ * efficiency-tiebroken by total step time
14
+ * - failureModes derived from critic verdicts' `failure_class` (RF-NN-*)
15
+ *
16
+ * Acceptance criteria are matched via case-insensitive substring against
17
+ * the trace's flattened `output | error | finalState | action` text.
18
+ * That keeps the adapter dependency-free; richer matchers can subclass.
19
+ *
20
+ * critic-gate.ts is NOT modified — this is a pure consumer.
21
+ */
22
+ import type { EvaluationReport, Evaluator, ExecutionTrace, Task } from './types.js';
23
+ import type { CriticVerdict, GateOpts } from '../../critic-gate.js';
24
+ export interface CriticEvaluatorOpts {
25
+ /** Forwarded to `gateToolResult` — strictness, provider, llmClient stub. */
26
+ gate?: GateOpts;
27
+ /**
28
+ * If set, replaces `gateToolResult`. Lets tests inject a fully synchronous
29
+ * decision function and skip the critic-gate provider plumbing entirely.
30
+ */
31
+ gateFn?: (tool: string, args: Record<string, unknown>, result: unknown) => Promise<CriticVerdict> | CriticVerdict;
32
+ }
33
+ export declare class CriticEvaluator implements Evaluator {
34
+ private readonly opts;
35
+ constructor(opts?: CriticEvaluatorOpts);
36
+ evaluate(trace: ExecutionTrace, task: Task): Promise<EvaluationReport>;
37
+ }
38
+ export declare function createCriticEvaluator(opts?: CriticEvaluatorOpts): Evaluator;
39
+ //# sourceMappingURL=critic-evaluator.d.ts.map
@@ -0,0 +1,131 @@
1
+ /**
2
+ * CriticEvaluator — adapts `critic-gate.ts` to the harness `Evaluator`
3
+ * interface from `types.ts`.
4
+ *
5
+ * critic-gate.ts gates a single tool result. The harness Evaluator grades
6
+ * an entire trace against a Task's acceptance criteria. This adapter
7
+ * walks the trace's tool steps, runs each through `gateToolResult`, and
8
+ * aggregates the per-step verdicts into a single EvaluationReport:
9
+ *
10
+ * - `pass` = every acceptance criterion satisfied AND no critic rejected
11
+ * a tool step
12
+ * - `score` = (criteriaPassRate * 0.7) + (toolAcceptRate * 0.3),
13
+ * efficiency-tiebroken by total step time
14
+ * - failureModes derived from critic verdicts' `failure_class` (RF-NN-*)
15
+ *
16
+ * Acceptance criteria are matched via case-insensitive substring against
17
+ * the trace's flattened `output | error | finalState | action` text.
18
+ * That keeps the adapter dependency-free; richer matchers can subclass.
19
+ *
20
+ * critic-gate.ts is NOT modified — this is a pure consumer.
21
+ */
22
+ import { gateToolResult } from '../../critic-gate.js';
23
/** Lookup from RF taxonomy class to harness FailureMode kind. */
const RF_FAILURE_KINDS = new Map([
    ['RF-01-fabricated-evidence', 'hallucinated-state'],
    ['RF-10-simulation-role-confusion', 'hallucinated-state'],
    ['RF-02-metric-interpretation', 'misinterpreted-state'],
    ['RF-03-confused-provenance', 'misinterpreted-state'],
    ['RF-04-temporal-misordering', 'misinterpreted-state'],
    ['RF-12-repetition-failure-to-resume', 'reasoning-loop'],
    ['RF-08-evidential-insufficiency', 'missing-capability'],
    ['RF-11-excessive-speculation', 'missing-capability'],
    ['RF-16-arithmetic-error', 'incorrect-tool-usage'],
    ['RF-14-invalid-inference-pattern', 'incorrect-tool-usage'],
    ['RF-15-internal-contradiction', 'incorrect-tool-usage'],
]);
/**
 * Translate an RF taxonomy class into a harness FailureMode kind.
 *
 * @param {string} rf - RF class identifier, e.g. 'RF-16-arithmetic-error'.
 * @returns {string} The mapped kind, or 'other' for any class not in the table.
 */
function rfToFailureKind(rf) {
    return RF_FAILURE_KINDS.get(rf) ?? 'other';
}
46
/**
 * Collapse a trace into one newline-separated string for text matching.
 *
 * For every step, its truthy `action`, `output` and `error` values are
 * appended in that order, followed by the JSON form of `trace.finalState`.
 * A final state that cannot be serialised is silently left out.
 *
 * @param {object} trace - Trace with `steps` and `finalState`.
 * @returns {string} The flattened text.
 */
function flattenTrace(trace) {
    const fragments = [];
    for (const step of trace.steps) {
        for (const text of [step.action, step.output, step.error]) {
            if (text) {
                fragments.push(text);
            }
        }
    }
    try {
        fragments.push(JSON.stringify(trace.finalState));
    }
    catch {
        /* final state is not serialisable; match on the step text only */
    }
    return fragments.join('\n');
}
64
/**
 * Evaluator that grades a whole trace by combining acceptance-criteria
 * matching with a per-tool-step critic verdict.
 */
export class CriticEvaluator {
    opts;
    /**
     * @param {object} [opts] - Options; `gateFn` replaces `gateToolResult`
     *   when set, and `gate` is forwarded to `gateToolResult` otherwise.
     */
    constructor(opts = {}) {
        this.opts = opts;
    }
    /**
     * Grade `trace` against `task`.
     *
     * - Each acceptance criterion passes when it occurs, case-insensitively,
     *   as a substring of the flattened trace.
     * - Each step with `phase === 'tool'` is sent to the critic, one at a
     *   time, using `output ?? error ?? ''` as the result payload and an
     *   empty args object.
     * - score = criteriaPassRate * 0.7 + toolAcceptRate * 0.3, plus a time
     *   bonus of at most 0.05, clamped to [0, 1].
     * - pass = every criterion matched and no tool step was rejected.
     *
     * @param {object} trace - Execution trace to grade.
     * @param {object} task - Task providing `id` and `acceptance`.
     * @returns {Promise<object>} The evaluation report.
     */
    async evaluate(trace, task) {
        const haystack = flattenTrace(trace).toLowerCase();
        const criteriaResults = task.acceptance.map((criterion) => {
            const passed = haystack.includes(criterion.toLowerCase());
            const evidence = passed ? 'substring match in trace' : 'no match in flattened trace';
            return { criterion, passed, evidence };
        });
        const metCount = criteriaResults.reduce((count, c) => (c.passed ? count + 1 : count), 0);
        // With no criteria at all the pass rate counts as perfect.
        const criteriaPassRate = criteriaResults.length === 0 ? 1 : metCount / criteriaResults.length;
        const toolSteps = trace.steps.filter((s) => s.phase === 'tool');
        const failureModes = [];
        let toolAccepts = 0;
        for (const step of toolSteps) {
            const payload = step.output ?? step.error ?? '';
            let verdict;
            if (this.opts.gateFn) {
                verdict = await this.opts.gateFn(step.action, {}, payload);
            }
            else {
                verdict = await gateToolResult(step.action, {}, payload, this.opts.gate);
            }
            if (verdict.accept) {
                toolAccepts++;
                continue;
            }
            const rf = verdict.failure_class;
            failureModes.push(rf
                ? { kind: rfToFailureKind(rf), detail: `${rf}: ${verdict.reason || 'critic rejected'}` }
                : { kind: 'other', detail: verdict.reason || 'critic rejected without failure class' });
        }
        // A trace without tool steps counts as fully accepted.
        const toolAcceptRate = toolSteps.length === 0 ? 1 : toolAccepts / toolSteps.length;
        const baseScore = criteriaPassRate * 0.7 + toolAcceptRate * 0.3;
        // Tiebreaker: the bonus shrinks as total time grows and is capped at 0.05.
        const totalMs = trace.llmTimeMs + trace.toolTimeMs;
        let efficiency = 0.05;
        if (totalMs > 0) {
            efficiency = Math.min(0.05, 1000 / (totalMs + 1000) * 0.05);
        }
        const score = Math.max(0, Math.min(1, baseScore + efficiency));
        const pass = criteriaResults.every((c) => c.passed) && failureModes.length === 0;
        const notes = pass
            ? 'all criteria passed; critic accepted every tool step'
            : `pass=${pass} criteria=${criteriaPassRate.toFixed(2)} tools=${toolAcceptRate.toFixed(2)}`;
        return {
            taskId: task.id,
            harnessId: trace.harnessId,
            pass,
            score,
            criteriaResults,
            failureModes,
            notes,
        };
    }
}
/**
 * Factory for `CriticEvaluator`.
 *
 * @param {object} [opts] - Same options as the constructor.
 * @returns {CriticEvaluator} A new evaluator.
 */
export function createCriticEvaluator(opts = {}) {
    return new CriticEvaluator(opts);
}
131
+ //# sourceMappingURL=critic-evaluator.js.map
@@ -0,0 +1,41 @@
1
+ /**
2
+ * Harness Evolution Loop — inner loop (Algorithm 1 from Sylph).
3
+ *
4
+ * for i in 1..maxIterations:
5
+ * trace = Worker.execute(task, harness)
6
+ * report = Evaluator.evaluate(trace, task)
7
+ * record = { iteration, harness, trace, report, verdict }
8
+ * history.push(record)
9
+ * if report.score > bestScore: best = harness
10
+ * if earlyStopScore reached on consecutive iterations: stop
11
+ * if regression >= revertThreshold: revert harness to best
12
+ * harness = EvolutionAgent.evolve(history, best)
13
+ *
14
+ * Pure orchestration — Worker / Evaluator / EvolutionAgent are injected,
15
+ * which makes the whole loop deterministic and testable with stub
16
+ * implementations. No LLM calls happen here directly.
17
+ */
18
+ import type { EvolutionProtocol, EvolutionRecord, EvolutionResult, Task } from './types.js';
19
+ /** Optional hooks the caller can plug in to observe / persist each step. */
20
+ export interface RunOptions {
21
+ /** Called after each record is appended to in-memory history. */
22
+ onRecord?: (record: EvolutionRecord) => void | Promise<void>;
23
+ /** When set, `appendTrace` is invoked under this state dir for every record. */
24
+ persistDir?: string;
25
+ /** When false, skip filesystem persistence even if `persistDir` is set. Default true. */
26
+ persist?: boolean;
27
+ /**
28
+ * How many consecutive iterations must hit `earlyStopScore` to stop.
29
+ * Defaults to 1 — first hit ends the loop.
30
+ */
31
+ earlyStopStreak?: number;
32
+ }
33
+ /**
34
+ * Run the inner Harness Evolution Loop against a single task.
35
+ *
36
+ * Always returns an `EvolutionResult` — never throws on Worker / Evaluator
37
+ * exceptions; instead, records a failure step and continues. (The
38
+ * Evaluator is supposed to score failures, not the loop itself.)
39
+ */
40
+ export declare function runEvolutionLoop(protocol: EvolutionProtocol, task: Task, options?: RunOptions): Promise<EvolutionResult>;
41
+ //# sourceMappingURL=evolution-loop.d.ts.map
@@ -0,0 +1,168 @@
1
+ /**
2
+ * Harness Evolution Loop — inner loop (Algorithm 1 from Sylph).
3
+ *
4
+ * for i in 1..maxIterations:
5
+ * trace = Worker.execute(task, harness)
6
+ * report = Evaluator.evaluate(trace, task)
7
+ * record = { iteration, harness, trace, report, verdict }
8
+ * history.push(record)
9
+ * if report.score > bestScore: best = harness
10
+ * if earlyStopScore reached on consecutive iterations: stop
11
+ * if regression >= revertThreshold: revert harness to best
12
+ * harness = EvolutionAgent.evolve(history, best)
13
+ *
14
+ * Pure orchestration — Worker / Evaluator / EvolutionAgent are injected,
15
+ * which makes the whole loop deterministic and testable with stub
16
+ * implementations. No LLM calls happen here directly.
17
+ */
18
+ import { appendTrace } from './persistence.js';
19
/**
 * Classify how the current score compares with the previous one.
 *
 * @param {number} prev - Score from the previous iteration.
 * @param {number} current - Score from the current iteration.
 * @param {number} [revertThreshold] - Accepted for call-site compatibility
 *   but not consulted: every drop is reported as 'regressed' whatever its
 *   size.
 * @returns {'improved' | 'no-op' | 'regressed'} 'improved' when the score
 *   rose, 'no-op' when it is strictly equal, otherwise 'regressed' (which
 *   includes a NaN score, since it compares neither greater nor equal).
 */
function compareVerdict(prev, current, revertThreshold) {
    if (current > prev)
        return 'improved';
    if (current === prev)
        return 'no-op';
    return 'regressed';
}
30
/** Text form of a thrown value: an Error's message, otherwise String(value). */
function describeFailure(err) {
    return err instanceof Error ? err.message : String(err);
}
/** Minimal single-step trace standing in for a worker run that threw. */
function workerFailureTrace(task, harness, err) {
    const errorStep = {
        index: 0,
        phase: 'observe',
        action: 'worker-error',
        error: describeFailure(err),
        durationMs: 0,
    };
    return {
        taskId: task.id,
        harnessId: harness.id,
        steps: [errorStep],
        finalState: {},
        llmTimeMs: 0,
        toolTimeMs: 0,
    };
}
/** Zero-score report that fails every criterion, used when the evaluator threw. */
function evaluatorFailureReport(task, harness, err) {
    return {
        taskId: task.id,
        harnessId: harness.id,
        pass: false,
        score: 0,
        criteriaResults: task.acceptance.map((criterion) => ({
            criterion,
            passed: false,
            evidence: 'evaluator-error',
        })),
        failureModes: [{ kind: 'other', detail: describeFailure(err) }],
        notes: 'evaluator threw; auto-fail',
    };
}
/**
 * Run the inner Harness Evolution Loop for one task.
 *
 * Each iteration executes the worker, evaluates the trace, appends a
 * record to the history, notifies `options.onRecord`, optionally persists
 * the record, updates the best harness, and then asks the evolution agent
 * for the next harness (skipped after the final iteration).
 *
 * Exceptions from the worker or the evaluator are converted into a
 * failure trace / zero-score report, and exceptions from persistence or
 * the evolution agent are absorbed. An exception from `options.onRecord`
 * is not caught here.
 *
 * @param {object} protocol - `{ worker, evaluator, evolution, initialHarness, hyperparams }`.
 * @param {object} task - Task to optimise against (`id`, `acceptance`).
 * @param {object} [options] - `onRecord`, `persistDir`, `persist`, `earlyStopStreak`.
 * @returns {Promise<object>} `{ taskId, bestHarness, bestScore, history }`.
 */
export async function runEvolutionLoop(protocol, task, options = {}) {
    const { worker, evaluator, evolution, initialHarness, hyperparams } = protocol;
    const { earlyStopScore, revertThreshold } = hyperparams;
    // `| 0` truncates to a 32-bit integer; at least one iteration always runs.
    const maxIterations = Math.max(1, hyperparams.maxIterations | 0);
    const earlyStopStreak = Math.max(1, options.earlyStopStreak ?? 1);
    const shouldPersist = options.persist !== false && !!options.persistDir;
    const history = [];
    let harness = initialHarness;
    let bestHarness = initialHarness;
    let bestScore = -Infinity;
    let prevScore = -Infinity;
    let earlyHits = 0;
    for (let iteration = 1; iteration <= maxIterations; iteration++) {
        let trace;
        try {
            trace = await worker.execute(task, harness);
        }
        catch (err) {
            // Give the evaluator something to grade even though the worker failed.
            trace = workerFailureTrace(task, harness, err);
        }
        let report;
        try {
            report = await evaluator.evaluate(trace, task);
        }
        catch (err) {
            report = evaluatorFailureReport(task, harness, err);
        }
        const verdict = compareVerdict(prevScore, report.score, revertThreshold);
        const record = { iteration, harness, trace, report, verdict };
        history.push(record);
        if (options.onRecord) {
            await options.onRecord(record);
        }
        if (shouldPersist) {
            try {
                await appendTrace(task.id, record, options.persistDir);
            }
            catch {
                // Best-effort: a disk failure must not stop the evolution run.
            }
        }
        if (report.score > bestScore) {
            bestScore = report.score;
            bestHarness = harness;
        }
        // Count consecutive iterations at or above the target; any miss resets the streak.
        const hitTarget = earlyStopScore !== undefined && report.score >= earlyStopScore;
        earlyHits = hitTarget ? earlyHits + 1 : 0;
        if (hitTarget && earlyHits >= earlyStopStreak) {
            break;
        }
        const droppedTooFar = revertThreshold !== undefined && bestScore - report.score >= revertThreshold;
        if (droppedTooFar) {
            harness = bestHarness;
        }
        prevScore = report.score;
        // Evolving after the last iteration would produce a harness that never runs.
        if (iteration === maxIterations) {
            break;
        }
        try {
            harness = await evolution.evolve(history, bestHarness);
        }
        catch {
            // The evolution agent failed: continue from the best harness so far.
            harness = bestHarness;
        }
    }
    // bestScore stays at -Infinity when no score ever compared greater; report 0 then.
    return {
        taskId: task.id,
        bestHarness,
        bestScore: bestScore === -Infinity ? 0 : bestScore,
        history,
    };
}
168
+ //# sourceMappingURL=evolution-loop.js.map
@@ -0,0 +1,16 @@
1
+ /**
2
+ * Harness Evolution Loop — public surface.
3
+ *
4
+ * See `./README.md` for the high-level overview and `./types.ts` for the
5
+ * contract every other file in this directory targets.
6
+ */
7
+ export * from './types.js';
8
+ export { runEvolutionLoop } from './evolution-loop.js';
9
+ export type { RunOptions } from './evolution-loop.js';
10
+ export { runMetaEvolution } from './meta-evolution.js';
11
+ export type { MetaOptions } from './meta-evolution.js';
12
+ export { CriticEvaluator, createCriticEvaluator, } from './critic-evaluator.js';
13
+ export type { CriticEvaluatorOpts } from './critic-evaluator.js';
14
+ export { NoopEvolutionAgent, createNoopEvolutionAgent, } from './noop-evolution.js';
15
+ export { appendTrace, readHistory, pruneOlderThan, defaultStateDir, } from './persistence.js';
16
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Harness Evolution Loop — public surface.
3
+ *
4
+ * See `./README.md` for the high-level overview and `./types.ts` for the
5
+ * contract every other file in this directory targets.
6
+ */
7
+ export * from './types.js';
8
+ export { runEvolutionLoop } from './evolution-loop.js';
9
+ export { runMetaEvolution } from './meta-evolution.js';
10
+ export { CriticEvaluator, createCriticEvaluator, } from './critic-evaluator.js';
11
+ export { NoopEvolutionAgent, createNoopEvolutionAgent, } from './noop-evolution.js';
12
+ export { appendTrace, readHistory, pruneOlderThan, defaultStateDir, } from './persistence.js';
13
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1,32 @@
1
+ /**
2
+ * Harness Meta-Evolution — outer loop (Algorithm 2 from Sylph).
3
+ *
4
+ * The inner loop optimizes one harness against one task. The outer loop
5
+ * runs the inner loop across a portfolio of tasks, aggregating per-task
6
+ * results and selecting the best protocol overall. Currently the
7
+ * "selection" step is averaging — when a real MetaEvolutionAgent ships,
8
+ * it'll consume the perTask EvolutionResult[] and propose protocol
9
+ * mutations.
10
+ *
11
+ * Pure orchestration. Tasks are run sequentially to keep the trace
12
+ * ordering deterministic; parallelism is a future concern.
13
+ */
14
+ import type { EvolutionProtocol, EvolutionResult, MetaResult, Task } from './types.js';
15
+ import { type RunOptions } from './evolution-loop.js';
16
+ export interface MetaOptions extends RunOptions {
17
+ /** Called after each task's inner loop completes. */
18
+ onTaskComplete?: (result: EvolutionResult) => void | Promise<void>;
19
+ /**
20
+ * When `true`, abort the outer loop on the first task whose best score
21
+ * is below `failBelow`. Default false — always run the full portfolio.
22
+ */
23
+ abortOnFailure?: boolean;
24
+ failBelow?: number;
25
+ }
26
+ /**
27
+ * Run the inner Evolution Loop across a portfolio of tasks, returning
28
+ * the best protocol (currently always the input protocol — there is no
29
+ * MetaEvolutionAgent yet) plus per-task results and the aggregate score.
30
+ */
31
+ export declare function runMetaEvolution(protocol: EvolutionProtocol, tasks: Task[], options?: MetaOptions): Promise<MetaResult>;
32
+ //# sourceMappingURL=meta-evolution.d.ts.map
@@ -0,0 +1,52 @@
1
+ /**
2
+ * Harness Meta-Evolution — outer loop (Algorithm 2 from Sylph).
3
+ *
4
+ * The inner loop optimizes one harness against one task. The outer loop
5
+ * runs the inner loop across a portfolio of tasks, aggregating per-task
6
+ * results and selecting the best protocol overall. Currently the
7
+ * "selection" step is averaging — when a real MetaEvolutionAgent ships,
8
+ * it'll consume the perTask EvolutionResult[] and propose protocol
9
+ * mutations.
10
+ *
11
+ * Pure orchestration. Tasks are run sequentially to keep the trace
12
+ * ordering deterministic; parallelism is a future concern.
13
+ */
14
+ import { runEvolutionLoop } from './evolution-loop.js';
15
/**
 * Run the inner evolution loop once per task, in order, and aggregate.
 *
 * The returned `bestProtocol` is always the input protocol, and
 * `bestMetaScore` is the mean of the per-task best scores over the tasks
 * that actually ran (0 when none ran). When `options.abortOnFailure` is
 * set and `options.failBelow` is defined, the portfolio stops after the
 * first task whose best score is below `failBelow`; that task is still
 * included in the result.
 *
 * @param {object} protocol - Protocol handed unchanged to every inner loop.
 * @param {Array<object>} tasks - Task portfolio.
 * @param {object} [options] - Inner-loop options plus `onTaskComplete`,
 *   `abortOnFailure` and `failBelow`.
 * @returns {Promise<object>} `{ bestProtocol, bestMetaScore, perTask }`.
 */
export async function runMetaEvolution(protocol, tasks, options = {}) {
    const perTask = [];
    if (tasks.length === 0) {
        return { bestProtocol: protocol, bestMetaScore: 0, perTask };
    }
    for (const task of tasks) {
        const result = await runEvolutionLoop(protocol, task, options);
        perTask.push(result);
        if (options.onTaskComplete) {
            await options.onTaskComplete(result);
        }
        const belowFloor = options.failBelow !== undefined && result.bestScore < options.failBelow;
        if (options.abortOnFailure && belowFloor) {
            break;
        }
    }
    // Mean of per-task best scores, summed in task order.
    const scoreSum = perTask.reduce((sum, result) => sum + result.bestScore, 0);
    const meanScore = perTask.length > 0 ? scoreSum / perTask.length : 0;
    return {
        bestProtocol: protocol,
        bestMetaScore: meanScore,
        perTask,
    };
}
52
+ //# sourceMappingURL=meta-evolution.js.map