@vextlabs/theron-agent-sdk 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +59 -0
- package/LICENSE +21 -0
- package/README.md +270 -0
- package/dist/adapters/theron.cjs +92 -0
- package/dist/adapters/theron.d.cts +42 -0
- package/dist/adapters/theron.d.ts +42 -0
- package/dist/adapters/theron.js +89 -0
- package/dist/agent/index.cjs +33 -0
- package/dist/agent/index.d.cts +84 -0
- package/dist/agent/index.d.ts +84 -0
- package/dist/agent/index.js +31 -0
- package/dist/council/index.cjs +68 -0
- package/dist/council/index.d.cts +96 -0
- package/dist/council/index.d.ts +96 -0
- package/dist/council/index.js +66 -0
- package/dist/index.cjs +1288 -0
- package/dist/index.d.cts +60 -0
- package/dist/index.d.ts +60 -0
- package/dist/index.js +1244 -0
- package/dist/loop/index.cjs +106 -0
- package/dist/loop/index.d.cts +285 -0
- package/dist/loop/index.d.ts +285 -0
- package/dist/loop/index.js +95 -0
- package/dist/mcp/index.cjs +153 -0
- package/dist/mcp/index.d.cts +69 -0
- package/dist/mcp/index.d.ts +69 -0
- package/dist/mcp/index.js +150 -0
- package/dist/memory/index.cjs +53 -0
- package/dist/memory/index.d.cts +73 -0
- package/dist/memory/index.d.ts +73 -0
- package/dist/memory/index.js +50 -0
- package/dist/patterns/index.cjs +159 -0
- package/dist/patterns/index.d.cts +200 -0
- package/dist/patterns/index.d.ts +200 -0
- package/dist/patterns/index.js +150 -0
- package/dist/receipts/index.cjs +151 -0
- package/dist/receipts/index.d.cts +132 -0
- package/dist/receipts/index.d.ts +132 -0
- package/dist/receipts/index.js +146 -0
- package/dist/runtime/index.cjs +205 -0
- package/dist/runtime/index.d.cts +148 -0
- package/dist/runtime/index.d.ts +148 -0
- package/dist/runtime/index.js +203 -0
- package/dist/session/index.cjs +49 -0
- package/dist/session/index.d.cts +79 -0
- package/dist/session/index.d.ts +79 -0
- package/dist/session/index.js +47 -0
- package/dist/tools/index.cjs +51 -0
- package/dist/tools/index.d.cts +52 -0
- package/dist/tools/index.d.ts +52 -0
- package/dist/tools/index.js +46 -0
- package/dist/verifiers/index.cjs +96 -0
- package/dist/verifiers/index.d.cts +63 -0
- package/dist/verifiers/index.d.ts +63 -0
- package/dist/verifiers/index.js +93 -0
- package/examples/01_code_reviewer.ts +90 -0
- package/examples/02_research_assistant.ts +85 -0
- package/examples/03_council_of_three.ts +91 -0
- package/examples/_adapters/openrouter.ts +90 -0
- package/examples/adapters/openrouter.ts +144 -0
- package/examples/adapters/theron.ts +105 -0
- package/examples/basic-agent.ts +56 -0
- package/examples/council-deliberation.ts +90 -0
- package/examples/cyber-recon-bot.ts +163 -0
- package/examples/loop-primitives.ts +50 -0
- package/examples/meeting-prep-bot.ts +172 -0
- package/examples/reasoning-patterns.ts +125 -0
- package/examples/support-triage-bot.ts +181 -0
- package/examples/verifier-kernel.ts +108 -0
- package/package.json +154 -0
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
interface MemoryRecord {
|
|
2
|
+
/** Unique identifier. */
|
|
3
|
+
id: string;
|
|
4
|
+
/** Key for retrieval. */
|
|
5
|
+
key: string;
|
|
6
|
+
/** The actual content. */
|
|
7
|
+
value: string;
|
|
8
|
+
/** Tenant scope (multi-tenant deployments). Always filter on this in
|
|
9
|
+
* production — there is no implicit isolation. */
|
|
10
|
+
tenant_id?: string;
|
|
11
|
+
/** Optional tags for categorization. */
|
|
12
|
+
tags?: string[];
|
|
13
|
+
/** When this memory was created. */
|
|
14
|
+
created_at: number;
|
|
15
|
+
/** When this memory was last accessed (for LRU eviction). */
|
|
16
|
+
last_accessed_at: number;
|
|
17
|
+
/** Optional embedding for semantic search (1024-dim float32). */
|
|
18
|
+
embedding?: number[];
|
|
19
|
+
}
|
|
20
|
+
interface MemoryQuery {
|
|
21
|
+
/** Filter by tenant. */
|
|
22
|
+
tenant_id?: string;
|
|
23
|
+
/** Exact key lookup. */
|
|
24
|
+
key?: string;
|
|
25
|
+
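/** Tag-based filter. InMemoryStore keeps records that carry at least one of these tags (any-of, not all-of); an empty array applies no filter. */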
|
|
26
|
+
tags?: string[];
|
|
27
|
+
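/** Semantic search by similarity to this query string. InMemoryStore only approximates this with a case-insensitive substring match on `value`; it does not use `embedding`. */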
|
|
28
|
+
semantic_query?: string;
|
|
29
|
+
/** Max records to return. */
|
|
30
|
+
limit?: number;
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* Memory — abstract interface; implementations plug in any backend.
|
|
34
|
+
*
|
|
35
|
+
* Built-in implementations:
|
|
36
|
+
* - InMemoryStore (default; for development)
|
|
37
|
+
* - For production, plug in pgvector, R2, SQLite, etc. by extending Memory.
|
|
38
|
+
*
|
|
39
|
+
* Minimal usage:
|
|
40
|
+
* const mem = new InMemoryStore();
|
|
41
|
+
* await mem.set({
|
|
42
|
+
* key: "user_name", value: "Annalea",
|
|
43
|
+
* created_at: Date.now(), last_accessed_at: Date.now(),
|
|
44
|
+
* });
|
|
45
|
+
* const records = await mem.query({ key: "user_name" });
|
|
46
|
+
*/
|
|
47
|
+
declare abstract class Memory {
|
|
48
|
+
abstract set(record: Omit<MemoryRecord, "id"> & {
|
|
49
|
+
id?: string;
|
|
50
|
+
}): Promise<MemoryRecord>;
|
|
51
|
+
abstract get(id: string): Promise<MemoryRecord | undefined>;
|
|
52
|
+
abstract query(q: MemoryQuery): Promise<MemoryRecord[]>;
|
|
53
|
+
abstract delete(id: string): Promise<void>;
|
|
54
|
+
}
|
|
55
|
+
/**
|
|
56
|
+
* InMemoryStore — default Memory implementation. Volatile; for development.
|
|
57
|
+
* Production should swap in a persistent backend.
|
|
58
|
+
*
|
|
59
|
+
* NOTE: this implementation does NOT enforce tenant isolation at the storage
|
|
60
|
+
* layer — `query({ tenant_id })` filters but does not partition. Production
|
|
61
|
+
* backends should partition by tenant at the storage layer.
|
|
62
|
+
*/
|
|
63
|
+
declare class InMemoryStore extends Memory {
|
|
64
|
+
private records;
|
|
65
|
+
set(record: Omit<MemoryRecord, "id"> & {
|
|
66
|
+
id?: string;
|
|
67
|
+
}): Promise<MemoryRecord>;
|
|
68
|
+
get(id: string): Promise<MemoryRecord | undefined>;
|
|
69
|
+
query(q: MemoryQuery): Promise<MemoryRecord[]>;
|
|
70
|
+
delete(id: string): Promise<void>;
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
export { InMemoryStore, Memory, type MemoryQuery, type MemoryRecord };
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
// src/memory/index.ts
|
|
2
|
+
/**
 * Memory — base class for memory backends. It carries no behaviour of its own
 * at runtime; concrete stores subclass it and supply set/get/query/delete.
 */
var Memory = class {
};
/**
 * InMemoryStore — Map-backed Memory implementation. Records live only in this
 * instance's `records` map, so nothing survives the instance.
 */
var InMemoryStore = class extends Memory {
  /** Record storage, keyed by record id. */
  records = /* @__PURE__ */ new Map();
  /**
   * Insert a record, or overwrite the record that already has the same id.
   *
   * @param record Record fields. `id`, `created_at` and `last_accessed_at`
   *   are filled in when they are null/undefined.
   * @returns The stored record (the same object that is kept in the map).
   */
  async set(record) {
    const id = record.id ?? `mem_${Math.random().toString(36).slice(2)}_${Date.now()}`;
    const full = {
      ...record,
      id,
      created_at: record.created_at ?? Date.now(),
      last_accessed_at: record.last_accessed_at ?? Date.now()
    };
    this.records.set(id, full);
    return full;
  }
  /**
   * Look a record up by id and stamp it as just accessed.
   *
   * @param id Record id.
   * @returns The stored record, or `undefined` when the id is unknown.
   */
  async get(id) {
    const r = this.records.get(id);
    if (r) {
      r.last_accessed_at = Date.now();
    }
    return r;
  }
  /**
   * Filter records. Every filter that is present must match (AND):
   * `tenant_id` and `key` are exact matches, `tags` matches records carrying
   * at least one of the given tags, and `semantic_query` is a case-insensitive
   * substring match on `value`.
   *
   * @param q Query filters plus an optional `limit`.
   * @returns Matching records, most recently accessed first. Unlike `get`,
   *   this does not update `last_accessed_at`.
   */
  async query(q) {
    let results = Array.from(this.records.values());
    if (q.tenant_id !== void 0) {
      results = results.filter((r) => r.tenant_id === q.tenant_id);
    }
    if (q.key !== void 0) {
      results = results.filter((r) => r.key === q.key);
    }
    if (q.tags !== void 0 && q.tags.length > 0) {
      results = results.filter((r) => r.tags?.some((t) => q.tags.includes(t)));
    }
    if (q.semantic_query !== void 0) {
      const ql = q.semantic_query.toLowerCase();
      results = results.filter((r) => r.value.toLowerCase().includes(ql));
    }
    results.sort((a, b) => b.last_accessed_at - a.last_accessed_at);
    if (q.limit !== void 0) {
      // A negative end index makes Array#slice count back from the END of the
      // array (slice(0, -1) returns everything but the last record), so a
      // negative limit is floored at zero: "at most N records" never means
      // "all but N".
      results = results.slice(0, Math.max(0, q.limit));
    }
    return results;
  }
  /**
   * Remove a record by id. Unknown ids are ignored.
   *
   * @param id Record id.
   */
  async delete(id) {
    this.records.delete(id);
  }
};
|
|
49
|
+
|
|
50
|
+
export { InMemoryStore, Memory };
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// src/patterns/index.ts
|
|
4
|
+
/**
 * Self-consistency voting: draw `opts.samples` values from `opts.generate`,
 * group them by `opts.key` (default: `JSON.stringify` of the value) and return
 * the most frequent group.
 *
 * Samples are generated sequentially. `null`/`undefined` samples are skipped
 * and do not count towards `total`. On a tie the cluster that was seen first
 * wins.
 *
 * @param opts `{ samples, generate, key? }`.
 * @returns `{ answer, consistency, votes, total, clusters }`, where
 *   `consistency` is `votes / total` rounded to 3 decimals and `clusters` is
 *   sorted most-voted first.
 * @throws Error when every sample was `null`/`undefined`.
 */
async function selfConsistency(opts) {
  // Math.max(1, NaN) is NaN, which would skip the loop entirely and surface as
  // a misleading "generate produced no values" error. Any non-finite sample
  // count therefore falls back to the documented minimum of one sample.
  const requested = Math.floor(opts.samples);
  const n = Number.isFinite(requested) ? Math.max(1, requested) : 1;
  const keyOf = opts.key ?? ((v) => JSON.stringify(v));
  const clusters = /* @__PURE__ */ new Map();
  let total = 0;
  for (let i = 0; i < n; i++) {
    const v = await opts.generate(i);
    if (v === void 0 || v === null) continue;
    total += 1;
    const k = keyOf(v);
    // The first value seen for a key is kept as that cluster's representative.
    const c = clusters.get(k) ?? { count: 0, sample: v };
    c.count += 1;
    clusters.set(k, c);
  }
  if (total === 0) throw new Error("selfConsistency: generate produced no values");
  const ranked = [...clusters.entries()].map(([key, { count, sample }]) => ({ key, count, sample })).sort((a, b) => b.count - a.count);
  const winner = ranked[0];
  return {
    answer: winner.sample,
    consistency: Math.round(winner.count / total * 1e3) / 1e3,
    votes: winner.count,
    total,
    clusters: ranked
  };
}
|
|
29
|
+
/**
 * Best-of-N selection: generate `opts.n` candidates one after another, score
 * each with `opts.score`, and return the highest-scoring candidate.
 *
 * On equal scores the earliest candidate wins. A `NaN` score is treated as
 * worse than any real score, so an unscorable candidate is only returned when
 * no candidate received a real score.
 *
 * @param opts `{ n, generate, score }`.
 * @returns `{ best, score, index, scores }`, where `scores` lists every
 *   candidate's score in generation order.
 */
async function bestOfN(opts) {
  // Math.max(1, NaN) is NaN and would skip the loop; a non-finite count falls
  // back to the documented minimum of one candidate.
  const requested = Math.floor(opts.n);
  const n = Number.isFinite(requested) ? Math.max(1, requested) : 1;
  let best = null;
  const scores = [];
  for (let i = 0; i < n; i++) {
    const v = await opts.generate(i);
    const s = await opts.score(v, i);
    scores.push(s);
    // `s > NaN` is always false, so a NaN-scored incumbent would otherwise be
    // unbeatable. Let any real score displace it.
    const displacesNaN = best !== null && Number.isNaN(best.score) && !Number.isNaN(s);
    if (best === null || s > best.score || displacesNaN) {
      best = { value: v, score: s, index: i };
    }
  }
  // Defensive: with n >= 1 the loop always runs, so `best` is set by here.
  if (!best) throw new Error("bestOfN: generate produced no candidates");
  return { best: best.value, score: best.score, index: best.index, scores };
}
|
|
42
|
+
// Default "nothing to fix" detector used when `opts.isClean` is not supplied:
// a case-insensitive, word-bounded match on the listed phrases.
var DEFAULT_CLEAN = /\b(no (issues|problems|flaws|changes)|looks good|lgtm|nothing to (fix|improve))\b/i;
/**
 * Self-refine loop: draft once, then repeat critique -> revise up to
 * `opts.maxIters` times (default 2), stopping early as soon as a critique is
 * judged clean.
 *
 * @param opts `{ draft, critique, revise, maxIters?, isClean? }`. Critiques
 *   are coerced to strings before they are tested and recorded.
 * @returns `{ answer, iterations, revised, trace }`. `iterations` is the
 *   number of critiques performed, `revised` the number of revisions applied,
 *   and `trace` has one entry per critique.
 */
async function selfRefine(opts) {
  // Math.max(1, NaN) is NaN and `iter <= NaN` is false, which would return the
  // raw draft without a single critique. Non-finite values fall back to the
  // documented minimum of one iteration.
  const requested = Math.floor(opts.maxIters ?? 2);
  const maxIters = Number.isFinite(requested) ? Math.max(1, requested) : 1;
  const isClean = opts.isClean ?? ((c) => DEFAULT_CLEAN.test(c));
  let value = await opts.draft();
  const trace = [];
  let revised = 0;
  for (let iter = 1; iter <= maxIters; iter++) {
    const critique = String(await opts.critique(value, iter));
    if (isClean(critique)) {
      // A clean critique is still recorded, but no revision is made.
      trace.push({ iter, critique, revised: false });
      break;
    }
    value = await opts.revise(value, critique, iter);
    revised += 1;
    trace.push({ iter, critique, revised: true });
  }
  return { answer: value, iterations: trace.length, revised, trace };
}
|
|
61
|
+
/**
 * Greedy step-by-step search: for each of `opts.depth` steps, generate
 * `opts.breadth` candidates with `opts.expand`, score each with `opts.score`,
 * and append only the highest-scoring one to the path.
 *
 * Candidates are produced and scored sequentially. On equal scores the
 * earliest candidate of the step wins. A `NaN` score is treated as worse than
 * any real score. The callbacks receive the live path array, which grows
 * between steps.
 *
 * @param opts `{ breadth, depth, expand, score, synthesize? }`.
 * @returns `{ answer, path }`. `path` holds the chosen `{ thought, score }`
 *   per step. `answer` is `opts.synthesize(thoughts)` when supplied, otherwise
 *   the last chosen thought.
 */
async function treeOfThoughts(opts) {
  // Math.max(1, NaN) is NaN, which would skip the loops and return an
  // `undefined` answer. Non-finite values fall back to the documented minimum.
  const atLeastOne = (raw) => {
    const floored = Math.floor(raw);
    return Number.isFinite(floored) ? Math.max(1, floored) : 1;
  };
  const breadth = atLeastOne(opts.breadth);
  const depth = atLeastOne(opts.depth);
  const path = [];
  const scored = [];
  for (let d = 0; d < depth; d++) {
    let best = null;
    for (let b = 0; b < breadth; b++) {
      const cand = await opts.expand(path, b);
      const s = await opts.score(cand, path);
      // `s > NaN` is always false, so a NaN-scored incumbent would otherwise
      // be unbeatable. Let any real score displace it.
      const displacesNaN = best !== null && Number.isNaN(best.score) && !Number.isNaN(s);
      if (best === null || s > best.score || displacesNaN) {
        best = { thought: cand, score: s };
      }
    }
    // breadth >= 1 guarantees at least one candidate per step, so `best` is set.
    path.push(best.thought);
    scored.push(best);
  }
  const answer = opts.synthesize ? await opts.synthesize(path) : path[path.length - 1];
  return { answer, path: scored };
}
|
|
80
|
+
/**
 * Chain-of-verification: produce a draft, ask `opts.planChecks` for
 * verification questions about it, answer each question in turn with
 * `opts.answerCheck`, then hand the draft and the question/answer pairs to
 * `opts.revise`.
 *
 * When `planChecks` yields no questions (empty, `null` or `undefined`), the
 * draft is returned unrevised and `revise` is never called.
 *
 * @param opts `{ draft, planChecks, answerCheck, revise }`.
 * @returns `{ answer, checks }`, where each check is `{ q, a }` and `a` has
 *   been coerced to a string.
 */
async function chainOfVerification(opts) {
  const initial = await opts.draft();
  const planned = await opts.planChecks(initial);
  const checks = [];
  for (const question of planned ?? []) {
    const reply = await opts.answerCheck(question);
    checks.push({ q: question, a: String(reply) });
  }
  if (checks.length === 0) {
    return { answer: initial, checks };
  }
  const answer = await opts.revise(initial, checks);
  return { answer, checks };
}
|
|
90
|
+
/**
 * Reflexion loop: attempt -> evaluate -> reflect -> retry, for at most
 * `opts.maxAttempts` attempts.
 *
 * Each attempt receives the reflections gathered so far. After a failed
 * attempt, `opts.reflect` is called and its (string-coerced) result is
 * appended, except after the final attempt, where a reflection could no
 * longer be used.
 *
 * @param opts `{ maxAttempts, attempt, evaluate, reflect }`.
 * @returns `{ answer, attempts, succeeded, reflections }`. On success
 *   `attempts` is the 1-based index of the winning attempt. On failure
 *   `answer` is the last attempt's result and `attempts` equals the clamped
 *   attempt budget.
 */
async function reflexion(opts) {
  // Math.max(1, NaN) is NaN, which would skip the loop and report
  // `{ answer: undefined, attempts: NaN }`. Non-finite values fall back to the
  // documented minimum of one attempt.
  const requested = Math.floor(opts.maxAttempts);
  const maxAttempts = Number.isFinite(requested) ? Math.max(1, requested) : 1;
  const reflections = [];
  let last;
  for (let i = 0; i < maxAttempts; i++) {
    last = await opts.attempt(reflections, i);
    const { success, feedback } = await opts.evaluate(last, i);
    if (success) return { answer: last, attempts: i + 1, succeeded: true, reflections };
    // Skip reflecting after the last attempt: there is no retry to feed it to.
    if (i < maxAttempts - 1) reflections.push(String(await opts.reflect(last, feedback, i)));
  }
  return { answer: last, attempts: maxAttempts, succeeded: false, reflections };
}
|
|
102
|
+
/**
 * Layered mixture-of-agents: layer 1 collects one proposal per agent; each
 * later layer asks `opts.refine` for a new answer per agent, given the OTHER
 * agents' answers from the previous layer; finally `opts.aggregate` merges the
 * last layer into one answer.
 *
 * All calls are made sequentially and every proposal, refinement and the final
 * answer is coerced to a string. If `opts.refine` is not supplied, refinement
 * layers are skipped and the proposals are aggregated directly.
 *
 * @param opts `{ agents, layers, propose, refine?, aggregate }`.
 * @returns `{ answer, layerOutputs }`, where `layerOutputs[0]` holds the
 *   layer-1 proposals and each further entry one refinement layer.
 */
async function mixtureOfAgents(opts) {
  // Math.max(1, NaN) is NaN, which would skip the loops and aggregate an empty
  // list. Non-finite values fall back to the documented minimum of 1.
  const atLeastOne = (raw) => {
    const floored = Math.floor(raw);
    return Number.isFinite(floored) ? Math.max(1, floored) : 1;
  };
  const agents = atLeastOne(opts.agents);
  const layers = atLeastOne(opts.layers);
  const layerOutputs = [];
  let current = [];
  for (let a = 0; a < agents; a++) current.push(String(await opts.propose(a)));
  // Snapshots are copied so later layers cannot alias earlier entries.
  layerOutputs.push([...current]);
  for (let layer = 2; layer <= layers; layer++) {
    if (!opts.refine) break;
    const next = [];
    for (let a = 0; a < agents; a++) {
      const others = current.filter((_, i) => i !== a);
      next.push(String(await opts.refine(a, others, layer)));
    }
    current = next;
    layerOutputs.push([...current]);
  }
  const answer = String(await opts.aggregate(current));
  return { answer, layerOutputs };
}
|
|
122
|
+
// Arithmetic mean of a list; an empty list averages to 0.
var mean = (xs) => {
  if (xs.length === 0) return 0;
  let sum = 0;
  for (const x of xs) sum = sum + x;
  return sum / xs.length;
};
// Round to three decimal places.
var round = (x) => Math.round(x * 1000) / 1000;
/**
 * Compare a treatment against a baseline over `opts.tasks`.
 *
 * For each task, in order: run `opts.baseline`, run `opts.treatment`, then
 * score the baseline output and the treatment output with `opts.score`. All
 * calls are awaited one at a time.
 *
 * @param opts `{ tasks, baseline, treatment, score }`.
 * @returns `{ n, baselineMean, treatmentMean, lift, winRate, perTask }`. All
 *   numbers are rounded to 3 decimals. `lift` is the difference of the
 *   already-rounded means. `winRate` is the fraction of tasks whose raw
 *   treatment score is strictly higher than the raw baseline score (0 for an
 *   empty task list).
 */
async function measureLift(opts) {
  const baseScores = [];
  const treatScores = [];
  const perTask = [];
  for (let index = 0; index < opts.tasks.length; index++) {
    const task = opts.tasks[index];
    const baselineOutput = await opts.baseline(task, index);
    const treatmentOutput = await opts.treatment(task, index);
    const baselineScore = await opts.score(task, baselineOutput, index);
    const treatmentScore = await opts.score(task, treatmentOutput, index);
    baseScores.push(baselineScore);
    treatScores.push(treatmentScore);
    perTask.push({
      task,
      baseline: round(baselineScore),
      treatment: round(treatmentScore),
      delta: round(treatmentScore - baselineScore)
    });
  }
  // Wins are counted on the raw (unrounded) scores.
  const wins = treatScores.filter((score, index) => score > baseScores[index]).length;
  const baselineMean = round(mean(baseScores));
  const treatmentMean = round(mean(treatScores));
  const total = opts.tasks.length;
  return {
    n: total,
    baselineMean,
    treatmentMean,
    lift: round(treatmentMean - baselineMean),
    winRate: total ? round(wins / total) : 0,
    perTask
  };
}
|
|
151
|
+
|
|
152
|
+
exports.bestOfN = bestOfN;
|
|
153
|
+
exports.chainOfVerification = chainOfVerification;
|
|
154
|
+
exports.measureLift = measureLift;
|
|
155
|
+
exports.mixtureOfAgents = mixtureOfAgents;
|
|
156
|
+
exports.reflexion = reflexion;
|
|
157
|
+
exports.selfConsistency = selfConsistency;
|
|
158
|
+
exports.selfRefine = selfRefine;
|
|
159
|
+
exports.treeOfThoughts = treeOfThoughts;
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
interface SelfConsistencyOptions<T> {
|
|
2
|
+
/** How many independent samples to draw (clamped to >= 1). */
|
|
3
|
+
samples: number;
|
|
4
|
+
/** Produce sample `i` (0-based). Vary your prompt/temperature by `i`. */
|
|
5
|
+
generate: (i: number) => Promise<T> | T;
|
|
6
|
+
/** Cluster key for "same answer" — defaults to JSON of the value. Provide a
|
|
7
|
+
* normalizer (e.g. lowercase/trim) for free-text answers. */
|
|
8
|
+
key?: (value: T) => string;
|
|
9
|
+
}
|
|
10
|
+
interface SelfConsistencyResult<T> {
|
|
11
|
+
/** The majority-consistent answer. */
|
|
12
|
+
answer: T;
|
|
13
|
+
/** Agreement ratio of the winning cluster in [0,1] — a reliability signal. */
|
|
14
|
+
consistency: number;
|
|
15
|
+
/** Votes for the winner. */
|
|
16
|
+
votes: number;
|
|
17
|
+
/** Total samples that produced a value. */
|
|
18
|
+
total: number;
|
|
19
|
+
/** All clusters, most-voted first. */
|
|
20
|
+
clusters: Array<{
|
|
21
|
+
key: string;
|
|
22
|
+
count: number;
|
|
23
|
+
sample: T;
|
|
24
|
+
}>;
|
|
25
|
+
}
|
|
26
|
+
/** Sample N reasoning paths and return the majority-consistent answer plus the
|
|
27
|
+
* agreement ratio (Self-Consistency; Wang et al., 2022). */
|
|
28
|
+
declare function selfConsistency<T>(opts: SelfConsistencyOptions<T>): Promise<SelfConsistencyResult<T>>;
|
|
29
|
+
interface BestOfNOptions<T> {
|
|
30
|
+
/** How many candidates to generate (clamped to >= 1). */
|
|
31
|
+
n: number;
|
|
32
|
+
generate: (i: number) => Promise<T> | T;
|
|
33
|
+
/** Score a candidate — higher is better (e.g. a verifier confidence). */
|
|
34
|
+
score: (value: T, i: number) => Promise<number> | number;
|
|
35
|
+
}
|
|
36
|
+
interface BestOfNResult<T> {
|
|
37
|
+
best: T;
|
|
38
|
+
score: number;
|
|
39
|
+
index: number;
|
|
40
|
+
scores: number[];
|
|
41
|
+
}
|
|
42
|
+
/** Verifier-guided best-of-N: generate N candidates, score each, return the
|
|
43
|
+
* highest-scoring one. The score function is where you plug a verifier. */
|
|
44
|
+
declare function bestOfN<T>(opts: BestOfNOptions<T>): Promise<BestOfNResult<T>>;
|
|
45
|
+
interface SelfRefineOptions<T> {
|
|
46
|
+
draft: () => Promise<T> | T;
|
|
47
|
+
/** Critique the current value — return concrete flaws (or a clean signal). */
|
|
48
|
+
critique: (value: T, iter: number) => Promise<string> | string;
|
|
49
|
+
/** Revise the value to address the critique. */
|
|
50
|
+
revise: (value: T, critique: string, iter: number) => Promise<T> | T;
|
|
51
|
+
/** Max critique→revise iterations (default 2, clamped to >= 1). */
|
|
52
|
+
maxIters?: number;
|
|
53
|
+
/** Return true when a critique signals "nothing to fix" — stops early
|
|
54
|
+
* (saves work). Default: case-insensitive match on "no issues/problems/flaws/changes", "looks good", "lgtm", or "nothing to fix/improve". */
|
|
55
|
+
isClean?: (critique: string) => boolean;
|
|
56
|
+
}
|
|
57
|
+
interface SelfRefineResult<T> {
|
|
58
|
+
answer: T;
|
|
59
|
+
iterations: number;
|
|
60
|
+
revised: number;
|
|
61
|
+
trace: Array<{
|
|
62
|
+
iter: number;
|
|
63
|
+
critique: string;
|
|
64
|
+
revised: boolean;
|
|
65
|
+
}>;
|
|
66
|
+
}
|
|
67
|
+
/** Iteratively self-correct: draft, critique, revise — stopping early when the
|
|
68
|
+
* critique is clean (Self-Refine; Madaan et al., 2023). */
|
|
69
|
+
declare function selfRefine<T>(opts: SelfRefineOptions<T>): Promise<SelfRefineResult<T>>;
|
|
70
|
+
interface TreeOfThoughtsOptions<T> {
|
|
71
|
+
/** Candidate thoughts explored per depth step; the single best is retained
|
|
72
|
+
* (greedy, not a beam width). Clamped to >= 1. */
|
|
73
|
+
breadth: number;
|
|
74
|
+
/** Search depth (clamped to >= 1). */
|
|
75
|
+
depth: number;
|
|
76
|
+
/** Produce branch `b` of the current path. */
|
|
77
|
+
expand: (path: T[], b: number) => Promise<T> | T;
|
|
78
|
+
/** Score a candidate thought given the path so far — higher is better. */
|
|
79
|
+
score: (candidate: T, path: T[]) => Promise<number> | number;
|
|
80
|
+
/** Turn the winning path into a final answer (default: the last thought). */
|
|
81
|
+
synthesize?: (path: T[]) => Promise<T> | T;
|
|
82
|
+
}
|
|
83
|
+
interface TreeOfThoughtsResult<T> {
|
|
84
|
+
answer: T;
|
|
85
|
+
path: Array<{
|
|
86
|
+
thought: T;
|
|
87
|
+
score: number;
|
|
88
|
+
}>;
|
|
89
|
+
}
|
|
90
|
+
/** GREEDY best-first reasoning search (NOT beam search): at each depth, expand
|
|
91
|
+
* `breadth` candidate thoughts, keep ONLY the single highest-scored, and continue
|
|
92
|
+
* for `depth` steps, then synthesize the winning path. `breadth` controls how many
|
|
93
|
+
* candidates are explored per step (one is retained) — it is not a beam width.
|
|
94
|
+
* Inspired by Tree of Thoughts (Yao et al., 2023). */
|
|
95
|
+
declare function treeOfThoughts<T>(opts: TreeOfThoughtsOptions<T>): Promise<TreeOfThoughtsResult<T>>;
|
|
96
|
+
interface ChainOfVerificationOptions<T> {
|
|
97
|
+
draft: () => Promise<T> | T;
|
|
98
|
+
/** Verification questions probing the draft's claims. */
|
|
99
|
+
planChecks: (draft: T) => Promise<string[]> | string[];
|
|
100
|
+
/** Answer one verification question INDEPENDENTLY (no sight of the draft). */
|
|
101
|
+
answerCheck: (question: string) => Promise<string> | string;
|
|
102
|
+
/** Revise the draft to drop/correct claims the checks didn't support. */
|
|
103
|
+
revise: (draft: T, checks: Array<{
|
|
104
|
+
q: string;
|
|
105
|
+
a: string;
|
|
106
|
+
}>) => Promise<T> | T;
|
|
107
|
+
}
|
|
108
|
+
interface ChainOfVerificationResult<T> {
|
|
109
|
+
answer: T;
|
|
110
|
+
checks: Array<{
|
|
111
|
+
q: string;
|
|
112
|
+
a: string;
|
|
113
|
+
}>;
|
|
114
|
+
}
|
|
115
|
+
/** Draft → generate verification questions → answer each independently →
|
|
116
|
+
* revise. Targeted hallucination reduction (Chain-of-Verification; Dhuliawala
|
|
117
|
+
* et al., 2023). */
|
|
118
|
+
declare function chainOfVerification<T>(opts: ChainOfVerificationOptions<T>): Promise<ChainOfVerificationResult<T>>;
|
|
119
|
+
interface ReflexionOptions<T> {
|
|
120
|
+
/** Max attempts (clamped to >= 1). */
|
|
121
|
+
maxAttempts: number;
|
|
122
|
+
/** Attempt the task, with prior verbal reflections available as context. */
|
|
123
|
+
attempt: (reflections: string[], i: number) => Promise<T> | T;
|
|
124
|
+
/** Did the attempt succeed? Return success + feedback for reflection. */
|
|
125
|
+
evaluate: (result: T, i: number) => Promise<{
|
|
126
|
+
success: boolean;
|
|
127
|
+
feedback: string;
|
|
128
|
+
}> | {
|
|
129
|
+
success: boolean;
|
|
130
|
+
feedback: string;
|
|
131
|
+
};
|
|
132
|
+
/** Write a verbal reflection on the failure to carry into the next attempt. */
|
|
133
|
+
reflect: (result: T, feedback: string, i: number) => Promise<string> | string;
|
|
134
|
+
}
|
|
135
|
+
interface ReflexionResult<T> {
|
|
136
|
+
answer: T;
|
|
137
|
+
attempts: number;
|
|
138
|
+
succeeded: boolean;
|
|
139
|
+
/** The verbal reflections accumulated across failed attempts. */
|
|
140
|
+
reflections: string[];
|
|
141
|
+
}
|
|
142
|
+
/** Verbal reinforcement: attempt → evaluate → reflect → retry, carrying the
|
|
143
|
+
* accumulated reflections into each next attempt; stops on success or attempt
|
|
144
|
+
* budget (Reflexion; Shinn et al., 2023). Distinct from self-refine — it learns
|
|
145
|
+
* from the OUTCOME (success/feedback), not just the output's surface quality. */
|
|
146
|
+
declare function reflexion<T>(opts: ReflexionOptions<T>): Promise<ReflexionResult<T>>;
|
|
147
|
+
interface MixtureOfAgentsOptions {
|
|
148
|
+
/** Number of proposer agents per layer (clamped to >= 1). */
|
|
149
|
+
agents: number;
|
|
150
|
+
/** Refinement layers — layer 1 proposes, layers 2..L refine (clamped to >= 1). */
|
|
151
|
+
layers: number;
|
|
152
|
+
/** Layer-1 proposal from agent `a`. Vary persona/temperature by `a`. */
|
|
153
|
+
propose: (agent: number) => Promise<string> | string;
|
|
154
|
+
/** Refine agent `a`'s answer given the OTHER agents' prior-layer answers.
|
|
155
|
+
* Optional: only called for layers >= 2, so single-layer (propose → aggregate)
|
|
156
|
+
* usage need not supply it. */
|
|
157
|
+
refine?: (agent: number, others: string[], layer: number) => Promise<string> | string;
|
|
158
|
+
/** Synthesize the final layer's answers into one. */
|
|
159
|
+
aggregate: (finalLayer: string[]) => Promise<string> | string;
|
|
160
|
+
}
|
|
161
|
+
interface MixtureOfAgentsResult {
|
|
162
|
+
answer: string;
|
|
163
|
+
/** Each layer's per-agent outputs (layerOutputs[0] = layer 1 proposals). */
|
|
164
|
+
layerOutputs: string[][];
|
|
165
|
+
}
|
|
166
|
+
/** Layered Mixture-of-Agents: N agents propose, then refine while seeing each
|
|
167
|
+
* other's answers across L layers, then an aggregator synthesizes the best
|
|
168
|
+
* (Mixture-of-Agents; ICLR 2025). Diverse perspectives + cross-refinement. */
|
|
169
|
+
declare function mixtureOfAgents(opts: MixtureOfAgentsOptions): Promise<MixtureOfAgentsResult>;
|
|
170
|
+
interface MeasureLiftOptions<Task> {
|
|
171
|
+
/** The evaluation task set. */
|
|
172
|
+
tasks: Task[];
|
|
173
|
+
/** The control: produce an output for a task (e.g. a single-shot answer). */
|
|
174
|
+
baseline: (task: Task, index: number) => Promise<string> | string;
|
|
175
|
+
/** The treatment: produce an output via a pattern/loop (e.g. selfConsistency). */
|
|
176
|
+
treatment: (task: Task, index: number) => Promise<string> | string;
|
|
177
|
+
/** Score an output for a task in [0,1] — higher is better (your verifier). */
|
|
178
|
+
score: (task: Task, output: string, index: number) => Promise<number> | number;
|
|
179
|
+
}
|
|
180
|
+
interface MeasureLiftResult<Task> {
|
|
181
|
+
n: number;
|
|
182
|
+
baselineMean: number;
|
|
183
|
+
treatmentMean: number;
|
|
184
|
+
/** treatmentMean − baselineMean. Positive = the pattern helped. */
|
|
185
|
+
lift: number;
|
|
186
|
+
/** Fraction of tasks where treatment scored strictly higher than baseline. */
|
|
187
|
+
winRate: number;
|
|
188
|
+
perTask: Array<{
|
|
189
|
+
task: Task;
|
|
190
|
+
baseline: number;
|
|
191
|
+
treatment: number;
|
|
192
|
+
delta: number;
|
|
193
|
+
}>;
|
|
194
|
+
}
|
|
195
|
+
/** Measure a pattern/loop's score lift over a baseline on a task set. Returns
|
|
196
|
+
* mean scores, the lift (treatment − baseline), the win-rate, and per-task
|
|
197
|
+
* deltas — the evidence that the harness beats single-shot. */
|
|
198
|
+
declare function measureLift<Task>(opts: MeasureLiftOptions<Task>): Promise<MeasureLiftResult<Task>>;
|
|
199
|
+
|
|
200
|
+
export { type BestOfNOptions, type BestOfNResult, type ChainOfVerificationOptions, type ChainOfVerificationResult, type MeasureLiftOptions, type MeasureLiftResult, type MixtureOfAgentsOptions, type MixtureOfAgentsResult, type ReflexionOptions, type ReflexionResult, type SelfConsistencyOptions, type SelfConsistencyResult, type SelfRefineOptions, type SelfRefineResult, type TreeOfThoughtsOptions, type TreeOfThoughtsResult, bestOfN, chainOfVerification, measureLift, mixtureOfAgents, reflexion, selfConsistency, selfRefine, treeOfThoughts };
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
interface SelfConsistencyOptions<T> {
|
|
2
|
+
/** How many independent samples to draw (clamped to >= 1). */
|
|
3
|
+
samples: number;
|
|
4
|
+
/** Produce sample `i` (0-based). Vary your prompt/temperature by `i`. */
|
|
5
|
+
generate: (i: number) => Promise<T> | T;
|
|
6
|
+
/** Cluster key for "same answer" — defaults to JSON of the value. Provide a
|
|
7
|
+
* normalizer (e.g. lowercase/trim) for free-text answers. */
|
|
8
|
+
key?: (value: T) => string;
|
|
9
|
+
}
|
|
10
|
+
interface SelfConsistencyResult<T> {
|
|
11
|
+
/** The majority-consistent answer. */
|
|
12
|
+
answer: T;
|
|
13
|
+
/** Agreement ratio of the winning cluster in [0,1] — a reliability signal. */
|
|
14
|
+
consistency: number;
|
|
15
|
+
/** Votes for the winner. */
|
|
16
|
+
votes: number;
|
|
17
|
+
/** Total samples that produced a value. */
|
|
18
|
+
total: number;
|
|
19
|
+
/** All clusters, most-voted first. */
|
|
20
|
+
clusters: Array<{
|
|
21
|
+
key: string;
|
|
22
|
+
count: number;
|
|
23
|
+
sample: T;
|
|
24
|
+
}>;
|
|
25
|
+
}
|
|
26
|
+
/** Sample N reasoning paths and return the majority-consistent answer plus the
|
|
27
|
+
* agreement ratio (Self-Consistency; Wang et al., 2022). */
|
|
28
|
+
declare function selfConsistency<T>(opts: SelfConsistencyOptions<T>): Promise<SelfConsistencyResult<T>>;
|
|
29
|
+
interface BestOfNOptions<T> {
|
|
30
|
+
/** How many candidates to generate (clamped to >= 1). */
|
|
31
|
+
n: number;
|
|
32
|
+
generate: (i: number) => Promise<T> | T;
|
|
33
|
+
/** Score a candidate — higher is better (e.g. a verifier confidence). */
|
|
34
|
+
score: (value: T, i: number) => Promise<number> | number;
|
|
35
|
+
}
|
|
36
|
+
interface BestOfNResult<T> {
|
|
37
|
+
best: T;
|
|
38
|
+
score: number;
|
|
39
|
+
index: number;
|
|
40
|
+
scores: number[];
|
|
41
|
+
}
|
|
42
|
+
/** Verifier-guided best-of-N: generate N candidates, score each, return the
|
|
43
|
+
* highest-scoring one. The score function is where you plug a verifier. */
|
|
44
|
+
declare function bestOfN<T>(opts: BestOfNOptions<T>): Promise<BestOfNResult<T>>;
|
|
45
|
+
interface SelfRefineOptions<T> {
|
|
46
|
+
draft: () => Promise<T> | T;
|
|
47
|
+
/** Critique the current value — return concrete flaws (or a clean signal). */
|
|
48
|
+
critique: (value: T, iter: number) => Promise<string> | string;
|
|
49
|
+
/** Revise the value to address the critique. */
|
|
50
|
+
revise: (value: T, critique: string, iter: number) => Promise<T> | T;
|
|
51
|
+
/** Max critique→revise iterations (default 2, clamped to >= 1). */
|
|
52
|
+
maxIters?: number;
|
|
53
|
+
/** Return true when a critique signals "nothing to fix" — stops early
|
|
54
|
+
* (saves work). Default: case-insensitive match on "no issues/problems/flaws/changes", "looks good", "lgtm", or "nothing to fix/improve". */
|
|
55
|
+
isClean?: (critique: string) => boolean;
|
|
56
|
+
}
|
|
57
|
+
interface SelfRefineResult<T> {
|
|
58
|
+
answer: T;
|
|
59
|
+
iterations: number;
|
|
60
|
+
revised: number;
|
|
61
|
+
trace: Array<{
|
|
62
|
+
iter: number;
|
|
63
|
+
critique: string;
|
|
64
|
+
revised: boolean;
|
|
65
|
+
}>;
|
|
66
|
+
}
|
|
67
|
+
/** Iteratively self-correct: draft, critique, revise — stopping early when the
|
|
68
|
+
* critique is clean (Self-Refine; Madaan et al., 2023). */
|
|
69
|
+
declare function selfRefine<T>(opts: SelfRefineOptions<T>): Promise<SelfRefineResult<T>>;
|
|
70
|
+
interface TreeOfThoughtsOptions<T> {
|
|
71
|
+
/** Candidate thoughts explored per depth step; the single best is retained
|
|
72
|
+
* (greedy, not a beam width). Clamped to >= 1. */
|
|
73
|
+
breadth: number;
|
|
74
|
+
/** Search depth (clamped to >= 1). */
|
|
75
|
+
depth: number;
|
|
76
|
+
/** Produce branch `b` of the current path. */
|
|
77
|
+
expand: (path: T[], b: number) => Promise<T> | T;
|
|
78
|
+
/** Score a candidate thought given the path so far — higher is better. */
|
|
79
|
+
score: (candidate: T, path: T[]) => Promise<number> | number;
|
|
80
|
+
/** Turn the winning path into a final answer (default: the last thought). */
|
|
81
|
+
synthesize?: (path: T[]) => Promise<T> | T;
|
|
82
|
+
}
|
|
83
|
+
interface TreeOfThoughtsResult<T> {
|
|
84
|
+
answer: T;
|
|
85
|
+
path: Array<{
|
|
86
|
+
thought: T;
|
|
87
|
+
score: number;
|
|
88
|
+
}>;
|
|
89
|
+
}
|
|
90
|
+
/** GREEDY best-first reasoning search (NOT beam search): at each depth, expand
|
|
91
|
+
* `breadth` candidate thoughts, keep ONLY the single highest-scored, and continue
|
|
92
|
+
* for `depth` steps, then synthesize the winning path. `breadth` controls how many
|
|
93
|
+
* candidates are explored per step (one is retained) — it is not a beam width.
|
|
94
|
+
* Inspired by Tree of Thoughts (Yao et al., 2023). */
|
|
95
|
+
declare function treeOfThoughts<T>(opts: TreeOfThoughtsOptions<T>): Promise<TreeOfThoughtsResult<T>>;
|
|
96
|
+
interface ChainOfVerificationOptions<T> {
|
|
97
|
+
draft: () => Promise<T> | T;
|
|
98
|
+
/** Verification questions probing the draft's claims. */
|
|
99
|
+
planChecks: (draft: T) => Promise<string[]> | string[];
|
|
100
|
+
/** Answer one verification question INDEPENDENTLY (no sight of the draft). */
|
|
101
|
+
answerCheck: (question: string) => Promise<string> | string;
|
|
102
|
+
/** Revise the draft to drop/correct claims the checks didn't support. */
|
|
103
|
+
revise: (draft: T, checks: Array<{
|
|
104
|
+
q: string;
|
|
105
|
+
a: string;
|
|
106
|
+
}>) => Promise<T> | T;
|
|
107
|
+
}
|
|
108
|
+
/** Value resolved by `chainOfVerification`. */
interface ChainOfVerificationResult<T> {
  /** The final answer (presumably the revised draft). */
  answer: T;
  /**
   * The verification checks, as `q`/`a` string pairs — the same shape that
   * is handed to the `revise` callback (presumably question and answer).
   */
  checks: {
    q: string;
    a: string;
  }[];
}
|
|
115
|
+
/**
 * Chain-of-Verification (Dhuliawala et al., 2023): targeted hallucination
 * reduction.
 *
 * Pipeline: draft → generate verification questions → answer each question
 * independently → revise.
 */
declare function chainOfVerification<T>(
  opts: ChainOfVerificationOptions<T>,
): Promise<ChainOfVerificationResult<T>>;
|
|
119
|
+
/** Configuration and callbacks supplied to `reflexion`. */
interface ReflexionOptions<T> {
  /** Maximum number of attempts; clamped to >= 1. */
  maxAttempts: number;
  /** Try the task. Prior verbal reflections are passed in as context. */
  attempt: (reflections: string[], i: number) => T | Promise<T>;
  /** Judge an attempt: report whether it succeeded plus feedback to reflect on. */
  evaluate: (
    result: T,
    i: number,
  ) =>
    | { success: boolean; feedback: string }
    | Promise<{ success: boolean; feedback: string }>;
  /** Write a verbal reflection on a failure, to be carried into the next attempt. */
  reflect: (result: T, feedback: string, i: number) => string | Promise<string>;
}
|
|
135
|
+
/** Value resolved by `reflexion`. */
interface ReflexionResult<T> {
  /** The answer returned by the run. */
  answer: T;
  /** Attempt count (presumably the number of attempts actually made). */
  attempts: number;
  /** Success flag (presumably whether an attempt was evaluated as successful). */
  succeeded: boolean;
  /** Verbal reflections accumulated across the failed attempts. */
  reflections: string[];
}
|
|
142
|
+
/**
 * Reflexion (Shinn et al., 2023): verbal reinforcement.
 *
 * Loop: attempt → evaluate → reflect → retry. The accumulated reflections are
 * carried into each next attempt, and the loop stops on success or when the
 * attempt budget is used up.
 *
 * Distinct from self-refine: it learns from the OUTCOME (success/feedback),
 * not just the surface quality of the output.
 */
declare function reflexion<T>(
  opts: ReflexionOptions<T>,
): Promise<ReflexionResult<T>>;
|
|
147
|
+
/** Configuration and callbacks supplied to `mixtureOfAgents`. */
interface MixtureOfAgentsOptions {
  /** How many proposer agents run in each layer; clamped to >= 1. */
  agents: number;
  /**
   * How many layers to run; clamped to >= 1. Layer 1 proposes and layers
   * 2..L refine.
   */
  layers: number;
  /** Layer-1 proposal from agent `a`. Vary persona/temperature by `a`. */
  propose: (agent: number) => string | Promise<string>;
  /**
   * Refine agent `a`'s answer given the OTHER agents' answers from the prior
   * layer.
   *
   * Optional: it is only called for layers >= 2, so single-layer usage
   * (propose → aggregate) does not need to supply it.
   */
  refine?: (
    agent: number,
    others: string[],
    layer: number,
  ) => string | Promise<string>;
  /** Synthesize the final layer's answers into a single answer. */
  aggregate: (finalLayer: string[]) => string | Promise<string>;
}
|
|
161
|
+
/** Value resolved by `mixtureOfAgents`. */
interface MixtureOfAgentsResult {
  /** The final answer (presumably the output of the `aggregate` callback). */
  answer: string;
  /**
   * Per-agent outputs for every layer, indexed by layer; `layerOutputs[0]`
   * holds the layer-1 proposals.
   */
  layerOutputs: string[][];
}
|
|
166
|
+
/**
 * Layered Mixture-of-Agents (Mixture-of-Agents; ICLR 2025): diverse
 * perspectives plus cross-refinement.
 *
 * N agents propose, then refine while seeing each other's answers across L
 * layers, and finally an aggregator synthesizes the best answer.
 */
declare function mixtureOfAgents(
  opts: MixtureOfAgentsOptions,
): Promise<MixtureOfAgentsResult>;
|
|
170
|
+
/** Inputs supplied to `measureLift`. */
interface MeasureLiftOptions<Task> {
  /** The set of tasks to evaluate on. */
  tasks: Task[];
  /** Control arm: produce an output for a task (e.g. a single-shot answer). */
  baseline: (task: Task, index: number) => string | Promise<string>;
  /** Treatment arm: produce an output via a pattern/loop (e.g. selfConsistency). */
  treatment: (task: Task, index: number) => string | Promise<string>;
  /** Your verifier: score an output for a task in [0,1]; higher is better. */
  score: (
    task: Task,
    output: string,
    index: number,
  ) => number | Promise<number>;
}
|
|
180
|
+
/** Value resolved by `measureLift`. */
interface MeasureLiftResult<Task> {
  /** Sample size (presumably the number of tasks evaluated). */
  n: number;
  /** Mean score of the baseline outputs. */
  baselineMean: number;
  /** Mean score of the treatment outputs. */
  treatmentMean: number;
  /** `treatmentMean − baselineMean`. A positive value means the pattern helped. */
  lift: number;
  /** Fraction of tasks on which treatment scored strictly higher than baseline. */
  winRate: number;
  /**
   * Per-task breakdown: the task, its baseline and treatment scores, and the
   * delta between them (presumably treatment − baseline, matching `lift`).
   */
  perTask: {
    task: Task;
    baseline: number;
    treatment: number;
    delta: number;
  }[];
}
|
|
195
|
+
/**
 * Measure how much a pattern/loop lifts scores over a baseline on a task set.
 *
 * Returns the mean scores, the lift (treatment − baseline), the win-rate, and
 * per-task deltas — the evidence that the harness beats single-shot.
 */
declare function measureLift<Task>(
  opts: MeasureLiftOptions<Task>,
): Promise<MeasureLiftResult<Task>>;
|
|
199
|
+
|
|
200
|
+
// Public surface of this module: option/result types first, then the functions.
export {
  type BestOfNOptions,
  type BestOfNResult,
  type ChainOfVerificationOptions,
  type ChainOfVerificationResult,
  type MeasureLiftOptions,
  type MeasureLiftResult,
  type MixtureOfAgentsOptions,
  type MixtureOfAgentsResult,
  type ReflexionOptions,
  type ReflexionResult,
  type SelfConsistencyOptions,
  type SelfConsistencyResult,
  type SelfRefineOptions,
  type SelfRefineResult,
  type TreeOfThoughtsOptions,
  type TreeOfThoughtsResult,
  bestOfN,
  chainOfVerification,
  measureLift,
  mixtureOfAgents,
  reflexion,
  selfConsistency,
  selfRefine,
  treeOfThoughts,
};
|