omegon 0.6.9 → 0.6.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,163 +10,116 @@
10
10
  * supersede — "This new fact replaces that old one" (by ID + new content)
11
11
  * archive — "This fact appears stale/wrong" (by ID)
12
12
  * connect — "These two facts are related" (global extraction only)
13
+ *
14
+ * All LLM calls use direct HTTP (llm-direct.ts) — zero subprocess overhead.
13
15
  */
14
16
 
15
- import { spawn, type ChildProcess } from "node:child_process";
16
17
  import type { MemoryConfig } from "./types.ts";
17
18
  import type { Fact, Edge } from "./factstore.ts";
18
- import { resolveOmegonSubprocess } from "../lib/omegon-subprocess.ts";
19
+ import { chatDirect, cleanModelOutput, isCloudModel, getBudgetCloudModel } from "./llm-direct.ts";
19
20
 
20
21
  // ---------------------------------------------------------------------------
21
- // Shared subprocess runner
22
+ // Cancellation support
22
23
  // ---------------------------------------------------------------------------
23
24
 
24
- /** Track the currently running extraction process for cancellation */
25
- let activeProc: ChildProcess | null = null;
26
-
27
- /** Track all spawned processes for cleanup on module unload */
28
- const allProcs = new Set<ChildProcess>();
29
-
30
- /** Track the active direct-HTTP extraction AbortController for cancellation */
31
- let activeDirectAbort: AbortController | null = null;
32
-
33
- function killProc(proc: ChildProcess): void {
34
- try {
35
- if (proc.pid) process.kill(-proc.pid, "SIGTERM");
36
- } catch {
37
- try { proc.kill("SIGTERM"); } catch { /* already dead */ }
38
- }
39
- }
25
+ /** Active AbortController for the current extraction — killable externally */
26
+ let activeAbort: AbortController | null = null;
40
27
 
41
28
  /**
42
- * Kill the active extraction subprocess OR direct HTTP fetch.
43
- * Returns true if something was killed/aborted.
29
+ * Kill the active extraction (abort in-flight HTTP request).
30
+ * Returns true if something was aborted.
44
31
  */
45
32
  export function killActiveExtraction(): boolean {
46
- let killed = false;
47
- if (activeProc) {
48
- killProc(activeProc);
49
- activeProc = null;
50
- killed = true;
51
- }
52
- if (activeDirectAbort) {
53
- activeDirectAbort.abort();
54
- activeDirectAbort = null;
55
- killed = true;
33
+ if (activeAbort) {
34
+ activeAbort.abort();
35
+ activeAbort = null;
36
+ return true;
56
37
  }
57
- return killed;
38
+ return false;
58
39
  }
59
40
 
60
41
  /**
61
- * Kill ALL tracked subprocesses AND abort any direct HTTP extraction.
62
- * Use during shutdown/reload to prevent orphaned processes and hanging fetches.
42
+ * Kill all active operations. Alias for killActiveExtraction since we no
43
+ * longer spawn subprocesses — kept for API compatibility with index.ts.
63
44
  */
64
45
  export function killAllSubprocesses(): void {
65
- for (const proc of allProcs) {
66
- killProc(proc);
67
- }
68
- allProcs.clear();
69
- activeProc = null;
70
- if (activeDirectAbort) {
71
- activeDirectAbort.abort();
72
- activeDirectAbort = null;
73
- }
46
+ killActiveExtraction();
74
47
  }
75
48
 
76
49
  /** Check if an extraction is currently in progress */
77
50
  export function isExtractionRunning(): boolean {
78
- return activeProc !== null || activeDirectAbort !== null;
51
+ return activeAbort !== null;
79
52
  }
80
53
 
54
+ // ---------------------------------------------------------------------------
55
+ // Shared LLM call with abort tracking
56
+ // ---------------------------------------------------------------------------
57
+
81
58
  /**
82
- * Spawn a pi subprocess with a system prompt and user message.
83
- * Returns the raw stdout output. Handles timeout, cleanup, code fence stripping.
59
+ * Run a tracked LLM call — sets activeAbort for external cancellation.
60
+ * Only one tracked call at a time (new call aborts previous).
84
61
  */
85
- function spawnExtraction(opts: {
86
- cwd: string;
62
+ async function trackedChat(opts: {
87
63
  model: string;
88
64
  systemPrompt: string;
89
65
  userMessage: string;
90
66
  timeout: number;
67
+ maxTokens?: number;
91
68
  label: string;
92
69
  }): Promise<string> {
93
- return new Promise<string>((resolve, reject) => {
94
- if (activeProc) {
95
- reject(new Error(`${opts.label}: extraction already in progress`));
96
- return;
97
- }
70
+ // Cancel any previous tracked call
71
+ if (activeAbort) activeAbort.abort();
72
+ const controller = new AbortController();
73
+ activeAbort = controller;
98
74
 
99
- const omegon = resolveOmegonSubprocess();
100
- const args = [
101
- ...omegon.argvPrefix,
102
- "--model", opts.model,
103
- "--no-session", "--no-tools", "--no-extensions",
104
- "--no-skills", "--no-themes", "--thinking", "off",
105
- "--system-prompt", opts.systemPrompt,
106
- "-p", opts.userMessage,
107
- ];
108
-
109
- const proc = spawn(omegon.command, args, {
110
- cwd: opts.cwd,
111
- stdio: ["ignore", "pipe", "pipe"],
112
- // Detach into new session so child has no controlling terminal.
113
- // Prevents child pi from opening /dev/tty and setting kitty keyboard
114
- // protocol, which corrupts parent terminal state if child is killed.
115
- detached: true,
116
- env: { ...process.env, TERM: "dumb" },
75
+ try {
76
+ const result = await chatDirect({
77
+ model: opts.model,
78
+ systemPrompt: opts.systemPrompt,
79
+ userMessage: opts.userMessage,
80
+ maxTokens: opts.maxTokens ?? 2048,
81
+ timeout: opts.timeout,
82
+ signal: controller.signal,
117
83
  });
118
- activeProc = proc;
119
- allProcs.add(proc);
84
+ return result.content;
85
+ } finally {
86
+ if (activeAbort === controller) activeAbort = null;
87
+ }
88
+ }
120
89
 
121
- let stdout = "";
122
- let stderr = "";
90
+ /**
91
+ * Run an untracked LLM call — does NOT set activeAbort.
92
+ * Used for secondary calls (pruning, episodes) that shouldn't cancel extraction.
93
+ */
94
+ async function untrackedChat(opts: {
95
+ model: string;
96
+ systemPrompt: string;
97
+ userMessage: string;
98
+ timeout: number;
99
+ maxTokens?: number;
100
+ }): Promise<string> {
101
+ const result = await chatDirect({
102
+ model: opts.model,
103
+ systemPrompt: opts.systemPrompt,
104
+ userMessage: opts.userMessage,
105
+ maxTokens: opts.maxTokens ?? 2048,
106
+ timeout: opts.timeout,
107
+ });
108
+ return result.content;
109
+ }
123
110
 
124
- proc.stdout.on("data", (d: Buffer) => { stdout += d.toString(); });
125
- proc.stderr.on("data", (d: Buffer) => { stderr += d.toString(); });
111
+ // ---------------------------------------------------------------------------
112
+ // Cloud fallback model — cheapest available for budget tasks
113
+ // ---------------------------------------------------------------------------
126
114
 
127
- let escalationTimer: ReturnType<typeof setTimeout> | null = null;
128
- const killThisProc = (signal: NodeJS.Signals) => {
129
- try {
130
- if (proc.pid) process.kill(-proc.pid, signal);
131
- } catch {
132
- try { proc.kill(signal); } catch { /* already dead */ }
133
- }
134
- };
135
- const timeoutHandle = setTimeout(() => {
136
- killThisProc("SIGTERM");
137
- escalationTimer = setTimeout(() => {
138
- if (!proc.killed) killThisProc("SIGKILL");
139
- }, 5000);
140
- reject(new Error(`${opts.label} timed out`));
141
- }, opts.timeout);
142
-
143
- proc.on("close", (code) => {
144
- clearTimeout(timeoutHandle);
145
- if (escalationTimer) clearTimeout(escalationTimer);
146
- activeProc = null;
147
- allProcs.delete(proc);
148
-
149
- const output = stdout.trim();
150
- if (code === 0 && output) {
151
- // Strip code fences if the model wraps output
152
- const cleaned = output
153
- .replace(/^```(?:jsonl?|json)?\n?/, "")
154
- .replace(/\n?```\s*$/, "");
155
- resolve(cleaned);
156
- } else if (code === 0 && !output) {
157
- resolve("");
158
- } else {
159
- reject(new Error(`${opts.label} failed (exit ${code}): ${stderr.slice(0, 500)}`));
160
- }
161
- });
115
+ const CLOUD_FALLBACK_MODEL = "claude-haiku-4-5";
162
116
 
163
- proc.on("error", (err) => {
164
- clearTimeout(timeoutHandle);
165
- activeProc = null;
166
- allProcs.delete(proc);
167
- reject(err);
168
- });
169
- });
117
+ function resolveModel(configModel: string): string {
118
+ // If the configured model is a cloud model with a key, use it directly
119
+ if (isCloudModel(configModel)) return configModel;
120
+ // If it's a local model, try it (chatDirect handles Ollama)
121
+ // If Ollama is down, chatDirect will throw and caller handles fallback
122
+ return configModel;
170
123
  }
171
124
 
172
125
  // ---------------------------------------------------------------------------
@@ -246,115 +199,13 @@ export function formatFactsForExtraction(facts: Fact[]): string {
246
199
  return lines.join("\n");
247
200
  }
248
201
 
249
- // ---------------------------------------------------------------------------
250
- // Direct Ollama extraction (no pi subprocess overhead)
251
- // ---------------------------------------------------------------------------
252
-
253
- /**
254
- * Known cloud model prefixes. If a model starts with any of these, it's cloud.
255
- * Everything else is assumed local (Ollama).
256
- *
257
- * This is an allowlist approach — new cloud providers must be added here.
258
- * The alternative (detecting local by "name:tag" pattern) is too fragile
259
- * since Ollama accepts bare names without tags.
260
- */
261
- const CLOUD_MODEL_PREFIXES = [
262
- "claude-", // Anthropic
263
- "gpt-", // OpenAI
264
- "o1-", "o3-", "o4-", // OpenAI reasoning
265
- "gemini-", // Google
266
- "mistral-", // Mistral cloud (not devstral which is local)
267
- "command-", // Cohere
268
- ];
269
-
270
- /**
271
- * Check if extraction model is a local Ollama model.
272
- * Uses an explicit cloud-prefix allowlist. Models with a "/" are assumed
273
- * to be provider-qualified cloud models (e.g., "openai/gpt-4").
274
- */
275
- function isLocalModel(model: string): boolean {
276
- if (model.includes("/")) return false;
277
- for (const prefix of CLOUD_MODEL_PREFIXES) {
278
- if (model.startsWith(prefix)) return false;
279
- }
280
- return true;
281
- }
282
-
283
- /** Fallback cloud model when local extraction fails and Ollama is unreachable. */
284
- const CLOUD_FALLBACK_MODEL = "claude-sonnet-4-6";
285
-
286
- /**
287
- * Run extraction directly via Ollama HTTP API.
288
- * ~10x faster than spawning a pi subprocess — no process startup overhead.
289
- * Returns null if Ollama is unreachable (caller should fall back to subprocess).
290
- */
291
- async function runExtractionDirect(
292
- systemPrompt: string,
293
- userMessage: string,
294
- config: MemoryConfig,
295
- opts?: { ollamaUrl?: string },
296
- ): Promise<string | null> {
297
- const baseUrl = opts?.ollamaUrl ?? process.env.LOCAL_INFERENCE_URL ?? "http://localhost:11434";
298
- const timeout = config.extractionTimeout;
299
-
300
- // Create an AbortController that can be killed externally via killActiveExtraction().
301
- // Combines our controller with a timeout signal so either trigger aborts the fetch.
302
- const controller = new AbortController();
303
- activeDirectAbort = controller;
304
-
305
- try {
306
- const resp = await fetch(`${baseUrl}/api/chat`, {
307
- method: "POST",
308
- headers: { "Content-Type": "application/json" },
309
- body: JSON.stringify({
310
- model: config.extractionModel,
311
- stream: false,
312
- options: {
313
- temperature: 0.2,
314
- num_predict: 2048,
315
- num_ctx: 32768,
316
- },
317
- messages: [
318
- { role: "system", content: systemPrompt },
319
- { role: "user", content: userMessage },
320
- ],
321
- }),
322
- signal: typeof AbortSignal.any === "function"
323
- ? AbortSignal.any([controller.signal, AbortSignal.timeout(timeout)])
324
- : controller.signal, // Node <20.3: external abort works, timeout relies on Ollama's own
325
- });
326
-
327
- if (!resp.ok) return null;
328
-
329
- const data = await resp.json() as { message?: { content?: string } };
330
- const raw = data.message?.content?.trim();
331
- if (!raw) return null;
332
-
333
- // Strip code fences and <think> blocks from reasoning models
334
- return raw
335
- .replace(/^```(?:jsonl?|json)?\n?/, "")
336
- .replace(/\n?```\s*$/, "")
337
- .replace(/<think>[\s\S]*?<\/think>\s*/g, "")
338
- .trim();
339
- } catch {
340
- return null;
341
- } finally {
342
- if (activeDirectAbort === controller) {
343
- activeDirectAbort = null;
344
- }
345
- }
346
- }
347
-
348
202
  /**
349
203
  * Run project extraction (Phase 1).
350
204
  * Returns raw JSONL output from the extraction agent.
351
- *
352
- * When extractionModel is a local model, talks directly to Ollama HTTP API
353
- * (no subprocess overhead). Falls back to pi subprocess for cloud models
354
- * or if Ollama is unreachable.
205
+ * Uses direct HTTP — no subprocess spawning.
355
206
  */
356
207
  export async function runExtractionV2(
357
- cwd: string,
208
+ _cwd: string,
358
209
  currentFacts: Fact[],
359
210
  recentConversation: string,
360
211
  config: MemoryConfig,
@@ -370,21 +221,32 @@ export async function runExtractionV2(
370
221
  "\n\nOutput JSONL actions based on what you observe.",
371
222
  ].join("");
372
223
 
373
- // Try direct Ollama path for local models (bypasses pi subprocess entirely)
374
- if (isLocalModel(config.extractionModel)) {
375
- const result = await runExtractionDirect(prompt, userMessage, config);
376
- if (result !== null) return result;
377
- // Ollama unreachable — fall through to subprocess with cloud fallback
378
- }
224
+ const model = resolveModel(config.extractionModel);
379
225
 
380
- return spawnExtraction({
381
- cwd,
382
- model: isLocalModel(config.extractionModel) ? CLOUD_FALLBACK_MODEL : config.extractionModel,
383
- systemPrompt: prompt,
384
- userMessage,
385
- timeout: config.extractionTimeout,
386
- label: "Project extraction",
387
- });
226
+ try {
227
+ return await trackedChat({
228
+ model,
229
+ systemPrompt: prompt,
230
+ userMessage,
231
+ timeout: config.extractionTimeout,
232
+ label: "Project extraction",
233
+ });
234
+ } catch (err) {
235
+ // If configured model failed (e.g., Ollama down), try cloud fallback
236
+ if (!isCloudModel(model)) {
237
+ const fallback = getBudgetCloudModel();
238
+ if (fallback) {
239
+ return await trackedChat({
240
+ model: fallback,
241
+ systemPrompt: prompt,
242
+ userMessage,
243
+ timeout: config.extractionTimeout,
244
+ label: "Project extraction (cloud fallback)",
245
+ });
246
+ }
247
+ }
248
+ throw err;
249
+ }
388
250
  }
389
251
 
390
252
  // ---------------------------------------------------------------------------
@@ -486,10 +348,10 @@ export function formatGlobalExtractionInput(
486
348
  /**
487
349
  * Run global extraction (Phase 2).
488
350
  * Only called when Phase 1 produced new facts.
489
- * Uses direct Ollama path for local models, falls back to pi subprocess.
351
+ * Uses direct HTTP — no subprocess spawning.
490
352
  */
491
353
  export async function runGlobalExtraction(
492
- cwd: string,
354
+ _cwd: string,
493
355
  newProjectFacts: Fact[],
494
356
  globalFacts: Fact[],
495
357
  globalEdges: Edge[],
@@ -502,22 +364,31 @@ export async function runGlobalExtraction(
502
364
  "\n\nOutput JSONL actions: promote generalizable facts and identify connections between GLOBAL facts.",
503
365
  ].join("");
504
366
 
505
- const systemPrompt = buildGlobalExtractionPrompt();
367
+ const model = resolveModel(config.extractionModel);
506
368
 
507
- // Try direct Ollama path for local models
508
- if (isLocalModel(config.extractionModel)) {
509
- const result = await runExtractionDirect(systemPrompt, userMessage, config);
510
- if (result !== null) return result;
369
+ try {
370
+ return await trackedChat({
371
+ model,
372
+ systemPrompt: buildGlobalExtractionPrompt(),
373
+ userMessage,
374
+ timeout: config.extractionTimeout,
375
+ label: "Global extraction",
376
+ });
377
+ } catch (err) {
378
+ if (!isCloudModel(model)) {
379
+ const fallback = getBudgetCloudModel();
380
+ if (fallback) {
381
+ return await trackedChat({
382
+ model: fallback,
383
+ systemPrompt: buildGlobalExtractionPrompt(),
384
+ userMessage,
385
+ timeout: config.extractionTimeout,
386
+ label: "Global extraction (cloud fallback)",
387
+ });
388
+ }
389
+ }
390
+ throw err;
511
391
  }
512
-
513
- return spawnExtraction({
514
- cwd,
515
- model: isLocalModel(config.extractionModel) ? CLOUD_FALLBACK_MODEL : config.extractionModel,
516
- systemPrompt,
517
- userMessage,
518
- timeout: config.extractionTimeout,
519
- label: "Global extraction",
520
- });
521
392
  }
522
393
 
523
394
  // ---------------------------------------------------------------------------
@@ -558,96 +429,60 @@ export interface SessionTelemetry {
558
429
  }
559
430
 
560
431
  /**
561
- * Generate a session episode via direct Ollama HTTP API call.
562
- * ~10x faster than spawning a pi subprocess — no process startup overhead.
563
- * Falls back to subprocess-based generation if Ollama is unreachable.
432
+ * Generate a session episode via direct LLM call.
433
+ * Uses chatDirect — no subprocess. Tries configured model, falls back to budget cloud.
564
434
  */
565
435
  export async function generateEpisodeDirect(
566
436
  recentConversation: string,
567
437
  config: MemoryConfig,
568
- opts?: { ollamaUrl?: string; model?: string },
569
438
  ): Promise<EpisodeOutput | null> {
570
- const baseUrl = opts?.ollamaUrl ?? process.env.LOCAL_INFERENCE_URL ?? "http://localhost:11434";
571
- const model = opts?.model ?? process.env.LOCAL_EPISODE_MODEL ?? "qwen3:30b";
439
+ const userMessage = `Session conversation:\n\n${recentConversation}\n\nOutput the episode JSON.`;
572
440
  const timeout = Math.min(config.shutdownExtractionTimeout, 10_000);
573
441
 
442
+ // Try configured extraction model first
574
443
  try {
575
- const resp = await fetch(`${baseUrl}/api/chat`, {
576
- method: "POST",
577
- headers: { "Content-Type": "application/json" },
578
- body: JSON.stringify({
579
- model,
580
- stream: false,
581
- options: { temperature: 0.3, num_predict: 512 },
582
- messages: [
583
- { role: "system", content: EPISODE_PROMPT },
584
- { role: "user", content: `Session conversation:\n\n${recentConversation}\n\nOutput the episode JSON.` },
585
- ],
586
- }),
587
- signal: AbortSignal.timeout(timeout),
444
+ const raw = await untrackedChat({
445
+ model: resolveModel(config.extractionModel),
446
+ systemPrompt: EPISODE_PROMPT,
447
+ userMessage,
448
+ timeout,
449
+ maxTokens: 512,
588
450
  });
589
-
590
- if (!resp.ok) return null;
591
-
592
- const data = await resp.json() as { message?: { content?: string } };
593
- const raw = data.message?.content?.trim();
594
- if (!raw) return null;
595
-
596
- const cleaned = raw
597
- .replace(/^```(?:json)?\n?/, "")
598
- .replace(/\n?```\s*$/, "")
599
- // Strip <think>...</think> blocks from reasoning models
600
- .replace(/<think>[\s\S]*?<\/think>\s*/g, "")
601
- .trim();
602
- const parsed = JSON.parse(cleaned);
603
-
604
- if (parsed.title && parsed.narrative) {
605
- return { title: parsed.title, narrative: parsed.narrative };
451
+ if (raw) {
452
+ const parsed = JSON.parse(raw);
453
+ if (parsed.title && parsed.narrative) return parsed as EpisodeOutput;
606
454
  }
607
- return null;
608
455
  } catch {
609
- return null;
456
+ // Fall through
610
457
  }
611
- }
612
458
 
613
- /**
614
- * Generate a session episode summary from recent conversation.
615
- * Uses pi subprocess (slower fallback). Prefer generateEpisodeDirect().
616
- */
617
- export async function generateEpisode(
618
- cwd: string,
619
- recentConversation: string,
620
- config: MemoryConfig,
621
- ): Promise<EpisodeOutput | null> {
622
- try {
623
- const raw = await spawnExtraction({
624
- cwd,
625
- model: config.extractionModel,
626
- systemPrompt: EPISODE_PROMPT,
627
- userMessage: `Session conversation:\n\n${recentConversation}\n\nOutput the episode JSON.`,
628
- timeout: config.shutdownExtractionTimeout,
629
- label: "Episode generation",
630
- });
631
-
632
- if (!raw.trim()) return null;
633
-
634
- // Strip any markdown code fences
635
- const cleaned = raw.replace(/^```(?:json)?\n?/, "").replace(/\n?```\s*$/, "").trim();
636
- const parsed = JSON.parse(cleaned);
637
-
638
- if (parsed.title && parsed.narrative) {
639
- return { title: parsed.title, narrative: parsed.narrative };
459
+ // Try budget cloud model
460
+ const budgetModel = getBudgetCloudModel();
461
+ if (budgetModel && budgetModel !== config.extractionModel) {
462
+ try {
463
+ const raw = await untrackedChat({
464
+ model: budgetModel,
465
+ systemPrompt: EPISODE_PROMPT,
466
+ userMessage,
467
+ timeout,
468
+ maxTokens: 512,
469
+ });
470
+ if (raw) {
471
+ const parsed = JSON.parse(raw);
472
+ if (parsed.title && parsed.narrative) return parsed as EpisodeOutput;
473
+ }
474
+ } catch {
475
+ // Fall through
640
476
  }
641
- return null;
642
- } catch {
643
- return null;
644
477
  }
478
+
479
+ return null;
645
480
  }
646
481
 
647
482
  /**
648
483
  * Build a minimum viable episode from raw session telemetry.
649
484
  * Zero I/O — assembled deterministically from already-collected data.
650
- * This is the guaranteed floor: always emitted when every model in the fallback chain fails.
485
+ * This is the guaranteed floor: always emitted when every model fails.
651
486
  */
652
487
  export function buildTemplateEpisode(telemetry: SessionTelemetry): EpisodeOutput {
653
488
  const allModified = [...new Set([...telemetry.filesWritten, ...telemetry.filesEdited])];
@@ -684,102 +519,20 @@ export function buildTemplateEpisode(telemetry: SessionTelemetry): EpisodeOutput
684
519
  }
685
520
 
686
521
  /**
687
- * Generate a session episode with a reliability-ordered fallback chain:
688
- * 1. Cloud primary (config.episodeModel — codex-spark by default)
689
- * 2. Cloud retribution tier (haiku — fast, cheap, always available)
690
- * 3. Ollama (direct HTTP — only if user has LOCAL_EPISODE_MODEL configured)
691
- * 4. Template episode (deterministic, zero I/O) — always succeeds
692
- *
693
- * Cloud is first because: (1) it's always available if pi is configured at all,
694
- * (2) retribution-tier cost is negligible (~$0.0001/call), (3) model quality
695
- * is substantially better than typical local models for narrative generation.
696
- * Ollama is tried last as an optional local preference, not a dependency.
522
+ * Generate a session episode with fallback chain:
523
+ * 1. Direct LLM call (configured model → budget cloud)
524
+ * 2. Template episode (deterministic, zero I/O) — always succeeds
697
525
  *
698
- * Step timeouts are taken from config.episodeStepTimeout, capped so the total
699
- * chain fits within config.shutdownExtractionTimeout.
526
+ * No subprocess spawning. Total time bounded by config timeouts.
700
527
  */
701
528
  export async function generateEpisodeWithFallback(
702
529
  recentConversation: string,
703
530
  telemetry: SessionTelemetry,
704
531
  config: MemoryConfig,
705
- cwd: string,
532
+ _cwd: string,
706
533
  ): Promise<EpisodeOutput> {
707
- const stepTimeout = Math.min(
708
- config.episodeStepTimeout,
709
- Math.floor(config.shutdownExtractionTimeout / 3),
710
- );
711
-
712
- if (config.episodeFallbackChain) {
713
- // Step 1: Cloud primary (episodeModel — codex-spark by default)
714
- // Always available if the user has a provider configured.
715
- try {
716
- const raw = await spawnExtraction({
717
- cwd,
718
- model: config.episodeModel,
719
- systemPrompt: EPISODE_PROMPT,
720
- userMessage: `Session conversation:\n\n${recentConversation}\n\nOutput the episode JSON.`,
721
- timeout: stepTimeout,
722
- label: "Episode generation (primary)",
723
- });
724
- if (raw.trim()) {
725
- const cleaned = raw.replace(/^```(?:json)?\n?/, "").replace(/\n?```\s*$/, "").trim();
726
- const parsed = JSON.parse(cleaned);
727
- if (parsed.title && parsed.narrative) return parsed as EpisodeOutput;
728
- }
729
- } catch {
730
- // Fall through
731
- }
732
-
733
- // Step 2: Cloud retribution tier (haiku — fast, cheap, independent model)
734
- try {
735
- const raw = await spawnExtraction({
736
- cwd,
737
- model: "claude-haiku-4-5",
738
- systemPrompt: EPISODE_PROMPT,
739
- userMessage: `Session conversation:\n\n${recentConversation}\n\nOutput the episode JSON.`,
740
- timeout: stepTimeout,
741
- label: "Episode generation (retribution fallback)",
742
- });
743
- if (raw.trim()) {
744
- const cleaned = raw.replace(/^```(?:json)?\n?/, "").replace(/\n?```\s*$/, "").trim();
745
- const parsed = JSON.parse(cleaned);
746
- if (parsed.title && parsed.narrative) return parsed as EpisodeOutput;
747
- }
748
- } catch {
749
- // Fall through
750
- }
751
-
752
- // Step 3: Ollama (optional — only meaningful if user has a local model running)
753
- if (process.env.LOCAL_EPISODE_MODEL || process.env.LOCAL_INFERENCE_URL) {
754
- try {
755
- const result = await generateEpisodeDirect(recentConversation, config);
756
- if (result) return result;
757
- } catch {
758
- // Fall through to template
759
- }
760
- }
761
- } else {
762
- // Chain disabled — try cloud primary only, no Ollama
763
- try {
764
- const raw = await spawnExtraction({
765
- cwd,
766
- model: config.episodeModel,
767
- systemPrompt: EPISODE_PROMPT,
768
- userMessage: `Session conversation:\n\n${recentConversation}\n\nOutput the episode JSON.`,
769
- timeout: stepTimeout,
770
- label: "Episode generation",
771
- });
772
- if (raw.trim()) {
773
- const cleaned = raw.replace(/^```(?:json)?\n?/, "").replace(/\n?```\s*$/, "").trim();
774
- const parsed = JSON.parse(cleaned);
775
- if (parsed.title && parsed.narrative) return parsed as EpisodeOutput;
776
- }
777
- } catch {
778
- // Fall through
779
- }
780
- }
781
-
782
- // Step 4: Template episode — guaranteed floor, zero I/O
534
+ const result = await generateEpisodeDirect(recentConversation, config);
535
+ if (result) return result;
783
536
  return buildTemplateEpisode(telemetry);
784
537
  }
785
538
 
@@ -803,6 +556,7 @@ Rules:
803
556
  /**
804
557
  * Run a targeted LLM archival pass over a single section when it exceeds the ceiling.
805
558
  * Returns the list of fact IDs recommended for archival.
559
+ * Uses direct HTTP — no subprocess spawning.
806
560
  */
807
561
  export async function runSectionPruningPass(
808
562
  section: string,
@@ -827,38 +581,56 @@ export async function runSectionPruningPass(
827
581
  `Return a JSON array of fact IDs to archive. Archive at least ${excessCount} to bring the section under ${targetCount + 1}.`,
828
582
  ].join("\n");
829
583
 
830
- // Try direct Ollama path for local models
831
- if (isLocalModel(config.extractionModel)) {
832
- try {
833
- const raw = await runExtractionDirect(SECTION_PRUNING_PROMPT, userMessage, config);
834
- if (raw) {
835
- const cleaned = raw.replace(/^```(?:json)?\n?/, "").replace(/\n?```\s*$/, "").trim();
836
- const parsed = JSON.parse(cleaned);
837
- if (Array.isArray(parsed)) return parsed.filter((id: unknown) => typeof id === "string");
838
- }
839
- } catch {
840
- // Fall through to cloud
841
- }
842
- }
584
+ const model = resolveModel(config.extractionModel);
843
585
 
844
- // Cloud fallback: use episodeModel (cloud tier, always available)
845
586
  try {
846
- const raw = await spawnExtraction({
847
- cwd: process.cwd(),
848
- model: config.episodeModel,
587
+ const raw = await untrackedChat({
588
+ model,
849
589
  systemPrompt: SECTION_PRUNING_PROMPT,
850
590
  userMessage,
851
591
  timeout: 30_000,
852
- label: `Section pruning (${section})`,
592
+ maxTokens: 1024,
853
593
  });
854
- if (raw.trim()) {
855
- const cleaned = raw.replace(/^```(?:json)?\n?/, "").replace(/\n?```\s*$/, "").trim();
856
- const parsed = JSON.parse(cleaned);
594
+ if (raw) {
595
+ const parsed = JSON.parse(raw);
857
596
  if (Array.isArray(parsed)) return parsed.filter((id: unknown) => typeof id === "string");
858
597
  }
859
598
  } catch {
860
- // Best effort — return empty (no archival) rather than corrupt state
599
+ // Try budget cloud fallback
600
+ const fallback = getBudgetCloudModel();
601
+ if (fallback && fallback !== model) {
602
+ try {
603
+ const raw = await untrackedChat({
604
+ model: fallback,
605
+ systemPrompt: SECTION_PRUNING_PROMPT,
606
+ userMessage,
607
+ timeout: 30_000,
608
+ maxTokens: 1024,
609
+ });
610
+ if (raw) {
611
+ const parsed = JSON.parse(raw);
612
+ if (Array.isArray(parsed)) return parsed.filter((id: unknown) => typeof id === "string");
613
+ }
614
+ } catch {
615
+ // Best effort
616
+ }
617
+ }
861
618
  }
862
619
 
863
620
  return [];
864
621
  }
622
+
623
+ // ---------------------------------------------------------------------------
624
+ // Legacy API shim — generateEpisode (subprocess-based) now delegates to direct
625
+ // ---------------------------------------------------------------------------
626
+
627
+ /**
628
+ * @deprecated Use generateEpisodeDirect instead. Kept for API compatibility.
629
+ */
630
+ export async function generateEpisode(
631
+ _cwd: string,
632
+ recentConversation: string,
633
+ config: MemoryConfig,
634
+ ): Promise<EpisodeOutput | null> {
635
+ return generateEpisodeDirect(recentConversation, config);
636
+ }