@byte5ai/palaia 2.1.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,740 @@
1
+ /**
2
+ * Auto-capture logic: LLM-based and rule-based extraction.
3
+ *
4
+ * Extracted from hooks.ts during Phase 1.5 decomposition.
5
+ * No logic changes — pure structural refactoring.
6
+ */
7
+
8
import { existsSync } from "node:fs";
import fs from "node:fs/promises";
import { createRequire } from "node:module";
import os from "node:os";
import path from "node:path";
import { fileURLToPath } from "node:url";
import type { RunnerOpts } from "../runner.js";
import { isValidScope } from "./state.js";
import { extractMessageTexts } from "./recall.js";
14
+
15
+ // ============================================================================
16
+ // Capture Hints (Issue #81)
17
+ // ============================================================================
18
+
19
/** Parsed palaia-hint tag attributes */
export interface PalaiaHint {
  /** Value of the `project="..."` attribute, copied verbatim. */
  project?: string;
  /** Value of the `scope="..."` attribute, copied verbatim (not validated here). */
  scope?: string;
  /** Value of the `type="..."` attribute, copied verbatim (not validated here). */
  type?: string;
  /** `tags="a, b"` split on commas, trimmed, with empty entries removed. */
  tags?: string[];
}

/**
 * Parse `<palaia-hint ... />` tags from text.
 *
 * Every self-closing `<palaia-hint>` tag contributes one entry to `hints`
 * (possibly an empty object when no known attribute is present) and is removed
 * from the returned text.
 *
 * @param text - Arbitrary text that may contain hint tags.
 * @returns The hints in document order and the trimmed text with tags removed.
 */
export function parsePalaiaHints(text: string): { hints: PalaiaHint[]; cleanedText: string } {
  // The attribute section is a run of double-quoted values or characters that
  // are neither a quote nor a slash. Consuming quoted values as a unit lets a
  // value contain "/" (e.g. project="acme/web") without ending the tag early.
  const tagPattern = /<palaia-hint\s+((?:"[^"]*"|[^"\/])*)\/>/gi;

  // The lookbehind stops `type` from matching inside `subtype` or `data-type`.
  const readAttr = (attrs: string, name: string): string | undefined => {
    const found = new RegExp(`(?<![\\w-])${name}\\s*=\\s*"([^"]*)"`, "i").exec(attrs);
    return found ? found[1] : undefined;
  };

  const hints: PalaiaHint[] = [];

  // One pass both collects the hints and strips the tags, so the two can never
  // disagree about what counts as a tag.
  const cleanedText = text
    .replace(tagPattern, (_tag: string, attrs: string) => {
      const hint: PalaiaHint = {};

      const project = readAttr(attrs, "project");
      if (project !== undefined) hint.project = project;

      const scope = readAttr(attrs, "scope");
      if (scope !== undefined) hint.scope = scope;

      const type = readAttr(attrs, "type");
      if (type !== undefined) hint.type = type;

      const tags = readAttr(attrs, "tags");
      if (tags !== undefined) {
        hint.tags = tags.split(",").map((t) => t.trim()).filter(Boolean);
      }

      hints.push(hint);
      return "";
    })
    .trim();

  return { hints, cleanedText };
}
58
+
59
+ // ============================================================================
60
+ // Project Cache (Issue #81)
61
+ // ============================================================================
62
+
63
/** A project entry as kept in the in-memory project cache. */
export interface CachedProject {
  /** Project name as reported by the CLI. */
  name: string;
  /** Optional free-text description of the project. */
  description?: string;
}

/** Last successfully loaded project list; `null` until the first successful load. */
let _cachedProjects: CachedProject[] | null = null;
/** `Date.now()` timestamp recorded with the cached list (0 = never loaded). */
let _projectCacheTime = 0;
/** How long a cached project list is served before being refreshed. */
const PROJECT_CACHE_TTL_MS = 60_000;

/** Reset project cache (for testing): forget both the list and its timestamp. */
export function resetProjectCache(): void {
  _projectCacheTime = 0;
  _cachedProjects = null;
}
77
+
78
/**
 * Load known projects from the CLI, with caching.
 *
 * A cached list younger than PROJECT_CACHE_TTL_MS is returned as-is. Otherwise
 * `project list` is run through the runner and the result replaces the cache.
 * Any failure (import or CLI) is swallowed: the previous cached list is
 * returned if there is one, else an empty array.
 *
 * @param opts - Runner options forwarded unchanged to `runJson`.
 * @returns The known projects (possibly stale or empty on failure).
 */
export async function loadProjects(opts: RunnerOpts): Promise<CachedProject[]> {
  const requestedAt = Date.now();
  const cacheAge = requestedAt - _projectCacheTime;
  if (_cachedProjects && cacheAge < PROJECT_CACHE_TTL_MS) {
    return _cachedProjects;
  }

  try {
    // Imported lazily, at call time rather than module load time.
    const runner = await import("../runner.js");
    const listing = await runner.runJson<{ projects: Array<{ name: string; description?: string }> }>(
      ["project", "list"],
      opts,
    );
    const fresh: CachedProject[] = (listing.projects || []).map(({ name, description }) => ({
      name,
      description,
    }));
    _cachedProjects = fresh;
    // Stamp with the time the request started, not the time it finished.
    _projectCacheTime = requestedAt;
    return fresh;
  } catch {
    // Best effort: serve stale data rather than failing the caller.
    return _cachedProjects || [];
  }
}
103
+
104
+ // ============================================================================
105
+ // LLM-based Extraction (Issue #64 upgrade)
106
+ // ============================================================================
107
+
108
/** One piece of knowledge produced by LLM-based extraction, after validation. */
export interface ExtractionResult {
  /** Trimmed, non-empty summary text. */
  content: string;
  /** Entry kind. */
  type: "memory" | "process" | "task";
  /** Significance tags attached to the entry. */
  tags: string[];
  /** Importance score for long-term recall. */
  significance: number;
  /** Project the entry belongs to, or null/absent when unknown. */
  project?: string | null;
  /** Visibility scope, or null/absent when not provided or invalid. */
  scope?: string | null;
}
117
+
118
/** Signature of the `runEmbeddedPiAgent` function loaded from OpenClaw's extensionAPI module. */
type RunEmbeddedPiAgentFn = {
  (params: Record<string, unknown>): Promise<unknown>;
};

/** Memoised loader promise; `null` until first requested or after a reset. */
let _embeddedPiAgentLoader: Promise<RunEmbeddedPiAgentFn> | null = null;

/** Whether the LLM import failure has already been logged (to avoid spam). */
let _llmImportFailureLogged = false;
123
+
124
/**
 * Resolve the path to OpenClaw's extensionAPI module.
 * Uses multiple strategies for portability across installation layouts:
 *
 * 1. Module resolution of `openclaw/dist/extensionAPI.js`.
 * 2. Module resolution of the `openclaw` main entry, then probing next to it
 *    (and in a `dist/` directory beside it).
 * 3. Walking up to six directory levels from this file looking for a sibling
 *    `openclaw/dist/extensionAPI.js`.
 * 4. A fixed list of well-known global install locations.
 *
 * @returns An absolute path to the module, or `null` when no strategy finds it.
 */
function resolveExtensionAPIPath(): string | null {
  // `require` only exists in CommonJS builds. In an ES module build, derive an
  // equivalent resolver from this module's URL so strategies 1 and 2 still run
  // instead of failing on a ReferenceError.
  let resolver: ReturnType<typeof createRequire> | null = null;
  try {
    resolver = typeof require === "function" ? require : createRequire(import.meta.url);
  } catch {
    resolver = null;
  }

  if (resolver) {
    // Strategy 1: resolve the file directly through the openclaw package
    try {
      return resolver.resolve("openclaw/dist/extensionAPI.js");
    } catch {
      // Not resolvable via standard module resolution
    }

    // Strategy 2: resolve openclaw's main entry, then look beside it
    try {
      const mainDir = path.dirname(resolver.resolve("openclaw"));
      const nearMain = [
        path.join(mainDir, "extensionAPI.js"), // main entry already inside dist/
        path.join(mainDir, "dist", "extensionAPI.js"), // main entry at package root
      ];
      for (const candidate of nearMain) {
        if (existsSync(candidate)) return candidate;
      }
    } catch {
      // openclaw not resolvable at all
    }
  }

  // Strategy 3: sibling in node_modules (plugin installed alongside openclaw)
  try {
    // fileURLToPath decodes percent-escapes and handles Windows drive letters,
    // which reading URL.pathname directly does not.
    let dir = typeof __dirname !== "undefined"
      ? __dirname
      : path.dirname(fileURLToPath(import.meta.url));
    // Walk up from plugin src/dist towards node_modules, then into openclaw
    for (let depth = 0; depth < 6; depth++) {
      const candidate = path.join(dir, "openclaw", "dist", "extensionAPI.js");
      if (existsSync(candidate)) return candidate;
      const parent = path.dirname(dir);
      if (parent === dir) break; // reached the filesystem root
      dir = parent;
    }
  } catch {
    // Traversal failed
  }

  // Strategy 4: well-known global install paths
  const globalCandidates = [
    path.join(os.homedir(), ".openclaw", "node_modules", "openclaw", "dist", "extensionAPI.js"),
    "/home/linuxbrew/.linuxbrew/lib/node_modules/openclaw/dist/extensionAPI.js",
    "/usr/local/lib/node_modules/openclaw/dist/extensionAPI.js",
    "/usr/lib/node_modules/openclaw/dist/extensionAPI.js",
  ];
  for (const candidate of globalCandidates) {
    if (existsSync(candidate)) return candidate;
  }

  return null;
}
178
+
179
/**
 * Locate and import OpenClaw's extensionAPI module and return its
 * `runEmbeddedPiAgent` export.
 *
 * @throws Error when the module cannot be located, or when it does not export
 *   a function named `runEmbeddedPiAgent`.
 */
async function loadRunEmbeddedPiAgent(): Promise<RunEmbeddedPiAgentFn> {
  const modulePath = resolveExtensionAPIPath();
  if (!modulePath) {
    throw new Error("Could not locate openclaw/dist/extensionAPI.js — tried module resolution, sibling lookup, and global paths");
  }

  const loaded: { runEmbeddedPiAgent?: unknown } = await import(modulePath);
  const candidate = loaded.runEmbeddedPiAgent;
  if (typeof candidate === "function") {
    return candidate as RunEmbeddedPiAgentFn;
  }
  throw new Error(`runEmbeddedPiAgent not exported from ${modulePath}`);
}
192
+
193
/**
 * Return the memoised loader promise for `runEmbeddedPiAgent`, starting the
 * load on first use. The same promise (resolved or rejected) is returned on
 * later calls until the loader is reset or overridden.
 */
export function getEmbeddedPiAgent(): Promise<RunEmbeddedPiAgentFn> {
  return (_embeddedPiAgentLoader ??= loadRunEmbeddedPiAgent());
}
199
+
200
/** Reset cached loader (for testing); also clears the import-failure-logged flag. */
export function resetEmbeddedPiAgentLoader(): void {
  _llmImportFailureLogged = false;
  _embeddedPiAgentLoader = null;
}

/** Override the cached loader with a custom promise (for testing); `null` clears it. */
export function setEmbeddedPiAgentLoader(loader: Promise<RunEmbeddedPiAgentFn> | null): void {
  _embeddedPiAgentLoader = loader;
}

/** Read the import-failure-logged flag (exposed for index.ts). */
export function getLlmImportFailureLogged(): boolean {
  return _llmImportFailureLogged;
}

/** Write the import-failure-logged flag (exposed for index.ts). */
export function setLlmImportFailureLogged(value: boolean): void {
  _llmImportFailureLogged = value;
}
218
+
219
/** Base instructions sent to the extraction model; the known-project list is appended separately. */
const EXTRACTION_SYSTEM_PROMPT_BASE = `You are a knowledge extraction engine. Analyze the following conversation exchange and identify information worth remembering long-term.

For each piece of knowledge, return a JSON array of objects:
- "content": concise summary of the knowledge (1-3 sentences)
- "type": "memory" (facts, decisions, preferences), "process" (workflows, procedures, steps), or "task" (action items, todos, commitments)
- "tags": array of significance tags from: ["decision", "lesson", "surprise", "commitment", "correction", "preference", "fact"]
- "significance": 0.0-1.0 how important this is for long-term recall
- "project": which project this belongs to (from known projects list, or null if unclear)
- "scope": "private" (personal preference, agent-specific), "team" (shared knowledge), or "public" (documentation)

WHAT TO CAPTURE (be thorough — capture anything worth remembering):
- Decisions and agreements ("we decided to...", "let's go with...", "agreed on...")
- Technical discoveries and debugging insights ("the root cause was...", "turns out the issue is...")
- Creative outcomes: naming decisions, design concepts, UX choices, brainstorming results, color schemes, architecture patterns
- User preferences and feedback ("I prefer...", "I like/don't like...", "always use...")
- Project context changes: scope changes, timeline shifts, requirement updates, priority changes
- Workflow patterns the user established ("my process is...", "I always do X before Y")

STRICT TASK CLASSIFICATION RULES — a "task" MUST have ALL three of:
1. A clear, completable action (not just an observation or idea)
2. An identifiable responsible party (explicitly named or unambiguously inferable from context)
3. A concrete deliverable or measurable end state
If ANY of these is missing, classify as "memory" instead of "task". When in doubt, use "memory".
Observations, learnings, insights, opinions, and general knowledge are ALWAYS "memory", never "task".

Only extract genuinely significant knowledge. Skip small talk, acknowledgments, routine exchanges.
Do NOT extract if similar knowledge was likely captured in a recent exchange. Prefer quality over quantity. Skip routine status updates and acknowledgments.
Return empty array [] if nothing is worth remembering.
Return ONLY valid JSON, no markdown fences.`;

/**
 * Build the extraction system prompt.
 *
 * @param projects - Known projects; when non-empty they are appended as a
 *   comma-separated "Known projects:" line, each as `name` or `name (description)`.
 * @returns The base prompt, optionally followed by the known-projects line.
 */
export function buildExtractionPrompt(projects: CachedProject[]): string {
  if (projects.length === 0) {
    return EXTRACTION_SYSTEM_PROMPT_BASE;
  }

  const labels: string[] = [];
  for (const project of projects) {
    const suffix = project.description ? ` (${project.description})` : "";
    labels.push(`${project.name}${suffix}`);
  }

  const projectList = labels.join(", ");
  return `${EXTRACTION_SYSTEM_PROMPT_BASE}\n\nKnown projects: ${projectList}`;
}
256
+
257
/** Set once the "using primary model for capture" hint has been logged (to avoid spam). */
let _captureModelFallbackWarned = false;

/** Set once the captureModel->primary model failover warning has been logged (max 1x per gateway lifetime). */
let _captureModelFailoverWarned = false;

/** Clear both capture-model warning flags (for testing). */
export function resetCaptureModelFallbackWarning(): void {
  _captureModelFailoverWarned = false;
  _captureModelFallbackWarned = false;
}
268
+
269
/** Module-level logger reference; defaults to the console until setLogger() replaces it. */
let logger: { info: (...args: any[]) => void; warn: (...args: any[]) => void } = {
  info(...args: any[]): void {
    console.log(...args);
  },
  warn(...args: any[]): void {
    console.warn(...args);
  },
};

/** Replace the module-level logger (called from hooks/index.ts). */
export function setLogger(l: { info: (...args: any[]) => void; warn: (...args: any[]) => void }): void {
  logger = l;
}
279
+
280
/** Read the captureModel failover-warned flag (exposed for index.ts). */
export function getCaptureModelFailoverWarned(): boolean {
  return _captureModelFailoverWarned;
}

/** Write the captureModel failover-warned flag (exposed for index.ts). */
export function setCaptureModelFailoverWarned(value: boolean): void {
  _captureModelFailoverWarned = value;
}
287
+
288
/**
 * Heuristic patterns to identify cheap/small models from their name.
 * Avoids hardcoded model IDs that go stale — instead matches naming
 * conventions that providers consistently use for their smaller tiers.
 *
 * Patterns are matched as whole name segments (separated by `-`, `.`, `/`,
 * `_`, space, `:`, `@`, or string edges) to avoid false positives like
 * "ge*mini*" matching "mini".
 */
const CHEAP_MODEL_PATTERNS = [
  "haiku", // Anthropic's smallest tier
  "flash", // Google's smallest tier
  "mini", // OpenAI's smallest tier (gpt-4o-mini, gpt-4.1-mini, ...)
  "small", // Mistral's smallest tier
  "nano", // Future small models
  "lite", // Various providers
  "instant", // Anthropic legacy
];

/**
 * Check if a model name matches any cheap pattern (segment-aware, case-insensitive).
 *
 * @param modelName - Model identifier, with or without a provider prefix.
 * @returns true when any segment of the name equals one of CHEAP_MODEL_PATTERNS.
 */
function isCheapModel(modelName: string): boolean {
  // `:` and `@` are included so variant/version suffixes such as
  // "claude-3-haiku:beta" or "claude-3-5-haiku@20241022" still expose the
  // tier name as its own segment.
  const segments = modelName.toLowerCase().split(/[-./_ :@]/);
  return CHEAP_MODEL_PATTERNS.some((pattern) => segments.includes(pattern));
}
313
+
314
/**
 * Resolve the model to use for LLM-based capture extraction.
 *
 * Strategy (prefer cheapest available model for cost efficiency):
 * 1. If captureModel is set explicitly (e.g. "anthropic/claude-haiku-4-5"): use it directly.
 *    A captureModel without a provider prefix borrows the provider of the
 *    primary model, provided the primary model is itself "provider/model".
 * 2. If captureModel is "cheap" or unset: use the primary model if its name
 *    already looks cheap; otherwise pick a cheap model of the same provider
 *    from the model list exposed on the config, if any.
 * 3. If no cheap model is found, fall back to the primary model with a
 *    one-time warning.
 *
 * @param config - Gateway config; reads `agents.defaults.model` (a string or
 *   an object with `primary`), `runtime.availableModels` and `models`.
 * @param captureModel - Optional plugin setting: a model ID or "cheap".
 * @returns The provider/model pair, or undefined when no provider-qualified
 *   model can be determined.
 */
export function resolveCaptureModel(
  config: any,
  captureModel?: string,
): { provider: string; model: string } | undefined {
  // Normalise the primary model once. String() guards against a non-string
  // `primary` value, which would otherwise throw on `.trim()`.
  const defaultsModel = config?.agents?.defaults?.model;
  const rawPrimary = typeof defaultsModel === "string"
    ? defaultsModel
    : (typeof defaultsModel === "object" && defaultsModel !== null ? defaultsModel.primary : undefined);
  const primary = String(rawPrimary ?? "").trim();
  const primaryParts = primary.split("/");
  const primaryHasProvider = primaryParts.length >= 2;

  // Case 1: explicit model ID provided (not "cheap")
  if (captureModel && captureModel !== "cheap") {
    const parts = captureModel.split("/");
    if (parts.length >= 2) {
      return { provider: parts[0], model: parts.slice(1).join("/") };
    }
    // No slash — borrow the provider from the primary model. Only a
    // "provider/model" primary carries a provider; a bare model name must not
    // be mistaken for one.
    if (primaryHasProvider && primaryParts[0]) {
      return { provider: primaryParts[0], model: captureModel };
    }
  }

  // Case 2: "cheap" or unset (or Case 1 could not find a provider)
  if (!primaryHasProvider) {
    return undefined;
  }

  const provider = primaryParts[0];
  const modelName = primaryParts.slice(1).join("/");

  // The primary model is already cheap (e.g., user runs haiku as main)
  if (isCheapModel(modelName)) {
    return { provider, model: modelName };
  }

  // Primary is expensive — look for a cheap model of the same provider among
  // the models listed on the config. Only array values are usable here; any
  // other shape is ignored rather than crashing on `.find`.
  const runtimeModels: unknown[] =
    [config?.runtime?.availableModels, config?.models].find((v) => Array.isArray(v)) ?? [];
  const providerPrefix = `${provider.toLowerCase()}/`;
  const cheapFromRuntime = runtimeModels.find(
    (m): m is string =>
      typeof m === "string" && m.toLowerCase().startsWith(providerPrefix) && isCheapModel(m),
  );
  if (cheapFromRuntime) {
    const rParts = cheapFromRuntime.split("/");
    return { provider: rParts[0], model: rParts.slice(1).join("/") };
  }

  // No cheap alternative found — use primary model with one-time hint
  if (!_captureModelFallbackWarned) {
    _captureModelFallbackWarned = true;
    logger.warn(`[palaia] Using primary model for capture extraction. Set captureModel in plugin config (e.g. "anthropic/claude-haiku-4-5") for cost savings.`);
  }
  return { provider, model: modelName };
}
390
+
391
/**
 * Remove a surrounding Markdown code fence (optionally tagged `json`) from a
 * string. The fence must span the entire trimmed input; otherwise the trimmed
 * input is returned unchanged.
 */
function stripCodeFences(s: string): string {
  const body = s.trim();
  const fenced = /^```(?:json)?\s*([\s\S]*?)\s*```$/i.exec(body);
  return fenced === null ? body : (fenced[1] ?? "").trim();
}
397
+
398
/**
 * Join the text of all non-error payloads with newlines and trim the result.
 * Payloads flagged `isError` or lacking a string `text` are skipped; a missing
 * payload list yields an empty string.
 */
function collectText(payloads: Array<{ text?: string; isError?: boolean }> | undefined): string {
  const pieces: string[] = [];
  for (const payload of payloads ?? []) {
    if (payload.isError || typeof payload.text !== "string") continue;
    pieces.push(payload.text);
  }
  return pieces.join("\n").trim();
}
405
+
406
/**
 * Trim message texts to a recent window for LLM extraction.
 * Only extract from recent exchanges — full history causes LLM timeouts
 * and dilutes extraction quality.
 *
 * Strategy: keep last N user+assistant pairs (skip toolResult roles),
 * then hard-cap at maxChars from the end (newest messages kept). If the one
 * remaining message is still larger than the cap, its text is cut down to its
 * tail so the cap is honoured even for a single oversized message.
 *
 * @param texts - Messages in chronological order.
 * @param maxPairs - Maximum number of user→assistant pairs to keep.
 * @param maxChars - Character budget; each message costs text + role + 5.
 * @returns A new array; input objects are never mutated.
 */
export function trimToRecentExchanges(
  texts: Array<{ role: string; text: string; provenance?: string }>,
  maxPairs = 5,
  maxChars = 10_000,
): Array<{ role: string; text: string; provenance?: string }> {
  // Per-message overhead added to text + role length when budgeting.
  const FRAMING_CHARS = 5;
  const costOf = (t: { role: string; text: string }): number =>
    t.text.length + t.role.length + FRAMING_CHARS;

  // Filter to only user + assistant messages (skip tool, toolResult, system, etc.)
  const exchanges = texts.filter((t) => t.role === "user" || t.role === "assistant");

  // Walk backwards counting pairs. Only external_user messages (or messages
  // without provenance, for backward compatibility) count as real user turns;
  // system-injected user messages do not start a pair.
  let pairCount = 0;
  let lastRole = "";
  let cutIndex = 0; // default: keep everything
  for (let i = exchanges.length - 1; i >= 0; i--) {
    const current = exchanges[i];
    const isRealUser = current.role === "user" &&
      (current.provenance === "external_user" || !current.provenance);
    // A real user message that precedes an assistant message opens a pair
    if (isRealUser && lastRole === "assistant") {
      pairCount++;
      if (pairCount > maxPairs) {
        cutIndex = i + 1; // keep from next message onwards
        break;
      }
    }
    lastRole = current.role;
  }

  // Hard cap: drop oldest messages until the budget fits, always keeping at
  // least the newest one. An index is advanced instead of repeated shift().
  let start = cutIndex;
  let totalChars = 0;
  for (let i = start; i < exchanges.length; i++) totalChars += costOf(exchanges[i]);
  while (totalChars > maxChars && start < exchanges.length - 1) {
    totalChars -= costOf(exchanges[start]);
    start++;
  }
  const trimmed = exchanges.slice(start);

  // A single message can still exceed the cap; keep only its tail.
  if (trimmed.length === 1 && totalChars > maxChars) {
    const only = trimmed[0];
    const budget = Math.max(0, maxChars - only.role.length - FRAMING_CHARS);
    // slice(-0) would return the whole string, hence the explicit guard.
    trimmed[0] = { ...only, text: budget > 0 ? only.text.slice(-budget) : "" };
  }

  return trimmed;
}
457
+
458
/** Entry types accepted from the LLM; anything else becomes "memory". */
const EXTRACTION_VALID_TYPES: ReadonlySet<string> = new Set(["memory", "process", "task"]);

/** Significance tags accepted from the LLM; unknown tags are dropped. */
const EXTRACTION_VALID_TAGS: ReadonlySet<string> = new Set([
  "decision", "lesson", "surprise", "commitment",
  "correction", "preference", "fact",
]);

/** Extraction temp dirs older than this are treated as leftovers and removed. */
const STALE_EXTRACTION_DIR_MS = 5 * 60 * 1000;

/** Best-effort removal of stale extraction temp dirs under baseDir; never throws. */
async function cleanupStaleExtractionDirs(baseDir: string): Promise<void> {
  try {
    const entries = await fs.readdir(baseDir, { withFileTypes: true });
    const now = Date.now();
    for (const entry of entries) {
      if (!entry.isDirectory()) continue;
      const dirPath = path.join(baseDir, entry.name);
      try {
        const stat = await fs.stat(dirPath);
        if (now - stat.mtimeMs > STALE_EXTRACTION_DIR_MS) {
          await fs.rm(dirPath, { recursive: true, force: true });
        }
      } catch { /* ignore individual cleanup errors */ }
    }
  } catch { /* ignore cleanup errors */ }
}

/** Validate one raw LLM item; returns null when it has no usable content. */
function normalizeExtractionItem(item: unknown): ExtractionResult | null {
  if (!item || typeof item !== "object") return null;
  const raw = item as Record<string, unknown>;

  const content = typeof raw.content === "string" ? raw.content.trim() : "";
  if (!content) return null;

  const type = (
    typeof raw.type === "string" && EXTRACTION_VALID_TYPES.has(raw.type) ? raw.type : "memory"
  ) as ExtractionResult["type"];

  const tags = Array.isArray(raw.tags)
    ? raw.tags.filter((t: unknown): t is string => typeof t === "string" && EXTRACTION_VALID_TAGS.has(t))
    : [];

  // Clamp to [0, 1]; a missing or non-numeric score defaults to the midpoint.
  const significance = typeof raw.significance === "number"
    ? Math.max(0, Math.min(1, raw.significance))
    : 0.5;

  const project = typeof raw.project === "string" && raw.project.trim()
    ? raw.project.trim()
    : null;

  const scope = typeof raw.scope === "string" && isValidScope(raw.scope)
    ? raw.scope
    : null;

  return { content, type, tags, significance, project, scope };
}

/**
 * Extract long-term knowledge from a conversation using the embedded agent.
 *
 * Steps: load the agent runner, resolve the capture model, reduce the messages
 * to a recent window (with injected recall context stripped from user
 * messages), run a single tool-less prompt, then parse and validate the JSON
 * array the model returns.
 *
 * @param messages - Raw conversation messages, passed to extractMessageTexts.
 * @param config - Gateway config, forwarded to the agent and used for model resolution.
 * @param pluginConfig - Optional plugin settings; only `captureModel` is read.
 * @param knownProjects - Projects listed in the prompt so the model can assign one.
 * @returns Validated extraction results; empty when there is nothing to analyse
 *   or the model returned no text.
 * @throws Error when the agent runner cannot be loaded, no model can be
 *   resolved, or the model output is not a JSON array.
 */
export async function extractWithLLM(
  messages: unknown[],
  config: any,
  pluginConfig?: { captureModel?: string },
  knownProjects?: CachedProject[],
): Promise<ExtractionResult[]> {
  const runEmbeddedPiAgent = await getEmbeddedPiAgent();

  const resolved = resolveCaptureModel(config, pluginConfig?.captureModel);
  if (!resolved) {
    throw new Error("No model available for LLM extraction");
  }

  const allTexts = extractMessageTexts(messages);
  // Strip Palaia-injected recall context from user messages to prevent feedback loop
  const cleanedTexts = allTexts.map((t) =>
    t.role === "user"
      ? { ...t, text: stripPalaiaInjectedContext(t.text) }
      : t,
  );
  // Only extract from recent exchanges — full history causes LLM timeouts
  // and dilutes extraction quality
  const recentTexts = trimToRecentExchanges(cleanedTexts);
  const exchangeText = recentTexts
    .map((t) => `[${t.role}]: ${t.text}`)
    .join("\n");

  if (!exchangeText.trim()) {
    return [];
  }

  const systemPrompt = buildExtractionPrompt(knownProjects || []);
  const prompt = `${systemPrompt}\n\n--- CONVERSATION ---\n${exchangeText}\n--- END ---`;

  let tmpDir: string | null = null;
  try {
    // Use a fixed base directory for extraction temp dirs and clean up stale ones
    const extractBaseDir = path.join(os.tmpdir(), "palaia-extractions");
    await fs.mkdir(extractBaseDir, { recursive: true });
    await cleanupStaleExtractionDirs(extractBaseDir);
    tmpDir = await fs.mkdtemp(path.join(extractBaseDir, "ext-"));

    // The mkdtemp suffix is unique per call, so including it keeps the session
    // and run IDs distinct even for extractions started in the same millisecond.
    const runTag = `palaia-extract-${Date.now()}-${path.basename(tmpDir)}`;

    const result = await runEmbeddedPiAgent({
      sessionId: runTag,
      sessionFile: path.join(tmpDir, "session.json"),
      workspaceDir: config?.agents?.defaults?.workspace ?? process.cwd(),
      config,
      prompt,
      timeoutMs: 15_000,
      runId: runTag,
      provider: resolved.provider,
      model: resolved.model,
      disableTools: true,
      streamParams: { maxTokens: 2048 },
    });

    const text = collectText((result as any).payloads);
    if (!text) return [];

    const raw = stripCodeFences(text);
    let parsed: unknown;
    try {
      parsed = JSON.parse(raw);
    } catch {
      throw new Error(`LLM returned invalid JSON: ${raw.slice(0, 200)}`);
    }

    if (!Array.isArray(parsed)) {
      throw new Error(`LLM returned non-array: ${typeof parsed}`);
    }

    const results: ExtractionResult[] = [];
    for (const item of parsed) {
      const normalized = normalizeExtractionItem(item);
      if (normalized) results.push(normalized);
    }
    return results;
  } finally {
    // Always remove this call's temp dir, whether extraction succeeded or not.
    if (tmpDir) {
      try { await fs.rm(tmpDir, { recursive: true, force: true }); } catch { /* ignore */ }
    }
  }
}
584
+
585
+ // ============================================================================
586
+ // Auto-Capture: Rule-based extraction (Issue #64)
587
+ // ============================================================================
588
+
589
/** Lower-case one-word replies (English/German, plus thumbs emoji) that carry no capturable content. */
const TRIVIAL_RESPONSES = new Set([
  // affirmations / negations
  "ok", "ja", "nein", "yes", "no", "sure", "klar",
  // thanks
  "danke", "thanks", "thx",
  // short acknowledgements
  "k", "\u{1f44d}", "\u{1f44e}", "ack", "nope", "yep", "yup", "alright",
  "fine", "gut", "passt", "okay",
  // fillers
  "hmm", "hm", "ah", "aha",
]);
594
+
595
/**
 * Rule table for rule-based capture: when `pattern` matches the exchange text,
 * the rule contributes its `tag` and votes for its entry `type`.
 * Written as [pattern, tag, type] rows and expanded into objects below.
 */
const SIGNIFICANCE_RULES: Array<{
  pattern: RegExp;
  tag: string;
  type: "memory" | "process" | "task";
}> = (
  [
    // Decisions and agreements
    [/(?:we decided|entschieden|decision:|beschlossen|let'?s go with|wir nehmen|agreed on)/i, "decision", "memory"],
    [/(?:will use|werden nutzen|going forward|ab jetzt|from now on)/i, "decision", "memory"],
    // Creative outcomes: naming, design, UX, brainstorming
    [/(?:(?:we|i|let'?s)\s+(?:decided to |chose to )?(?:name|call|rename)\s+(?:it|this|the))/i, "decision", "memory"],
    [/(?:the design should|design concept|color scheme|colour scheme|UX flow|user flow|wireframe|mockup)/i, "decision", "memory"],
    [/(?:brainstorming results|brainstorm(?:ed)?|ideas?\s+(?:for|are)|naming convention)/i, "decision", "memory"],
    [/(?:the (?:logo|icon|brand|theme|layout|style)\s+(?:should|will|is))/i, "decision", "memory"],
    // Technical discoveries and debugging insights
    [/(?:root cause|the (?:issue|bug|problem) (?:was|is)|figured out|the fix (?:is|was)|debugging showed)/i, "lesson", "memory"],
    [/(?:learned|gelernt|lesson:|erkenntnis|takeaway|insight|turns out|it seems)/i, "lesson", "memory"],
    [/(?:mistake was|fehler war|should have|hätten sollen|next time)/i, "lesson", "memory"],
    // Surprises
    [/(?:surprising|überraschend|unexpected|unerwartet|didn'?t expect|nicht erwartet|plot twist)/i, "surprise", "memory"],
    // Commitments and tasks
    [/(?:i will|ich werde|todo:|action item|must do|muss noch|need to|commit to|verspreche)/i, "commitment", "task"],
    [/(?:deadline|frist|due date|bis zum|by end of|spätestens)/i, "commitment", "task"],
    // Processes and workflows
    [/(?:the process is|der prozess|steps?:|workflow:|how to|anleitung|recipe:|checklist)/i, "process", "process"],
    [/(?:first,?\s.*then|schritt \d|step \d|1\.\s.*2\.\s)/i, "process", "process"],
    // User preferences and feedback
    [/(?:(?:i|the user)\s+prefer|i (?:like|don'?t like|always use|never use)|my (?:go-to|default|standard))/i, "preference", "memory"],
    [/(?:bevorzug|mag ich|immer nutzen|standard(?:mäßig)?(?:\s+ist|\s+nutze))/i, "preference", "memory"],
    // Project context changes
    [/(?:scope (?:change|creep|update)|timeline (?:shift|change|update)|requirement(?:s)? (?:change|update)|priority (?:change|shift))/i, "decision", "memory"],
    [/(?:pivot(?:ed|ing)?|(?:re)?scoped|descoped|deferred|postponed|moved to (?:next|later))/i, "decision", "memory"],
  ] as Array<[RegExp, string, "memory" | "process" | "task"]>
).map(([pattern, tag, type]) => ({ pattern, tag, type }));
627
+
628
/** Signatures of tool/test/build output that should not be captured as knowledge. */
const NOISE_PATTERNS: RegExp[] = [
  // test-runner status with percentage
  /(?:PASSED|FAILED|ERROR)\s+\[?\d+%\]?/i,
  // test node IDs such as tests/foo.py::
  /(?:test_\w+|tests?\/\w+\.(?:py|ts|js))\s*::/,
  // test-runner invocations
  /(?:pytest|vitest|jest|mocha)\s+(?:run|--)/i,
  // result summaries ("3 passed, 1 failed")
  /\d+ passed,?\s*\d* (?:failed|error|warning)/i,
  // pytest section banners
  /^(?:=+\s*(?:test session|ERRORS|FAILURES|short test summary))/m,
  // Python tracebacks
  /(?:Traceback \(most recent call last\)|^\s+File ".*", line \d+)/m,
  // JavaScript stack frames
  /^\s+at\s+\S+\s+\(.*:\d+:\d+\)/m,
  // lines consisting only of a deep absolute path
  /^(?:\/[\w/.-]+){3,}\s*$/m,
  // package-manager / build tool output
  /(?:npm\s+(?:ERR|WARN)|pip\s+install|cargo\s+build)/i,
  // compiler-style diagnostics
  /^(?:warning|error)\[?\w*\]?:\s/m,
];

/**
 * Decide whether text looks like machine output rather than conversation.
 *
 * @returns true when at least two NOISE_PATTERNS match, or when the text has
 *   more than three non-blank lines and over half of them start with an
 *   absolute path of two or more segments.
 */
export function isNoiseContent(text: string): boolean {
  const noiseHits = NOISE_PATTERNS.reduce((count, re) => (re.test(text) ? count + 1 : count), 0);
  if (noiseHits >= 2) return true;

  const nonBlank = text.split("\n").filter((line) => line.trim().length > 0);
  if (nonBlank.length <= 3) return false;

  let pathLike = 0;
  for (const line of nonBlank) {
    if (/^\s*(?:\/[\w/.-]+){2,}/.test(line.trim())) pathLike++;
  }
  // Integer form of "pathLike / nonBlank.length > 0.5".
  return pathLike * 2 > nonBlank.length;
}
658
+
659
/**
 * Cheap pre-filter deciding whether an exchange is worth running capture on.
 *
 * Rejects text that is shorter than `minChars` after trimming, consists of at
 * most three trivial reply words, contains a `<relevant-memories>` block,
 * looks like markup (starts with "<" and contains "</"), or is classified as
 * noise by isNoiseContent.
 *
 * @param exchangeText - Text of the exchange.
 * @param minChars - Minimum trimmed length required to attempt capture.
 */
export function shouldAttemptCapture(
  exchangeText: string,
  minChars = 100,
): boolean {
  const body = exchangeText.trim();
  if (body.length < minChars) return false;

  const tokens = body.toLowerCase().split(/\s+/);
  const onlyTrivialWords = tokens.length <= 3 && tokens.every((token) => TRIVIAL_RESPONSES.has(token));
  if (onlyTrivialWords) return false;

  const hasRecallBlock = body.includes("<relevant-memories>");
  const looksLikeMarkup = body.startsWith("<") && body.includes("</");
  if (hasRecallBlock || looksLikeMarkup) return false;

  return !isNoiseContent(body);
}
679
+
680
/**
 * Rule-based significance extraction.
 *
 * Runs every SIGNIFICANCE_RULES pattern against the text. Capture is only
 * attempted when at least two different tags matched. The entry type is the
 * highest-priority type among matched rules (task > process > memory). The
 * summary is built from up to three sentences (21–499 chars each), preferring
 * sentences that themselves match a rule, joined with ". " and cut to 500 chars.
 *
 * @returns Tags, type and summary, or null when the text is not significant
 *   enough or no summary could be built.
 */
export function extractSignificance(
  exchangeText: string,
): { tags: string[]; type: "memory" | "process" | "task"; summary: string } | null {
  const hits = SIGNIFICANCE_RULES.filter((rule) => rule.pattern.test(exchangeText));

  // Require at least 2 different significance tags for rule-based capture
  // (this also covers the "nothing matched" case).
  const tags = [...new Set(hits.map((hit) => hit.tag))];
  if (tags.length < 2) return null;

  const typePriority: Record<string, number> = { task: 3, process: 2, memory: 1 };
  let primaryType: "memory" | "process" | "task" = "memory";
  for (const hit of hits) {
    if (typePriority[hit.type] > typePriority[primaryType]) {
      primaryType = hit.type;
    }
  }

  const sentences: string[] = [];
  for (const piece of exchangeText.split(/[.!?\n]+/)) {
    const sentence = piece.trim();
    if (sentence.length > 20 && sentence.length < 500) sentences.push(sentence);
  }

  const relevantSentences = sentences.filter((sentence) =>
    SIGNIFICANCE_RULES.some((rule) => rule.pattern.test(sentence)),
  );

  const source = relevantSentences.length > 0 ? relevantSentences : sentences;
  const summary = source.slice(0, 3).join(". ").slice(0, 500);

  return summary ? { tags, type: primaryType, summary } : null;
}
723
+
724
/**
 * Strip Palaia-injected recall context from message text.
 * The recall block is prepended to user messages by before_prompt_build via prependContext.
 * OpenClaw merges it into the user message, so agent_end sees it as user content.
 * Without stripping, auto-capture re-captures the injected memories -> feedback loop.
 *
 * The block has a stable structure:
 * - Starts with "## Active Memory (Palaia)"
 * - Contains [t/m], [t/pr], [t/tk] prefixed entries
 * - Ends with "[palaia] auto-capture=on..." nudge line
 *
 * Every such block in the text is removed, not only the first, so text that
 * carries more than one injected block cannot leak recalled memories back into
 * capture.
 *
 * @param text - Message text, possibly containing injected recall blocks.
 * @returns The text without recall blocks, trimmed.
 */
export function stripPalaiaInjectedContext(text: string): string {
  // From the heading through the first following "[palaia]" line, plus any
  // trailing newlines. The match is lazy so each block ends at its own nudge
  // line, and global so repeated blocks are all removed.
  const PALAIA_BLOCK_RE = /## Active Memory \(Palaia\)[\s\S]*?\[palaia\][^\n]*\n*/g;
  return text.replace(PALAIA_BLOCK_RE, '').trim();
}