@rubytech/taskmaster 1.0.111 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,8 +6,8 @@
6
6
  <title>Taskmaster Control</title>
7
7
  <meta name="color-scheme" content="dark light" />
8
8
  <link rel="icon" type="image/png" href="./favicon.png" />
9
- <script type="module" crossorigin src="./assets/index-Cp_azZBu.js"></script>
10
- <link rel="stylesheet" crossorigin href="./assets/index-BM3zZtpB.css">
9
+ <script type="module" crossorigin src="./assets/index-D7ZHRWnP.js"></script>
10
+ <link rel="stylesheet" crossorigin href="./assets/index-CfybK7_N.css">
11
11
  </head>
12
12
  <body>
13
13
  <taskmaster-app></taskmaster-app>
@@ -13,6 +13,8 @@ export const SessionsTranscriptEntrySchema = Type.Object({
13
13
  Type.Literal("user"),
14
14
  Type.Literal("assistant"),
15
15
  Type.Literal("tool"),
16
+ Type.Literal("tool_call"),
17
+ Type.Literal("tool_result"),
16
18
  Type.Literal("thinking"),
17
19
  Type.Literal("error"),
18
20
  Type.Literal("system"),
@@ -20,6 +22,7 @@ export const SessionsTranscriptEntrySchema = Type.Object({
20
22
  content: Type.String(),
21
23
  model: Type.Optional(Type.String()),
22
24
  toolName: Type.Optional(Type.String()),
25
+ toolCallId: Type.Optional(Type.String()),
23
26
  meta: Type.Optional(Type.Record(Type.String(), Type.Unknown())),
24
27
  });
25
28
  export const SessionsTranscriptResultSchema = Type.Object({
@@ -46,17 +46,20 @@ export function createChatRunState() {
46
46
  const buffers = new Map();
47
47
  const deltaSentAt = new Map();
48
48
  const abortedRuns = new Map();
49
+ const finalHadContent = new Map();
49
50
  const clear = () => {
50
51
  registry.clear();
51
52
  buffers.clear();
52
53
  deltaSentAt.clear();
53
54
  abortedRuns.clear();
55
+ finalHadContent.clear();
54
56
  };
55
57
  return {
56
58
  registry,
57
59
  buffers,
58
60
  deltaSentAt,
59
61
  abortedRuns,
62
+ finalHadContent,
60
63
  clear,
61
64
  };
62
65
  }
@@ -91,6 +94,9 @@ export function createAgentEventHandler({ broadcast, nodeSendToSession, agentRun
91
94
  chatRunState.deltaSentAt.delete(clientRunId);
92
95
  // Strip silent reply token so it never reaches the chat UI
93
96
  const text = isSilentReplyText(rawText) ? "" : rawText;
97
+ // Record whether the streaming buffer had content so the chat.send .then()
98
+ // handler knows whether it needs to broadcast the dispatcher's final reply.
99
+ chatRunState.finalHadContent.set(clientRunId, !!text);
94
100
  if (jobState === "done") {
95
101
  const payload = {
96
102
  runId: clientRunId,
@@ -72,6 +72,14 @@ export function createGatewayCloseHandler(params) {
72
72
  /* ignore */
73
73
  }
74
74
  }
75
+ if (params.infraAlertUnsub) {
76
+ try {
77
+ params.infraAlertUnsub();
78
+ }
79
+ catch {
80
+ /* ignore */
81
+ }
82
+ }
75
83
  params.chatRunState.clear();
76
84
  for (const c of params.clients) {
77
85
  try {
@@ -604,6 +604,52 @@ export const chatHandlers = {
604
604
  message,
605
605
  });
606
606
  }
607
+ else if (finalReplyParts.length > 0) {
608
+ // Agent started but the reply came through the dispatcher, not
609
+ // streaming (e.g. auth error, billing error, context overflow).
610
+ // emitChatFinal already broadcast a "final" — check whether it
611
+ // had content. If the streaming buffer was empty, the dispatcher's
612
+ // reply was the only response and needs to be persisted + broadcast.
613
+ const agentStreamedContent = context.chatFinalHadContent.get(clientRunId) ?? false;
614
+ if (!agentStreamedContent) {
615
+ const combinedReply = finalReplyParts
616
+ .map((part) => part.trim())
617
+ .filter(Boolean)
618
+ .join("\n\n")
619
+ .trim();
620
+ if (combinedReply) {
621
+ const { storePath: latestStorePath, entry: latestEntry } = loadSessionEntry(p.sessionKey);
622
+ const sessionId = latestEntry?.sessionId ?? entry?.sessionId ?? clientRunId;
623
+ const appended = appendAssistantTranscriptMessage({
624
+ message: combinedReply,
625
+ sessionId,
626
+ storePath: latestStorePath,
627
+ sessionFile: latestEntry?.sessionFile,
628
+ createIfMissing: true,
629
+ });
630
+ let message;
631
+ if (appended.ok) {
632
+ message = appended.message;
633
+ }
634
+ else {
635
+ context.logGateway.warn(`webchat transcript append (fallback) failed: ${appended.error ?? "unknown error"}`);
636
+ message = {
637
+ role: "assistant",
638
+ content: [{ type: "text", text: combinedReply }],
639
+ timestamp: Date.now(),
640
+ stopReason: "injected",
641
+ usage: { input: 0, output: 0, totalTokens: 0 },
642
+ };
643
+ }
644
+ broadcastChatFinal({
645
+ context,
646
+ runId: clientRunId,
647
+ sessionKey: p.sessionKey,
648
+ message,
649
+ });
650
+ }
651
+ }
652
+ }
607
653
  // Fire message:outbound hook for conversation archiving
608
654
  const outboundText = finalReplyParts.join("\n\n").trim();
609
655
  if (outboundText) {
@@ -644,6 +690,7 @@ export const chatHandlers = {
644
690
  })
645
691
  .finally(() => {
646
692
  context.chatAbortControllers.delete(clientRunId);
693
+ context.chatFinalHadContent.delete(clientRunId);
647
694
  });
648
695
  }
649
696
  catch (err) {
@@ -30,6 +30,28 @@ function extractTextFromContentBlocks(blocks) {
30
30
  }
31
31
  return parts.join("\n");
32
32
  }
33
+ /** Format tool input as readable key=value pairs instead of raw JSON. */
34
+ function formatToolInput(input) {
35
+ if (input == null)
36
+ return "";
37
+ if (typeof input === "string")
38
+ return input;
39
+ if (typeof input !== "object" || Array.isArray(input))
40
+ return JSON.stringify(input);
41
+ const obj = input;
42
+ const parts = [];
43
+ for (const [key, val] of Object.entries(obj)) {
44
+ if (val === undefined)
45
+ continue;
46
+ const valStr = typeof val === "string"
47
+ ? val.length > 200
48
+ ? `"${val.slice(0, 200)}..."`
49
+ : `"${val}"`
50
+ : JSON.stringify(val);
51
+ parts.push(`${key}: ${valStr}`);
52
+ }
53
+ return parts.join("\n");
54
+ }
33
55
  function expandLineToEntries(line, sessionId, sessionKey, agentId, fileMtimeMs) {
34
56
  const entries = [];
35
57
  const ts = resolveTimestamp(line, fileMtimeMs);
@@ -64,9 +86,10 @@ function expandLineToEntries(line, sessionId, sessionKey, agentId, fileMtimeMs)
64
86
  sessionKey,
65
87
  agentId,
66
88
  timestamp: ts,
67
- type: "tool",
89
+ type: "tool_result",
68
90
  content,
69
91
  ...(line.toolName ? { toolName: line.toolName } : {}),
92
+ ...(line.toolCallId ? { toolCallId: line.toolCallId } : {}),
70
93
  ...(model ? { model } : {}),
71
94
  });
72
95
  return entries;
@@ -74,6 +97,34 @@ function expandLineToEntries(line, sessionId, sessionKey, agentId, fileMtimeMs)
74
97
  if (line.type === "message" && line.message) {
75
98
  const msg = line.message;
76
99
  const contentBlocks = Array.isArray(msg.content) ? msg.content : [];
100
+ // Handle toolResult messages — these are separate JSONL lines from the SDK
101
+ // with role: "toolResult", toolCallId, toolName, and content blocks.
102
+ if (msg.role === "toolResult") {
103
+ const textParts = [];
104
+ for (const block of contentBlocks) {
105
+ if (block && typeof block === "object" && typeof block.text === "string" && block.text.trim()) {
106
+ textParts.push(block.text.trim());
107
+ }
108
+ }
109
+ if (typeof msg.content === "string" && msg.content.trim()) {
110
+ textParts.push(msg.content.trim());
111
+ }
112
+ const content = textParts.length > 0 ? textParts.join("\n") : "(empty result)";
113
+ const toolName = typeof msg.toolName === "string" ? msg.toolName : undefined;
114
+ const toolCallId = typeof msg.toolCallId === "string" ? msg.toolCallId : undefined;
115
+ entries.push({
116
+ sessionId,
117
+ sessionKey,
118
+ agentId,
119
+ timestamp: ts,
120
+ type: "tool_result",
121
+ content: msg.isError ? `[error] ${content}` : content,
122
+ ...(toolName ? { toolName } : {}),
123
+ ...(toolCallId ? { toolCallId } : {}),
124
+ ...(model ? { model } : {}),
125
+ });
126
+ return entries;
127
+ }
77
128
  // If content is a simple string (not blocks), treat as a single text entry
78
129
  if (typeof msg.content === "string") {
79
130
  const role = msg.role;
@@ -111,15 +162,17 @@ function expandLineToEntries(line, sessionId, sessionKey, agentId, fileMtimeMs)
111
162
  else if (blockType === "tool_use" || blockType === "toolCall") {
112
163
  const toolName = typeof block.name === "string" ? block.name : undefined;
113
164
  const input = blockType === "toolCall" ? block.arguments : block.input;
114
- const content = input != null ? JSON.stringify(input) : "";
165
+ const content = formatToolInput(input);
166
+ const blockId = typeof block.id === "string" ? block.id : undefined;
115
167
  entries.push({
116
168
  sessionId,
117
169
  sessionKey,
118
170
  agentId,
119
171
  timestamp: ts,
120
- type: "tool",
172
+ type: "tool_call",
121
173
  content,
122
174
  ...(toolName ? { toolName } : {}),
175
+ ...(blockId ? { toolCallId: blockId } : {}),
123
176
  ...(model ? { model } : {}),
124
177
  });
125
178
  }
@@ -13,6 +13,7 @@ import { applyPluginAutoEnable } from "../config/plugin-auto-enable.js";
13
13
  import { clearAgentRunContext, onAgentEvent } from "../infra/agent-events.js";
14
14
  import { onHeartbeatEvent } from "../infra/heartbeat-events.js";
15
15
  import { startHeartbeatRunner } from "../infra/heartbeat-runner.js";
16
+ import { onInfraAlertEvent } from "../infra/infra-alert-events.js";
16
17
  import { getMachineDisplayName } from "../infra/machine-name.js";
17
18
  import { ensureTaskmasterCliOnPath } from "../infra/path-env.js";
18
19
  import { primeRemoteSkillsCache, refreshRemoteBinsForConnectedNodes, setSkillsRemoteRegistry, } from "../infra/skills-remote.js";
@@ -327,6 +328,9 @@ export async function startGatewayServer(port = 18789, opts = {}) {
327
328
  const heartbeatUnsub = onHeartbeatEvent((evt) => {
328
329
  broadcast("heartbeat", evt, { dropIfSlow: true });
329
330
  });
331
+ const infraAlertUnsub = onInfraAlertEvent((evt) => {
332
+ broadcast("notification", evt);
333
+ });
330
334
  let heartbeatRunner = startHeartbeatRunner({ cfg: cfgAtStart });
331
335
  void cron.start().catch((err) => logCron.error(`failed to start: ${String(err)}`));
332
336
  const execApprovalManager = new ExecApprovalManager();
@@ -376,6 +380,7 @@ export async function startGatewayServer(port = 18789, opts = {}) {
376
380
  chatAbortControllers,
377
381
  chatAbortedRuns: chatRunState.abortedRuns,
378
382
  chatRunBuffers: chatRunState.buffers,
383
+ chatFinalHadContent: chatRunState.finalHadContent,
379
384
  chatDeltaSentAt: chatRunState.deltaSentAt,
380
385
  addChatRun,
381
386
  removeChatRun,
@@ -494,6 +499,7 @@ export async function startGatewayServer(port = 18789, opts = {}) {
494
499
  dedupeCleanup,
495
500
  agentUnsub,
496
501
  heartbeatUnsub,
502
+ infraAlertUnsub,
497
503
  chatRunState,
498
504
  clients,
499
505
  configReloader,
@@ -0,0 +1,99 @@
1
+ import { buildAuthHealthSummary, formatRemainingShort, } from "../agents/auth-health.js";
2
+ import { loadAuthProfileStore } from "../agents/auth-profiles.js";
3
+ import { getChannelPlugin } from "../channels/plugins/index.js";
4
+ import { createSubsystemLogger } from "../logging/subsystem.js";
5
+ import { emitInfraAlertEvent } from "./infra-alert-events.js";
6
+ import { deliverOutboundPayloads } from "./outbound/deliver.js";
7
+ const log = createSubsystemLogger("gateway/heartbeat-auth-notify");
8
+ const COOLDOWN_MS = 6 * 60 * 60 * 1000; // 6 hours — same as infra alerts
9
+ let lastNotifiedMs = 0;
10
+ function formatAuthAlertMessage(summary) {
11
+ const problems = summary.providers.filter((p) => p.status === "expired" || p.status === "expiring");
12
+ if (problems.length === 0)
13
+ return null;
14
+ const parts = [];
15
+ for (const provider of problems) {
16
+ const name = provider.provider.charAt(0).toUpperCase() + provider.provider.slice(1);
17
+ if (provider.status === "expired") {
18
+ parts.push(`${name} API key has expired`);
19
+ }
20
+ else if (provider.status === "expiring") {
21
+ const remaining = formatRemainingShort(provider.remainingMs);
22
+ parts.push(`${name} API key expires in ${remaining}`);
23
+ }
24
+ }
25
+ if (parts.length === 0)
26
+ return null;
27
+ const detail = parts.join("; ");
28
+ return `${detail}. Open the control panel and go to Settings > API Keys to update.`;
29
+ }
30
+ /**
31
+ * Proactive auth health check — runs after each heartbeat cycle to detect
32
+ * expired or soon-to-expire API tokens and notify the admin before they
33
+ * encounter errors.
34
+ *
35
+ * Alerts go to both the delivery channel (WhatsApp/iMessage) and the
36
+ * Control Panel via the infra-alert event bus.
37
+ *
38
+ * Returns true if an alert was sent, false otherwise. Never throws.
39
+ */
40
+ export async function checkAndNotifyAuthHealth(params) {
41
+ try {
42
+ const { cfg, delivery, deps } = params;
43
+ const nowMs = params.nowMs ?? Date.now();
44
+ // Cooldown: don't spam the admin.
45
+ if (nowMs - lastNotifiedMs < COOLDOWN_MS)
46
+ return false;
47
+ const store = loadAuthProfileStore();
48
+ const summary = buildAuthHealthSummary({ store, cfg });
49
+ const message = formatAuthAlertMessage(summary);
50
+ if (!message)
51
+ return false;
52
+ // Always broadcast to Control Panel regardless of delivery channel.
53
+ emitInfraAlertEvent({ category: "auth", message });
54
+ // Deliver via WhatsApp/iMessage if target available.
55
+ if (delivery.channel !== "none" && delivery.to) {
56
+ const plugin = getChannelPlugin(delivery.channel);
57
+ if (plugin?.heartbeat?.checkReady) {
58
+ const readiness = await plugin.heartbeat.checkReady({
59
+ cfg,
60
+ accountId: delivery.accountId,
61
+ deps,
62
+ });
63
+ if (!readiness.ok) {
64
+ log.debug("auth notify skipped channel delivery: not ready", {
65
+ reason: readiness.reason,
66
+ });
67
+ lastNotifiedMs = nowMs;
68
+ return true; // CP was still notified
69
+ }
70
+ }
71
+ await deliverOutboundPayloads({
72
+ cfg,
73
+ channel: delivery.channel,
74
+ to: delivery.to,
75
+ accountId: delivery.accountId,
76
+ payloads: [{ text: message }],
77
+ deps,
78
+ });
79
+ }
80
+ lastNotifiedMs = nowMs;
81
+ log.info("auth health alert sent", {
82
+ to: delivery.to ?? "control-panel-only",
83
+ problems: summary.providers
84
+ .filter((p) => p.status === "expired" || p.status === "expiring")
85
+ .map((p) => `${p.provider}:${p.status}`),
86
+ });
87
+ return true;
88
+ }
89
+ catch (err) {
90
+ log.error("auth health check failed", {
91
+ error: err instanceof Error ? err.message : String(err),
92
+ });
93
+ return false;
94
+ }
95
+ }
96
+ /** Reset cooldown timer. Exposed for testing. */
97
+ export function resetAuthNotifyCooldown() {
98
+ lastNotifiedMs = 0;
99
+ }
@@ -1,6 +1,7 @@
1
1
  import { describeFailoverError } from "../agents/failover-error.js";
2
2
  import { getChannelPlugin } from "../channels/plugins/index.js";
3
3
  import { createSubsystemLogger } from "../logging/subsystem.js";
4
+ import { emitInfraAlertEvent } from "./infra-alert-events.js";
4
5
  import { deliverOutboundPayloads } from "./outbound/deliver.js";
5
6
  const log = createSubsystemLogger("gateway/heartbeat-infra-alert");
6
7
  const COOLDOWN_MS = 6 * 60 * 60 * 1000; // 6 hours
@@ -79,6 +80,7 @@ export async function maybeAlertAdmin(ctx) {
79
80
  payloads: [{ text: message }],
80
81
  deps,
81
82
  });
83
+ emitInfraAlertEvent({ category, message });
82
84
  cooldowns.set(category, nowMs);
83
85
  log.info("infra alert sent", { category, to: delivery.to });
84
86
  return true;
@@ -115,6 +117,7 @@ export async function maybeAlertAdmin(ctx) {
115
117
  payloads: [{ text: message }],
116
118
  deps,
117
119
  });
120
+ emitInfraAlertEvent({ category, message });
118
121
  cooldowns.set(category, nowMs);
119
122
  log.info("infra alert sent", {
120
123
  category,
@@ -25,6 +25,7 @@ import { resolveHeartbeatVisibility } from "./heartbeat-visibility.js";
25
25
  import { requestHeartbeatNow, setHeartbeatWakeHandler, } from "./heartbeat-wake.js";
26
26
  import { deliverOutboundPayloads } from "./outbound/deliver.js";
27
27
  import { resolveHeartbeatDeliveryTarget, resolveHeartbeatSenderContext, } from "./outbound/targets.js";
28
+ import { checkAndNotifyAuthHealth } from "./heartbeat-auth-notify.js";
28
29
  import { maybeNotifyUpdateAvailable } from "./heartbeat-update-notify.js";
29
30
  const log = createSubsystemLogger("gateway/heartbeat");
30
31
  let heartbeatsEnabled = true;
@@ -630,6 +631,14 @@ async function checkAndNotifyUpdate(cfg, agent, deps) {
630
631
  const delivery = resolveHeartbeatDeliveryTarget({ cfg, entry, heartbeat, bindingAccountId });
631
632
  await maybeNotifyUpdateAvailable({ cfg, delivery, deps });
632
633
  }
634
+ async function checkAuthHealth(cfg, agent, deps) {
635
+ const agentId = agent.agentId;
636
+ const heartbeat = agent.heartbeat;
637
+ const { entry } = resolveHeartbeatSession(cfg, agentId, heartbeat);
638
+ const bindingAccountId = resolveAgentBoundAccountId(cfg, agentId, "whatsapp") ?? undefined;
639
+ const delivery = resolveHeartbeatDeliveryTarget({ cfg, entry, heartbeat, bindingAccountId });
640
+ await checkAndNotifyAuthHealth({ cfg, delivery, deps });
641
+ }
633
642
  export function startHeartbeatRunner(opts) {
634
643
  const runtime = opts.runtime ?? defaultRuntime;
635
644
  const runOnce = opts.runOnce ?? runHeartbeatOnce;
@@ -751,12 +760,14 @@ export function startHeartbeatRunner(opts) {
751
760
  if (res.status === "ran")
752
761
  ran = true;
753
762
  }
754
- // After heartbeat cycle: check for software updates and notify admin.
763
+ // After heartbeat cycle: check for software updates and auth health, notify admin.
755
764
  // Uses the first agent's delivery target. Non-blocking — never delays the next heartbeat.
756
765
  if (ran) {
757
766
  const firstAgent = state.agents.values().next().value;
758
767
  if (firstAgent) {
759
- void checkAndNotifyUpdate(state.cfg, firstAgent, { runtime: state.runtime }).catch(() => { });
768
+ const postRunDeps = { runtime: state.runtime };
769
+ void checkAndNotifyUpdate(state.cfg, firstAgent, postRunDeps).catch(() => { });
770
+ void checkAuthHealth(state.cfg, firstAgent, postRunDeps).catch(() => { });
760
771
  }
761
772
  }
762
773
  scheduleNext();
@@ -0,0 +1,16 @@
1
+ const listeners = new Set();
2
+ export function emitInfraAlertEvent(evt) {
3
+ const enriched = { ts: Date.now(), ...evt };
4
+ for (const listener of listeners) {
5
+ try {
6
+ listener(enriched);
7
+ }
8
+ catch {
9
+ /* ignore */
10
+ }
11
+ }
12
+ }
13
+ export function onInfraAlertEvent(listener) {
14
+ listeners.add(listener);
15
+ return () => listeners.delete(listener);
16
+ }
@@ -9,8 +9,10 @@ export function buildFtsQuery(raw) {
9
9
  return quoted.join(" AND ");
10
10
  }
11
11
  export function bm25RankToScore(rank) {
12
- const normalized = Number.isFinite(rank) ? Math.max(0, rank) : 999;
13
- return 1 / (1 + normalized);
12
+ // FTS5 bm25() returns negative values (more negative = more relevant).
13
+ // Convert to 0-1 scale: absRank/(1+absRank) → higher for better matches.
14
+ const absRank = Number.isFinite(rank) ? Math.abs(rank) : 0;
15
+ return absRank / (1 + absRank);
14
16
  }
15
17
  /**
16
18
  * Path-based boost factors applied during hybrid merge.
@@ -73,7 +75,14 @@ export function mergeHybridResults(params) {
73
75
  }
74
76
  }
75
77
  const merged = Array.from(byId.values()).map((entry) => {
76
- const raw = params.vectorWeight * entry.vectorScore + params.textWeight * entry.textScore;
78
+ const weighted = params.vectorWeight * entry.vectorScore + params.textWeight * entry.textScore;
79
+ // Keyword-only results (found by FTS but missed by vector search) must not be
80
+ // capped by textWeight — their text score passes through directly so exact keyword
81
+ // matches remain visible above minScore. When both signals are present, the weighted
82
+ // formula controls ranking as configured.
83
+ const raw = entry.vectorScore === 0 && entry.textScore > 0
84
+ ? Math.max(weighted, entry.textScore)
85
+ : weighted;
77
86
  const score = raw * pathBoost(entry.path);
78
87
  return {
79
88
  path: entry.path,
@@ -2,6 +2,21 @@ import crypto from "node:crypto";
2
2
  import fsSync from "node:fs";
3
3
  import fs from "node:fs/promises";
4
4
  import path from "node:path";
5
+ /**
6
+ * File extensions indexed by the memory system.
7
+ * Text-based formats are read as UTF-8; binary formats require extraction.
8
+ */
9
+ const TEXT_EXTENSIONS = new Set([".md", ".txt", ".html", ".htm", ".csv", ".json"]);
10
+ const BINARY_EXTENSIONS = new Set([".pdf", ".docx", ".xlsx", ".pptx"]);
11
+ const MEMORY_FILE_EXTENSIONS = new Set([...TEXT_EXTENSIONS, ...BINARY_EXTENSIONS]);
12
+ function isMemoryFileExtension(filename) {
13
+ const ext = path.extname(filename).toLowerCase();
14
+ return MEMORY_FILE_EXTENSIONS.has(ext);
15
+ }
16
+ export function isBinaryMemoryFile(filePath) {
17
+ const ext = path.extname(filePath).toLowerCase();
18
+ return BINARY_EXTENSIONS.has(ext);
19
+ }
5
20
  export function ensureDir(dir) {
6
21
  try {
7
22
  fsSync.mkdirSync(dir, { recursive: true });
@@ -41,7 +56,7 @@ async function walkDir(dir, files) {
41
56
  if (stat.isDirectory()) {
42
57
  await walkDir(full, files);
43
58
  }
44
- else if (stat.isFile() && entry.name.endsWith(".md")) {
59
+ else if (stat.isFile() && isMemoryFileExtension(entry.name)) {
45
60
  files.push(full);
46
61
  }
47
62
  }
@@ -56,7 +71,7 @@ async function walkDir(dir, files) {
56
71
  }
57
72
  if (!entry.isFile())
58
73
  continue;
59
- if (!entry.name.endsWith(".md"))
74
+ if (!isMemoryFileExtension(entry.name))
60
75
  continue;
61
76
  files.push(full);
62
77
  }
@@ -95,8 +110,11 @@ export function hashText(value) {
95
110
  }
96
111
  export async function buildFileEntry(absPath, workspaceDir) {
97
112
  const stat = await fs.stat(absPath);
98
- const content = await fs.readFile(absPath, "utf-8");
99
- const hash = hashText(content);
113
+ // Binary files (PDF, DOCX, etc.) are hashed from raw bytes; text files from UTF-8.
114
+ const binary = isBinaryMemoryFile(absPath);
115
+ const hash = binary
116
+ ? crypto.createHash("sha256").update(await fs.readFile(absPath)).digest("hex")
117
+ : hashText(await fs.readFile(absPath, "utf-8"));
100
118
  return {
101
119
  path: path.relative(workspaceDir, absPath).replace(/\\/g, "/"),
102
120
  absPath,
@@ -301,3 +319,97 @@ export function extractPeerFromPath(relPath) {
301
319
  const match = relPath.match(/^memory\/users\/([^/]+)\//i);
302
320
  return match?.[1];
303
321
  }
322
+ // ---------------------------------------------------------------------------
323
+ // Document content extraction for memory indexing
324
+ // ---------------------------------------------------------------------------
325
+ /**
326
+ * Strip HTML tags and decode common entities, producing plain text for indexing.
327
+ */
328
+ export function stripHtmlTags(html) {
329
+ return html
330
+ .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
331
+ .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
332
+ .replace(/<br\s*\/?>/gi, "\n")
333
+ .replace(/<\/(?:p|div|h[1-6]|li|tr|blockquote)>/gi, "\n")
334
+ .replace(/<[^>]+>/g, " ")
335
+ .replace(/&nbsp;/g, " ")
336
+ .replace(/&amp;/g, "&")
337
+ .replace(/&lt;/g, "<")
338
+ .replace(/&gt;/g, ">")
339
+ .replace(/&quot;/g, '"')
340
+ .replace(/&#0?39;/g, "'")
341
+ .replace(/[ \t]+/g, " ")
342
+ .replace(/\n{3,}/g, "\n\n")
343
+ .trim();
344
+ }
345
+ /**
346
+ * Extract indexable text content from a memory file.
347
+ *
348
+ * - `.md`, `.txt`, `.csv`, `.json` — read as UTF-8
349
+ * - `.html`, `.htm` — strip HTML tags
350
+ * - `.pdf` — extract via pdfjs-dist (lazy-loaded)
351
+ * - `.docx` — extract via mammoth (lazy-loaded)
352
+ * - `.xlsx` — extract via JSZip (lazy-loaded)
353
+ * - `.pptx` — extract via JSZip (lazy-loaded)
354
+ *
355
+ * Returns empty string if extraction fails (logged, not thrown).
356
+ */
357
+ export async function extractMemoryFileContent(absPath) {
358
+ const ext = path.extname(absPath).toLowerCase();
359
+ try {
360
+ switch (ext) {
361
+ case ".md":
362
+ case ".txt":
363
+ case ".csv":
364
+ case ".json":
365
+ return await fs.readFile(absPath, "utf-8");
366
+ case ".html":
367
+ case ".htm":
368
+ return stripHtmlTags(await fs.readFile(absPath, "utf-8"));
369
+ case ".pdf": {
370
+ const buffer = await fs.readFile(absPath);
371
+ const { getDocument } = await import("pdfjs-dist/legacy/build/pdf.mjs");
372
+ const pdf = await getDocument({ data: new Uint8Array(buffer), disableWorker: true })
373
+ .promise;
374
+ const parts = [];
375
+ for (let i = 1; i <= pdf.numPages; i++) {
376
+ const page = await pdf.getPage(i);
377
+ const tc = await page.getTextContent();
378
+ const text = tc.items
379
+ .map((item) => ("str" in item ? String(item.str) : ""))
380
+ .filter(Boolean)
381
+ .join(" ");
382
+ if (text.trim())
383
+ parts.push(text);
384
+ }
385
+ return parts.join("\n\n");
386
+ }
387
+ case ".docx": {
388
+ const buffer = await fs.readFile(absPath);
389
+ const { extractDocxContent } = await import("../media/document-extract.js");
390
+ const result = await extractDocxContent(buffer);
391
+ return result.text;
392
+ }
393
+ case ".xlsx": {
394
+ const buffer = await fs.readFile(absPath);
395
+ const { extractXlsxContent } = await import("../media/document-extract.js");
396
+ const result = await extractXlsxContent(buffer);
397
+ return result.text;
398
+ }
399
+ case ".pptx": {
400
+ const buffer = await fs.readFile(absPath);
401
+ const { extractPptxContent } = await import("../media/document-extract.js");
402
+ const result = await extractPptxContent(buffer);
403
+ return result.text;
404
+ }
405
+ default:
406
+ return await fs.readFile(absPath, "utf-8");
407
+ }
408
+ }
409
+ catch (err) {
410
+ // Extraction failure should not crash indexing — the file is skipped.
411
+ // Log the error so it's diagnosable (not silently swallowed).
412
+ console.warn(`[memory] extraction failed for ${path.basename(absPath)}: ${err instanceof Error ? err.message : String(err)}`);
413
+ return "";
414
+ }
415
+ }
@@ -16,7 +16,7 @@ import { DEFAULT_GEMINI_EMBEDDING_MODEL } from "./embeddings-gemini.js";
16
16
  import { DEFAULT_OPENAI_EMBEDDING_MODEL } from "./embeddings-openai.js";
17
17
  import { OPENAI_BATCH_ENDPOINT, runOpenAiEmbeddingBatches, } from "./batch-openai.js";
18
18
  import { runGeminiEmbeddingBatches } from "./batch-gemini.js";
19
- import { buildFileEntry, chunkMarkdown, ensureDir, extractPeerFromPath, hashText, isMemoryPath, listMemoryFiles, normalizeRelPath, parseEmbedding, } from "./internal.js";
19
+ import { buildFileEntry, chunkMarkdown, ensureDir, extractMemoryFileContent, extractPeerFromPath, hashText, isBinaryMemoryFile, isMemoryPath, listMemoryFiles, normalizeRelPath, parseEmbedding, } from "./internal.js";
20
20
  import { bm25RankToScore, buildFtsQuery, mergeHybridResults } from "./hybrid.js";
21
21
  import { searchKeyword, searchVector } from "./manager-search.js";
22
22
  import { ensureMemoryIndexSchema } from "./memory-schema.js";
@@ -545,7 +545,10 @@ export class MemoryIndexManager {
545
545
  if (!absPath.startsWith(this.workspaceDir)) {
546
546
  throw new Error("path escapes workspace");
547
547
  }
548
- const content = await fs.readFile(absPath, "utf-8");
548
+ // Binary/document files return extracted text; text files read as UTF-8.
549
+ const content = isBinaryMemoryFile(absPath)
550
+ ? await extractMemoryFileContent(absPath)
551
+ : await fs.readFile(absPath, "utf-8");
549
552
  if (!params.from && !params.lines) {
550
553
  return { text: content, path: relPath };
551
554
  }
@@ -2253,7 +2256,9 @@ export class MemoryIndexManager {
2253
2256
  return this.batch.enabled ? this.batch.concurrency : EMBEDDING_INDEX_CONCURRENCY;
2254
2257
  }
2255
2258
  async indexFile(entry, options) {
2256
- const content = options.content ?? (await fs.readFile(entry.absPath, "utf-8"));
2259
+ const content = options.content ?? (await extractMemoryFileContent(entry.absPath));
2260
+ if (!content)
2261
+ return; // Extraction failed — skip silently (logged in extractMemoryFileContent)
2257
2262
  const chunks = chunkMarkdown(content, this.settings.chunking).filter((chunk) => chunk.text.trim().length > 0);
2258
2263
  const embeddings = this.batch.enabled
2259
2264
  ? await this.embedChunksWithBatch(chunks, entry, options.source)