@rubytech/taskmaster 1.0.38 → 1.0.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,8 +6,8 @@
6
6
  <title>Taskmaster Control</title>
7
7
  <meta name="color-scheme" content="dark light" />
8
8
  <link rel="icon" type="image/png" href="./favicon.png" />
9
- <script type="module" crossorigin src="./assets/index-B0Q2Wmm1.js"></script>
10
- <link rel="stylesheet" crossorigin href="./assets/index-DkMDU6zX.css">
9
+ <script type="module" crossorigin src="./assets/index-RlAacvDz.js"></script>
10
+ <link rel="stylesheet" crossorigin href="./assets/index-BfV0Mtl7.css">
11
11
  </head>
12
12
  <body>
13
13
  <taskmaster-app></taskmaster-app>
@@ -15,6 +15,9 @@ const ENVELOPE_CHANNELS = [
15
15
  "BlueBubbles",
16
16
  ];
17
17
  const MESSAGE_ID_LINE = /^\s*\[message_id:\s*[^\]]+\]\s*$/i;
18
+ // Internal annotations prepended by buildInboundMediaNote / get-reply-run
19
+ const MEDIA_ATTACHED_LINE = /^\s*\[media attached(?:\s+\d+\/\d+)?:\s*[^\]]+\]\s*$/i;
20
+ const MEDIA_REPLY_HINT = /^\s*To send an image back, prefer the message tool\b/;
18
21
  function looksLikeEnvelopeHeader(header) {
19
22
  if (/\d{4}-\d{2}-\d{2}T\d{2}:\d{2}Z\b/.test(header))
20
23
  return true;
@@ -23,13 +26,16 @@ function looksLikeEnvelopeHeader(header) {
23
26
  return ENVELOPE_CHANNELS.some((label) => header.startsWith(`${label} `));
24
27
  }
25
28
  export function stripEnvelope(text) {
26
- const match = text.match(ENVELOPE_PREFIX);
27
- if (!match)
28
- return text;
29
- const header = match[1] ?? "";
30
- if (!looksLikeEnvelopeHeader(header))
31
- return text;
32
- return text.slice(match[0].length);
29
+ let result = text;
30
+ const match = result.match(ENVELOPE_PREFIX);
31
+ if (match) {
32
+ const header = match[1] ?? "";
33
+ if (looksLikeEnvelopeHeader(header)) {
34
+ result = result.slice(match[0].length);
35
+ }
36
+ }
37
+ result = stripMediaAnnotations(result);
38
+ return result;
33
39
  }
34
40
  function stripMessageIdHints(text) {
35
41
  if (!text.includes("[message_id:"))
@@ -38,6 +44,17 @@ function stripMessageIdHints(text) {
38
44
  const filtered = lines.filter((line) => !MESSAGE_ID_LINE.test(line));
39
45
  return filtered.length === lines.length ? text : filtered.join("\n");
40
46
  }
47
+ function stripMediaAnnotations(text) {
48
+ if (!text.includes("[media attached"))
49
+ return text;
50
+ const lines = text.split(/\r?\n/);
51
+ const filtered = lines.filter((line) => !MEDIA_ATTACHED_LINE.test(line) && !MEDIA_REPLY_HINT.test(line));
52
+ if (filtered.length === lines.length)
53
+ return text;
54
+ // Also strip the "[media attached: N files]" header line
55
+ const result = filtered.filter((line) => !/^\s*\[media attached:\s*\d+\s+files?\]\s*$/i.test(line));
56
+ return result.join("\n").trim();
57
+ }
41
58
  function stripEnvelopeFromContent(content) {
42
59
  let changed = false;
43
60
  const next = content.map((item) => {
@@ -10,9 +10,9 @@ import { dispatchInboundMessage } from "../../auto-reply/dispatch.js";
10
10
  import { createReplyDispatcher } from "../../auto-reply/reply/reply-dispatcher.js";
11
11
  import { extractShortModelName, } from "../../auto-reply/reply/response-prefix-template.js";
12
12
  import { resolveSendPolicy } from "../../sessions/send-policy.js";
13
+ import { createInternalHookEvent, triggerInternalHook } from "../../hooks/internal-hooks.js";
13
14
  import { INTERNAL_MESSAGE_CHANNEL } from "../../utils/message-channel.js";
14
15
  import { abortChatRunById, abortChatRunsForSessionKey, isChatStopCommandText, resolveChatRunExpiresAtMs, } from "../chat-abort.js";
15
- import { parseMessageWithAttachments } from "../chat-attachments.js";
16
16
  import { ErrorCodes, errorShape, formatValidationErrors, validateChatAbortParams, validateChatHistoryParams, validateChatInjectParams, validateChatSendParams, } from "../protocol/index.js";
17
17
  import { getMaxChatHistoryMessagesBytes } from "../server-constants.js";
18
18
  import { capArrayByJsonBytes, loadSessionEntry, readSessionMessages, resolveSessionModelRef, } from "../session-utils.js";
@@ -250,35 +250,74 @@ export const chatHandlers = {
250
250
  // Separate document attachments (PDFs, text files) from image attachments
251
251
  const imageAttachments = normalizedAttachments.filter((a) => a.type !== "document");
252
252
  const documentAttachments = normalizedAttachments.filter((a) => a.type === "document");
253
- let parsedMessage = p.message;
254
- let parsedImages = [];
255
- if (imageAttachments.length > 0) {
256
- try {
257
- const parsed = await parseMessageWithAttachments(p.message, imageAttachments, {
258
- maxBytes: 5_000_000,
259
- log: context.logGateway,
260
- });
261
- parsedMessage = parsed.message;
262
- parsedImages = parsed.images;
263
- }
264
- catch (err) {
265
- respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, String(err)));
266
- return;
267
- }
268
- }
269
- // Save document attachments to workspace uploads dir (persistent, accessible by agent)
270
- const savedDocPaths = [];
271
- if (documentAttachments.length > 0) {
253
+ // Resolve workspace uploads dir for all attachments (persistent, no TTL).
254
+ // Both images and documents are saved as plain files — same as every other channel.
255
+ let uploadsDir = null;
256
+ if (normalizedAttachments.length > 0) {
272
257
  const { cfg: sessionCfg } = loadSessionEntry(p.sessionKey);
273
258
  const agentId = resolveSessionAgentId({ sessionKey: p.sessionKey, config: sessionCfg });
274
259
  const workspaceDir = resolveAgentWorkspaceDir(sessionCfg, agentId);
275
- const uploadsDir = path.join(workspaceDir, "uploads");
260
+ uploadsDir = path.join(workspaceDir, "uploads");
276
261
  try {
277
262
  fs.mkdirSync(uploadsDir, { recursive: true });
278
263
  }
279
264
  catch {
280
265
  /* ignore if exists */
281
266
  }
267
+ }
268
+ // Save image attachments to workspace uploads dir (persistent, accessible by agent).
269
+ // The agent runner detects file path references via [media attached: ...] and
270
+ // loads them from disk at inference time — no inline base64 in transcripts.
271
+ const savedImagePaths = [];
272
+ const savedImageTypes = [];
273
+ if (imageAttachments.length > 0 && uploadsDir) {
274
+ for (const att of imageAttachments) {
275
+ if (!att.content || typeof att.content !== "string")
276
+ continue;
277
+ try {
278
+ let b64 = att.content.trim();
279
+ const dataUrlMatch = /^data:[^;]+;base64,(.*)$/.exec(b64);
280
+ if (dataUrlMatch)
281
+ b64 = dataUrlMatch[1];
282
+ const buffer = Buffer.from(b64, "base64");
283
+ // Derive extension from mime type
284
+ const mimeBase = att.mimeType?.split(";")[0]?.trim();
285
+ const extMap = {
286
+ "image/jpeg": ".jpg",
287
+ "image/png": ".png",
288
+ "image/gif": ".gif",
289
+ "image/webp": ".webp",
290
+ "image/heic": ".heic",
291
+ "image/heif": ".heif",
292
+ "image/svg+xml": ".svg",
293
+ "image/avif": ".avif",
294
+ };
295
+ const ext = (mimeBase && extMap[mimeBase]) ?? ".jpg";
296
+ const uuid = randomUUID();
297
+ let safeName;
298
+ if (att.fileName) {
299
+ const base = path
300
+ .parse(att.fileName)
301
+ .name.replace(/[^a-zA-Z0-9._-]/g, "_")
302
+ .slice(0, 60);
303
+ safeName = base ? `${base}---${uuid}${ext}` : `${uuid}${ext}`;
304
+ }
305
+ else {
306
+ safeName = `${uuid}${ext}`;
307
+ }
308
+ const destPath = path.join(uploadsDir, safeName);
309
+ fs.writeFileSync(destPath, buffer);
310
+ savedImagePaths.push(destPath);
311
+ savedImageTypes.push(mimeBase ?? "image/png");
312
+ }
313
+ catch (err) {
314
+ context.logGateway.warn(`chat image save failed: ${String(err)}`);
315
+ }
316
+ }
317
+ }
318
+ // Save document attachments to workspace uploads dir (persistent, accessible by agent)
319
+ const savedDocPaths = [];
320
+ if (documentAttachments.length > 0 && uploadsDir) {
282
321
  for (const doc of documentAttachments) {
283
322
  if (!doc.content || typeof doc.content !== "string")
284
323
  continue;
@@ -354,14 +393,14 @@ export const chatHandlers = {
354
393
  status: "started",
355
394
  };
356
395
  respond(true, ackPayload, undefined, { runId: clientRunId });
357
- const trimmedMessage = parsedMessage.trim();
396
+ const trimmedMessage = p.message.trim();
358
397
  const injectThinking = Boolean(p.thinking && trimmedMessage && !trimmedMessage.startsWith("/"));
359
- const commandBody = injectThinking ? `/think ${p.thinking} ${parsedMessage}` : parsedMessage;
398
+ const commandBody = injectThinking ? `/think ${p.thinking} ${p.message}` : p.message;
360
399
  // If documents were saved, prepend file paths to message so the agent knows about them
361
400
  const docNote = savedDocPaths.length > 0
362
401
  ? savedDocPaths.map((p) => `[file: ${p}]`).join("\n") + "\n\n"
363
402
  : "";
364
- const messageWithDocs = docNote + parsedMessage;
403
+ const messageWithDocs = docNote + p.message;
365
404
  const clientInfo = client?.connect?.client;
366
405
  const ctx = {
367
406
  Body: messageWithDocs,
@@ -379,11 +418,30 @@ export const chatHandlers = {
379
418
  SenderId: clientInfo?.id,
380
419
  SenderName: clientInfo?.displayName,
381
420
  SenderUsername: clientInfo?.displayName,
421
+ // Image/media paths — same pattern as WhatsApp. buildInboundMediaNote()
422
+ // will generate [media attached: ...] annotations that the agent runner
423
+ // detects and loads from disk at inference time.
424
+ MediaPaths: savedImagePaths.length > 0 ? savedImagePaths : undefined,
425
+ MediaPath: savedImagePaths[0],
426
+ MediaTypes: savedImageTypes.length > 0 ? savedImageTypes : undefined,
427
+ MediaType: savedImageTypes[0],
382
428
  };
383
429
  const agentId = resolveSessionAgentId({
384
430
  sessionKey: p.sessionKey,
385
431
  config: cfg,
386
432
  });
433
+ // Fire message:inbound hook for conversation archiving.
434
+ // Include image paths so the archive references the attached media.
435
+ const imageNote = savedImagePaths.length > 0 ? savedImagePaths.map((ip) => `[image: ${ip}]`).join("\n") : "";
436
+ const archiveText = [p.message, imageNote].filter(Boolean).join("\n").trim();
437
+ void triggerInternalHook(createInternalHookEvent("message", "inbound", p.sessionKey, {
438
+ text: archiveText || undefined,
439
+ timestamp: now,
440
+ chatType: "direct",
441
+ agentId,
442
+ channel: "webchat",
443
+ cfg,
444
+ }));
387
445
  let prefixContext = {
388
446
  identityName: resolveIdentityName(cfg, agentId),
389
447
  };
@@ -419,6 +477,7 @@ export const chatHandlers = {
419
477
  },
420
478
  });
421
479
  let agentRunStarted = false;
480
+ context.logGateway.info(`webchat dispatch: sessionKey=${p.sessionKey} runId=${clientRunId} body=${messageWithDocs.length}ch images=${savedImagePaths.length} docs=${savedDocPaths.length}`);
422
481
  void dispatchInboundMessage({
423
482
  ctx,
424
483
  cfg,
@@ -426,10 +485,10 @@ export const chatHandlers = {
426
485
  replyOptions: {
427
486
  runId: clientRunId,
428
487
  abortSignal: abortController.signal,
429
- images: parsedImages.length > 0 ? parsedImages : undefined,
430
488
  disableBlockStreaming: true,
431
- onAgentRunStart: () => {
489
+ onAgentRunStart: (runId) => {
432
490
  agentRunStarted = true;
491
+ context.logGateway.info(`webchat agent run started: sessionKey=${p.sessionKey} runId=${runId}`);
433
492
  },
434
493
  onModelSelected: (ctx) => {
435
494
  prefixContext.provider = ctx.provider;
@@ -440,6 +499,8 @@ export const chatHandlers = {
440
499
  },
441
500
  })
442
501
  .then(() => {
502
+ const { entry: postEntry } = loadSessionEntry(p.sessionKey);
503
+ context.logGateway.info(`webchat dispatch done: sessionKey=${p.sessionKey} agentRunStarted=${agentRunStarted} sessionId=${postEntry?.sessionId ?? "none"} sessionFile=${postEntry?.sessionFile ?? "none"}`);
443
504
  if (!agentRunStarted) {
444
505
  const combinedReply = finalReplyParts
445
506
  .map((part) => part.trim())
@@ -479,6 +540,18 @@ export const chatHandlers = {
479
540
  message,
480
541
  });
481
542
  }
543
+ // Fire message:outbound hook for conversation archiving
544
+ const outboundText = finalReplyParts.join("\n\n").trim();
545
+ if (outboundText) {
546
+ void triggerInternalHook(createInternalHookEvent("message", "outbound", p.sessionKey, {
547
+ text: outboundText,
548
+ timestamp: Date.now(),
549
+ chatType: "direct",
550
+ agentId,
551
+ channel: "webchat",
552
+ cfg,
553
+ }));
554
+ }
482
555
  context.dedupe.set(`chat:${clientRunId}`, {
483
556
  ts: Date.now(),
484
557
  ok: true,
@@ -486,6 +559,7 @@ export const chatHandlers = {
486
559
  });
487
560
  })
488
561
  .catch((err) => {
562
+ context.logGateway.warn(`webchat dispatch failed: sessionKey=${p.sessionKey} runId=${clientRunId} error=${formatForLog(err)}`);
489
563
  const error = errorShape(ErrorCodes.UNAVAILABLE, String(err));
490
564
  context.dedupe.set(`chat:${clientRunId}`, {
491
565
  ts: Date.now(),
@@ -74,6 +74,42 @@ export const memoryHandlers = {
74
74
  respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, String(err)));
75
75
  }
76
76
  },
77
+ "memory.search": async ({ params, respond }) => {
78
+ const query = typeof params.query === "string" ? params.query.trim() : "";
79
+ if (!query) {
80
+ respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, "query is required"));
81
+ return;
82
+ }
83
+ const cfg = loadConfig();
84
+ const agentId = typeof params.agentId === "string" && params.agentId.trim()
85
+ ? params.agentId.trim()
86
+ : resolveDefaultAgentId(cfg);
87
+ const { manager, error } = await getMemorySearchManager({ cfg, agentId });
88
+ if (!manager) {
89
+ respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, error ?? "memory index unavailable"));
90
+ return;
91
+ }
92
+ try {
93
+ const maxResults = typeof params.maxResults === "number" ? params.maxResults : 10;
94
+ // minScore: 0 — show all results for diagnostic purposes
95
+ const results = await manager.search(query, { maxResults, minScore: 0 });
96
+ respond(true, {
97
+ ok: true,
98
+ agentId,
99
+ results: results.map((r) => ({
100
+ path: r.path,
101
+ startLine: r.startLine,
102
+ endLine: r.endLine,
103
+ score: r.score,
104
+ snippet: r.snippet,
105
+ source: r.source,
106
+ })),
107
+ });
108
+ }
109
+ catch (err) {
110
+ respond(false, undefined, errorShape(ErrorCodes.UNAVAILABLE, String(err)));
111
+ }
112
+ },
77
113
  "memory.auditClear": async ({ params, respond }) => {
78
114
  try {
79
115
  const cfg = loadConfig();
@@ -31,6 +31,13 @@ function extractPeerFromSessionKey(sessionKey) {
31
31
  }
32
32
  return null;
33
33
  }
34
+ /**
35
+ * Detect webchat session key format: agent:{agentId}:main
36
+ */
37
+ function isWebchatSessionKey(sessionKey) {
38
+ const parts = sessionKey.toLowerCase().split(":").filter(Boolean);
39
+ return parts.length === 3 && parts[0] === "agent" && parts[2] === "main";
40
+ }
34
41
  /**
35
42
  * Extract group ID from session key
36
43
  *
@@ -148,9 +155,10 @@ const archiveConversation = async (event) => {
148
155
  }
149
156
  // Get timestamp from context or event
150
157
  const timestamp = context.timestamp ?? event.timestamp;
151
- // Try DM first, then group
158
+ // Determine conversation type from session key and route to correct archive path
152
159
  const peer = extractPeerFromSessionKey(event.sessionKey);
153
160
  const groupId = peer ? null : extractGroupIdFromSessionKey(event.sessionKey);
161
+ const isWebchat = !peer && !groupId && isWebchatSessionKey(event.sessionKey);
154
162
  if (peer) {
155
163
  // Admin DMs archive to memory/admin/conversations/ (not accessible by public agent).
156
164
  // Public DMs archive to memory/users/{peer}/conversations/.
@@ -187,8 +195,13 @@ const archiveConversation = async (event) => {
187
195
  fileHeader,
188
196
  });
189
197
  }
198
+ else if (isWebchat) {
199
+ // Webchat (control panel) — archive under memory/admin/conversations/
200
+ const role = event.action === "inbound" ? "Admin" : "Assistant";
201
+ await archiveMessage({ workspaceDir, subdir: "admin", role, text, timestamp });
202
+ }
190
203
  else {
191
- // Neither DM nor group — skip
204
+ // Unknown session key format — skip
192
205
  return;
193
206
  }
194
207
  }
@@ -12,6 +12,32 @@ export function bm25RankToScore(rank) {
12
12
  const normalized = Number.isFinite(rank) ? Math.max(0, rank) : 999;
13
13
  return 1 / (1 + normalized);
14
14
  }
15
+ /**
16
+ * Path-based boost factors applied during hybrid merge.
17
+ * Curated knowledge (public/, shared/, root memory files) is boosted over
18
+ * raw logs (conversations/, session transcripts) so authoritative content
19
+ * outranks casual mentions at similar raw scores.
20
+ *
21
+ * Patterns are checked in order — first match wins.
22
+ */
23
+ const PATH_BOOST_RULES = [
24
+ // Conversation archives — demote (high volume, low signal-to-noise)
25
+ { pattern: /\/conversations\//, boost: 0.6 },
26
+ // Session source transcripts — demote
27
+ { pattern: /^sessions\//, boost: 0.6 },
28
+ // Curated public/shared knowledge — boost
29
+ { pattern: /^memory\/public\//, boost: 1.4 },
30
+ { pattern: /^memory\/shared\//, boost: 1.3 },
31
+ // Root memory files (MEMORY.md etc.) — slight boost
32
+ { pattern: /^(?:MEMORY|memory)\.md$/, boost: 1.2 },
33
+ ];
34
+ function pathBoost(filePath) {
35
+ for (const rule of PATH_BOOST_RULES) {
36
+ if (rule.pattern.test(filePath))
37
+ return rule.boost;
38
+ }
39
+ return 1.0;
40
+ }
15
41
  export function mergeHybridResults(params) {
16
42
  const byId = new Map();
17
43
  for (const r of params.vector) {
@@ -47,7 +73,8 @@ export function mergeHybridResults(params) {
47
73
  }
48
74
  }
49
75
  const merged = Array.from(byId.values()).map((entry) => {
50
- const score = params.vectorWeight * entry.vectorScore + params.textWeight * entry.textScore;
76
+ const raw = params.vectorWeight * entry.vectorScore + params.textWeight * entry.textScore;
77
+ const score = raw * pathBoost(entry.path);
51
78
  return {
52
79
  path: entry.path,
53
80
  startLine: entry.startLine,
@@ -89,77 +89,166 @@ export async function buildFileEntry(absPath, workspaceDir) {
89
89
  hash,
90
90
  };
91
91
  }
92
- export function chunkMarkdown(content, chunking) {
93
- const lines = content.split("\n");
94
- if (lines.length === 0)
92
+ /**
93
+ * Heading level (1-6) parsed from a markdown heading line, or 0 if not a heading.
94
+ */
95
+ function headingLevel(line) {
96
+ const match = line.match(/^(#{1,6})\s/);
97
+ return match ? match[1].length : 0;
98
+ }
99
+ /**
100
+ * Build a heading breadcrumb prefix from the current heading stack.
101
+ * E.g., ["# User Guide", "## Updating Taskmaster"] → "# User Guide > ## Updating Taskmaster\n" (empty stack → "")
102
+ */
103
+ function headingPrefix(stack) {
104
+ const filtered = stack.filter(Boolean);
105
+ return filtered.length > 0 ? filtered.join(" > ") + "\n" : "";
106
+ }
107
+ /**
108
+ * Split lines into fixed-size chunks (the original algorithm, but without chunk overlap).
109
+ * Used as a fallback when a single section exceeds maxChars.
110
+ */
111
+ function chunkLinesFixed(entries, maxChars, prefix) {
112
+ if (entries.length === 0)
95
113
  return [];
96
- const maxChars = Math.max(32, chunking.tokens * 4);
97
- const overlapChars = Math.max(0, chunking.overlap * 4);
114
+ const prefixLen = prefix.length;
115
+ const effectiveMax = Math.max(32, maxChars - prefixLen);
98
116
  const chunks = [];
99
117
  let current = [];
100
118
  let currentChars = 0;
101
119
  const flush = () => {
102
120
  if (current.length === 0)
103
121
  return;
104
- const firstEntry = current[0];
105
- const lastEntry = current[current.length - 1];
106
- if (!firstEntry || !lastEntry)
107
- return;
108
- const text = current.map((entry) => entry.line).join("\n");
109
- const startLine = firstEntry.lineNo;
110
- const endLine = lastEntry.lineNo;
122
+ const first = current[0];
123
+ const last = current[current.length - 1];
124
+ const body = current.map((e) => e.line).join("\n");
125
+ const text = prefix + body;
111
126
  chunks.push({
112
- startLine,
113
- endLine,
127
+ startLine: first.lineNo,
128
+ endLine: last.lineNo,
114
129
  text,
115
130
  hash: hashText(text),
116
131
  });
117
132
  };
118
- const carryOverlap = () => {
119
- if (overlapChars <= 0 || current.length === 0) {
120
- current = [];
121
- currentChars = 0;
122
- return;
123
- }
124
- let acc = 0;
125
- const kept = [];
126
- for (let i = current.length - 1; i >= 0; i -= 1) {
127
- const entry = current[i];
128
- if (!entry)
129
- continue;
130
- acc += entry.line.length + 1;
131
- kept.unshift(entry);
132
- if (acc >= overlapChars)
133
- break;
134
- }
135
- current = kept;
136
- currentChars = kept.reduce((sum, entry) => sum + entry.line.length + 1, 0);
137
- };
138
- for (let i = 0; i < lines.length; i += 1) {
139
- const line = lines[i] ?? "";
140
- const lineNo = i + 1;
133
+ for (const entry of entries) {
134
+ // Split overly long lines into segments that fit within effectiveMax
141
135
  const segments = [];
142
- if (line.length === 0) {
143
- segments.push("");
136
+ if (entry.line.length === 0) {
137
+ segments.push(entry);
144
138
  }
145
139
  else {
146
- for (let start = 0; start < line.length; start += maxChars) {
147
- segments.push(line.slice(start, start + maxChars));
140
+ for (let start = 0; start < entry.line.length; start += effectiveMax) {
141
+ segments.push({
142
+ line: entry.line.slice(start, start + effectiveMax),
143
+ lineNo: entry.lineNo,
144
+ });
148
145
  }
149
146
  }
150
- for (const segment of segments) {
151
- const lineSize = segment.length + 1;
152
- if (currentChars + lineSize > maxChars && current.length > 0) {
147
+ for (const seg of segments) {
148
+ const segSize = seg.line.length + 1;
149
+ if (currentChars + segSize > effectiveMax && current.length > 0) {
153
150
  flush();
154
- carryOverlap();
151
+ current = [];
152
+ currentChars = 0;
155
153
  }
156
- current.push({ line: segment, lineNo });
157
- currentChars += lineSize;
154
+ current.push(seg);
155
+ currentChars += segSize;
158
156
  }
159
157
  }
160
158
  flush();
161
159
  return chunks;
162
160
  }
161
+ /**
162
+ * Semantic markdown chunker.
163
+ *
164
+ * Splits content at markdown headings so each chunk corresponds to a logical section.
165
+ * Each chunk is prefixed with the heading breadcrumb (ancestor headings) so the embedding
166
+ * model has structural context — e.g., "# User Guide > ## Updating Taskmaster\n### Steps\n...content...".
167
+ *
168
+ * If a section exceeds maxChars, it falls back to fixed-size splitting within that section,
169
+ * but each sub-chunk still receives the heading prefix.
170
+ *
171
+ * Files with no headings are chunked using fixed-size splitting (original behavior, except that chunking.overlap is no longer applied).
172
+ */
173
+ export function chunkMarkdown(content, chunking) {
174
+ if (!content.trim())
175
+ return [];
176
+ const lines = content.split("\n");
177
+ const maxChars = Math.max(32, chunking.tokens * 4);
178
+ // Parse all lines to detect if there are any headings
179
+ const parsedLines = [];
180
+ let hasHeadings = false;
181
+ for (let i = 0; i < lines.length; i++) {
182
+ const line = lines[i] ?? "";
183
+ const level = headingLevel(line);
184
+ if (level > 0)
185
+ hasHeadings = true;
186
+ parsedLines.push({ line, lineNo: i + 1, level });
187
+ }
188
+ // No headings at all — fall back to fixed-size chunking (no prefix)
189
+ if (!hasHeadings) {
190
+ return chunkLinesFixed(parsedLines.map((p) => ({ line: p.line, lineNo: p.lineNo })), maxChars, "");
191
+ }
192
+ const sections = [];
193
+ // headingStack tracks the current heading hierarchy: index = level-1
194
+ const headingStack = [];
195
+ let currentSection = { headingStack: [], lines: [] };
196
+ for (const parsed of parsedLines) {
197
+ if (parsed.level > 0) {
198
+ // Flush the previous section if it has content
199
+ if (currentSection.lines.length > 0) {
200
+ sections.push(currentSection);
201
+ }
202
+ // Update the heading stack: trim to this level, then set this heading.
203
+ // Truncating via .length drops stale deeper headings. The index assignment below
204
+ // can still leave holes when a heading appears without ancestors; headingPrefix filters them out.
205
+ if (headingStack.length >= parsed.level) {
206
+ headingStack.length = parsed.level - 1;
207
+ }
208
+ headingStack[parsed.level - 1] = parsed.line;
209
+ // Start a new section with the current heading stack as context
210
+ currentSection = {
211
+ headingStack: [...headingStack],
212
+ lines: [{ line: parsed.line, lineNo: parsed.lineNo }],
213
+ };
214
+ }
215
+ else {
216
+ currentSection.lines.push({ line: parsed.line, lineNo: parsed.lineNo });
217
+ }
218
+ }
219
+ // Flush final section
220
+ if (currentSection.lines.length > 0) {
221
+ sections.push(currentSection);
222
+ }
223
+ // Convert sections to chunks
224
+ const chunks = [];
225
+ for (const section of sections) {
226
+ // Build the prefix from ancestor headings (all except the current heading,
227
+ // which is already the first line of the section body)
228
+ const ancestors = section.headingStack.slice(0, -1);
229
+ const prefix = headingPrefix(ancestors);
230
+ const bodyText = section.lines.map((e) => e.line).join("\n");
231
+ const totalLen = prefix.length + bodyText.length;
232
+ if (totalLen <= maxChars) {
233
+ // Section fits in one chunk
234
+ const first = section.lines[0];
235
+ const last = section.lines[section.lines.length - 1];
236
+ const text = prefix + bodyText;
237
+ chunks.push({
238
+ startLine: first.lineNo,
239
+ endLine: last.lineNo,
240
+ text,
241
+ hash: hashText(text),
242
+ });
243
+ }
244
+ else {
245
+ // Section too large — split with fixed-size chunking, each sub-chunk gets prefix
246
+ const subChunks = chunkLinesFixed(section.lines, maxChars, prefix);
247
+ chunks.push(...subChunks);
248
+ }
249
+ }
250
+ return chunks;
251
+ }
163
252
  export function parseEmbedding(raw) {
164
253
  try {
165
254
  const parsed = JSON.parse(raw);