@houtini/lm 2.7.0 → 2.9.0
This diff shows the changes between publicly released versions of the package, as they appear in the supported public registries. It is provided for informational purposes only.
- package/README.md +56 -11
- package/dist/index.js +382 -44
- package/dist/index.js.map +1 -1
- package/dist/model-cache.d.ts +10 -2
- package/dist/model-cache.js +117 -37
- package/dist/model-cache.js.map +1 -1
- package/package.json +2 -2
- package/server.json +2 -2
package/dist/index.js
CHANGED
@@ -7,16 +7,18 @@
  */
 import { Server } from '@modelcontextprotocol/sdk/server/index.js';
 import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
-import { CallToolRequestSchema, ListToolsRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
-import { profileModelsAtStartup, getCachedProfile, toModelProfile as cachedToProfile, getHFEnrichmentLine, getPromptHints, } from './model-cache.js';
+import { CallToolRequestSchema, ListToolsRequestSchema, ListResourcesRequestSchema, ReadResourceRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
+import { profileModelsAtStartup, getCachedProfile, toModelProfile as cachedToProfile, getHFEnrichmentLine, getPromptHints, getThinkingSupport, } from './model-cache.js';
+import { readFile } from 'node:fs/promises';
+import { isAbsolute, basename } from 'node:path';
 const LM_BASE_URL = process.env.LM_STUDIO_URL || 'http://localhost:1234';
 const LM_MODEL = process.env.LM_STUDIO_MODEL || '';
 const LM_PASSWORD = process.env.LM_STUDIO_PASSWORD || '';
-const DEFAULT_MAX_TOKENS =
+const DEFAULT_MAX_TOKENS = 16384; // fallback when model context is unknown — overridden by dynamic calculation below
 const DEFAULT_TEMPERATURE = 0.3;
 const CONNECT_TIMEOUT_MS = 5000;
 const INFERENCE_CONNECT_TIMEOUT_MS = 30_000; // generous connect timeout for inference
-const SOFT_TIMEOUT_MS =
+const SOFT_TIMEOUT_MS = 300_000; // 5 min — progress notifications reset MCP client timeout, so this is a safety net not the primary limit
 const READ_CHUNK_TIMEOUT_MS = 30_000; // max wait for a single SSE chunk
 const FALLBACK_CONTEXT_LENGTH = parseInt(process.env.LM_CONTEXT_WINDOW || '100000', 10);
 // ── Session-level token accounting ───────────────────────────────────
@@ -36,13 +38,17 @@ function recordUsage(resp) {
 session.promptTokens += resp.usage.prompt_tokens;
 session.completionTokens += resp.usage.completion_tokens;
 }
+else if (resp.content.length > 0) {
+// Estimate when usage is missing (truncated responses)
+session.completionTokens += Math.ceil(resp.content.length / 4);
+}
 // Track per-model perf stats
 if (resp.model) {
 const existing = session.modelStats.get(resp.model) || { calls: 0, perfCalls: 0, totalTtftMs: 0, totalTokPerSec: 0 };
 existing.calls++;
 if (resp.ttftMs)
 existing.totalTtftMs += resp.ttftMs;
-const tokPerSec = resp.usage && resp.generationMs >
+const tokPerSec = resp.usage && resp.generationMs > 50
 ? (resp.usage.completion_tokens / (resp.generationMs / 1000))
 : 0;
 if (tokPerSec > 0) {
@@ -64,6 +70,18 @@ function apiHeaders() {
 h['Authorization'] = `Bearer ${LM_PASSWORD}`;
 return h;
 }
+// ── Request semaphore ────────────────────────────────────────────────
+// Most local LLM servers run a single model and queue parallel requests,
+// which stacks timeouts and wastes the 55s budget. This semaphore ensures
+// only one inference call runs at a time; others wait in line.
+let inferenceLock = Promise.resolve();
+function withInferenceLock(fn) {
+let release;
+const next = new Promise((resolve) => { release = resolve; });
+const wait = inferenceLock;
+inferenceLock = next;
+return wait.then(fn).finally(() => release());
+}
 const MODEL_PROFILES = [
 {
 pattern: /nemotron|nemotron_h_moe/i,
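The `withInferenceLock` helper added above is a plain promise-chain mutex. A minimal standalone sketch of the same pattern (names and the demo are illustrative, not taken from the package):

```js
// Promise-chain mutex: each caller waits on the previous caller's promise,
// then installs its own, so async jobs run strictly one at a time.
let lock = Promise.resolve();

function withLock(fn) {
    let release;
    const next = new Promise((resolve) => { release = resolve; });
    const wait = lock;   // whatever is already queued ahead of us
    lock = next;         // the next caller will wait on us
    return wait.then(fn).finally(() => release());
}

// Usage: both calls are issued at once, but they execute in submission order.
async function demo() {
    const slow = withLock(() => new Promise((r) => setTimeout(() => r('slow done'), 200)));
    const fast = withLock(async () => 'fast done');
    console.log(await slow, '|', await fast); // "slow done | fast done"
}
demo();
```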
@@ -320,10 +338,33 @@ async function timedRead(reader, timeoutMs) {
 * This means large code reviews return partial results instead of nothing.
 */
 async function chatCompletionStreaming(messages, options = {}) {
+return withInferenceLock(() => chatCompletionStreamingInner(messages, options));
+}
+/** Get the first loaded model's info for context-aware defaults. */
+async function getActiveModel() {
+try {
+const models = await listModelsRaw();
+return models.find((m) => m.state === 'loaded') ?? models[0] ?? null;
+}
+catch {
+return null;
+}
+}
+async function chatCompletionStreamingInner(messages, options = {}) {
+// Derive max_tokens from the model's actual context window when not explicitly set.
+// Uses 25% of context as a generous output budget (e.g. 262K context → 65K output).
+let effectiveMaxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS;
+if (!options.maxTokens) {
+const activeModel = await getActiveModel();
+if (activeModel) {
+const ctx = getContextLength(activeModel);
+effectiveMaxTokens = Math.floor(ctx * 0.25);
+}
+}
 const body = {
 messages,
 temperature: options.temperature ?? DEFAULT_TEMPERATURE,
-max_tokens:
+max_tokens: effectiveMaxTokens,
 stream: true,
 stream_options: { include_usage: true },
 };
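The derivation above is simple arithmetic over the reported context window; a few worked values (context sizes chosen for illustration):

```js
// floor(contextLength * 0.25) → default output budget when maxTokens is omitted.
for (const ctx of [262144, 131072, 32768]) {
    console.log(`${ctx} context → ${Math.floor(ctx * 0.25)} max_tokens`);
}
// 262144 → 65536, 131072 → 32768, 32768 → 8192
```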
@@ -333,6 +374,26 @@ async function chatCompletionStreaming(messages, options = {}) {
 if (options.responseFormat) {
 body.response_format = options.responseFormat;
 }
+// Handle thinking/reasoning models.
+// Some models (Gemma 4, Qwen3, DeepSeek) have extended thinking that consumes
+// part of the max_tokens budget for invisible reasoning before producing content.
+// Strategy: try to disable thinking via enable_thinking=false, BUT also inflate
+// max_tokens as a safety net since some models (Gemma 4) hardcode thinking=true
+// in their Jinja template and ignore the API parameter.
+const modelId = (options.model || LM_MODEL || '').toString();
+if (modelId) {
+const thinking = await getThinkingSupport(modelId);
+if (thinking?.supportsThinkingToggle) {
+body.enable_thinking = false;
+// Safety net: inflate max_tokens to account for reasoning budget.
+// Gemma 4 ignores enable_thinking=false (hardcoded in template),
+// so the model will think regardless. Without inflation, reasoning
+// consumes all tokens and content comes back empty.
+const requestedTokens = (options.maxTokens ?? DEFAULT_MAX_TOKENS);
+body.max_tokens = Math.max(requestedTokens * 4, requestedTokens + 2000);
+process.stderr.write(`[houtini-lm] Thinking model ${modelId}: enable_thinking=false, max_tokens inflated ${requestedTokens} → ${body.max_tokens}\n`);
+}
+}
 const startTime = Date.now();
 const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/chat/completions`, { method: 'POST', headers: apiHeaders(), body: JSON.stringify(body) }, INFERENCE_CONNECT_TIMEOUT_MS);
 if (!res.ok) {
@@ -345,6 +406,7 @@ async function chatCompletionStreaming(messages, options = {}) {
 const reader = res.body.getReader();
 const decoder = new TextDecoder();
 let content = '';
+let chunkCount = 0;
 let model = '';
 let usage;
 let finishReason = '';
@@ -386,10 +448,41 @@ async function chatCompletionStreaming(messages, options = {}) {
 if (json.model)
 model = json.model;
 const delta = json.choices?.[0]?.delta;
+// Track reasoning/thinking tokens — models like Gemma 4, Qwen3, DeepSeek
+// emit reasoning_content during their thinking phase before producing
+// visible content. We must send progress notifications during this phase
+// to prevent MCP client timeout.
+if (delta?.reasoning_content) {
+chunkCount++;
+if (options.progressToken !== undefined) {
+server.notification({
+method: 'notifications/progress',
+params: {
+progressToken: options.progressToken,
+progress: chunkCount,
+message: `Thinking... (${chunkCount} chunks)`,
+},
+}).catch(() => { });
+}
+}
 if (delta?.content) {
 if (ttftMs === undefined)
 ttftMs = Date.now() - startTime;
 content += delta.content;
+chunkCount++;
+// Send progress notification to reset MCP client timeout.
+// Each notification resets the 60s clock, giving slow models
+// unlimited time as long as they're actively generating.
+if (options.progressToken !== undefined) {
+server.notification({
+method: 'notifications/progress',
+params: {
+progressToken: options.progressToken,
+progress: chunkCount,
+message: `Streaming... ${content.length} chars`,
+},
+}).catch(() => { });
+}
 }
 const reason = json.choices?.[0]?.finish_reason;
 if (reason)
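These notifications only flow when the caller opted in with a progress token. On the wire, the token travels in the request's `_meta` and the server keys its progress messages to it; a sketch of both shapes (tool arguments and values are illustrative, not from the package):

```js
// Client-side tools/call request that opts in to progress updates.
const callRequest = {
    jsonrpc: '2.0',
    id: 42,
    method: 'tools/call',
    params: {
        name: 'code_task',
        arguments: { code: 'function add(a, b) { return a + b; }', task: 'Find bugs' },
        _meta: { progressToken: 'job-42' }, // any string or number
    },
};

// The matching notification the server emits per streamed chunk,
// which is what resets the MCP client's request timeout.
const progressNotification = {
    jsonrpc: '2.0',
    method: 'notifications/progress',
    params: { progressToken: 'job-42', progress: 17, message: 'Streaming... 1234 chars' },
};
```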
@@ -403,6 +496,33 @@ async function chatCompletionStreaming(messages, options = {}) {
 }
 }
 }
+// Flush remaining buffer — the usage chunk often arrives in the final SSE
+// message and may not have a trailing newline, leaving it stranded in buffer.
+if (buffer.trim()) {
+const trimmed = buffer.trim();
+if (trimmed.startsWith('data: ') && trimmed !== 'data: [DONE]') {
+try {
+const json = JSON.parse(trimmed.slice(6));
+if (json.model)
+model = json.model;
+const delta = json.choices?.[0]?.delta;
+if (delta?.content) {
+if (ttftMs === undefined)
+ttftMs = Date.now() - startTime;
+content += delta.content;
+}
+const reason = json.choices?.[0]?.finish_reason;
+if (reason)
+finishReason = reason;
+if (json.usage)
+usage = json.usage;
+}
+catch (e) {
+// Incomplete JSON in final buffer — log for diagnostics
+process.stderr.write(`[houtini-lm] Unflushed buffer parse failed (${buffer.length} bytes): ${e}\n`);
+}
+}
+}
 }
 finally {
 // Release the reader — don't await cancel() as it can hang
@@ -416,7 +536,17 @@ async function chatCompletionStreaming(messages, options = {}) {
 let cleanContent = content.replace(/<think>[\s\S]*?<\/think>\s*/g, ''); // closed blocks
 cleanContent = cleanContent.replace(/^<think>\s*/, ''); // orphaned opening tag
 cleanContent = cleanContent.trim();
-
+// Safety net on top of the thinking-model max_tokens inflation: some MLX/GGUF
+// quants still exhaust their budget inside an unclosed <think> block despite
+// `enable_thinking:false` and the 4× inflation. If stripping leaves nothing but
+// raw output exists, return the raw reasoning so the caller sees *something*
+// rather than an empty body + lone footer (issue #6).
+let thinkStripFallback = false;
+if (!cleanContent && content.trim()) {
+thinkStripFallback = true;
+cleanContent = content.trim();
+}
+return { content: cleanContent, rawContent: content, model, usage, finishReason, truncated, ttftMs, generationMs, thinkStripFallback };
 }
 /**
 * Fetch models from LM Studio's native v0 API first (richer metadata),
@@ -507,6 +637,39 @@ async function routeToModel(taskType) {
 }
 return result;
 }
+function assessQuality(resp, rawContent) {
+const hadThinkBlocks = /<think>/.test(rawContent);
+const estimated = !resp.usage && resp.content.length > 0;
+const tokPerSec = resp.usage && resp.generationMs > 50
+? resp.usage.completion_tokens / (resp.generationMs / 1000)
+: null;
+return {
+truncated: resp.truncated,
+finishReason: resp.finishReason || 'unknown',
+thinkBlocksStripped: hadThinkBlocks,
+thinkStripFallback: resp.thinkStripFallback ?? false,
+estimatedTokens: estimated,
+contentLength: resp.content.length,
+generationMs: resp.generationMs,
+tokPerSec,
+};
+}
+function formatQualityLine(quality) {
+const flags = [];
+if (quality.truncated)
+flags.push('TRUNCATED');
+if (quality.thinkStripFallback)
+flags.push('think-strip-empty (showing raw reasoning — model ignored enable_thinking:false)');
+else if (quality.thinkBlocksStripped)
+flags.push('think-blocks-stripped');
+if (quality.estimatedTokens)
+flags.push('tokens-estimated');
+if (quality.finishReason === 'length')
+flags.push('hit-max-tokens');
+if (flags.length === 0)
+return '';
+return `Quality: ${flags.join(', ')}`;
+}
 /**
 * Format a footer line for streaming results showing model, usage, and truncation status.
 */
@@ -516,13 +679,19 @@ function formatFooter(resp, extra) {
 const parts = [];
 if (resp.model)
 parts.push(`Model: ${resp.model}`);
-if (resp.usage)
+if (resp.usage) {
 parts.push(`${resp.usage.prompt_tokens}→${resp.usage.completion_tokens} tokens`);
+}
+else if (resp.content.length > 0) {
+// Estimate when usage is missing (truncated responses where final SSE chunk was lost)
+const estTokens = Math.ceil(resp.content.length / 4);
+parts.push(`~${estTokens} tokens (estimated)`);
+}
 // Perf stats — computed from streaming, no proprietary API needed
 const perfParts = [];
 if (resp.ttftMs !== undefined)
 perfParts.push(`TTFT: ${resp.ttftMs}ms`);
-if (resp.usage && resp.generationMs >
+if (resp.usage && resp.generationMs > 50) {
 const tokPerSec = resp.usage.completion_tokens / (resp.generationMs / 1000);
 perfParts.push(`${tokPerSec.toFixed(1)} tok/s`);
 }
@@ -532,6 +701,11 @@ function formatFooter(resp, extra) {
 parts.push(perfParts.join(', '));
 if (extra)
 parts.push(extra);
+// Quality signals — structured metadata for orchestrator trust decisions
+const quality = assessQuality(resp, resp.rawContent);
+const qualityLine = formatQualityLine(quality);
+if (qualityLine)
+parts.push(qualityLine);
 if (resp.truncated)
 parts.push('⚠ TRUNCATED (soft timeout — partial result)');
 const sessionLine = sessionSummary();
@@ -683,6 +857,44 @@ const TOOLS = [
 required: ['code', 'task'],
 },
 },
+{
+name: 'code_task_files',
+description: 'Like code_task, but the local LLM reads the files directly from disk — the contents never pass through the MCP client\'s context window.\n\n' +
+'USE THIS instead of code_task when you want the LLM to review multiple files or a single large file, without copying source into the chat.\n\n' +
+'HOW IT WORKS:\n' +
+'• Provide absolute paths to the files you want analysed.\n' +
+'• The server reads each file (Promise.allSettled — one unreadable file does not sink the call).\n' +
+'• Files are concatenated with `=== <filename> ===` headers, then sent to the same code-review pipeline as code_task.\n' +
+'• Read failures are surfaced inline (with the reason) so the LLM can still reason about what it did receive.\n\n' +
+'WHEN TO USE:\n' +
+'• Reviewing multiple related files (module + its tests, client + server pair)\n' +
+'• Auditing a single large file too big to paste comfortably\n' +
+'• Any code_task where saving MCP client tokens matters\n\n' +
+'QA: Same rules as code_task — verify the output before acting on it.',
+inputSchema: {
+type: 'object',
+properties: {
+paths: {
+type: 'array',
+items: { type: 'string' },
+description: 'Absolute file paths to analyse. Relative paths are rejected — always pass absolute.',
+},
+task: {
+type: 'string',
+description: 'What to do: "Find bugs", "Explain this module", "Suggest a cleaner API", etc.',
+},
+language: {
+type: 'string',
+description: 'Optional language hint: "typescript", "python", etc. Shapes the system prompt.',
+},
+max_tokens: {
+type: 'number',
+description: 'Optional output budget override. Defaults to 25% of the loaded model\'s context window.',
+},
+},
+required: ['paths', 'task'],
+},
+},
 {
 name: 'discover',
 description: 'Check whether the local LLM is online and what model is loaded. Returns model name, context window size, ' +
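For orientation, a call to the new `code_task_files` tool might carry arguments shaped like this (the file paths and task text are made up for the example):

```js
// Hypothetical tools/call arguments for code_task_files.
// Only the paths travel over MCP; the server reads the file contents itself.
const args = {
    paths: [
        '/home/user/project/src/server.js',      // illustrative absolute paths
        '/home/user/project/src/server.test.js',
    ],
    task: 'Find bugs and suggest concrete fixes',
    language: 'javascript',
    // max_tokens omitted → defaults to 25% of the loaded model's context window
};
```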
@@ -721,10 +933,55 @@ const TOOLS = [
 },
 ];
 // ── MCP Server ───────────────────────────────────────────────────────
-const server = new Server({ name: 'houtini-lm', version: '2.
+const server = new Server({ name: 'houtini-lm', version: '2.9.0' }, { capabilities: { tools: {}, resources: {} } });
+// ── MCP Resources ─────────────────────────────────────────────────────
+// Exposes session performance metrics as a readable resource so Claude can
+// proactively check offload efficiency and make smarter delegation decisions.
+server.setRequestHandler(ListResourcesRequestSchema, async () => ({
+resources: [
+{
+uri: 'houtini://metrics/session',
+name: 'Session Offload Metrics',
+description: 'Cumulative token offload stats, per-model performance, and quality signals for the current session.',
+mimeType: 'application/json',
+},
+],
+}));
+server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
+const { uri } = request.params;
+if (uri === 'houtini://metrics/session') {
+const modelStats = {};
+for (const [modelId, stats] of session.modelStats) {
+modelStats[modelId] = {
+calls: stats.calls,
+avgTtftMs: stats.calls > 0 ? Math.round(stats.totalTtftMs / stats.calls) : 0,
+avgTokPerSec: stats.perfCalls > 0 ? parseFloat((stats.totalTokPerSec / stats.perfCalls).toFixed(1)) : null,
+};
+}
+const metrics = {
+session: {
+totalCalls: session.calls,
+promptTokens: session.promptTokens,
+completionTokens: session.completionTokens,
+totalTokensOffloaded: session.promptTokens + session.completionTokens,
+},
+perModel: modelStats,
+endpoint: LM_BASE_URL,
+};
+return {
+contents: [{
+uri,
+mimeType: 'application/json',
+text: JSON.stringify(metrics, null, 2),
+}],
+};
+}
+throw new Error(`Unknown resource: ${uri}`);
+});
 server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: TOOLS }));
 server.setRequestHandler(CallToolRequestSchema, async (request) => {
 const { name, arguments: args } = request.params;
+const progressToken = request.params._meta?.progressToken;
 try {
 switch (name) {
 case 'chat': {
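A client can poll the new metrics resource between tool calls. A sketch of the read request and of the JSON shape the handler above returns (the numbers and model name are placeholders, not real measurements):

```js
// resources/read request for the session metrics resource.
const readRequest = {
    jsonrpc: '2.0',
    id: 7,
    method: 'resources/read',
    params: { uri: 'houtini://metrics/session' },
};

// Shape of the single application/json content item the handler returns.
const exampleMetrics = {
    session: { totalCalls: 3, promptTokens: 5200, completionTokens: 1800, totalTokensOffloaded: 7000 },
    perModel: { 'qwen2.5-coder-7b': { calls: 3, avgTtftMs: 420, avgTokPerSec: 38.5 } },
    endpoint: 'http://localhost:1234',
};
```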
@@ -746,6 +1003,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
 maxTokens: max_tokens,
 model: route.modelId,
 responseFormat,
+progressToken,
 });
 const footer = formatFooter(resp);
 return { content: [{ type: 'text', text: resp.content + footer }] };
@@ -759,10 +1017,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
 : (route.hints.outputConstraint || undefined);
 if (systemContent)
 messages.push({ role: 'system', content: systemContent });
-
-
-
-
+// Multi-turn format prevents context bleed in smaller models.
+// Context goes in a separate user→assistant exchange so the model
+// "acknowledges" it before receiving the actual instruction.
+if (context) {
+messages.push({ role: 'user', content: `Here is the context for analysis:\n\n${context}` });
+messages.push({ role: 'assistant', content: 'Understood. I have read the full context. What would you like me to do with it?' });
+}
+messages.push({ role: 'user', content: instruction });
 const responseFormat = json_schema
 ? { type: 'json_schema', json_schema: { name: json_schema.name, strict: json_schema.strict ?? true, schema: json_schema.schema } }
 : undefined;
@@ -771,6 +1033,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
 maxTokens: max_tokens,
 model: route.modelId,
 responseFormat,
+progressToken,
 });
 const footer = formatFooter(resp);
 return {
@@ -784,25 +1047,98 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
 const outputConstraint = route.hints.outputConstraint
 ? ` ${route.hints.outputConstraint}`
 : '';
+// Task goes in system message so smaller models don't lose it once
+// the code block fills the attention window. Code is sole user content.
 const codeMessages = [
 {
 role: 'system',
-content: `Expert ${lang} developer.
+content: `Expert ${lang} developer. Your task: ${task}\n\nBe specific — reference line numbers, function names, and concrete fixes. Output your analysis as a markdown list.${outputConstraint}`,
 },
 {
 role: 'user',
-content:
+content: `\`\`\`${lang}\n${code}\n\`\`\``,
 },
 ];
 const codeResp = await chatCompletionStreaming(codeMessages, {
 temperature: route.hints.codeTemp,
 maxTokens: codeMaxTokens ?? DEFAULT_MAX_TOKENS,
 model: route.modelId,
+progressToken,
 });
 const codeFooter = formatFooter(codeResp, lang);
 const suggestionLine = route.suggestion ? `\n${route.suggestion}` : '';
 return { content: [{ type: 'text', text: codeResp.content + codeFooter + suggestionLine }] };
 }
+case 'code_task_files': {
+const { paths, task, language, max_tokens: codeMaxTokens } = args;
+if (!Array.isArray(paths) || paths.length === 0) {
+return {
+content: [{ type: 'text', text: 'Error: paths must be a non-empty array of absolute file paths.' }],
+isError: true,
+};
+}
+// Reject relative paths early — silent resolution against cwd is surprising.
+const relative = paths.filter((p) => typeof p !== 'string' || !isAbsolute(p));
+if (relative.length > 0) {
+return {
+content: [{ type: 'text', text: `Error: all paths must be absolute. Relative paths: ${JSON.stringify(relative)}` }],
+isError: true,
+};
+}
+// Read all files in parallel. One unreadable file doesn't sink the call —
+// failures become inline error sections so the model can still reason about
+// the rest of the bundle.
+const reads = await Promise.allSettled(paths.map(async (p) => ({ path: p, content: await readFile(p, 'utf8') })));
+const sections = [];
+let successCount = 0;
+reads.forEach((r, i) => {
+const p = paths[i];
+if (r.status === 'fulfilled') {
+successCount++;
+sections.push(`=== ${basename(p)} (${p}) ===\n${r.value.content}`);
+}
+else {
+const reason = r.reason instanceof Error ? r.reason.message : String(r.reason);
+sections.push(`=== ${basename(p)} (${p}) — READ FAILED ===\n[Could not read: ${reason}]`);
+}
+});
+if (successCount === 0) {
+return {
+content: [{ type: 'text', text: `Error: none of the ${paths.length} file(s) could be read. Check the paths and permissions.\n\n${sections.join('\n\n')}` }],
+isError: true,
+};
+}
+const lang = language || 'unknown';
+const route = await routeToModel('code');
+const outputConstraint = route.hints.outputConstraint
+? ` ${route.hints.outputConstraint}`
+: '';
+const combined = sections.join('\n\n');
+const codeMessages = [
+{
+role: 'system',
+content: `Expert ${lang} developer. Your task: ${task}\n\nThe user has provided ${paths.length} file(s), concatenated below with \`=== filename ===\` headers. Reference files by name in your output. Be specific — line numbers, function names, concrete fixes. Output your analysis as a markdown list.${outputConstraint}`,
+},
+{
+role: 'user',
+content: `\`\`\`${lang}\n${combined}\n\`\`\``,
+},
+];
+// Pass codeMaxTokens raw (not `?? DEFAULT_MAX_TOKENS`) so the 25%-of-context
+// auto-derivation in chatCompletionStreamingInner fires when the caller omits it.
+const codeResp = await chatCompletionStreaming(codeMessages, {
+temperature: route.hints.codeTemp,
+maxTokens: codeMaxTokens,
+model: route.modelId,
+progressToken,
+});
+const readSummary = successCount === paths.length
+? `${paths.length} file(s) read`
+: `${successCount}/${paths.length} file(s) read`;
+const codeFooter = formatFooter(codeResp, `${lang} · ${readSummary}`);
+const suggestionLine = route.suggestion ? `\n${route.suggestion}` : '';
+return { content: [{ type: 'text', text: codeResp.content + codeFooter + suggestionLine }] };
+}
 case 'discover': {
 const start = Date.now();
 let models;
@@ -870,7 +1206,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
 }
 }
 text += `${sessionStats}\n\n`;
-text += `The local LLM is available. You can delegate tasks using chat, custom_prompt, code_task, or embed.`;
+text += `The local LLM is available. You can delegate tasks using chat, custom_prompt, code_task, code_task_files, or embed.`;
 return { content: [{ type: 'text', text }] };
 }
 case 'list_models': {
@@ -896,33 +1232,35 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
 }
 case 'embed': {
 const { input, model: embedModel } = args;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+return await withInferenceLock(async () => {
+const embedBody = { input };
+if (embedModel) {
+embedBody.model = embedModel;
+}
+const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/embeddings`, { method: 'POST', headers: apiHeaders(), body: JSON.stringify(embedBody) }, INFERENCE_CONNECT_TIMEOUT_MS);
+if (!res.ok) {
+const errText = await res.text().catch(() => '');
+throw new Error(`Embeddings API error ${res.status}: ${errText}`);
+}
+const data = (await res.json());
+const embedding = data.data[0]?.embedding;
+if (!embedding)
+throw new Error('No embedding returned');
+const usageInfo = data.usage
+? `${data.usage.prompt_tokens} tokens embedded`
+: '';
+return {
+content: [{
+type: 'text',
+text: JSON.stringify({
+model: data.model,
+dimensions: embedding.length,
+embedding,
+usage: usageInfo,
+}),
+}],
+};
+});
 }
 default:
 throw new Error(`Unknown tool: ${name}`);