@houtini/lm 2.7.0 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -7,16 +7,18 @@
   */
  import { Server } from '@modelcontextprotocol/sdk/server/index.js';
  import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
- import { CallToolRequestSchema, ListToolsRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
- import { profileModelsAtStartup, getCachedProfile, toModelProfile as cachedToProfile, getHFEnrichmentLine, getPromptHints, } from './model-cache.js';
+ import { CallToolRequestSchema, ListToolsRequestSchema, ListResourcesRequestSchema, ReadResourceRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
+ import { profileModelsAtStartup, getCachedProfile, toModelProfile as cachedToProfile, getHFEnrichmentLine, getPromptHints, getThinkingSupport, } from './model-cache.js';
+ import { readFile } from 'node:fs/promises';
+ import { isAbsolute, basename } from 'node:path';
  const LM_BASE_URL = process.env.LM_STUDIO_URL || 'http://localhost:1234';
  const LM_MODEL = process.env.LM_STUDIO_MODEL || '';
  const LM_PASSWORD = process.env.LM_STUDIO_PASSWORD || '';
- const DEFAULT_MAX_TOKENS = 2048;
+ const DEFAULT_MAX_TOKENS = 16384; // fallback when model context is unknown — overridden by dynamic calculation below
  const DEFAULT_TEMPERATURE = 0.3;
  const CONNECT_TIMEOUT_MS = 5000;
  const INFERENCE_CONNECT_TIMEOUT_MS = 30_000; // generous connect timeout for inference
- const SOFT_TIMEOUT_MS = 55_000; // return partial results before MCP SDK ~60s timeout
+ const SOFT_TIMEOUT_MS = 300_000; // 5 min progress notifications reset MCP client timeout, so this is a safety net not the primary limit
  const READ_CHUNK_TIMEOUT_MS = 30_000; // max wait for a single SSE chunk
  const FALLBACK_CONTEXT_LENGTH = parseInt(process.env.LM_CONTEXT_WINDOW || '100000', 10);
  // ── Session-level token accounting ───────────────────────────────────
@@ -36,13 +38,17 @@ function recordUsage(resp) {
  session.promptTokens += resp.usage.prompt_tokens;
  session.completionTokens += resp.usage.completion_tokens;
  }
+ else if (resp.content.length > 0) {
+ // Estimate when usage is missing (truncated responses)
+ session.completionTokens += Math.ceil(resp.content.length / 4);
+ }
  // Track per-model perf stats
  if (resp.model) {
  const existing = session.modelStats.get(resp.model) || { calls: 0, perfCalls: 0, totalTtftMs: 0, totalTokPerSec: 0 };
  existing.calls++;
  if (resp.ttftMs)
  existing.totalTtftMs += resp.ttftMs;
- const tokPerSec = resp.usage && resp.generationMs > 0
+ const tokPerSec = resp.usage && resp.generationMs > 50
  ? (resp.usage.completion_tokens / (resp.generationMs / 1000))
  : 0;
  if (tokPerSec > 0) {
@@ -64,6 +70,18 @@ function apiHeaders() {
  h['Authorization'] = `Bearer ${LM_PASSWORD}`;
  return h;
  }
+ // ── Request semaphore ────────────────────────────────────────────────
+ // Most local LLM servers run a single model and queue parallel requests,
+ // which stacks timeouts and wastes the 55s budget. This semaphore ensures
+ // only one inference call runs at a time; others wait in line.
+ let inferenceLock = Promise.resolve();
+ function withInferenceLock(fn) {
+ let release;
+ const next = new Promise((resolve) => { release = resolve; });
+ const wait = inferenceLock;
+ inferenceLock = next;
+ return wait.then(fn).finally(() => release());
+ }
  const MODEL_PROFILES = [
  {
  pattern: /nemotron|nemotron_h_moe/i,
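
Editor's note: a minimal sketch of how the promise-chain lock above behaves when two calls arrive together. This is illustrative only, not part of the package; it assumes withInferenceLock from the hunk above is in scope.

async function demo() {
  // First caller grabs the lock and holds it for ~1s.
  const slow = withInferenceLock(() => new Promise((resolve) => setTimeout(() => resolve('first'), 1000)));
  // Second caller is queued behind the first; its fn only runs after 'first' settles.
  const fast = withInferenceLock(() => Promise.resolve('second'));
  console.log(await Promise.all([slow, fast])); // ['first', 'second'] after ~1s
}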
@@ -320,10 +338,33 @@ async function timedRead(reader, timeoutMs) {
  * This means large code reviews return partial results instead of nothing.
  */
  async function chatCompletionStreaming(messages, options = {}) {
+ return withInferenceLock(() => chatCompletionStreamingInner(messages, options));
+ }
+ /** Get the first loaded model's info for context-aware defaults. */
+ async function getActiveModel() {
+ try {
+ const models = await listModelsRaw();
+ return models.find((m) => m.state === 'loaded') ?? models[0] ?? null;
+ }
+ catch {
+ return null;
+ }
+ }
+ async function chatCompletionStreamingInner(messages, options = {}) {
+ // Derive max_tokens from the model's actual context window when not explicitly set.
+ // Uses 25% of context as a generous output budget (e.g. 262K context → 65K output).
+ let effectiveMaxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS;
+ if (!options.maxTokens) {
+ const activeModel = await getActiveModel();
+ if (activeModel) {
+ const ctx = getContextLength(activeModel);
+ effectiveMaxTokens = Math.floor(ctx * 0.25);
+ }
+ }
  const body = {
  messages,
  temperature: options.temperature ?? DEFAULT_TEMPERATURE,
- max_tokens: options.maxTokens ?? DEFAULT_MAX_TOKENS,
+ max_tokens: effectiveMaxTokens,
  stream: true,
  stream_options: { include_usage: true },
  };
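
Editor's note: a quick worked example of the budget derivation above, with illustrative numbers.

// A loaded model reporting a 262,144-token context and no explicit maxTokens:
Math.floor(262144 * 0.25); // => 65536 output tokens
// With no loaded-model info, effectiveMaxTokens stays at DEFAULT_MAX_TOKENS (16384).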
@@ -333,6 +374,26 @@ async function chatCompletionStreaming(messages, options = {}) {
  if (options.responseFormat) {
  body.response_format = options.responseFormat;
  }
+ // Handle thinking/reasoning models.
+ // Some models (Gemma 4, Qwen3, DeepSeek) have extended thinking that consumes
+ // part of the max_tokens budget for invisible reasoning before producing content.
+ // Strategy: try to disable thinking via enable_thinking=false, BUT also inflate
+ // max_tokens as a safety net since some models (Gemma 4) hardcode thinking=true
+ // in their Jinja template and ignore the API parameter.
+ const modelId = (options.model || LM_MODEL || '').toString();
+ if (modelId) {
+ const thinking = await getThinkingSupport(modelId);
+ if (thinking?.supportsThinkingToggle) {
+ body.enable_thinking = false;
+ // Safety net: inflate max_tokens to account for reasoning budget.
+ // Gemma 4 ignores enable_thinking=false (hardcoded in template),
+ // so the model will think regardless. Without inflation, reasoning
+ // consumes all tokens and content comes back empty.
+ const requestedTokens = (options.maxTokens ?? DEFAULT_MAX_TOKENS);
+ body.max_tokens = Math.max(requestedTokens * 4, requestedTokens + 2000);
+ process.stderr.write(`[houtini-lm] Thinking model ${modelId}: enable_thinking=false, max_tokens inflated ${requestedTokens} → ${body.max_tokens}\n`);
+ }
+ }
  const startTime = Date.now();
  const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/chat/completions`, { method: 'POST', headers: apiHeaders(), body: JSON.stringify(body) }, INFERENCE_CONNECT_TIMEOUT_MS);
  if (!res.ok) {
@@ -345,6 +406,7 @@ async function chatCompletionStreaming(messages, options = {}) {
  const reader = res.body.getReader();
  const decoder = new TextDecoder();
  let content = '';
+ let chunkCount = 0;
  let model = '';
  let usage;
  let finishReason = '';
@@ -386,10 +448,41 @@ async function chatCompletionStreaming(messages, options = {}) {
  if (json.model)
  model = json.model;
  const delta = json.choices?.[0]?.delta;
+ // Track reasoning/thinking tokens — models like Gemma 4, Qwen3, DeepSeek
+ // emit reasoning_content during their thinking phase before producing
+ // visible content. We must send progress notifications during this phase
+ // to prevent MCP client timeout.
+ if (delta?.reasoning_content) {
+ chunkCount++;
+ if (options.progressToken !== undefined) {
+ server.notification({
+ method: 'notifications/progress',
+ params: {
+ progressToken: options.progressToken,
+ progress: chunkCount,
+ message: `Thinking... (${chunkCount} chunks)`,
+ },
+ }).catch(() => { });
+ }
+ }
  if (delta?.content) {
  if (ttftMs === undefined)
  ttftMs = Date.now() - startTime;
  content += delta.content;
+ chunkCount++;
+ // Send progress notification to reset MCP client timeout.
+ // Each notification resets the 60s clock, giving slow models
+ // unlimited time as long as they're actively generating.
+ if (options.progressToken !== undefined) {
+ server.notification({
+ method: 'notifications/progress',
+ params: {
+ progressToken: options.progressToken,
+ progress: chunkCount,
+ message: `Streaming... ${content.length} chars`,
+ },
+ }).catch(() => { });
+ }
  }
  const reason = json.choices?.[0]?.finish_reason;
  if (reason)
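
Editor's note: each of those calls serialises to a JSON-RPC notification roughly like the sketch below. The field names mirror what the code above sends; the envelope and values are an approximation, not taken from the package.

// Approximate wire format of one progress notification (values invented):
const exampleNotification = {
  jsonrpc: '2.0',
  method: 'notifications/progress',
  params: {
    progressToken: 'req-42',            // echoed from the client's tools/call _meta
    progress: 87,                       // monotonically increasing chunk counter
    message: 'Streaming... 3512 chars',
  },
};
// A client that passed _meta.progressToken resets its per-request timeout on each
// of these, which is what lets generation run past the usual 60s limit.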
@@ -403,6 +496,33 @@ async function chatCompletionStreaming(messages, options = {}) {
  }
  }
  }
+ // Flush remaining buffer — the usage chunk often arrives in the final SSE
+ // message and may not have a trailing newline, leaving it stranded in buffer.
+ if (buffer.trim()) {
+ const trimmed = buffer.trim();
+ if (trimmed.startsWith('data: ') && trimmed !== 'data: [DONE]') {
+ try {
+ const json = JSON.parse(trimmed.slice(6));
+ if (json.model)
+ model = json.model;
+ const delta = json.choices?.[0]?.delta;
+ if (delta?.content) {
+ if (ttftMs === undefined)
+ ttftMs = Date.now() - startTime;
+ content += delta.content;
+ }
+ const reason = json.choices?.[0]?.finish_reason;
+ if (reason)
+ finishReason = reason;
+ if (json.usage)
+ usage = json.usage;
+ }
+ catch (e) {
+ // Incomplete JSON in final buffer — log for diagnostics
+ process.stderr.write(`[houtini-lm] Unflushed buffer parse failed (${buffer.length} bytes): ${e}\n`);
+ }
+ }
+ }
  }
  finally {
  // Release the reader — don't await cancel() as it can hang
@@ -416,7 +536,17 @@ async function chatCompletionStreaming(messages, options = {}) {
  let cleanContent = content.replace(/<think>[\s\S]*?<\/think>\s*/g, ''); // closed blocks
  cleanContent = cleanContent.replace(/^<think>\s*/, ''); // orphaned opening tag
  cleanContent = cleanContent.trim();
- return { content: cleanContent, model, usage, finishReason, truncated, ttftMs, generationMs };
+ // Safety net on top of the thinking-model max_tokens inflation: some MLX/GGUF
+ // quants still exhaust their budget inside an unclosed <think> block despite
+ // `enable_thinking:false` and the 4× inflation. If stripping leaves nothing but
+ // raw output exists, return the raw reasoning so the caller sees *something*
+ // rather than an empty body + lone footer (issue #6).
+ let thinkStripFallback = false;
+ if (!cleanContent && content.trim()) {
+ thinkStripFallback = true;
+ cleanContent = content.trim();
+ }
+ return { content: cleanContent, rawContent: content, model, usage, finishReason, truncated, ttftMs, generationMs, thinkStripFallback };
  }
  /**
  * Fetch models from LM Studio's native v0 API first (richer metadata),
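
Editor's note: the empty-after-stripping case this guards is a model that closes its reasoning block and then hits max_tokens before emitting an answer. A small sketch of that path (sample string invented, logic mirrors the code above):

const onlyThinking = '<think>step 1... step 2... ran out of budget.</think>';
let clean = onlyThinking.replace(/<think>[\s\S]*?<\/think>\s*/g, '').trim();
// clean === '' even though the model produced text, so the branch above returns
// onlyThinking.trim() as the content and flags thinkStripFallback = true.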
@@ -507,6 +637,39 @@ async function routeToModel(taskType) {
  }
  return result;
  }
+ function assessQuality(resp, rawContent) {
+ const hadThinkBlocks = /<think>/.test(rawContent);
+ const estimated = !resp.usage && resp.content.length > 0;
+ const tokPerSec = resp.usage && resp.generationMs > 50
+ ? resp.usage.completion_tokens / (resp.generationMs / 1000)
+ : null;
+ return {
+ truncated: resp.truncated,
+ finishReason: resp.finishReason || 'unknown',
+ thinkBlocksStripped: hadThinkBlocks,
+ thinkStripFallback: resp.thinkStripFallback ?? false,
+ estimatedTokens: estimated,
+ contentLength: resp.content.length,
+ generationMs: resp.generationMs,
+ tokPerSec,
+ };
+ }
+ function formatQualityLine(quality) {
+ const flags = [];
+ if (quality.truncated)
+ flags.push('TRUNCATED');
+ if (quality.thinkStripFallback)
+ flags.push('think-strip-empty (showing raw reasoning — model ignored enable_thinking:false)');
+ else if (quality.thinkBlocksStripped)
+ flags.push('think-blocks-stripped');
+ if (quality.estimatedTokens)
+ flags.push('tokens-estimated');
+ if (quality.finishReason === 'length')
+ flags.push('hit-max-tokens');
+ if (flags.length === 0)
+ return '';
+ return `Quality: ${flags.join(', ')}`;
+ }
  /**
  * Format a footer line for streaming results showing model, usage, and truncation status.
  */
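
Editor's note: an example of what the new flags render to once they reach the footer. Inputs are invented; the call assumes formatQualityLine from the hunk above.

formatQualityLine({
  truncated: false,
  finishReason: 'length',
  thinkBlocksStripped: true,
  thinkStripFallback: false,
  estimatedTokens: true,
  contentLength: 5120,
  generationMs: 42000,
  tokPerSec: null,
});
// => 'Quality: think-blocks-stripped, tokens-estimated, hit-max-tokens'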
@@ -516,13 +679,19 @@ function formatFooter(resp, extra) {
  const parts = [];
  if (resp.model)
  parts.push(`Model: ${resp.model}`);
- if (resp.usage)
+ if (resp.usage) {
  parts.push(`${resp.usage.prompt_tokens}→${resp.usage.completion_tokens} tokens`);
+ }
+ else if (resp.content.length > 0) {
+ // Estimate when usage is missing (truncated responses where final SSE chunk was lost)
+ const estTokens = Math.ceil(resp.content.length / 4);
+ parts.push(`~${estTokens} tokens (estimated)`);
+ }
  // Perf stats — computed from streaming, no proprietary API needed
  const perfParts = [];
  if (resp.ttftMs !== undefined)
  perfParts.push(`TTFT: ${resp.ttftMs}ms`);
- if (resp.usage && resp.generationMs > 0) {
+ if (resp.usage && resp.generationMs > 50) {
  const tokPerSec = resp.usage.completion_tokens / (resp.generationMs / 1000);
  perfParts.push(`${tokPerSec.toFixed(1)} tok/s`);
  }
@@ -532,6 +701,11 @@ function formatFooter(resp, extra) {
  parts.push(perfParts.join(', '));
  if (extra)
  parts.push(extra);
+ // Quality signals — structured metadata for orchestrator trust decisions
+ const quality = assessQuality(resp, resp.rawContent);
+ const qualityLine = formatQualityLine(quality);
+ if (qualityLine)
+ parts.push(qualityLine);
  if (resp.truncated)
  parts.push('⚠ TRUNCATED (soft timeout — partial result)');
  const sessionLine = sessionSummary();
@@ -683,6 +857,44 @@ const TOOLS = [
  required: ['code', 'task'],
  },
  },
+ {
+ name: 'code_task_files',
+ description: 'Like code_task, but the local LLM reads the files directly from disk — the contents never pass through the MCP client\'s context window.\n\n' +
+ 'USE THIS instead of code_task when you want the LLM to review multiple files or a single large file, without copying source into the chat.\n\n' +
+ 'HOW IT WORKS:\n' +
+ '• Provide absolute paths to the files you want analysed.\n' +
+ '• The server reads each file (Promise.allSettled — one unreadable file does not sink the call).\n' +
+ '• Files are concatenated with `=== <filename> ===` headers, then sent to the same code-review pipeline as code_task.\n' +
+ '• Read failures are surfaced inline (with the reason) so the LLM can still reason about what it did receive.\n\n' +
+ 'WHEN TO USE:\n' +
+ '• Reviewing multiple related files (module + its tests, client + server pair)\n' +
+ '• Auditing a single large file too big to paste comfortably\n' +
+ '• Any code_task where saving MCP client tokens matters\n\n' +
+ 'QA: Same rules as code_task — verify the output before acting on it.',
+ inputSchema: {
+ type: 'object',
+ properties: {
+ paths: {
+ type: 'array',
+ items: { type: 'string' },
+ description: 'Absolute file paths to analyse. Relative paths are rejected — always pass absolute.',
+ },
+ task: {
+ type: 'string',
+ description: 'What to do: "Find bugs", "Explain this module", "Suggest a cleaner API", etc.',
+ },
+ language: {
+ type: 'string',
+ description: 'Optional language hint: "typescript", "python", etc. Shapes the system prompt.',
+ },
+ max_tokens: {
+ type: 'number',
+ description: 'Optional output budget override. Defaults to 25% of the loaded model\'s context window.',
+ },
+ },
+ required: ['paths', 'task'],
+ },
+ },
  {
  name: 'discover',
  description: 'Check whether the local LLM is online and what model is loaded. Returns model name, context window size, ' +
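
Editor's note: a hypothetical tools/call request for the new tool, shown for orientation. The paths and progress token are invented; the argument names match the inputSchema above.

const exampleCall = {
  method: 'tools/call',
  params: {
    name: 'code_task_files',
    arguments: {
      paths: ['/home/user/project/src/server.ts', '/home/user/project/src/server.test.ts'],
      task: 'Find bugs and race conditions',
      language: 'typescript',
    },
    _meta: { progressToken: 'review-1' },  // opt in to streaming progress updates
  },
};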
@@ -721,10 +933,55 @@ const TOOLS = [
  },
  ];
  // ── MCP Server ───────────────────────────────────────────────────────
- const server = new Server({ name: 'houtini-lm', version: '2.7.0' }, { capabilities: { tools: {} } });
+ const server = new Server({ name: 'houtini-lm', version: '2.9.0' }, { capabilities: { tools: {}, resources: {} } });
+ // ── MCP Resources ─────────────────────────────────────────────────────
+ // Exposes session performance metrics as a readable resource so Claude can
+ // proactively check offload efficiency and make smarter delegation decisions.
+ server.setRequestHandler(ListResourcesRequestSchema, async () => ({
+ resources: [
+ {
+ uri: 'houtini://metrics/session',
+ name: 'Session Offload Metrics',
+ description: 'Cumulative token offload stats, per-model performance, and quality signals for the current session.',
+ mimeType: 'application/json',
+ },
+ ],
+ }));
+ server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
+ const { uri } = request.params;
+ if (uri === 'houtini://metrics/session') {
+ const modelStats = {};
+ for (const [modelId, stats] of session.modelStats) {
+ modelStats[modelId] = {
+ calls: stats.calls,
+ avgTtftMs: stats.calls > 0 ? Math.round(stats.totalTtftMs / stats.calls) : 0,
+ avgTokPerSec: stats.perfCalls > 0 ? parseFloat((stats.totalTokPerSec / stats.perfCalls).toFixed(1)) : null,
+ };
+ }
+ const metrics = {
+ session: {
+ totalCalls: session.calls,
+ promptTokens: session.promptTokens,
+ completionTokens: session.completionTokens,
+ totalTokensOffloaded: session.promptTokens + session.completionTokens,
+ },
+ perModel: modelStats,
+ endpoint: LM_BASE_URL,
+ };
+ return {
+ contents: [{
+ uri,
+ mimeType: 'application/json',
+ text: JSON.stringify(metrics, null, 2),
+ }],
+ };
+ }
+ throw new Error(`Unknown resource: ${uri}`);
+ });
  server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: TOOLS }));
  server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name, arguments: args } = request.params;
+ const progressToken = request.params._meta?.progressToken;
  try {
  switch (name) {
  case 'chat': {
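
Editor's note: clients can pull the metrics without calling a tool. An illustrative resources/read exchange, with invented values shaped like the handler above produces:

const readRequest = { method: 'resources/read', params: { uri: 'houtini://metrics/session' } };
// The returned contents[0].text is JSON shaped like:
const exampleMetrics = {
  session: { totalCalls: 7, promptTokens: 48210, completionTokens: 9120, totalTokensOffloaded: 57330 },
  perModel: { 'qwen3-14b': { calls: 7, avgTtftMs: 850, avgTokPerSec: 32.4 } },
  endpoint: 'http://localhost:1234',
};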
@@ -746,6 +1003,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
  maxTokens: max_tokens,
  model: route.modelId,
  responseFormat,
+ progressToken,
  });
  const footer = formatFooter(resp);
  return { content: [{ type: 'text', text: resp.content + footer }] };
@@ -759,10 +1017,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
  : (route.hints.outputConstraint || undefined);
  if (systemContent)
  messages.push({ role: 'system', content: systemContent });
- let userContent = instruction;
- if (context)
- userContent = `Context:\n${context}\n\nInstruction:\n${instruction}`;
- messages.push({ role: 'user', content: userContent });
+ // Multi-turn format prevents context bleed in smaller models.
+ // Context goes in a separate user→assistant exchange so the model
+ // "acknowledges" it before receiving the actual instruction.
+ if (context) {
+ messages.push({ role: 'user', content: `Here is the context for analysis:\n\n${context}` });
+ messages.push({ role: 'assistant', content: 'Understood. I have read the full context. What would you like me to do with it?' });
+ }
+ messages.push({ role: 'user', content: instruction });
  const responseFormat = json_schema
  ? { type: 'json_schema', json_schema: { name: json_schema.name, strict: json_schema.strict ?? true, schema: json_schema.schema } }
  : undefined;
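
Editor's note: with a context supplied, the messages array sent to the model now looks like the sketch below (placeholders in angle brackets stand in for real values).

const exampleMessages = [
  { role: 'system', content: '<system prompt, when set>' },
  { role: 'user', content: 'Here is the context for analysis:\n\n<context>' },
  { role: 'assistant', content: 'Understood. I have read the full context. What would you like me to do with it?' },
  { role: 'user', content: '<instruction>' },
];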
@@ -771,6 +1033,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
  maxTokens: max_tokens,
  model: route.modelId,
  responseFormat,
+ progressToken,
  });
  const footer = formatFooter(resp);
  return {
@@ -784,25 +1047,98 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const outputConstraint = route.hints.outputConstraint
  ? ` ${route.hints.outputConstraint}`
  : '';
+ // Task goes in system message so smaller models don't lose it once
+ // the code block fills the attention window. Code is sole user content.
  const codeMessages = [
  {
  role: 'system',
- content: `Expert ${lang} developer. Analyse the provided code and complete the task. Be specific — reference line numbers, function names, and concrete fixes.${outputConstraint}`,
+ content: `Expert ${lang} developer. Your task: ${task}\n\nBe specific — reference line numbers, function names, and concrete fixes. Output your analysis as a markdown list.${outputConstraint}`,
  },
  {
  role: 'user',
- content: `Task: ${task}\n\n\`\`\`${lang}\n${code}\n\`\`\``,
+ content: `\`\`\`${lang}\n${code}\n\`\`\``,
  },
  ];
  const codeResp = await chatCompletionStreaming(codeMessages, {
  temperature: route.hints.codeTemp,
  maxTokens: codeMaxTokens ?? DEFAULT_MAX_TOKENS,
  model: route.modelId,
+ progressToken,
  });
  const codeFooter = formatFooter(codeResp, lang);
  const suggestionLine = route.suggestion ? `\n${route.suggestion}` : '';
  return { content: [{ type: 'text', text: codeResp.content + codeFooter + suggestionLine }] };
  }
+ case 'code_task_files': {
+ const { paths, task, language, max_tokens: codeMaxTokens } = args;
+ if (!Array.isArray(paths) || paths.length === 0) {
+ return {
+ content: [{ type: 'text', text: 'Error: paths must be a non-empty array of absolute file paths.' }],
+ isError: true,
+ };
+ }
+ // Reject relative paths early — silent resolution against cwd is surprising.
+ const relative = paths.filter((p) => typeof p !== 'string' || !isAbsolute(p));
+ if (relative.length > 0) {
+ return {
+ content: [{ type: 'text', text: `Error: all paths must be absolute. Relative paths: ${JSON.stringify(relative)}` }],
+ isError: true,
+ };
+ }
+ // Read all files in parallel. One unreadable file doesn't sink the call —
+ // failures become inline error sections so the model can still reason about
+ // the rest of the bundle.
+ const reads = await Promise.allSettled(paths.map(async (p) => ({ path: p, content: await readFile(p, 'utf8') })));
+ const sections = [];
+ let successCount = 0;
+ reads.forEach((r, i) => {
+ const p = paths[i];
+ if (r.status === 'fulfilled') {
+ successCount++;
+ sections.push(`=== ${basename(p)} (${p}) ===\n${r.value.content}`);
+ }
+ else {
+ const reason = r.reason instanceof Error ? r.reason.message : String(r.reason);
+ sections.push(`=== ${basename(p)} (${p}) — READ FAILED ===\n[Could not read: ${reason}]`);
+ }
+ });
+ if (successCount === 0) {
+ return {
+ content: [{ type: 'text', text: `Error: none of the ${paths.length} file(s) could be read. Check the paths and permissions.\n\n${sections.join('\n\n')}` }],
+ isError: true,
+ };
+ }
+ const lang = language || 'unknown';
+ const route = await routeToModel('code');
+ const outputConstraint = route.hints.outputConstraint
+ ? ` ${route.hints.outputConstraint}`
+ : '';
+ const combined = sections.join('\n\n');
+ const codeMessages = [
+ {
+ role: 'system',
+ content: `Expert ${lang} developer. Your task: ${task}\n\nThe user has provided ${paths.length} file(s), concatenated below with \`=== filename ===\` headers. Reference files by name in your output. Be specific — line numbers, function names, concrete fixes. Output your analysis as a markdown list.${outputConstraint}`,
+ },
+ {
+ role: 'user',
+ content: `\`\`\`${lang}\n${combined}\n\`\`\``,
+ },
+ ];
+ // Pass codeMaxTokens raw (not `?? DEFAULT_MAX_TOKENS`) so the 25%-of-context
+ // auto-derivation in chatCompletionStreamingInner fires when the caller omits it.
+ const codeResp = await chatCompletionStreaming(codeMessages, {
+ temperature: route.hints.codeTemp,
+ maxTokens: codeMaxTokens,
+ model: route.modelId,
+ progressToken,
+ });
+ const readSummary = successCount === paths.length
+ ? `${paths.length} file(s) read`
+ : `${successCount}/${paths.length} file(s) read`;
+ const codeFooter = formatFooter(codeResp, `${lang} · ${readSummary}`);
+ const suggestionLine = route.suggestion ? `\n${route.suggestion}` : '';
+ return { content: [{ type: 'text', text: codeResp.content + codeFooter + suggestionLine }] };
+ }
  case 'discover': {
  const start = Date.now();
  let models;
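
Editor's note: the user message the model receives is a single fenced block containing sections like the sketch below (file names invented); failed reads keep their slot so the model knows what is missing.

// === server.ts (/home/user/project/src/server.ts) ===
// ...full file contents...
//
// === missing.ts (/home/user/project/src/missing.ts) — READ FAILED ===
// [Could not read: ENOENT: no such file or directory]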
@@ -870,7 +1206,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
  }
  }
  text += `${sessionStats}\n\n`;
- text += `The local LLM is available. You can delegate tasks using chat, custom_prompt, code_task, or embed.`;
+ text += `The local LLM is available. You can delegate tasks using chat, custom_prompt, code_task, code_task_files, or embed.`;
  return { content: [{ type: 'text', text }] };
  }
  case 'list_models': {
@@ -896,33 +1232,35 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
  }
  case 'embed': {
  const { input, model: embedModel } = args;
- const embedBody = { input };
- if (embedModel) {
- embedBody.model = embedModel;
- }
- const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/embeddings`, { method: 'POST', headers: apiHeaders(), body: JSON.stringify(embedBody) }, INFERENCE_CONNECT_TIMEOUT_MS);
- if (!res.ok) {
- const errText = await res.text().catch(() => '');
- throw new Error(`Embeddings API error ${res.status}: ${errText}`);
- }
- const data = (await res.json());
- const embedding = data.data[0]?.embedding;
- if (!embedding)
- throw new Error('No embedding returned');
- const usageInfo = data.usage
- ? `${data.usage.prompt_tokens} tokens embedded`
- : '';
- return {
- content: [{
- type: 'text',
- text: JSON.stringify({
- model: data.model,
- dimensions: embedding.length,
- embedding,
- usage: usageInfo,
- }),
- }],
- };
+ return await withInferenceLock(async () => {
+ const embedBody = { input };
+ if (embedModel) {
+ embedBody.model = embedModel;
+ }
+ const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/embeddings`, { method: 'POST', headers: apiHeaders(), body: JSON.stringify(embedBody) }, INFERENCE_CONNECT_TIMEOUT_MS);
+ if (!res.ok) {
+ const errText = await res.text().catch(() => '');
+ throw new Error(`Embeddings API error ${res.status}: ${errText}`);
+ }
+ const data = (await res.json());
+ const embedding = data.data[0]?.embedding;
+ if (!embedding)
+ throw new Error('No embedding returned');
+ const usageInfo = data.usage
+ ? `${data.usage.prompt_tokens} tokens embedded`
+ : '';
+ return {
+ content: [{
+ type: 'text',
+ text: JSON.stringify({
+ model: data.model,
+ dimensions: embedding.length,
+ embedding,
+ usage: usageInfo,
+ }),
+ }],
+ };
+ });
  }
  default:
  throw new Error(`Unknown tool: ${name}`);