@houtini/lm 2.7.0 → 2.9.0
This diff shows the changes between publicly released versions of the package, as they appear in the supported public registries. It is provided for informational purposes only.
- package/README.md +56 -11
- package/dist/index.js +382 -44
- package/dist/index.js.map +1 -1
- package/dist/model-cache.d.ts +10 -2
- package/dist/model-cache.js +117 -37
- package/dist/model-cache.js.map +1 -1
- package/package.json +2 -2
- package/server.json +2 -2
package/dist/index.js
CHANGED
@@ -7,16 +7,18 @@
  */
 import { Server } from '@modelcontextprotocol/sdk/server/index.js';
 import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
-import { CallToolRequestSchema, ListToolsRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
-import { profileModelsAtStartup, getCachedProfile, toModelProfile as cachedToProfile, getHFEnrichmentLine, getPromptHints, } from './model-cache.js';
+import { CallToolRequestSchema, ListToolsRequestSchema, ListResourcesRequestSchema, ReadResourceRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
+import { profileModelsAtStartup, getCachedProfile, toModelProfile as cachedToProfile, getHFEnrichmentLine, getPromptHints, getThinkingSupport, } from './model-cache.js';
+import { readFile } from 'node:fs/promises';
+import { isAbsolute, basename } from 'node:path';
 const LM_BASE_URL = process.env.LM_STUDIO_URL || 'http://localhost:1234';
 const LM_MODEL = process.env.LM_STUDIO_MODEL || '';
 const LM_PASSWORD = process.env.LM_STUDIO_PASSWORD || '';
-const DEFAULT_MAX_TOKENS =
+const DEFAULT_MAX_TOKENS = 16384; // fallback when model context is unknown — overridden by dynamic calculation below
 const DEFAULT_TEMPERATURE = 0.3;
 const CONNECT_TIMEOUT_MS = 5000;
 const INFERENCE_CONNECT_TIMEOUT_MS = 30_000; // generous connect timeout for inference
-const SOFT_TIMEOUT_MS =
+const SOFT_TIMEOUT_MS = 300_000; // 5 min — progress notifications reset MCP client timeout, so this is a safety net not the primary limit
 const READ_CHUNK_TIMEOUT_MS = 30_000; // max wait for a single SSE chunk
 const FALLBACK_CONTEXT_LENGTH = parseInt(process.env.LM_CONTEXT_WINDOW || '100000', 10);
 // ── Session-level token accounting ───────────────────────────────────
@@ -36,13 +38,17 @@ function recordUsage(resp) {
 session.promptTokens += resp.usage.prompt_tokens;
 session.completionTokens += resp.usage.completion_tokens;
 }
+else if (resp.content.length > 0) {
+// Estimate when usage is missing (truncated responses)
+session.completionTokens += Math.ceil(resp.content.length / 4);
+}
 // Track per-model perf stats
 if (resp.model) {
 const existing = session.modelStats.get(resp.model) || { calls: 0, perfCalls: 0, totalTtftMs: 0, totalTokPerSec: 0 };
 existing.calls++;
 if (resp.ttftMs)
 existing.totalTtftMs += resp.ttftMs;
-const tokPerSec = resp.usage && resp.generationMs >
+const tokPerSec = resp.usage && resp.generationMs > 50
 ? (resp.usage.completion_tokens / (resp.generationMs / 1000))
 : 0;
 if (tokPerSec > 0) {
@@ -64,6 +70,18 @@ function apiHeaders() {
 h['Authorization'] = `Bearer ${LM_PASSWORD}`;
 return h;
 }
+// ── Request semaphore ────────────────────────────────────────────────
+// Most local LLM servers run a single model and queue parallel requests,
+// which stacks timeouts and wastes the 55s budget. This semaphore ensures
+// only one inference call runs at a time; others wait in line.
+let inferenceLock = Promise.resolve();
+function withInferenceLock(fn) {
+let release;
+const next = new Promise((resolve) => { release = resolve; });
+const wait = inferenceLock;
+inferenceLock = next;
+return wait.then(fn).finally(() => release());
+}
 const MODEL_PROFILES = [
 {
 pattern: /nemotron|nemotron_h_moe/i,
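The `withInferenceLock` helper added above is a plain promise-chain mutex. A minimal standalone sketch of the same pattern (names and the demo are illustrative, not taken from the package):

```js
// Promise-chain mutex: each caller waits on the previous caller's promise,
// then installs its own, so async jobs run strictly one at a time.
let lock = Promise.resolve();

function withLock(fn) {
    let release;
    const next = new Promise((resolve) => { release = resolve; });
    const wait = lock;   // whatever is already queued ahead of us
    lock = next;         // the next caller will wait on us
    return wait.then(fn).finally(() => release());
}

// Usage: both calls are issued at once, but they execute in submission order.
async function demo() {
    const slow = withLock(() => new Promise((r) => setTimeout(() => r('slow done'), 200)));
    const fast = withLock(async () => 'fast done');
    console.log(await slow, '|', await fast); // "slow done | fast done"
}
demo();
```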
@@ -320,10 +338,33 @@ async function timedRead(reader, timeoutMs) {
 * This means large code reviews return partial results instead of nothing.
 */
 async function chatCompletionStreaming(messages, options = {}) {
+return withInferenceLock(() => chatCompletionStreamingInner(messages, options));
+}
+/** Get the first loaded model's info for context-aware defaults. */
+async function getActiveModel() {
+try {
+const models = await listModelsRaw();
+return models.find((m) => m.state === 'loaded') ?? models[0] ?? null;
+}
+catch {
+return null;
+}
+}
+async function chatCompletionStreamingInner(messages, options = {}) {
+// Derive max_tokens from the model's actual context window when not explicitly set.
+// Uses 25% of context as a generous output budget (e.g. 262K context → 65K output).
+let effectiveMaxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS;
+if (!options.maxTokens) {
+const activeModel = await getActiveModel();
+if (activeModel) {
+const ctx = getContextLength(activeModel);
+effectiveMaxTokens = Math.floor(ctx * 0.25);
+}
+}
 const body = {
 messages,
 temperature: options.temperature ?? DEFAULT_TEMPERATURE,
-max_tokens:
+max_tokens: effectiveMaxTokens,
 stream: true,
 stream_options: { include_usage: true },
 };
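The derivation above is simple arithmetic over the reported context window; a few worked values (context sizes chosen for illustration):

```js
// floor(contextLength * 0.25) → default output budget when maxTokens is omitted.
for (const ctx of [262144, 131072, 32768]) {
    console.log(`${ctx} context → ${Math.floor(ctx * 0.25)} max_tokens`);
}
// 262144 → 65536, 131072 → 32768, 32768 → 8192
```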
@@ -333,6 +374,26 @@ async function chatCompletionStreaming(messages, options = {}) {
 if (options.responseFormat) {
 body.response_format = options.responseFormat;
 }
+// Handle thinking/reasoning models.
+// Some models (Gemma 4, Qwen3, DeepSeek) have extended thinking that consumes
+// part of the max_tokens budget for invisible reasoning before producing content.
+// Strategy: try to disable thinking via enable_thinking=false, BUT also inflate
+// max_tokens as a safety net since some models (Gemma 4) hardcode thinking=true
+// in their Jinja template and ignore the API parameter.
+const modelId = (options.model || LM_MODEL || '').toString();
+if (modelId) {
+const thinking = await getThinkingSupport(modelId);
+if (thinking?.supportsThinkingToggle) {
+body.enable_thinking = false;
+// Safety net: inflate max_tokens to account for reasoning budget.
+// Gemma 4 ignores enable_thinking=false (hardcoded in template),
+// so the model will think regardless. Without inflation, reasoning
+// consumes all tokens and content comes back empty.
+const requestedTokens = (options.maxTokens ?? DEFAULT_MAX_TOKENS);
+body.max_tokens = Math.max(requestedTokens * 4, requestedTokens + 2000);
+process.stderr.write(`[houtini-lm] Thinking model ${modelId}: enable_thinking=false, max_tokens inflated ${requestedTokens} → ${body.max_tokens}\n`);
+}
+}
 const startTime = Date.now();
 const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/chat/completions`, { method: 'POST', headers: apiHeaders(), body: JSON.stringify(body) }, INFERENCE_CONNECT_TIMEOUT_MS);
 if (!res.ok) {
@@ -345,6 +406,7 @@ async function chatCompletionStreaming(messages, options = {}) {
 const reader = res.body.getReader();
 const decoder = new TextDecoder();
 let content = '';
+let chunkCount = 0;
 let model = '';
 let usage;
 let finishReason = '';
@@ -386,10 +448,41 @@ async function chatCompletionStreaming(messages, options = {}) {
 if (json.model)
 model = json.model;
 const delta = json.choices?.[0]?.delta;
+// Track reasoning/thinking tokens — models like Gemma 4, Qwen3, DeepSeek
+// emit reasoning_content during their thinking phase before producing
+// visible content. We must send progress notifications during this phase
+// to prevent MCP client timeout.
+if (delta?.reasoning_content) {
+chunkCount++;
+if (options.progressToken !== undefined) {
+server.notification({
+method: 'notifications/progress',
+params: {
+progressToken: options.progressToken,
+progress: chunkCount,
+message: `Thinking... (${chunkCount} chunks)`,
+},
+}).catch(() => { });
+}
+}
 if (delta?.content) {
 if (ttftMs === undefined)
 ttftMs = Date.now() - startTime;
 content += delta.content;
+chunkCount++;
+// Send progress notification to reset MCP client timeout.
+// Each notification resets the 60s clock, giving slow models
+// unlimited time as long as they're actively generating.
+if (options.progressToken !== undefined) {
+server.notification({
+method: 'notifications/progress',
+params: {
+progressToken: options.progressToken,
+progress: chunkCount,
+message: `Streaming... ${content.length} chars`,
+},
+}).catch(() => { });
+}
 }
 const reason = json.choices?.[0]?.finish_reason;
 if (reason)
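These notifications only flow when the caller opted in with a progress token. On the wire, the token travels in the request's `_meta` and the server keys its progress messages to it; a sketch of both shapes (tool arguments and values are illustrative, not from the package):

```js
// Client-side tools/call request that opts in to progress updates.
const callRequest = {
    jsonrpc: '2.0',
    id: 42,
    method: 'tools/call',
    params: {
        name: 'code_task',
        arguments: { code: 'function add(a, b) { return a + b; }', task: 'Find bugs' },
        _meta: { progressToken: 'job-42' }, // any string or number
    },
};

// The matching notification the server emits per streamed chunk,
// which is what resets the MCP client's request timeout.
const progressNotification = {
    jsonrpc: '2.0',
    method: 'notifications/progress',
    params: { progressToken: 'job-42', progress: 17, message: 'Streaming... 1234 chars' },
};
```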
@@ -403,6 +496,33 @@ async function chatCompletionStreaming(messages, options = {}) {
 }
 }
 }
+// Flush remaining buffer — the usage chunk often arrives in the final SSE
+// message and may not have a trailing newline, leaving it stranded in buffer.
+if (buffer.trim()) {
+const trimmed = buffer.trim();
+if (trimmed.startsWith('data: ') && trimmed !== 'data: [DONE]') {
+try {
+const json = JSON.parse(trimmed.slice(6));
+if (json.model)
+model = json.model;
+const delta = json.choices?.[0]?.delta;
+if (delta?.content) {
+if (ttftMs === undefined)
+ttftMs = Date.now() - startTime;
+content += delta.content;
+}
+const reason = json.choices?.[0]?.finish_reason;
+if (reason)
+finishReason = reason;
+if (json.usage)
+usage = json.usage;
+}
+catch (e) {
+// Incomplete JSON in final buffer — log for diagnostics
+process.stderr.write(`[houtini-lm] Unflushed buffer parse failed (${buffer.length} bytes): ${e}\n`);
+}
+}
+}
 }
 finally {
 // Release the reader — don't await cancel() as it can hang
@@ -416,7 +536,17 @@ async function chatCompletionStreaming(messages, options = {}) {
 let cleanContent = content.replace(/<think>[\s\S]*?<\/think>\s*/g, ''); // closed blocks
 cleanContent = cleanContent.replace(/^<think>\s*/, ''); // orphaned opening tag
 cleanContent = cleanContent.trim();
-
+// Safety net on top of the thinking-model max_tokens inflation: some MLX/GGUF
+// quants still exhaust their budget inside an unclosed <think> block despite
+// `enable_thinking:false` and the 4× inflation. If stripping leaves nothing but
+// raw output exists, return the raw reasoning so the caller sees *something*
+// rather than an empty body + lone footer (issue #6).
+let thinkStripFallback = false;
+if (!cleanContent && content.trim()) {
+thinkStripFallback = true;
+cleanContent = content.trim();
+}
+return { content: cleanContent, rawContent: content, model, usage, finishReason, truncated, ttftMs, generationMs, thinkStripFallback };
 }
 /**
 * Fetch models from LM Studio's native v0 API first (richer metadata),
@@ -507,6 +637,39 @@ async function routeToModel(taskType) {
 }
 return result;
 }
+function assessQuality(resp, rawContent) {
+const hadThinkBlocks = /<think>/.test(rawContent);
+const estimated = !resp.usage && resp.content.length > 0;
+const tokPerSec = resp.usage && resp.generationMs > 50
+? resp.usage.completion_tokens / (resp.generationMs / 1000)
+: null;
+return {
+truncated: resp.truncated,
+finishReason: resp.finishReason || 'unknown',
+thinkBlocksStripped: hadThinkBlocks,
+thinkStripFallback: resp.thinkStripFallback ?? false,
+estimatedTokens: estimated,
+contentLength: resp.content.length,
+generationMs: resp.generationMs,
+tokPerSec,
+};
+}
+function formatQualityLine(quality) {
+const flags = [];
+if (quality.truncated)
+flags.push('TRUNCATED');
+if (quality.thinkStripFallback)
+flags.push('think-strip-empty (showing raw reasoning — model ignored enable_thinking:false)');
+else if (quality.thinkBlocksStripped)
+flags.push('think-blocks-stripped');
+if (quality.estimatedTokens)
+flags.push('tokens-estimated');
+if (quality.finishReason === 'length')
+flags.push('hit-max-tokens');
+if (flags.length === 0)
+return '';
+return `Quality: ${flags.join(', ')}`;
+}
 /**
 * Format a footer line for streaming results showing model, usage, and truncation status.
 */
@@ -516,13 +679,19 @@ function formatFooter(resp, extra) {
 const parts = [];
 if (resp.model)
 parts.push(`Model: ${resp.model}`);
-if (resp.usage)
+if (resp.usage) {
 parts.push(`${resp.usage.prompt_tokens}→${resp.usage.completion_tokens} tokens`);
+}
+else if (resp.content.length > 0) {
+// Estimate when usage is missing (truncated responses where final SSE chunk was lost)
+const estTokens = Math.ceil(resp.content.length / 4);
+parts.push(`~${estTokens} tokens (estimated)`);
+}
 // Perf stats — computed from streaming, no proprietary API needed
 const perfParts = [];
 if (resp.ttftMs !== undefined)
 perfParts.push(`TTFT: ${resp.ttftMs}ms`);
-if (resp.usage && resp.generationMs >
+if (resp.usage && resp.generationMs > 50) {
 const tokPerSec = resp.usage.completion_tokens / (resp.generationMs / 1000);
 perfParts.push(`${tokPerSec.toFixed(1)} tok/s`);
 }
@@ -532,6 +701,11 @@ function formatFooter(resp, extra) {
 parts.push(perfParts.join(', '));
 if (extra)
 parts.push(extra);
+// Quality signals — structured metadata for orchestrator trust decisions
+const quality = assessQuality(resp, resp.rawContent);
+const qualityLine = formatQualityLine(quality);
+if (qualityLine)
+parts.push(qualityLine);
 if (resp.truncated)
 parts.push('⚠ TRUNCATED (soft timeout — partial result)');
 const sessionLine = sessionSummary();
@@ -683,6 +857,44 @@ const TOOLS = [
 required: ['code', 'task'],
 },
 },
+{
+name: 'code_task_files',
+description: 'Like code_task, but the local LLM reads the files directly from disk — the contents never pass through the MCP client\'s context window.\n\n' +
+'USE THIS instead of code_task when you want the LLM to review multiple files or a single large file, without copying source into the chat.\n\n' +
+'HOW IT WORKS:\n' +
+'• Provide absolute paths to the files you want analysed.\n' +
+'• The server reads each file (Promise.allSettled — one unreadable file does not sink the call).\n' +
+'• Files are concatenated with `=== <filename> ===` headers, then sent to the same code-review pipeline as code_task.\n' +
+'• Read failures are surfaced inline (with the reason) so the LLM can still reason about what it did receive.\n\n' +
+'WHEN TO USE:\n' +
+'• Reviewing multiple related files (module + its tests, client + server pair)\n' +
+'• Auditing a single large file too big to paste comfortably\n' +
+'• Any code_task where saving MCP client tokens matters\n\n' +
+'QA: Same rules as code_task — verify the output before acting on it.',
+inputSchema: {
+type: 'object',
+properties: {
+paths: {
+type: 'array',
+items: { type: 'string' },
+description: 'Absolute file paths to analyse. Relative paths are rejected — always pass absolute.',
+},
+task: {
+type: 'string',
+description: 'What to do: "Find bugs", "Explain this module", "Suggest a cleaner API", etc.',
+},
+language: {
+type: 'string',
+description: 'Optional language hint: "typescript", "python", etc. Shapes the system prompt.',
+},
+max_tokens: {
+type: 'number',
+description: 'Optional output budget override. Defaults to 25% of the loaded model\'s context window.',
+},
+},
+required: ['paths', 'task'],
+},
+},
 {
 name: 'discover',
 description: 'Check whether the local LLM is online and what model is loaded. Returns model name, context window size, ' +
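For orientation, a call to the new `code_task_files` tool might carry arguments shaped like this (the file paths and task text are made up for the example):

```js
// Hypothetical tools/call arguments for code_task_files.
// Only the paths travel over MCP; the server reads the file contents itself.
const args = {
    paths: [
        '/home/user/project/src/server.js',      // illustrative absolute paths
        '/home/user/project/src/server.test.js',
    ],
    task: 'Find bugs and suggest concrete fixes',
    language: 'javascript',
    // max_tokens omitted → defaults to 25% of the loaded model's context window
};
```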
@@ -721,10 +933,55 @@ const TOOLS = [
 },
 ];
 // ── MCP Server ───────────────────────────────────────────────────────
-const server = new Server({ name: 'houtini-lm', version: '2.
+const server = new Server({ name: 'houtini-lm', version: '2.9.0' }, { capabilities: { tools: {}, resources: {} } });
+// ── MCP Resources ─────────────────────────────────────────────────────
+// Exposes session performance metrics as a readable resource so Claude can
+// proactively check offload efficiency and make smarter delegation decisions.
+server.setRequestHandler(ListResourcesRequestSchema, async () => ({
+resources: [
+{
+uri: 'houtini://metrics/session',
+name: 'Session Offload Metrics',
+description: 'Cumulative token offload stats, per-model performance, and quality signals for the current session.',
+mimeType: 'application/json',
+},
+],
+}));
+server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
+const { uri } = request.params;
+if (uri === 'houtini://metrics/session') {
+const modelStats = {};
+for (const [modelId, stats] of session.modelStats) {
+modelStats[modelId] = {
+calls: stats.calls,
+avgTtftMs: stats.calls > 0 ? Math.round(stats.totalTtftMs / stats.calls) : 0,
+avgTokPerSec: stats.perfCalls > 0 ? parseFloat((stats.totalTokPerSec / stats.perfCalls).toFixed(1)) : null,
+};
+}
+const metrics = {
+session: {
+totalCalls: session.calls,
+promptTokens: session.promptTokens,
+completionTokens: session.completionTokens,
+totalTokensOffloaded: session.promptTokens + session.completionTokens,
+},
+perModel: modelStats,
+endpoint: LM_BASE_URL,
+};
+return {
+contents: [{
+uri,
+mimeType: 'application/json',
+text: JSON.stringify(metrics, null, 2),
+}],
+};
+}
+throw new Error(`Unknown resource: ${uri}`);
+});
 server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: TOOLS }));
 server.setRequestHandler(CallToolRequestSchema, async (request) => {
 const { name, arguments: args } = request.params;
+const progressToken = request.params._meta?.progressToken;
 try {
 switch (name) {
 case 'chat': {
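A client can poll the new metrics resource between tool calls. A sketch of the read request and of the JSON shape the handler above returns (the numbers and model name are placeholders, not real measurements):

```js
// resources/read request for the session metrics resource.
const readRequest = {
    jsonrpc: '2.0',
    id: 7,
    method: 'resources/read',
    params: { uri: 'houtini://metrics/session' },
};

// Shape of the single application/json content item the handler returns.
const exampleMetrics = {
    session: { totalCalls: 3, promptTokens: 5200, completionTokens: 1800, totalTokensOffloaded: 7000 },
    perModel: { 'qwen2.5-coder-7b': { calls: 3, avgTtftMs: 420, avgTokPerSec: 38.5 } },
    endpoint: 'http://localhost:1234',
};
```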
@@ -746,6 +1003,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
 maxTokens: max_tokens,
 model: route.modelId,
 responseFormat,
+progressToken,
 });
 const footer = formatFooter(resp);
 return { content: [{ type: 'text', text: resp.content + footer }] };
@@ -759,10 +1017,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
 : (route.hints.outputConstraint || undefined);
 if (systemContent)
 messages.push({ role: 'system', content: systemContent });
-
-
-
-
+// Multi-turn format prevents context bleed in smaller models.
+// Context goes in a separate user→assistant exchange so the model
+// "acknowledges" it before receiving the actual instruction.
+if (context) {
+messages.push({ role: 'user', content: `Here is the context for analysis:\n\n${context}` });
+messages.push({ role: 'assistant', content: 'Understood. I have read the full context. What would you like me to do with it?' });
+}
+messages.push({ role: 'user', content: instruction });
 const responseFormat = json_schema
 ? { type: 'json_schema', json_schema: { name: json_schema.name, strict: json_schema.strict ?? true, schema: json_schema.schema } }
 : undefined;
@@ -771,6 +1033,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
 maxTokens: max_tokens,
 model: route.modelId,
 responseFormat,
+progressToken,
 });
 const footer = formatFooter(resp);
 return {
@@ -784,25 +1047,98 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
 const outputConstraint = route.hints.outputConstraint
 ? ` ${route.hints.outputConstraint}`
 : '';
+// Task goes in system message so smaller models don't lose it once
+// the code block fills the attention window. Code is sole user content.
 const codeMessages = [
 {
 role: 'system',
-content: `Expert ${lang} developer.
+content: `Expert ${lang} developer. Your task: ${task}\n\nBe specific — reference line numbers, function names, and concrete fixes. Output your analysis as a markdown list.${outputConstraint}`,
 },
 {
 role: 'user',
-content:
+content: `\`\`\`${lang}\n${code}\n\`\`\``,
 },
 ];
 const codeResp = await chatCompletionStreaming(codeMessages, {
 temperature: route.hints.codeTemp,
 maxTokens: codeMaxTokens ?? DEFAULT_MAX_TOKENS,
 model: route.modelId,
+progressToken,
 });
 const codeFooter = formatFooter(codeResp, lang);
 const suggestionLine = route.suggestion ? `\n${route.suggestion}` : '';
 return { content: [{ type: 'text', text: codeResp.content + codeFooter + suggestionLine }] };
 }
+case 'code_task_files': {
+const { paths, task, language, max_tokens: codeMaxTokens } = args;
+if (!Array.isArray(paths) || paths.length === 0) {
+return {
+content: [{ type: 'text', text: 'Error: paths must be a non-empty array of absolute file paths.' }],
+isError: true,
+};
+}
+// Reject relative paths early — silent resolution against cwd is surprising.
+const relative = paths.filter((p) => typeof p !== 'string' || !isAbsolute(p));
+if (relative.length > 0) {
+return {
+content: [{ type: 'text', text: `Error: all paths must be absolute. Relative paths: ${JSON.stringify(relative)}` }],
+isError: true,
+};
+}
+// Read all files in parallel. One unreadable file doesn't sink the call —
+// failures become inline error sections so the model can still reason about
+// the rest of the bundle.
+const reads = await Promise.allSettled(paths.map(async (p) => ({ path: p, content: await readFile(p, 'utf8') })));
+const sections = [];
+let successCount = 0;
+reads.forEach((r, i) => {
+const p = paths[i];
+if (r.status === 'fulfilled') {
+successCount++;
+sections.push(`=== ${basename(p)} (${p}) ===\n${r.value.content}`);
+}
+else {
+const reason = r.reason instanceof Error ? r.reason.message : String(r.reason);
+sections.push(`=== ${basename(p)} (${p}) — READ FAILED ===\n[Could not read: ${reason}]`);
+}
+});
+if (successCount === 0) {
+return {
+content: [{ type: 'text', text: `Error: none of the ${paths.length} file(s) could be read. Check the paths and permissions.\n\n${sections.join('\n\n')}` }],
+isError: true,
+};
+}
+const lang = language || 'unknown';
+const route = await routeToModel('code');
+const outputConstraint = route.hints.outputConstraint
+? ` ${route.hints.outputConstraint}`
+: '';
+const combined = sections.join('\n\n');
+const codeMessages = [
+{
+role: 'system',
+content: `Expert ${lang} developer. Your task: ${task}\n\nThe user has provided ${paths.length} file(s), concatenated below with \`=== filename ===\` headers. Reference files by name in your output. Be specific — line numbers, function names, concrete fixes. Output your analysis as a markdown list.${outputConstraint}`,
+},
+{
+role: 'user',
+content: `\`\`\`${lang}\n${combined}\n\`\`\``,
+},
+];
+// Pass codeMaxTokens raw (not `?? DEFAULT_MAX_TOKENS`) so the 25%-of-context
+// auto-derivation in chatCompletionStreamingInner fires when the caller omits it.
+const codeResp = await chatCompletionStreaming(codeMessages, {
+temperature: route.hints.codeTemp,
+maxTokens: codeMaxTokens,
+model: route.modelId,
+progressToken,
+});
+const readSummary = successCount === paths.length
+? `${paths.length} file(s) read`
+: `${successCount}/${paths.length} file(s) read`;
+const codeFooter = formatFooter(codeResp, `${lang} · ${readSummary}`);
+const suggestionLine = route.suggestion ? `\n${route.suggestion}` : '';
+return { content: [{ type: 'text', text: codeResp.content + codeFooter + suggestionLine }] };
+}
 case 'discover': {
 const start = Date.now();
 let models;
@@ -870,7 +1206,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
 }
 }
 text += `${sessionStats}\n\n`;
-text += `The local LLM is available. You can delegate tasks using chat, custom_prompt, code_task, or embed.`;
+text += `The local LLM is available. You can delegate tasks using chat, custom_prompt, code_task, code_task_files, or embed.`;
 return { content: [{ type: 'text', text }] };
 }
 case 'list_models': {
@@ -896,33 +1232,35 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
 }
 case 'embed': {
 const { input, model: embedModel } = args;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+return await withInferenceLock(async () => {
+const embedBody = { input };
+if (embedModel) {
+embedBody.model = embedModel;
+}
+const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/embeddings`, { method: 'POST', headers: apiHeaders(), body: JSON.stringify(embedBody) }, INFERENCE_CONNECT_TIMEOUT_MS);
+if (!res.ok) {
+const errText = await res.text().catch(() => '');
+throw new Error(`Embeddings API error ${res.status}: ${errText}`);
+}
+const data = (await res.json());
+const embedding = data.data[0]?.embedding;
+if (!embedding)
+throw new Error('No embedding returned');
+const usageInfo = data.usage
+? `${data.usage.prompt_tokens} tokens embedded`
+: '';
+return {
+content: [{
+type: 'text',
+text: JSON.stringify({
+model: data.model,
+dimensions: embedding.length,
+embedding,
+usage: usageInfo,
+}),
+}],
+};
+});
 }
 default:
 throw new Error(`Unknown tool: ${name}`);