@houtini/lm 2.4.1 → 2.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +206 -30
- package/dist/index.js +687 -57
- package/dist/index.js.map +1 -1
- package/dist/model-cache.d.ts +95 -0
- package/dist/model-cache.js +557 -0
- package/dist/model-cache.js.map +1 -0
- package/package.json +5 -3
- package/server.json +44 -44
package/dist/index.js
CHANGED
|
@@ -7,7 +7,8 @@
|
|
|
7
7
|
*/
|
|
8
8
|
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
|
|
9
9
|
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
10
|
-
import { CallToolRequestSchema, ListToolsRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
|
|
10
|
+
import { CallToolRequestSchema, ListToolsRequestSchema, ListResourcesRequestSchema, ReadResourceRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
|
|
11
|
+
import { profileModelsAtStartup, getCachedProfile, toModelProfile as cachedToProfile, getHFEnrichmentLine, getPromptHints, getThinkingSupport, } from './model-cache.js';
|
|
11
12
|
const LM_BASE_URL = process.env.LM_STUDIO_URL || 'http://localhost:1234';
|
|
12
13
|
const LM_MODEL = process.env.LM_STUDIO_MODEL || '';
|
|
13
14
|
const LM_PASSWORD = process.env.LM_STUDIO_PASSWORD || '';
|
|
@@ -26,12 +27,33 @@ const session = {
|
|
|
26
27
|
calls: 0,
|
|
27
28
|
promptTokens: 0,
|
|
28
29
|
completionTokens: 0,
|
|
30
|
+
/** Per-model performance tracking for routing insights */
|
|
31
|
+
modelStats: new Map(),
|
|
29
32
|
};
|
|
30
|
-
function recordUsage(
|
|
33
|
+
/**
 * Fold one completed response into the session-wide counters and the
 * per-model performance table.
 *
 * @param {object} resp - Streaming result; reads `usage`, `content`, `model`,
 *   `ttftMs` and `generationMs`.
 */
function recordUsage(resp) {
    session.calls += 1;
    const { usage, model } = resp;
    if (usage) {
        session.promptTokens += usage.prompt_tokens;
        session.completionTokens += usage.completion_tokens;
    }
    else if (resp.content.length > 0) {
        // No usage block was reported: approximate at one token per four characters.
        session.completionTokens += Math.ceil(resp.content.length / 4);
    }
    // Per-model stats are keyed by model id; nothing to record without one.
    if (!model) {
        return;
    }
    let stats = session.modelStats.get(model);
    if (!stats) {
        stats = { calls: 0, perfCalls: 0, totalTtftMs: 0, totalTokPerSec: 0 };
    }
    stats.calls += 1;
    if (resp.ttftMs) {
        stats.totalTtftMs += resp.ttftMs;
    }
    // Throughput is only counted when usage is known and generation took more than 50ms.
    let tokPerSec = 0;
    if (usage && resp.generationMs > 50) {
        tokPerSec = usage.completion_tokens / (resp.generationMs / 1000);
    }
    if (tokPerSec > 0) {
        stats.perfCalls += 1;
        stats.totalTokPerSec += tokPerSec;
    }
    session.modelStats.set(model, stats);
}
|
|
37
59
|
function sessionSummary() {
|
|
@@ -46,6 +68,236 @@ function apiHeaders() {
|
|
|
46
68
|
h['Authorization'] = `Bearer ${LM_PASSWORD}`;
|
|
47
69
|
return h;
|
|
48
70
|
}
|
|
71
|
+
// ── Request semaphore ────────────────────────────────────────────────
|
|
72
|
+
// Most local LLM servers run a single model and queue parallel requests,
|
|
73
|
+
// which stacks timeouts and wastes the 55s budget. This semaphore ensures
|
|
74
|
+
// only one inference call runs at a time; others wait in line.
|
|
75
|
+
// Tail of the queue: resolves once the most recently queued task has settled.
let inferenceLock = Promise.resolve();
/**
 * Run `fn` only after every previously queued task has settled, so that at
 * most one task executes at a time.
 *
 * @param {Function} fn - Task to run; may return a value or a promise.
 * @returns {Promise<*>} Settles with the outcome of `fn`.
 */
function withInferenceLock(fn) {
    const previous = inferenceLock;
    let unlock;
    // Publish a fresh gate that later callers will queue behind.
    inferenceLock = new Promise((resolve) => {
        unlock = resolve;
    });
    // Open the gate whether `fn` fulfils or rejects so the queue never stalls.
    return previous.then(fn).finally(() => unlock());
}
|
|
83
|
+
// Curated capability profiles. Each record is written flat and then reshaped
// into `{ pattern, profile }`; order matters to consumers that take the first
// matching pattern.
const MODEL_PROFILES = [
    {
        pattern: /nemotron|nemotron_h_moe/i,
        family: 'NVIDIA Nemotron',
        description: "NVIDIA's compact reasoning model optimised for accurate, structured responses. Strong at step-by-step logic and instruction following.",
        strengths: ['logical reasoning', 'math', 'step-by-step problem solving', 'code review', 'structured output'],
        weaknesses: ['creative writing', 'constrained generation', 'factual knowledge on niche topics'],
        bestFor: ['analysis tasks', 'code bug-finding', 'math/science questions', 'data transformation'],
    },
    {
        pattern: /granite|granitehybrid/i,
        family: 'IBM Granite',
        description: "IBM's enterprise-focused model family. Compact and efficient, designed for business and code tasks with strong instruction following.",
        strengths: ['code generation', 'instruction following', 'enterprise tasks', 'efficiency'],
        weaknesses: ['creative tasks', 'long-form generation'],
        bestFor: ['boilerplate generation', 'code explanation', 'structured Q&A'],
    },
    {
        pattern: /qwen3-coder|qwen3.*coder/i,
        family: 'Qwen3 Coder',
        description: "Alibaba's code-specialised model with agentic capabilities. Excellent at code generation, review, and multi-step coding tasks.",
        strengths: ['code generation', 'code review', 'debugging', 'test writing', 'refactoring', 'multi-step reasoning'],
        weaknesses: ['non-code creative tasks'],
        bestFor: ['code generation', 'code review', 'test stubs', 'type definitions', 'refactoring'],
    },
    {
        pattern: /qwen3-vl|qwen.*vl/i,
        family: 'Qwen3 Vision-Language',
        description: "Alibaba's multimodal model handling both text and image inputs. Can analyse screenshots, diagrams, and visual content.",
        strengths: ['image understanding', 'visual Q&A', 'diagram analysis', 'OCR'],
        weaknesses: ['pure text tasks (use a text-only model instead)'],
        bestFor: ['screenshot analysis', 'UI review', 'diagram interpretation'],
    },
    {
        pattern: /qwen3(?!.*coder)(?!.*vl)/i,
        family: 'Qwen3',
        description: "Alibaba's general-purpose model with strong multilingual and reasoning capabilities. Good all-rounder.",
        strengths: ['general reasoning', 'multilingual', 'code', 'instruction following'],
        weaknesses: ['specialised code tasks (use Qwen3 Coder instead)'],
        bestFor: ['general Q&A', 'translation', 'summarisation', 'brainstorming'],
    },
    {
        pattern: /llama[- ]?3/i,
        family: 'Meta LLaMA 3',
        description: "Meta's open-weight general-purpose model. Strong baseline across tasks with large community fine-tune ecosystem.",
        strengths: ['general reasoning', 'code', 'instruction following', 'broad knowledge'],
        weaknesses: ['specialised tasks where fine-tuned models excel'],
        bestFor: ['general delegation', 'drafting', 'code review', 'Q&A'],
    },
    {
        pattern: /minimax[- ]?m2/i,
        family: 'MiniMax M2',
        description: "MiniMax's large MoE model with strong long-context and reasoning capabilities.",
        strengths: ['long context', 'reasoning', 'creative writing', 'multilingual'],
        weaknesses: ['may be slower due to model size'],
        bestFor: ['long document analysis', 'creative tasks', 'complex reasoning'],
    },
    {
        pattern: /kimi[- ]?k2/i,
        family: 'Kimi K2',
        description: "Moonshot AI's large MoE model with strong agentic and tool-use capabilities.",
        strengths: ['agentic tasks', 'tool use', 'code', 'reasoning', 'long context'],
        weaknesses: ['may be slower due to model size'],
        bestFor: ['complex multi-step tasks', 'code generation', 'reasoning chains'],
    },
    {
        pattern: /gpt-oss/i,
        family: 'OpenAI GPT-OSS',
        description: "OpenAI's open-source model release. General-purpose with strong instruction following.",
        strengths: ['instruction following', 'general reasoning', 'code'],
        weaknesses: ['less tested in open ecosystem than LLaMA/Qwen'],
        bestFor: ['general delegation', 'code tasks', 'Q&A'],
    },
    {
        pattern: /glm[- ]?4/i,
        family: 'GLM-4',
        description: "Zhipu AI's open-weight MoE model. Fast inference with strong general reasoning, multilingual support, and tool-use capabilities. Uses chain-of-thought reasoning internally. MIT licensed.",
        strengths: ['fast inference', 'general reasoning', 'tool use', 'multilingual', 'code', 'instruction following', 'chain-of-thought'],
        weaknesses: ['always emits internal reasoning (stripped automatically)', 'less tested in English-only benchmarks than LLaMA/Qwen'],
        bestFor: ['general delegation', 'fast drafting', 'code tasks', 'structured output', 'Q&A'],
    },
    {
        pattern: /nomic.*embed|embed.*nomic/i,
        family: 'Nomic Embed',
        description: 'Text embedding model for semantic search and similarity. Not a chat model — produces vector embeddings.',
        strengths: ['text embeddings', 'semantic search', 'clustering'],
        weaknesses: ['cannot chat or generate text'],
        bestFor: ['RAG pipelines', 'semantic similarity', 'document search'],
    },
    {
        pattern: /abliterated/i,
        family: 'Abliterated (uncensored)',
        description: 'Community fine-tune with safety guardrails removed. More permissive but may produce lower-quality or unreliable output.',
        strengths: ['fewer refusals', 'unconstrained generation'],
        weaknesses: ['may hallucinate more', 'no safety filtering', 'less tested'],
        bestFor: ['tasks where the base model refuses unnecessarily'],
    },
].map(({ pattern, ...profile }) => ({ pattern, profile }));
|
|
205
|
+
/**
 * Look up the curated profile for a model from MODEL_PROFILES.
 * The model id is tried against every pattern first; only if nothing matches
 * (and `model.arch` is set) is the architecture string tried.
 *
 * @param {object} model - Model record; reads `id` and `arch`.
 * @returns {object|undefined} The first matching profile, or undefined.
 */
function getModelProfile(model) {
    const candidates = [model.id];
    if (model.arch) {
        candidates.push(model.arch);
    }
    for (const text of candidates) {
        const entry = MODEL_PROFILES.find(({ pattern }) => pattern.test(text));
        if (entry) {
            return entry.profile;
        }
    }
    return undefined;
}
|
|
223
|
+
/**
 * Resolve a model profile, preferring the curated static table and falling
 * back to whatever `getCachedProfile` holds for the model id.
 *
 * @param {object} model - Model record; reads `id` (and `arch` via getModelProfile).
 * @returns {Promise<object|undefined>} A profile, or undefined when neither source has one.
 */
async function getModelProfileAsync(model) {
    const curated = getModelProfile(model);
    if (curated) {
        return curated;
    }
    try {
        const row = await getCachedProfile(model.id);
        const generated = row ? cachedToProfile(row) : undefined;
        if (generated) {
            return generated;
        }
    }
    catch {
        // Cache lookup is best-effort: any failure just means "no profile".
    }
    return undefined;
}
|
|
246
|
+
/**
 * Render one model's metadata as a multi-line text block.
 *
 * @param {object} model - Model record from the models listing.
 * @param {boolean} [enrichWithHF=false] - When true, append the line returned
 *   by `getHFEnrichmentLine` if one is available.
 * @returns {Promise<string>} Lines joined with '\n'.
 */
async function formatModelDetail(model, enrichWithHF = false) {
    const ctx = getContextLength(model);
    const maxCtx = getMaxContextLength(model);
    const profile = await getModelProfileAsync(model);
    // Header: filled bullet for a loaded model, hollow bullet otherwise.
    const bullet = model.state === 'loaded' ? '●' : '○';
    const lines = [` ${bullet} ${model.id}`];
    // Metadata: each field is listed only when it has a truthy value.
    const meta = [];
    const addMeta = (label, value) => {
        if (value) {
            meta.push(`${label}: ${value}`);
        }
    };
    addMeta('type', model.type);
    addMeta('arch', model.arch);
    addMeta('quant', model.quantization);
    addMeta('format', model.compatibility_type);
    // Show "loaded (max N)" only when both values are known and differ.
    const loadedCtx = model.loaded_context_length;
    if (loadedCtx && maxCtx && loadedCtx !== maxCtx) {
        meta.push(`context: ${loadedCtx.toLocaleString()} (max ${maxCtx.toLocaleString()})`);
    }
    else if (ctx) {
        meta.push(`context: ${ctx.toLocaleString()}`);
    }
    addMeta('by', model.publisher);
    if (meta.length > 0) {
        lines.push(` ${meta.join(' · ')}`);
    }
    const capabilities = model.capabilities;
    if (capabilities && capabilities.length > 0) {
        lines.push(` Capabilities: ${capabilities.join(', ')}`);
    }
    if (profile) {
        lines.push(` ${profile.family}: ${profile.description}`);
        lines.push(` Best for: ${profile.bestFor.join(', ')}`);
    }
    if (enrichWithHF) {
        try {
            const hfLine = await getHFEnrichmentLine(model.id);
            if (hfLine) {
                lines.push(hfLine);
            }
        }
        catch {
            // Enrichment is optional; a failure must not break the listing.
        }
    }
    return lines.join('\n');
}
|
|
49
301
|
/**
|
|
50
302
|
* Fetch with a connect timeout so Claude doesn't hang when the host is offline.
|
|
51
303
|
*/
|
|
@@ -84,15 +336,32 @@ async function timedRead(reader, timeoutMs) {
|
|
|
84
336
|
* This means large code reviews return partial results instead of nothing.
|
|
85
337
|
*/
|
|
86
338
|
async function chatCompletionStreaming(messages, options = {}) {
    // Queue behind any in-flight inference so only one request runs at a time.
    const runInference = () => chatCompletionStreamingInner(messages, options);
    return withInferenceLock(runInference);
}
|
|
341
|
+
async function chatCompletionStreamingInner(messages, options = {}) {
|
|
87
342
|
const body = {
|
|
88
343
|
messages,
|
|
89
344
|
temperature: options.temperature ?? DEFAULT_TEMPERATURE,
|
|
90
345
|
max_tokens: options.maxTokens ?? DEFAULT_MAX_TOKENS,
|
|
91
346
|
stream: true,
|
|
347
|
+
stream_options: { include_usage: true },
|
|
92
348
|
};
|
|
93
349
|
if (options.model || LM_MODEL) {
|
|
94
350
|
body.model = options.model || LM_MODEL;
|
|
95
351
|
}
|
|
352
|
+
if (options.responseFormat) {
|
|
353
|
+
body.response_format = options.responseFormat;
|
|
354
|
+
}
|
|
355
|
+
// Suppress thinking for models that support it — reclaim generation budget
|
|
356
|
+
// for actual output instead of invisible reasoning. Detected from HF metadata.
|
|
357
|
+
const modelId = (options.model || LM_MODEL || '').toString();
|
|
358
|
+
if (modelId) {
|
|
359
|
+
const thinking = await getThinkingSupport(modelId);
|
|
360
|
+
if (thinking?.supportsThinkingToggle) {
|
|
361
|
+
body.enable_thinking = false;
|
|
362
|
+
process.stderr.write(`[houtini-lm] Thinking disabled for ${modelId} (detected from HF chat_template)\n`);
|
|
363
|
+
}
|
|
364
|
+
}
|
|
96
365
|
const startTime = Date.now();
|
|
97
366
|
const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/chat/completions`, { method: 'POST', headers: apiHeaders(), body: JSON.stringify(body) }, INFERENCE_CONNECT_TIMEOUT_MS);
|
|
98
367
|
if (!res.ok) {
|
|
@@ -105,11 +374,13 @@ async function chatCompletionStreaming(messages, options = {}) {
|
|
|
105
374
|
const reader = res.body.getReader();
|
|
106
375
|
const decoder = new TextDecoder();
|
|
107
376
|
let content = '';
|
|
377
|
+
let chunkCount = 0;
|
|
108
378
|
let model = '';
|
|
109
379
|
let usage;
|
|
110
380
|
let finishReason = '';
|
|
111
381
|
let truncated = false;
|
|
112
382
|
let buffer = '';
|
|
383
|
+
let ttftMs;
|
|
113
384
|
try {
|
|
114
385
|
while (true) {
|
|
115
386
|
// Check soft timeout before each read
|
|
@@ -145,8 +416,25 @@ async function chatCompletionStreaming(messages, options = {}) {
|
|
|
145
416
|
if (json.model)
|
|
146
417
|
model = json.model;
|
|
147
418
|
const delta = json.choices?.[0]?.delta;
|
|
148
|
-
if (delta?.content)
|
|
419
|
+
if (delta?.content) {
|
|
420
|
+
if (ttftMs === undefined)
|
|
421
|
+
ttftMs = Date.now() - startTime;
|
|
149
422
|
content += delta.content;
|
|
423
|
+
chunkCount++;
|
|
424
|
+
// Send progress notification to reset MCP client timeout.
|
|
425
|
+
// Each notification resets the 60s clock, giving slow models
|
|
426
|
+
// unlimited time as long as they're actively generating.
|
|
427
|
+
if (options.progressToken !== undefined) {
|
|
428
|
+
server.notification({
|
|
429
|
+
method: 'notifications/progress',
|
|
430
|
+
params: {
|
|
431
|
+
progressToken: options.progressToken,
|
|
432
|
+
progress: chunkCount,
|
|
433
|
+
message: `Streaming... ${content.length} chars`,
|
|
434
|
+
},
|
|
435
|
+
}).catch(() => { });
|
|
436
|
+
}
|
|
437
|
+
}
|
|
150
438
|
const reason = json.choices?.[0]?.finish_reason;
|
|
151
439
|
if (reason)
|
|
152
440
|
finishReason = reason;
|
|
@@ -159,14 +447,65 @@ async function chatCompletionStreaming(messages, options = {}) {
|
|
|
159
447
|
}
|
|
160
448
|
}
|
|
161
449
|
}
|
|
450
|
+
// Flush remaining buffer — the usage chunk often arrives in the final SSE
|
|
451
|
+
// message and may not have a trailing newline, leaving it stranded in buffer.
|
|
452
|
+
if (buffer.trim()) {
|
|
453
|
+
const trimmed = buffer.trim();
|
|
454
|
+
if (trimmed.startsWith('data: ') && trimmed !== 'data: [DONE]') {
|
|
455
|
+
try {
|
|
456
|
+
const json = JSON.parse(trimmed.slice(6));
|
|
457
|
+
if (json.model)
|
|
458
|
+
model = json.model;
|
|
459
|
+
const delta = json.choices?.[0]?.delta;
|
|
460
|
+
if (delta?.content) {
|
|
461
|
+
if (ttftMs === undefined)
|
|
462
|
+
ttftMs = Date.now() - startTime;
|
|
463
|
+
content += delta.content;
|
|
464
|
+
}
|
|
465
|
+
const reason = json.choices?.[0]?.finish_reason;
|
|
466
|
+
if (reason)
|
|
467
|
+
finishReason = reason;
|
|
468
|
+
if (json.usage)
|
|
469
|
+
usage = json.usage;
|
|
470
|
+
}
|
|
471
|
+
catch (e) {
|
|
472
|
+
// Incomplete JSON in final buffer — log for diagnostics
|
|
473
|
+
process.stderr.write(`[houtini-lm] Unflushed buffer parse failed (${buffer.length} bytes): ${e}\n`);
|
|
474
|
+
}
|
|
475
|
+
}
|
|
476
|
+
}
|
|
162
477
|
}
|
|
163
478
|
finally {
|
|
164
479
|
// Release the reader — don't await cancel() as it can hang
|
|
165
480
|
reader.releaseLock();
|
|
166
481
|
}
|
|
167
|
-
|
|
482
|
+
const generationMs = Date.now() - startTime;
|
|
483
|
+
// Strip <think>...</think> reasoning blocks from models that always emit them
|
|
484
|
+
// (e.g. GLM Flash, Nemotron). Claude doesn't need the model's internal reasoning.
|
|
485
|
+
// Handle both closed blocks and unclosed ones (model ran out of tokens mid-think,
|
|
486
|
+
// or grammar-constrained output forced content before the closing tag).
|
|
487
|
+
let cleanContent = content.replace(/<think>[\s\S]*?<\/think>\s*/g, ''); // closed blocks
|
|
488
|
+
cleanContent = cleanContent.replace(/^<think>\s*/, ''); // orphaned opening tag
|
|
489
|
+
cleanContent = cleanContent.trim();
|
|
490
|
+
return { content: cleanContent, rawContent: content, model, usage, finishReason, truncated, ttftMs, generationMs };
|
|
168
491
|
}
|
|
492
|
+
/**
|
|
493
|
+
* Fetch models from LM Studio's native v0 API first (richer metadata),
|
|
494
|
+
* falling back to the OpenAI-compatible v1 endpoint for non-LM-Studio hosts.
|
|
495
|
+
*/
|
|
169
496
|
async function listModelsRaw() {
|
|
497
|
+
// Try v0 API first — returns type, arch, publisher, quantization, state
|
|
498
|
+
try {
|
|
499
|
+
const v0 = await fetchWithTimeout(`${LM_BASE_URL}/api/v0/models`, { headers: apiHeaders() });
|
|
500
|
+
if (v0.ok) {
|
|
501
|
+
const data = (await v0.json());
|
|
502
|
+
return data.data;
|
|
503
|
+
}
|
|
504
|
+
}
|
|
505
|
+
catch {
|
|
506
|
+
// v0 not available — fall through to v1
|
|
507
|
+
}
|
|
508
|
+
// Fallback: OpenAI-compatible v1 endpoint (works with Ollama, vLLM, llama.cpp)
|
|
170
509
|
const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/models`, { headers: apiHeaders() });
|
|
171
510
|
if (!res.ok)
|
|
172
511
|
throw new Error(`Failed to list models: ${res.status}`);
|
|
@@ -174,22 +513,137 @@ async function listModelsRaw() {
|
|
|
174
513
|
return data.data;
|
|
175
514
|
}
|
|
176
515
|
/**
 * Pick the context length for a model record: the first of
 * `loaded_context_length`, `max_context_length`, `context_length`,
 * `max_model_len` that is neither null nor undefined, else
 * FALLBACK_CONTEXT_LENGTH.
 */
function getContextLength(model) {
    const fieldsByPriority = [
        'loaded_context_length',
        'max_context_length',
        'context_length',
        'max_model_len',
    ];
    for (const field of fieldsByPriority) {
        const value = model[field];
        if (value !== null && value !== undefined) {
            return value;
        }
    }
    return FALLBACK_CONTEXT_LENGTH;
}
|
|
520
|
+
/** Return the model record's `max_context_length` (undefined when absent). */
function getMaxContextLength(model) {
    const { max_context_length: maxLength } = model;
    return maxLength;
}
|
|
523
|
+
/**
 * Choose which loaded model should handle a task of the given type.
 *
 * @param {string} taskType - Task category compared against each model's `bestTaskTypes` hints.
 * @returns {Promise<object>} `{ modelId, hints }`, plus `suggestion` when a
 *   not-loaded model lists this task type and the chosen model does not.
 */
async function routeToModel(taskType) {
    // Used when the server is unreachable or reports no loaded model.
    const fallbackRoute = () => ({ modelId: LM_MODEL || '', hints: getPromptHints(LM_MODEL) });
    const supportsTask = (modelHints) => (modelHints.bestTaskTypes ?? []).includes(taskType);
    let models;
    try {
        models = await listModelsRaw();
    }
    catch {
        return fallbackRoute();
    }
    // A record with no `state` field is treated as loaded.
    const loaded = models.filter((m) => m.state === 'loaded' || !m.state);
    const available = models.filter((m) => m.state === 'not-loaded');
    if (loaded.length === 0) {
        return fallbackRoute();
    }
    // Highest score wins; ties keep the earliest model in the list.
    let bestModel = loaded[0];
    let bestScore = -1;
    for (const candidate of loaded) {
        let score = supportsTask(getPromptHints(candidate.id, candidate.arch)) ? 10 : 0;
        const profile = getModelProfile(candidate);
        if (taskType === 'code' && profile?.family.toLowerCase().includes('coder')) {
            score += 5;
        }
        if (taskType === 'analysis') {
            const ctx = getContextLength(candidate);
            if (ctx && ctx > 100000) {
                score += 2;
            }
        }
        if (score > bestScore) {
            bestScore = score;
            bestModel = candidate;
        }
    }
    const hints = getPromptHints(bestModel.id, bestModel.arch);
    const result = { modelId: bestModel.id, hints };
    if (supportsTask(hints)) {
        return result;
    }
    // The winner does not list this task type: point at a downloaded model
    // that does, rather than loading it here.
    const better = available.find((m) => supportsTask(getPromptHints(m.id, m.arch)));
    if (better) {
        const labels = new Map([
            ['code', 'code tasks'],
            ['analysis', 'analysis'],
            ['embedding', 'embeddings'],
        ]);
        const label = labels.get(taskType) ?? 'this kind of task';
        result.suggestion = `💡 ${better.id} is downloaded and better suited for ${label} — ask the user to load it in LM Studio.`;
    }
    return result;
}
|
|
581
|
+
/**
 * Summarise quality signals for one response.
 *
 * @param {object} resp - Reads `usage`, `content`, `generationMs`, `truncated`, `finishReason`.
 * @param {string} rawContent - Text checked for a `<think>` tag.
 * @returns {object} Flags and timing figures; `tokPerSec` is null unless
 *   usage is present and generation took more than 50ms.
 */
function assessQuality(resp, rawContent) {
    const { usage, generationMs } = resp;
    let tokPerSec = null;
    if (usage && generationMs > 50) {
        tokPerSec = usage.completion_tokens / (generationMs / 1000);
    }
    return {
        truncated: resp.truncated,
        finishReason: resp.finishReason || 'unknown',
        thinkBlocksStripped: /<think>/.test(rawContent),
        // Token counts are estimates whenever no usage was reported for non-empty content.
        estimatedTokens: !usage && resp.content.length > 0,
        contentLength: resp.content.length,
        generationMs,
        tokPerSec,
    };
}
|
|
597
|
+
/**
 * Turn a quality summary into a one-line "Quality: ..." string.
 *
 * @param {object} quality - Reads `truncated`, `thinkBlocksStripped`, `estimatedTokens`, `finishReason`.
 * @returns {string} Comma-separated flags, or '' when no flag applies.
 */
function formatQualityLine(quality) {
    const checks = [
        [quality.truncated, 'TRUNCATED'],
        [quality.thinkBlocksStripped, 'think-blocks-stripped'],
        [quality.estimatedTokens, 'tokens-estimated'],
        [quality.finishReason === 'length', 'hit-max-tokens'],
    ];
    const flags = checks.filter(([raised]) => raised).map(([, label]) => label);
    return flags.length > 0 ? `Quality: ${flags.join(', ')}` : '';
}
|
|
180
611
|
/**
|
|
181
612
|
* Format a footer line for streaming results showing model, usage, and truncation status.
|
|
182
613
|
*/
|
|
183
614
|
function formatFooter(resp, extra) {
|
|
184
615
|
// Record usage for session tracking before formatting
|
|
185
|
-
recordUsage(resp
|
|
616
|
+
recordUsage(resp);
|
|
186
617
|
const parts = [];
|
|
187
618
|
if (resp.model)
|
|
188
619
|
parts.push(`Model: ${resp.model}`);
|
|
189
|
-
if (resp.usage)
|
|
190
|
-
parts.push(
|
|
620
|
+
if (resp.usage) {
|
|
621
|
+
parts.push(`${resp.usage.prompt_tokens}→${resp.usage.completion_tokens} tokens`);
|
|
622
|
+
}
|
|
623
|
+
else if (resp.content.length > 0) {
|
|
624
|
+
// Estimate when usage is missing (truncated responses where final SSE chunk was lost)
|
|
625
|
+
const estTokens = Math.ceil(resp.content.length / 4);
|
|
626
|
+
parts.push(`~${estTokens} tokens (estimated)`);
|
|
627
|
+
}
|
|
628
|
+
// Perf stats — computed from streaming, no proprietary API needed
|
|
629
|
+
const perfParts = [];
|
|
630
|
+
if (resp.ttftMs !== undefined)
|
|
631
|
+
perfParts.push(`TTFT: ${resp.ttftMs}ms`);
|
|
632
|
+
if (resp.usage && resp.generationMs > 50) {
|
|
633
|
+
const tokPerSec = resp.usage.completion_tokens / (resp.generationMs / 1000);
|
|
634
|
+
perfParts.push(`${tokPerSec.toFixed(1)} tok/s`);
|
|
635
|
+
}
|
|
636
|
+
if (resp.generationMs)
|
|
637
|
+
perfParts.push(`${(resp.generationMs / 1000).toFixed(1)}s`);
|
|
638
|
+
if (perfParts.length > 0)
|
|
639
|
+
parts.push(perfParts.join(', '));
|
|
191
640
|
if (extra)
|
|
192
641
|
parts.push(extra);
|
|
642
|
+
// Quality signals — structured metadata for orchestrator trust decisions
|
|
643
|
+
const quality = assessQuality(resp, resp.rawContent);
|
|
644
|
+
const qualityLine = formatQualityLine(quality);
|
|
645
|
+
if (qualityLine)
|
|
646
|
+
parts.push(qualityLine);
|
|
193
647
|
if (resp.truncated)
|
|
194
648
|
parts.push('⚠ TRUNCATED (soft timeout — partial result)');
|
|
195
649
|
const sessionLine = sessionSummary();
|
|
@@ -223,7 +677,9 @@ const TOOLS = [
|
|
|
223
677
|
'(5) For code generation, include the surrounding context (imports, types, function signatures).\n\n' +
|
|
224
678
|
'QA: Always review the local LLM\'s output before using it. Verify correctness, check edge cases, ' +
|
|
225
679
|
'and fix any issues. You are the architect — the local model is a fast drafter, not the final authority.\n\n' +
|
|
226
|
-
'
|
|
680
|
+
'ROUTING: If multiple models are loaded, houtini-lm automatically picks the best one for the task. ' +
|
|
681
|
+
'If a better model is downloaded but not loaded, you\'ll see a suggestion in the response footer. ' +
|
|
682
|
+
'Call discover to see what\'s available.',
|
|
227
683
|
inputSchema: {
|
|
228
684
|
type: 'object',
|
|
229
685
|
properties: {
|
|
@@ -243,6 +699,10 @@ const TOOLS = [
|
|
|
243
699
|
type: 'number',
|
|
244
700
|
description: 'Max response tokens. Default 2048. Use higher for code generation, lower for quick answers.',
|
|
245
701
|
},
|
|
702
|
+
json_schema: {
|
|
703
|
+
type: 'object',
|
|
704
|
+
description: 'Force structured JSON output. Provide a JSON Schema object and the response will be guaranteed valid JSON conforming to it. Example: {"name":"result","schema":{"type":"object","properties":{"answer":{"type":"string"}},"required":["answer"]}}',
|
|
705
|
+
},
|
|
246
706
|
},
|
|
247
707
|
required: ['message'],
|
|
248
708
|
},
|
|
@@ -287,6 +747,10 @@ const TOOLS = [
|
|
|
287
747
|
type: 'number',
|
|
288
748
|
description: 'Max response tokens. Default 2048.',
|
|
289
749
|
},
|
|
750
|
+
json_schema: {
|
|
751
|
+
type: 'object',
|
|
752
|
+
description: 'Force structured JSON output. Provide a JSON Schema object and the response will be guaranteed valid JSON conforming to it.',
|
|
753
|
+
},
|
|
290
754
|
},
|
|
291
755
|
required: ['instruction'],
|
|
292
756
|
},
|
|
@@ -341,43 +805,135 @@ const TOOLS = [
|
|
|
341
805
|
},
|
|
342
806
|
{
|
|
343
807
|
name: 'list_models',
|
|
344
|
-
description: 'List all models
|
|
345
|
-
'
|
|
808
|
+
description: 'List all models on the local LLM server — both loaded (ready) and available (downloaded but not active). ' +
|
|
809
|
+
'Shows rich metadata for each model: type (llm/vlm/embeddings), architecture, quantization, context window, ' +
|
|
810
|
+
'and a capability profile describing what the model is best at. ' +
|
|
811
|
+
'Use this to understand which models are available and suggest switching when a different model would suit the task better.',
|
|
346
812
|
inputSchema: { type: 'object', properties: {} },
|
|
347
813
|
},
|
|
814
|
+
{
|
|
815
|
+
name: 'embed',
|
|
816
|
+
description: 'Generate text embeddings via the local LLM server. Requires an embedding model to be loaded ' +
|
|
817
|
+
'(e.g. Nomic Embed). Returns a vector representation of the input text for semantic search, ' +
|
|
818
|
+
'similarity comparison, or RAG pipelines. Uses the OpenAI-compatible /v1/embeddings endpoint.',
|
|
819
|
+
inputSchema: {
|
|
820
|
+
type: 'object',
|
|
821
|
+
properties: {
|
|
822
|
+
input: {
|
|
823
|
+
type: 'string',
|
|
824
|
+
description: 'The text to embed. Can be a single string.',
|
|
825
|
+
},
|
|
826
|
+
model: {
|
|
827
|
+
type: 'string',
|
|
828
|
+
description: 'Embedding model ID. If omitted, uses whatever embedding model is loaded.',
|
|
829
|
+
},
|
|
830
|
+
},
|
|
831
|
+
required: ['input'],
|
|
832
|
+
},
|
|
833
|
+
},
|
|
348
834
|
];
|
|
349
835
|
// ── MCP Server ───────────────────────────────────────────────────────
|
|
350
|
-
const server = new Server({ name: 'houtini-lm', version: '2.
|
|
836
|
+
const server = new Server({ name: 'houtini-lm', version: '2.8.0' }, { capabilities: { tools: {}, resources: {} } });
|
|
837
|
+
// ── MCP Resources ─────────────────────────────────────────────────────
|
|
838
|
+
// Exposes session performance metrics as a readable resource so Claude can
|
|
839
|
+
// proactively check offload efficiency and make smarter delegation decisions.
|
|
840
|
+
server.setRequestHandler(ListResourcesRequestSchema, async () => ({
|
|
841
|
+
resources: [
|
|
842
|
+
{
|
|
843
|
+
uri: 'houtini://metrics/session',
|
|
844
|
+
name: 'Session Offload Metrics',
|
|
845
|
+
description: 'Cumulative token offload stats, per-model performance, and quality signals for the current session.',
|
|
846
|
+
mimeType: 'application/json',
|
|
847
|
+
},
|
|
848
|
+
],
|
|
849
|
+
}));
|
|
850
|
+
server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
|
|
851
|
+
const { uri } = request.params;
|
|
852
|
+
if (uri === 'houtini://metrics/session') {
|
|
853
|
+
const modelStats = {};
|
|
854
|
+
for (const [modelId, stats] of session.modelStats) {
|
|
855
|
+
modelStats[modelId] = {
|
|
856
|
+
calls: stats.calls,
|
|
857
|
+
avgTtftMs: stats.calls > 0 ? Math.round(stats.totalTtftMs / stats.calls) : 0,
|
|
858
|
+
avgTokPerSec: stats.perfCalls > 0 ? parseFloat((stats.totalTokPerSec / stats.perfCalls).toFixed(1)) : null,
|
|
859
|
+
};
|
|
860
|
+
}
|
|
861
|
+
const metrics = {
|
|
862
|
+
session: {
|
|
863
|
+
totalCalls: session.calls,
|
|
864
|
+
promptTokens: session.promptTokens,
|
|
865
|
+
completionTokens: session.completionTokens,
|
|
866
|
+
totalTokensOffloaded: session.promptTokens + session.completionTokens,
|
|
867
|
+
},
|
|
868
|
+
perModel: modelStats,
|
|
869
|
+
endpoint: LM_BASE_URL,
|
|
870
|
+
};
|
|
871
|
+
return {
|
|
872
|
+
contents: [{
|
|
873
|
+
uri,
|
|
874
|
+
mimeType: 'application/json',
|
|
875
|
+
text: JSON.stringify(metrics, null, 2),
|
|
876
|
+
}],
|
|
877
|
+
};
|
|
878
|
+
}
|
|
879
|
+
throw new Error(`Unknown resource: ${uri}`);
|
|
880
|
+
});
|
|
351
881
|
server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: TOOLS }));
|
|
352
882
|
server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
353
883
|
const { name, arguments: args } = request.params;
|
|
884
|
+
const progressToken = request.params._meta?.progressToken;
|
|
354
885
|
try {
|
|
355
886
|
switch (name) {
|
|
356
887
|
case 'chat': {
|
|
357
|
-
const { message, system, temperature, max_tokens } = args;
|
|
888
|
+
const { message, system, temperature, max_tokens, json_schema } = args;
|
|
889
|
+
const route = await routeToModel('chat');
|
|
358
890
|
const messages = [];
|
|
359
|
-
if
|
|
360
|
-
|
|
891
|
+
// Inject output constraint into system prompt if the model needs it
|
|
892
|
+
const systemContent = system
|
|
893
|
+
? (route.hints.outputConstraint ? `${system}\n\n${route.hints.outputConstraint}` : system)
|
|
894
|
+
: (route.hints.outputConstraint || undefined);
|
|
895
|
+
if (systemContent)
|
|
896
|
+
messages.push({ role: 'system', content: systemContent });
|
|
361
897
|
messages.push({ role: 'user', content: message });
|
|
898
|
+
const responseFormat = json_schema
|
|
899
|
+
? { type: 'json_schema', json_schema: { name: json_schema.name, strict: json_schema.strict ?? true, schema: json_schema.schema } }
|
|
900
|
+
: undefined;
|
|
362
901
|
const resp = await chatCompletionStreaming(messages, {
|
|
363
|
-
temperature,
|
|
902
|
+
temperature: temperature ?? route.hints.chatTemp,
|
|
364
903
|
maxTokens: max_tokens,
|
|
904
|
+
model: route.modelId,
|
|
905
|
+
responseFormat,
|
|
906
|
+
progressToken,
|
|
365
907
|
});
|
|
366
908
|
const footer = formatFooter(resp);
|
|
367
909
|
return { content: [{ type: 'text', text: resp.content + footer }] };
|
|
368
910
|
}
|
|
369
911
|
case 'custom_prompt': {
|
|
370
|
-
const { system, context, instruction, temperature, max_tokens } = args;
|
|
912
|
+
const { system, context, instruction, temperature, max_tokens, json_schema } = args;
|
|
913
|
+
const route = await routeToModel('analysis');
|
|
371
914
|
const messages = [];
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
if (
|
|
376
|
-
|
|
377
|
-
|
|
915
|
+
const systemContent = system
|
|
916
|
+
? (route.hints.outputConstraint ? `${system}\n\n${route.hints.outputConstraint}` : system)
|
|
917
|
+
: (route.hints.outputConstraint || undefined);
|
|
918
|
+
if (systemContent)
|
|
919
|
+
messages.push({ role: 'system', content: systemContent });
|
|
920
|
+
// Multi-turn format prevents context bleed in smaller models.
|
|
921
|
+
// Context goes in a separate user→assistant exchange so the model
|
|
922
|
+
// "acknowledges" it before receiving the actual instruction.
|
|
923
|
+
if (context) {
|
|
924
|
+
messages.push({ role: 'user', content: `Here is the context for analysis:\n\n${context}` });
|
|
925
|
+
messages.push({ role: 'assistant', content: 'Understood. I have read the full context. What would you like me to do with it?' });
|
|
926
|
+
}
|
|
927
|
+
messages.push({ role: 'user', content: instruction });
|
|
928
|
+
const responseFormat = json_schema
|
|
929
|
+
? { type: 'json_schema', json_schema: { name: json_schema.name, strict: json_schema.strict ?? true, schema: json_schema.schema } }
|
|
930
|
+
: undefined;
|
|
378
931
|
const resp = await chatCompletionStreaming(messages, {
|
|
379
|
-
temperature,
|
|
932
|
+
temperature: temperature ?? route.hints.chatTemp,
|
|
380
933
|
maxTokens: max_tokens,
|
|
934
|
+
model: route.modelId,
|
|
935
|
+
responseFormat,
|
|
936
|
+
progressToken,
|
|
381
937
|
});
|
|
382
938
|
const footer = formatFooter(resp);
|
|
383
939
|
return {
|
|
@@ -387,22 +943,31 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
387
943
|
case 'code_task': {
|
|
388
944
|
const { code, task, language, max_tokens: codeMaxTokens } = args;
|
|
389
945
|
const lang = language || 'unknown';
|
|
946
|
+
const route = await routeToModel('code');
|
|
947
|
+
const outputConstraint = route.hints.outputConstraint
|
|
948
|
+
? ` ${route.hints.outputConstraint}`
|
|
949
|
+
: '';
|
|
950
|
+
// Task goes in system message so smaller models don't lose it once
|
|
951
|
+
// the code block fills the attention window. Code is sole user content.
|
|
390
952
|
const codeMessages = [
|
|
391
953
|
{
|
|
392
954
|
role: 'system',
|
|
393
|
-
content: `Expert ${lang} developer.
|
|
955
|
+
content: `Expert ${lang} developer. Your task: ${task}\n\nBe specific — reference line numbers, function names, and concrete fixes. Output your analysis as a markdown list.${outputConstraint}`,
|
|
394
956
|
},
|
|
395
957
|
{
|
|
396
958
|
role: 'user',
|
|
397
|
-
content:
|
|
959
|
+
content: `\`\`\`${lang}\n${code}\n\`\`\``,
|
|
398
960
|
},
|
|
399
961
|
];
|
|
400
962
|
const codeResp = await chatCompletionStreaming(codeMessages, {
|
|
401
|
-
temperature:
|
|
963
|
+
temperature: route.hints.codeTemp,
|
|
402
964
|
maxTokens: codeMaxTokens ?? DEFAULT_MAX_TOKENS,
|
|
965
|
+
model: route.modelId,
|
|
966
|
+
progressToken,
|
|
403
967
|
});
|
|
404
968
|
const codeFooter = formatFooter(codeResp, lang);
|
|
405
|
-
|
|
969
|
+
const suggestionLine = route.suggestion ? `\n${route.suggestion}` : '';
|
|
970
|
+
return { content: [{ type: 'text', text: codeResp.content + codeFooter + suggestionLine }] };
|
|
406
971
|
}
|
|
407
972
|
case 'discover': {
|
|
408
973
|
const start = Date.now();
|
|
@@ -431,41 +996,101 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
431
996
|
}],
|
|
432
997
|
};
|
|
433
998
|
}
|
|
434
|
-
const
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
});
|
|
438
|
-
const primary = models[0];
|
|
999
|
+
const loaded = models.filter((m) => m.state === 'loaded' || !m.state);
|
|
1000
|
+
const available = models.filter((m) => m.state === 'not-loaded');
|
|
1001
|
+
const primary = loaded[0] || models[0];
|
|
439
1002
|
const ctx = getContextLength(primary);
|
|
1003
|
+
const primaryProfile = await getModelProfileAsync(primary);
|
|
440
1004
|
const sessionStats = session.calls > 0
|
|
441
1005
|
? `\nSession stats: ${(session.promptTokens + session.completionTokens).toLocaleString()} tokens offloaded across ${session.calls} call${session.calls === 1 ? '' : 's'}`
|
|
442
1006
|
: '\nSession stats: no calls yet — delegate tasks to start saving tokens';
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
}
|
|
455
|
-
|
|
1007
|
+
let text = `Status: ONLINE\n` +
|
|
1008
|
+
`Endpoint: ${LM_BASE_URL}\n` +
|
|
1009
|
+
`Latency: ${ms}ms\n` +
|
|
1010
|
+
`Active model: ${primary.id}\n` +
|
|
1011
|
+
`Context window: ${ctx.toLocaleString()} tokens\n`;
|
|
1012
|
+
if (primaryProfile) {
|
|
1013
|
+
text += `Family: ${primaryProfile.family}\n`;
|
|
1014
|
+
text += `Description: ${primaryProfile.description}\n`;
|
|
1015
|
+
text += `Best for: ${primaryProfile.bestFor.join(', ')}\n`;
|
|
1016
|
+
text += `Strengths: ${primaryProfile.strengths.join(', ')}\n`;
|
|
1017
|
+
if (primaryProfile.weaknesses.length > 0) {
|
|
1018
|
+
text += `Weaknesses: ${primaryProfile.weaknesses.join(', ')}\n`;
|
|
1019
|
+
}
|
|
1020
|
+
}
|
|
1021
|
+
if (loaded.length > 0) {
|
|
1022
|
+
text += `\nLoaded models (● ready to use):\n`;
|
|
1023
|
+
text += (await Promise.all(loaded.map((m) => formatModelDetail(m)))).join('\n\n');
|
|
1024
|
+
}
|
|
1025
|
+
if (available.length > 0) {
|
|
1026
|
+
text += `\n\nAvailable models (○ downloaded, not loaded — can be activated in LM Studio):\n`;
|
|
1027
|
+
text += (await Promise.all(available.map((m) => formatModelDetail(m)))).join('\n\n');
|
|
1028
|
+
}
|
|
1029
|
+
// Per-model performance stats from this session
|
|
1030
|
+
if (session.modelStats.size > 0) {
|
|
1031
|
+
text += `\n\nPerformance (this session):\n`;
|
|
1032
|
+
for (const [modelId, stats] of session.modelStats) {
|
|
1033
|
+
const avgTtft = stats.calls > 0 ? Math.round(stats.totalTtftMs / stats.calls) : 0;
|
|
1034
|
+
const avgTokSec = stats.perfCalls > 0 ? (stats.totalTokPerSec / stats.perfCalls).toFixed(1) : '?';
|
|
1035
|
+
text += ` ${modelId}: ${stats.calls} calls, avg TTFT ${avgTtft}ms, avg ${avgTokSec} tok/s\n`;
|
|
1036
|
+
}
|
|
1037
|
+
}
|
|
1038
|
+
text += `${sessionStats}\n\n`;
|
|
1039
|
+
text += `The local LLM is available. You can delegate tasks using chat, custom_prompt, code_task, or embed.`;
|
|
1040
|
+
return { content: [{ type: 'text', text }] };
|
|
456
1041
|
}
|
|
457
1042
|
case 'list_models': {
|
|
458
1043
|
const models = await listModelsRaw();
|
|
459
1044
|
if (!models.length) {
|
|
460
|
-
return { content: [{ type: 'text', text: 'No models currently loaded.' }] };
|
|
1045
|
+
return { content: [{ type: 'text', text: 'No models currently loaded or available.' }] };
|
|
1046
|
+
}
|
|
1047
|
+
const loaded = models.filter((m) => m.state === 'loaded' || !m.state);
|
|
1048
|
+
const available = models.filter((m) => m.state === 'not-loaded');
|
|
1049
|
+
let text = '';
|
|
1050
|
+
// list_models enriches with HuggingFace data (cached after first call)
|
|
1051
|
+
if (loaded.length > 0) {
|
|
1052
|
+
text += `Loaded models (● ready to use):\n\n`;
|
|
1053
|
+
text += (await Promise.all(loaded.map((m) => formatModelDetail(m, true)))).join('\n\n');
|
|
461
1054
|
}
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
1055
|
+
if (available.length > 0) {
|
|
1056
|
+
if (text)
|
|
1057
|
+
text += '\n\n';
|
|
1058
|
+
text += `Available models (○ downloaded, not loaded):\n\n`;
|
|
1059
|
+
text += (await Promise.all(available.map((m) => formatModelDetail(m, true)))).join('\n\n');
|
|
1060
|
+
}
|
|
1061
|
+
return { content: [{ type: 'text', text }] };
|
|
1062
|
+
}
|
|
1063
|
+
case 'embed': {
|
|
1064
|
+
const { input, model: embedModel } = args;
|
|
1065
|
+
return await withInferenceLock(async () => {
|
|
1066
|
+
const embedBody = { input };
|
|
1067
|
+
if (embedModel) {
|
|
1068
|
+
embedBody.model = embedModel;
|
|
1069
|
+
}
|
|
1070
|
+
const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/embeddings`, { method: 'POST', headers: apiHeaders(), body: JSON.stringify(embedBody) }, INFERENCE_CONNECT_TIMEOUT_MS);
|
|
1071
|
+
if (!res.ok) {
|
|
1072
|
+
const errText = await res.text().catch(() => '');
|
|
1073
|
+
throw new Error(`Embeddings API error ${res.status}: ${errText}`);
|
|
1074
|
+
}
|
|
1075
|
+
const data = (await res.json());
|
|
1076
|
+
const embedding = data.data[0]?.embedding;
|
|
1077
|
+
if (!embedding)
|
|
1078
|
+
throw new Error('No embedding returned');
|
|
1079
|
+
const usageInfo = data.usage
|
|
1080
|
+
? `${data.usage.prompt_tokens} tokens embedded`
|
|
1081
|
+
: '';
|
|
1082
|
+
return {
|
|
1083
|
+
content: [{
|
|
1084
|
+
type: 'text',
|
|
1085
|
+
text: JSON.stringify({
|
|
1086
|
+
model: data.model,
|
|
1087
|
+
dimensions: embedding.length,
|
|
1088
|
+
embedding,
|
|
1089
|
+
usage: usageInfo,
|
|
1090
|
+
}),
|
|
1091
|
+
}],
|
|
1092
|
+
};
|
|
465
1093
|
});
|
|
466
|
-
return {
|
|
467
|
-
content: [{ type: 'text', text: `Loaded models:\n${lines.join('\n')}` }],
|
|
468
|
-
};
|
|
469
1094
|
}
|
|
470
1095
|
default:
|
|
471
1096
|
throw new Error(`Unknown tool: ${name}`);
|
|
@@ -482,6 +1107,11 @@ async function main() {
|
|
|
482
1107
|
const transport = new StdioServerTransport();
|
|
483
1108
|
await server.connect(transport);
|
|
484
1109
|
process.stderr.write(`Houtini LM server running (${LM_BASE_URL})\n`);
|
|
1110
|
+
// Background: profile all available models via HF → SQLite cache
|
|
1111
|
+
// Non-blocking — server is already accepting requests
|
|
1112
|
+
listModelsRaw()
|
|
1113
|
+
.then((models) => profileModelsAtStartup(models))
|
|
1114
|
+
.catch((err) => process.stderr.write(`[houtini-lm] Startup profiling skipped: ${err}\n`));
|
|
485
1115
|
}
|
|
486
1116
|
main().catch((error) => {
|
|
487
1117
|
process.stderr.write(`Fatal error: ${error}\n`);
|