@houtini/lm 2.3.0 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +202 -114
- package/dist/index.js +562 -49
- package/dist/index.js.map +1 -1
- package/dist/model-cache.d.ts +87 -0
- package/dist/model-cache.js +516 -0
- package/dist/model-cache.js.map +1 -0
- package/package.json +4 -2
- package/server.json +44 -44
package/dist/index.js
CHANGED
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
|
|
9
9
|
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
10
10
|
import { CallToolRequestSchema, ListToolsRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
|
|
11
|
+
import { profileModelsAtStartup, getCachedProfile, toModelProfile as cachedToProfile, getHFEnrichmentLine, getPromptHints, } from './model-cache.js';
|
|
11
12
|
const LM_BASE_URL = process.env.LM_STUDIO_URL || 'http://localhost:1234';
|
|
12
13
|
const LM_MODEL = process.env.LM_STUDIO_MODEL || '';
|
|
13
14
|
const LM_PASSWORD = process.env.LM_STUDIO_PASSWORD || '';
|
|
@@ -18,12 +19,269 @@ const INFERENCE_CONNECT_TIMEOUT_MS = 30_000; // generous connect timeout for inf
|
|
|
18
19
|
const SOFT_TIMEOUT_MS = 55_000; // return partial results before MCP SDK ~60s timeout
|
|
19
20
|
const READ_CHUNK_TIMEOUT_MS = 30_000; // max wait for a single SSE chunk
|
|
20
21
|
// Context window assumed when a model reports no context size of its own.
// LM_CONTEXT_WINDOW overrides the 100000 default; a value that does not parse
// to a positive integer is ignored so a typo in the env var cannot turn the
// fallback into NaN.
const FALLBACK_CONTEXT_LENGTH = (() => {
    const parsed = Number.parseInt(process.env.LM_CONTEXT_WINDOW || '100000', 10);
    return Number.isFinite(parsed) && parsed > 0 ? parsed : 100000;
})();
|
|
22
|
+
// ── Session-level token accounting ───────────────────────────────────
// Cumulative tokens offloaded to the local LLM across all calls in this
// process. The totals are rendered by sessionSummary() so the caller can
// see how much work has been delegated so far.
const session = {
    calls: 0,
    promptTokens: 0,
    completionTokens: 0,
    /** Per-model performance tracking for routing insights */
    modelStats: new Map(),
};
/**
 * Fold one completion response into the session totals.
 *
 * Reads `resp.usage` (prompt_tokens / completion_tokens), `resp.model`,
 * `resp.ttftMs` and `resp.generationMs`. Every call bumps `session.calls`;
 * per-model stats are only updated when `resp.model` is set.
 *
 * @param {object} resp - Response object as returned by the streaming helper.
 */
function recordUsage(resp) {
    // Servers differ in what they report: a missing or non-numeric count must
    // contribute 0 rather than turning the running totals into NaN for good.
    const count = (value) => (Number.isFinite(value) && value > 0 ? value : 0);
    session.calls++;
    const completionTokens = count(resp.usage?.completion_tokens);
    if (resp.usage) {
        session.promptTokens += count(resp.usage.prompt_tokens);
        session.completionTokens += completionTokens;
    }
    if (!resp.model)
        return;
    // Track per-model perf stats
    const stats = session.modelStats.get(resp.model)
        ?? { calls: 0, perfCalls: 0, totalTtftMs: 0, totalTokPerSec: 0 };
    stats.calls++;
    if (resp.ttftMs)
        stats.totalTtftMs += resp.ttftMs;
    // Throughput only counts when both a token count and a duration exist.
    const tokPerSec = resp.generationMs > 0
        ? completionTokens / (resp.generationMs / 1000)
        : 0;
    if (tokPerSec > 0) {
        stats.perfCalls++;
        stats.totalTokPerSec += tokPerSec;
    }
    session.modelStats.set(resp.model, stats);
}
/**
 * One-line summary of tokens offloaded this session.
 *
 * @returns {string} Empty string before the first recorded call.
 */
function sessionSummary() {
    if (session.calls === 0)
        return '';
    const total = session.promptTokens + session.completionTokens;
    const plural = session.calls === 1 ? '' : 's';
    return `Session: ${total.toLocaleString()} tokens offloaded across ${session.calls} call${plural}`;
}
|
|
21
61
|
/**
 * Headers sent with every request to the LLM server: always JSON content
 * type, plus a Bearer Authorization header when LM_PASSWORD is non-empty.
 */
function apiHeaders() {
    const auth = LM_PASSWORD ? { 'Authorization': `Bearer ${LM_PASSWORD}` } : {};
    return { 'Content-Type': 'application/json', ...auth };
}
|
|
67
|
+
// Curated capability profiles, matched against a model's id (and, failing
// that, its architecture string). Order matters: the first matching pattern
// wins, so more specific families (e.g. Qwen3 Coder) precede general ones.
const MODEL_PROFILES = [
    {
        pattern: /nemotron|nemotron_h_moe/i,
        profile: {
            family: 'NVIDIA Nemotron',
            description: 'NVIDIA\'s compact reasoning model optimised for accurate, structured responses. Strong at step-by-step logic and instruction following.',
            strengths: ['logical reasoning', 'math', 'step-by-step problem solving', 'code review', 'structured output'],
            weaknesses: ['creative writing', 'constrained generation', 'factual knowledge on niche topics'],
            bestFor: ['analysis tasks', 'code bug-finding', 'math/science questions', 'data transformation'],
        },
    },
    {
        pattern: /granite|granitehybrid/i,
        profile: {
            family: 'IBM Granite',
            description: 'IBM\'s enterprise-focused model family. Compact and efficient, designed for business and code tasks with strong instruction following.',
            strengths: ['code generation', 'instruction following', 'enterprise tasks', 'efficiency'],
            weaknesses: ['creative tasks', 'long-form generation'],
            bestFor: ['boilerplate generation', 'code explanation', 'structured Q&A'],
        },
    },
    {
        pattern: /qwen3-coder|qwen3.*coder/i,
        profile: {
            family: 'Qwen3 Coder',
            description: 'Alibaba\'s code-specialised model with agentic capabilities. Excellent at code generation, review, and multi-step coding tasks.',
            strengths: ['code generation', 'code review', 'debugging', 'test writing', 'refactoring', 'multi-step reasoning'],
            weaknesses: ['non-code creative tasks'],
            bestFor: ['code generation', 'code review', 'test stubs', 'type definitions', 'refactoring'],
        },
    },
    {
        pattern: /qwen3-vl|qwen.*vl/i,
        profile: {
            family: 'Qwen3 Vision-Language',
            description: 'Alibaba\'s multimodal model handling both text and image inputs. Can analyse screenshots, diagrams, and visual content.',
            strengths: ['image understanding', 'visual Q&A', 'diagram analysis', 'OCR'],
            weaknesses: ['pure text tasks (use a text-only model instead)'],
            bestFor: ['screenshot analysis', 'UI review', 'diagram interpretation'],
        },
    },
    {
        pattern: /qwen3(?!.*coder)(?!.*vl)/i,
        profile: {
            family: 'Qwen3',
            description: 'Alibaba\'s general-purpose model with strong multilingual and reasoning capabilities. Good all-rounder.',
            strengths: ['general reasoning', 'multilingual', 'code', 'instruction following'],
            weaknesses: ['specialised code tasks (use Qwen3 Coder instead)'],
            bestFor: ['general Q&A', 'translation', 'summarisation', 'brainstorming'],
        },
    },
    {
        // (?!\d) keeps size suffixes such as "codellama-34b" or "llama-30b"
        // from being read as the LLaMA 3 generation; "llama-3.1", "llama3:8b"
        // and "llama-3-70b" still match.
        pattern: /llama[- ]?3(?!\d)/i,
        profile: {
            family: 'Meta LLaMA 3',
            description: 'Meta\'s open-weight general-purpose model. Strong baseline across tasks with large community fine-tune ecosystem.',
            strengths: ['general reasoning', 'code', 'instruction following', 'broad knowledge'],
            weaknesses: ['specialised tasks where fine-tuned models excel'],
            bestFor: ['general delegation', 'drafting', 'code review', 'Q&A'],
        },
    },
    {
        pattern: /minimax[- ]?m2/i,
        profile: {
            family: 'MiniMax M2',
            description: 'MiniMax\'s large MoE model with strong long-context and reasoning capabilities.',
            strengths: ['long context', 'reasoning', 'creative writing', 'multilingual'],
            weaknesses: ['may be slower due to model size'],
            bestFor: ['long document analysis', 'creative tasks', 'complex reasoning'],
        },
    },
    {
        pattern: /kimi[- ]?k2/i,
        profile: {
            family: 'Kimi K2',
            description: 'Moonshot AI\'s large MoE model with strong agentic and tool-use capabilities.',
            strengths: ['agentic tasks', 'tool use', 'code', 'reasoning', 'long context'],
            weaknesses: ['may be slower due to model size'],
            bestFor: ['complex multi-step tasks', 'code generation', 'reasoning chains'],
        },
    },
    {
        pattern: /gpt-oss/i,
        profile: {
            family: 'OpenAI GPT-OSS',
            description: 'OpenAI\'s open-source model release. General-purpose with strong instruction following.',
            strengths: ['instruction following', 'general reasoning', 'code'],
            weaknesses: ['less tested in open ecosystem than LLaMA/Qwen'],
            bestFor: ['general delegation', 'code tasks', 'Q&A'],
        },
    },
    {
        pattern: /glm[- ]?4/i,
        profile: {
            family: 'GLM-4',
            description: 'Zhipu AI\'s open-weight MoE model. Fast inference with strong general reasoning, multilingual support, and tool-use capabilities. Uses chain-of-thought reasoning internally. MIT licensed.',
            strengths: ['fast inference', 'general reasoning', 'tool use', 'multilingual', 'code', 'instruction following', 'chain-of-thought'],
            weaknesses: ['always emits internal reasoning (stripped automatically)', 'less tested in English-only benchmarks than LLaMA/Qwen'],
            bestFor: ['general delegation', 'fast drafting', 'code tasks', 'structured output', 'Q&A'],
        },
    },
    {
        pattern: /nomic.*embed|embed.*nomic/i,
        profile: {
            family: 'Nomic Embed',
            description: 'Text embedding model for semantic search and similarity. Not a chat model — produces vector embeddings.',
            strengths: ['text embeddings', 'semantic search', 'clustering'],
            weaknesses: ['cannot chat or generate text'],
            bestFor: ['RAG pipelines', 'semantic similarity', 'document search'],
        },
    },
    {
        pattern: /abliterated/i,
        profile: {
            family: 'Abliterated (uncensored)',
            description: 'Community fine-tune with safety guardrails removed. More permissive but may produce lower-quality or unreliable output.',
            strengths: ['fewer refusals', 'unconstrained generation'],
            weaknesses: ['may hallucinate more', 'no safety filtering', 'less tested'],
            bestFor: ['tasks where the base model refuses unnecessarily'],
        },
    },
];
|
|
189
|
+
/**
 * Look up the curated static profile for a model.
 *
 * The model id is tried against every MODEL_PROFILES pattern first; only when
 * none matches is the architecture string (if present) tried the same way.
 * This synchronous lookup never consults the profile cache — see
 * getModelProfileAsync for that.
 *
 * @param {{id: string, arch?: string}} model
 * @returns {object|undefined} The matching profile, or undefined.
 */
function getModelProfile(model) {
    const lookup = (text) => MODEL_PROFILES.find(({ pattern }) => pattern.test(text))?.profile;
    const byId = lookup(model.id);
    if (byId)
        return byId;
    return model.arch ? lookup(model.arch) : undefined;
}
|
|
207
|
+
/**
 * Resolve the most complete profile available for a model.
 *
 * A curated static profile always wins. Otherwise the cached profile for
 * `model.id` is fetched and converted; any failure during that lookup is
 * treated as "no profile" rather than surfaced to the caller.
 *
 * @param {{id: string, arch?: string}} model
 * @returns {Promise<object|undefined>}
 */
async function getModelProfileAsync(model) {
    const curated = getModelProfile(model);
    if (curated)
        return curated;
    let fromCache;
    try {
        const entry = await getCachedProfile(model.id);
        fromCache = entry ? cachedToProfile(entry) : undefined;
    }
    catch {
        // Cache lookup is best-effort — a failure simply means no profile
        return undefined;
    }
    return fromCache || undefined;
}
|
|
230
|
+
/**
 * Render one model's metadata as a multi-line text block.
 *
 * Lines, in order: a header with a loaded/unloaded marker and the id; a
 * " · "-joined metadata line; capabilities; the profile's family,
 * description and best-for list; and, when `enrichWithHF` is true, an
 * optional enrichment line. Sections with no data are omitted.
 *
 * @param {object} model - Model record from the server's model listing.
 * @param {boolean} [enrichWithHF=false] - Also append the enrichment line.
 * @returns {Promise<string>} Lines joined with "\n".
 */
async function formatModelDetail(model, enrichWithHF = false) {
    const ctx = getContextLength(model);
    const maxCtx = getMaxContextLength(model);
    // Static profile first, cached profile as fallback
    const profile = await getModelProfileAsync(model);
    const marker = model.state === 'loaded' ? '●' : '○';
    const lines = [` ${marker} ${model.id}`];
    // Simple "label: value" fields, emitted only when the value is set
    const labelled = [
        ['type', model.type],
        ['arch', model.arch],
        ['quant', model.quantization],
        ['format', model.compatibility_type],
    ];
    const meta = labelled
        .filter(([, value]) => value)
        .map(([label, value]) => `${label}: ${value}`);
    // Show loaded vs max context only when both are known and they differ
    const loadedCtx = model.loaded_context_length;
    if (loadedCtx && maxCtx && loadedCtx !== maxCtx) {
        meta.push(`context: ${loadedCtx.toLocaleString()} (max ${maxCtx.toLocaleString()})`);
    }
    else if (ctx) {
        meta.push(`context: ${ctx.toLocaleString()}`);
    }
    if (model.publisher) {
        meta.push(`by: ${model.publisher}`);
    }
    if (meta.length > 0) {
        lines.push(` ${meta.join(' · ')}`);
    }
    if (model.capabilities?.length > 0) {
        lines.push(` Capabilities: ${model.capabilities.join(', ')}`);
    }
    if (profile) {
        lines.push(` ${profile.family}: ${profile.description}`, ` Best for: ${profile.bestFor.join(', ')}`);
    }
    if (enrichWithHF) {
        try {
            const hfLine = await getHFEnrichmentLine(model.id);
            if (hfLine) {
                lines.push(hfLine);
            }
        }
        catch {
            // Enrichment is optional — a failure must not block the listing
        }
    }
    return lines.join('\n');
}
|
|
27
285
|
/**
|
|
28
286
|
* Fetch with a connect timeout so Claude doesn't hang when the host is offline.
|
|
29
287
|
*/
|
|
@@ -67,10 +325,14 @@ async function chatCompletionStreaming(messages, options = {}) {
|
|
|
67
325
|
temperature: options.temperature ?? DEFAULT_TEMPERATURE,
|
|
68
326
|
max_tokens: options.maxTokens ?? DEFAULT_MAX_TOKENS,
|
|
69
327
|
stream: true,
|
|
328
|
+
stream_options: { include_usage: true },
|
|
70
329
|
};
|
|
71
330
|
if (options.model || LM_MODEL) {
|
|
72
331
|
body.model = options.model || LM_MODEL;
|
|
73
332
|
}
|
|
333
|
+
if (options.responseFormat) {
|
|
334
|
+
body.response_format = options.responseFormat;
|
|
335
|
+
}
|
|
74
336
|
const startTime = Date.now();
|
|
75
337
|
const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/chat/completions`, { method: 'POST', headers: apiHeaders(), body: JSON.stringify(body) }, INFERENCE_CONNECT_TIMEOUT_MS);
|
|
76
338
|
if (!res.ok) {
|
|
@@ -88,6 +350,7 @@ async function chatCompletionStreaming(messages, options = {}) {
|
|
|
88
350
|
let finishReason = '';
|
|
89
351
|
let truncated = false;
|
|
90
352
|
let buffer = '';
|
|
353
|
+
let ttftMs;
|
|
91
354
|
try {
|
|
92
355
|
while (true) {
|
|
93
356
|
// Check soft timeout before each read
|
|
@@ -123,8 +386,11 @@ async function chatCompletionStreaming(messages, options = {}) {
|
|
|
123
386
|
if (json.model)
|
|
124
387
|
model = json.model;
|
|
125
388
|
const delta = json.choices?.[0]?.delta;
|
|
126
|
-
if (delta?.content)
|
|
389
|
+
if (delta?.content) {
|
|
390
|
+
if (ttftMs === undefined)
|
|
391
|
+
ttftMs = Date.now() - startTime;
|
|
127
392
|
content += delta.content;
|
|
393
|
+
}
|
|
128
394
|
const reason = json.choices?.[0]?.finish_reason;
|
|
129
395
|
if (reason)
|
|
130
396
|
finishReason = reason;
|
|
@@ -142,9 +408,33 @@ async function chatCompletionStreaming(messages, options = {}) {
|
|
|
142
408
|
// Release the reader — don't await cancel() as it can hang
|
|
143
409
|
reader.releaseLock();
|
|
144
410
|
}
|
|
145
|
-
|
|
411
|
+
const generationMs = Date.now() - startTime;
|
|
412
|
+
// Strip <think>...</think> reasoning blocks from models that always emit them
|
|
413
|
+
// (e.g. GLM Flash, Nemotron). Claude doesn't need the model's internal reasoning.
|
|
414
|
+
// Handle both closed blocks and unclosed ones (model ran out of tokens mid-think,
|
|
415
|
+
// or grammar-constrained output forced content before the closing tag).
|
|
416
|
+
let cleanContent = content.replace(/<think>[\s\S]*?<\/think>\s*/g, ''); // closed blocks
|
|
417
|
+
cleanContent = cleanContent.replace(/^<think>\s*/, ''); // orphaned opening tag
|
|
418
|
+
cleanContent = cleanContent.trim();
|
|
419
|
+
return { content: cleanContent, model, usage, finishReason, truncated, ttftMs, generationMs };
|
|
146
420
|
}
|
|
421
|
+
/**
|
|
422
|
+
* Fetch models from LM Studio's native v0 API first (richer metadata),
|
|
423
|
+
* falling back to the OpenAI-compatible v1 endpoint for non-LM-Studio hosts.
|
|
424
|
+
*/
|
|
147
425
|
async function listModelsRaw() {
|
|
426
|
+
// Try v0 API first — returns type, arch, publisher, quantization, state
|
|
427
|
+
try {
|
|
428
|
+
const v0 = await fetchWithTimeout(`${LM_BASE_URL}/api/v0/models`, { headers: apiHeaders() });
|
|
429
|
+
if (v0.ok) {
|
|
430
|
+
const data = (await v0.json());
|
|
431
|
+
return data.data;
|
|
432
|
+
}
|
|
433
|
+
}
|
|
434
|
+
catch {
|
|
435
|
+
// v0 not available — fall through to v1
|
|
436
|
+
}
|
|
437
|
+
// Fallback: OpenAI-compatible v1 endpoint (works with Ollama, vLLM, llama.cpp)
|
|
148
438
|
const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/models`, { headers: apiHeaders() });
|
|
149
439
|
if (!res.ok)
|
|
150
440
|
throw new Error(`Failed to list models: ${res.status}`);
|
|
@@ -152,22 +442,101 @@ async function listModelsRaw() {
|
|
|
152
442
|
return data.data;
|
|
153
443
|
}
|
|
154
444
|
/**
 * Context window to assume for a model.
 *
 * Takes the first field that is neither null nor undefined, in this order:
 * loaded_context_length (what is actually configured), max_context_length,
 * context_length, max_model_len. Falls back to FALLBACK_CONTEXT_LENGTH when
 * the model reports none of them.
 */
function getContextLength(model) {
    const reported = [
        model.loaded_context_length,
        model.max_context_length,
        model.context_length,
        model.max_model_len,
    ].find((value) => value != null);
    return reported ?? FALLBACK_CONTEXT_LENGTH;
}
|
|
449
|
+
/**
 * The model's `max_context_length` field, or undefined when it is absent.
 */
function getMaxContextLength(model) {
    const { max_context_length: maxContext } = model;
    return maxContext;
}
|
|
452
|
+
/**
 * Pick the loaded model best suited to a task type.
 *
 * Each candidate is scored: +10 when the task type is among its hinted best
 * task types, +5 for a "coder" family on code tasks, +2 for a context window
 * above 100000 on analysis tasks. The first highest-scoring model wins.
 * When the winner is not hinted for the task but a downloaded, not-loaded
 * model is, a human-readable `suggestion` is attached.
 *
 * Falls back to the configured default model (LM_MODEL, possibly '') when the
 * server is unreachable, returns no usable model list, or has nothing loaded.
 *
 * @param {string} taskType - e.g. 'chat', 'code', 'analysis', 'embedding'.
 * @returns {Promise<{modelId: string, hints: object, suggestion?: string}>}
 */
async function routeToModel(taskType) {
    const useDefault = () => ({ modelId: LM_MODEL || '', hints: getPromptHints(LM_MODEL) });
    const suitsTask = (hints) => (hints.bestTaskTypes ?? []).includes(taskType);
    let models;
    try {
        models = await listModelsRaw();
    }
    catch {
        // Can't reach server — fall back to default
        return useDefault();
    }
    // A server can answer OK without a `data` array; treat that as "no models"
    // instead of throwing on .filter below.
    if (!Array.isArray(models))
        return useDefault();
    const loaded = models.filter((m) => m.state === 'loaded' || !m.state);
    const available = models.filter((m) => m.state === 'not-loaded');
    if (loaded.length === 0)
        return useDefault();
    // An embeddings model cannot generate text, so it must not win a score tie
    // just by being listed first. Only consider it for non-embedding tasks
    // when nothing else is loaded.
    const generative = loaded.filter((m) => m.type !== 'embeddings');
    const candidates = taskType !== 'embedding' && generative.length > 0 ? generative : loaded;
    // Score each candidate for the requested task type
    let bestModel = candidates[0];
    let bestHints;
    let bestScore = -1;
    for (const model of candidates) {
        const hints = getPromptHints(model.id, model.arch);
        // Primary: is this task type in the model's best types?
        let score = suitsTask(hints) ? 10 : 0;
        // Bonus: code-specialised models get extra points for code tasks
        if (taskType === 'code' && getModelProfile(model)?.family.toLowerCase().includes('coder'))
            score += 5;
        // Bonus: larger context for analysis tasks
        if (taskType === 'analysis' && getContextLength(model) > 100000)
            score += 2;
        if (score > bestScore) {
            bestScore = score;
            bestModel = model;
            bestHints = hints; // keep the winner's hints instead of recomputing them
        }
    }
    const result = { modelId: bestModel.id, hints: bestHints };
    // If the best loaded model isn't ideal for this task, suggest a better available one.
    // We don't JIT-load because model loading takes minutes and the MCP SDK has a ~60s
    // hard timeout. Instead, suggest the user loads the better model in LM Studio.
    if (!suitsTask(bestHints)) {
        const better = available.find((m) => suitsTask(getPromptHints(m.id, m.arch)));
        if (better) {
            const label = taskType === 'code' ? 'code tasks'
                : taskType === 'analysis' ? 'analysis'
                    : taskType === 'embedding' ? 'embeddings'
                        : 'this kind of task';
            result.suggestion = `💡 ${better.id} is downloaded and better suited for ${label} — ask the user to load it in LM Studio.`;
        }
    }
    return result;
}
|
|
158
510
|
/**
 * Build the footer appended to a tool response.
 *
 * Segments, joined with " | ": model name, prompt→completion token counts,
 * perf stats (TTFT, tok/s, total seconds), the caller-supplied `extra` text,
 * a truncation warning, and the session summary. Returns '' when there is
 * nothing to show.
 *
 * Side effect: the response is passed to recordUsage() first, so the session
 * summary in the footer already includes this call.
 *
 * @param {object} resp - Response from the streaming completion helper.
 * @param {string} [extra] - Optional extra segment.
 * @returns {string}
 */
function formatFooter(resp, extra) {
    // Record usage for session tracking before formatting
    recordUsage(resp);
    const parts = [];
    if (resp.model)
        parts.push(`Model: ${resp.model}`);
    if (resp.usage) {
        // Some servers send a usage object without counts; show "?" rather
        // than the literal text "undefined".
        const { prompt_tokens: promptTokens, completion_tokens: completionTokens } = resp.usage;
        parts.push(`${promptTokens ?? '?'}→${completionTokens ?? '?'} tokens`);
    }
    // Perf stats — computed from streaming, no proprietary API needed
    const perfParts = [];
    if (resp.ttftMs !== undefined)
        perfParts.push(`TTFT: ${resp.ttftMs}ms`);
    if (resp.usage && resp.generationMs > 0) {
        const tokPerSec = resp.usage.completion_tokens / (resp.generationMs / 1000);
        // Skip the figure when the token count was missing (would print "NaN tok/s")
        if (Number.isFinite(tokPerSec))
            perfParts.push(`${tokPerSec.toFixed(1)} tok/s`);
    }
    if (resp.generationMs)
        perfParts.push(`${(resp.generationMs / 1000).toFixed(1)}s`);
    if (perfParts.length > 0)
        parts.push(perfParts.join(', '));
    if (extra)
        parts.push(extra);
    if (resp.truncated)
        parts.push('⚠ TRUNCATED (soft timeout — partial result)');
    const sessionLine = sessionSummary();
    if (sessionLine)
        parts.push(sessionLine);
    return parts.length > 0 ? `\n\n---\n${parts.join(' | ')}` : '';
}
|
|
173
542
|
// ── MCP Tool definitions ─────────────────────────────────────────────
|
|
@@ -177,6 +546,9 @@ const TOOLS = [
|
|
|
177
546
|
description: 'Send a task to a local LLM running on a separate machine. This is a FREE, parallel worker — ' +
|
|
178
547
|
'use it to offload bounded work while you continue doing other things. The local LLM runs independently ' +
|
|
179
548
|
'and does not consume your tokens or rate limits.\n\n' +
|
|
549
|
+
'PLANNING: When you start a large task (refactoring, migrations, test suites, documentation), ' +
|
|
550
|
+
'break it into steps and identify which ones are bounded grunt work you can delegate here. ' +
|
|
551
|
+
'The more you offload, the more tokens you save. The session footer tracks cumulative savings.\n\n' +
|
|
180
552
|
'WHEN TO USE (delegate generously — it costs nothing):\n' +
|
|
181
553
|
'• Explain or summarise code/docs you just read\n' +
|
|
182
554
|
'• Generate boilerplate, test stubs, type definitions, mock data\n' +
|
|
@@ -185,11 +557,17 @@ const TOOLS = [
|
|
|
185
557
|
'• Translate or reformat content (JSON↔YAML, snake_case↔camelCase)\n' +
|
|
186
558
|
'• Brainstorm approaches before you commit to one\n' +
|
|
187
559
|
'• Any self-contained subtask that does not need tool access\n\n' +
|
|
188
|
-
'
|
|
560
|
+
'PROMPT QUALITY (the local model is highly capable — results depend on your prompt):\n' +
|
|
189
561
|
'(1) Always send COMPLETE code/context — never truncate, the local LLM cannot access files.\n' +
|
|
190
562
|
'(2) Be explicit about output format ("respond as a JSON array", "return only the function").\n' +
|
|
191
|
-
'(3)
|
|
192
|
-
'
|
|
563
|
+
'(3) Set a specific persona in the system field — "Senior TypeScript dev" beats "helpful assistant".\n' +
|
|
564
|
+
'(4) State constraints: "no preamble", "reference line numbers", "max 5 bullet points".\n' +
|
|
565
|
+
'(5) For code generation, include the surrounding context (imports, types, function signatures).\n\n' +
|
|
566
|
+
'QA: Always review the local LLM\'s output before using it. Verify correctness, check edge cases, ' +
|
|
567
|
+
'and fix any issues. You are the architect — the local model is a fast drafter, not the final authority.\n\n' +
|
|
568
|
+
'ROUTING: If multiple models are loaded, houtini-lm automatically picks the best one for the task. ' +
|
|
569
|
+
'If a better model is downloaded but not loaded, you\'ll see a suggestion in the response footer. ' +
|
|
570
|
+
'Call discover to see what\'s available.',
|
|
193
571
|
inputSchema: {
|
|
194
572
|
type: 'object',
|
|
195
573
|
properties: {
|
|
@@ -209,6 +587,10 @@ const TOOLS = [
|
|
|
209
587
|
type: 'number',
|
|
210
588
|
description: 'Max response tokens. Default 2048. Use higher for code generation, lower for quick answers.',
|
|
211
589
|
},
|
|
590
|
+
json_schema: {
|
|
591
|
+
type: 'object',
|
|
592
|
+
description: 'Force structured JSON output. Provide a JSON Schema object and the response will be guaranteed valid JSON conforming to it. Example: {"name":"result","schema":{"type":"object","properties":{"answer":{"type":"string"}},"required":["answer"]}}',
|
|
593
|
+
},
|
|
212
594
|
},
|
|
213
595
|
required: ['message'],
|
|
214
596
|
},
|
|
@@ -217,14 +599,19 @@ const TOOLS = [
|
|
|
217
599
|
name: 'custom_prompt',
|
|
218
600
|
description: 'Structured analysis via the local LLM with explicit system/context/instruction separation. ' +
|
|
219
601
|
'This 3-part format prevents context bleed and gets the best results from local models.\n\n' +
|
|
602
|
+
'USE THIS for complex tasks where prompt structure matters — it consistently outperforms ' +
|
|
603
|
+
'stuffing everything into a single message. The separation helps the local model focus.\n\n' +
|
|
220
604
|
'WHEN TO USE:\n' +
|
|
221
605
|
'• Code review — paste full source, ask for bugs/improvements\n' +
|
|
222
606
|
'• Comparison — paste two implementations, ask which is better and why\n' +
|
|
223
607
|
'• Refactoring suggestions — paste code, ask for a cleaner version\n' +
|
|
224
608
|
'• Content analysis — paste text, ask for structure/tone/issues\n' +
|
|
225
609
|
'• Any task where separating context from instruction improves clarity\n\n' +
|
|
226
|
-
'
|
|
227
|
-
'
|
|
610
|
+
'PROMPT STRUCTURE (each field has a job — keep them focused):\n' +
|
|
611
|
+
'• System: persona + constraints, under 30 words. "Expert Python developer focused on performance and correctness."\n' +
|
|
612
|
+
'• Context: COMPLETE data. Full source code, full logs, full text. NEVER truncate or summarise.\n' +
|
|
613
|
+
'• Instruction: exactly what to produce, under 50 words. Specify format: "Return a JSON array of {line, issue, fix}."\n\n' +
|
|
614
|
+
'QA: Review the output. The local model is a capable drafter — verify its analysis before acting on it.',
|
|
228
615
|
inputSchema: {
|
|
229
616
|
type: 'object',
|
|
230
617
|
properties: {
|
|
@@ -248,6 +635,10 @@ const TOOLS = [
|
|
|
248
635
|
type: 'number',
|
|
249
636
|
description: 'Max response tokens. Default 2048.',
|
|
250
637
|
},
|
|
638
|
+
json_schema: {
|
|
639
|
+
type: 'object',
|
|
640
|
+
description: 'Force structured JSON output. Provide a JSON Schema object and the response will be guaranteed valid JSON conforming to it.',
|
|
641
|
+
},
|
|
251
642
|
},
|
|
252
643
|
required: ['instruction'],
|
|
253
644
|
},
|
|
@@ -255,13 +646,20 @@ const TOOLS = [
|
|
|
255
646
|
{
|
|
256
647
|
name: 'code_task',
|
|
257
648
|
description: 'Send a code analysis task to the local LLM. Wraps the request with an optimised code-review system prompt.\n\n' +
|
|
649
|
+
'This is the fastest way to offload code-specific work. Temperature is locked to 0.2 for ' +
|
|
650
|
+
'focused, deterministic output. The system prompt is pre-configured for code review.\n\n' +
|
|
258
651
|
'WHEN TO USE:\n' +
|
|
259
652
|
'• Explain what a function/class does\n' +
|
|
260
653
|
'• Find bugs or suggest improvements\n' +
|
|
261
654
|
'• Generate unit tests or type definitions for existing code\n' +
|
|
262
655
|
'• Add error handling, logging, or validation\n' +
|
|
263
656
|
'• Convert between languages or patterns\n\n' +
|
|
264
|
-
'
|
|
657
|
+
'GETTING BEST RESULTS:\n' +
|
|
658
|
+
'• Provide COMPLETE source code — the local LLM cannot read files.\n' +
|
|
659
|
+
'• Include imports and type definitions so the model has full context.\n' +
|
|
660
|
+
'• Be specific in the task: "Write 3 Jest tests for the error paths in fetchUser" beats "Write tests".\n' +
|
|
661
|
+
'• Set the language field — it shapes the system prompt and improves accuracy.\n\n' +
|
|
662
|
+
'QA: Always verify generated code compiles, handles edge cases, and follows project conventions.',
|
|
265
663
|
inputSchema: {
|
|
266
664
|
type: 'object',
|
|
267
665
|
properties: {
|
|
@@ -288,49 +686,91 @@ const TOOLS = [
|
|
|
288
686
|
{
|
|
289
687
|
name: 'discover',
|
|
290
688
|
description: 'Check whether the local LLM is online and what model is loaded. Returns model name, context window size, ' +
|
|
291
|
-
'
|
|
689
|
+
'response latency, and cumulative session stats (tokens offloaded so far). ' +
|
|
690
|
+
'Call this if you are unsure whether the local LLM is available before delegating work. ' +
|
|
292
691
|
'Fast — typically responds in under 1 second, or returns an offline status within 5 seconds if the host is unreachable.',
|
|
293
692
|
inputSchema: { type: 'object', properties: {} },
|
|
294
693
|
},
|
|
295
694
|
{
|
|
296
695
|
name: 'list_models',
|
|
297
|
-
description: 'List all models
|
|
298
|
-
'
|
|
696
|
+
description: 'List all models on the local LLM server — both loaded (ready) and available (downloaded but not active). ' +
|
|
697
|
+
'Shows rich metadata for each model: type (llm/vlm/embeddings), architecture, quantization, context window, ' +
|
|
698
|
+
'and a capability profile describing what the model is best at. ' +
|
|
699
|
+
'Use this to understand which models are available and suggest switching when a different model would suit the task better.',
|
|
299
700
|
inputSchema: { type: 'object', properties: {} },
|
|
300
701
|
},
|
|
702
|
+
{
|
|
703
|
+
name: 'embed',
|
|
704
|
+
description: 'Generate text embeddings via the local LLM server. Requires an embedding model to be loaded ' +
|
|
705
|
+
'(e.g. Nomic Embed). Returns a vector representation of the input text for semantic search, ' +
|
|
706
|
+
'similarity comparison, or RAG pipelines. Uses the OpenAI-compatible /v1/embeddings endpoint.',
|
|
707
|
+
inputSchema: {
|
|
708
|
+
type: 'object',
|
|
709
|
+
properties: {
|
|
710
|
+
input: {
|
|
711
|
+
type: 'string',
|
|
712
|
+
description: 'The text to embed. Can be a single string.',
|
|
713
|
+
},
|
|
714
|
+
model: {
|
|
715
|
+
type: 'string',
|
|
716
|
+
description: 'Embedding model ID. If omitted, uses whatever embedding model is loaded.',
|
|
717
|
+
},
|
|
718
|
+
},
|
|
719
|
+
required: ['input'],
|
|
720
|
+
},
|
|
721
|
+
},
|
|
301
722
|
];
|
|
302
723
|
// ── MCP Server ───────────────────────────────────────────────────────
|
|
303
|
-
const server = new Server({ name: 'houtini-lm', version: '2.
|
|
724
|
+
const server = new Server({ name: 'houtini-lm', version: '2.7.0' }, { capabilities: { tools: {} } });
|
|
304
725
|
server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: TOOLS }));
|
|
305
726
|
server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
306
727
|
const { name, arguments: args } = request.params;
|
|
307
728
|
try {
|
|
308
729
|
switch (name) {
|
|
309
730
|
case 'chat': {
|
|
310
|
-
const { message, system, temperature, max_tokens } = args;
|
|
731
|
+
const { message, system, temperature, max_tokens, json_schema } = args;
|
|
732
|
+
const route = await routeToModel('chat');
|
|
311
733
|
const messages = [];
|
|
312
|
-
if
|
|
313
|
-
|
|
734
|
+
// Inject output constraint into system prompt if the model needs it
|
|
735
|
+
const systemContent = system
|
|
736
|
+
? (route.hints.outputConstraint ? `${system}\n\n${route.hints.outputConstraint}` : system)
|
|
737
|
+
: (route.hints.outputConstraint || undefined);
|
|
738
|
+
if (systemContent)
|
|
739
|
+
messages.push({ role: 'system', content: systemContent });
|
|
314
740
|
messages.push({ role: 'user', content: message });
|
|
741
|
+
const responseFormat = json_schema
|
|
742
|
+
? { type: 'json_schema', json_schema: { name: json_schema.name, strict: json_schema.strict ?? true, schema: json_schema.schema } }
|
|
743
|
+
: undefined;
|
|
315
744
|
const resp = await chatCompletionStreaming(messages, {
|
|
316
|
-
temperature,
|
|
745
|
+
temperature: temperature ?? route.hints.chatTemp,
|
|
317
746
|
maxTokens: max_tokens,
|
|
747
|
+
model: route.modelId,
|
|
748
|
+
responseFormat,
|
|
318
749
|
});
|
|
319
750
|
const footer = formatFooter(resp);
|
|
320
751
|
return { content: [{ type: 'text', text: resp.content + footer }] };
|
|
321
752
|
}
|
|
322
753
|
case 'custom_prompt': {
|
|
323
|
-
const { system, context, instruction, temperature, max_tokens } = args;
|
|
754
|
+
const { system, context, instruction, temperature, max_tokens, json_schema } = args;
|
|
755
|
+
const route = await routeToModel('analysis');
|
|
324
756
|
const messages = [];
|
|
325
|
-
|
|
326
|
-
|
|
757
|
+
const systemContent = system
|
|
758
|
+
? (route.hints.outputConstraint ? `${system}\n\n${route.hints.outputConstraint}` : system)
|
|
759
|
+
: (route.hints.outputConstraint || undefined);
|
|
760
|
+
if (systemContent)
|
|
761
|
+
messages.push({ role: 'system', content: systemContent });
|
|
327
762
|
let userContent = instruction;
|
|
328
763
|
if (context)
|
|
329
764
|
userContent = `Context:\n${context}\n\nInstruction:\n${instruction}`;
|
|
330
765
|
messages.push({ role: 'user', content: userContent });
|
|
766
|
+
const responseFormat = json_schema
|
|
767
|
+
? { type: 'json_schema', json_schema: { name: json_schema.name, strict: json_schema.strict ?? true, schema: json_schema.schema } }
|
|
768
|
+
: undefined;
|
|
331
769
|
const resp = await chatCompletionStreaming(messages, {
|
|
332
|
-
temperature,
|
|
770
|
+
temperature: temperature ?? route.hints.chatTemp,
|
|
333
771
|
maxTokens: max_tokens,
|
|
772
|
+
model: route.modelId,
|
|
773
|
+
responseFormat,
|
|
334
774
|
});
|
|
335
775
|
const footer = formatFooter(resp);
|
|
336
776
|
return {
|
|
@@ -340,10 +780,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
340
780
|
case 'code_task': {
|
|
341
781
|
const { code, task, language, max_tokens: codeMaxTokens } = args;
|
|
342
782
|
const lang = language || 'unknown';
|
|
783
|
+
const route = await routeToModel('code');
|
|
784
|
+
const outputConstraint = route.hints.outputConstraint
|
|
785
|
+
? ` ${route.hints.outputConstraint}`
|
|
786
|
+
: '';
|
|
343
787
|
const codeMessages = [
|
|
344
788
|
{
|
|
345
789
|
role: 'system',
|
|
346
|
-
content: `Expert ${lang} developer. Analyse the provided code and complete the task. Be specific — reference line numbers, function names, and concrete fixes
|
|
790
|
+
content: `Expert ${lang} developer. Analyse the provided code and complete the task. Be specific — reference line numbers, function names, and concrete fixes.${outputConstraint}`,
|
|
347
791
|
},
|
|
348
792
|
{
|
|
349
793
|
role: 'user',
|
|
@@ -351,11 +795,13 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
351
795
|
},
|
|
352
796
|
];
|
|
353
797
|
const codeResp = await chatCompletionStreaming(codeMessages, {
|
|
354
|
-
temperature:
|
|
798
|
+
temperature: route.hints.codeTemp,
|
|
355
799
|
maxTokens: codeMaxTokens ?? DEFAULT_MAX_TOKENS,
|
|
800
|
+
model: route.modelId,
|
|
356
801
|
});
|
|
357
802
|
const codeFooter = formatFooter(codeResp, lang);
|
|
358
|
-
|
|
803
|
+
const suggestionLine = route.suggestion ? `\n${route.suggestion}` : '';
|
|
804
|
+
return { content: [{ type: 'text', text: codeResp.content + codeFooter + suggestionLine }] };
|
|
359
805
|
}
|
|
360
806
|
case 'discover': {
|
|
361
807
|
const start = Date.now();
|
|
@@ -384,36 +830,98 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
384
830
|
}],
|
|
385
831
|
};
|
|
386
832
|
}
|
|
387
|
-
const
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
});
|
|
391
|
-
const primary = models[0];
|
|
833
|
+
const loaded = models.filter((m) => m.state === 'loaded' || !m.state);
|
|
834
|
+
const available = models.filter((m) => m.state === 'not-loaded');
|
|
835
|
+
const primary = loaded[0] || models[0];
|
|
392
836
|
const ctx = getContextLength(primary);
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
837
|
+
const primaryProfile = await getModelProfileAsync(primary);
|
|
838
|
+
const sessionStats = session.calls > 0
|
|
839
|
+
? `\nSession stats: ${(session.promptTokens + session.completionTokens).toLocaleString()} tokens offloaded across ${session.calls} call${session.calls === 1 ? '' : 's'}`
|
|
840
|
+
: '\nSession stats: no calls yet — delegate tasks to start saving tokens';
|
|
841
|
+
let text = `Status: ONLINE\n` +
|
|
842
|
+
`Endpoint: ${LM_BASE_URL}\n` +
|
|
843
|
+
`Latency: ${ms}ms\n` +
|
|
844
|
+
`Active model: ${primary.id}\n` +
|
|
845
|
+
`Context window: ${ctx.toLocaleString()} tokens\n`;
|
|
846
|
+
if (primaryProfile) {
|
|
847
|
+
text += `Family: ${primaryProfile.family}\n`;
|
|
848
|
+
text += `Description: ${primaryProfile.description}\n`;
|
|
849
|
+
text += `Best for: ${primaryProfile.bestFor.join(', ')}\n`;
|
|
850
|
+
text += `Strengths: ${primaryProfile.strengths.join(', ')}\n`;
|
|
851
|
+
if (primaryProfile.weaknesses.length > 0) {
|
|
852
|
+
text += `Weaknesses: ${primaryProfile.weaknesses.join(', ')}\n`;
|
|
853
|
+
}
|
|
854
|
+
}
|
|
855
|
+
if (loaded.length > 0) {
|
|
856
|
+
text += `\nLoaded models (● ready to use):\n`;
|
|
857
|
+
text += (await Promise.all(loaded.map((m) => formatModelDetail(m)))).join('\n\n');
|
|
858
|
+
}
|
|
859
|
+
if (available.length > 0) {
|
|
860
|
+
text += `\n\nAvailable models (○ downloaded, not loaded — can be activated in LM Studio):\n`;
|
|
861
|
+
text += (await Promise.all(available.map((m) => formatModelDetail(m)))).join('\n\n');
|
|
862
|
+
}
|
|
863
|
+
// Per-model performance stats from this session
|
|
864
|
+
if (session.modelStats.size > 0) {
|
|
865
|
+
text += `\n\nPerformance (this session):\n`;
|
|
866
|
+
for (const [modelId, stats] of session.modelStats) {
|
|
867
|
+
const avgTtft = stats.calls > 0 ? Math.round(stats.totalTtftMs / stats.calls) : 0;
|
|
868
|
+
const avgTokSec = stats.perfCalls > 0 ? (stats.totalTokPerSec / stats.perfCalls).toFixed(1) : '?';
|
|
869
|
+
text += ` ${modelId}: ${stats.calls} calls, avg TTFT ${avgTtft}ms, avg ${avgTokSec} tok/s\n`;
|
|
870
|
+
}
|
|
871
|
+
}
|
|
872
|
+
text += `${sessionStats}\n\n`;
|
|
873
|
+
text += `The local LLM is available. You can delegate tasks using chat, custom_prompt, code_task, or embed.`;
|
|
874
|
+
return { content: [{ type: 'text', text }] };
|
|
405
875
|
}
|
|
406
876
|
case 'list_models': {
|
|
407
877
|
const models = await listModelsRaw();
|
|
408
878
|
if (!models.length) {
|
|
409
|
-
return { content: [{ type: 'text', text: 'No models currently loaded.' }] };
|
|
879
|
+
return { content: [{ type: 'text', text: 'No models currently loaded or available.' }] };
|
|
410
880
|
}
|
|
411
|
-
const
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
881
|
+
const loaded = models.filter((m) => m.state === 'loaded' || !m.state);
|
|
882
|
+
const available = models.filter((m) => m.state === 'not-loaded');
|
|
883
|
+
let text = '';
|
|
884
|
+
// list_models enriches with HuggingFace data (cached after first call)
|
|
885
|
+
if (loaded.length > 0) {
|
|
886
|
+
text += `Loaded models (● ready to use):\n\n`;
|
|
887
|
+
text += (await Promise.all(loaded.map((m) => formatModelDetail(m, true)))).join('\n\n');
|
|
888
|
+
}
|
|
889
|
+
if (available.length > 0) {
|
|
890
|
+
if (text)
|
|
891
|
+
text += '\n\n';
|
|
892
|
+
text += `Available models (○ downloaded, not loaded):\n\n`;
|
|
893
|
+
text += (await Promise.all(available.map((m) => formatModelDetail(m, true)))).join('\n\n');
|
|
894
|
+
}
|
|
895
|
+
return { content: [{ type: 'text', text }] };
|
|
896
|
+
}
|
|
897
|
+
case 'embed': {
|
|
898
|
+
const { input, model: embedModel } = args;
|
|
899
|
+
const embedBody = { input };
|
|
900
|
+
if (embedModel) {
|
|
901
|
+
embedBody.model = embedModel;
|
|
902
|
+
}
|
|
903
|
+
const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/embeddings`, { method: 'POST', headers: apiHeaders(), body: JSON.stringify(embedBody) }, INFERENCE_CONNECT_TIMEOUT_MS);
|
|
904
|
+
if (!res.ok) {
|
|
905
|
+
const errText = await res.text().catch(() => '');
|
|
906
|
+
throw new Error(`Embeddings API error ${res.status}: ${errText}`);
|
|
907
|
+
}
|
|
908
|
+
const data = (await res.json());
|
|
909
|
+
const embedding = data.data[0]?.embedding;
|
|
910
|
+
if (!embedding)
|
|
911
|
+
throw new Error('No embedding returned');
|
|
912
|
+
const usageInfo = data.usage
|
|
913
|
+
? `${data.usage.prompt_tokens} tokens embedded`
|
|
914
|
+
: '';
|
|
415
915
|
return {
|
|
416
|
-
content: [{
|
|
916
|
+
content: [{
|
|
917
|
+
type: 'text',
|
|
918
|
+
text: JSON.stringify({
|
|
919
|
+
model: data.model,
|
|
920
|
+
dimensions: embedding.length,
|
|
921
|
+
embedding,
|
|
922
|
+
usage: usageInfo,
|
|
923
|
+
}),
|
|
924
|
+
}],
|
|
417
925
|
};
|
|
418
926
|
}
|
|
419
927
|
default:
|
|
@@ -431,6 +939,11 @@ async function main() {
|
|
|
431
939
|
const transport = new StdioServerTransport();
|
|
432
940
|
await server.connect(transport);
|
|
433
941
|
process.stderr.write(`Houtini LM server running (${LM_BASE_URL})\n`);
|
|
942
|
+
// Background: profile all available models via HF → SQLite cache
|
|
943
|
+
// Non-blocking — server is already accepting requests
|
|
944
|
+
listModelsRaw()
|
|
945
|
+
.then((models) => profileModelsAtStartup(models))
|
|
946
|
+
.catch((err) => process.stderr.write(`[houtini-lm] Startup profiling skipped: ${err}\n`));
|
|
434
947
|
}
|
|
435
948
|
main().catch((error) => {
|
|
436
949
|
process.stderr.write(`Fatal error: ${error}\n`);
|