@houtini/lm 2.4.1 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -8,6 +8,7 @@
8
8
  import { Server } from '@modelcontextprotocol/sdk/server/index.js';
9
9
  import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
10
10
  import { CallToolRequestSchema, ListToolsRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
11
+ import { profileModelsAtStartup, getCachedProfile, toModelProfile as cachedToProfile, getHFEnrichmentLine, getPromptHints, } from './model-cache.js';
11
12
  const LM_BASE_URL = process.env.LM_STUDIO_URL || 'http://localhost:1234';
12
13
  const LM_MODEL = process.env.LM_STUDIO_MODEL || '';
13
14
  const LM_PASSWORD = process.env.LM_STUDIO_PASSWORD || '';
@@ -26,12 +27,29 @@ const session = {
26
27
  calls: 0,
27
28
  promptTokens: 0,
28
29
  completionTokens: 0,
30
+ /** Per-model performance tracking for routing insights */
31
+ modelStats: new Map(),
29
32
  };
30
- function recordUsage(usage) {
33
+ function recordUsage(resp) {
31
34
  session.calls++;
32
- if (usage) {
33
- session.promptTokens += usage.prompt_tokens;
34
- session.completionTokens += usage.completion_tokens;
35
+ if (resp.usage) {
36
+ session.promptTokens += resp.usage.prompt_tokens;
37
+ session.completionTokens += resp.usage.completion_tokens;
38
+ }
39
+ // Track per-model perf stats
40
+ if (resp.model) {
41
+ const existing = session.modelStats.get(resp.model) || { calls: 0, perfCalls: 0, totalTtftMs: 0, totalTokPerSec: 0 };
42
+ existing.calls++;
43
+ if (resp.ttftMs)
44
+ existing.totalTtftMs += resp.ttftMs;
45
+ const tokPerSec = resp.usage && resp.generationMs > 0
46
+ ? (resp.usage.completion_tokens / (resp.generationMs / 1000))
47
+ : 0;
48
+ if (tokPerSec > 0) {
49
+ existing.perfCalls++;
50
+ existing.totalTokPerSec += tokPerSec;
51
+ }
52
+ session.modelStats.set(resp.model, existing);
35
53
  }
36
54
  }
37
55
  function sessionSummary() {
@@ -46,6 +64,224 @@ function apiHeaders() {
46
64
  h['Authorization'] = `Bearer ${LM_PASSWORD}`;
47
65
  return h;
48
66
  }
67
+ const MODEL_PROFILES = [
68
+ {
69
+ pattern: /nemotron|nemotron_h_moe/i,
70
+ profile: {
71
+ family: 'NVIDIA Nemotron',
72
+ description: 'NVIDIA\'s compact reasoning model optimised for accurate, structured responses. Strong at step-by-step logic and instruction following.',
73
+ strengths: ['logical reasoning', 'math', 'step-by-step problem solving', 'code review', 'structured output'],
74
+ weaknesses: ['creative writing', 'constrained generation', 'factual knowledge on niche topics'],
75
+ bestFor: ['analysis tasks', 'code bug-finding', 'math/science questions', 'data transformation'],
76
+ },
77
+ },
78
+ {
79
+ pattern: /granite|granitehybrid/i,
80
+ profile: {
81
+ family: 'IBM Granite',
82
+ description: 'IBM\'s enterprise-focused model family. Compact and efficient, designed for business and code tasks with strong instruction following.',
83
+ strengths: ['code generation', 'instruction following', 'enterprise tasks', 'efficiency'],
84
+ weaknesses: ['creative tasks', 'long-form generation'],
85
+ bestFor: ['boilerplate generation', 'code explanation', 'structured Q&A'],
86
+ },
87
+ },
88
+ {
89
+ pattern: /qwen3-coder|qwen3.*coder/i,
90
+ profile: {
91
+ family: 'Qwen3 Coder',
92
+ description: 'Alibaba\'s code-specialised model with agentic capabilities. Excellent at code generation, review, and multi-step coding tasks.',
93
+ strengths: ['code generation', 'code review', 'debugging', 'test writing', 'refactoring', 'multi-step reasoning'],
94
+ weaknesses: ['non-code creative tasks'],
95
+ bestFor: ['code generation', 'code review', 'test stubs', 'type definitions', 'refactoring'],
96
+ },
97
+ },
98
+ {
99
+ pattern: /qwen3-vl|qwen.*vl/i,
100
+ profile: {
101
+ family: 'Qwen3 Vision-Language',
102
+ description: 'Alibaba\'s multimodal model handling both text and image inputs. Can analyse screenshots, diagrams, and visual content.',
103
+ strengths: ['image understanding', 'visual Q&A', 'diagram analysis', 'OCR'],
104
+ weaknesses: ['pure text tasks (use a text-only model instead)'],
105
+ bestFor: ['screenshot analysis', 'UI review', 'diagram interpretation'],
106
+ },
107
+ },
108
+ {
109
+ pattern: /qwen3(?!.*coder)(?!.*vl)/i,
110
+ profile: {
111
+ family: 'Qwen3',
112
+ description: 'Alibaba\'s general-purpose model with strong multilingual and reasoning capabilities. Good all-rounder.',
113
+ strengths: ['general reasoning', 'multilingual', 'code', 'instruction following'],
114
+ weaknesses: ['specialised code tasks (use Qwen3 Coder instead)'],
115
+ bestFor: ['general Q&A', 'translation', 'summarisation', 'brainstorming'],
116
+ },
117
+ },
118
+ {
119
+ pattern: /llama[- ]?3/i,
120
+ profile: {
121
+ family: 'Meta LLaMA 3',
122
+ description: 'Meta\'s open-weight general-purpose model. Strong baseline across tasks with large community fine-tune ecosystem.',
123
+ strengths: ['general reasoning', 'code', 'instruction following', 'broad knowledge'],
124
+ weaknesses: ['specialised tasks where fine-tuned models excel'],
125
+ bestFor: ['general delegation', 'drafting', 'code review', 'Q&A'],
126
+ },
127
+ },
128
+ {
129
+ pattern: /minimax[- ]?m2/i,
130
+ profile: {
131
+ family: 'MiniMax M2',
132
+ description: 'MiniMax\'s large MoE model with strong long-context and reasoning capabilities.',
133
+ strengths: ['long context', 'reasoning', 'creative writing', 'multilingual'],
134
+ weaknesses: ['may be slower due to model size'],
135
+ bestFor: ['long document analysis', 'creative tasks', 'complex reasoning'],
136
+ },
137
+ },
138
+ {
139
+ pattern: /kimi[- ]?k2/i,
140
+ profile: {
141
+ family: 'Kimi K2',
142
+ description: 'Moonshot AI\'s large MoE model with strong agentic and tool-use capabilities.',
143
+ strengths: ['agentic tasks', 'tool use', 'code', 'reasoning', 'long context'],
144
+ weaknesses: ['may be slower due to model size'],
145
+ bestFor: ['complex multi-step tasks', 'code generation', 'reasoning chains'],
146
+ },
147
+ },
148
+ {
149
+ pattern: /gpt-oss/i,
150
+ profile: {
151
+ family: 'OpenAI GPT-OSS',
152
+ description: 'OpenAI\'s open-source model release. General-purpose with strong instruction following.',
153
+ strengths: ['instruction following', 'general reasoning', 'code'],
154
+ weaknesses: ['less tested in open ecosystem than LLaMA/Qwen'],
155
+ bestFor: ['general delegation', 'code tasks', 'Q&A'],
156
+ },
157
+ },
158
+ {
159
+ pattern: /glm[- ]?4/i,
160
+ profile: {
161
+ family: 'GLM-4',
162
+ description: 'Zhipu AI\'s open-weight MoE model. Fast inference with strong general reasoning, multilingual support, and tool-use capabilities. Uses chain-of-thought reasoning internally. MIT licensed.',
163
+ strengths: ['fast inference', 'general reasoning', 'tool use', 'multilingual', 'code', 'instruction following', 'chain-of-thought'],
164
+ weaknesses: ['always emits internal reasoning (stripped automatically)', 'less tested in English-only benchmarks than LLaMA/Qwen'],
165
+ bestFor: ['general delegation', 'fast drafting', 'code tasks', 'structured output', 'Q&A'],
166
+ },
167
+ },
168
+ {
169
+ pattern: /nomic.*embed|embed.*nomic/i,
170
+ profile: {
171
+ family: 'Nomic Embed',
172
+ description: 'Text embedding model for semantic search and similarity. Not a chat model — produces vector embeddings.',
173
+ strengths: ['text embeddings', 'semantic search', 'clustering'],
174
+ weaknesses: ['cannot chat or generate text'],
175
+ bestFor: ['RAG pipelines', 'semantic similarity', 'document search'],
176
+ },
177
+ },
178
+ {
179
+ pattern: /abliterated/i,
180
+ profile: {
181
+ family: 'Abliterated (uncensored)',
182
+ description: 'Community fine-tune with safety guardrails removed. More permissive but may produce lower-quality or unreliable output.',
183
+ strengths: ['fewer refusals', 'unconstrained generation'],
184
+ weaknesses: ['may hallucinate more', 'no safety filtering', 'less tested'],
185
+ bestFor: ['tasks where the base model refuses unnecessarily'],
186
+ },
187
+ },
188
+ ];
189
+ /**
190
+ * Match a model to its known profile.
191
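+ * Checks only the static, curated MODEL_PROFILES (first against model.id, then against model.arch); the SQLite cache (auto-generated from HF) is consulted by getModelProfileAsync, not here.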
192
+ */
193
+ function getModelProfile(model) {
194
+ // Try static profiles first (curated, most reliable)
195
+ for (const { pattern, profile } of MODEL_PROFILES) {
196
+ if (pattern.test(model.id))
197
+ return profile;
198
+ }
199
+ if (model.arch) {
200
+ for (const { pattern, profile } of MODEL_PROFILES) {
201
+ if (pattern.test(model.arch))
202
+ return profile;
203
+ }
204
+ }
205
+ return undefined;
206
+ }
207
+ /**
208
+ * Async version that also checks SQLite cache for auto-generated profiles.
209
+ * Use this when you need the most complete profile available.
210
+ */
211
+ async function getModelProfileAsync(model) {
212
+ // Static profiles take priority
213
+ const staticProfile = getModelProfile(model);
214
+ if (staticProfile)
215
+ return staticProfile;
216
+ // Check SQLite cache for auto-generated profile
217
+ try {
218
+ const cached = await getCachedProfile(model.id);
219
+ if (cached) {
220
+ const profile = cachedToProfile(cached);
221
+ if (profile)
222
+ return profile;
223
+ }
224
+ }
225
+ catch {
226
+ // Cache lookup failed — fall through
227
+ }
228
+ return undefined;
229
+ }
230
+ /**
231
+ * Format a single model's full metadata for display.
232
+ * Async because it may fetch HuggingFace enrichment data.
233
+ */
234
+ async function formatModelDetail(model, enrichWithHF = false) {
235
+ const ctx = getContextLength(model);
236
+ const maxCtx = getMaxContextLength(model);
237
+ // Use async profile lookup (checks static + SQLite cache)
238
+ const profile = await getModelProfileAsync(model);
239
+ const parts = [];
240
+ // Header line
241
+ parts.push(` ${model.state === 'loaded' ? '●' : '○'} ${model.id}`);
242
+ // Metadata line
243
+ const meta = [];
244
+ if (model.type)
245
+ meta.push(`type: ${model.type}`);
246
+ if (model.arch)
247
+ meta.push(`arch: ${model.arch}`);
248
+ if (model.quantization)
249
+ meta.push(`quant: ${model.quantization}`);
250
+ if (model.compatibility_type)
251
+ meta.push(`format: ${model.compatibility_type}`);
252
+ // Show loaded context vs max context when both are available and different
253
+ if (model.loaded_context_length && maxCtx && model.loaded_context_length !== maxCtx) {
254
+ meta.push(`context: ${model.loaded_context_length.toLocaleString()} (max ${maxCtx.toLocaleString()})`);
255
+ }
256
+ else if (ctx) {
257
+ meta.push(`context: ${ctx.toLocaleString()}`);
258
+ }
259
+ if (model.publisher)
260
+ meta.push(`by: ${model.publisher}`);
261
+ if (meta.length > 0)
262
+ parts.push(` ${meta.join(' · ')}`);
263
+ // Capabilities
264
+ if (model.capabilities && model.capabilities.length > 0) {
265
+ parts.push(` Capabilities: ${model.capabilities.join(', ')}`);
266
+ }
267
+ // Profile info (static or auto-generated from SQLite cache)
268
+ if (profile) {
269
+ parts.push(` ${profile.family}: ${profile.description}`);
270
+ parts.push(` Best for: ${profile.bestFor.join(', ')}`);
271
+ }
272
+ // HuggingFace enrichment line from SQLite cache
273
+ if (enrichWithHF) {
274
+ try {
275
+ const hfLine = await getHFEnrichmentLine(model.id);
276
+ if (hfLine)
277
+ parts.push(hfLine);
278
+ }
279
+ catch {
280
+ // HF enrichment is best-effort — never block on failure
281
+ }
282
+ }
283
+ return parts.join('\n');
284
+ }
49
285
  /**
50
286
  * Fetch with a connect timeout so Claude doesn't hang when the host is offline.
51
287
  */
@@ -89,10 +325,14 @@ async function chatCompletionStreaming(messages, options = {}) {
89
325
  temperature: options.temperature ?? DEFAULT_TEMPERATURE,
90
326
  max_tokens: options.maxTokens ?? DEFAULT_MAX_TOKENS,
91
327
  stream: true,
328
+ stream_options: { include_usage: true },
92
329
  };
93
330
  if (options.model || LM_MODEL) {
94
331
  body.model = options.model || LM_MODEL;
95
332
  }
333
+ if (options.responseFormat) {
334
+ body.response_format = options.responseFormat;
335
+ }
96
336
  const startTime = Date.now();
97
337
  const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/chat/completions`, { method: 'POST', headers: apiHeaders(), body: JSON.stringify(body) }, INFERENCE_CONNECT_TIMEOUT_MS);
98
338
  if (!res.ok) {
@@ -110,6 +350,7 @@ async function chatCompletionStreaming(messages, options = {}) {
110
350
  let finishReason = '';
111
351
  let truncated = false;
112
352
  let buffer = '';
353
+ let ttftMs;
113
354
  try {
114
355
  while (true) {
115
356
  // Check soft timeout before each read
@@ -145,8 +386,11 @@ async function chatCompletionStreaming(messages, options = {}) {
145
386
  if (json.model)
146
387
  model = json.model;
147
388
  const delta = json.choices?.[0]?.delta;
148
- if (delta?.content)
389
+ if (delta?.content) {
390
+ if (ttftMs === undefined)
391
+ ttftMs = Date.now() - startTime;
149
392
  content += delta.content;
393
+ }
150
394
  const reason = json.choices?.[0]?.finish_reason;
151
395
  if (reason)
152
396
  finishReason = reason;
@@ -164,9 +408,33 @@ async function chatCompletionStreaming(messages, options = {}) {
164
408
  // Release the reader — don't await cancel() as it can hang
165
409
  reader.releaseLock();
166
410
  }
167
- return { content, model, usage, finishReason, truncated };
411
+ const generationMs = Date.now() - startTime;
412
+ // Strip <think>...</think> reasoning blocks from models that always emit them
413
+ // (e.g. GLM Flash, Nemotron). Claude doesn't need the model's internal reasoning.
414
+ // Closed blocks are removed entirely. For an unclosed block (model ran out of tokens mid-think,
415
+ // or grammar-constrained output forced content before the closing tag) only a leading <think> tag is removed; the text after it is kept.
416
+ let cleanContent = content.replace(/<think>[\s\S]*?<\/think>\s*/g, ''); // closed blocks
417
+ cleanContent = cleanContent.replace(/^<think>\s*/, ''); // orphaned opening tag
418
+ cleanContent = cleanContent.trim();
419
+ return { content: cleanContent, model, usage, finishReason, truncated, ttftMs, generationMs };
168
420
  }
421
+ /**
422
+ * Fetch models from LM Studio's native v0 API first (richer metadata),
423
+ * falling back to the OpenAI-compatible v1 endpoint for non-LM-Studio hosts.
424
+ */
169
425
  async function listModelsRaw() {
426
+ // Try v0 API first — returns type, arch, publisher, quantization, state
427
+ try {
428
+ const v0 = await fetchWithTimeout(`${LM_BASE_URL}/api/v0/models`, { headers: apiHeaders() });
429
+ if (v0.ok) {
430
+ const data = (await v0.json());
431
+ return data.data;
432
+ }
433
+ }
434
+ catch {
435
+ // v0 not available — fall through to v1
436
+ }
437
+ // Fallback: OpenAI-compatible v1 endpoint (works with Ollama, vLLM, llama.cpp)
170
438
  const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/models`, { headers: apiHeaders() });
171
439
  if (!res.ok)
172
440
  throw new Error(`Failed to list models: ${res.status}`);
@@ -174,20 +442,94 @@ async function listModelsRaw() {
174
442
  return data.data;
175
443
  }
176
444
  function getContextLength(model) {
177
- // LM Studio uses context_length, vLLM uses max_model_len, fall back to env/100k
178
- return model.context_length ?? model.max_model_len ?? FALLBACK_CONTEXT_LENGTH;
445
+ // Prefer loaded_context_length (actual configured context) over max_context_length (theoretical max)
446
+ // v0 API: loaded_context_length / max_context_length, v1: context_length, vLLM: max_model_len
447
+ return model.loaded_context_length ?? model.max_context_length ?? model.context_length ?? model.max_model_len ?? FALLBACK_CONTEXT_LENGTH;
448
+ }
449
+ function getMaxContextLength(model) {
450
+ return model.max_context_length;
451
+ }
452
+ async function routeToModel(taskType) {
453
+ let models;
454
+ try {
455
+ models = await listModelsRaw();
456
+ }
457
+ catch {
458
+ // Can't reach server — fall back to default
459
+ const hints = getPromptHints(LM_MODEL);
460
+ return { modelId: LM_MODEL || '', hints };
461
+ }
462
+ const loaded = models.filter((m) => m.state === 'loaded' || !m.state);
463
+ const available = models.filter((m) => m.state === 'not-loaded');
464
+ if (loaded.length === 0) {
465
+ const hints = getPromptHints(LM_MODEL);
466
+ return { modelId: LM_MODEL || '', hints };
467
+ }
468
+ // Score each loaded model for the requested task type
469
+ let bestModel = loaded[0];
470
+ let bestScore = -1;
471
+ for (const model of loaded) {
472
+ const hints = getPromptHints(model.id, model.arch);
473
+ // Primary: is this task type in the model's best types?
474
+ let score = (hints.bestTaskTypes ?? []).includes(taskType) ? 10 : 0;
475
+ // Bonus: code-specialised models get extra points for code tasks
476
+ const profile = getModelProfile(model);
477
+ if (taskType === 'code' && profile?.family.toLowerCase().includes('coder'))
478
+ score += 5;
479
+ // Bonus: larger context for analysis tasks
480
+ if (taskType === 'analysis') {
481
+ const ctx = getContextLength(model);
482
+ if (ctx && ctx > 100000)
483
+ score += 2;
484
+ }
485
+ if (score > bestScore) {
486
+ bestScore = score;
487
+ bestModel = model;
488
+ }
489
+ }
490
+ const hints = getPromptHints(bestModel.id, bestModel.arch);
491
+ const result = { modelId: bestModel.id, hints };
492
+ // If the best loaded model isn't ideal for this task, suggest a better available one.
493
+ // We don't JIT-load because model loading takes minutes and the MCP SDK has a ~60s
494
+ // hard timeout. Instead, suggest the user loads the better model in LM Studio.
495
+ if (!(hints.bestTaskTypes ?? []).includes(taskType)) {
496
+ const better = available.find((m) => {
497
+ const mHints = getPromptHints(m.id, m.arch);
498
+ return (mHints.bestTaskTypes ?? []).includes(taskType);
499
+ });
500
+ if (better) {
501
+ const label = taskType === 'code' ? 'code tasks'
502
+ : taskType === 'analysis' ? 'analysis'
503
+ : taskType === 'embedding' ? 'embeddings'
504
+ : 'this kind of task';
505
+ result.suggestion = `💡 ${better.id} is downloaded and better suited for ${label} — ask the user to load it in LM Studio.`;
506
+ }
507
+ }
508
+ return result;
179
509
  }
180
510
  /**
181
511
  * Format a footer line for streaming results showing model, usage, and truncation status.
182
512
  */
183
513
  function formatFooter(resp, extra) {
184
514
  // Record usage for session tracking before formatting
185
- recordUsage(resp.usage);
515
+ recordUsage(resp);
186
516
  const parts = [];
187
517
  if (resp.model)
188
518
  parts.push(`Model: ${resp.model}`);
189
519
  if (resp.usage)
190
- parts.push(`This call: ${resp.usage.prompt_tokens}→${resp.usage.completion_tokens} tokens`);
520
+ parts.push(`${resp.usage.prompt_tokens}→${resp.usage.completion_tokens} tokens`);
521
+ // Perf stats — computed from streaming, no proprietary API needed
522
+ const perfParts = [];
523
+ if (resp.ttftMs !== undefined)
524
+ perfParts.push(`TTFT: ${resp.ttftMs}ms`);
525
+ if (resp.usage && resp.generationMs > 0) {
526
+ const tokPerSec = resp.usage.completion_tokens / (resp.generationMs / 1000);
527
+ perfParts.push(`${tokPerSec.toFixed(1)} tok/s`);
528
+ }
529
+ if (resp.generationMs)
530
+ perfParts.push(`${(resp.generationMs / 1000).toFixed(1)}s`);
531
+ if (perfParts.length > 0)
532
+ parts.push(perfParts.join(', '));
191
533
  if (extra)
192
534
  parts.push(extra);
193
535
  if (resp.truncated)
@@ -223,7 +565,9 @@ const TOOLS = [
223
565
  '(5) For code generation, include the surrounding context (imports, types, function signatures).\n\n' +
224
566
  'QA: Always review the local LLM\'s output before using it. Verify correctness, check edge cases, ' +
225
567
  'and fix any issues. You are the architect — the local model is a fast drafter, not the final authority.\n\n' +
226
- 'The local model, context window, and speed vary — call the discover tool to check what is loaded.',
568
+ 'ROUTING: If multiple models are loaded, houtini-lm automatically picks the best one for the task. ' +
569
+ 'If a better model is downloaded but not loaded, you\'ll see a suggestion in the response footer. ' +
570
+ 'Call discover to see what\'s available.',
227
571
  inputSchema: {
228
572
  type: 'object',
229
573
  properties: {
@@ -243,6 +587,10 @@ const TOOLS = [
243
587
  type: 'number',
244
588
  description: 'Max response tokens. Default 2048. Use higher for code generation, lower for quick answers.',
245
589
  },
590
+ json_schema: {
591
+ type: 'object',
592
+ description: 'Force structured JSON output. Provide a JSON Schema object and the response will be guaranteed valid JSON conforming to it. Example: {"name":"result","schema":{"type":"object","properties":{"answer":{"type":"string"}},"required":["answer"]}}',
593
+ },
246
594
  },
247
595
  required: ['message'],
248
596
  },
@@ -287,6 +635,10 @@ const TOOLS = [
287
635
  type: 'number',
288
636
  description: 'Max response tokens. Default 2048.',
289
637
  },
638
+ json_schema: {
639
+ type: 'object',
640
+ description: 'Force structured JSON output. Provide a JSON Schema object and the response will be guaranteed valid JSON conforming to it.',
641
+ },
290
642
  },
291
643
  required: ['instruction'],
292
644
  },
@@ -341,43 +693,84 @@ const TOOLS = [
341
693
  },
342
694
  {
343
695
  name: 'list_models',
344
- description: 'List all models currently loaded in the local LLM server, with context window sizes. ' +
345
- 'Use discover instead for a quick availability check.',
696
+ description: 'List all models on the local LLM server both loaded (ready) and available (downloaded but not active). ' +
697
+ 'Shows rich metadata for each model: type (llm/vlm/embeddings), architecture, quantization, context window, ' +
698
+ 'and a capability profile describing what the model is best at. ' +
699
+ 'Use this to understand which models are available and suggest switching when a different model would suit the task better.',
346
700
  inputSchema: { type: 'object', properties: {} },
347
701
  },
702
+ {
703
+ name: 'embed',
704
+ description: 'Generate text embeddings via the local LLM server. Requires an embedding model to be loaded ' +
705
+ '(e.g. Nomic Embed). Returns a vector representation of the input text for semantic search, ' +
706
+ 'similarity comparison, or RAG pipelines. Uses the OpenAI-compatible /v1/embeddings endpoint.',
707
+ inputSchema: {
708
+ type: 'object',
709
+ properties: {
710
+ input: {
711
+ type: 'string',
712
+ description: 'The text to embed. Can be a single string.',
713
+ },
714
+ model: {
715
+ type: 'string',
716
+ description: 'Embedding model ID. If omitted, uses whatever embedding model is loaded.',
717
+ },
718
+ },
719
+ required: ['input'],
720
+ },
721
+ },
348
722
  ];
349
723
  // ── MCP Server ───────────────────────────────────────────────────────
350
- const server = new Server({ name: 'houtini-lm', version: '2.4.1' }, { capabilities: { tools: {} } });
724
+ const server = new Server({ name: 'houtini-lm', version: '2.7.0' }, { capabilities: { tools: {} } });
351
725
  server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: TOOLS }));
352
726
  server.setRequestHandler(CallToolRequestSchema, async (request) => {
353
727
  const { name, arguments: args } = request.params;
354
728
  try {
355
729
  switch (name) {
356
730
  case 'chat': {
357
- const { message, system, temperature, max_tokens } = args;
731
+ const { message, system, temperature, max_tokens, json_schema } = args;
732
+ const route = await routeToModel('chat');
358
733
  const messages = [];
359
- if (system)
360
- messages.push({ role: 'system', content: system });
734
+ // Inject output constraint into system prompt if the model needs it
735
+ const systemContent = system
736
+ ? (route.hints.outputConstraint ? `${system}\n\n${route.hints.outputConstraint}` : system)
737
+ : (route.hints.outputConstraint || undefined);
738
+ if (systemContent)
739
+ messages.push({ role: 'system', content: systemContent });
361
740
  messages.push({ role: 'user', content: message });
741
+ const responseFormat = json_schema
742
+ ? { type: 'json_schema', json_schema: { name: json_schema.name, strict: json_schema.strict ?? true, schema: json_schema.schema } }
743
+ : undefined;
362
744
  const resp = await chatCompletionStreaming(messages, {
363
- temperature,
745
+ temperature: temperature ?? route.hints.chatTemp,
364
746
  maxTokens: max_tokens,
747
+ model: route.modelId,
748
+ responseFormat,
365
749
  });
366
750
  const footer = formatFooter(resp);
367
751
  return { content: [{ type: 'text', text: resp.content + footer }] };
368
752
  }
369
753
  case 'custom_prompt': {
370
- const { system, context, instruction, temperature, max_tokens } = args;
754
+ const { system, context, instruction, temperature, max_tokens, json_schema } = args;
755
+ const route = await routeToModel('analysis');
371
756
  const messages = [];
372
- if (system)
373
- messages.push({ role: 'system', content: system });
757
+ const systemContent = system
758
+ ? (route.hints.outputConstraint ? `${system}\n\n${route.hints.outputConstraint}` : system)
759
+ : (route.hints.outputConstraint || undefined);
760
+ if (systemContent)
761
+ messages.push({ role: 'system', content: systemContent });
374
762
  let userContent = instruction;
375
763
  if (context)
376
764
  userContent = `Context:\n${context}\n\nInstruction:\n${instruction}`;
377
765
  messages.push({ role: 'user', content: userContent });
766
+ const responseFormat = json_schema
767
+ ? { type: 'json_schema', json_schema: { name: json_schema.name, strict: json_schema.strict ?? true, schema: json_schema.schema } }
768
+ : undefined;
378
769
  const resp = await chatCompletionStreaming(messages, {
379
- temperature,
770
+ temperature: temperature ?? route.hints.chatTemp,
380
771
  maxTokens: max_tokens,
772
+ model: route.modelId,
773
+ responseFormat,
381
774
  });
382
775
  const footer = formatFooter(resp);
383
776
  return {
@@ -387,10 +780,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
387
780
  case 'code_task': {
388
781
  const { code, task, language, max_tokens: codeMaxTokens } = args;
389
782
  const lang = language || 'unknown';
783
+ const route = await routeToModel('code');
784
+ const outputConstraint = route.hints.outputConstraint
785
+ ? ` ${route.hints.outputConstraint}`
786
+ : '';
390
787
  const codeMessages = [
391
788
  {
392
789
  role: 'system',
393
- content: `Expert ${lang} developer. Analyse the provided code and complete the task. Be specific — reference line numbers, function names, and concrete fixes. No preamble.`,
790
+ content: `Expert ${lang} developer. Analyse the provided code and complete the task. Be specific — reference line numbers, function names, and concrete fixes.${outputConstraint}`,
394
791
  },
395
792
  {
396
793
  role: 'user',
@@ -398,11 +795,13 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
398
795
  },
399
796
  ];
400
797
  const codeResp = await chatCompletionStreaming(codeMessages, {
401
- temperature: 0.2,
798
+ temperature: route.hints.codeTemp,
402
799
  maxTokens: codeMaxTokens ?? DEFAULT_MAX_TOKENS,
800
+ model: route.modelId,
403
801
  });
404
802
  const codeFooter = formatFooter(codeResp, lang);
405
- return { content: [{ type: 'text', text: codeResp.content + codeFooter }] };
803
+ const suggestionLine = route.suggestion ? `\n${route.suggestion}` : '';
804
+ return { content: [{ type: 'text', text: codeResp.content + codeFooter + suggestionLine }] };
406
805
  }
407
806
  case 'discover': {
408
807
  const start = Date.now();
@@ -431,40 +830,98 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
431
830
  }],
432
831
  };
433
832
  }
434
- const lines = models.map((m) => {
435
- const ctx = getContextLength(m);
436
- return ` • ${m.id} (context: ${ctx.toLocaleString()} tokens)`;
437
- });
438
- const primary = models[0];
833
+ const loaded = models.filter((m) => m.state === 'loaded' || !m.state);
834
+ const available = models.filter((m) => m.state === 'not-loaded');
835
+ const primary = loaded[0] || models[0];
439
836
  const ctx = getContextLength(primary);
837
+ const primaryProfile = await getModelProfileAsync(primary);
440
838
  const sessionStats = session.calls > 0
441
839
  ? `\nSession stats: ${(session.promptTokens + session.completionTokens).toLocaleString()} tokens offloaded across ${session.calls} call${session.calls === 1 ? '' : 's'}`
442
840
  : '\nSession stats: no calls yet — delegate tasks to start saving tokens';
443
- return {
444
- content: [{
445
- type: 'text',
446
- text: `Status: ONLINE\n` +
447
- `Endpoint: ${LM_BASE_URL}\n` +
448
- `Latency: ${ms}ms\n` +
449
- `Model: ${primary.id}\n` +
450
- `Context window: ${ctx.toLocaleString()} tokens\n` +
451
- `\nLoaded models:\n${lines.join('\n')}` +
452
- `${sessionStats}\n\n` +
453
- `The local LLM is available. You can delegate tasks using chat, custom_prompt, or code_task.`,
454
- }],
455
- };
841
+ let text = `Status: ONLINE\n` +
842
+ `Endpoint: ${LM_BASE_URL}\n` +
843
+ `Latency: ${ms}ms\n` +
844
+ `Active model: ${primary.id}\n` +
845
+ `Context window: ${ctx.toLocaleString()} tokens\n`;
846
+ if (primaryProfile) {
847
+ text += `Family: ${primaryProfile.family}\n`;
848
+ text += `Description: ${primaryProfile.description}\n`;
849
+ text += `Best for: ${primaryProfile.bestFor.join(', ')}\n`;
850
+ text += `Strengths: ${primaryProfile.strengths.join(', ')}\n`;
851
+ if (primaryProfile.weaknesses.length > 0) {
852
+ text += `Weaknesses: ${primaryProfile.weaknesses.join(', ')}\n`;
853
+ }
854
+ }
855
+ if (loaded.length > 0) {
856
+ text += `\nLoaded models (● ready to use):\n`;
857
+ text += (await Promise.all(loaded.map((m) => formatModelDetail(m)))).join('\n\n');
858
+ }
859
+ if (available.length > 0) {
860
+ text += `\n\nAvailable models (○ downloaded, not loaded — can be activated in LM Studio):\n`;
861
+ text += (await Promise.all(available.map((m) => formatModelDetail(m)))).join('\n\n');
862
+ }
863
+ // Per-model performance stats from this session
864
+ if (session.modelStats.size > 0) {
865
+ text += `\n\nPerformance (this session):\n`;
866
+ for (const [modelId, stats] of session.modelStats) {
867
+ const avgTtft = stats.calls > 0 ? Math.round(stats.totalTtftMs / stats.calls) : 0;
868
+ const avgTokSec = stats.perfCalls > 0 ? (stats.totalTokPerSec / stats.perfCalls).toFixed(1) : '?';
869
+ text += ` ${modelId}: ${stats.calls} calls, avg TTFT ${avgTtft}ms, avg ${avgTokSec} tok/s\n`;
870
+ }
871
+ }
872
+ text += `${sessionStats}\n\n`;
873
+ text += `The local LLM is available. You can delegate tasks using chat, custom_prompt, code_task, or embed.`;
874
+ return { content: [{ type: 'text', text }] };
456
875
  }
457
876
  case 'list_models': {
458
877
  const models = await listModelsRaw();
459
878
  if (!models.length) {
460
- return { content: [{ type: 'text', text: 'No models currently loaded.' }] };
879
+ return { content: [{ type: 'text', text: 'No models currently loaded or available.' }] };
461
880
  }
462
- const lines = models.map((m) => {
463
- const ctx = getContextLength(m);
464
- return ` • ${m.id}${ctx ? ` (context: ${ctx.toLocaleString()} tokens)` : ''}`;
465
- });
881
+ const loaded = models.filter((m) => m.state === 'loaded' || !m.state);
882
+ const available = models.filter((m) => m.state === 'not-loaded');
883
+ let text = '';
884
+ // list_models enriches with HuggingFace data (cached after first call)
885
+ if (loaded.length > 0) {
886
+ text += `Loaded models (● ready to use):\n\n`;
887
+ text += (await Promise.all(loaded.map((m) => formatModelDetail(m, true)))).join('\n\n');
888
+ }
889
+ if (available.length > 0) {
890
+ if (text)
891
+ text += '\n\n';
892
+ text += `Available models (○ downloaded, not loaded):\n\n`;
893
+ text += (await Promise.all(available.map((m) => formatModelDetail(m, true)))).join('\n\n');
894
+ }
895
+ return { content: [{ type: 'text', text }] };
896
+ }
897
+ case 'embed': {
898
+ const { input, model: embedModel } = args;
899
+ const embedBody = { input };
900
+ if (embedModel) {
901
+ embedBody.model = embedModel;
902
+ }
903
+ const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/embeddings`, { method: 'POST', headers: apiHeaders(), body: JSON.stringify(embedBody) }, INFERENCE_CONNECT_TIMEOUT_MS);
904
+ if (!res.ok) {
905
+ const errText = await res.text().catch(() => '');
906
+ throw new Error(`Embeddings API error ${res.status}: ${errText}`);
907
+ }
908
+ const data = (await res.json());
909
+ const embedding = data.data[0]?.embedding;
910
+ if (!embedding)
911
+ throw new Error('No embedding returned');
912
+ const usageInfo = data.usage
913
+ ? `${data.usage.prompt_tokens} tokens embedded`
914
+ : '';
466
915
  return {
467
- content: [{ type: 'text', text: `Loaded models:\n${lines.join('\n')}` }],
916
+ content: [{
917
+ type: 'text',
918
+ text: JSON.stringify({
919
+ model: data.model,
920
+ dimensions: embedding.length,
921
+ embedding,
922
+ usage: usageInfo,
923
+ }),
924
+ }],
468
925
  };
469
926
  }
470
927
  default:
@@ -482,6 +939,11 @@ async function main() {
482
939
  const transport = new StdioServerTransport();
483
940
  await server.connect(transport);
484
941
  process.stderr.write(`Houtini LM server running (${LM_BASE_URL})\n`);
942
+ // Background: profile all available models via HF → SQLite cache
943
+ // Non-blocking — server is already accepting requests
944
+ listModelsRaw()
945
+ .then((models) => profileModelsAtStartup(models))
946
+ .catch((err) => process.stderr.write(`[houtini-lm] Startup profiling skipped: ${err}\n`));
485
947
  }
486
948
  main().catch((error) => {
487
949
  process.stderr.write(`Fatal error: ${error}\n`);