@houtini/lm 2.10.0 → 2.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -8,7 +8,7 @@
8
8
  import { Server } from '@modelcontextprotocol/sdk/server/index.js';
9
9
  import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
10
10
  import { CallToolRequestSchema, ListToolsRequestSchema, ListResourcesRequestSchema, ReadResourceRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
11
- import { profileModelsAtStartup, getCachedProfile, toModelProfile as cachedToProfile, getHFEnrichmentLine, getPromptHints, getThinkingSupport, } from './model-cache.js';
11
+ import { profileModelsAtStartup, getCachedProfile, toModelProfile as cachedToProfile, getHFEnrichmentLine, getPromptHints, getThinkingSupport, recordPerformance, getAllPerformance, getLifetimeTotals, recordPrefillSample, getPrefillSamples, fitPrefillLinear, } from './model-cache.js';
12
12
  import { readFile } from 'node:fs/promises';
13
13
  import { isAbsolute, basename } from 'node:path';
14
14
  const LM_BASE_URL = process.env.LM_STUDIO_URL || 'http://localhost:1234';
@@ -19,7 +19,9 @@ const DEFAULT_TEMPERATURE = 0.3;
19
19
  const CONNECT_TIMEOUT_MS = 5000;
20
20
  const INFERENCE_CONNECT_TIMEOUT_MS = 30_000; // generous connect timeout for inference
21
21
  const SOFT_TIMEOUT_MS = 300_000; // 5 min — progress notifications reset MCP client timeout, so this is a safety net not the primary limit
22
- const READ_CHUNK_TIMEOUT_MS = 30_000; // max wait for a single SSE chunk
22
+ const READ_CHUNK_TIMEOUT_MS = 30_000; // max wait for a single SSE chunk mid-stream
23
+ const PREFILL_TIMEOUT_MS = 180_000; // max wait for the FIRST chunk — prompt prefill on slow hardware with big inputs can legitimately take 1-2 min
24
+ const PREFILL_KEEPALIVE_MS = 10_000; // fire a progress notification every N ms while waiting for prefill to finish
23
25
  const FALLBACK_CONTEXT_LENGTH = parseInt(process.env.LM_CONTEXT_WINDOW || '100000', 10);
24
26
  // ── Session-level token accounting ───────────────────────────────────
25
27
  // Tracks cumulative tokens offloaded to the local LLM across all calls
@@ -32,17 +34,65 @@ const session = {
32
34
  /** Per-model performance tracking for routing insights */
33
35
  modelStats: new Map(),
34
36
  };
37
+ // Lifetime mirror — kept in sync with the SQLite `model_performance` table
38
+ // so the footer/discover path stays synchronous. Hydrated once at startup
39
+ // from `getAllPerformance()`, then updated in-memory alongside every DB
40
+ // write in `recordUsage`. The mirror is updated synchronously, before the
41
+ // fire-and-forget DB write settles, so it can briefly run ahead of the DB.
42
+ const lifetime = {
43
+ totalCalls: 0,
44
+ totalTokens: 0,
45
+ modelsUsed: 0,
46
+ firstSeenAt: null,
47
+ /** Per-model lifetime stats — same shape as session.modelStats for easy formatting. */
48
+ modelStats: new Map(),
49
+ };
50
+ async function hydrateLifetimeFromDb() {
51
+ try {
52
+ const totals = await getLifetimeTotals();
53
+ lifetime.totalCalls = totals.totalCalls;
54
+ lifetime.totalTokens = totals.totalTokens;
55
+ lifetime.modelsUsed = totals.modelsUsed;
56
+ lifetime.firstSeenAt = totals.firstSeenAt;
57
+ const rows = await getAllPerformance();
58
+ lifetime.modelStats.clear();
59
+ for (const r of rows) {
60
+ lifetime.modelStats.set(r.modelId, {
61
+ calls: r.totalCalls,
62
+ ttftCalls: r.ttftCalls,
63
+ perfCalls: r.perfCalls,
64
+ totalTtftMs: r.totalTtftMs,
65
+ totalTokPerSec: r.totalTokPerSec,
66
+ totalPromptTokens: r.totalPromptTokens,
67
+ firstSeenAt: r.firstSeenAt,
68
+ lastUsedAt: r.lastUsedAt,
69
+ });
70
+ }
71
+ }
72
+ catch (err) {
73
+ process.stderr.write(`[houtini-lm] Lifetime hydration failed (stats will build from this session): ${err}\n`);
74
+ }
75
+ }
35
76
  function recordUsage(resp) {
36
77
  session.calls++;
78
+ const promptTokens = resp.usage?.prompt_tokens ?? 0;
79
+ let completionTokens = resp.usage?.completion_tokens ?? 0;
80
+ const reasoningTokens = resp.usage?.completion_tokens_details?.reasoning_tokens ?? 0;
37
81
  if (resp.usage) {
38
- session.promptTokens += resp.usage.prompt_tokens;
39
- session.completionTokens += resp.usage.completion_tokens;
82
+ session.promptTokens += promptTokens;
83
+ session.completionTokens += completionTokens;
40
84
  }
41
85
  else if (resp.content.length > 0) {
42
86
  // Estimate when usage is missing (truncated responses)
43
- session.completionTokens += Math.ceil(resp.content.length / 4);
87
+ const est = Math.ceil(resp.content.length / 4);
88
+ completionTokens = est;
89
+ session.completionTokens += est;
44
90
  }
45
- // Track per-model perf stats
91
+ // Tok/s used by both session and lifetime stats
92
+ const tokPerSec = resp.usage && resp.generationMs > 50
93
+ ? (resp.usage.completion_tokens / (resp.generationMs / 1000))
94
+ : 0;
95
+ // Session per-model (unchanged behaviour)
46
96
  if (resp.model) {
47
97
  const existing = session.modelStats.get(resp.model) || { calls: 0, ttftCalls: 0, perfCalls: 0, totalTtftMs: 0, totalTokPerSec: 0 };
48
98
  existing.calls++;
@@ -50,22 +100,74 @@ function recordUsage(resp) {
50
100
  existing.totalTtftMs += resp.ttftMs;
51
101
  existing.ttftCalls++;
52
102
  }
53
- const tokPerSec = resp.usage && resp.generationMs > 50
54
- ? (resp.usage.completion_tokens / (resp.generationMs / 1000))
55
- : 0;
56
103
  if (tokPerSec > 0) {
57
104
  existing.perfCalls++;
58
105
  existing.totalTokPerSec += tokPerSec;
59
106
  }
60
107
  session.modelStats.set(resp.model, existing);
61
108
  }
109
+ // Lifetime mirror + SQLite write — fire-and-forget so a DB hiccup can't
110
+ // stall a tool response. The in-memory mirror is updated synchronously so
111
+ // the footer and discover output reflect this call immediately.
112
+ if (resp.model && (promptTokens > 0 || completionTokens > 0)) {
113
+ const now = Date.now();
114
+ const wasFirstEver = !lifetime.modelStats.has(resp.model);
115
+ const lExisting = lifetime.modelStats.get(resp.model) || {
116
+ calls: 0, ttftCalls: 0, perfCalls: 0, totalTtftMs: 0, totalTokPerSec: 0, totalPromptTokens: 0,
117
+ firstSeenAt: now, lastUsedAt: now,
118
+ };
119
+ lExisting.calls++;
120
+ if (resp.ttftMs) {
121
+ lExisting.totalTtftMs += resp.ttftMs;
122
+ lExisting.ttftCalls++;
123
+ }
124
+ if (tokPerSec > 0) {
125
+ lExisting.perfCalls++;
126
+ lExisting.totalTokPerSec += tokPerSec;
127
+ }
128
+ lExisting.totalPromptTokens += promptTokens;
129
+ lExisting.lastUsedAt = now;
130
+ lifetime.modelStats.set(resp.model, lExisting);
131
+ lifetime.totalCalls++;
132
+ lifetime.totalTokens += promptTokens + completionTokens;
133
+ if (wasFirstEver) {
134
+ lifetime.modelsUsed++;
135
+ if (lifetime.firstSeenAt === null)
136
+ lifetime.firstSeenAt = now;
137
+ }
138
+ recordPerformance(resp.model, {
139
+ ttftMs: resp.ttftMs,
140
+ tokPerSec: tokPerSec > 0 ? tokPerSec : undefined,
141
+ promptTokens,
142
+ completionTokens,
143
+ reasoningTokens,
144
+ }).catch((err) => {
145
+ process.stderr.write(`[houtini-lm] Performance write failed (continuing): ${err}\n`);
146
+ });
147
+ // Record (prompt_tokens, TTFT) pair for the linear-fit prefill estimator.
148
+ // Only meaningful when both values are real (we have actual usage and a
149
+ // measured TTFT). Fire-and-forget alongside recordPerformance.
150
+ if (resp.ttftMs && promptTokens > 0) {
151
+ recordPrefillSample(resp.model, promptTokens, resp.ttftMs).catch((err) => {
152
+ process.stderr.write(`[houtini-lm] Prefill sample write failed (continuing): ${err}\n`);
153
+ });
154
+ }
155
+ }
62
156
  }
63
157
  function sessionSummary() {
64
158
  const total = session.promptTokens + session.completionTokens;
65
- if (session.calls === 0)
159
+ if (session.calls === 0 && lifetime.totalCalls === 0)
66
160
  return '';
67
- const callWord = session.calls === 1 ? 'call' : 'calls';
68
- return `💰 Claude quota saved this session: ${total.toLocaleString()} tokens across ${session.calls} offloaded ${callWord}`;
161
+ const callWord = (n) => (n === 1 ? 'call' : 'calls');
162
+ const sessionPart = session.calls > 0
163
+ ? `this session: ${total.toLocaleString()} tokens / ${session.calls} ${callWord(session.calls)}`
164
+ : 'this session: 0 tokens';
165
+ // Lifetime numbers only show once there's something in the DB — avoids a
166
+ // confusing "lifetime: 0" on a truly fresh install.
167
+ if (lifetime.totalCalls > 0) {
168
+ return `💰 Claude quota saved — ${sessionPart} · lifetime: ${lifetime.totalTokens.toLocaleString()} tokens / ${lifetime.totalCalls} ${callWord(lifetime.totalCalls)}`;
169
+ }
170
+ return `💰 Claude quota saved ${sessionPart}`;
69
171
  }
70
172
  /**
71
173
  * Return true when this response is the first one with measurable perf stats
@@ -381,6 +483,10 @@ async function chatCompletionStreamingInner(messages, options = {}) {
381
483
  messages,
382
484
  temperature: options.temperature ?? DEFAULT_TEMPERATURE,
383
485
  max_tokens: effectiveMaxTokens,
486
+ // Send max_completion_tokens alongside max_tokens for OpenAI reasoning-model
487
+ // compatibility (OpenAI spec distinguishes total generation cap from visible
488
+ // output cap). Backends that don't understand it ignore unknown fields.
489
+ max_completion_tokens: effectiveMaxTokens,
384
490
  stream: true,
385
491
  stream_options: { include_usage: true },
386
492
  };
@@ -391,23 +497,36 @@ async function chatCompletionStreamingInner(messages, options = {}) {
391
497
  body.response_format = options.responseFormat;
392
498
  }
393
499
  // Handle thinking/reasoning models.
394
- // Some models (Gemma 4, Qwen3, DeepSeek) have extended thinking that consumes
395
- // part of the max_tokens budget for invisible reasoning before producing content.
396
- // Strategy: try to disable thinking via enable_thinking=false, BUT also inflate
397
- // max_tokens as a safety net since some models (Gemma 4) hardcode thinking=true
398
- // in their Jinja template and ignore the API parameter.
500
+ // Some models (Gemma 4, Qwen3, DeepSeek R1, Nemotron, gpt-oss) have extended
501
+ // thinking that consumes part of the max_tokens budget for invisible reasoning
502
+ // before producing content. Strategy:
503
+ // 1. reasoning_effort=<backend-specific value> to minimise reasoning
504
+ // 2. enable_thinking:false — Qwen3 vendor param (ignored elsewhere)
505
+ // 3. inflate max_tokens 4× — safety net when both flags are ignored
506
+ // (e.g. Gemma 4 hardcodes enable_thinking=true in its Jinja template)
507
+ //
508
+ // IMPORTANT: reasoning_effort values are NOT standard. OpenAI/gpt-oss use
509
+ // 'low'|'medium'|'high'; Ollama adds 'none'; LM Studio's Nemotron adapter
510
+ // only accepts 'on'|'off'. Sending 'low' to Nemotron causes LM Studio to
511
+ // silently fall back to 'on' — maximising reasoning, the OPPOSITE of intent.
512
+ // Hence the backend-specific mapping in getReasoningEffortValue(). When uncertain, we omit the
513
+ // field entirely rather than risk a bad-value fallback.
399
514
  const modelId = (options.model || LM_MODEL || '').toString();
400
515
  if (modelId) {
401
516
  const thinking = await getThinkingSupport(modelId);
402
517
  if (thinking?.supportsThinkingToggle) {
403
518
  body.enable_thinking = false;
404
- // Safety net: inflate max_tokens to account for reasoning budget.
405
- // Gemma 4 ignores enable_thinking=false (hardcoded in template),
406
- // so the model will think regardless. Without inflation, reasoning
407
- // consumes all tokens and content comes back empty.
408
- const requestedTokens = (options.maxTokens ?? DEFAULT_MAX_TOKENS);
409
- body.max_tokens = Math.max(requestedTokens * 4, requestedTokens + 2000);
410
- process.stderr.write(`[houtini-lm] Thinking model ${modelId}: enable_thinking=false, max_tokens inflated ${requestedTokens} → ${body.max_tokens}\n`);
519
+ const reasoningValue = getReasoningEffortValue(modelId);
520
+ if (reasoningValue !== null) {
521
+ body.reasoning_effort = reasoningValue;
522
+ }
523
+ // Inflation uses effectiveMaxTokens (the context-aware value), not
524
+ // DEFAULT_MAX_TOKENS — otherwise big-context models get sized down.
525
+ const beforeInflation = effectiveMaxTokens;
526
+ const inflated = Math.max(beforeInflation * 4, beforeInflation + 2000);
527
+ body.max_tokens = inflated;
528
+ body.max_completion_tokens = inflated;
529
+ process.stderr.write(`[houtini-lm] Thinking model ${modelId}: reasoning_effort=${reasoningValue ?? '(omitted)'}, enable_thinking=false, max_tokens inflated ${beforeInflation} → ${inflated}\n`);
411
530
  }
412
531
  }
413
532
  const startTime = Date.now();
@@ -422,13 +541,39 @@ async function chatCompletionStreamingInner(messages, options = {}) {
422
541
  const reader = res.body.getReader();
423
542
  const decoder = new TextDecoder();
424
543
  let content = '';
425
- let chunkCount = 0;
544
+ let reasoning = '';
545
+ let progressSeq = 0;
426
546
  let model = '';
427
547
  let usage;
428
548
  let finishReason = '';
429
549
  let truncated = false;
550
+ let prefillStall = false;
430
551
  let buffer = '';
431
552
  let ttftMs;
553
+ let firstChunkReceived = false;
554
+ // Prefill keep-alive — /v1/chat/completions gives no SSE events during
555
+ // prompt processing, so the MCP client clock ticks uninterrupted on a slow
556
+ // backend with a big input. Fire a progress notification every 10s until
557
+ // the first chunk arrives to keep the client from timing out at 60s.
558
+ const sendProgress = (message) => {
559
+ if (options.progressToken === undefined)
560
+ return;
561
+ progressSeq++;
562
+ server.notification({
563
+ method: 'notifications/progress',
564
+ params: {
565
+ progressToken: options.progressToken,
566
+ progress: progressSeq,
567
+ message,
568
+ },
569
+ }).catch(() => { });
570
+ };
571
+ const keepAliveTimer = setInterval(() => {
572
+ if (firstChunkReceived)
573
+ return;
574
+ const waitedMs = Date.now() - startTime;
575
+ sendProgress(`Waiting for model... (${(waitedMs / 1000).toFixed(0)}s, still in prefill)`);
576
+ }, PREFILL_KEEPALIVE_MS);
432
577
  try {
433
578
  while (true) {
434
579
  // Check soft timeout before each read
@@ -438,17 +583,24 @@ async function chatCompletionStreamingInner(messages, options = {}) {
438
583
  process.stderr.write(`[houtini-lm] Soft timeout at ${elapsed}ms, returning ${content.length} chars of partial content\n`);
439
584
  break;
440
585
  }
441
- // Read with per-chunk timeout (handles stalled generation)
586
+ // Split prefill vs mid-stream timeouts. Prefill on slow hardware with
587
+ // a 7k-token input can legitimately take 1-2 min; mid-stream stalls
588
+ // should surface much faster. Track firstChunkReceived to switch.
442
589
  const remaining = SOFT_TIMEOUT_MS - elapsed;
443
- const chunkTimeout = Math.min(READ_CHUNK_TIMEOUT_MS, remaining);
590
+ const perChunkCeiling = firstChunkReceived ? READ_CHUNK_TIMEOUT_MS : PREFILL_TIMEOUT_MS;
591
+ const chunkTimeout = Math.min(perChunkCeiling, remaining);
444
592
  const result = await timedRead(reader, chunkTimeout);
445
593
  if (result === 'timeout') {
446
594
  truncated = true;
447
- process.stderr.write(`[houtini-lm] Chunk read timeout, returning ${content.length} chars of partial content\n`);
595
+ prefillStall = !firstChunkReceived;
596
+ process.stderr.write(`[houtini-lm] ${prefillStall ? 'Prefill' : 'Mid-stream'} timeout, returning ${content.length} chars of partial content\n`);
448
597
  break;
449
598
  }
450
599
  if (result.done)
451
600
  break;
601
+ if (!firstChunkReceived) {
602
+ firstChunkReceived = true;
603
+ }
452
604
  buffer += decoder.decode(result.value, { stream: true });
453
605
  // Parse SSE lines
454
606
  const lines = buffer.split('\n');
@@ -464,41 +616,20 @@ async function chatCompletionStreamingInner(messages, options = {}) {
464
616
  if (json.model)
465
617
  model = json.model;
466
618
  const delta = json.choices?.[0]?.delta;
467
- // Track reasoning/thinking tokens models like Gemma 4, Qwen3, DeepSeek
468
- // emit reasoning_content during their thinking phase before producing
469
- // visible content. We must send progress notifications during this phase
470
- // to prevent MCP client timeout.
471
- if (delta?.reasoning_content) {
472
- chunkCount++;
473
- if (options.progressToken !== undefined) {
474
- server.notification({
475
- method: 'notifications/progress',
476
- params: {
477
- progressToken: options.progressToken,
478
- progress: chunkCount,
479
- message: `Thinking... (${chunkCount} chunks)`,
480
- },
481
- }).catch(() => { });
482
- }
619
+ // Reasoning channel. LM Studio (with "Separate reasoning_content"
620
+ // dev setting), DeepSeek R1, Ollama OpenAI-compat, Nemotron etc.
621
+ // stream reasoning via delta.reasoning_content — we MUST capture it
622
+ // so the safety net below can return something when the model
623
+ // burns its entire budget before emitting a single content token.
624
+ if (typeof delta?.reasoning_content === 'string' && delta.reasoning_content.length > 0) {
625
+ reasoning += delta.reasoning_content;
626
+ sendProgress(`Thinking... (${reasoning.length} chars of reasoning)`);
483
627
  }
484
- if (delta?.content) {
628
+ if (typeof delta?.content === 'string' && delta.content.length > 0) {
485
629
  if (ttftMs === undefined)
486
630
  ttftMs = Date.now() - startTime;
487
631
  content += delta.content;
488
- chunkCount++;
489
- // Send progress notification to reset MCP client timeout.
490
- // Each notification resets the 60s clock, giving slow models
491
- // unlimited time as long as they're actively generating.
492
- if (options.progressToken !== undefined) {
493
- server.notification({
494
- method: 'notifications/progress',
495
- params: {
496
- progressToken: options.progressToken,
497
- progress: chunkCount,
498
- message: `Streaming... ${content.length} chars`,
499
- },
500
- }).catch(() => { });
501
- }
632
+ sendProgress(`Streaming... ${content.length} chars`);
502
633
  }
503
634
  const reason = json.choices?.[0]?.finish_reason;
504
635
  if (reason)
@@ -522,7 +653,10 @@ async function chatCompletionStreamingInner(messages, options = {}) {
522
653
  if (json.model)
523
654
  model = json.model;
524
655
  const delta = json.choices?.[0]?.delta;
525
- if (delta?.content) {
656
+ if (typeof delta?.reasoning_content === 'string' && delta.reasoning_content.length > 0) {
657
+ reasoning += delta.reasoning_content;
658
+ }
659
+ if (typeof delta?.content === 'string' && delta.content.length > 0) {
526
660
  if (ttftMs === undefined)
527
661
  ttftMs = Date.now() - startTime;
528
662
  content += delta.content;
@@ -541,6 +675,7 @@ async function chatCompletionStreamingInner(messages, options = {}) {
541
675
  }
542
676
  }
543
677
  finally {
678
+ clearInterval(keepAliveTimer);
544
679
  // Best-effort cancel with a short timeout — cancel() can hang if the upstream
545
680
  // connection is wedged, so we race it against a 500ms timer. This frees the
546
681
  // underlying socket sooner on abrupt client disconnects without blocking the
@@ -559,45 +694,98 @@ async function chatCompletionStreamingInner(messages, options = {}) {
559
694
  }
560
695
  const generationMs = Date.now() - startTime;
561
696
  // Strip <think>...</think> reasoning blocks from models that always emit them
562
- // (e.g. GLM Flash, Nemotron). Claude doesn't need the model's internal reasoning.
563
- // Handle both closed blocks and unclosed ones (model ran out of tokens mid-think,
564
- // or grammar-constrained output forced content before the closing tag).
697
+ // inline on the content channel (e.g. GLM Flash). Claude doesn't need the
698
+ // model's internal reasoning. Handle both closed and unclosed blocks.
565
699
  let cleanContent = content.replace(/<think>[\s\S]*?<\/think>\s*/g, ''); // closed blocks
566
700
  cleanContent = cleanContent.replace(/^<think>\s*/, ''); // orphaned opening tag
567
701
  cleanContent = cleanContent.trim();
568
- // Safety net on top of the thinking-model max_tokens inflation: some MLX/GGUF
569
- // quants still exhaust their budget inside an unclosed <think> block despite
570
- // `enable_thinking:false` and the inflation. If stripping leaves nothing but
571
- // raw output exists, return the raw reasoning so the caller sees *something*
572
- // rather than an empty body + lone footer (issue #6).
702
+ // Safety nets for empty visible output. Try in order:
703
+ // 1. thinkStripFallback: stripping <think> left nothing, but raw content had text
704
+ // 2. reasoningFallback: no visible content AT ALL, but reasoning_content was streamed
705
+ // (this is the Nemotron/DeepSeek-R1/LM-Studio-dev-toggle case, which previously
706
+ // produced silent empty bodies because reasoning was discarded)
573
707
  let thinkStripFallback = false;
574
- if (!cleanContent && content.trim()) {
575
- thinkStripFallback = true;
576
- cleanContent = content.trim();
708
+ let reasoningFallback = false;
709
+ if (!cleanContent) {
710
+ if (content.trim()) {
711
+ thinkStripFallback = true;
712
+ cleanContent = content.trim();
713
+ }
714
+ else if (reasoning.trim()) {
715
+ reasoningFallback = true;
716
+ cleanContent =
717
+ '[No visible output — the model spent its entire output budget on reasoning_content before emitting any content. ' +
718
+ 'Raw reasoning below so you can see what it was doing:]\n\n' +
719
+ reasoning.trim();
720
+ }
577
721
  }
578
- return { content: cleanContent, rawContent: content, model, usage, finishReason, truncated, ttftMs, generationMs, thinkStripFallback };
722
+ return {
723
+ content: cleanContent,
724
+ rawContent: content,
725
+ reasoningContent: reasoning || undefined,
726
+ model,
727
+ usage,
728
+ finishReason,
729
+ truncated,
730
+ ttftMs,
731
+ generationMs,
732
+ thinkStripFallback,
733
+ reasoningFallback,
734
+ prefillStall,
735
+ };
736
+ }
737
+ let detectedBackend = null;
738
+ function getBackend() {
739
+ return detectedBackend ?? 'openai-compat';
579
740
  }
580
741
  /**
581
- * Fetch models from LM Studio's native v0 API first (richer metadata),
582
- * falling back to the OpenAI-compatible v1 endpoint for non-LM-Studio hosts.
742
+ * Fetch models with backend-aware probing.
743
+ * 1. LM Studio /api/v0/models — richest metadata, sets backend='lmstudio'
744
+ * 2. Ollama /api/tags — native list, sets backend='ollama', maps to ModelInfo
745
+ * 3. OpenAI-compatible /v1/models — generic fallback (DeepSeek, vLLM, llama.cpp, OpenRouter)
583
746
  */
584
747
  async function listModelsRaw() {
585
- // Try v0 API first — returns type, arch, publisher, quantization, state
748
+ // Try LM Studio's v0 API first — returns type, arch, publisher, quantization, state
586
749
  try {
587
750
  const v0 = await fetchWithTimeout(`${LM_BASE_URL}/api/v0/models`, { headers: apiHeaders() });
588
751
  if (v0.ok) {
589
752
  const data = (await v0.json());
753
+ detectedBackend = 'lmstudio';
590
754
  return data.data;
591
755
  }
592
756
  }
593
757
  catch {
594
- // v0 not available — fall through to v1
758
+ // v0 not available — fall through
759
+ }
760
+ // Try Ollama's /api/tags next. Shape differs from OpenAI: returns
761
+ // { models: [{ name, model, size, details: { family, parameter_size, ... } }] }
762
+ try {
763
+ const tags = await fetchWithTimeout(`${LM_BASE_URL}/api/tags`, { headers: apiHeaders() });
764
+ if (tags.ok) {
765
+ const data = (await tags.json());
766
+ if (Array.isArray(data.models)) {
767
+ detectedBackend = 'ollama';
768
+ return data.models.map((m) => ({
769
+ id: m.name,
770
+ object: 'model',
771
+ type: 'llm',
772
+ arch: m.details?.family,
773
+ quantization: m.details?.quantization_level,
774
+ state: 'loaded', // Ollama loads on-demand; treat all listed as available
775
+ publisher: m.name.includes('/') ? m.name.split('/')[0] : undefined,
776
+ }));
777
+ }
778
+ }
779
+ }
780
+ catch {
781
+ // Not Ollama — fall through
595
782
  }
596
- // Fallback: OpenAI-compatible v1 endpoint (works with Ollama, vLLM, llama.cpp)
783
+ // Fallback: OpenAI-compatible v1 endpoint (DeepSeek, vLLM, llama.cpp, OpenRouter)
597
784
  const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/models`, { headers: apiHeaders() });
598
785
  if (!res.ok)
599
786
  throw new Error(`Failed to list models: ${res.status}`);
600
787
  const data = (await res.json());
788
+ detectedBackend = 'openai-compat';
601
789
  return data.data;
602
790
  }
603
791
  function getContextLength(model) {
@@ -608,6 +796,104 @@ function getContextLength(model) {
608
796
  function getMaxContextLength(model) {
609
797
  return model.max_context_length;
610
798
  }
799
+ /**
800
+ * Map model family / backend → reasoning_effort value that minimises reasoning.
801
+ *
802
+ * The `reasoning_effort` field exists across OpenAI, Ollama, LM Studio and
803
+ * DeepSeek, but the accepted values differ per vendor. Verified empirically
804
+ * from the LM Studio error response: "Supported values: none, minimal, low,
805
+ * medium, high, xhigh" (that's the set the LM Studio adapter accepts).
806
+ *
807
+ * OpenAI (gpt-5, o-series) : 'low' | 'medium' | 'high' (spec)
808
+ * Ollama : 'low' | 'medium' | 'high' | 'none'
809
+ * LM Studio (all models) : 'none' | 'minimal' | 'low' | 'medium' | 'high' | 'xhigh'
810
+ *
811
+ * We want the HARDEST off-switch we can portably send:
812
+ * - LM Studio / Ollama: 'none' (no reasoning budget at all)
813
+ * - Generic OpenAI-compat: 'low' (OpenAI's minimum, safe to send)
814
+ *
815
+ * An unsupported value is a hard 400 error on LM Studio (not a silent
816
+ * fallback), so this function is conservative — any backend other than
816
+ * LM Studio or Ollama gets 'low'. No branch here currently returns null.
818
+ */
819
+ function getReasoningEffortValue(_modelId) {
820
+ const backend = getBackend();
821
+ // LM Studio accepts 'none' as an explicit reasoning-off switch for
822
+ // every thinking model (Nemotron, DeepSeek R1, Gemma 4, gpt-oss, ...).
823
+ if (backend === 'lmstudio')
824
+ return 'none';
825
+ // Ollama likewise documents 'none' as valid.
826
+ if (backend === 'ollama')
827
+ return 'none';
828
+ // Generic OpenAI-compatible — 'low' is the minimum OpenAI accepts per spec.
829
+ // DeepSeek's own API treats 'low' as minimum too.
830
+ return 'low';
831
+ }
832
+ /** Rough chars→tokens ratio used for pre-flight estimates. Matches the ratio
833
+ * we already use to estimate completion_tokens when usage is missing. */
834
+ const CHARS_PER_TOKEN = 4;
835
+ /** Conservative default prefill rate when no per-model measurement exists.
836
+ * Slower than real hardware so we err toward letting the call run — a false
837
+ * refusal is much worse than a false-ok that eventually times out. */
838
+ const DEFAULT_PREFILL_TOK_PER_SEC = 300;
839
+ /** Hard ceiling for when we refuse to send the call. Leaves ~15s of
840
+ * generation headroom inside the ~60s MCP-client request-timeout budget. */
841
+ const PREFILL_REFUSE_THRESHOLD_SEC = 45;
842
+ /** Soft warning threshold — we proceed but log a stderr warning. */
843
+ const PREFILL_WARN_THRESHOLD_SEC = 25;
844
+ /**
845
+ * Estimate prompt prefill time. Preferred method is a linear regression
846
+ * `TTFT ≈ α + β·prompt_tokens` over recent per-model samples — this separates
847
+ * fixed per-request overhead (α) from genuine per-token prefill cost (β) and
848
+ * avoids the under-prediction that a ratio-of-averages estimator produces
849
+ * when the current input is much larger than the historical mean.
850
+ *
851
+ * Falls back to the ratio estimator when we have fewer than
852
+ * PREFILL_FIT_MIN_SAMPLES points, and to a conservative default
853
+ * (DEFAULT_PREFILL_TOK_PER_SEC) when no measured data exists at all.
854
+ */
855
+ async function estimatePrefill(inputChars, modelId) {
856
+ const inputTokens = Math.ceil(inputChars / CHARS_PER_TOKEN);
857
+ // 1. Linear fit over recent samples (preferred).
858
+ try {
859
+ const samples = await getPrefillSamples(modelId);
860
+ const fit = fitPrefillLinear(samples);
861
+ if (fit) {
862
+ const estimatedMs = Math.max(0, fit.alphaMs + fit.betaMsPerToken * inputTokens);
863
+ return {
864
+ inputTokens,
865
+ estimatedSeconds: estimatedMs / 1000,
866
+ basis: 'linear-fit',
867
+ fit,
868
+ };
869
+ }
870
+ }
871
+ catch {
872
+ // Sample fetch failed — fall through to ratio estimator
873
+ }
874
+ // 2. Ratio fallback — uses aggregate stats already in memory.
875
+ const stats = lifetime.modelStats.get(modelId);
876
+ if (stats && stats.ttftCalls >= 2 && stats.totalTtftMs > 0 && stats.totalPromptTokens > 0) {
877
+ const avgPromptTokens = stats.totalPromptTokens / stats.calls;
878
+ const avgTtftSec = (stats.totalTtftMs / stats.ttftCalls) / 1000;
879
+ if (avgTtftSec > 0) {
880
+ const prefillTokPerSec = avgPromptTokens / avgTtftSec;
881
+ return {
882
+ inputTokens,
883
+ estimatedSeconds: inputTokens / prefillTokPerSec,
884
+ basis: 'ratio',
885
+ prefillTokPerSec,
886
+ };
887
+ }
888
+ }
889
+ // 3. Conservative default for unknown model/hardware.
890
+ return {
891
+ inputTokens,
892
+ estimatedSeconds: inputTokens / DEFAULT_PREFILL_TOK_PER_SEC,
893
+ basis: 'default',
894
+ prefillTokPerSec: DEFAULT_PREFILL_TOK_PER_SEC,
895
+ };
896
+ }
611
897
  async function routeToModel(taskType) {
612
898
  let models;
613
899
  try {
@@ -674,9 +960,11 @@ function assessQuality(resp, rawContent) {
674
960
  : null;
675
961
  return {
676
962
  truncated: resp.truncated,
963
+ prefillStall: resp.prefillStall ?? false,
677
964
  finishReason: resp.finishReason || 'unknown',
678
965
  thinkBlocksStripped: hadThinkBlocks,
679
966
  thinkStripFallback: resp.thinkStripFallback ?? false,
967
+ reasoningFallback: resp.reasoningFallback ?? false,
680
968
  estimatedTokens: estimated,
681
969
  contentLength: resp.content.length,
682
970
  generationMs: resp.generationMs,
@@ -685,9 +973,13 @@ function assessQuality(resp, rawContent) {
685
973
  }
686
974
  function formatQualityLine(quality) {
687
975
  const flags = [];
688
- if (quality.truncated)
976
+ if (quality.prefillStall)
977
+ flags.push('PREFILL-STALL (no tokens received — input may be too large for this model/hardware)');
978
+ else if (quality.truncated)
689
979
  flags.push('TRUNCATED');
690
- if (quality.thinkStripFallback)
980
+ if (quality.reasoningFallback)
981
+ flags.push('reasoning-only (model exhausted output budget before emitting visible content — showing raw reasoning)');
982
+ else if (quality.thinkStripFallback)
691
983
  flags.push('think-strip-empty (showing raw reasoning — model ignored enable_thinking:false)');
692
984
  else if (quality.thinkBlocksStripped)
693
985
  flags.push('think-blocks-stripped');
@@ -715,7 +1007,17 @@ function formatFooter(resp, extra) {
715
1007
  if (resp.model)
716
1008
  parts.push(`Model: ${resp.model}`);
717
1009
  if (resp.usage) {
718
- parts.push(`${resp.usage.prompt_tokens}→${resp.usage.completion_tokens} tokens`);
1010
+ // OpenAI-spec reasoning-tokens split — when present, show it so the user
1011
+ // sees how much of the completion budget went to hidden reasoning vs
1012
+ // visible output. Diagnoses "empty body + hit-max-tokens" immediately.
1013
+ const reasoningTokens = resp.usage.completion_tokens_details?.reasoning_tokens;
1014
+ if (typeof reasoningTokens === 'number' && reasoningTokens > 0) {
1015
+ const visible = resp.usage.completion_tokens - reasoningTokens;
1016
+ parts.push(`${resp.usage.prompt_tokens}→${resp.usage.completion_tokens} tokens (${reasoningTokens} reasoning / ${visible} visible)`);
1017
+ }
1018
+ else {
1019
+ parts.push(`${resp.usage.prompt_tokens}→${resp.usage.completion_tokens} tokens`);
1020
+ }
719
1021
  }
720
1022
  else if (resp.content.length > 0) {
721
1023
  // Estimate when usage is missing (truncated responses where final SSE chunk was lost)
@@ -899,11 +1201,13 @@ const TOOLS = [
899
1201
  '• Provide absolute paths. Relative paths are rejected.\n' +
900
1202
  '• Files are read in parallel (Promise.allSettled) — one unreadable file does not sink the call.\n' +
901
1203
  '• Files are concatenated with `=== filename ===` headers and sent to the same code-review pipeline as code_task.\n' +
902
- '• Read failures are surfaced inline with the reason so the LLM can still reason about the rest.\n\n' +
1204
+ '• Read failures are surfaced inline with the reason so the LLM can still reason about the rest.\n' +
1205
+ '• Pre-flight prefill estimate: if measured per-model data shows the input would exceed the MCP client\'s ~60s request timeout during prompt processing, the call is refused early with a diagnostic instead of hanging. Split or trim when this fires.\n\n' +
903
1206
  'Good fit:\n' +
904
1207
  '• Reviewing related files together (module + its tests, client + server pair)\n' +
905
1208
  '• Auditing a single large file too big to paste comfortably\n' +
906
1209
  '• Any code_task where keeping source out of the Claude context window matters\n\n' +
1210
+ 'Size guidance: on slow hardware (< 25 tok/s generation), keep total input under ~8,000 tokens (~32,000 chars) to stay safely under the client timeout. Faster hardware handles much more — the pre-flight estimator adapts once you\'ve done a few calls and real per-model timings are in the SQLite cache.\n\n' +
907
1211
  'Same review discipline as code_task — verify the output before acting on it.',
908
1212
  inputSchema: {
909
1213
  type: 'object',
@@ -965,6 +1269,23 @@ const TOOLS = [
965
1269
  required: ['input'],
966
1270
  },
967
1271
  },
1272
+ {
1273
+ name: 'stats',
1274
+ description: 'Show user stats: tokens offloaded, calls made, per-model performance — for the current session AND ' +
1275
+ 'lifetime (persisted in SQLite at ~/.houtini-lm/model-cache.db). Unlike `discover` which includes the ' +
1276
+ 'model catalog, `stats` returns just the numbers in a compact markdown table — cheap to call repeatedly ' +
1277
+ 'to see the 💰 Claude-quota savings counter climb. Useful for quantifying how much work the local model ' +
1278
+ 'is genuinely doing, and for noticing when a model\'s reasoning-token ratio is drifting.',
1279
+ inputSchema: {
1280
+ type: 'object',
1281
+ properties: {
1282
+ model: {
1283
+ type: 'string',
1284
+ description: 'Optional: filter output to a single model ID. Omit to see all models this workstation has used.',
1285
+ },
1286
+ },
1287
+ },
1288
+ },
968
1289
  ];
969
1290
  // ── MCP Server ───────────────────────────────────────────────────────
970
1291
  // Session-level sidekick framing. MCP clients surface this to the model
@@ -974,7 +1295,7 @@ const SIDEKICK_INSTRUCTIONS = `Houtini-lm is a local LLM sidekick. It runs on th
974
1295
  `When to reach for it: bounded, self-contained tasks you can describe in one message — explanations, boilerplate, test stubs, code review of pasted or file-loaded source, translations, commit messages, format conversion, brainstorming. Trades wall-clock time for tokens (typically 3-30× slower than frontier models).\n\n` +
975
1296
  `When not to: tasks that need tool access, cross-file reasoning you haven't captured, or work fast enough to answer directly before the delegation round-trip completes.\n\n` +
976
1297
  `Call \`discover\` in delegation-heavy sessions to see what model is loaded, its capability profile, and — after the first real call — its measured speed. The response footer reports cumulative tokens kept in the user's quota.`;
977
- const server = new Server({ name: 'houtini-lm', version: '2.10.0' }, { capabilities: { tools: {}, resources: {} }, instructions: SIDEKICK_INSTRUCTIONS });
1298
+ const server = new Server({ name: 'houtini-lm', version: '2.11.1' }, { capabilities: { tools: {}, resources: {} }, instructions: SIDEKICK_INSTRUCTIONS });
978
1299
  // ── MCP Resources ─────────────────────────────────────────────────────
979
1300
  // Exposes session performance metrics as a readable resource so Claude can
980
1301
  // proactively check offload efficiency and make smarter delegation decisions.
@@ -1155,6 +1476,44 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1155
1476
  ? ` ${route.hints.outputConstraint}`
1156
1477
  : '';
1157
1478
  const combined = sections.join('\n\n');
1479
+ // Pre-flight prefill estimate. Huge inputs can legitimately exceed
1480
+ // the MCP client's ~60s request timeout during prompt processing, and
1481
+ // progress notifications don't reset that timeout on Claude Desktop.
1482
+ // If measured per-model data in the SQLite cache shows this input
1483
+ // would obviously overrun, refuse with a concrete diagnostic so the
1484
+ // caller knows to split or trim instead of waiting for a silent hang.
1485
+ //
1486
+ // Preferred method: linear fit `TTFT ≈ α + β·prompt_tokens` over the
1487
+ // most recent PREFILL_SAMPLES_PER_MODEL (prompt_tokens, TTFT_ms) pairs.
1488
+ // Separates fixed per-request overhead from per-token prefill cost and
1489
+ // avoids the under-prediction a ratio-of-averages produces on inputs
1490
+ // much larger than the historical mean.
1491
+ const estimate = await estimatePrefill(combined.length, route.modelId);
1492
+ const isConfidentEstimate = estimate.basis === 'linear-fit' || estimate.basis === 'ratio';
1493
+ if (isConfidentEstimate && estimate.estimatedSeconds > PREFILL_REFUSE_THRESHOLD_SEC) {
1494
+ const estSec = Math.round(estimate.estimatedSeconds);
1495
+ const basisLine = estimate.basis === 'linear-fit'
1496
+ ? `• Estimator: linear fit — TTFT ≈ ${Math.round(estimate.fit.alphaMs)}ms + ${estimate.fit.betaMsPerToken.toFixed(2)}ms/token (n=${estimate.fit.n}, R²=${estimate.fit.r2.toFixed(2)})`
1497
+ : `• Estimator: ratio fallback — ~${Math.round(estimate.prefillTokPerSec)} tok/s (from ${lifetime.modelStats.get(route.modelId)?.ttftCalls ?? 0} prior calls; less accurate for inputs far from the historical mean)`;
1498
+ return {
1499
+ content: [{
1500
+ type: 'text',
1501
+ text: `Error: estimated prefill time exceeds the ~60s MCP client timeout.\n\n` +
1502
+ `• Input size: ~${estimate.inputTokens.toLocaleString()} tokens across ${successCount} file(s)\n` +
1503
+ `${basisLine}\n` +
1504
+ `• Estimated prefill: ~${estSec}s (threshold: ${PREFILL_REFUSE_THRESHOLD_SEC}s)\n\n` +
1505
+ `Options: split the files into smaller groups, trim the largest file, or use \`code_task\` with a focused excerpt. ` +
1506
+ `If you know this workstation can handle it, pass fewer files or run the task again when the measured rate improves.`,
1507
+ }],
1508
+ isError: true,
1509
+ };
1510
+ }
1511
+ if (estimate.estimatedSeconds > PREFILL_WARN_THRESHOLD_SEC) {
1512
+ const basisDetail = estimate.basis === 'linear-fit'
1513
+ ? `linear-fit n=${estimate.fit.n} R²=${estimate.fit.r2.toFixed(2)}`
1514
+ : estimate.basis;
1515
+ process.stderr.write(`[houtini-lm] Large input warning: ~${estimate.inputTokens} tokens, est prefill ~${Math.round(estimate.estimatedSeconds)}s (${basisDetail}). Proceeding.\n`);
1516
+ }
1158
1517
  const codeMessages = [
1159
1518
  {
1160
1519
  role: 'system',
@@ -1212,25 +1571,45 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1212
1571
  const primary = loaded[0] || models[0];
1213
1572
  const ctx = getContextLength(primary);
1214
1573
  const primaryProfile = await getModelProfileAsync(primary);
1215
- const totalOffloaded = session.promptTokens + session.completionTokens;
1216
- const sessionStats = session.calls > 0
1217
- ? `\n💰 Claude quota saved this session: ${totalOffloaded.toLocaleString()} tokens across ${session.calls} offloaded call${session.calls === 1 ? '' : 's'}`
1574
+ // Use sessionSummary() so discover matches the footer format and
1575
+ // automatically picks up the lifetime line when the SQLite cache has
1576
+ // cross-session data.
1577
+ const summary = sessionSummary();
1578
+ const sessionStats = session.calls > 0 || lifetime.totalCalls > 0
1579
+ ? `\n${summary}`
1218
1580
  : `\n💰 Claude quota saved this session: 0 tokens — no calls yet. Measured speed for each model will appear here after the first real call.`;
1219
1581
  // Measured speed line for the active model. Discover intentionally does
1220
1582
  // not run a synthetic warmup — speed is captured from real tasks, so the
1221
1583
  // numbers reflect actual workload rather than a contrived benchmark.
1584
+ // Shows session stats when this session has measured calls; otherwise
1585
+ // falls back to workstation lifetime stats so Claude sees historical
1586
+ // perf from call 1 instead of "not yet benchmarked".
1222
1587
  const primaryStats = session.modelStats.get(primary.id);
1588
+ const primaryLifetime = lifetime.modelStats.get(primary.id);
1223
1589
  let speedLine = '';
1224
1590
  if (primaryStats && primaryStats.perfCalls > 0) {
1225
1591
  const avgTtft = primaryStats.ttftCalls > 0 ? Math.round(primaryStats.totalTtftMs / primaryStats.ttftCalls) : 0;
1226
1592
  const avgTokSec = (primaryStats.totalTokPerSec / primaryStats.perfCalls).toFixed(1);
1227
- speedLine = `Measured speed: ${avgTokSec} tok/s · TTFT ${avgTtft}ms (avg over ${primaryStats.perfCalls} call${primaryStats.perfCalls === 1 ? '' : 's'} this session)\n`;
1593
+ speedLine = `Measured speed (session): ${avgTokSec} tok/s · TTFT ${avgTtft}ms (${primaryStats.perfCalls} call${primaryStats.perfCalls === 1 ? '' : 's'})\n`;
1594
+ if (primaryLifetime && primaryLifetime.perfCalls > primaryStats.perfCalls) {
1595
+ const lAvgTtft = primaryLifetime.ttftCalls > 0 ? Math.round(primaryLifetime.totalTtftMs / primaryLifetime.ttftCalls) : 0;
1596
+ const lAvgTokSec = (primaryLifetime.totalTokPerSec / primaryLifetime.perfCalls).toFixed(1);
1597
+ speedLine += `Measured speed (lifetime on this workstation): ${lAvgTokSec} tok/s · TTFT ${lAvgTtft}ms (${primaryLifetime.perfCalls} calls)\n`;
1598
+ }
1599
+ }
1600
+ else if (primaryLifetime && primaryLifetime.perfCalls > 0) {
1601
+ const lAvgTtft = primaryLifetime.ttftCalls > 0 ? Math.round(primaryLifetime.totalTtftMs / primaryLifetime.ttftCalls) : 0;
1602
+ const lAvgTokSec = (primaryLifetime.totalTokPerSec / primaryLifetime.perfCalls).toFixed(1);
1603
+ speedLine = `Measured speed (lifetime on this workstation): ${lAvgTokSec} tok/s · TTFT ${lAvgTtft}ms (${primaryLifetime.perfCalls} calls, last used ${new Date(primaryLifetime.lastUsedAt).toISOString().slice(0, 10)})\n`;
1228
1604
  }
1229
1605
  else {
1230
1606
  speedLine = `Measured speed: not yet benchmarked — will be captured on the first real call.\n`;
1231
1607
  }
1608
+ const backendLabel = getBackend() === 'lmstudio' ? 'LM Studio'
1609
+ : getBackend() === 'ollama' ? 'Ollama'
1610
+ : 'OpenAI-compatible';
1232
1611
  let text = `Status: ONLINE\n` +
1233
- `Endpoint: ${LM_BASE_URL}\n` +
1612
+ `Endpoint: ${LM_BASE_URL} (${backendLabel})\n` +
1234
1613
  `Connection latency: ${ms}ms (does not reflect inference speed)\n` +
1235
1614
  `Active model: ${primary.id}\n` +
1236
1615
  `Context window: ${ctx.toLocaleString()} tokens\n` +
@@ -1261,6 +1640,20 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1261
1640
  text += ` ${modelId}: ${stats.calls} calls, avg TTFT ${avgTtft}ms, avg ${avgTokSec} tok/s\n`;
1262
1641
  }
1263
1642
  }
1643
+ // Workstation lifetime stats — built from SQLite, persists across restarts.
1644
+ // Only shown when there's lifetime data beyond this session, so a first-run
1645
+ // user doesn't see a duplicate of the session block above.
1646
+ const hasLifetimeBeyondSession = Array.from(lifetime.modelStats.entries())
1647
+ .some(([id, l]) => l.calls > (session.modelStats.get(id)?.calls ?? 0));
1648
+ if (hasLifetimeBeyondSession) {
1649
+ text += `\nPerformance (lifetime on this workstation):\n`;
1650
+ for (const [modelId, stats] of lifetime.modelStats) {
1651
+ const avgTtft = stats.ttftCalls > 0 ? Math.round(stats.totalTtftMs / stats.ttftCalls) : 0;
1652
+ const avgTokSec = stats.perfCalls > 0 ? (stats.totalTokPerSec / stats.perfCalls).toFixed(1) : '?';
1653
+ const lastUsed = new Date(stats.lastUsedAt).toISOString().slice(0, 10);
1654
+ text += ` ${modelId}: ${stats.calls} calls, avg TTFT ${avgTtft}ms, avg ${avgTokSec} tok/s (last used ${lastUsed})\n`;
1655
+ }
1656
+ }
1264
1657
  text += `${sessionStats}\n\n`;
1265
1658
  text += `The local LLM is available. You can delegate tasks using chat, custom_prompt, code_task, code_task_files, or embed.`;
1266
1659
  return { content: [{ type: 'text', text }] };
@@ -1318,6 +1711,89 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1318
1711
  };
1319
1712
  });
1320
1713
  }
1714
+ case 'stats': {
1715
+ const { model: filterModel } = args;
1716
+ const backendLabel = getBackend() === 'lmstudio' ? 'LM Studio'
1717
+ : getBackend() === 'ollama' ? 'Ollama'
1718
+ : 'OpenAI-compatible';
1719
+ const lines = [];
1720
+ lines.push(`## Houtini LM stats`);
1721
+ lines.push('');
1722
+ lines.push(`**Endpoint**: ${LM_BASE_URL} (${backendLabel})`);
1723
+ if (lifetime.firstSeenAt) {
1724
+ lines.push(`**First call on this workstation**: ${new Date(lifetime.firstSeenAt).toISOString().slice(0, 10)}`);
1725
+ }
1726
+ lines.push('');
1727
+ // Totals block
1728
+ lines.push(`### Totals`);
1729
+ lines.push('');
1730
+ lines.push(`| Scope | Calls | Prompt tokens | Completion tokens | Total tokens |`);
1731
+ lines.push(`|----------|------:|--------------:|------------------:|-------------:|`);
1732
+ lines.push(`| Session | ${session.calls} | ${session.promptTokens.toLocaleString()} | ${session.completionTokens.toLocaleString()} | ${(session.promptTokens + session.completionTokens).toLocaleString()} |`);
1733
+ lines.push(`| Lifetime | ${lifetime.totalCalls} | — | — | ${lifetime.totalTokens.toLocaleString()} |`);
1734
+ lines.push('');
1735
+ // Per-model block (union of session + lifetime model ids)
1736
+ const modelIds = new Set([
1737
+ ...session.modelStats.keys(),
1738
+ ...lifetime.modelStats.keys(),
1739
+ ]);
1740
+ const filtered = filterModel ? [...modelIds].filter((m) => m === filterModel) : [...modelIds];
1741
+ if (filtered.length > 0) {
1742
+ lines.push(`### Per-model performance`);
1743
+ lines.push('');
1744
+ lines.push(`| Model | Scope | Calls | Avg TTFT (ms) | Avg tok/s | Prompt tokens | Last used |`);
1745
+ lines.push(`|-------|-------|------:|--------------:|----------:|--------------:|-----------|`);
1746
+ for (const modelId of filtered.sort()) {
1747
+ const s = session.modelStats.get(modelId);
1748
+ const l = lifetime.modelStats.get(modelId);
1749
+ if (s) {
1750
+ const avgTtft = s.ttftCalls > 0 ? Math.round(s.totalTtftMs / s.ttftCalls) : '—';
1751
+ const avgTokSec = s.perfCalls > 0 ? (s.totalTokPerSec / s.perfCalls).toFixed(1) : '—';
1752
+ lines.push(`| ${modelId} | session | ${s.calls} | ${avgTtft} | ${avgTokSec} | — | — |`);
1753
+ }
1754
+ if (l) {
1755
+ const avgTtft = l.ttftCalls > 0 ? Math.round(l.totalTtftMs / l.ttftCalls) : '—';
1756
+ const avgTokSec = l.perfCalls > 0 ? (l.totalTokPerSec / l.perfCalls).toFixed(1) : '—';
1757
+ const lastUsed = new Date(l.lastUsedAt).toISOString().slice(0, 10);
1758
+ lines.push(`| ${modelId} | lifetime | ${l.calls} | ${avgTtft} | ${avgTokSec} | ${l.totalPromptTokens.toLocaleString()} | ${lastUsed} |`);
1759
+ }
1760
+ }
1761
+ lines.push('');
1762
+ }
1763
+ else if (filterModel) {
1764
+ lines.push(`No history for model: \`${filterModel}\`. Try \`list_models\` to see what's been used.`);
1765
+ lines.push('');
1766
+ }
1767
+ else {
1768
+ lines.push(`No calls yet — delegate a task via \`chat\`, \`custom_prompt\`, \`code_task\`, or \`code_task_files\` to start building stats.`);
1769
+ lines.push('');
1770
+ }
1771
+ // Reasoning-token diagnostic (lifetime only — needs persistence to be meaningful)
1772
+ if (!filterModel) {
1773
+ // Sum reasoning tokens across all models. We store this per-model
1774
+ // in SQLite but not in the in-memory mirror, so fetch on demand.
1775
+ try {
1776
+ const rows = await getAllPerformance();
1777
+ const totalReasoning = rows.reduce((sum, r) => sum + (r.totalReasoningTokens || 0), 0);
1778
+ const totalCompletion = rows.reduce((sum, r) => sum + r.totalCompletionTokens, 0);
1779
+ if (totalCompletion > 0) {
1780
+ const pct = ((totalReasoning / totalCompletion) * 100).toFixed(1);
1781
+ lines.push(`### Reasoning-token overhead (lifetime)`);
1782
+ lines.push('');
1783
+ lines.push(`${totalReasoning.toLocaleString()} / ${totalCompletion.toLocaleString()} completion tokens spent on hidden reasoning (${pct}% of generation budget). ` +
1784
+ (parseFloat(pct) > 30
1785
+ ? `**High** — consider loading a non-thinking model, or check that \`reasoning_effort\` is being honoured (see stderr logs).`
1786
+ : parseFloat(pct) > 10
1787
+ ? `Moderate — normal for thinking-model families.`
1788
+ : `Low — reasoning is effectively suppressed.`));
1789
+ lines.push('');
1790
+ }
1791
+ }
1792
+ catch { /* best-effort — don't fail the tool call */ }
1793
+ }
1794
+ lines.push(`*Stats persist across restarts in \`~/.houtini-lm/model-cache.db\`.*`);
1795
+ return { content: [{ type: 'text', text: lines.join('\n') }] };
1796
+ }
1321
1797
  default:
1322
1798
  throw new Error(`Unknown tool: ${name}`);
1323
1799
  }
@@ -1338,6 +1814,10 @@ async function main() {
1338
1814
  listModelsRaw()
1339
1815
  .then((models) => profileModelsAtStartup(models))
1340
1816
  .catch((err) => process.stderr.write(`[houtini-lm] Startup profiling skipped: ${err}\n`));
1817
+ // Hydrate the in-memory lifetime mirror from SQLite so the very first
1818
+ // tool call this session shows historical savings + per-model perf.
1819
+ // Non-blocking too; the footer degrades to session-only if this fails.
1820
+ hydrateLifetimeFromDb().catch((err) => process.stderr.write(`[houtini-lm] Lifetime hydration skipped: ${err}\n`));
1341
1821
  }
1342
1822
  main().catch((error) => {
1343
1823
  process.stderr.write(`Fatal error: ${error}\n`);