@houtini/lm 2.9.0 → 2.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +108 -17
- package/dist/index.js +639 -146
- package/dist/index.js.map +1 -1
- package/dist/model-cache.d.ts +54 -0
- package/dist/model-cache.js +205 -11
- package/dist/model-cache.js.map +1 -1
- package/package.json +2 -1
- package/server.json +2 -2
package/dist/index.js
CHANGED
@@ -8,7 +8,7 @@
 import { Server } from '@modelcontextprotocol/sdk/server/index.js';
 import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
 import { CallToolRequestSchema, ListToolsRequestSchema, ListResourcesRequestSchema, ReadResourceRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
-import { profileModelsAtStartup, getCachedProfile, toModelProfile as cachedToProfile, getHFEnrichmentLine, getPromptHints, getThinkingSupport, } from './model-cache.js';
+import { profileModelsAtStartup, getCachedProfile, toModelProfile as cachedToProfile, getHFEnrichmentLine, getPromptHints, getThinkingSupport, recordPerformance, getAllPerformance, getLifetimeTotals, } from './model-cache.js';
 import { readFile } from 'node:fs/promises';
 import { isAbsolute, basename } from 'node:path';
 const LM_BASE_URL = process.env.LM_STUDIO_URL || 'http://localhost:1234';
@@ -19,7 +19,9 @@ const DEFAULT_TEMPERATURE = 0.3;
 const CONNECT_TIMEOUT_MS = 5000;
 const INFERENCE_CONNECT_TIMEOUT_MS = 30_000; // generous connect timeout for inference
 const SOFT_TIMEOUT_MS = 300_000; // 5 min — progress notifications reset MCP client timeout, so this is a safety net not the primary limit
-const READ_CHUNK_TIMEOUT_MS = 30_000; // max wait for a single SSE chunk
+const READ_CHUNK_TIMEOUT_MS = 30_000; // max wait for a single SSE chunk mid-stream
+const PREFILL_TIMEOUT_MS = 180_000; // max wait for the FIRST chunk — prompt prefill on slow hardware with big inputs can legitimately take 1-2 min
+const PREFILL_KEEPALIVE_MS = 10_000; // fire a progress notification every N ms while waiting for prefill to finish
 const FALLBACK_CONTEXT_LENGTH = parseInt(process.env.LM_CONTEXT_WINDOW || '100000', 10);
 // ── Session-level token accounting ───────────────────────────────────
 // Tracks cumulative tokens offloaded to the local LLM across all calls
@@ -32,37 +34,145 @@ const session = {
     /** Per-model performance tracking for routing insights */
     modelStats: new Map(),
 };
+// Lifetime mirror — kept in sync with the SQLite `model_performance` table
+// so the footer/discover path stays synchronous. Hydrated once at startup
+// from `getAllPerformance()`, then updated in-memory alongside every DB
+// write in `recordUsage`. Also updated after the async DB write completes
+// so counters can only ever run a tick behind, never ahead.
+const lifetime = {
+    totalCalls: 0,
+    totalTokens: 0,
+    modelsUsed: 0,
+    firstSeenAt: null,
+    /** Per-model lifetime stats — same shape as session.modelStats for easy formatting. */
+    modelStats: new Map(),
+};
+async function hydrateLifetimeFromDb() {
+    try {
+        const totals = await getLifetimeTotals();
+        lifetime.totalCalls = totals.totalCalls;
+        lifetime.totalTokens = totals.totalTokens;
+        lifetime.modelsUsed = totals.modelsUsed;
+        lifetime.firstSeenAt = totals.firstSeenAt;
+        const rows = await getAllPerformance();
+        lifetime.modelStats.clear();
+        for (const r of rows) {
+            lifetime.modelStats.set(r.modelId, {
+                calls: r.totalCalls,
+                ttftCalls: r.ttftCalls,
+                perfCalls: r.perfCalls,
+                totalTtftMs: r.totalTtftMs,
+                totalTokPerSec: r.totalTokPerSec,
+                totalPromptTokens: r.totalPromptTokens,
+                firstSeenAt: r.firstSeenAt,
+                lastUsedAt: r.lastUsedAt,
+            });
+        }
+    }
+    catch (err) {
+        process.stderr.write(`[houtini-lm] Lifetime hydration failed (stats will build from this session): ${err}\n`);
+    }
+}
 function recordUsage(resp) {
     session.calls++;
+    const promptTokens = resp.usage?.prompt_tokens ?? 0;
+    let completionTokens = resp.usage?.completion_tokens ?? 0;
+    const reasoningTokens = resp.usage?.completion_tokens_details?.reasoning_tokens ?? 0;
     if (resp.usage) {
-        session.promptTokens +=
-        session.completionTokens +=
+        session.promptTokens += promptTokens;
+        session.completionTokens += completionTokens;
     }
     else if (resp.content.length > 0) {
         // Estimate when usage is missing (truncated responses)
-
+        const est = Math.ceil(resp.content.length / 4);
+        completionTokens = est;
+        session.completionTokens += est;
     }
-    //
+    // Tok/s used by both session and lifetime stats
+    const tokPerSec = resp.usage && resp.generationMs > 50
+        ? (resp.usage.completion_tokens / (resp.generationMs / 1000))
+        : 0;
+    // Session per-model (unchanged behaviour)
     if (resp.model) {
-        const existing = session.modelStats.get(resp.model) || { calls: 0, perfCalls: 0, totalTtftMs: 0, totalTokPerSec: 0 };
+        const existing = session.modelStats.get(resp.model) || { calls: 0, ttftCalls: 0, perfCalls: 0, totalTtftMs: 0, totalTokPerSec: 0 };
         existing.calls++;
-        if (resp.ttftMs)
+        if (resp.ttftMs) {
             existing.totalTtftMs += resp.ttftMs;
-
-
-            : 0;
+            existing.ttftCalls++;
+        }
         if (tokPerSec > 0) {
             existing.perfCalls++;
             existing.totalTokPerSec += tokPerSec;
         }
         session.modelStats.set(resp.model, existing);
     }
+    // Lifetime mirror + SQLite write — fire-and-forget so a DB hiccup can't
+    // stall a tool response. The in-memory mirror is updated synchronously so
+    // the footer and discover output reflect this call immediately.
+    if (resp.model && (promptTokens > 0 || completionTokens > 0)) {
+        const now = Date.now();
+        const wasFirstEver = !lifetime.modelStats.has(resp.model);
+        const lExisting = lifetime.modelStats.get(resp.model) || {
+            calls: 0, ttftCalls: 0, perfCalls: 0, totalTtftMs: 0, totalTokPerSec: 0, totalPromptTokens: 0,
+            firstSeenAt: now, lastUsedAt: now,
+        };
+        lExisting.calls++;
+        if (resp.ttftMs) {
+            lExisting.totalTtftMs += resp.ttftMs;
+            lExisting.ttftCalls++;
+        }
+        if (tokPerSec > 0) {
+            lExisting.perfCalls++;
+            lExisting.totalTokPerSec += tokPerSec;
+        }
+        lExisting.totalPromptTokens += promptTokens;
+        lExisting.lastUsedAt = now;
+        lifetime.modelStats.set(resp.model, lExisting);
+        lifetime.totalCalls++;
+        lifetime.totalTokens += promptTokens + completionTokens;
+        if (wasFirstEver) {
+            lifetime.modelsUsed++;
+            if (lifetime.firstSeenAt === null)
+                lifetime.firstSeenAt = now;
+        }
+        recordPerformance(resp.model, {
+            ttftMs: resp.ttftMs,
+            tokPerSec: tokPerSec > 0 ? tokPerSec : undefined,
+            promptTokens,
+            completionTokens,
+            reasoningTokens,
+        }).catch((err) => {
+            process.stderr.write(`[houtini-lm] Performance write failed (continuing): ${err}\n`);
+        });
+    }
 }
 function sessionSummary() {
     const total = session.promptTokens + session.completionTokens;
-    if (session.calls === 0)
+    if (session.calls === 0 && lifetime.totalCalls === 0)
         return '';
-
+    const callWord = (n) => (n === 1 ? 'call' : 'calls');
+    const sessionPart = session.calls > 0
+        ? `this session: ${total.toLocaleString()} tokens / ${session.calls} ${callWord(session.calls)}`
+        : 'this session: 0 tokens';
+    // Lifetime numbers only show once there's something in the DB — avoids a
+    // confusing "lifetime: 0" on a truly fresh install.
+    if (lifetime.totalCalls > 0) {
+        return `💰 Claude quota saved — ${sessionPart} · lifetime: ${lifetime.totalTokens.toLocaleString()} tokens / ${lifetime.totalCalls} ${callWord(lifetime.totalCalls)}`;
+    }
+    return `💰 Claude quota saved ${sessionPart}`;
+}
+/**
+ * Return true when this response is the first one with measurable perf stats
+ * for its model in the current session. Used to surface a one-off "benchmarked"
+ * line so Claude sees the real speed of the local model on a genuine task,
+ * not an artificial warmup.
+ */
+function isFirstBenchmarkedCall(modelId, tokPerSec) {
+    if (!modelId || tokPerSec <= 0)
+        return false;
+    const stats = session.modelStats.get(modelId);
+    // After recordUsage has run, perfCalls === 1 means this was the first measured call.
+    return !!stats && stats.perfCalls === 1;
 }
 function apiHeaders() {
     const h = { 'Content-Type': 'application/json' };
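The mirror stores running sums (`totalTtftMs`, `totalTokPerSec`) rather than averages, so every consumer derives the averages at read time. A minimal sketch of that derivation; the `summarise` helper here is illustrative and not part of the package:

```js
// Illustrative helper (not in @houtini/lm): derive display averages from the
// accumulated counters kept in session.modelStats / lifetime.modelStats.
function summarise(stats) {
    return {
        calls: stats.calls,
        avgTtftMs: stats.ttftCalls > 0 ? Math.round(stats.totalTtftMs / stats.ttftCalls) : null,
        avgTokPerSec: stats.perfCalls > 0 ? +(stats.totalTokPerSec / stats.perfCalls).toFixed(1) : null,
    };
}
// { calls: 12, ttftCalls: 10, perfCalls: 11, totalTtftMs: 8200, totalTokPerSec: 305.8, ... }
// → { calls: 12, avgTtftMs: 820, avgTokPerSec: 27.8 }
```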
@@ -365,6 +475,10 @@ async function chatCompletionStreamingInner(messages, options = {}) {
         messages,
         temperature: options.temperature ?? DEFAULT_TEMPERATURE,
         max_tokens: effectiveMaxTokens,
+        // Send max_completion_tokens alongside max_tokens for OpenAI reasoning-model
+        // compatibility (OpenAI spec distinguishes total generation cap from visible
+        // output cap). Backends that don't understand it ignore unknown fields.
+        max_completion_tokens: effectiveMaxTokens,
         stream: true,
         stream_options: { include_usage: true },
     };
@@ -375,23 +489,36 @@ async function chatCompletionStreamingInner(messages, options = {}) {
         body.response_format = options.responseFormat;
     }
     // Handle thinking/reasoning models.
-    // Some models (Gemma 4, Qwen3, DeepSeek) have extended
-    // part of the max_tokens budget for invisible reasoning
-    //
-    //
-    //
+    // Some models (Gemma 4, Qwen3, DeepSeek R1, Nemotron, gpt-oss) have extended
+    // thinking that consumes part of the max_tokens budget for invisible reasoning
+    // before producing content. Strategy:
+    //   1. reasoning_effort=<family-specific value> to minimise reasoning
+    //   2. enable_thinking:false — Qwen3 vendor param (ignored elsewhere)
+    //   3. inflate max_tokens 4× — safety net when both flags are ignored
+    //      (e.g. Gemma 4 hardcodes enable_thinking=true in its Jinja template)
+    //
+    // IMPORTANT: reasoning_effort values are NOT standard. OpenAI/gpt-oss use
+    // 'low'|'medium'|'high'; Ollama adds 'none'; LM Studio's Nemotron adapter
+    // only accepts 'on'|'off'. Sending 'low' to Nemotron causes LM Studio to
+    // silently fall back to 'on' — maximising reasoning, the OPPOSITE of intent.
+    // Hence the family-specific mapping below. When uncertain, we omit the
+    // field entirely rather than risk a bad-value fallback.
     const modelId = (options.model || LM_MODEL || '').toString();
     if (modelId) {
         const thinking = await getThinkingSupport(modelId);
         if (thinking?.supportsThinkingToggle) {
             body.enable_thinking = false;
-
-
-
-
-
-
-
+            const reasoningValue = getReasoningEffortValue(modelId);
+            if (reasoningValue !== null) {
+                body.reasoning_effort = reasoningValue;
+            }
+            // Inflation uses effectiveMaxTokens (the context-aware value), not
+            // DEFAULT_MAX_TOKENS — otherwise big-context models get sized down.
+            const beforeInflation = effectiveMaxTokens;
+            const inflated = Math.max(beforeInflation * 4, beforeInflation + 2000);
+            body.max_tokens = inflated;
+            body.max_completion_tokens = inflated;
+            process.stderr.write(`[houtini-lm] Thinking model ${modelId}: reasoning_effort=${reasoningValue ?? '(omitted)'}, enable_thinking=false, max_tokens inflated ${beforeInflation} → ${inflated}\n`);
         }
     }
     const startTime = Date.now();
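The inflation formula guards both ends of the scale: the 4× multiplier dominates for normal budgets, while the +2000 floor protects tiny ones. Worked through with illustrative inputs:

```js
// The inflation applied above: Math.max(n * 4, n + 2000).
const inflate = (n) => Math.max(n * 4, n + 2000);
inflate(16384); // 65536 — 4× wins for normal budgets
inflate(500);   // 2500  — the +2000 floor wins for tiny budgets
```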
@@ -406,13 +533,39 @@ async function chatCompletionStreamingInner(messages, options = {}) {
     const reader = res.body.getReader();
     const decoder = new TextDecoder();
     let content = '';
-    let
+    let reasoning = '';
+    let progressSeq = 0;
     let model = '';
     let usage;
     let finishReason = '';
     let truncated = false;
+    let prefillStall = false;
     let buffer = '';
     let ttftMs;
+    let firstChunkReceived = false;
+    // Prefill keep-alive — /v1/chat/completions gives no SSE events during
+    // prompt processing, so the MCP client clock ticks uninterrupted on a slow
+    // backend with a big input. Fire a progress notification every 10s until
+    // the first chunk arrives to keep the client from timing out at 60s.
+    const sendProgress = (message) => {
+        if (options.progressToken === undefined)
+            return;
+        progressSeq++;
+        server.notification({
+            method: 'notifications/progress',
+            params: {
+                progressToken: options.progressToken,
+                progress: progressSeq,
+                message,
+            },
+        }).catch(() => { });
+    };
+    const keepAliveTimer = setInterval(() => {
+        if (firstChunkReceived)
+            return;
+        const waitedMs = Date.now() - startTime;
+        sendProgress(`Waiting for model... (${(waitedMs / 1000).toFixed(0)}s, still in prefill)`);
+    }, PREFILL_KEEPALIVE_MS);
     try {
         while (true) {
             // Check soft timeout before each read
@@ -422,17 +575,24 @@ async function chatCompletionStreamingInner(messages, options = {}) {
                 process.stderr.write(`[houtini-lm] Soft timeout at ${elapsed}ms, returning ${content.length} chars of partial content\n`);
                 break;
             }
-            //
+            // Split prefill vs mid-stream timeouts. Prefill on slow hardware with
+            // a 7k-token input can legitimately take 1-2 min; mid-stream stalls
+            // should surface much faster. Track firstChunkReceived to switch.
             const remaining = SOFT_TIMEOUT_MS - elapsed;
-            const
+            const perChunkCeiling = firstChunkReceived ? READ_CHUNK_TIMEOUT_MS : PREFILL_TIMEOUT_MS;
+            const chunkTimeout = Math.min(perChunkCeiling, remaining);
             const result = await timedRead(reader, chunkTimeout);
             if (result === 'timeout') {
                 truncated = true;
-
+                prefillStall = !firstChunkReceived;
+                process.stderr.write(`[houtini-lm] ${prefillStall ? 'Prefill' : 'Mid-stream'} timeout, returning ${content.length} chars of partial content\n`);
                 break;
             }
             if (result.done)
                 break;
+            if (!firstChunkReceived) {
+                firstChunkReceived = true;
+            }
             buffer += decoder.decode(result.value, { stream: true });
             // Parse SSE lines
             const lines = buffer.split('\n');
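`timedRead` itself is not shown anywhere in this diff. A plausible sketch, inferred only from its call sites above (it resolves to the literal string `'timeout'` or to the reader's `{ done, value }` result); the shipped implementation may differ:

```js
// Assumed shape of timedRead, inferred from its call sites; not part of this diff.
async function timedRead(reader, timeoutMs) {
    let timer;
    const timeout = new Promise((resolve) => {
        timer = setTimeout(() => resolve('timeout'), timeoutMs);
    });
    try {
        // On timeout the pending read is abandoned, not aborted; the finally
        // block's reader.cancel() below is what actually frees the stream.
        return await Promise.race([reader.read(), timeout]);
    }
    finally {
        clearTimeout(timer);
    }
}
```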
@@ -448,41 +608,20 @@ async function chatCompletionStreamingInner(messages, options = {}) {
                     if (json.model)
                         model = json.model;
                     const delta = json.choices?.[0]?.delta;
-                    //
-                    //
-                    //
-                    //
-
-
-
-
-                            method: 'notifications/progress',
-                            params: {
-                                progressToken: options.progressToken,
-                                progress: chunkCount,
-                                message: `Thinking... (${chunkCount} chunks)`,
-                            },
-                        }).catch(() => { });
-                    }
+                    // Reasoning channel. LM Studio (with "Separate reasoning_content"
+                    // dev setting), DeepSeek R1, Ollama OpenAI-compat, Nemotron etc.
+                    // stream reasoning via delta.reasoning_content — we MUST capture it
+                    // so the safety net below can return something when the model
+                    // burns its entire budget before emitting a single content token.
+                    if (typeof delta?.reasoning_content === 'string' && delta.reasoning_content.length > 0) {
+                        reasoning += delta.reasoning_content;
+                        sendProgress(`Thinking... (${reasoning.length} chars of reasoning)`);
                     }
-                    if (delta?.content) {
+                    if (typeof delta?.content === 'string' && delta.content.length > 0) {
                         if (ttftMs === undefined)
                             ttftMs = Date.now() - startTime;
                         content += delta.content;
-
-                        // Send progress notification to reset MCP client timeout.
-                        // Each notification resets the 60s clock, giving slow models
-                        // unlimited time as long as they're actively generating.
-                        if (options.progressToken !== undefined) {
-                            server.notification({
-                                method: 'notifications/progress',
-                                params: {
-                                    progressToken: options.progressToken,
-                                    progress: chunkCount,
-                                    message: `Streaming... ${content.length} chars`,
-                                },
-                            }).catch(() => { });
-                        }
+                        sendProgress(`Streaming... ${content.length} chars`);
                     }
                     const reason = json.choices?.[0]?.finish_reason;
                     if (reason)
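For reference, the delta shapes this parser distinguishes, as illustrative SSE events in the OpenAI streaming format (payloads abbreviated, field layout is the standard one):

```js
// Illustrative SSE events — reasoning and visible content arrive on separate channels:
// data: {"choices":[{"delta":{"reasoning_content":"Checking the loop bounds..."}}]}
// data: {"choices":[{"delta":{"content":"The bug is an off-by-one at the array tail."}}]}
// data: {"choices":[{"delta":{},"finish_reason":"stop"}],"usage":{...}}
```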
@@ -506,7 +645,10 @@ async function chatCompletionStreamingInner(messages, options = {}) {
                     if (json.model)
                         model = json.model;
                     const delta = json.choices?.[0]?.delta;
-                    if (delta?.
+                    if (typeof delta?.reasoning_content === 'string' && delta.reasoning_content.length > 0) {
+                        reasoning += delta.reasoning_content;
+                    }
+                    if (typeof delta?.content === 'string' && delta.content.length > 0) {
                         if (ttftMs === undefined)
                             ttftMs = Date.now() - startTime;
@@ -525,50 +667,117 @@ async function chatCompletionStreamingInner(messages, options = {}) {
         }
     }
     finally {
-
-
+        clearInterval(keepAliveTimer);
+        // Best-effort cancel with a short timeout — cancel() can hang if the upstream
+        // connection is wedged, so we race it against a 500ms timer. This frees the
+        // underlying socket sooner on abrupt client disconnects without blocking the
+        // tool response path.
+        try {
+            await Promise.race([
+                reader.cancel().catch(() => { }),
+                new Promise((resolve) => setTimeout(resolve, 500)),
+            ]);
+        }
+        catch { /* never propagate cleanup errors */ }
+        try {
+            reader.releaseLock();
+        }
+        catch { /* already released */ }
     }
     const generationMs = Date.now() - startTime;
     // Strip <think>...</think> reasoning blocks from models that always emit them
-    // (e.g. GLM Flash
-    // Handle both closed
-    // or grammar-constrained output forced content before the closing tag).
+    // inline on the content channel (e.g. GLM Flash). Claude doesn't need the
+    // model's internal reasoning. Handle both closed and unclosed blocks.
     let cleanContent = content.replace(/<think>[\s\S]*?<\/think>\s*/g, ''); // closed blocks
     cleanContent = cleanContent.replace(/^<think>\s*/, ''); // orphaned opening tag
     cleanContent = cleanContent.trim();
-    // Safety
-    //
-    //
-    //
-    //
+    // Safety nets for empty visible output. Try in order:
+    // 1. thinkStripFallback: stripping <think> left nothing, but raw content had text
+    // 2. reasoningFallback: no visible content AT ALL, but reasoning_content was streamed
+    //    (this is the Nemotron/DeepSeek-R1/LM-Studio-dev-toggle case — previously
+    //    produced silent empty bodies because reasoning was discarded)
     let thinkStripFallback = false;
-
-
-
+    let reasoningFallback = false;
+    if (!cleanContent) {
+        if (content.trim()) {
+            thinkStripFallback = true;
+            cleanContent = content.trim();
+        }
+        else if (reasoning.trim()) {
+            reasoningFallback = true;
+            cleanContent =
+                '[No visible output — the model spent its entire output budget on reasoning_content before emitting any content. ' +
+                    'Raw reasoning below so you can see what it was doing:]\n\n' +
+                    reasoning.trim();
+        }
     }
-    return {
+    return {
+        content: cleanContent,
+        rawContent: content,
+        reasoningContent: reasoning || undefined,
+        model,
+        usage,
+        finishReason,
+        truncated,
+        ttftMs,
+        generationMs,
+        thinkStripFallback,
+        reasoningFallback,
+        prefillStall,
+    };
+}
+let detectedBackend = null;
+function getBackend() {
+    return detectedBackend ?? 'openai-compat';
 }
 /**
- * Fetch models
- *
+ * Fetch models with backend-aware probing.
+ * 1. LM Studio /api/v0/models — richest metadata, sets backend='lmstudio'
+ * 2. Ollama /api/tags — native list, sets backend='ollama', maps to ModelInfo
+ * 3. OpenAI-compatible /v1/models — generic fallback (DeepSeek, vLLM, llama.cpp, OpenRouter)
 */
 async function listModelsRaw() {
-    // Try v0 API first — returns type, arch, publisher, quantization, state
+    // Try LM Studio's v0 API first — returns type, arch, publisher, quantization, state
     try {
         const v0 = await fetchWithTimeout(`${LM_BASE_URL}/api/v0/models`, { headers: apiHeaders() });
         if (v0.ok) {
             const data = (await v0.json());
+            detectedBackend = 'lmstudio';
             return data.data;
         }
     }
     catch {
-        // v0 not available — fall through
+        // v0 not available — fall through
     }
-    //
+    // Try Ollama's /api/tags next. Shape differs from OpenAI: returns
+    // { models: [{ name, model, size, details: { family, parameter_size, ... } }] }
+    try {
+        const tags = await fetchWithTimeout(`${LM_BASE_URL}/api/tags`, { headers: apiHeaders() });
+        if (tags.ok) {
+            const data = (await tags.json());
+            if (Array.isArray(data.models)) {
+                detectedBackend = 'ollama';
+                return data.models.map((m) => ({
+                    id: m.name,
+                    object: 'model',
+                    type: 'llm',
+                    arch: m.details?.family,
+                    quantization: m.details?.quantization_level,
+                    state: 'loaded', // Ollama loads on-demand; treat all listed as available
+                    publisher: m.name.includes('/') ? m.name.split('/')[0] : undefined,
+                }));
+            }
+        }
+    }
+    catch {
+        // Not Ollama — fall through
+    }
+    // Fallback: OpenAI-compatible v1 endpoint (DeepSeek, vLLM, llama.cpp, OpenRouter)
     const res = await fetchWithTimeout(`${LM_BASE_URL}/v1/models`, { headers: apiHeaders() });
     if (!res.ok)
         throw new Error(`Failed to list models: ${res.status}`);
     const data = (await res.json());
+    detectedBackend = 'openai-compat';
     return data.data;
 }
 function getContextLength(model) {
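To make the Ollama branch concrete, here is an illustrative `/api/tags` entry and the object the mapping above produces from it (values invented for illustration):

```js
// Illustrative /api/tags entry (values invented):
const entry = {
    name: 'qwen2.5-coder:7b',
    details: { family: 'qwen2', parameter_size: '7.6B', quantization_level: 'Q4_K_M' },
};
// The map above yields:
// { id: 'qwen2.5-coder:7b', object: 'model', type: 'llm', arch: 'qwen2',
//   quantization: 'Q4_K_M', state: 'loaded', publisher: undefined }
```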
@@ -579,6 +788,81 @@ function getContextLength(model) {
 function getMaxContextLength(model) {
     return model.max_context_length;
 }
+/**
+ * Map model family / backend → reasoning_effort value that minimises reasoning.
+ *
+ * The `reasoning_effort` field exists across OpenAI, Ollama, LM Studio and
+ * DeepSeek, but the accepted values differ per vendor. Verified empirically
+ * from the LM Studio error response: "Supported values: none, minimal, low,
+ * medium, high, xhigh" (that's the set the LM Studio adapter accepts).
+ *
+ * OpenAI (gpt-5, o-series) : 'low' | 'medium' | 'high' (spec)
+ * Ollama                   : 'low' | 'medium' | 'high' | 'none'
+ * LM Studio (all models)   : 'none' | 'minimal' | 'low' | 'medium' | 'high' | 'xhigh'
+ *
+ * We want the HARDEST off-switch we can portably send:
+ *   - LM Studio / Ollama: 'none' (no reasoning budget at all)
+ *   - Generic OpenAI-compat: 'low' (OpenAI's minimum, safe to send)
+ *
+ * An unsupported value is a hard 400 error on LM Studio (not a silent
+ * fallback), so this function is conservative — it returns null for
+ * unknown backends and we omit the field rather than risk a 400.
+ */
+function getReasoningEffortValue(_modelId) {
+    const backend = getBackend();
+    // LM Studio accepts 'none' as an explicit reasoning-off switch for
+    // every thinking model (Nemotron, DeepSeek R1, Gemma 4, gpt-oss, ...).
+    if (backend === 'lmstudio')
+        return 'none';
+    // Ollama likewise documents 'none' as valid.
+    if (backend === 'ollama')
+        return 'none';
+    // Generic OpenAI-compatible — 'low' is the minimum OpenAI accepts per spec.
+    // DeepSeek's own API treats 'low' as minimum too.
+    return 'low';
+}
+/** Rough chars→tokens ratio used for pre-flight estimates. Matches the ratio
+ * we already use to estimate completion_tokens when usage is missing. */
+const CHARS_PER_TOKEN = 4;
+/** Conservative default prefill rate when no per-model measurement exists.
+ * Slower than real hardware so we err toward letting the call run — a false
+ * refusal is much worse than a false-ok that eventually times out. */
+const DEFAULT_PREFILL_TOK_PER_SEC = 300;
+/** Hard ceiling for when we refuse to send the call. Leaves ~15s of
+ * generation headroom inside the ~60s MCP-client request-timeout budget. */
+const PREFILL_REFUSE_THRESHOLD_SEC = 45;
+/** Soft warning threshold — we proceed but log a stderr warning. */
+const PREFILL_WARN_THRESHOLD_SEC = 25;
+/**
+ * Estimate how long prompt prefill will take, using measured per-model data
+ * from the SQLite cache when available. `totalTtftMs` is very close to pure
+ * prefill time for a streaming call (first-content-delta arrives right after
+ * prefill finishes), so `totalPromptTokens / totalTtftMs` gives a usable
+ * prefill-tok/s rate for that specific (model, hardware) pair.
+ */
+function estimatePrefill(inputChars, modelId) {
+    const inputTokens = Math.ceil(inputChars / CHARS_PER_TOKEN);
+    const stats = lifetime.modelStats.get(modelId);
+    let prefillTokPerSec = DEFAULT_PREFILL_TOK_PER_SEC;
+    let basis = 'default';
+    if (stats && stats.ttftCalls >= 2 && stats.totalTtftMs > 0 && stats.totalPromptTokens > 0) {
+        // Only trust measured data after >=2 TTFT samples — single samples on a
+        // cold model run are noisy. Average prompt tokens per call approximated
+        // as totalPromptTokens / calls, then divided by average TTFT seconds.
+        const avgPromptTokens = stats.totalPromptTokens / stats.calls;
+        const avgTtftSec = (stats.totalTtftMs / stats.ttftCalls) / 1000;
+        if (avgTtftSec > 0) {
+            prefillTokPerSec = avgPromptTokens / avgTtftSec;
+            basis = 'measured';
+        }
+    }
+    return {
+        inputTokens,
+        estimatedSeconds: inputTokens / prefillTokPerSec,
+        prefillTokPerSec,
+        basis,
+    };
+}
 async function routeToModel(taskType) {
     let models;
     try {
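Worked through with illustrative numbers, using the constants above (a 32,000-char input estimates to 8,000 tokens; note that only a `'measured'` basis can trigger the refuse path, per the pre-flight check later in this diff):

```js
// estimatePrefill(32000, modelId) → inputTokens = 8000 (32,000 / CHARS_PER_TOKEN)
// No measured data (basis 'default'):   8000 / 300 tok/s ≈ 26.7s → over the 25s
//   warn threshold, so it logs to stderr and proceeds.
// Measured 120 tok/s (basis 'measured'): 8000 / 120 ≈ 66.7s → over the 45s
//   refuse threshold, so the pre-flight check rejects the call with a diagnostic.
```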
@@ -645,9 +929,11 @@ function assessQuality(resp, rawContent) {
         : null;
     return {
         truncated: resp.truncated,
+        prefillStall: resp.prefillStall ?? false,
         finishReason: resp.finishReason || 'unknown',
         thinkBlocksStripped: hadThinkBlocks,
         thinkStripFallback: resp.thinkStripFallback ?? false,
+        reasoningFallback: resp.reasoningFallback ?? false,
         estimatedTokens: estimated,
         contentLength: resp.content.length,
         generationMs: resp.generationMs,
@@ -656,9 +942,13 @@ function assessQuality(resp, rawContent) {
 }
 function formatQualityLine(quality) {
     const flags = [];
-    if (quality.
+    if (quality.prefillStall)
+        flags.push('PREFILL-STALL (no tokens received — input may be too large for this model/hardware)');
+    else if (quality.truncated)
         flags.push('TRUNCATED');
-    if (quality.
+    if (quality.reasoningFallback)
+        flags.push('reasoning-only (model exhausted output budget before emitting visible content — showing raw reasoning)');
+    else if (quality.thinkStripFallback)
         flags.push('think-strip-empty (showing raw reasoning — model ignored enable_thinking:false)');
     else if (quality.thinkBlocksStripped)
         flags.push('think-blocks-stripped');
@@ -672,6 +962,12 @@ function formatQualityLine(quality) {
 }
 /**
  * Format a footer line for streaming results showing model, usage, and truncation status.
+ *
+ * Layout:
+ *   ---
+ *   Model: ... | prompt→completion tokens | perf | extra | quality
+ *   📊 [first-call benchmark line, only on the first measured call per model]
+ *   💰 Claude quota saved this session: ...
  */
 function formatFooter(resp, extra) {
     // Record usage for session tracking before formatting
@@ -680,7 +976,17 @@ function formatFooter(resp, extra) {
     if (resp.model)
         parts.push(`Model: ${resp.model}`);
     if (resp.usage) {
-
+        // OpenAI-spec reasoning-tokens split — when present, show it so the user
+        // sees how much of the completion budget went to hidden reasoning vs
+        // visible output. Diagnoses "empty body + hit-max-tokens" immediately.
+        const reasoningTokens = resp.usage.completion_tokens_details?.reasoning_tokens;
+        if (typeof reasoningTokens === 'number' && reasoningTokens > 0) {
+            const visible = resp.usage.completion_tokens - reasoningTokens;
+            parts.push(`${resp.usage.prompt_tokens}→${resp.usage.completion_tokens} tokens (${reasoningTokens} reasoning / ${visible} visible)`);
+        }
+        else {
+            parts.push(`${resp.usage.prompt_tokens}→${resp.usage.completion_tokens} tokens`);
+        }
     }
     else if (resp.content.length > 0) {
         // Estimate when usage is missing (truncated responses where final SSE chunk was lost)
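So for an illustrative usage object of `{ prompt_tokens: 1234, completion_tokens: 900, completion_tokens_details: { reasoning_tokens: 700 } }`, the token segment of the footer renders as:

```
1234→900 tokens (700 reasoning / 200 visible)
```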
@@ -691,8 +997,9 @@ function formatFooter(resp, extra) {
     const perfParts = [];
     if (resp.ttftMs !== undefined)
         perfParts.push(`TTFT: ${resp.ttftMs}ms`);
+    let tokPerSec = 0;
     if (resp.usage && resp.generationMs > 50) {
-
+        tokPerSec = resp.usage.completion_tokens / (resp.generationMs / 1000);
         perfParts.push(`${tokPerSec.toFixed(1)} tok/s`);
     }
     if (resp.generationMs)
@@ -708,40 +1015,43 @@ function formatFooter(resp, extra) {
         parts.push(qualityLine);
     if (resp.truncated)
         parts.push('⚠ TRUNCATED (soft timeout — partial result)');
+    const benchmarkLine = isFirstBenchmarkedCall(resp.model, tokPerSec)
+        ? `📊 First measured call on ${resp.model}: ${tokPerSec.toFixed(1)} tok/s${resp.ttftMs !== undefined ? `, ${resp.ttftMs}ms to first token` : ''} — use this to gauge whether to delegate longer tasks.`
+        : '';
     const sessionLine = sessionSummary();
+    if (parts.length === 0 && !benchmarkLine && !sessionLine)
+        return '';
+    const lines = [`\n\n---${parts.length > 0 ? `\n${parts.join(' | ')}` : ''}`];
+    // First-call speed benchmark — surfaced once per model per session, based on
+    // the real task just completed (not a synthetic warmup). Gives Claude honest
+    // speed data to calibrate future delegation decisions.
+    if (benchmarkLine)
+        lines.push(benchmarkLine);
+    // Session savings — on its own line so it reads as value, not as accounting.
     if (sessionLine)
-
-        return
+        lines.push(sessionLine);
+    return lines.join('\n');
 }
 // ── MCP Tool definitions ─────────────────────────────────────────────
 const TOOLS = [
     {
         name: 'chat',
-        description: 'Send a task to a local LLM running on
-            '
-            '
-            '
-            'break it into steps and identify which ones are bounded grunt work you can delegate here. ' +
-            'The more you offload, the more tokens you save. The session footer tracks cumulative savings.\n\n' +
-            'WHEN TO USE (delegate generously — it costs nothing):\n' +
-            '• Explain or summarise code/docs you just read\n' +
+        description: 'Send a task to a local LLM — a sidekick running on the user\'s hardware or a configured OpenAI-compatible endpoint. ' +
+            'It does not consume the user\'s Claude quota. Trades latency for tokens: local inference is typically 3-30× slower than frontier models, so delegation wins when the task is bounded and self-contained.\n\n' +
+            'Good fit:\n' +
+            '• Explain or summarise code/docs you already have in context\n' +
             '• Generate boilerplate, test stubs, type definitions, mock data\n' +
             '• Answer factual questions about languages, frameworks, APIs\n' +
             '• Draft commit messages, PR descriptions, comments\n' +
             '• Translate or reformat content (JSON↔YAML, snake_case↔camelCase)\n' +
-            '• Brainstorm approaches before
-            '
-            '
-            '(1)
+            '• Brainstorm approaches before committing to one\n\n' +
+            'Less good when: the task needs tool access, depends on multi-file context you have not captured, or is quick enough for you to answer directly before the round-trip completes.\n\n' +
+            'Prompt tips (local models take instructions literally):\n' +
+            '(1) Send COMPLETE context — the local LLM cannot read files.\n' +
             '(2) Be explicit about output format ("respond as a JSON array", "return only the function").\n' +
-            '(3)
-            '(4) State constraints
-            '
-            'QA: Always review the local LLM\'s output before using it. Verify correctness, check edge cases, ' +
-            'and fix any issues. You are the architect — the local model is a fast drafter, not the final authority.\n\n' +
-            'ROUTING: If multiple models are loaded, houtini-lm automatically picks the best one for the task. ' +
-            'If a better model is downloaded but not loaded, you\'ll see a suggestion in the response footer. ' +
-            'Call discover to see what\'s available.',
+            '(3) Specific system persona beats generic — "Senior TypeScript dev" not "helpful assistant".\n' +
+            '(4) State constraints — "no preamble", "reference line numbers", "max 5 bullets".\n\n' +
+            'Routing picks the best loaded model automatically. Call `discover` to see what is loaded and, after the first real call, its measured speed. The footer shows cumulative tokens kept in the user\'s quota.',
         inputSchema: {
             type: 'object',
             properties: {
@@ -759,7 +1069,7 @@ const TOOLS = [
             },
             max_tokens: {
                 type: 'number',
-                description: 'Max response tokens.
+                description: 'Max response tokens. Defaults to 25% of the loaded model\'s context window (fallback 16,384). Pass a number to cap it tighter for quick answers.',
             },
             json_schema: {
                 type: 'object',
@@ -772,20 +1082,18 @@ const TOOLS = [
     {
         name: 'custom_prompt',
         description: 'Structured analysis via the local LLM with explicit system/context/instruction separation. ' +
-            '
-            '
-            'stuffing everything into a single message. The separation helps the local model focus.\n\n' +
-            'WHEN TO USE:\n' +
+            'The 3-part format prevents context bleed in smaller models — the local LLM acknowledges the context in a fake assistant turn before receiving the instruction.\n\n' +
+            'Good fit when prompt structure matters:\n' +
             '• Code review — paste full source, ask for bugs/improvements\n' +
             '• Comparison — paste two implementations, ask which is better and why\n' +
             '• Refactoring suggestions — paste code, ask for a cleaner version\n' +
             '• Content analysis — paste text, ask for structure/tone/issues\n' +
             '• Any task where separating context from instruction improves clarity\n\n' +
-            '
-            '•
-            '•
-            '•
-            '
+            'Field guidance (each has a job — keep them focused):\n' +
+            '• system: persona + constraints, under 30 words. "Expert Python developer focused on performance and correctness."\n' +
+            '• context: COMPLETE data — full source, full logs, full text. Never truncate.\n' +
+            '• instruction: exactly what to produce, under 50 words. Specify format: "Return a JSON array of {line, issue, fix}."\n\n' +
+            'Review the output before acting on it — local model capability varies.',
         inputSchema: {
             type: 'object',
             properties: {
@@ -807,7 +1115,7 @@ const TOOLS = [
             },
             max_tokens: {
                 type: 'number',
-                description: 'Max response tokens.
+                description: 'Max response tokens. Defaults to 25% of the loaded model\'s context window (fallback 16,384).',
             },
             json_schema: {
                 type: 'object',
@@ -819,21 +1127,19 @@ const TOOLS = [
     },
     {
         name: 'code_task',
-        description: 'Send a code
-            '
-            'focused, deterministic output. The system prompt is pre-configured for code review.\n\n' +
-            'WHEN TO USE:\n' +
+        description: 'Send a code-specific task to the local LLM, wrapped with an optimised code-review system prompt. Temperature is locked low (0.2 or the routed model\'s hint) for deterministic output.\n\n' +
+            'Good fit:\n' +
             '• Explain what a function/class does\n' +
             '• Find bugs or suggest improvements\n' +
             '• Generate unit tests or type definitions for existing code\n' +
             '• Add error handling, logging, or validation\n' +
             '• Convert between languages or patterns\n\n' +
-            '
-            '• Provide COMPLETE source
+            'For best results:\n' +
+            '• Provide COMPLETE source — the local LLM cannot read files.\n' +
             '• Include imports and type definitions so the model has full context.\n' +
-            '• Be specific
+            '• Be specific: "Write 3 Jest tests for the error paths in fetchUser" beats "Write tests".\n' +
             '• Set the language field — it shapes the system prompt and improves accuracy.\n\n' +
-            '
+            'Verify generated code compiles, handles edge cases, and follows project conventions before committing.',
         inputSchema: {
             type: 'object',
             properties: {
@@ -851,7 +1157,7 @@ const TOOLS = [
             },
             max_tokens: {
                 type: 'number',
-                description: 'Max response tokens.
+                description: 'Max response tokens. Defaults to 25% of the loaded model\'s context window (fallback 16,384).',
             },
         },
         required: ['code', 'task'],
@@ -859,18 +1165,19 @@ const TOOLS = [
     },
     {
         name: 'code_task_files',
-        description: 'Like code_task, but the local LLM reads
-            '
-            '
-            '•
-            '•
-            '•
-            '•
-            '
-            '• Reviewing
+        description: 'Like code_task, but the local LLM reads files directly from disk — source never passes through the MCP client\'s context window. Use when reviewing multiple files or a single large file.\n\n' +
+            'How it works:\n' +
+            '• Provide absolute paths. Relative paths are rejected.\n' +
+            '• Files are read in parallel (Promise.allSettled) — one unreadable file does not sink the call.\n' +
+            '• Files are concatenated with `=== filename ===` headers and sent to the same code-review pipeline as code_task.\n' +
+            '• Read failures are surfaced inline with the reason so the LLM can still reason about the rest.\n' +
+            '• Pre-flight prefill estimate: if measured per-model data shows the input would exceed the MCP client\'s ~60s request timeout during prompt processing, the call is refused early with a diagnostic instead of hanging. Split or trim when this fires.\n\n' +
+            'Good fit:\n' +
+            '• Reviewing related files together (module + its tests, client + server pair)\n' +
             '• Auditing a single large file too big to paste comfortably\n' +
-            '• Any code_task where
-            '
+            '• Any code_task where keeping source out of the Claude context window matters\n\n' +
+            'Size guidance: on slow hardware (< 25 tok/s generation), keep total input under ~8,000 tokens (~32,000 chars) to stay safely under the client timeout. Faster hardware handles much more — the pre-flight estimator adapts once you\'ve done a few calls and real per-model timings are in the SQLite cache.\n\n' +
+            'Same review discipline as code_task — verify the output before acting on it.',
         inputSchema: {
             type: 'object',
             properties: {
@@ -931,9 +1238,33 @@ const TOOLS = [
             required: ['input'],
         },
     },
+    {
+        name: 'stats',
+        description: 'Show user stats: tokens offloaded, calls made, per-model performance — for the current session AND ' +
+            'lifetime (persisted in SQLite at ~/.houtini-lm/model-cache.db). Unlike `discover` which includes the ' +
+            'model catalog, `stats` returns just the numbers in a compact markdown table — cheap to call repeatedly ' +
+            'to see the 💰 Claude-quota savings counter climb. Useful for quantifying how much work the local model ' +
+            'is genuinely doing, and for noticing when a model\'s reasoning-token ratio is drifting.',
+        inputSchema: {
+            type: 'object',
+            properties: {
+                model: {
+                    type: 'string',
+                    description: 'Optional: filter output to a single model ID. Omit to see all models this workstation has used.',
+                },
+            },
+        },
+    },
 ];
 // ── MCP Server ───────────────────────────────────────────────────────
-
+// Session-level sidekick framing. MCP clients surface this to the model
+// at initialisation, so it sets the baseline expectation for when to delegate
+// rather than relying on per-tool descriptions being re-read on every call.
+const SIDEKICK_INSTRUCTIONS = `Houtini-lm is a local LLM sidekick. It runs on the user's hardware (or a configured OpenAI-compatible endpoint) and handles bounded work without consuming the user's Claude quota.\n\n` +
+    `When to reach for it: bounded, self-contained tasks you can describe in one message — explanations, boilerplate, test stubs, code review of pasted or file-loaded source, translations, commit messages, format conversion, brainstorming. Trades wall-clock time for tokens (typically 3-30× slower than frontier models).\n\n` +
+    `When not to: tasks that need tool access, cross-file reasoning you haven't captured, or work fast enough to answer directly before the delegation round-trip completes.\n\n` +
+    `Call \`discover\` in delegation-heavy sessions to see what model is loaded, its capability profile, and — after the first real call — its measured speed. The response footer reports cumulative tokens kept in the user's quota.`;
+const server = new Server({ name: 'houtini-lm', version: '2.11.0' }, { capabilities: { tools: {}, resources: {} }, instructions: SIDEKICK_INSTRUCTIONS });
 // ── MCP Resources ─────────────────────────────────────────────────────
 // Exposes session performance metrics as a readable resource so Claude can
 // proactively check offload efficiency and make smarter delegation decisions.
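An illustrative `tools/call` request for the new `stats` tool, using standard MCP JSON-RPC framing (the `model` filter is optional, per the schema defined above):

```js
// Illustrative MCP request; names follow the tool schema in this diff.
const request = {
    jsonrpc: '2.0',
    id: 42,
    method: 'tools/call',
    params: {
        name: 'stats',
        arguments: { model: 'qwen2.5-coder:7b' }, // omit `model` to list every model used
    },
};
```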
@@ -954,7 +1285,7 @@ server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
     for (const [modelId, stats] of session.modelStats) {
         modelStats[modelId] = {
             calls: stats.calls,
-            avgTtftMs: stats.
+            avgTtftMs: stats.ttftCalls > 0 ? Math.round(stats.totalTtftMs / stats.ttftCalls) : 0,
             avgTokPerSec: stats.perfCalls > 0 ? parseFloat((stats.totalTokPerSec / stats.perfCalls).toFixed(1)) : null,
         };
     }
@@ -1114,6 +1445,32 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                 ? ` ${route.hints.outputConstraint}`
                 : '';
             const combined = sections.join('\n\n');
+            // Pre-flight prefill estimate. Huge inputs can legitimately exceed
+            // the MCP client's ~60s request timeout during prompt processing, and
+            // progress notifications don't reset that timeout on Claude Desktop.
+            // If measured per-model data in the SQLite cache shows this input
+            // would obviously overrun, refuse with a concrete diagnostic so the
+            // caller knows to split or trim instead of waiting for a silent hang.
+            const estimate = estimatePrefill(combined.length, route.modelId);
+            if (estimate.basis === 'measured' && estimate.estimatedSeconds > PREFILL_REFUSE_THRESHOLD_SEC) {
+                const prefillRate = Math.round(estimate.prefillTokPerSec);
+                const estSec = Math.round(estimate.estimatedSeconds);
+                return {
+                    content: [{
+                            type: 'text',
+                            text: `Error: estimated prefill time exceeds the ~60s MCP client timeout.\n\n` +
+                                `• Input size: ~${estimate.inputTokens.toLocaleString()} tokens across ${successCount} file(s)\n` +
+                                `• Measured prefill rate on ${route.modelId}: ~${prefillRate} tok/s (from ${lifetime.modelStats.get(route.modelId)?.ttftCalls ?? 0} prior calls)\n` +
+                                `• Estimated prefill: ~${estSec}s (threshold: ${PREFILL_REFUSE_THRESHOLD_SEC}s)\n\n` +
+                                `Options: split the files into smaller groups, trim the largest file, or use \`code_task\` with a focused excerpt. ` +
+                                `If you know this workstation can handle it, pass fewer files or run the task again when the measured rate improves.`,
+                        }],
+                    isError: true,
+                };
+            }
+            if (estimate.estimatedSeconds > PREFILL_WARN_THRESHOLD_SEC) {
+                process.stderr.write(`[houtini-lm] Large input warning: ~${estimate.inputTokens} tokens, est prefill ~${Math.round(estimate.estimatedSeconds)}s (${estimate.basis}). Proceeding.\n`);
+            }
             const codeMessages = [
                 {
                     role: 'system',
@@ -1171,14 +1528,49 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
             const primary = loaded[0] || models[0];
             const ctx = getContextLength(primary);
             const primaryProfile = await getModelProfileAsync(primary);
-
-
-
+            // Use sessionSummary() so discover matches the footer format and
+            // automatically picks up the lifetime line when the SQLite cache has
+            // cross-session data.
+            const summary = sessionSummary();
+            const sessionStats = session.calls > 0 || lifetime.totalCalls > 0
+                ? `\n${summary}`
+                : `\n💰 Claude quota saved this session: 0 tokens — no calls yet. Measured speed for each model will appear here after the first real call.`;
+            // Measured speed line for the active model. Discover intentionally does
+            // not run a synthetic warmup — speed is captured from real tasks, so the
+            // numbers reflect actual workload rather than a contrived benchmark.
+            // Shows session stats when this session has measured calls; otherwise
+            // falls back to workstation lifetime stats so Claude sees historical
+            // perf from call 1 instead of "not yet benchmarked".
+            const primaryStats = session.modelStats.get(primary.id);
+            const primaryLifetime = lifetime.modelStats.get(primary.id);
+            let speedLine = '';
+            if (primaryStats && primaryStats.perfCalls > 0) {
+                const avgTtft = primaryStats.ttftCalls > 0 ? Math.round(primaryStats.totalTtftMs / primaryStats.ttftCalls) : 0;
+                const avgTokSec = (primaryStats.totalTokPerSec / primaryStats.perfCalls).toFixed(1);
+                speedLine = `Measured speed (session): ${avgTokSec} tok/s · TTFT ${avgTtft}ms (${primaryStats.perfCalls} call${primaryStats.perfCalls === 1 ? '' : 's'})\n`;
+                if (primaryLifetime && primaryLifetime.perfCalls > primaryStats.perfCalls) {
+                    const lAvgTtft = primaryLifetime.ttftCalls > 0 ? Math.round(primaryLifetime.totalTtftMs / primaryLifetime.ttftCalls) : 0;
+                    const lAvgTokSec = (primaryLifetime.totalTokPerSec / primaryLifetime.perfCalls).toFixed(1);
+                    speedLine += `Measured speed (lifetime on this workstation): ${lAvgTokSec} tok/s · TTFT ${lAvgTtft}ms (${primaryLifetime.perfCalls} calls)\n`;
+                }
+            }
+            else if (primaryLifetime && primaryLifetime.perfCalls > 0) {
+                const lAvgTtft = primaryLifetime.ttftCalls > 0 ? Math.round(primaryLifetime.totalTtftMs / primaryLifetime.ttftCalls) : 0;
+                const lAvgTokSec = (primaryLifetime.totalTokPerSec / primaryLifetime.perfCalls).toFixed(1);
+                speedLine = `Measured speed (lifetime on this workstation): ${lAvgTokSec} tok/s · TTFT ${lAvgTtft}ms (${primaryLifetime.perfCalls} calls, last used ${new Date(primaryLifetime.lastUsedAt).toISOString().slice(0, 10)})\n`;
+            }
+            else {
+                speedLine = `Measured speed: not yet benchmarked — will be captured on the first real call.\n`;
+            }
+            const backendLabel = getBackend() === 'lmstudio' ? 'LM Studio'
+                : getBackend() === 'ollama' ? 'Ollama'
+                    : 'OpenAI-compatible';
             let text = `Status: ONLINE\n` +
-                `Endpoint: ${LM_BASE_URL}\n` +
-                `
+                `Endpoint: ${LM_BASE_URL} (${backendLabel})\n` +
+                `Connection latency: ${ms}ms (does not reflect inference speed)\n` +
                 `Active model: ${primary.id}\n` +
-                `Context window: ${ctx.toLocaleString()} tokens\n
+                `Context window: ${ctx.toLocaleString()} tokens\n` +
+                speedLine;
             if (primaryProfile) {
                 text += `Family: ${primaryProfile.family}\n`;
                 text += `Description: ${primaryProfile.description}\n`;
@@ -1200,11 +1592,25 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
             if (session.modelStats.size > 0) {
                 text += `\n\nPerformance (this session):\n`;
                 for (const [modelId, stats] of session.modelStats) {
-                    const avgTtft = stats.
+                    const avgTtft = stats.ttftCalls > 0 ? Math.round(stats.totalTtftMs / stats.ttftCalls) : 0;
                     const avgTokSec = stats.perfCalls > 0 ? (stats.totalTokPerSec / stats.perfCalls).toFixed(1) : '?';
                     text += `  ${modelId}: ${stats.calls} calls, avg TTFT ${avgTtft}ms, avg ${avgTokSec} tok/s\n`;
                 }
             }
+            // Workstation lifetime stats — built from SQLite, persists across restarts.
+            // Only shown when there's lifetime data beyond this session, so a first-run
+            // user doesn't see a duplicate of the session block above.
+            const hasLifetimeBeyondSession = Array.from(lifetime.modelStats.entries())
+                .some(([id, l]) => l.calls > (session.modelStats.get(id)?.calls ?? 0));
+            if (hasLifetimeBeyondSession) {
+                text += `\nPerformance (lifetime on this workstation):\n`;
+                for (const [modelId, stats] of lifetime.modelStats) {
+                    const avgTtft = stats.ttftCalls > 0 ? Math.round(stats.totalTtftMs / stats.ttftCalls) : 0;
+                    const avgTokSec = stats.perfCalls > 0 ? (stats.totalTokPerSec / stats.perfCalls).toFixed(1) : '?';
+                    const lastUsed = new Date(stats.lastUsedAt).toISOString().slice(0, 10);
+                    text += `  ${modelId}: ${stats.calls} calls, avg TTFT ${avgTtft}ms, avg ${avgTokSec} tok/s (last used ${lastUsed})\n`;
+                }
+            }
             text += `${sessionStats}\n\n`;
             text += `The local LLM is available. You can delegate tasks using chat, custom_prompt, code_task, code_task_files, or embed.`;
             return { content: [{ type: 'text', text }] };
@@ -1262,6 +1668,89 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                 };
             });
         }
+        case 'stats': {
+            const { model: filterModel } = args;
+            const backendLabel = getBackend() === 'lmstudio' ? 'LM Studio'
+                : getBackend() === 'ollama' ? 'Ollama'
+                    : 'OpenAI-compatible';
+            const lines = [];
+            lines.push(`## Houtini LM stats`);
+            lines.push('');
+            lines.push(`**Endpoint**: ${LM_BASE_URL} (${backendLabel})`);
+            if (lifetime.firstSeenAt) {
+                lines.push(`**First call on this workstation**: ${new Date(lifetime.firstSeenAt).toISOString().slice(0, 10)}`);
+            }
+            lines.push('');
+            // Totals block
+            lines.push(`### Totals`);
+            lines.push('');
+            lines.push(`| Scope | Calls | Prompt tokens | Completion tokens | Total tokens |`);
+            lines.push(`|----------|------:|--------------:|------------------:|-------------:|`);
+            lines.push(`| Session | ${session.calls} | ${session.promptTokens.toLocaleString()} | ${session.completionTokens.toLocaleString()} | ${(session.promptTokens + session.completionTokens).toLocaleString()} |`);
+            lines.push(`| Lifetime | ${lifetime.totalCalls} | — | — | ${lifetime.totalTokens.toLocaleString()} |`);
+            lines.push('');
+            // Per-model block (union of session + lifetime model ids)
+            const modelIds = new Set([
+                ...session.modelStats.keys(),
+                ...lifetime.modelStats.keys(),
+            ]);
+            const filtered = filterModel ? [...modelIds].filter((m) => m === filterModel) : [...modelIds];
+            if (filtered.length > 0) {
+                lines.push(`### Per-model performance`);
+                lines.push('');
+                lines.push(`| Model | Scope | Calls | Avg TTFT (ms) | Avg tok/s | Prompt tokens | Last used |`);
+                lines.push(`|-------|-------|------:|--------------:|----------:|--------------:|-----------|`);
+                for (const modelId of filtered.sort()) {
+                    const s = session.modelStats.get(modelId);
+                    const l = lifetime.modelStats.get(modelId);
+                    if (s) {
+                        const avgTtft = s.ttftCalls > 0 ? Math.round(s.totalTtftMs / s.ttftCalls) : '—';
+                        const avgTokSec = s.perfCalls > 0 ? (s.totalTokPerSec / s.perfCalls).toFixed(1) : '—';
+                        lines.push(`| ${modelId} | session | ${s.calls} | ${avgTtft} | ${avgTokSec} | — | — |`);
+                    }
+                    if (l) {
+                        const avgTtft = l.ttftCalls > 0 ? Math.round(l.totalTtftMs / l.ttftCalls) : '—';
+                        const avgTokSec = l.perfCalls > 0 ? (l.totalTokPerSec / l.perfCalls).toFixed(1) : '—';
+                        const lastUsed = new Date(l.lastUsedAt).toISOString().slice(0, 10);
+                        lines.push(`| ${modelId} | lifetime | ${l.calls} | ${avgTtft} | ${avgTokSec} | ${l.totalPromptTokens.toLocaleString()} | ${lastUsed} |`);
+                    }
+                }
+                lines.push('');
+            }
+            else if (filterModel) {
+                lines.push(`No history for model: \`${filterModel}\`. Try \`list_models\` to see what's been used.`);
+                lines.push('');
+            }
+            else {
+                lines.push(`No calls yet — delegate a task via \`chat\`, \`custom_prompt\`, \`code_task\`, or \`code_task_files\` to start building stats.`);
+                lines.push('');
+            }
+            // Reasoning-token diagnostic (lifetime only — needs persistence to be meaningful)
+            if (!filterModel) {
+                // Sum reasoning tokens across all models. We store this per-model
+                // in SQLite but not in the in-memory mirror, so fetch on demand.
+                try {
+                    const rows = await getAllPerformance();
+                    const totalReasoning = rows.reduce((sum, r) => sum + (r.totalReasoningTokens || 0), 0);
+                    const totalCompletion = rows.reduce((sum, r) => sum + r.totalCompletionTokens, 0);
+                    if (totalCompletion > 0) {
+                        const pct = ((totalReasoning / totalCompletion) * 100).toFixed(1);
+                        lines.push(`### Reasoning-token overhead (lifetime)`);
+                        lines.push('');
+                        lines.push(`${totalReasoning.toLocaleString()} / ${totalCompletion.toLocaleString()} completion tokens spent on hidden reasoning (${pct}% of generation budget). ` +
+                            (parseFloat(pct) > 30
+                                ? `**High** — consider loading a non-thinking model, or check that \`reasoning_effort\` is being honoured (see stderr logs).`
+                                : parseFloat(pct) > 10
+                                    ? `Moderate — normal for thinking-model families.`
+                                    : `Low — reasoning is effectively suppressed.`));
+                        lines.push('');
+                    }
+                }
+                catch { /* best-effort — don't fail the tool call */ }
+            }
+            lines.push(`*Stats persist across restarts in \`~/.houtini-lm/model-cache.db\`.*`);
+            return { content: [{ type: 'text', text: lines.join('\n') }] };
+        }
         default:
             throw new Error(`Unknown tool: ${name}`);
     }
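Rendered, the handler's output looks roughly like this (all values illustrative):

```
## Houtini LM stats

**Endpoint**: http://localhost:1234 (LM Studio)

### Totals

| Scope    | Calls | Prompt tokens | Completion tokens | Total tokens |
|----------|------:|--------------:|------------------:|-------------:|
| Session  |     3 |         9,412 |             2,180 |       11,592 |
| Lifetime |    57 |             — |                 — |      214,903 |

*Stats persist across restarts in `~/.houtini-lm/model-cache.db`.*
```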
@@ -1282,6 +1771,10 @@ async function main() {
     listModelsRaw()
         .then((models) => profileModelsAtStartup(models))
         .catch((err) => process.stderr.write(`[houtini-lm] Startup profiling skipped: ${err}\n`));
+    // Hydrate the in-memory lifetime mirror from SQLite so the very first
+    // tool call this session shows historical savings + per-model perf.
+    // Non-blocking too; the footer degrades to session-only if this fails.
+    hydrateLifetimeFromDb().catch((err) => process.stderr.write(`[houtini-lm] Lifetime hydration skipped: ${err}\n`));
 }
 main().catch((error) => {
     process.stderr.write(`Fatal error: ${error}\n`);