smoltalk 0.0.66 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +51 -154
- package/dist/classes/message/ToolMessage.js +1 -1
- package/dist/client.d.ts +3 -3
- package/dist/client.js +9 -5
- package/dist/clients/anthropic.d.ts +4 -4
- package/dist/clients/anthropic.js +1 -1
- package/dist/clients/baseClient.d.ts +17 -20
- package/dist/clients/baseClient.js +21 -43
- package/dist/clients/google.d.ts +4 -4
- package/dist/clients/google.js +1 -1
- package/dist/clients/ollama.d.ts +4 -4
- package/dist/clients/ollama.js +1 -1
- package/dist/clients/openai.d.ts +4 -4
- package/dist/clients/openai.js +2 -1
- package/dist/clients/openaiResponses.d.ts +4 -4
- package/dist/clients/openaiResponses.js +2 -1
- package/dist/functions.d.ts +13 -10
- package/dist/functions.js +4 -55
- package/dist/index.d.ts +2 -4
- package/dist/index.js +1 -2
- package/dist/model.d.ts +2 -5
- package/dist/model.js +11 -27
- package/dist/models.d.ts +2 -2
- package/dist/models.js +3 -1
- package/dist/testing/index.d.ts +9 -0
- package/dist/testing/index.js +41 -0
- package/dist/types.d.ts +52 -160
- package/dist/types.js +1 -1
- package/dist/util/logger.d.ts +17 -1
- package/dist/util/logger.js +68 -5
- package/package.json +15 -19
- package/dist/clients/llamaCpp.d.ts +0 -28
- package/dist/clients/llamaCpp.js +0 -316
- package/dist/latencyTracker.d.ts +0 -32
- package/dist/latencyTracker.js +0 -73
- package/dist/middleware.d.ts +0 -54
- package/dist/middleware.js +0 -321
- package/dist/strategies/baseStrategy.d.ts +0 -22
- package/dist/strategies/baseStrategy.js +0 -62
- package/dist/strategies/fallbackStrategy.d.ts +0 -14
- package/dist/strategies/fallbackStrategy.js +0 -122
- package/dist/strategies/fastestStrategy.d.ts +0 -19
- package/dist/strategies/fastestStrategy.js +0 -108
- package/dist/strategies/idStrategy.d.ts +0 -16
- package/dist/strategies/idStrategy.js +0 -62
- package/dist/strategies/index.d.ts +0 -17
- package/dist/strategies/index.js +0 -68
- package/dist/strategies/raceStrategy.d.ts +0 -12
- package/dist/strategies/raceStrategy.js +0 -72
- package/dist/strategies/randomStrategy.d.ts +0 -13
- package/dist/strategies/randomStrategy.js +0 -54
- package/dist/strategies/timeoutStrategy.d.ts +0 -13
- package/dist/strategies/timeoutStrategy.js +0 -65
- package/dist/strategies/types.d.ts +0 -78
- package/dist/strategies/types.js +0 -58
package/dist/types.js
CHANGED
|
@@ -4,7 +4,7 @@ export * from "./types/costEstimate.js";
|
|
|
4
4
|
export * from "./types/tokenUsage.js";
|
|
5
5
|
export function promptResult({ output, toolCalls, thinkingBlocks, usage, cost, model, }) {
|
|
6
6
|
return {
|
|
7
|
-
output: output
|
|
7
|
+
output: (output ?? null),
|
|
8
8
|
toolCalls: toolCalls || [],
|
|
9
9
|
thinkingBlocks: thinkingBlocks,
|
|
10
10
|
usage,
|
package/dist/util/logger.d.ts
CHANGED
|
@@ -1,2 +1,18 @@
|
|
|
1
|
-
|
|
1
|
+
export type LogLevel = "error" | "warn" | "info" | "debug";
|
|
2
|
+
export type EgonLogConfig = {
|
|
3
|
+
logLevel: LogLevel;
|
|
4
|
+
};
|
|
5
|
+
export declare class EgonLog {
|
|
6
|
+
private logLevel;
|
|
7
|
+
constructor(config: EgonLogConfig);
|
|
8
|
+
private shouldLog;
|
|
9
|
+
private log;
|
|
10
|
+
error(...args: unknown[]): void;
|
|
11
|
+
warn(...args: unknown[]): void;
|
|
12
|
+
info(...args: unknown[]): void;
|
|
13
|
+
debug(...args: unknown[]): void;
|
|
14
|
+
table(...args: unknown[]): void;
|
|
15
|
+
setLogLevel(level: LogLevel): void;
|
|
16
|
+
getLogLevel(): LogLevel;
|
|
17
|
+
}
|
|
2
18
|
export declare function getLogger(level?: LogLevel): EgonLog;
|
package/dist/util/logger.js
CHANGED
|
@@ -1,9 +1,72 @@
|
|
|
1
|
-
|
|
1
|
+
const LOG_LEVELS = {
|
|
2
|
+
error: 0,
|
|
3
|
+
warn: 1,
|
|
4
|
+
info: 2,
|
|
5
|
+
debug: 3,
|
|
6
|
+
};
|
|
7
|
+
const RED = "\x1b[31m";
|
|
8
|
+
const YELLOW = "\x1b[33m";
|
|
9
|
+
const GREEN = "\x1b[32m";
|
|
10
|
+
const RESET = "\x1b[0m";
|
|
11
|
+
export class EgonLog {
|
|
12
|
+
logLevel;
|
|
13
|
+
constructor(config) {
|
|
14
|
+
this.logLevel = config.logLevel;
|
|
15
|
+
}
|
|
16
|
+
shouldLog(messageLevel) {
|
|
17
|
+
return LOG_LEVELS[messageLevel] <= LOG_LEVELS[this.logLevel];
|
|
18
|
+
}
|
|
19
|
+
log(level, ...args) {
|
|
20
|
+
if (!this.shouldLog(level))
|
|
21
|
+
return;
|
|
22
|
+
const timestamp = new Date().toISOString();
|
|
23
|
+
const prefix = `[${timestamp}] [${level.toUpperCase()}]`;
|
|
24
|
+
switch (level) {
|
|
25
|
+
case "error":
|
|
26
|
+
console.error(RED + prefix, ...args, RESET);
|
|
27
|
+
break;
|
|
28
|
+
case "warn":
|
|
29
|
+
console.warn(YELLOW + prefix, ...args, RESET);
|
|
30
|
+
break;
|
|
31
|
+
case "info":
|
|
32
|
+
console.info(GREEN + prefix, ...args, RESET);
|
|
33
|
+
break;
|
|
34
|
+
case "debug":
|
|
35
|
+
console.debug(prefix, ...args);
|
|
36
|
+
break;
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
error(...args) {
|
|
40
|
+
this.log("error", ...args);
|
|
41
|
+
}
|
|
42
|
+
warn(...args) {
|
|
43
|
+
this.log("warn", ...args);
|
|
44
|
+
}
|
|
45
|
+
info(...args) {
|
|
46
|
+
this.log("info", ...args);
|
|
47
|
+
}
|
|
48
|
+
debug(...args) {
|
|
49
|
+
this.log("debug", ...args);
|
|
50
|
+
}
|
|
51
|
+
table(...args) {
|
|
52
|
+
if (!this.shouldLog("debug"))
|
|
53
|
+
return;
|
|
54
|
+
console.table(...args);
|
|
55
|
+
}
|
|
56
|
+
setLogLevel(level) {
|
|
57
|
+
this.logLevel = level;
|
|
58
|
+
}
|
|
59
|
+
getLogLevel() {
|
|
60
|
+
return this.logLevel;
|
|
61
|
+
}
|
|
62
|
+
}
|
|
2
63
|
let loggerInstance = null;
|
|
3
|
-
export function getLogger(level
|
|
4
|
-
if (loggerInstance) {
|
|
5
|
-
|
|
64
|
+
export function getLogger(level) {
|
|
65
|
+
if (!loggerInstance) {
|
|
66
|
+
loggerInstance = new EgonLog({ logLevel: level ?? "error" });
|
|
67
|
+
}
|
|
68
|
+
else if (level !== undefined) {
|
|
69
|
+
loggerInstance.setLogLevel(level);
|
|
6
70
|
}
|
|
7
|
-
loggerInstance = new EgonLog({ level });
|
|
8
71
|
return loggerInstance;
|
|
9
72
|
}
|
package/package.json
CHANGED
|
@@ -1,17 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "smoltalk",
|
|
3
|
-
"version": "0.0.66",
|
|
3
|
+
"version": "0.2.1",
|
|
4
4
|
"description": "A common interface for LLM APIs",
|
|
5
5
|
"homepage": "https://github.com/egonSchiele/smoltalk",
|
|
6
|
-
"scripts": {
|
|
7
|
-
"test": "vitest",
|
|
8
|
-
"test:tsc": "tsc -p tests/tsconfig.json",
|
|
9
|
-
"build": "rm -rf dist && tsc",
|
|
10
|
-
"start": "cd dist && node index.js",
|
|
11
|
-
"doc": "typedoc --disableSources --out docs lib && prettier docs/ --write",
|
|
12
|
-
"typecheck": "tsc --noEmit",
|
|
13
|
-
"pull": "node-llama-cpp pull --dir ./models"
|
|
14
|
-
},
|
|
15
6
|
"files": [
|
|
16
7
|
"./dist"
|
|
17
8
|
],
|
|
@@ -20,6 +11,11 @@
|
|
|
20
11
|
"types": "./dist/index.d.ts",
|
|
21
12
|
"import": "./dist/index.js",
|
|
22
13
|
"require": "./dist/index.js"
|
|
14
|
+
},
|
|
15
|
+
"./testing": {
|
|
16
|
+
"types": "./dist/testing/index.d.ts",
|
|
17
|
+
"import": "./dist/testing/index.js",
|
|
18
|
+
"require": "./dist/testing/index.js"
|
|
23
19
|
}
|
|
24
20
|
},
|
|
25
21
|
"type": "module",
|
|
@@ -31,21 +27,21 @@
|
|
|
31
27
|
],
|
|
32
28
|
"author": "Aditya Bhargava",
|
|
33
29
|
"license": "ISC",
|
|
34
|
-
"devDependencies": {
|
|
35
|
-
"@types/node": "^25.0.3",
|
|
36
|
-
"prettier": "^3.7.4",
|
|
37
|
-
"typedoc": "^0.28.15",
|
|
38
|
-
"typescript": "^5.9.3",
|
|
39
|
-
"vitest": "^4.0.16"
|
|
40
|
-
},
|
|
41
30
|
"dependencies": {
|
|
42
31
|
"@anthropic-ai/sdk": "^0.78.0",
|
|
43
32
|
"@google/genai": "^1.34.0",
|
|
44
|
-
"egonlog": "^0.0.2",
|
|
45
33
|
"nanoid": "^5.1.6",
|
|
46
|
-
"node-llama-cpp": "^3.17.1",
|
|
47
34
|
"ollama": "^0.6.3",
|
|
48
35
|
"openai": "^6.15.0",
|
|
49
36
|
"zod": "^4.3.6"
|
|
37
|
+
},
|
|
38
|
+
"scripts": {
|
|
39
|
+
"test": "vitest --exclude=**/*.live.test.ts",
|
|
40
|
+
"test:live": "vitest run lib/clients/*.live.test.ts",
|
|
41
|
+
"test:tsc": "tsc -p tests/tsconfig.json",
|
|
42
|
+
"build": "rm -rf dist && tsc",
|
|
43
|
+
"start": "cd dist && node index.js",
|
|
44
|
+
"doc": "typedoc --disableSources --out docs lib && prettier docs/ --write",
|
|
45
|
+
"typecheck": "tsc --noEmit"
|
|
50
46
|
}
|
|
51
47
|
}
|
package/dist/clients/llamaCpp.d.ts
DELETED
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
import { BaseClient } from "./baseClient.js";
|
|
2
|
-
import { BaseClientConfig, PromptConfig, PromptResult, Result, StreamChunk } from "../types.js";
|
|
3
|
-
export declare class LlamaCPP extends BaseClient {
|
|
4
|
-
private llama;
|
|
5
|
-
private llamaModel;
|
|
6
|
-
private modelDir;
|
|
7
|
-
private model;
|
|
8
|
-
private logger;
|
|
9
|
-
constructor(config: BaseClientConfig);
|
|
10
|
-
setup(): Promise<void>;
|
|
11
|
-
private getModelName;
|
|
12
|
-
/**
|
|
13
|
-
* Converts smoltalk messages to node-llama-cpp's ChatHistoryItem format.
|
|
14
|
-
* Builds the full history including the last user message (LlamaChat.generateResponse
|
|
15
|
-
* expects the complete history, unlike LlamaChatSession which takes the last message separately).
|
|
16
|
-
*/
|
|
17
|
-
private convertMessages;
|
|
18
|
-
/**
|
|
19
|
-
* Builds node-llama-cpp function definitions from smoltalk tool configs.
|
|
20
|
-
* Uses ChatModelFunctions (no handler) — LlamaChat.generateResponse() returns
|
|
21
|
-
* function calls without executing them, which matches smoltalk's tool loop model.
|
|
22
|
-
*/
|
|
23
|
-
private buildFunctions;
|
|
24
|
-
private calculateUsageAndCost;
|
|
25
|
-
private extractToolCalls;
|
|
26
|
-
_textSync(config: PromptConfig): Promise<Result<PromptResult>>;
|
|
27
|
-
_textStream(config: PromptConfig): AsyncGenerator<StreamChunk>;
|
|
28
|
-
}
|
package/dist/clients/llamaCpp.js
DELETED
|
@@ -1,316 +0,0 @@
|
|
|
1
|
-
import { getLlama, LlamaChat, LlamaLogLevel } from "node-llama-cpp";
|
|
2
|
-
import { BaseClient } from "./baseClient.js";
|
|
3
|
-
import { ToolCall } from "../classes/ToolCall.js";
|
|
4
|
-
import { getLogger } from "../util/logger.js";
|
|
5
|
-
import { Model } from "../model.js";
|
|
6
|
-
import { sanitizeAttributes } from "../util/util.js";
|
|
7
|
-
import { success, } from "../types.js";
|
|
8
|
-
import path from "path";
|
|
9
|
-
export class LlamaCPP extends BaseClient {
|
|
10
|
-
llama = null;
|
|
11
|
-
llamaModel = null;
|
|
12
|
-
modelDir;
|
|
13
|
-
model;
|
|
14
|
-
logger;
|
|
15
|
-
constructor(config) {
|
|
16
|
-
super(config);
|
|
17
|
-
if (!config.llamaCppModelDir) {
|
|
18
|
-
throw new Error("llamaCppModelDir is required in the config when using the LlamaCPP client.");
|
|
19
|
-
}
|
|
20
|
-
this.model = new Model(config.model);
|
|
21
|
-
this.modelDir = config.llamaCppModelDir;
|
|
22
|
-
this.logger = getLogger();
|
|
23
|
-
}
|
|
24
|
-
async setup() {
|
|
25
|
-
this.llama = await getLlama({ logLevel: LlamaLogLevel.error });
|
|
26
|
-
this.llamaModel = await this.llama.loadModel({
|
|
27
|
-
modelPath: path.join(this.modelDir, this.config.model),
|
|
28
|
-
});
|
|
29
|
-
}
|
|
30
|
-
getModelName() {
|
|
31
|
-
return this.model.getResolvedModel();
|
|
32
|
-
}
|
|
33
|
-
/**
|
|
34
|
-
* Converts smoltalk messages to node-llama-cpp's ChatHistoryItem format.
|
|
35
|
-
* Builds the full history including the last user message (LlamaChat.generateResponse
|
|
36
|
-
* expects the complete history, unlike LlamaChatSession which takes the last message separately).
|
|
37
|
-
*/
|
|
38
|
-
convertMessages(messages) {
|
|
39
|
-
let systemPrompt;
|
|
40
|
-
const chatHistory = [];
|
|
41
|
-
for (let i = 0; i < messages.length; i++) {
|
|
42
|
-
const msg = messages[i];
|
|
43
|
-
if (msg.role === "system" || msg.role === "developer") {
|
|
44
|
-
if (!systemPrompt) {
|
|
45
|
-
systemPrompt = msg.content;
|
|
46
|
-
}
|
|
47
|
-
else {
|
|
48
|
-
systemPrompt += "\n" + msg.content;
|
|
49
|
-
}
|
|
50
|
-
}
|
|
51
|
-
else if (msg.role === "user") {
|
|
52
|
-
chatHistory.push({ type: "user", text: msg.content });
|
|
53
|
-
}
|
|
54
|
-
else if (msg.role === "assistant") {
|
|
55
|
-
const assistantMsg = msg;
|
|
56
|
-
const response = [];
|
|
57
|
-
if (assistantMsg.content) {
|
|
58
|
-
response.push(assistantMsg.content);
|
|
59
|
-
}
|
|
60
|
-
// Handle tool calls: pair them with their results from subsequent tool messages
|
|
61
|
-
if (assistantMsg.toolCalls?.length) {
|
|
62
|
-
for (const tc of assistantMsg.toolCalls) {
|
|
63
|
-
// Find the corresponding tool result message
|
|
64
|
-
const toolResultMsg = messages
|
|
65
|
-
.slice(i + 1)
|
|
66
|
-
.find((m) => m.role === "tool" &&
|
|
67
|
-
m.tool_call_id === tc.id);
|
|
68
|
-
response.push({
|
|
69
|
-
type: "functionCall",
|
|
70
|
-
name: tc.name,
|
|
71
|
-
params: tc.arguments,
|
|
72
|
-
result: toolResultMsg ? toolResultMsg.content : undefined,
|
|
73
|
-
});
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
chatHistory.push({ type: "model", response });
|
|
77
|
-
}
|
|
78
|
-
// Tool messages are handled as part of assistant messages above
|
|
79
|
-
}
|
|
80
|
-
// Prepend system message if present
|
|
81
|
-
if (systemPrompt) {
|
|
82
|
-
chatHistory.unshift({ type: "system", text: systemPrompt });
|
|
83
|
-
}
|
|
84
|
-
return { systemPrompt, chatHistory };
|
|
85
|
-
}
|
|
86
|
-
/**
|
|
87
|
-
* Builds node-llama-cpp function definitions from smoltalk tool configs.
|
|
88
|
-
* Uses ChatModelFunctions (no handler) — LlamaChat.generateResponse() returns
|
|
89
|
-
* function calls without executing them, which matches smoltalk's tool loop model.
|
|
90
|
-
*/
|
|
91
|
-
buildFunctions(tools) {
|
|
92
|
-
if (!tools)
|
|
93
|
-
return undefined;
|
|
94
|
-
const functions = {};
|
|
95
|
-
for (const tool of tools) {
|
|
96
|
-
const jsonSchema = tool.schema.toJSONSchema();
|
|
97
|
-
functions[tool.name] = {
|
|
98
|
-
description: tool.description,
|
|
99
|
-
params: jsonSchema,
|
|
100
|
-
};
|
|
101
|
-
}
|
|
102
|
-
return functions;
|
|
103
|
-
}
|
|
104
|
-
calculateUsageAndCost(meterBefore, meterAfter) {
|
|
105
|
-
const inputTokens = meterAfter.usedInputTokens - meterBefore.usedInputTokens;
|
|
106
|
-
const outputTokens = meterAfter.usedOutputTokens - meterBefore.usedOutputTokens;
|
|
107
|
-
const usage = {
|
|
108
|
-
inputTokens,
|
|
109
|
-
outputTokens,
|
|
110
|
-
totalTokens: inputTokens + outputTokens,
|
|
111
|
-
};
|
|
112
|
-
const cost = this.model.calculateCost(usage) ?? undefined;
|
|
113
|
-
return { usage, cost };
|
|
114
|
-
}
|
|
115
|
-
extractToolCalls(functionCalls) {
|
|
116
|
-
if (!functionCalls?.length)
|
|
117
|
-
return [];
|
|
118
|
-
return functionCalls.map((fc) => new ToolCall(fc.functionName, fc.functionName, (fc.params ?? {})));
|
|
119
|
-
}
|
|
120
|
-
async _textSync(config) {
|
|
121
|
-
if (!this.llama || !this.llamaModel) {
|
|
122
|
-
await this.setup();
|
|
123
|
-
}
|
|
124
|
-
const setupLlama = this.llama;
|
|
125
|
-
const setupModel = this.llamaModel;
|
|
126
|
-
const { chatHistory } = this.convertMessages(config.messages);
|
|
127
|
-
if (chatHistory.length === 0) {
|
|
128
|
-
return success({
|
|
129
|
-
output: "",
|
|
130
|
-
toolCalls: [],
|
|
131
|
-
model: this.getModelName(),
|
|
132
|
-
});
|
|
133
|
-
}
|
|
134
|
-
// Create grammar for response format
|
|
135
|
-
let grammar;
|
|
136
|
-
if (config.responseFormat) {
|
|
137
|
-
grammar = await setupLlama.createGrammarForJsonSchema(config.responseFormat.toJSONSchema());
|
|
138
|
-
}
|
|
139
|
-
// Create context and LlamaChat
|
|
140
|
-
const context = await setupModel.createContext();
|
|
141
|
-
const sequence = context.getSequence();
|
|
142
|
-
const chat = new LlamaChat({
|
|
143
|
-
contextSequence: sequence,
|
|
144
|
-
});
|
|
145
|
-
// Build tools if provided
|
|
146
|
-
const functions = this.buildFunctions(config.tools);
|
|
147
|
-
// Track token usage
|
|
148
|
-
const meterBefore = sequence.tokenMeter.getState();
|
|
149
|
-
// Build options
|
|
150
|
-
const options = {};
|
|
151
|
-
if (config.maxTokens !== undefined) {
|
|
152
|
-
options.maxTokens = config.maxTokens;
|
|
153
|
-
}
|
|
154
|
-
if (config.temperature !== undefined) {
|
|
155
|
-
options.temperature = config.temperature;
|
|
156
|
-
}
|
|
157
|
-
if (config.abortSignal) {
|
|
158
|
-
options.signal = config.abortSignal;
|
|
159
|
-
options.stopOnAbortSignal = true;
|
|
160
|
-
}
|
|
161
|
-
if (grammar && !functions) {
|
|
162
|
-
options.grammar = grammar;
|
|
163
|
-
}
|
|
164
|
-
if (functions) {
|
|
165
|
-
options.functions = functions;
|
|
166
|
-
}
|
|
167
|
-
// Apply raw attributes
|
|
168
|
-
Object.assign(options, sanitizeAttributes(config.rawAttributes));
|
|
169
|
-
this.logger.debug("Sending request to llama.cpp");
|
|
170
|
-
this.statelogClient?.promptRequest({
|
|
171
|
-
model: this.getModelName(),
|
|
172
|
-
messageCount: config.messages.length,
|
|
173
|
-
});
|
|
174
|
-
let result;
|
|
175
|
-
let meterAfter;
|
|
176
|
-
try {
|
|
177
|
-
result = await chat.generateResponse(chatHistory, options);
|
|
178
|
-
meterAfter = sequence.tokenMeter.getState();
|
|
179
|
-
}
|
|
180
|
-
finally {
|
|
181
|
-
chat.dispose();
|
|
182
|
-
await context.dispose();
|
|
183
|
-
}
|
|
184
|
-
// Extract text output
|
|
185
|
-
const output = result.response || null;
|
|
186
|
-
// Extract tool calls — generateResponse returns them without executing handlers
|
|
187
|
-
const toolCalls = this.extractToolCalls(result.functionCalls);
|
|
188
|
-
// Calculate usage and cost
|
|
189
|
-
const { usage, cost } = this.calculateUsageAndCost(meterBefore, meterAfter);
|
|
190
|
-
this.logger.debug("Response from llama.cpp:", output);
|
|
191
|
-
this.statelogClient?.promptResponse({ output, usage, cost });
|
|
192
|
-
return success({
|
|
193
|
-
output,
|
|
194
|
-
toolCalls,
|
|
195
|
-
usage,
|
|
196
|
-
cost,
|
|
197
|
-
model: this.getModelName(),
|
|
198
|
-
});
|
|
199
|
-
}
|
|
200
|
-
async *_textStream(config) {
|
|
201
|
-
if (!this.llama || !this.llamaModel) {
|
|
202
|
-
await this.setup();
|
|
203
|
-
}
|
|
204
|
-
const setupLlama = this.llama;
|
|
205
|
-
const setupModel = this.llamaModel;
|
|
206
|
-
const { chatHistory } = this.convertMessages(config.messages);
|
|
207
|
-
if (chatHistory.length === 0) {
|
|
208
|
-
yield {
|
|
209
|
-
type: "done",
|
|
210
|
-
result: { output: null, toolCalls: [], model: this.getModelName() },
|
|
211
|
-
};
|
|
212
|
-
return;
|
|
213
|
-
}
|
|
214
|
-
// Create grammar for response format
|
|
215
|
-
let grammar;
|
|
216
|
-
if (config.responseFormat) {
|
|
217
|
-
grammar = await setupLlama.createGrammarForJsonSchema(config.responseFormat.toJSONSchema());
|
|
218
|
-
}
|
|
219
|
-
// Create context and LlamaChat
|
|
220
|
-
const context = await setupModel.createContext();
|
|
221
|
-
const sequence = context.getSequence();
|
|
222
|
-
const chat = new LlamaChat({
|
|
223
|
-
contextSequence: sequence,
|
|
224
|
-
});
|
|
225
|
-
const functions = this.buildFunctions(config.tools);
|
|
226
|
-
const meterBefore = sequence.tokenMeter.getState();
|
|
227
|
-
// Bridge callback-based streaming to async generator using a queue
|
|
228
|
-
const chunks = [];
|
|
229
|
-
let resolveWaiter = null;
|
|
230
|
-
let done = false;
|
|
231
|
-
const pushChunk = (chunk) => {
|
|
232
|
-
chunks.push(chunk);
|
|
233
|
-
if (resolveWaiter) {
|
|
234
|
-
resolveWaiter();
|
|
235
|
-
resolveWaiter = null;
|
|
236
|
-
}
|
|
237
|
-
};
|
|
238
|
-
// Build options
|
|
239
|
-
const options = {
|
|
240
|
-
onTextChunk: (text) => {
|
|
241
|
-
pushChunk({ type: "text", text });
|
|
242
|
-
},
|
|
243
|
-
};
|
|
244
|
-
if (config.maxTokens !== undefined) {
|
|
245
|
-
options.maxTokens = config.maxTokens;
|
|
246
|
-
}
|
|
247
|
-
if (config.temperature !== undefined) {
|
|
248
|
-
options.temperature = config.temperature;
|
|
249
|
-
}
|
|
250
|
-
if (config.abortSignal) {
|
|
251
|
-
options.signal = config.abortSignal;
|
|
252
|
-
options.stopOnAbortSignal = true;
|
|
253
|
-
}
|
|
254
|
-
if (grammar && !functions) {
|
|
255
|
-
options.grammar = grammar;
|
|
256
|
-
}
|
|
257
|
-
if (functions) {
|
|
258
|
-
options.functions = functions;
|
|
259
|
-
}
|
|
260
|
-
Object.assign(options, sanitizeAttributes(config.rawAttributes));
|
|
261
|
-
this.logger.debug("Sending streaming request to llama.cpp");
|
|
262
|
-
this.statelogClient?.promptRequest({
|
|
263
|
-
model: this.getModelName(),
|
|
264
|
-
messageCount: config.messages.length,
|
|
265
|
-
});
|
|
266
|
-
// Run generateResponse in background, push chunks as they arrive
|
|
267
|
-
const promptPromise = chat
|
|
268
|
-
.generateResponse(chatHistory, options)
|
|
269
|
-
.then((result) => {
|
|
270
|
-
const meterAfter = sequence.tokenMeter.getState();
|
|
271
|
-
const toolCalls = this.extractToolCalls(result.functionCalls);
|
|
272
|
-
for (const tc of toolCalls) {
|
|
273
|
-
pushChunk({ type: "tool_call", toolCall: tc });
|
|
274
|
-
}
|
|
275
|
-
const { usage, cost } = this.calculateUsageAndCost(meterBefore, meterAfter);
|
|
276
|
-
const output = result.response || null;
|
|
277
|
-
this.logger.debug("Streaming response completed from llama.cpp");
|
|
278
|
-
this.statelogClient?.promptResponse({ output, usage, cost });
|
|
279
|
-
pushChunk({
|
|
280
|
-
type: "done",
|
|
281
|
-
result: {
|
|
282
|
-
output,
|
|
283
|
-
toolCalls,
|
|
284
|
-
usage,
|
|
285
|
-
cost,
|
|
286
|
-
model: this.getModelName(),
|
|
287
|
-
},
|
|
288
|
-
});
|
|
289
|
-
})
|
|
290
|
-
.catch((error) => {
|
|
291
|
-
pushChunk({ type: "error", error: error.message });
|
|
292
|
-
})
|
|
293
|
-
.finally(() => {
|
|
294
|
-
done = true;
|
|
295
|
-
chat.dispose();
|
|
296
|
-
context.dispose();
|
|
297
|
-
// Wake up the generator if it's waiting
|
|
298
|
-
if (resolveWaiter) {
|
|
299
|
-
resolveWaiter();
|
|
300
|
-
resolveWaiter = null;
|
|
301
|
-
}
|
|
302
|
-
});
|
|
303
|
-
// Yield chunks as they arrive
|
|
304
|
-
while (!done || chunks.length > 0) {
|
|
305
|
-
if (chunks.length > 0) {
|
|
306
|
-
yield chunks.shift();
|
|
307
|
-
}
|
|
308
|
-
else if (!done) {
|
|
309
|
-
await new Promise((resolve) => {
|
|
310
|
-
resolveWaiter = resolve;
|
|
311
|
-
});
|
|
312
|
-
}
|
|
313
|
-
}
|
|
314
|
-
await promptPromise;
|
|
315
|
-
}
|
|
316
|
-
}
|
package/dist/latencyTracker.d.ts
DELETED
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
export type LatencySample = {
|
|
2
|
-
/** Milliseconds per output token */
|
|
3
|
-
msPerToken: number;
|
|
4
|
-
/** Timestamp when sample was recorded */
|
|
5
|
-
timestamp: number;
|
|
6
|
-
};
|
|
7
|
-
declare class LatencyTracker {
|
|
8
|
-
private samples;
|
|
9
|
-
private windowSize;
|
|
10
|
-
constructor(windowSize?: number);
|
|
11
|
-
/** Record a latency sample for a model. */
|
|
12
|
-
record(model: string, elapsedMs: number, outputTokens: number): void;
|
|
13
|
-
/** Get the windowed mean ms-per-token for a model, or null if no samples. */
|
|
14
|
-
getMeanMsPerToken(model: string): number | null;
|
|
15
|
-
/**
|
|
16
|
-
* Get estimated output tokens per second for a model based on tracked latency.
|
|
17
|
-
* Returns null if no samples exist or if the number of samples is below the minimum required.
|
|
18
|
-
*/
|
|
19
|
-
getTokensPerSecond(model: string, minSamples?: number): number | null;
|
|
20
|
-
/** Get the number of samples recorded for a model. */
|
|
21
|
-
getSampleCount(model: string): number;
|
|
22
|
-
/** Get all samples for a model (defensive copy). */
|
|
23
|
-
getSamples(model: string): LatencySample[];
|
|
24
|
-
/** Clear all samples for a model. */
|
|
25
|
-
clear(model?: string): void;
|
|
26
|
-
/** Update the window size. Existing samples beyond the new size are trimmed. */
|
|
27
|
-
setWindowSize(size: number): void;
|
|
28
|
-
getWindowSize(): number;
|
|
29
|
-
}
|
|
30
|
-
/** Global singleton latency tracker. */
|
|
31
|
-
export declare const latencyTracker: LatencyTracker;
|
|
32
|
-
export {};
|
package/dist/latencyTracker.js
DELETED
|
@@ -1,73 +0,0 @@
|
|
|
1
|
-
const DEFAULT_WINDOW_SIZE = 10;
|
|
2
|
-
class LatencyTracker {
|
|
3
|
-
samples = new Map();
|
|
4
|
-
windowSize;
|
|
5
|
-
constructor(windowSize = DEFAULT_WINDOW_SIZE) {
|
|
6
|
-
this.windowSize = windowSize;
|
|
7
|
-
}
|
|
8
|
-
/** Record a latency sample for a model. */
|
|
9
|
-
record(model, elapsedMs, outputTokens) {
|
|
10
|
-
if (outputTokens <= 0 || elapsedMs <= 0)
|
|
11
|
-
return;
|
|
12
|
-
const msPerToken = elapsedMs / outputTokens;
|
|
13
|
-
const samples = this.samples.get(model) ?? [];
|
|
14
|
-
samples.push({ msPerToken, timestamp: Date.now() });
|
|
15
|
-
// Keep only the last windowSize samples
|
|
16
|
-
if (samples.length > this.windowSize) {
|
|
17
|
-
samples.splice(0, samples.length - this.windowSize);
|
|
18
|
-
}
|
|
19
|
-
this.samples.set(model, samples);
|
|
20
|
-
}
|
|
21
|
-
/** Get the windowed mean ms-per-token for a model, or null if no samples. */
|
|
22
|
-
getMeanMsPerToken(model) {
|
|
23
|
-
const samples = this.samples.get(model);
|
|
24
|
-
if (!samples || samples.length === 0)
|
|
25
|
-
return null;
|
|
26
|
-
const sum = samples.reduce((acc, s) => acc + s.msPerToken, 0);
|
|
27
|
-
return sum / samples.length;
|
|
28
|
-
}
|
|
29
|
-
/**
|
|
30
|
-
* Get estimated output tokens per second for a model based on tracked latency.
|
|
31
|
-
* Returns null if no samples exist or if the number of samples is below the minimum required.
|
|
32
|
-
*/
|
|
33
|
-
getTokensPerSecond(model, minSamples = 1) {
|
|
34
|
-
const sampleCount = this.getSampleCount(model);
|
|
35
|
-
if (sampleCount < minSamples)
|
|
36
|
-
return null;
|
|
37
|
-
const msPerToken = this.getMeanMsPerToken(model);
|
|
38
|
-
if (msPerToken === null || msPerToken === 0)
|
|
39
|
-
return null;
|
|
40
|
-
return 1000 / msPerToken;
|
|
41
|
-
}
|
|
42
|
-
/** Get the number of samples recorded for a model. */
|
|
43
|
-
getSampleCount(model) {
|
|
44
|
-
return this.samples.get(model)?.length ?? 0;
|
|
45
|
-
}
|
|
46
|
-
/** Get all samples for a model (defensive copy). */
|
|
47
|
-
getSamples(model) {
|
|
48
|
-
return [...(this.samples.get(model) ?? [])];
|
|
49
|
-
}
|
|
50
|
-
/** Clear all samples for a model. */
|
|
51
|
-
clear(model) {
|
|
52
|
-
if (model) {
|
|
53
|
-
this.samples.delete(model);
|
|
54
|
-
}
|
|
55
|
-
else {
|
|
56
|
-
this.samples.clear();
|
|
57
|
-
}
|
|
58
|
-
}
|
|
59
|
-
/** Update the window size. Existing samples beyond the new size are trimmed. */
|
|
60
|
-
setWindowSize(size) {
|
|
61
|
-
this.windowSize = size;
|
|
62
|
-
for (const [model, samples] of this.samples) {
|
|
63
|
-
if (samples.length > size) {
|
|
64
|
-
samples.splice(0, samples.length - size);
|
|
65
|
-
}
|
|
66
|
-
}
|
|
67
|
-
}
|
|
68
|
-
getWindowSize() {
|
|
69
|
-
return this.windowSize;
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
/** Global singleton latency tracker. */
|
|
73
|
-
export const latencyTracker = new LatencyTracker();
|
package/dist/middleware.d.ts
DELETED
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
import { ZodType } from "zod";
|
|
2
|
-
import { Message } from "./classes/message/index.js";
|
|
3
|
-
import { PromptConfig, PromptResult, SmolPromptConfig, StreamChunk } from "./types.js";
|
|
4
|
-
import { Result } from "./types/result.js";
|
|
5
|
-
import { TokenUsage } from "./types/tokenUsage.js";
|
|
6
|
-
import { CostEstimate } from "./types/costEstimate.js";
|
|
7
|
-
export type MiddlewareCheck = {
|
|
8
|
-
/** Messages for the middleware LLM call (original prompt messages are appended automatically). */
|
|
9
|
-
messages: Message[];
|
|
10
|
-
/** Optional Zod schema for structured output from the middleware. */
|
|
11
|
-
responseFormat?: ZodType;
|
|
12
|
-
responseFormatOptions?: PromptConfig["responseFormatOptions"];
|
|
13
|
-
/**
|
|
14
|
-
* Given the middleware's result, decide whether to block.
|
|
15
|
-
* Return a replacement output string to block, or null/undefined to pass.
|
|
16
|
-
*/
|
|
17
|
-
decide: (result: PromptResult) => string | null;
|
|
18
|
-
};
|
|
19
|
-
export type MiddlewareConfig = {
|
|
20
|
-
/** Run all checks before the main prompt, or in parallel with it. */
|
|
21
|
-
timing: "before" | "parallel";
|
|
22
|
-
/** Run checks in parallel or sequentially (short-circuit on first block). */
|
|
23
|
-
mode: "parallel" | "sequential";
|
|
24
|
-
/** The middleware checks to run. */
|
|
25
|
-
checks: MiddlewareCheck[];
|
|
26
|
-
};
|
|
27
|
-
export type MiddlewareResult = {
|
|
28
|
-
blocked: boolean;
|
|
29
|
-
result: Result<PromptResult>;
|
|
30
|
-
usage?: TokenUsage;
|
|
31
|
-
cost?: CostEstimate;
|
|
32
|
-
};
|
|
33
|
-
/**
|
|
34
|
-
* Run a single middleware check. Returns a MiddlewareResult indicating
|
|
35
|
-
* whether the check blocked and what output to use.
|
|
36
|
-
*/
|
|
37
|
-
export declare function runMiddlewareCheck(check: MiddlewareCheck, parentConfig: SmolPromptConfig, textSyncFn: (config: SmolPromptConfig) => Promise<Result<PromptResult>>): Promise<MiddlewareResult>;
|
|
38
|
-
/**
|
|
39
|
-
* Run multiple middleware checks in sequential or parallel mode.
|
|
40
|
-
* Returns a combined MiddlewareResult.
|
|
41
|
-
*/
|
|
42
|
-
export declare function runMiddlewareChecks(checks: MiddlewareCheck[], mode: "sequential" | "parallel", parentConfig: SmolPromptConfig, textSyncFn: (config: SmolPromptConfig) => Promise<Result<PromptResult>>): Promise<MiddlewareResult>;
|
|
43
|
-
/**
|
|
44
|
-
* High-level middleware orchestration for sync calls.
|
|
45
|
-
* Returns the blocked result if middleware blocks, the main prompt result for parallel timing,
|
|
46
|
-
* or null to indicate "proceed normally" (no middleware or middleware passed with "before" timing).
|
|
47
|
-
*/
|
|
48
|
-
export declare function executeMiddlewareSync(config: SmolPromptConfig, runMainPrompt: (config: SmolPromptConfig) => Promise<Result<PromptResult>>, textSyncFn: (config: SmolPromptConfig) => Promise<Result<PromptResult>>): Promise<Result<PromptResult> | null>;
|
|
49
|
-
/**
|
|
50
|
-
* High-level middleware orchestration for streaming calls.
|
|
51
|
-
* Yields stream chunks, handling middleware checks according to timing config.
|
|
52
|
-
* Only call this when middleware is configured — the caller should check first.
|
|
53
|
-
*/
|
|
54
|
-
export declare function executeMiddlewareStream(config: SmolPromptConfig, getStream: (config: SmolPromptConfig) => AsyncGenerator<StreamChunk>, textSyncFn: (config: SmolPromptConfig) => Promise<Result<PromptResult>>): AsyncGenerator<StreamChunk>;
|