@agorapete/wllama 3.5.1-q2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.gitmodules +3 -0
- package/.prettierignore +38 -0
- package/AGENTS.md +1 -0
- package/CMakeLists.txt +131 -0
- package/LICENCE +21 -0
- package/README-dev.md +178 -0
- package/README.md +225 -0
- package/README_banner.png +0 -0
- package/assets/screenshot_0.png +0 -0
- package/cpp/generate_glue_prototype.js +115 -0
- package/cpp/glue.hpp +664 -0
- package/cpp/test_glue.cpp +80 -0
- package/cpp/wllama-context.h +1172 -0
- package/cpp/wllama-fs.h +148 -0
- package/cpp/wllama.cpp +187 -0
- package/cpp/wllama.h +6 -0
- package/esm/cache-manager.d.ts +130 -0
- package/esm/debug.d.ts +28 -0
- package/esm/glue/glue.d.ts +22 -0
- package/esm/glue/messages.d.ts +146 -0
- package/esm/huggingface.d.ts +31 -0
- package/esm/index.cjs +3406 -0
- package/esm/index.d.ts +8 -0
- package/esm/index.js +3387 -0
- package/esm/index.min.js +1 -0
- package/esm/index.min.js.map +1 -0
- package/esm/model-manager.d.ts +136 -0
- package/esm/storage/cos.d.ts +36 -0
- package/esm/storage/index.d.ts +33 -0
- package/esm/storage/opfs.d.ts +12 -0
- package/esm/types/oai-compat.d.ts +278 -0
- package/esm/types/types.d.ts +112 -0
- package/esm/utils.d.ts +119 -0
- package/esm/wasm/source-map.d.ts +1 -0
- package/esm/wasm/wllama.wasm +0 -0
- package/esm/wasm-from-cdn.d.ts +8 -0
- package/esm/wllama.d.ts +397 -0
- package/esm/worker.d.ts +92 -0
- package/esm/workers-code/generated.d.ts +4 -0
- package/guides/intro-v2.md +132 -0
- package/guides/intro-v3.1.md +40 -0
- package/guides/intro-v3.md +230 -0
- package/index.ts +1 -0
- package/package.json +71 -0
- package/scripts/bisect_test.sh +33 -0
- package/scripts/build_hf_space.sh +26 -0
- package/scripts/build_source_map.js +269 -0
- package/scripts/build_wasm.sh +19 -0
- package/scripts/build_worker.sh +38 -0
- package/scripts/check_debug_build.js +30 -0
- package/scripts/check_package_size.js +25 -0
- package/scripts/docker-compose.yml +76 -0
- package/scripts/generate_wasm_from_cdn.js +24 -0
- package/scripts/http_server.js +44 -0
- package/scripts/post_build.sh +32 -0
- package/src/cache-manager.ts +358 -0
- package/src/debug.ts +111 -0
- package/src/glue/glue.ts +291 -0
- package/src/glue/messages.ts +773 -0
- package/src/huggingface.ts +151 -0
- package/src/index.ts +8 -0
- package/src/mjs.test.ts +44 -0
- package/src/model-manager.test.ts +200 -0
- package/src/model-manager.ts +359 -0
- package/src/storage/cos.test.ts +83 -0
- package/src/storage/cos.ts +171 -0
- package/src/storage/index.ts +40 -0
- package/src/storage/opfs.ts +119 -0
- package/src/types/oai-compat.ts +342 -0
- package/src/types/types.ts +133 -0
- package/src/utils.test.ts +231 -0
- package/src/utils.ts +403 -0
- package/src/wasm/source-map.ts +7 -0
- package/src/wasm/wllama.js +1 -0
- package/src/wasm/wllama.wasm +0 -0
- package/src/wasm-from-cdn.ts +13 -0
- package/src/wllama.test.ts +392 -0
- package/src/wllama.ts +1138 -0
- package/src/wllama.wgpu.test.ts +62 -0
- package/src/worker.ts +443 -0
- package/src/workers-code/generated.ts +11 -0
- package/src/workers-code/llama-cpp.js +511 -0
- package/src/workers-code/opfs-utils.js +150 -0
- package/tsconfig.build.json +34 -0
- package/tsup.config.ts +23 -0
- package/vitest.config.ts +61 -0
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import { createWorker, isSafariMobile } from '../utils';
|
|
2
|
+
import { OPFS_UTILS_WORKER_CODE } from '../workers-code/generated';
|
|
3
|
+
import type { StorageBackend } from './index';
|
|
4
|
+
|
|
5
|
+
/**
 * {@link StorageBackend} implementation on top of the Origin Private File
 * System (OPFS). Every entry is a file, named by its key, inside the `cache`
 * directory under the OPFS root.
 */
export class OPFSBackend implements StorageBackend {
  /**
   * Feature-detects OPFS.
   *
   * @returns `true` when `navigator.storage.getDirectory` is available.
   */
  isSupported(): boolean {
    return (
      typeof navigator !== 'undefined' &&
      'storage' in navigator &&
      !!navigator.storage?.getDirectory
    );
  }

  /**
   * Looks up a stored entry.
   *
   * @param key File name inside the cache directory.
   * @returns The stored file, or `null` when it cannot be obtained for any
   *   reason (missing entry, OPFS unavailable, ...).
   */
  async read(key: string): Promise<Blob | null> {
    try {
      const cacheDir = await getCacheDir();
      const fileHandle = await cacheDir.getFileHandle(key);
      return await fileHandle.getFile();
    } catch (e) {
      // NotFoundError or similar: deliberately reported as "no entry".
      return null;
    }
  }

  /**
   * Streams `stream` into the entry named `key`, replacing previous content.
   *
   * Chunks are written one at a time through the writable returned by
   * `openWritable`. The writable is always closed. When reading or writing
   * fails, the source stream is cancelled and that first error is rethrown;
   * a secondary failure while closing does not hide it.
   *
   * @param key File name inside the cache directory.
   * @param stream Source of the bytes to store.
   * @throws The first error raised while reading or writing, or the close
   *   error when everything else succeeded.
   */
  async write(key: string, stream: ReadableStream): Promise<void> {
    const writable = await openWritable(key);
    let reader: ReadableStreamDefaultReader | undefined;
    let failed = false;
    let failure: unknown;
    try {
      await writable.truncate(0);
      reader = stream.getReader();
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        await writable.write(value);
      }
    } catch (e) {
      failed = true;
      failure = e;
      // Best effort: tell the producer to stop, nobody will consume the rest.
      await reader?.cancel(e).catch(() => undefined);
    }
    try {
      await writable.close();
    } catch (closeError) {
      // Only surface the close error when it is the primary failure.
      if (!failed) throw closeError;
    }
    if (failed) throw failure;
  }

  /**
   * Reports the size of a stored entry.
   *
   * @param key File name inside the cache directory.
   * @returns Size in bytes, or `-1` when the entry cannot be read.
   */
  async getSize(key: string): Promise<number> {
    try {
      const cacheDir = await getCacheDir();
      const fileHandle = await cacheDir.getFileHandle(key);
      const file = await fileHandle.getFile();
      return file.size;
    } catch (e) {
      return -1;
    }
  }

  /**
   * Enumerates the files stored directly in the cache directory.
   * Sub-directories are skipped.
   *
   * @returns One `{ key, size }` record per file.
   */
  async list(): Promise<Array<{ key: string; size: number }>> {
    const cacheDir = await getCacheDir();
    const result: Array<{ key: string; size: number }> = [];
    // @ts-ignore -- `entries()` may be missing from the DOM typings in use
    for await (const [name, handle] of cacheDir.entries()) {
      if (handle.kind === 'file') {
        const file = await (handle as FileSystemFileHandle).getFile();
        result.push({ key: name, size: file.size });
      }
    }
    return result;
  }

  /**
   * Removes an entry. Deleting a key that does not exist is not an error.
   *
   * @param key File name inside the cache directory.
   * @throws Any failure other than `NotFoundError`.
   */
  async delete(key: string): Promise<void> {
    try {
      const cacheDir = await getCacheDir();
      await cacheDir.removeEntry(key);
    } catch (e: unknown) {
      const errorName = (e as { name?: unknown } | null | undefined)?.name;
      if (errorName !== 'NotFoundError') throw e;
    }
  }
}
|
|
73
|
+
|
|
74
|
+
/**
 * Resolves the `cache` directory under the OPFS root, creating it when it
 * does not exist yet.
 *
 * @returns Handle of the `cache` directory.
 */
async function getCacheDir(): Promise<FileSystemDirectoryHandle> {
  const root = await navigator.storage.getDirectory();
  const cacheDirHandle = root.getDirectoryHandle('cache', { create: true });
  return cacheDirHandle;
}
|
|
78
|
+
|
|
79
|
+
/**
 * Opens `fileName` for writing through a dedicated worker built from
 * `OPFS_UTILS_WORKER_CODE`, and returns a minimal writable facade.
 *
 * Requests are sent to the worker one at a time: each `workerExec` call stores
 * its own resolve/reject pair, and the next worker message (`ok` or `err`)
 * settles it. Callers must therefore await each call before issuing another.
 *
 * The worker is terminated when `close()` finishes, whether or not the close
 * request succeeded, and also when the initial `open` request fails, so no
 * worker is left running.
 *
 * @param fileName Name of the file to open.
 * @returns An object exposing `truncate`, `write` and `close`.
 * @throws Whatever the worker reports for a failed `open`.
 */
async function openWritable(fileName: string): Promise<{
  truncate(offset: number): Promise<void>;
  write(value: Uint8Array): Promise<void>;
  close(): Promise<void>;
}> {
  const worker = createWorker(OPFS_UTILS_WORKER_CODE);
  // Settlers of the single in-flight request.
  let pResolve: (v: any) => void;
  let pReject: (v: any) => void;
  worker.onmessage = (e: MessageEvent<any>) => {
    if (e.data.ok) pResolve(null);
    else if (e.data.err) pReject(e.data.err);
  };
  worker.onerror = (e) => pReject?.(e.message ?? e);
  const workerExec = (
    data:
      | { action: 'open'; filename: string }
      | { action: 'write'; buf: Uint8Array }
      | { action: 'close' }
  ) =>
    new Promise<void>((resolve, reject) => {
      pResolve = resolve;
      pReject = reject;
      worker.postMessage(
        data,
        // No transfer list on mobile Safari; elsewhere the chunk's buffer is
        // transferred to the worker.
        isSafariMobile()
          ? undefined
          : { transfer: 'buf' in data && data.buf ? [data.buf.buffer] : [] }
      );
    });
  try {
    await workerExec({ action: 'open', filename: fileName });
  } catch (e) {
    // Nothing can use this worker any more; do not leak it.
    worker.terminate();
    throw e;
  }
  return {
    truncate: async () => {
      /* worker's openFile already calls truncate(0) on open */
    },
    write: (value) => workerExec({ action: 'write', buf: value }),
    close: async () => {
      try {
        await workerExec({ action: 'close' });
      } finally {
        // Always release the worker, even when the close request failed.
        worker.terminate();
      }
    },
  };
}
|
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
import type { SamplingParams } from './types';
|
|
2
|
+
|
|
3
|
+
// Message content types
|
|
4
|
+
|
|
5
|
+
/** Text part of a chat message's content, tagged with `type: 'text'`. */
export interface ChatCompletionMessageText {
  /** Discriminator. */
  type: 'text';
  /** The text carried by this part. */
  text: string;
}
|
|
9
|
+
|
|
10
|
+
/** Image part of a chat message's content, tagged with `type: 'image'`. */
export interface ChatCompletionMessageImage {
  /** Discriminator. */
  type: 'image';
  /** Image bytes. NOTE(review): expected file format is not stated here. */
  data: ArrayBuffer;
}
|
|
14
|
+
|
|
15
|
+
/** Audio part of a chat message's content, tagged with `type: 'audio'`. */
export interface ChatCompletionMessageAudio {
  /** Discriminator. */
  type: 'audio';
  /** Audio bytes. NOTE(review): expected file format is not stated here. */
  data: ArrayBuffer;
}
|
|
19
|
+
|
|
20
|
+
/**
 * One part of a multi-part message content: text, image or audio.
 * Narrow on the `type` field to tell the variants apart.
 */
export type ChatCompletionMessageContent =
  | ChatCompletionMessageText
  | ChatCompletionMessageImage
  | ChatCompletionMessageAudio;
|
|
24
|
+
|
|
25
|
+
// Tool definitions
|
|
26
|
+
|
|
27
|
+
/**
 * Object-shaped schema describing the arguments of a tool function.
 * The field names follow JSON Schema conventions.
 */
export interface ChatCompletionToolFunctionParameters {
  /** Always `'object'`. */
  type: 'object';
  /** Per-argument schema, keyed by argument name. */
  properties: Record<
    string,
    {
      type: string;
      description?: string;
      enum?: string[];
      /** Any further schema keywords are accepted as-is. */
      [key: string]: unknown;
    }
  >;
  /** Names listed as required. */
  required?: string[];
  additionalProperties?: boolean;
}
|
|
41
|
+
|
|
42
|
+
/** Declaration of a function the model may call as a tool. */
export interface ChatCompletionToolFunction {
  /** Function name. */
  name: string;
  /** Optional human-readable description. */
  description?: string;
  /** Optional schema of the function's arguments. */
  parameters?: ChatCompletionToolFunctionParameters;
  strict?: boolean;
}
|
|
48
|
+
|
|
49
|
+
/** A tool offered to the model; only `'function'` tools are modelled. */
export interface ChatCompletionTool {
  /** Discriminator. */
  type: 'function';
  /** The function being offered. */
  function: ChatCompletionToolFunction;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Tool selection policy: one of the keywords `'none'`, `'auto'`, `'required'`,
 * or an object naming one specific function.
 */
export type ChatCompletionToolChoice =
  | 'none'
  | 'auto'
  | 'required'
  | {
      type: 'function';
      function: { name: string };
    };
|
|
59
|
+
|
|
60
|
+
// Message types
|
|
61
|
+
|
|
62
|
+
/** Chat message with `role: 'system'`; its content is plain text. */
export interface ChatCompletionSystemMessage {
  role: 'system';
  content: string;
  /** Optional name attached to the message. */
  name?: string;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Chat message with `role: 'user'`. The content is either plain text or a
 * list of content parts.
 */
export interface ChatCompletionUserMessage {
  role: 'user';
  content: string | ChatCompletionMessageContent[];
  /** Optional name attached to the message. */
  name?: string;
}
|
|
73
|
+
|
|
74
|
+
/** A function call carried by an assistant message. */
export interface ChatCompletionToolCall {
  /** Identifier of this call. */
  id: string;
  type: 'function';
  function: {
    /** Name of the function being called. */
    name: string;
    /** Call arguments as a JSON-encoded string. */
    arguments: string;
  };
}
|
|
82
|
+
|
|
83
|
+
/**
 * Chat message with `role: 'assistant'`. Text content is optional and may be
 * `null`; tool calls are optional.
 */
export interface ChatCompletionAssistantMessage {
  role: 'assistant';
  content?: string | null;
  /** Optional name attached to the message. */
  name?: string;
  tool_calls?: ChatCompletionToolCall[];
}
|
|
89
|
+
|
|
90
|
+
/** Chat message with `role: 'tool'`, tied to a tool call by its id. */
export interface ChatCompletionToolMessage {
  role: 'tool';
  content: string;
  /** Id of the tool call this message refers to. */
  tool_call_id: string;
}
|
|
95
|
+
|
|
96
|
+
/**
 * Any message that can appear in a chat conversation.
 * Narrow on the `role` field to tell the variants apart.
 */
export type ChatCompletionMessage =
  | ChatCompletionSystemMessage
  | ChatCompletionUserMessage
  | ChatCompletionAssistantMessage
  | ChatCompletionToolMessage;
|
|
101
|
+
|
|
102
|
+
// Request params
|
|
103
|
+
|
|
104
|
+
/**
 * Parameters of a chat completion request: the fields declared below plus
 * every field of `SamplingParams` except `logit_bias`.
 *
 * `SamplingParams.logit_bias` is an array of `{ token, bias }` while this type
 * declares `logit_bias` as a token-to-bias map. Intersecting both declarations
 * would give the property a type no ordinary value satisfies, so the sampling
 * variant is omitted and the map declared here is the one that applies.
 * NOTE(review): confirm the request handler consumes the map form.
 */
export type ChatCompletionParams = {
  messages: ChatCompletionMessage[];
  stream?: boolean;
  model?: string;
  abortSignal?: AbortSignal;
  // sampling
  temperature?: number;
  max_tokens?: number;
  // stop?: string | string[];
  // n?: number;
  logprobs?: boolean;
  top_logprobs?: number;
  logit_bias?: Record<string, number>;
  // tools
  tools?: ChatCompletionTool[];
  tool_choice?: ChatCompletionToolChoice;
  // parallel_tool_calls?: boolean;
  // response format
  response_format?: {
    type: 'text' | 'json_object' | 'json_schema';
    json_schema?: { name: string; schema: unknown; strict?: boolean };
  };
  // user-facing
  user?: string;
  // llama-server-specific
  chat_template_kwargs?: Record<string, any>;
  cache_prompt?: boolean;
  return_tokens?: boolean;
  timings_per_token?: boolean;
} & Omit<SamplingParams, 'logit_bias'>;
|
|
134
|
+
|
|
135
|
+
// Response types
|
|
136
|
+
|
|
137
|
+
/** Log-probability record for a single token. */
export interface ChatCompletionLogprob {
  /** Token text. */
  token: string;
  /** Log-probability of the token. */
  logprob: number;
  /** Byte values of the token, or `null` when not provided. */
  bytes: number[] | null;
}
|
|
142
|
+
|
|
143
|
+
/**
 * Log-probability record of a token, extended with a list of further
 * log-probability records (`top_logprobs`).
 */
export interface ChatCompletionLogprobsContent extends ChatCompletionLogprob {
  top_logprobs: ChatCompletionLogprob[];
}
|
|
146
|
+
|
|
147
|
+
/** Log-probability data attached to a choice; each list may be `null`. */
export interface ChatCompletionChoiceLogprobs {
  content: ChatCompletionLogprobsContent[] | null;
  refusal: ChatCompletionLogprobsContent[] | null;
}
|
|
151
|
+
|
|
152
|
+
/** One choice of a non-streamed chat completion response. */
export interface ChatCompletionChoice {
  /** Position of this choice in the response's `choices` list. */
  index: number;
  /** The assistant message for this choice. */
  message: ChatCompletionAssistantMessage;
  /** Why generation ended; may be `null`. */
  finish_reason: 'stop' | 'length' | 'tool_calls' | 'content_filter' | null;
  logprobs: ChatCompletionChoiceLogprobs | null;
}
|
|
158
|
+
|
|
159
|
+
/** Token counters reported with a completion, with optional breakdowns. */
export interface ChatCompletionUsage {
  prompt_tokens: number;
  completion_tokens: number;
  total_tokens: number;
  /** Optional breakdown of the prompt-side count. */
  prompt_tokens_details?: {
    cached_tokens: number;
    audio_tokens: number;
  };
  /** Optional breakdown of the completion-side count. */
  completion_tokens_details?: {
    reasoning_tokens: number;
    audio_tokens: number;
    accepted_prediction_tokens: number;
    rejected_prediction_tokens: number;
  };
}
|
|
171
|
+
|
|
172
|
+
/**
 * Chat completion response returned when `stream` is `false` or omitted.
 */
export interface ChatCompletionResponse {
  id: string;
  /** Always `'chat.completion'`. */
  object: 'chat.completion';
  /** Creation time. NOTE(review): unit is not stated here. */
  created: number;
  model: string;
  choices: ChatCompletionChoice[];
  usage: ChatCompletionUsage;
  system_fingerprint?: string;
  service_tier?: string;
}
|
|
183
|
+
|
|
184
|
+
// Streaming response types
|
|
185
|
+
|
|
186
|
+
/**
 * Delta carried by a streamed chat completion chunk. Every field is optional.
 */
export interface ChatCompletionChunkDelta {
  role?: 'assistant';
  content?: string | null;
  /** Tool call fragments; `index` identifies the call a fragment belongs to. */
  tool_calls?: {
    index: number;
    id?: string;
    type?: 'function';
    function?: { name?: string; arguments?: string };
  }[];
  refusal?: string | null;
}
|
|
197
|
+
|
|
198
|
+
/** One choice inside a streamed chat completion chunk. */
export interface ChatCompletionChunkChoice {
  /** Position of this choice in the chunk's `choices` list. */
  index: number;
  /** The delta carried by this chunk for the choice. */
  delta: ChatCompletionChunkDelta;
  /** Why generation ended; may be `null`. */
  finish_reason: 'stop' | 'length' | 'tool_calls' | 'content_filter' | null;
  logprobs: ChatCompletionChoiceLogprobs | null;
}
|
|
204
|
+
|
|
205
|
+
/**
 * Timing counters for the prompt and prediction phases.
 * NOTE(review): `_n` fields are presumably token counts and `_ms` fields
 * milliseconds; confirm against the producer.
 */
export interface ResultTimings {
  cache_n: number;

  // prompt phase
  prompt_n: number;
  prompt_ms: number;
  prompt_per_token_ms: number;
  prompt_per_second: number;

  // prediction phase
  predicted_n: number;
  predicted_ms: number;
  predicted_per_token_ms: number;
  predicted_per_second: number;
}
|
|
216
|
+
|
|
217
|
+
/**
 * Chat completion response when `stream` is `true`: one chunk per SSE event.
 */
export interface ChatCompletionChunk {
  id: string;
  /** Always `'chat.completion.chunk'`. */
  object: 'chat.completion.chunk';
  /** Creation time. NOTE(review): unit is not stated here. */
  created: number;
  model: string;
  choices: ChatCompletionChunkChoice[];
  /** Optional; may be `null`. */
  usage?: ChatCompletionUsage | null;
  timings?: ResultTimings;
}
|
|
227
|
+
|
|
228
|
+
// Raw (text) completion
|
|
229
|
+
|
|
230
|
+
/**
 * Parameters of a raw (text) completion request: the fields declared below
 * plus every field of `SamplingParams` except `logit_bias`.
 *
 * `SamplingParams.logit_bias` is an array of `{ token, bias }` while this type
 * declares `logit_bias` as a token-to-bias map. Intersecting both declarations
 * would give the property a type no ordinary value satisfies, so the sampling
 * variant is omitted and the map declared here is the one that applies.
 * NOTE(review): confirm the request handler consumes the map form.
 */
export type RawCompletionParams = {
  /** A single prompt or a list of prompts. */
  prompt: string | string[];
  stream?: boolean;
  model?: string;
  abortSignal?: AbortSignal;
  suffix?: string;
  max_tokens?: number;
  temperature?: number;
  top_p?: number;
  n?: number;
  logprobs?: number | null;
  echo?: boolean;
  stop?: string | string[];
  presence_penalty?: number;
  frequency_penalty?: number;
  best_of?: number;
  logit_bias?: Record<string, number>;
  seed?: number;
  user?: string;
} & Omit<SamplingParams, 'logit_bias'>;
|
|
250
|
+
|
|
251
|
+
/** One choice of a non-streamed raw (text) completion response. */
export interface RawCompletionChoice {
  /** Generated text. */
  text: string;
  /** Position of this choice in the response's `choices` list. */
  index: number;
  /** Why generation ended; may be `null`. */
  finish_reason: 'stop' | 'length' | 'content_filter' | null;
  /** Log-probability data as parallel lists, or `null`. */
  logprobs: {
    tokens: string[];
    token_logprobs: number[];
    top_logprobs: Record<string, number>[];
    text_offset: number[];
  } | null;
}
|
|
262
|
+
|
|
263
|
+
/** Raw (text) completion response returned when `stream` is `false`. */
export interface RawCompletionResponse {
  id: string;
  /** Always `'text_completion'`. */
  object: 'text_completion';
  /** Creation time. NOTE(review): unit is not stated here. */
  created: number;
  model: string;
  choices: RawCompletionChoice[];
  usage: ChatCompletionUsage;
  system_fingerprint?: string;
  timings?: ResultTimings;
}
|
|
274
|
+
|
|
275
|
+
/** One chunk of a raw (text) completion when `stream` is `true`. */
export interface RawCompletionChunk {
  id: string;
  /** Always `'text_completion'`. */
  object: 'text_completion';
  /** Creation time. NOTE(review): unit is not stated here. */
  created: number;
  model: string;
  /** Streamed choices; `logprobs` is always `null` on this type. */
  choices: {
    text: string;
    index: number;
    finish_reason: 'stop' | 'length' | 'content_filter' | null;
    logprobs: null;
  }[];
  /** Optional; may be `null`. */
  usage?: ChatCompletionUsage | null;
  timings?: ResultTimings;
}
|
|
290
|
+
|
|
291
|
+
// Embeddings
|
|
292
|
+
|
|
293
|
+
/** Parameters of an embedding request. */
export interface EmbeddingCreateParams {
  /**
   * What to embed: a string, a list of strings, a list of numbers, or a list
   * of number lists.
   */
  input: string | string[] | number[] | number[][];
  model?: string;
  /** Requested output encoding. */
  encoding_format?: 'float' | 'base64';
  // dimensions?: number; // unsupported by llama.cpp
  // user?: string;
}
|
|
300
|
+
|
|
301
|
+
/** One embedding entry of an embedding response. */
export interface Embedding {
  /** Always `'embedding'`. */
  object: 'embedding';
  index: number;
  /** Float array or base64 string, depending on `encoding_format`. */
  embedding: number[] | string;
}
|
|
306
|
+
|
|
307
|
+
/** Token counters reported with an embedding response. */
export interface EmbeddingUsage {
  prompt_tokens: number;
  total_tokens: number;
}
|
|
311
|
+
|
|
312
|
+
/** Response of an embedding request: a list of embeddings plus usage. */
export interface CreateEmbeddingResponse {
  /** Always `'list'`. */
  object: 'list';
  data: Embedding[];
  model: string;
  usage: EmbeddingUsage;
}
|
|
318
|
+
|
|
319
|
+
// Reranking (NOT official OAI-compat, but is a commonly-used API schema)
|
|
320
|
+
|
|
321
|
+
/** Parameters of a rerank request. */
export interface RerankParams {
  /** The query text. */
  query: string;
  /** The documents to rerank. */
  documents: string[];
  /** NOTE(review): presumably limits how many results are returned; confirm. */
  top_n?: number;
}
|
|
326
|
+
|
|
327
|
+
/** One entry of a rerank response. */
export interface RerankResult {
  /** NOTE(review): presumably the position of the document in the request's `documents`; confirm. */
  index: number;
  /** Relevance score for that entry. */
  relevance_score: number;
}
|
|
331
|
+
|
|
332
|
+
/** Token counters reported with a rerank response. */
export interface RerankUsage {
  prompt_tokens: number;
  total_tokens: number;
}
|
|
336
|
+
|
|
337
|
+
/** Response of a rerank request. */
export interface RerankResponse {
  model: string;
  /** Always `'list'`. */
  object: 'list';
  usage: RerankUsage;
  results: RerankResult[];
}
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
// Note: snake_case is used to match llama.cpp's naming convention
|
|
2
|
+
/**
 * Options accepted when loading a model. Every field is optional.
 */
export interface LoadModelParams {
  log_level?: LogLevel;
  seed?: number;
  n_ctx?: number;
  n_batch?: number;
  /** By default, all layers are offloaded if WebGPU is available. */
  n_gpu_layers?: number;
  /**
   * By default, on multi-thread build, half of the available threads is used
   * (hardwareConcurrency / 2).
   */
  n_threads?: number;
  embeddings?: boolean;
  offload_kqv?: boolean;
  pooling_type?:
    // legacy values
    | 'LLAMA_POOLING_TYPE_UNSPECIFIED'
    | 'LLAMA_POOLING_TYPE_NONE'
    | 'LLAMA_POOLING_TYPE_MEAN'
    | 'LLAMA_POOLING_TYPE_CLS'
    // new values
    | 'unspecified'
    | 'none'
    | 'mean'
    | 'cls'
    | 'last'
    | 'rank';

  // context extending
  rope_scaling_type?:
    | 'LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED'
    | 'LLAMA_ROPE_SCALING_TYPE_NONE'
    | 'LLAMA_ROPE_SCALING_TYPE_LINEAR'
    | 'LLAMA_ROPE_SCALING_TYPE_YARN';
  rope_freq_base?: number;
  rope_freq_scale?: number;
  yarn_ext_factor?: number;
  yarn_attn_factor?: number;
  yarn_beta_fast?: number;
  yarn_beta_slow?: number;
  yarn_orig_ctx?: number;

  // optimizations
  cache_type_k?: 'f32' | 'f16' | 'q8_0' | 'q5_1' | 'q5_0' | 'q4_1' | 'q4_0';
  cache_type_v?: 'f32' | 'f16' | 'q8_0' | 'q5_1' | 'q5_0' | 'q4_1' | 'q4_0';
  /** `true` is auto, `false` is disabled. */
  flash_attn?: boolean;
  swa_full?: boolean;
  chat_template?: string;
  jinja?: boolean;
  reasoning?: boolean;
  image_min_tokens?: number;
  image_max_tokens?: number;
  warmup?: boolean;
  no_kv_offload?: boolean;
  mmproj_offload?: boolean;
  cont_batching?: boolean;
  n_keep?: number;
  ctx_shift?: boolean;
  cache_idle_slots?: boolean;
  n_cache_reuse?: number;
  /** Adapters given by path, each with an optional scale. */
  lora_adapters?: Array<{ path: string; scale?: number }>;
  lora_init_without_apply?: boolean;
  spec_draft_model?: string;
  spec_draft_ngl?: number;
  spec_draft_n_max?: number;
  spec_draft_n_min?: number;
  spec_draft_p_min?: number;
  spec_draft_threads?: number;
  spec_draft_threads_batch?: number;
  kv_overrides?: Record<string, string>;
  reasoning_budget_tokens?: number;
  reasoning_budget_message?: string;
  reasoning_format?: 'none' | 'deepseek-legacy' | 'deepseek';
  skip_chat_parsing?: boolean;
  prefill_assistant?: boolean;
  default_template_kwargs?: Record<string, any>;
}
|
|
73
|
+
|
|
74
|
+
// Note: snake_case is used to match llama.cpp's naming convention
|
|
75
|
+
/**
 * Information describing a loaded model/context: sizes, metadata, special
 * token ids and input capability flags.
 */
export interface LoadedContextInfo {
  // sizes
  n_vocab: number;
  n_ctx: number;
  n_batch: number;
  n_ubatch: number;
  n_ctx_train: number;
  n_embd: number;
  n_layer: number;

  /** String key/value metadata. */
  metadata: Record<string, string>;

  // special tokens
  token_bos: number;
  token_eos: number;
  token_eot: number;
  list_tokens_eog: number[];
  has_encoder: boolean;
  token_decoder_start: number;
  add_bos_token: boolean;
  add_eos_token: boolean;

  // input capabilities
  has_image_input: boolean;
  has_audio_input: boolean;
}
|
|
95
|
+
|
|
96
|
+
// Note: snake_case is used to match llama.cpp's naming convention
|
|
97
|
+
/**
 * Sampling options; every field is optional.
 * See sampling.h for more details.
 */
export interface SamplingParams {
  seed?: number;
  /** 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 */
  mirostat?: number | undefined;
  mirostat_eta?: number | undefined;
  mirostat_tau?: number | undefined;
  /** Unused for now. */
  samplers_sequence?: string[] | undefined;
  /** Temperature. */
  temp?: number | undefined;
  top_p?: number | undefined;
  top_k?: number | undefined;
  penalty_last_n?: number | undefined;
  penalty_repeat?: number | undefined;
  penalty_freq?: number | undefined;
  penalty_present?: number | undefined;
  dynatemp_range?: number | undefined;
  dynatemp_exponent?: number | undefined;
  grammar?: string;
  n_prev?: number | undefined;
  n_probs?: number | undefined;
  min_p?: number | undefined;
  typ_p?: number | undefined;
  typical_p?: number | undefined;
  /** List of per-token bias entries. */
  logit_bias?: Array<{ token: number; bias: number }> | undefined;
  ignore_eos?: boolean | undefined;
}
|
|
122
|
+
|
|
123
|
+
/**
 * Streaming options: `stream` is the literal `true` and `onData` is a
 * callback taking one item of type `T`.
 *
 * @typeParam T Type of the items passed to `onData`.
 */
export interface StreamParams<T> {
  stream: true;
  /** Callback taking one streamed item. */
  onData: (data: T) => void;
}
|
|
127
|
+
|
|
128
|
+
/**
 * Log levels as a numeric enum; the values run from `DEBUG` (1) to
 * `ERROR` (4).
 */
export enum LogLevel {
  /** Level 1. */
  DEBUG = 1,
  /** Level 2. */
  INFO = 2,
  /** Level 3. */
  WARN = 3,
  /** Level 4. */
  ERROR = 4,
}
|