localm-web 0.3.0 → 0.5.0

This diff compares the published contents of two package versions as they appear in a supported public registry. It is provided for informational purposes only.
package/dist/index.js CHANGED
@@ -1,10 +1,3 @@
- const DOWNLOAD_PATTERN = /\b(fetch|download|loading from cache|cache hit|param)/i;
- const COMPILE_PATTERN = /\b(compil|shader|kernel|tensor|init|allocat|warm)/i;
- function classifyLoadPhase(text) {
-   if (DOWNLOAD_PATTERN.test(text)) return "downloading";
-   if (COMPILE_PATTERN.test(text)) return "compiling";
-   return "loading";
- }
  class LocalmWebError extends Error {
    /**
     * @param message - Human-readable description of the error.
@@ -30,6 +23,301 @@ class QuotaExceededError extends LocalmWebError {
  }
  class BackendNotAvailableError extends LocalmWebError {
  }
+ class StructuredOutputError extends LocalmWebError {
+ }
+ const DOWNLOAD_PATTERN = /\b(fetch|download|loading from cache|cache hit|param)/i;
+ const COMPILE_PATTERN = /\b(compil|shader|kernel|tensor|init|allocat|warm)/i;
+ function classifyLoadPhase(text) {
+   if (DOWNLOAD_PATTERN.test(text)) return "downloading";
+   if (COMPILE_PATTERN.test(text)) return "compiling";
+   return "loading";
+ }
+ let transformersModulePromise$2 = null;
+ async function loadTransformers$2() {
+   if (!transformersModulePromise$2) {
+     transformersModulePromise$2 = import("@huggingface/transformers");
+   }
+   return transformersModulePromise$2;
+ }
+ function buildSamplingKwargs(options) {
+   const kwargs = {};
+   if (options.maxTokens !== void 0) kwargs.max_new_tokens = options.maxTokens;
+   if (options.temperature !== void 0) kwargs.temperature = options.temperature;
+   if (options.topP !== void 0) kwargs.top_p = options.topP;
+   if (options.topK !== void 0) kwargs.top_k = options.topK;
+   if (options.temperature !== void 0 && options.temperature > 0) {
+     kwargs.do_sample = true;
+   }
+   return kwargs;
+ }
+ function toChatMessages$1(messages) {
+   return messages.map((m) => ({ role: m.role, content: m.content }));
+ }
+ function lastAssistantContent(output, promptText) {
+   const item = Array.isArray(output) ? output[0] : output;
+   if (!item) return "";
+   const generated = item.generated_text;
+   if (typeof generated === "string") {
+     return generated.startsWith(promptText) ? generated.slice(promptText.length) : generated;
+   }
+   if (Array.isArray(generated)) {
+     for (let i = generated.length - 1; i >= 0; i -= 1) {
+       const turn = generated[i];
+       if (turn && turn.role === "assistant") return turn.content;
+     }
+   }
+   return "";
+ }
+ function createAsyncQueue() {
+   const buffer = [];
+   let waiters = [];
+   let finished = false;
+   let pendingError = null;
+   const drain = () => {
+     while (buffer.length > 0 && waiters.length > 0) {
+       const resolver = waiters.shift();
+       const value = buffer.shift();
+       resolver?.({ value, done: false });
+     }
+     if ((finished || pendingError) && waiters.length > 0) {
+       const all = waiters;
+       waiters = [];
+       for (const w of all) {
+         if (pendingError) {
+           w({ value: void 0, done: true });
+         } else {
+           w({ value: void 0, done: true });
+         }
+       }
+     }
+   };
+   return {
+     push(item) {
+       buffer.push(item);
+       drain();
+     },
+     end(error) {
+       finished = true;
+       if (error) pendingError = error;
+       drain();
+     },
+     iterator: {
+       [Symbol.asyncIterator]() {
+         return {
+           next() {
+             if (buffer.length > 0) {
+               return Promise.resolve({ value: buffer.shift(), done: false });
+             }
+             if (pendingError) {
+               const err = pendingError;
+               pendingError = null;
+               return Promise.reject(err);
+             }
+             if (finished) {
+               return Promise.resolve({ value: void 0, done: true });
+             }
+             return new Promise((resolve) => waiters.push(resolve));
+           }
+         };
+       }
+     }
+   };
+ }
+ class TransformersTextEngine {
+   generator = null;
+   currentAbortController = null;
+   isLoaded() {
+     return this.generator !== null;
+   }
+   async load(modelId, onProgress) {
+     const transformers = await loadTransformers$2();
+     try {
+       const generator = await transformers.pipeline("text-generation", modelId, {
+         progress_callback: (report) => {
+           const progress = typeof report.progress === "number" ? report.progress / 100 : 0;
+           const text = report.status ?? "loading";
+           onProgress?.({
+             progress,
+             text,
+             loaded: 0,
+             total: 0,
+             phase: classifyLoadPhase(text)
+           });
+         }
+       });
+       this.generator = generator;
+       onProgress?.({
+         progress: 1,
+         text: "Model ready.",
+         loaded: 0,
+         total: 0,
+         phase: "ready"
+       });
+     } catch (err) {
+       throw new ModelLoadError(`Failed to load transformers model "${modelId}".`, err);
+     }
+   }
+   async generate(messages, options = {}) {
+     const generator = this.requireGenerator();
+     if (options.signal?.aborted) {
+       throw new GenerationAbortedError("Generation aborted before start.");
+     }
+     const chat = toChatMessages$1(messages);
+     try {
+       const output = await generator(chat, buildSamplingKwargs(options));
+       return lastAssistantContent(output, "");
+     } catch (err) {
+       if (err instanceof GenerationAbortedError) throw err;
+       throw new ModelLoadError("Transformers generation failed.", err);
+     }
+   }
+   async *stream(messages, options = {}) {
+     const generator = this.requireGenerator();
+     if (options.signal?.aborted) {
+       throw new GenerationAbortedError("Generation aborted before start.");
+     }
+     const transformers = await loadTransformers$2();
+     const queue = createAsyncQueue();
+     let index = 0;
+     const tokenizer = generator.tokenizer;
+     const streamer = new transformers.TextStreamer(tokenizer, {
+       skip_prompt: true,
+       skip_special_tokens: true,
+       callback_function: (text) => {
+         if (text) {
+           queue.push({ text, index, done: false });
+           index += 1;
+         }
+       }
+     });
+     const abortPromise = new Promise((_, reject) => {
+       if (options.signal) {
+         const onAbort = () => {
+           reject(new GenerationAbortedError("Generation aborted by signal."));
+         };
+         options.signal.addEventListener("abort", onAbort, { once: true });
+       }
+     });
+     const chat = toChatMessages$1(messages);
+     const generation = generator(chat, { ...buildSamplingKwargs(options), streamer }).then(() => {
+       queue.push({ text: "", index, done: true });
+       queue.end();
+     }).catch((err) => {
+       queue.end(err instanceof Error ? err : new Error(String(err)));
+     });
+     void Promise.race([generation, abortPromise]).catch((err) => {
+       if (err instanceof GenerationAbortedError) queue.end(err);
+     });
+     for await (const chunk of queue.iterator) {
+       yield chunk;
+     }
+   }
+   async complete(prompt, options = {}) {
+     const generator = this.requireGenerator();
+     if (options.signal?.aborted) {
+       throw new GenerationAbortedError("Generation aborted before start.");
+     }
+     try {
+       const output = await generator(prompt, buildSamplingKwargs(options));
+       return lastAssistantContent(output, prompt);
+     } catch (err) {
+       if (err instanceof GenerationAbortedError) throw err;
+       throw new ModelLoadError("Transformers completion failed.", err);
+     }
+   }
+   async *streamCompletion(prompt, options = {}) {
+     const generator = this.requireGenerator();
+     if (options.signal?.aborted) {
+       throw new GenerationAbortedError("Generation aborted before start.");
+     }
+     const transformers = await loadTransformers$2();
+     const queue = createAsyncQueue();
+     let index = 0;
+     const tokenizer = generator.tokenizer;
+     const streamer = new transformers.TextStreamer(tokenizer, {
+       skip_prompt: true,
+       skip_special_tokens: true,
+       callback_function: (text) => {
+         if (text) {
+           queue.push({ text, index, done: false });
+           index += 1;
+         }
+       }
+     });
+     generator(prompt, { ...buildSamplingKwargs(options), streamer }).then(() => {
+       queue.push({ text: "", index, done: true });
+       queue.end();
+     }).catch((err) => {
+       queue.end(err instanceof Error ? err : new Error(String(err)));
+     });
+     if (options.signal) {
+       options.signal.addEventListener(
+         "abort",
+         () => {
+           queue.end(new GenerationAbortedError("Generation aborted by signal."));
+         },
+         { once: true }
+       );
+     }
+     for await (const chunk of queue.iterator) {
+       yield chunk;
+     }
+   }
+   async unload() {
+     if (this.generator) {
+       const disposable = this.generator;
+       if (typeof disposable.dispose === "function") {
+         await disposable.dispose();
+       }
+       this.generator = null;
+     }
+     this.currentAbortController?.abort();
+     this.currentAbortController = null;
+   }
+   requireGenerator() {
+     if (!this.generator) {
+       throw new ModelNotLoadedError(
+         "TransformersTextEngine not loaded. Call load() before generation."
+       );
+     }
+     return this.generator;
+   }
+ }
+ function assertJsonSchema(schema) {
+   if (schema === null || typeof schema !== "object" || Array.isArray(schema)) {
+     throw new StructuredOutputError("jsonSchema must be a plain object describing a JSON Schema.");
+   }
+   const keys = Object.keys(schema);
+   const recognized = [
+     "type",
+     "$ref",
+     "oneOf",
+     "anyOf",
+     "allOf",
+     "enum",
+     "const",
+     "properties"
+   ];
+   if (!keys.some((key) => recognized.includes(key))) {
+     throw new StructuredOutputError(
+       "jsonSchema does not look like a JSON Schema (missing type/$ref/oneOf/anyOf/allOf/enum/const/properties)."
+     );
+   }
+ }
+ function serializeJsonSchema(schema) {
+   assertJsonSchema(schema);
+   return JSON.stringify(schema);
+ }
+ function parseStructuredOutput(text) {
+   try {
+     return JSON.parse(text);
+   } catch (err) {
+     throw new StructuredOutputError(
+       "Engine output is not valid JSON. The model may have ignored the constrained decoding directive.",
+       err
+     );
+   }
+ }
  let webllmModulePromise = null;
  async function loadWebLLM() {
    if (!webllmModulePromise) {
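
The schema helpers added above (assertJsonSchema, serializeJsonSchema, parseStructuredOutput) are all exported in this release, so their contract can be sketched directly. A minimal sketch; the "localm-web" import specifier is assumed from the package name:

  import { assertJsonSchema, serializeJsonSchema, parseStructuredOutput } from "localm-web";

  // Accepted: the object carries a recognized top-level JSON Schema key ("type").
  assertJsonSchema({ type: "object", properties: { ok: { type: "boolean" } } });

  // Rejected with StructuredOutputError: no type/$ref/oneOf/anyOf/allOf/enum/const/properties key.
  // assertJsonSchema({ foo: "bar" });

  // serializeJsonSchema validates first, then stringifies (used for WebLLM's response_format below).
  const wire = serializeJsonSchema({ type: "object", properties: { ok: { type: "boolean" } } });

  // parseStructuredOutput is JSON.parse that rethrows failures as StructuredOutputError.
  const value = parseStructuredOutput('{"ok": true}');
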
@@ -47,6 +335,15 @@ function buildSamplingParams(options) {
    if (options.topP !== void 0) params.top_p = options.topP;
    return params;
  }
+ function buildResponseFormat(options) {
+   if (options.jsonSchema !== void 0) {
+     return { type: "json_object", schema: serializeJsonSchema(options.jsonSchema) };
+   }
+   if (options.json) {
+     return { type: "json_object" };
+   }
+   return void 0;
+ }
  function toChatMessages(messages) {
    return messages.map((m) => {
      switch (m.role) {
@@ -101,10 +398,12 @@ class WebLLMEngine {
      if (options.signal?.aborted) {
        throw new GenerationAbortedError("Generation aborted before start.");
      }
+     const responseFormat = buildResponseFormat(options);
      const completion = await engine.chat.completions.create({
        ...buildSamplingParams(options),
        messages: toChatMessages(messages),
-       stream: false
+       stream: false,
+       ...responseFormat ? { response_format: responseFormat } : {}
      });
      return completion.choices[0]?.message?.content ?? "";
    }
@@ -113,10 +412,12 @@ class WebLLMEngine {
      if (options.signal?.aborted) {
        throw new GenerationAbortedError("Generation aborted before start.");
      }
+     const responseFormat = buildResponseFormat(options);
      const completion = await engine.chat.completions.create({
        ...buildSamplingParams(options),
        messages: toChatMessages(messages),
-       stream: true
+       stream: true,
+       ...responseFormat ? { response_format: responseFormat } : {}
      });
      let index = 0;
      let finished = false;
@@ -150,10 +451,12 @@ class WebLLMEngine {
      if (options.signal?.aborted) {
        throw new GenerationAbortedError("Generation aborted before start.");
      }
+     const responseFormat = buildResponseFormat(options);
      const completion = await engine.completions.create({
        ...buildSamplingParams(options),
        prompt,
-       stream: false
+       stream: false,
+       ...responseFormat ? { response_format: responseFormat } : {}
      });
      return completion.choices[0]?.text ?? "";
    }
@@ -162,10 +465,12 @@ class WebLLMEngine {
      if (options.signal?.aborted) {
        throw new GenerationAbortedError("Generation aborted before start.");
      }
+     const responseFormat = buildResponseFormat(options);
      const completion = await engine.completions.create({
        ...buildSamplingParams(options),
        prompt,
-       stream: true
+       stream: true,
+       ...responseFormat ? { response_format: responseFormat } : {}
      });
      let index = 0;
      let finished = false;
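
With the four hunks above, every generation path (chat and completion, streaming and not) threads the same response_format object through to WebLLM. The mapping done by buildResponseFormat is small enough to restate; this sketch only traces the logic shown above (the function itself is internal, not exported):

  // jsonSchema takes precedence over json; the schema is validated and stringified.
  buildResponseFormat({ jsonSchema: { type: "object" } });
  // => { type: "json_object", schema: '{"type":"object"}' }

  buildResponseFormat({ json: true });
  // => { type: "json_object" }

  buildResponseFormat({});
  // => undefined, so the spread `...responseFormat ? { response_format: responseFormat } : {}` sends nothing
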
@@ -491,6 +796,7 @@ const MODEL_PRESETS = Object.freeze({
      parameters: "3.8B",
      quantization: "q4f16_1",
      webllmId: "Phi-3.5-mini-instruct-q4f16_1-MLC",
+     transformersId: "onnx-community/Phi-3.5-mini-instruct-onnx-web",
      contextWindow: 4096,
      description: "Microsoft Phi-3.5 mini, INT4 quantized for browser inference."
    },
@@ -500,6 +806,7 @@ const MODEL_PRESETS = Object.freeze({
      parameters: "1B",
      quantization: "q4f16_1",
      webllmId: "Llama-3.2-1B-Instruct-q4f16_1-MLC",
+     transformersId: "onnx-community/Llama-3.2-1B-Instruct",
      contextWindow: 4096,
      description: "Meta Llama 3.2 1B Instruct, INT4 quantized."
    },
@@ -509,8 +816,19 @@ const MODEL_PRESETS = Object.freeze({
      parameters: "1.5B",
      quantization: "q4f16_1",
      webllmId: "Qwen2.5-1.5B-Instruct-q4f16_1-MLC",
+     transformersId: "onnx-community/Qwen2.5-1.5B-Instruct",
      contextWindow: 4096,
      description: "Alibaba Qwen 2.5 1.5B Instruct, INT4 quantized."
+   },
+   "smollm2-360m-int8": {
+     id: "smollm2-360m-int8",
+     family: "SmolLM2",
+     parameters: "360M",
+     quantization: "q8",
+     webllmId: "SmolLM2-360M-Instruct-q4f16_1-MLC",
+     transformersId: "HuggingFaceTB/SmolLM2-360M-Instruct",
+     contextWindow: 2048,
+     description: "HuggingFace SmolLM2 360M Instruct — smallest viable chat model, ideal for the fallback path on low-end devices."
    }
  });
  function resolveModelPreset(modelId) {
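
The new preset resolves by id like the existing ones; resolveModelPreset is already exported. For example:

  import { resolveModelPreset } from "localm-web";

  const preset = resolveModelPreset("smollm2-360m-int8");
  preset.webllmId;       // "SmolLM2-360M-Instruct-q4f16_1-MLC"
  preset.transformersId; // "HuggingFaceTB/SmolLM2-360M-Instruct"
  preset.contextWindow;  // 2048

Note that the preset's quantization field ("q8") does not match the q4f16_1 WebLLM artifact it points at; the two backends evidently serve differently quantized builds under the same preset id.
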
@@ -583,12 +901,33 @@ function listSupportedRerankerModels() {
  function createInferenceWorker() {
    return new Worker(new URL(
      /* @vite-ignore */
-     "/assets/inference.worker-CwvQtobb.js",
+     "/assets/inference.worker-DZbXKJZY.js",
      import.meta.url
    ), {
      type: "module"
    });
  }
+ function defaultWebGPUDetector() {
+   return typeof navigator !== "undefined" && "gpu" in navigator;
+ }
+ function resolveBackend(choice, preset, webGPUAvailable) {
+   if (choice === "webllm") return "webllm";
+   if (choice === "transformers") {
+     if (!preset.transformersId) {
+       throw new BackendNotAvailableError(
+         `Model "${preset.id}" has no transformersId — cannot run on the transformers.js backend.`
+       );
+     }
+     return "transformers";
+   }
+   if (webGPUAvailable) return "webllm";
+   if (!preset.transformersId) {
+     throw new BackendNotAvailableError(
+       `WebGPU is unavailable and model "${preset.id}" has no transformersId for the fallback path.`
+     );
+   }
+   return "transformers";
+ }
  class LMTask {
    constructor(engine, preset) {
      this.engine = engine;
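
resolveBackend is part of the new public exports, so the fallback decision is testable in isolation. A sketch of the interesting cases; the preset literals are illustrative stand-ins:

  import { resolveBackend } from "localm-web";

  const both = { id: "demo", webllmId: "Demo-MLC", transformersId: "org/demo" };
  const webllmOnly = { id: "demo", webllmId: "Demo-MLC" };

  resolveBackend("webllm", both, false);       // "webllm" (explicit choice skips the WebGPU check)
  resolveBackend("transformers", both, true);  // "transformers"
  resolveBackend("auto", both, true);          // "webllm" (WebGPU available)
  resolveBackend("auto", both, false);         // "transformers" (fallback path)
  resolveBackend("auto", webllmOnly, false);   // throws BackendNotAvailableError
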
@@ -604,13 +943,29 @@ class LMTask {
     */
    static async createEngine(modelId, options = {}) {
      const preset = resolveModelPreset(modelId);
-     const engine = options.engine ?? LMTask.defaultEngine(options);
+     if (options.engine) {
+       if (!options.engine.isLoaded()) {
+         await options.engine.load(preset.webllmId, options.onProgress);
+       }
+       return { engine: options.engine, preset };
+     }
+     const choice = options.backend ?? "auto";
+     const resolved = resolveBackend(
+       choice,
+       preset,
+       defaultWebGPUDetector()
+     );
+     const engine = LMTask.instantiateEngine(resolved, options);
+     const loadId = resolved === "transformers" ? preset.transformersId ?? "" : preset.webllmId;
      if (!engine.isLoaded()) {
-       await engine.load(preset.webllmId, options.onProgress);
+       await engine.load(loadId, options.onProgress);
      }
      return { engine, preset };
    }
-   static defaultEngine(options) {
+   static instantiateEngine(resolved, options) {
+     if (resolved === "transformers") {
+       return new TransformersTextEngine();
+     }
      const useWorker = options.inWorker ?? true;
      if (useWorker) {
        return new WorkerEngine(createInferenceWorker());
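
createEngine now honours options.backend ("auto" when omitted) and picks the matching model id for the load call. A sketch of the intended call, going through the exported Chat subclass since statics are inherited; the option values are illustrative:

  import { Chat } from "localm-web";

  // Force the transformers.js path; throws BackendNotAvailableError
  // if the preset has no transformersId.
  const { engine, preset } = await Chat.createEngine("smollm2-360m-int8", {
    backend: "transformers",
    onProgress: (p) => console.log(p.phase, p.progress)
  });
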
@@ -633,6 +988,20 @@ class ChatReply {
      this.tokensGenerated = tokensGenerated;
      this.finishReason = finishReason;
    }
+   /**
+    * Parse {@link ChatReply.text} as JSON.
+    *
+    * Intended for replies generated with `json: true` or `jsonSchema`.
+    * The result is cast to `T` without runtime validation; pair with Zod /
+    * Ajv on the call site if you need to verify the schema.
+    *
+    * @typeParam T - Expected parsed shape.
+    * @returns The parsed JSON value.
+    * @throws StructuredOutputError if the text is not valid JSON.
+    */
+   json() {
+     return parseStructuredOutput(this.text);
+   }
  }
  class CompletionResult {
    constructor(text, prompt, tokensGenerated, finishReason) {
@@ -641,6 +1010,19 @@ class CompletionResult {
      this.tokensGenerated = tokensGenerated;
      this.finishReason = finishReason;
    }
+   /**
+    * Parse {@link CompletionResult.text} as JSON.
+    *
+    * Intended for completions generated with `json: true` or `jsonSchema`.
+    * The result is cast to `T` without runtime validation.
+    *
+    * @typeParam T - Expected parsed shape.
+    * @returns The parsed JSON value.
+    * @throws StructuredOutputError if the text is not valid JSON.
+    */
+   json() {
+     return parseStructuredOutput(this.text);
+   }
  }
  class Chat extends LMTask {
    history = [];
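
Both json() methods are thin wrappers over parseStructuredOutput, so their behaviour can be shown with the constructor alone. A sketch, assuming CompletionResult is exported like the other result types; the argument values are illustrative:

  import { CompletionResult, StructuredOutputError } from "localm-web";

  const ok = new CompletionResult('{"score": 0.92}', "Rate this:", 7, "stop");
  ok.json(); // => { score: 0.92 }

  const truncated = new CompletionResult('{"score":', "Rate this:", 3, "length");
  try {
    truncated.json();
  } catch (err) {
    err instanceof StructuredOutputError; // true, e.g. when generation hit the token limit mid-object
  }
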
@@ -1104,7 +1486,7 @@ async function* tap(stream, onChunk) {
      yield chunk;
    }
  }
- const VERSION = "0.3.0";
+ const VERSION = "0.5.0";
  export {
    BackendNotAvailableError,
    Chat,
@@ -1123,18 +1505,25 @@ export {
    QuotaExceededError,
    RERANKER_PRESETS,
    Reranker,
+   StructuredOutputError,
+   TransformersTextEngine,
    UnknownModelError,
    VERSION,
    WebGPUUnavailableError,
+   WebLLMEngine,
    WorkerEngine,
+   assertJsonSchema,
    collectStream,
    createInferenceWorker,
    listSupportedEmbeddingModels,
    listSupportedModels,
    listSupportedRerankerModels,
+   parseStructuredOutput,
+   resolveBackend,
    resolveEmbeddingPreset,
    resolveModelPreset,
    resolveRerankerPreset,
+   serializeJsonSchema,
    tap
  };
  //# sourceMappingURL=index.js.map