node-llama-cpp 3.3.2 → 3.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199) hide show
  1. package/README.md +3 -2
  2. package/dist/bindings/AddonTypes.d.ts +12 -4
  3. package/dist/bindings/Llama.d.ts +9 -0
  4. package/dist/bindings/Llama.js +52 -28
  5. package/dist/bindings/Llama.js.map +1 -1
  6. package/dist/bindings/getLlama.d.ts +2 -1
  7. package/dist/bindings/getLlama.js +19 -9
  8. package/dist/bindings/getLlama.js.map +1 -1
  9. package/dist/bindings/utils/asyncSome.js +2 -0
  10. package/dist/bindings/utils/asyncSome.js.map +1 -1
  11. package/dist/bindings/utils/compileLLamaCpp.d.ts +1 -1
  12. package/dist/bindings/utils/compileLLamaCpp.js +115 -34
  13. package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
  14. package/dist/bindings/utils/detectAvailableComputeLayers.d.ts +1 -0
  15. package/dist/bindings/utils/detectAvailableComputeLayers.js +4 -4
  16. package/dist/bindings/utils/detectAvailableComputeLayers.js.map +1 -1
  17. package/dist/bindings/utils/detectBuildTools.d.ts +14 -0
  18. package/dist/bindings/utils/detectBuildTools.js +149 -0
  19. package/dist/bindings/utils/detectBuildTools.js.map +1 -0
  20. package/dist/bindings/utils/resolveActualBindingBinaryPath.d.ts +1 -0
  21. package/dist/bindings/utils/resolveActualBindingBinaryPath.js +18 -0
  22. package/dist/bindings/utils/resolveActualBindingBinaryPath.js.map +1 -0
  23. package/dist/bindings/utils/testBindingBinary.d.ts +1 -1
  24. package/dist/bindings/utils/testBindingBinary.js +58 -5
  25. package/dist/bindings/utils/testBindingBinary.js.map +1 -1
  26. package/dist/chatWrappers/AlpacaChatWrapper.d.ts +4 -0
  27. package/dist/chatWrappers/AlpacaChatWrapper.js +4 -0
  28. package/dist/chatWrappers/AlpacaChatWrapper.js.map +1 -1
  29. package/dist/chatWrappers/FalconChatWrapper.d.ts +4 -0
  30. package/dist/chatWrappers/FalconChatWrapper.js +4 -0
  31. package/dist/chatWrappers/FalconChatWrapper.js.map +1 -1
  32. package/dist/chatWrappers/GeneralChatWrapper.d.ts +4 -0
  33. package/dist/chatWrappers/GeneralChatWrapper.js +4 -0
  34. package/dist/chatWrappers/GeneralChatWrapper.js.map +1 -1
  35. package/dist/chatWrappers/utils/resolveChatWrapper.d.ts +2 -0
  36. package/dist/chatWrappers/utils/resolveChatWrapper.js +8 -27
  37. package/dist/chatWrappers/utils/resolveChatWrapper.js.map +1 -1
  38. package/dist/cli/commands/ChatCommand.d.ts +4 -0
  39. package/dist/cli/commands/ChatCommand.js +158 -13
  40. package/dist/cli/commands/ChatCommand.js.map +1 -1
  41. package/dist/cli/commands/CompleteCommand.d.ts +4 -0
  42. package/dist/cli/commands/CompleteCommand.js +143 -10
  43. package/dist/cli/commands/CompleteCommand.js.map +1 -1
  44. package/dist/cli/commands/DebugCommand.js +5 -5
  45. package/dist/cli/commands/DebugCommand.js.map +1 -1
  46. package/dist/cli/commands/InfillCommand.d.ts +4 -0
  47. package/dist/cli/commands/InfillCommand.js +142 -10
  48. package/dist/cli/commands/InfillCommand.js.map +1 -1
  49. package/dist/cli/commands/OnPostInstallCommand.js +12 -2
  50. package/dist/cli/commands/OnPostInstallCommand.js.map +1 -1
  51. package/dist/cli/commands/inspect/commands/InspectEstimateCommand.d.ts +1 -0
  52. package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js +14 -7
  53. package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js.map +1 -1
  54. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js +13 -3
  55. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map +1 -1
  56. package/dist/cli/commands/inspect/commands/InspectGpuCommand.js +20 -10
  57. package/dist/cli/commands/inspect/commands/InspectGpuCommand.js.map +1 -1
  58. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.d.ts +2 -0
  59. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js +234 -77
  60. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map +1 -1
  61. package/dist/cli/recommendedModels.js +11 -1
  62. package/dist/cli/recommendedModels.js.map +1 -1
  63. package/dist/cli/utils/ConsoleTable.d.ts +1 -0
  64. package/dist/cli/utils/ConsoleTable.js +5 -1
  65. package/dist/cli/utils/ConsoleTable.js.map +1 -1
  66. package/dist/cli/utils/interactivelyAskForModel.d.ts +2 -1
  67. package/dist/cli/utils/interactivelyAskForModel.js +16 -13
  68. package/dist/cli/utils/interactivelyAskForModel.js.map +1 -1
  69. package/dist/cli/utils/isRunningUnderRosetta.d.ts +1 -0
  70. package/dist/cli/utils/isRunningUnderRosetta.js +20 -0
  71. package/dist/cli/utils/isRunningUnderRosetta.js.map +1 -0
  72. package/dist/cli/utils/printCommonInfoLines.d.ts +4 -2
  73. package/dist/cli/utils/printCommonInfoLines.js +67 -5
  74. package/dist/cli/utils/printCommonInfoLines.js.map +1 -1
  75. package/dist/cli/utils/resolveCommandGgufPath.d.ts +3 -1
  76. package/dist/cli/utils/resolveCommandGgufPath.js +6 -5
  77. package/dist/cli/utils/resolveCommandGgufPath.js.map +1 -1
  78. package/dist/cli/utils/toBytes.d.ts +1 -0
  79. package/dist/cli/utils/toBytes.js +5 -0
  80. package/dist/cli/utils/toBytes.js.map +1 -0
  81. package/dist/config.d.ts +3 -0
  82. package/dist/config.js +3 -0
  83. package/dist/config.js.map +1 -1
  84. package/dist/evaluator/LlamaChat/LlamaChat.d.ts +12 -3
  85. package/dist/evaluator/LlamaChat/LlamaChat.js +21 -7
  86. package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
  87. package/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts +6 -2
  88. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js +3 -0
  89. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -1
  90. package/dist/evaluator/LlamaCompletion.d.ts +3 -0
  91. package/dist/evaluator/LlamaCompletion.js +5 -0
  92. package/dist/evaluator/LlamaCompletion.js.map +1 -1
  93. package/dist/evaluator/LlamaContext/LlamaContext.d.ts +81 -38
  94. package/dist/evaluator/LlamaContext/LlamaContext.js +678 -132
  95. package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
  96. package/dist/evaluator/LlamaContext/TokenPredictor.d.ts +55 -0
  97. package/dist/evaluator/LlamaContext/TokenPredictor.js +20 -0
  98. package/dist/evaluator/LlamaContext/TokenPredictor.js.map +1 -0
  99. package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.d.ts +56 -0
  100. package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js +266 -0
  101. package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js.map +1 -0
  102. package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.d.ts +58 -0
  103. package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js +138 -0
  104. package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js.map +1 -0
  105. package/dist/evaluator/LlamaContext/types.d.ts +198 -5
  106. package/dist/evaluator/LlamaEmbeddingContext.d.ts +3 -0
  107. package/dist/evaluator/LlamaEmbeddingContext.js +3 -0
  108. package/dist/evaluator/LlamaEmbeddingContext.js.map +1 -1
  109. package/dist/evaluator/LlamaGrammar.d.ts +7 -1
  110. package/dist/evaluator/LlamaGrammar.js +6 -0
  111. package/dist/evaluator/LlamaGrammar.js.map +1 -1
  112. package/dist/evaluator/LlamaGrammarEvaluationState.d.ts +4 -4
  113. package/dist/evaluator/LlamaGrammarEvaluationState.js +16 -8
  114. package/dist/evaluator/LlamaGrammarEvaluationState.js.map +1 -1
  115. package/dist/evaluator/LlamaJsonSchemaGrammar.d.ts +5 -0
  116. package/dist/evaluator/LlamaJsonSchemaGrammar.js +7 -0
  117. package/dist/evaluator/LlamaJsonSchemaGrammar.js.map +1 -1
  118. package/dist/evaluator/LlamaModel/LlamaModel.d.ts +19 -11
  119. package/dist/evaluator/LlamaModel/LlamaModel.js +23 -29
  120. package/dist/evaluator/LlamaModel/LlamaModel.js.map +1 -1
  121. package/dist/evaluator/LlamaRankingContext.d.ts +76 -0
  122. package/dist/evaluator/LlamaRankingContext.js +158 -0
  123. package/dist/evaluator/LlamaRankingContext.js.map +1 -0
  124. package/dist/evaluator/TokenBias.d.ts +3 -0
  125. package/dist/evaluator/TokenBias.js +3 -0
  126. package/dist/evaluator/TokenBias.js.map +1 -1
  127. package/dist/evaluator/utils/chunkDocument.d.ts +86 -0
  128. package/dist/evaluator/utils/chunkDocument.js +212 -0
  129. package/dist/evaluator/utils/chunkDocument.js.map +1 -0
  130. package/dist/gguf/insights/GgufInsights.d.ts +3 -1
  131. package/dist/gguf/insights/GgufInsights.js +114 -8
  132. package/dist/gguf/insights/GgufInsights.js.map +1 -1
  133. package/dist/gguf/insights/GgufInsightsConfigurationResolver.d.ts +6 -3
  134. package/dist/gguf/insights/GgufInsightsConfigurationResolver.js +11 -7
  135. package/dist/gguf/insights/GgufInsightsConfigurationResolver.js.map +1 -1
  136. package/dist/gguf/insights/utils/resolveModelGpuLayersOption.d.ts +2 -1
  137. package/dist/gguf/insights/utils/resolveModelGpuLayersOption.js +13 -7
  138. package/dist/gguf/insights/utils/resolveModelGpuLayersOption.js.map +1 -1
  139. package/dist/gguf/parser/GgufV2Parser.js +29 -8
  140. package/dist/gguf/parser/GgufV2Parser.js.map +1 -1
  141. package/dist/gguf/parser/parseGguf.js +11 -11
  142. package/dist/gguf/parser/parseGguf.js.map +1 -1
  143. package/dist/gguf/readGgufFileInfo.js +8 -3
  144. package/dist/gguf/readGgufFileInfo.js.map +1 -1
  145. package/dist/gguf/types/GgufFileInfoTypes.d.ts +1 -0
  146. package/dist/gguf/types/GgufMetadataTypes.d.ts +9 -9
  147. package/dist/gguf/types/GgufMetadataTypes.js +1 -1
  148. package/dist/gguf/types/GgufMetadataTypes.js.map +1 -1
  149. package/dist/gguf/types/GgufTensorInfoTypes.d.ts +13 -0
  150. package/dist/gguf/types/GgufTensorInfoTypes.js.map +1 -1
  151. package/dist/index.d.ts +7 -2
  152. package/dist/index.js +6 -1
  153. package/dist/index.js.map +1 -1
  154. package/dist/tsconfig.tsbuildinfo +1 -1
  155. package/dist/utils/LlamaText.d.ts +4 -1
  156. package/dist/utils/LlamaText.js +4 -1
  157. package/dist/utils/LlamaText.js.map +1 -1
  158. package/dist/utils/cmake.js +23 -0
  159. package/dist/utils/cmake.js.map +1 -1
  160. package/dist/utils/pushAll.d.ts +1 -1
  161. package/dist/utils/pushAll.js.map +1 -1
  162. package/dist/utils/tokenizerUtils.js +1 -1
  163. package/dist/utils/utilTypes.d.ts +5 -0
  164. package/llama/CMakeLists.txt +25 -8
  165. package/llama/addon/AddonContext.cpp +196 -22
  166. package/llama/addon/AddonContext.h +1 -0
  167. package/llama/addon/AddonGrammar.cpp +1 -4
  168. package/llama/addon/AddonGrammarEvaluationState.cpp +16 -5
  169. package/llama/addon/AddonModel.cpp +31 -39
  170. package/llama/addon/AddonModel.h +1 -1
  171. package/llama/addon/AddonModelLora.cpp +2 -2
  172. package/llama/addon/AddonModelLora.h +1 -1
  173. package/llama/addon/AddonSampler.cpp +7 -12
  174. package/llama/addon/addon.cpp +26 -7
  175. package/llama/addon/globals/getGpuInfo.cpp +30 -5
  176. package/llama/addon/globals/getGpuInfo.h +6 -1
  177. package/llama/addon/globals/getMemoryInfo.cpp +63 -0
  178. package/llama/addon/globals/getMemoryInfo.h +4 -0
  179. package/llama/binariesGithubRelease.json +1 -1
  180. package/llama/cmake/win32.ensureNinjaPath.cmake +68 -0
  181. package/llama/cmake/win32.ensureNodeLib.cmake +34 -0
  182. package/llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake +12 -0
  183. package/llama/cmake/win32.llvmEnsureCmakeAr.cmake +37 -0
  184. package/llama/cmake/win32.llvmUseGnuModeCompilers.cmake +87 -0
  185. package/llama/cmake/win32.programFilesPaths.cmake +35 -0
  186. package/llama/gitRelease.bundle +0 -0
  187. package/llama/gpuInfo/vulkan-gpu-info.cpp +29 -2
  188. package/llama/gpuInfo/vulkan-gpu-info.h +1 -0
  189. package/llama/llama.cpp.info.json +1 -1
  190. package/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake +14 -0
  191. package/llama/profiles/llvm.win32.host-x64.target-arm64.cmake +14 -0
  192. package/llama/profiles/llvm.win32.host-x64.target-x64.cmake +14 -0
  193. package/llama/toolchains/llvm.win32.host-x64.target-x64.cmake +20 -0
  194. package/llama/toolchains/win32.host-arm64.target-arm64.cmake +21 -0
  195. package/llama/toolchains/win32.host-x64.target-arm64.cmake +14 -34
  196. package/package.json +47 -44
  197. package/templates/README.md +1 -1
  198. package/templates/packed/electron-typescript-react.json +1 -1
  199. package/templates/packed/node-typescript.json +1 -1
@@ -1,9 +1,12 @@
1
- import { AsyncDisposeAggregator, DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
1
+ import { acquireLock, AsyncDisposeAggregator, DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
2
2
  import { removeNullFields } from "../../utils/removeNullFields.js";
3
3
  import { compareTokens } from "../../utils/compareTokens.js";
4
4
  import { DisposeGuard } from "../../utils/DisposeGuard.js";
5
5
  import { TokenMeter } from "../TokenMeter.js";
6
6
  import { UnsupportedError } from "../../utils/UnsupportedError.js";
7
+ import { pushAll } from "../../utils/pushAll.js";
8
+ import { safeEventCallback } from "../../utils/safeEventCallback.js";
9
+ import { GgufArchitectureType } from "../../gguf/types/GgufMetadataTypes.js";
7
10
  import { resolveBatchItemsPrioritizationStrategy } from "./utils/resolveBatchItemsPrioritizationStrategy.js";
8
11
  import { LlamaSampler } from "./LlamaSampler.js";
9
12
  const defaultLoraScale = 1;
@@ -13,6 +16,7 @@ const defaultFailedCreationRemedy = {
13
16
  retries: 6,
14
17
  autoContextSizeShrink: 0.16
15
18
  };
19
+ const defaultEvaluationPriority = 5;
16
20
  export class LlamaContext {
17
21
  /** @internal */ _llama;
18
22
  /** @internal */ _ctx;
@@ -43,7 +47,7 @@ export class LlamaContext {
43
47
  /** @internal */ _allocatedContextSize;
44
48
  /** @internal */ _disposed = false;
45
49
  onDispose = new EventRelay();
46
- constructor({ _model }, { sequences, contextSize, batchSize, flashAttention = _model.defaultContextFlashAttention, threads, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, performanceTracking = false, _embeddings }) {
50
+ constructor({ _model }, { sequences, contextSize, batchSize, flashAttention = _model.defaultContextFlashAttention, threads, batching: { dispatchSchedule: batchingDispatchSchedule = "nextCycle", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, performanceTracking = false, _embeddings, _ranking }) {
47
51
  if (_model.disposed)
48
52
  throw new DisposedError();
49
53
  this._llama = _model._llama;
@@ -70,6 +74,7 @@ export class LlamaContext {
70
74
  flashAttention: this._flashAttention,
71
75
  threads: this._idealThreads,
72
76
  embeddings: _embeddings,
77
+ ranking: _ranking,
73
78
  performanceTracking: this._performanceTracking
74
79
  }));
75
80
  this._batchingOptions = {
@@ -163,7 +168,7 @@ export class LlamaContext {
163
168
  * When there are no sequences left, this method will throw an error.
164
169
  */
165
170
  getSequence(options = {}) {
166
- const { contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, _tokenMeter } = options;
171
+ const { contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, tokenPredictor, _tokenMeter } = options;
167
172
  this._ensureNotDisposed();
168
173
  const nextSequenceId = this._popSequenceId();
169
174
  if (nextSequenceId == null)
@@ -175,7 +180,8 @@ export class LlamaContext {
175
180
  contextShift: {
176
181
  size: contextShiftSize,
177
182
  strategy: contextShiftStrategy
178
- }
183
+ },
184
+ tokenPredictor
179
185
  });
180
186
  }
181
187
  dispatchPendingBatch() {
@@ -189,6 +195,7 @@ export class LlamaContext {
189
195
  this._dispatchDecodeScheduled = false;
190
196
  this._batchDispatchPending = false;
191
197
  let shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
198
+ const queuedDecodeToMappedLogits = new Map();
192
199
  const resolvePrioritizationStrategy = () => {
193
200
  try {
194
201
  this._ensureNotDisposed();
@@ -205,6 +212,7 @@ export class LlamaContext {
205
212
  for (const queuedDecode of this._queuedDecodes) {
206
213
  const batchItem = {
207
214
  tokens: queuedDecode.tokens,
215
+ logits: queuedDecode.logits,
208
216
  evaluationPriority: queuedDecode.evaluationPriority
209
217
  };
210
218
  batchItemToQueuedDecodeMap.set(batchItem, queuedDecode);
@@ -262,15 +270,16 @@ export class LlamaContext {
262
270
  if (currentBatchSize !== 0)
263
271
  this._ctx.initBatch(currentBatchSize);
264
272
  for (const { queuedDecode, processAmount } of batchItems) {
265
- let batchLogitIndex;
273
+ let batchLogitIndexes;
274
+ const tokensToProcess = queuedDecode.tokens.slice(0, processAmount);
275
+ const tokenIndexesWithLogitsToProcess = queuedDecode.logits.slice(0, processAmount)
276
+ .map((logit, index) => (logit ? index : undefined))
277
+ .filter((index) => index != undefined);
278
+ const numberOfOutputTokens = tokenIndexesWithLogitsToProcess.length;
279
+ TokenMeter.useTokens(queuedDecode.tokenMeter, Math.max(0, tokensToProcess.length - numberOfOutputTokens), "input");
280
+ TokenMeter.useTokens(queuedDecode.tokenMeter, numberOfOutputTokens, "output");
266
281
  try {
267
- const shouldGenerateLogitAtTheEnd = queuedDecode.generateLogitAtTheEnd &&
268
- processAmount === queuedDecode.tokens.length;
269
- const tokensToProcess = queuedDecode.tokens.slice(0, processAmount);
270
- const numberOfOutputTokens = shouldGenerateLogitAtTheEnd ? 1 : 0;
271
- TokenMeter.useTokens(queuedDecode.tokenMeter, Math.max(0, tokensToProcess.length - numberOfOutputTokens), "input");
272
- TokenMeter.useTokens(queuedDecode.tokenMeter, numberOfOutputTokens, "output");
273
- batchLogitIndex = this._ctx.addToBatch(queuedDecode.sequenceId, queuedDecode.firstTokenSequenceIndex, Uint32Array.from(tokensToProcess), shouldGenerateLogitAtTheEnd);
282
+ batchLogitIndexes = this._ctx.addToBatch(queuedDecode.sequenceId, queuedDecode.firstTokenSequenceIndex, Uint32Array.from(tokensToProcess), Uint32Array.from(tokenIndexesWithLogitsToProcess));
274
283
  }
275
284
  catch (err) {
276
285
  this._dispatchErrorForQueuedDecodesAndDequeue(new Set([queuedDecode]), err);
@@ -280,13 +289,23 @@ export class LlamaContext {
280
289
  if (queuedDecode.tokens.length === processAmount) {
281
290
  queuedDecodesToDelete.add(queuedDecode);
282
291
  afterDecodeActions.push({
283
- batchLogitIndex,
284
- response: queuedDecode.response,
285
- onDone: queuedDecode.onDone
292
+ queuedDecode,
293
+ batchLogitIndexes,
294
+ batchLogitTokenIndexes: tokenIndexesWithLogitsToProcess,
295
+ firstTokenIndex: queuedDecode.firstTokenSequenceIndex,
296
+ returnResults: true
286
297
  });
287
298
  }
288
299
  else {
300
+ if (batchLogitIndexes.length > 0)
301
+ afterDecodeActions.push({
302
+ queuedDecode,
303
+ batchLogitIndexes,
304
+ batchLogitTokenIndexes: tokenIndexesWithLogitsToProcess,
305
+ firstTokenIndex: queuedDecode.firstTokenSequenceIndex
306
+ });
289
307
  queuedDecode.tokens = queuedDecode.tokens.slice(processAmount);
308
+ queuedDecode.logits = queuedDecode.logits.slice(processAmount);
290
309
  queuedDecode.firstTokenSequenceIndex += processAmount;
291
310
  }
292
311
  }
@@ -315,18 +334,50 @@ export class LlamaContext {
315
334
  return;
316
335
  }
317
336
  }
318
- for (const action of afterDecodeActions) {
319
- const [accept, reject] = action.response;
320
- if (action.onDone != null && action.batchLogitIndex != null) {
321
- try {
322
- accept(action.onDone(action.batchLogitIndex ?? null));
323
- }
324
- catch (err) {
325
- reject(err);
326
- }
337
+ function finishAfterDecodeAction(action, mappedLogitValues) {
338
+ if (mappedLogitValues != null && mappedLogitValues.length > 0) {
339
+ if (queuedDecodeToMappedLogits.has(action.queuedDecode))
340
+ pushAll(queuedDecodeToMappedLogits.get(action.queuedDecode), mappedLogitValues);
341
+ else
342
+ queuedDecodeToMappedLogits.set(action.queuedDecode, mappedLogitValues);
343
+ }
344
+ if (action.returnResults != null) {
345
+ const [accept] = action.queuedDecode.response;
346
+ const mappedLogits = queuedDecodeToMappedLogits.get(action.queuedDecode) ?? [];
347
+ queuedDecodeToMappedLogits.delete(action.queuedDecode);
348
+ accept(mappedLogits);
327
349
  }
328
- accept(undefined);
329
350
  }
351
+ const afterDecodeActionResults = afterDecodeActions.map((action) => {
352
+ if (action.batchLogitIndexes.length === 0) {
353
+ finishAfterDecodeAction(action);
354
+ return undefined;
355
+ }
356
+ const mappedLogitValues = [];
357
+ let promiseChain = undefined;
358
+ const batchLogitIndexes = action.batchLogitIndexes;
359
+ const batchLogitTokenIndexes = action.batchLogitTokenIndexes;
360
+ for (let i = 0; i < batchLogitIndexes.length; i++) {
361
+ const tokenIndex = batchLogitTokenIndexes[i];
362
+ const mappedValue = promiseChain != null
363
+ ? promiseChain
364
+ .then(() => action.queuedDecode.logitDataMapper(batchLogitIndexes[i], tokenIndex + action.firstTokenIndex))
365
+ : action.queuedDecode.logitDataMapper(batchLogitIndexes[i], tokenIndex + action.firstTokenIndex);
366
+ if (mappedValue instanceof Promise) {
367
+ promiseChain = mappedValue;
368
+ mappedLogitValues.push(mappedValue
369
+ .then((value) => [tokenIndex + action.firstTokenIndex, value]));
370
+ }
371
+ else
372
+ mappedLogitValues.push([tokenIndex + action.firstTokenIndex, mappedValue]);
373
+ }
374
+ if (promiseChain != null)
375
+ return Promise.all(mappedLogitValues)
376
+ .then((resolvedMappedLogitValues) => finishAfterDecodeAction(action, resolvedMappedLogitValues));
377
+ finishAfterDecodeAction(action, mappedLogitValues);
378
+ return undefined;
379
+ });
380
+ await Promise.all(afterDecodeActionResults);
330
381
  };
331
382
  const prioritizationStrategy = resolvePrioritizationStrategy();
332
383
  if (prioritizationStrategy == null)
@@ -376,17 +427,17 @@ export class LlamaContext {
376
427
  await new Promise((accept) => setTimeout(accept, 0)); // wait for the logs to finish printing
377
428
  }
378
429
  /** @internal */
379
- async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, generateLogitAtTheEnd = false, evaluationPriority = 5, tokenMeter }, onDone) {
430
+ async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, logits, evaluationPriority = defaultEvaluationPriority, tokenMeter }, logitDataMapper) {
380
431
  return await new Promise((accept, reject) => {
381
432
  this._queuedDecodes.push({
382
433
  sequenceId,
383
434
  tokens,
435
+ logits,
384
436
  firstTokenSequenceIndex,
385
- generateLogitAtTheEnd,
386
437
  evaluationPriority,
387
438
  tokenMeter,
388
439
  response: [accept, reject],
389
- onDone
440
+ logitDataMapper
390
441
  });
391
442
  this._queuedDecodeSequenceIds.add(sequenceId);
392
443
  this._scheduleDecode();
@@ -429,10 +480,20 @@ export class LlamaContext {
429
480
  const dispatchSchedule = this._batchingOptions.dispatchSchedule;
430
481
  if (this._queuedDecodeSequenceIds.size === this._totalSequences)
431
482
  dispatch();
432
- if (dispatchSchedule === "nextTick")
433
- setTimeout(dispatch, 0);
434
- else
483
+ if (dispatchSchedule === "nextCycle") {
484
+ if (typeof setImmediate === "function")
485
+ setImmediate(dispatch);
486
+ else
487
+ setTimeout(dispatch, 0);
488
+ }
489
+ else if (typeof dispatchSchedule === "function")
435
490
  dispatchSchedule(dispatch);
491
+ else {
492
+ if (typeof setImmediate === "function")
493
+ setImmediate(dispatch);
494
+ else
495
+ setTimeout(dispatch, 0);
496
+ }
436
497
  }
437
498
  /** @internal */
438
499
  _dispatchErrorForQueuedDecodesAndDequeue(queuedDecodes, err) {
@@ -620,17 +681,27 @@ export class LlamaContextSequence {
620
681
  /** @internal */ _gcRegistry;
621
682
  /** @internal */ _context;
622
683
  /** @internal */ _contextShift;
684
+ /** @internal */ _tokenPredictor;
623
685
  /** @internal */ _tokenMeter;
624
686
  /** @internal */ _disposeAggregator = new DisposeAggregator();
687
+ /** @internal */ _lock = {};
688
+ /** @internal */ _resetTokenPredictor = false;
689
+ /** @internal */ _tokenPredictorOwner = {};
625
690
  /** @internal */ _contextTokens = [];
626
691
  /** @internal */ _nextTokenIndex = 0;
692
+ /** @internal */ _loadedTokenPredictions = [];
693
+ /** @internal */ _usedTokenPredictions = 0;
694
+ /** @internal */ _unusedTokenPredictions = 0;
695
+ /** @internal */ _validatedTokenPredictions = 0;
696
+ /** @internal */ _refutedTokenPredictions = 0;
627
697
  /** @internal */ _disposed = false;
628
698
  onDispose = new EventRelay();
629
- constructor({ sequenceId, context, tokenMeter, contextShift }) {
699
+ constructor({ sequenceId, context, tokenMeter, contextShift, tokenPredictor }) {
630
700
  this._sequenceId = sequenceId;
631
701
  this._context = context;
632
702
  this._tokenMeter = tokenMeter ?? new TokenMeter();
633
703
  this._contextShift = contextShift;
704
+ this._tokenPredictor = tokenPredictor;
634
705
  this._gcRegistry = new FinalizationRegistry(this._context._reclaimUnusedSequenceId);
635
706
  this._gcRegistry.register(this, sequenceId);
636
707
  this._disposeAggregator.add(() => this._gcRegistry.unregister(this));
@@ -639,6 +710,8 @@ export class LlamaContextSequence {
639
710
  this._disposeAggregator.add(() => {
640
711
  this._context._reclaimUnusedSequenceId(this._sequenceId);
641
712
  });
713
+ if (this._tokenPredictor != null)
714
+ this._disposeAggregator.add(this._tokenPredictor);
642
715
  }
643
716
  dispose() {
644
717
  if (this._disposed)
@@ -660,20 +733,51 @@ export class LlamaContextSequence {
660
733
  get model() {
661
734
  return this._context.model;
662
735
  }
736
+ /** The maximum number of tokens that the sequence state can hold */
737
+ get contextSize() {
738
+ return this._context.contextSize;
739
+ }
740
+ /** The index where the next evaluated token will be placed in the context */
663
741
  get nextTokenIndex() {
664
- return this._nextTokenIndex;
742
+ return this._nextTokenIndex - this._loadedTokenPredictions.length;
665
743
  }
744
+ /** The current context state tokens */
666
745
  get contextTokens() {
667
- return this._contextTokens.slice();
746
+ if (this._loadedTokenPredictions.length === 0)
747
+ return this._contextTokens.slice();
748
+ return this._contextTokens.slice(0, -this._loadedTokenPredictions.length);
668
749
  }
669
750
  get tokenMeter() {
670
751
  return this._tokenMeter;
671
752
  }
753
+ /**
754
+ * The token predictor used when creating this sequence.
755
+ */
756
+ get tokenPredictor() {
757
+ return this._tokenPredictor;
758
+ }
759
+ /**
760
+ * Statistics of token predictions using the sequence's `tokenPredictor`.
761
+ *
762
+ * The statistics change only when token prediction is used in this sequence.
763
+ *
764
+ * `validated` + `refuted` = total number of evaluated predictions.
765
+ *
766
+ * Prefer using `validated` and `refuted` to evaluate the effectiveness of token prediction.
767
+ */
768
+ get tokenPredictions() {
769
+ return {
770
+ used: this._usedTokenPredictions,
771
+ unused: this._unusedTokenPredictions,
772
+ validated: this._validatedTokenPredictions,
773
+ refuted: this._refutedTokenPredictions
774
+ };
775
+ }
672
776
  get isLoadedToMemory() {
673
777
  return !this._disposed;
674
778
  }
675
779
  compareContextTokens(tokens) {
676
- for (let i = 0; i < this._contextTokens.length; i++) {
780
+ for (let i = 0; i < this._contextTokens.length - this._loadedTokenPredictions.length; i++) {
677
781
  if (compareTokens(this._contextTokens[i], tokens[i]))
678
782
  continue;
679
783
  return {
@@ -681,7 +785,7 @@ export class LlamaContextSequence {
681
785
  };
682
786
  }
683
787
  return {
684
- firstDifferentIndex: this._contextTokens.length
788
+ firstDifferentIndex: this._contextTokens.length - this._loadedTokenPredictions.length
685
789
  };
686
790
  }
687
791
  /**
@@ -695,10 +799,12 @@ export class LlamaContextSequence {
695
799
  * which incurs token evaluation of the shifted tokens.
696
800
  */
697
801
  async adaptStateToTokens(tokens, allowShift = true) {
698
- if (this.model.fileInsights.isRecurrent || !allowShift) {
802
+ const modelSupportsShifting = !this.model.fileInsights.isRecurrent &&
803
+ this.model.fileInfo.metadata?.general?.architecture !== GgufArchitectureType.deepseek2;
804
+ if (!modelSupportsShifting || !allowShift) {
699
805
  const { firstDifferentIndex } = this.compareContextTokens(tokens);
700
- if (firstDifferentIndex < this._nextTokenIndex)
701
- await this.eraseContextTokenRanges([{
806
+ if (firstDifferentIndex < this.nextTokenIndex)
807
+ await this._eraseContextTokenRanges([{
702
808
  start: firstDifferentIndex,
703
809
  end: this._nextTokenIndex
704
810
  }]);
@@ -707,7 +813,7 @@ export class LlamaContextSequence {
707
813
  const eraseRanges = [];
708
814
  let tokensIndex = 0;
709
815
  let differentTokenIndex = undefined;
710
- for (let i = 0; i < this._contextTokens.length && tokensIndex < tokens.length; i++) {
816
+ for (let i = 0; i < this._contextTokens.length - this._loadedTokenPredictions.length && tokensIndex < tokens.length; i++) {
711
817
  if (compareTokens(this._contextTokens[i], tokens[tokensIndex])) {
712
818
  if (differentTokenIndex != null) {
713
819
  eraseRanges.push({
@@ -728,7 +834,7 @@ export class LlamaContextSequence {
728
834
  end: this._nextTokenIndex
729
835
  });
730
836
  if (eraseRanges.length > 0)
731
- await this.eraseContextTokenRanges(eraseRanges);
837
+ await this._eraseContextTokenRanges(eraseRanges);
732
838
  }
733
839
  /**
734
840
  * Clear the history of the sequence.
@@ -736,14 +842,18 @@ export class LlamaContextSequence {
736
842
  */
737
843
  async clearHistory() {
738
844
  this._ensureNotDisposed();
739
- await this.eraseContextTokenRanges([{ start: 0, end: this._nextTokenIndex }]);
845
+ await this._eraseContextTokenRanges([{ start: 0, end: this._nextTokenIndex }]);
740
846
  }
741
847
  /**
742
848
  * Erase context tokens in the provided ranges to free up space for new tokens to be generated.
743
849
  * The start of each range is inclusive, and the end of each range is exclusive.
744
850
  * For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
745
851
  */
746
- async eraseContextTokenRanges(ranges) {
852
+ eraseContextTokenRanges(ranges) {
853
+ return this._eraseContextTokenRanges(ranges);
854
+ }
855
+ /** @internal */
856
+ async _eraseContextTokenRanges(ranges, { canResetTokenPredictor = true, canRemovePredictionTokens = true, skipLock = false } = {}) {
747
857
  this._ensureNotDisposed();
748
858
  await withLock(this._context, "context", async () => {
749
859
  this._ensureNotDisposed();
@@ -776,6 +886,19 @@ export class LlamaContextSequence {
776
886
  ranges.push(range);
777
887
  return ranges;
778
888
  }, []);
889
+ const tokenPredictionsToRemove = (resolvedRanges.length > 0 && canRemovePredictionTokens)
890
+ ? this._loadedTokenPredictions.length
891
+ : 0;
892
+ if (tokenPredictionsToRemove > 0) {
893
+ const startDeleteIndex = this._nextTokenIndex - this._loadedTokenPredictions.length;
894
+ const lastDeleteRange = resolvedRanges[resolvedRanges.length - 1];
895
+ if (lastDeleteRange.end >= startDeleteIndex)
896
+ lastDeleteRange.end = this._nextTokenIndex;
897
+ else
898
+ resolvedRanges.push({ start: startDeleteIndex, end: this._nextTokenIndex });
899
+ if (canResetTokenPredictor)
900
+ await this._abortTokenPredictor(true);
901
+ }
779
902
  let removedTokens = 0;
780
903
  let lastDeleteRangeEndPos = null;
781
904
  for (const range of resolvedRanges) {
@@ -790,6 +913,8 @@ export class LlamaContextSequence {
790
913
  removedTokens += range.end - range.start;
791
914
  lastDeleteRangeEndPos = range.end;
792
915
  }
916
+ if (tokenPredictionsToRemove > 0)
917
+ this._loadedTokenPredictions.splice(0, tokenPredictionsToRemove);
793
918
  if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 &&
794
919
  lastDeleteRangeEndPos !== this._nextTokenIndex) {
795
920
  this._context._ctx.shiftSequenceTokenCells(this._sequenceId, lastDeleteRangeEndPos, this._nextTokenIndex, -removedTokens);
@@ -797,17 +922,62 @@ export class LlamaContextSequence {
797
922
  this._tokenMeter.useTokens(shiftedTokens, "input");
798
923
  }
799
924
  this._nextTokenIndex -= removedTokens;
925
+ if (canResetTokenPredictor && removedTokens > 0)
926
+ await this._abortTokenPredictor(true);
800
927
  if (deletionSuccessful)
801
928
  return;
802
929
  const newSequenceTokens = this._contextTokens.slice();
803
930
  this._nextTokenIndex = 0;
804
931
  this._context._ctx.disposeSequence(this._sequenceId);
805
- await this.evaluateWithoutGeneratingNewTokens(newSequenceTokens);
932
+ await this.evaluateWithoutGeneratingNewTokens(newSequenceTokens, { _skipLock: skipLock });
806
933
  });
807
934
  }
808
- evaluate(tokens, options = {}) {
809
- const { temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, yieldEogToken = false, _noSampling = false } = options;
810
- return this._evaluate(tokens, {
935
+ /**
936
+ * Evaluate the provided tokens into the context sequence, and continue generating new tokens on iterator iterations.
937
+ *
938
+ * This method uses the token predictor (when provided) to generate new tokens faster.
939
+ */
940
+ async *evaluate(tokens, options = {}) {
941
+ const iterator = this.evaluateWithMetadata(tokens, {}, options);
942
+ let iterateInput = undefined;
943
+ try {
944
+ while (true) {
945
+ const { value, done } = await iterator.next(iterateInput);
946
+ if (done)
947
+ return;
948
+ iterateInput = yield value.token;
949
+ }
950
+ }
951
+ finally {
952
+ await iterator.return();
953
+ }
954
+ }
955
+ /**
956
+ * Like {@link evaluate `.evaluate(...)`}, but with additional metadata for each generated token.
957
+ *
958
+ * Configure the additional metadata options to choose which metadata to include.
959
+ */
960
+ evaluateWithMetadata(tokens, metadata, options = {}) {
961
+ const { temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = defaultEvaluationPriority, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, yieldEogToken = false, _noSampling = false } = options;
962
+ if (this._tokenPredictor != null && !_noSampling && tokens.length > 0)
963
+ return this._speculativeEvaluate(tokens, metadata, {
964
+ temperature,
965
+ minP,
966
+ topK,
967
+ topP,
968
+ seed,
969
+ grammarEvaluationState,
970
+ repeatPenalty,
971
+ tokenBias,
972
+ evaluationPriority,
973
+ contextShiftOptions: {
974
+ size: contextShiftSize,
975
+ strategy: contextShiftStrategy
976
+ },
977
+ yieldEogToken,
978
+ tokenPredictor: this._tokenPredictor
979
+ });
980
+ return this._evaluate(tokens, metadata, {
811
981
  temperature,
812
982
  minP,
813
983
  topK,
@@ -827,82 +997,205 @@ export class LlamaContextSequence {
827
997
  }
828
998
  /**
829
999
  * Evaluate the provided tokens into the context sequence without generating new tokens.
830
- * @param tokens
831
- * @param [options]
832
1000
  */
833
- async evaluateWithoutGeneratingNewTokens(tokens, { evaluationPriority = 5, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {} } = {}) {
834
- const iterator = this._evaluate(tokens, {
1001
+ async evaluateWithoutGeneratingNewTokens(tokens, options = {}) {
1002
+ const { evaluationPriority = defaultEvaluationPriority, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, _skipLock = false } = options;
1003
+ const iterator = this._evaluate(tokens, {}, {
835
1004
  generateNewTokens: false,
836
1005
  evaluationPriority,
837
1006
  contextShiftOptions: {
838
1007
  size: contextShiftSize,
839
1008
  strategy: contextShiftStrategy
840
- }
1009
+ },
1010
+ _skipLock
841
1011
  });
1012
+ const predictorAlignmentPromise = this.tokenPredictor == null
1013
+ ? undefined
1014
+ : this._tokenPredictor?.reset({
1015
+ stateTokens: [...this._contextTokens, ...tokens],
1016
+ evaluateOptions: {
1017
+ evaluationPriority,
1018
+ contextShift: {
1019
+ size: contextShiftSize,
1020
+ strategy: contextShiftStrategy
1021
+ }
1022
+ },
1023
+ targetSequence: this
1024
+ });
1025
+ if (predictorAlignmentPromise != null) {
1026
+ this._tokenPredictorOwner = {};
1027
+ this._resetTokenPredictor = false;
1028
+ }
842
1029
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
843
1030
  for await (const token of iterator) {
844
1031
  // Array.from doesn't work with async generators, so we have to iterate over the generator
845
1032
  }
1033
+ await iterator.return();
1034
+ if (predictorAlignmentPromise != null)
1035
+ await predictorAlignmentPromise;
1036
+ }
1037
+ /**
1038
+ * Evaluate the provided tokens into the context sequence with custom options for each token.
1039
+ *
1040
+ * This method allows for more precise control of the generation process.
1041
+ *
1042
+ * A next token will be generated for a given token only if any of the `generateNext` options for it are used.
1043
+ *
1044
+ * To generate more tokens after this method finishes,
1045
+ * use it again with token(s) you selected to add to the context from the previous evaluation.
1046
+ *
1047
+ * This method doesn't use the token predictor (when provided) since it cannot predict which tokens are actually needed.
1048
+ * Use the `evaluate` method when you need to use token prediction.
1049
+ * @returns An array where for each token in the input array, there can be an output item at the same index in the output array.
1050
+ * For indexes that have no output, there won't be any value at the corresponding index in the output array.
1051
+ *
1052
+ * It's recommended to iterate from `0` up to the length of the input array to check the results in the output array.
1053
+ */
1054
+ async controlledEvaluate(input, options) {
1055
+ const { evaluationPriority = defaultEvaluationPriority, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {} } = options ?? {};
1056
+ const contextShiftOptions = {
1057
+ size: contextShiftSize,
1058
+ strategy: contextShiftStrategy
1059
+ };
1060
+ this._ensureNotDisposed();
1061
+ if (input.length === 0)
1062
+ return [];
1063
+ await this._abortTokenPredictor();
1064
+ const sampler = new LlamaSampler(this.model);
1065
+ const onTokenResult = safeEventCallback(options?.onTokenResult);
1066
+ const logitsArray = [];
1067
+ const resolvedTokens = input.map((item, index) => {
1068
+ if (item instanceof Array) {
1069
+ const [token, options] = item;
1070
+ const generateNext = options?.generateNext ?? {};
1071
+ if (generateNext.probabilities === true || generateNext.confidence === true || generateNext.token === true)
1072
+ logitsArray[index] = true;
1073
+ return token;
1074
+ }
1075
+ return item;
1076
+ });
1077
+ const evaluatorLock = await acquireLock(this._lock, "evaluate");
1078
+ try {
1079
+ return await this._decodeTokens(resolvedTokens, logitsArray, evaluationPriority, this._tokenMeter, contextShiftOptions, async (batchLogitIndex, tokenIndex) => {
1080
+ const inputToken = input[tokenIndex];
1081
+ const inputOptions = inputToken instanceof Array
1082
+ ? (inputToken[1] ?? {})
1083
+ : {};
1084
+ const generateNext = inputOptions.generateNext;
1085
+ if (generateNext == null || ((generateNext.probabilities == null || !generateNext.probabilities) &&
1086
+ (generateNext.token == null || !generateNext.token) &&
1087
+ (generateNext.confidence == null || !generateNext.confidence)))
1088
+ return undefined;
1089
+ const sampleOptions = generateNext.options ?? {};
1090
+ const samplerConfig = this._resolveSamplerConfig({
1091
+ temperature: sampleOptions.temperature,
1092
+ minP: sampleOptions.minP,
1093
+ topK: sampleOptions.topK,
1094
+ topP: sampleOptions.topP,
1095
+ seed: sampleOptions.seed,
1096
+ repeatPenalty: sampleOptions.repeatPenalty,
1097
+ tokenBias: sampleOptions.tokenBias
1098
+ });
1099
+ return await withLock(sampler, "sample", async () => {
1100
+ if (sampler.disposed)
1101
+ return undefined;
1102
+ sampler.applyConfig(samplerConfig);
1103
+ const [token, probabilities, confidence] = await this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler, !!generateNext.probabilities, !!generateNext.confidence);
1104
+ const output = {
1105
+ next: {}
1106
+ };
1107
+ if (generateNext.token)
1108
+ output.next.token = token === -1
1109
+ ? null
1110
+ : (token ?? null);
1111
+ if (confidence != null)
1112
+ output.next.confidence = confidence;
1113
+ if (probabilities != null)
1114
+ output.next.probabilities = reviveTokenProbabilities(probabilities);
1115
+ onTokenResult?.(tokenIndex, output);
1116
+ return output;
1117
+ });
1118
+ });
1119
+ }
1120
+ finally {
1121
+ evaluatorLock.dispose();
1122
+ void withLock(sampler, "sample", sampler.asyncDispose);
1123
+ }
846
1124
  }
847
1125
  /** @internal */
848
- async *_evaluate(tokens, { temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, generateNewTokens = true, contextShiftOptions, yieldEogToken = false, _noSampling = false }) {
1126
+ async *_evaluate(tokens, metadata, { temperature, minP, topK, topP, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = defaultEvaluationPriority, generateNewTokens = true, contextShiftOptions, yieldEogToken = false, _noSampling = false, _skipLock = false }) {
849
1127
  this._ensureNotDisposed();
850
1128
  let evalTokens = tokens;
851
1129
  if (evalTokens.length === 0)
852
1130
  return;
1131
+ await this._abortTokenPredictor(false, true);
1132
+ const sampleProbabilities = metadata.probabilities === true;
1133
+ const sampleConfidence = metadata.confidence === true;
853
1134
  const sampler = new LlamaSampler(this.model);
854
1135
  try {
855
1136
  while (true) {
856
1137
  this._ensureNotDisposed();
857
- // Evaluate to get the next token.
858
- const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
859
- if (_noSampling)
860
- return null;
861
- const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
862
- ? repeatPenalty.punishTokens()
863
- : repeatPenalty?.punishTokens;
864
- const maxPunishTokens = Math.max(repeatPenalty?.maxPunishTokens ?? defaultMaxPunishTokens, repeatPenaltyTokens?.length ?? 0);
865
- const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
866
- ? grammarEvaluationState()
867
- : grammarEvaluationState;
868
- if (resolvedGrammarEvaluationState != null && resolvedGrammarEvaluationState._llama !== this.model._llama)
869
- throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
870
- const { tokenBiasKeys, tokenBiasValues } = getTokenBiasesForAddon(tokenBias, this.model);
871
- sampler.applyConfig(removeNullFields({
872
- temperature,
873
- minP,
874
- topK,
875
- topP,
876
- seed: Math.max(0, Number.isFinite(seed)
877
- ? Math.floor(seed ?? (Date.now() / 1000))
878
- : Math.floor(Date.now() / 1000)),
879
- repeatPenalty: repeatPenalty?.penalty,
880
- repeatPenaltyMaxTokens: maxPunishTokens,
881
- repeatPenaltyTokens: repeatPenaltyTokens != null
882
- ? Uint32Array.from(repeatPenaltyTokens)
883
- : undefined,
884
- repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty,
885
- repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty,
886
- tokenBiasKeys,
887
- tokenBiasValues,
888
- grammarEvaluationState: resolvedGrammarEvaluationState?._state
889
- }));
890
- return withLock(sampler, "sample", async () => {
891
- if (sampler.disposed)
1138
+ const evaluatorLock = _skipLock
1139
+ ? undefined
1140
+ : await acquireLock(this._lock, "evaluate");
1141
+ let nextToken;
1142
+ const yieldRes = {};
1143
+ try {
1144
+ const logitsArray = [];
1145
+ if (generateNewTokens)
1146
+ logitsArray[evalTokens.length - 1] = true;
1147
+ // Evaluate to get the next token.
1148
+ const decodeResult = await this._decodeTokens(evalTokens, logitsArray, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
1149
+ if (_noSampling)
892
1150
  return null;
893
- return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
1151
+ const samplerConfig = this._resolveSamplerConfig({
1152
+ temperature,
1153
+ minP,
1154
+ topK,
1155
+ topP,
1156
+ seed,
1157
+ grammarEvaluationState,
1158
+ repeatPenalty,
1159
+ tokenBias
1160
+ });
1161
+ return withLock(sampler, "sample", async () => {
1162
+ if (sampler.disposed)
1163
+ return null;
1164
+ sampler.applyConfig(samplerConfig);
1165
+ if (sampleProbabilities || sampleConfidence)
1166
+ return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler, sampleProbabilities, sampleConfidence);
1167
+ else
1168
+ return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
1169
+ });
894
1170
  });
895
- });
896
- if (nextToken === -1)
897
- throw new Error("Failed to sample next token");
898
- if (nextToken == null)
899
- return;
900
- // the model finished generating text
901
- if (!yieldEogToken && this._context.model.isEogToken(nextToken))
902
- break;
903
- const replacementToken = (yield nextToken);
1171
+ const lastDecodeResult = decodeResult[evalTokens.length - 1];
1172
+ if (lastDecodeResult instanceof Array) {
1173
+ const [token, probabilities, confidence] = lastDecodeResult;
1174
+ nextToken = token;
1175
+ if (probabilities != null)
1176
+ yieldRes.probabilities = reviveTokenProbabilities(probabilities);
1177
+ if (confidence != null)
1178
+ yieldRes.confidence = confidence;
1179
+ }
1180
+ else
1181
+ nextToken = lastDecodeResult;
1182
+ if (nextToken === -1)
1183
+ throw new Error("Failed to sample next token");
1184
+ if (nextToken == null)
1185
+ return;
1186
+ // the model finished generating text
1187
+ if (!yieldEogToken && this._context.model.isEogToken(nextToken))
1188
+ break;
1189
+ }
1190
+ finally {
1191
+ evaluatorLock?.dispose();
1192
+ }
1193
+ yieldRes.token = nextToken;
1194
+ const replacementToken = yield yieldRes;
904
1195
  // set the tokens for the next evaluation
905
- if (replacementToken != null)
1196
+ if (replacementToken instanceof Array)
1197
+ evalTokens = replacementToken.slice();
1198
+ else if (replacementToken != null)
906
1199
  evalTokens = [replacementToken];
907
1200
  else
908
1201
  evalTokens = [nextToken];
@@ -913,39 +1206,280 @@ export class LlamaContextSequence {
913
1206
  }
914
1207
  }
915
1208
  /** @internal */
916
- async _decodeTokens(tokens, generateLogit, evaluationPriority, tokenMeter, contextShiftOptions, onDecodeDone) {
1209
+ async *_speculativeEvaluate(tokens, metadata, { temperature, minP, topK, topP, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = defaultEvaluationPriority, contextShiftOptions, yieldEogToken = false, tokenPredictor }) {
917
1210
  this._ensureNotDisposed();
918
- const tokensLeftToDecode = tokens.slice();
919
- return await withLock(this, "evaluate", async () => {
920
- while (tokensLeftToDecode.length > 0) {
1211
+ let evalTokens = tokens.slice();
1212
+ if (evalTokens.length === 0)
1213
+ return;
1214
+ const tokenPredictorOwner = {};
1215
+ this._tokenPredictorOwner = tokenPredictorOwner;
1216
+ await this._abortTokenPredictor();
1217
+ const sampleProbabilities = metadata.probabilities === true;
1218
+ const sampleConfidence = metadata.confidence === true;
1219
+ let logitsArray = [];
1220
+ let logitsStartIndex = evalTokens.length - 1;
1221
+ const validatedTokens = [];
1222
+ logitsArray[logitsStartIndex] = true;
1223
+ const sampler = new LlamaSampler(this.model);
1224
+ try {
1225
+ while (true) {
921
1226
  this._ensureNotDisposed();
922
- let freeSpace = this._context.contextSize - 1 - this._nextTokenIndex;
923
- if (freeSpace <= 0) {
924
- await this._freeUpSpaceForTokens(contextShiftOptions);
925
- freeSpace = this._context.contextSize - 1 - this._nextTokenIndex;
926
- if (freeSpace <= 0)
927
- throw new Error("Failed to free up space for new tokens");
1227
+ const evaluatorLock = await acquireLock(this._lock, "evaluate");
1228
+ let nextToken;
1229
+ const yieldRes = {};
1230
+ try {
1231
+ if (this._tokenPredictorOwner === tokenPredictorOwner &&
1232
+ this._loadedTokenPredictions.length > 0 &&
1233
+ evalTokens.length === 1 &&
1234
+ evalTokens[0] === this._loadedTokenPredictions[0]?.[0]) {
1235
+ const [token, probabilities, confidence] = this._loadedTokenPredictions.shift()[1];
1236
+ nextToken = token;
1237
+ yieldRes.token = nextToken;
1238
+ if (probabilities != null)
1239
+ yieldRes.probabilities = reviveTokenProbabilities(probabilities);
1240
+ if (confidence != null)
1241
+ yieldRes.confidence = confidence;
1242
+ const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
1243
+ ? grammarEvaluationState()
1244
+ : grammarEvaluationState;
1245
+ if (resolvedGrammarEvaluationState != null)
1246
+ LlamaSampler._acceptTokenOnGrammarEvaluationState(this._context._llama, resolvedGrammarEvaluationState, nextToken);
1247
+ this._unusedTokenPredictions--;
1248
+ this._usedTokenPredictions++;
1249
+ }
1250
+ else if (this._tokenPredictorOwner === tokenPredictorOwner && this._loadedTokenPredictions.length > 0) {
1251
+ const deleteStartIndex = Math.max(0, this._nextTokenIndex - this._loadedTokenPredictions.length);
1252
+ await this._eraseContextTokenRanges([{ start: deleteStartIndex, end: this._nextTokenIndex }], { canResetTokenPredictor: true, canRemovePredictionTokens: true, skipLock: true });
1253
+ this._loadedTokenPredictions.length = 0;
1254
+ }
1255
+ if (this._resetTokenPredictor) {
1256
+ await tokenPredictor.reset({
1257
+ stateTokens: [...this._contextTokens, ...evalTokens],
1258
+ evaluateOptions: {
1259
+ temperature,
1260
+ minP,
1261
+ topK,
1262
+ topP,
1263
+ seed,
1264
+ grammarEvaluationState: grammarEvaluationState instanceof Function
1265
+ ? grammarEvaluationState()?.clone()
1266
+ : grammarEvaluationState?.clone(),
1267
+ repeatPenalty,
1268
+ tokenBias,
1269
+ evaluationPriority,
1270
+ contextShift: contextShiftOptions,
1271
+ yieldEogToken: true
1272
+ },
1273
+ targetSequence: this
1274
+ });
1275
+ this._resetTokenPredictor = false;
1276
+ this._tokenPredictorOwner = tokenPredictorOwner;
1277
+ }
1278
+ if (nextToken == null) {
1279
+ if (this._tokenPredictorOwner === tokenPredictorOwner &&
1280
+ // prevent incurring context shifts due to token prediction validations
1281
+ this._nextTokenIndex + evalTokens.length < this._context.contextSize) {
1282
+ const testGrammarClone = grammarEvaluationState instanceof Function
1283
+ ? grammarEvaluationState()?.clone()
1284
+ : grammarEvaluationState?.clone();
1285
+ for (const token of await tokenPredictor.predictTokens()) {
1286
+ if (testGrammarClone != null) {
1287
+ const canAddToken = LlamaSampler._canBeNextTokenForGrammarEvaluationState(this.model._llama, testGrammarClone, token);
1288
+ if (!canAddToken)
1289
+ break;
1290
+ }
1291
+ evalTokens.push(token);
1292
+ logitsArray[evalTokens.length - 1] = true;
1293
+ // prevent incurring context shifts due to token prediction validations
1294
+ if (this._nextTokenIndex + evalTokens.length >= this._context.contextSize)
1295
+ break;
1296
+ }
1297
+ }
1298
+ let resolvedGrammarEvaluationState = undefined;
1299
+ // Evaluate to get the next token.
1300
+ const decodeResult = await this._decodeTokens(evalTokens, logitsArray, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex, tokenIndex) => {
1301
+ if (tokenIndex === logitsStartIndex)
1302
+ resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
1303
+ ? grammarEvaluationState()
1304
+ : grammarEvaluationState;
1305
+ else if (tokenIndex === logitsStartIndex + 1)
1306
+ resolvedGrammarEvaluationState = resolvedGrammarEvaluationState?.clone();
1307
+ const samplerConfig = this._resolveSamplerConfig({
1308
+ temperature,
1309
+ minP,
1310
+ topK,
1311
+ topP,
1312
+ seed,
1313
+ grammarEvaluationState: resolvedGrammarEvaluationState,
1314
+ repeatPenalty,
1315
+ tokenBias
1316
+ });
1317
+ return withLock(sampler, "sample", async () => {
1318
+ if (sampler.disposed)
1319
+ return null;
1320
+ sampler.applyConfig(samplerConfig);
1321
+ if (sampleProbabilities || sampleConfidence)
1322
+ return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler, sampleProbabilities, sampleConfidence);
1323
+ else
1324
+ return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
1325
+ });
1326
+ });
1327
+ for (let i = logitsStartIndex; i < evalTokens.length; i++) {
1328
+ const item = decodeResult[i];
1329
+ const [resultToken, probabilities, confidence] = item instanceof Array
1330
+ ? item
1331
+ : [item];
1332
+ if (i === logitsStartIndex) {
1333
+ if (resultToken === -1)
1334
+ throw new Error("Failed to sample next token");
1335
+ if (resultToken == null)
1336
+ return;
1337
+ nextToken = resultToken;
1338
+ yieldRes.token = nextToken;
1339
+ if (probabilities != null)
1340
+ yieldRes.probabilities = reviveTokenProbabilities(probabilities);
1341
+ if (confidence != null)
1342
+ yieldRes.confidence = confidence;
1343
+ }
1344
+ else {
1345
+ if (resultToken === -1 || resultToken == null)
1346
+ break;
1347
+ const lastValidatedTokenOutput = i === logitsStartIndex + 1
1348
+ ? nextToken
1349
+ : validatedTokens.at(-1)?.[1];
1350
+ if (lastValidatedTokenOutput != null && lastValidatedTokenOutput === evalTokens[i]) {
1351
+ this._loadedTokenPredictions.push([evalTokens[i], [resultToken, probabilities, confidence]]);
1352
+ this._validatedTokenPredictions++;
1353
+ this._unusedTokenPredictions++;
1354
+ }
1355
+ else {
1356
+ const deleteSize = Math.min(evalTokens.length - i, this.context.contextSize);
1357
+ this._refutedTokenPredictions += deleteSize;
1358
+ const deleteStartIndex = this._nextTokenIndex - deleteSize;
1359
+ tokenPredictor.stop(true);
1360
+ await this._eraseContextTokenRanges([{
1361
+ start: deleteStartIndex,
1362
+ end: this._nextTokenIndex
1363
+ }], { canResetTokenPredictor: false, canRemovePredictionTokens: false, skipLock: true });
1364
+ break; // the assumption that this token will be generated was wrong
1365
+ }
1366
+ }
1367
+ }
1368
+ }
1369
+ if (nextToken == null)
1370
+ throw new Error("Failed to generated next token");
1371
+ // the model finished generating text
1372
+ if (!yieldEogToken && this._context.model.isEogToken(nextToken))
1373
+ break;
928
1374
  }
929
- const tokensToDecode = tokensLeftToDecode.splice(0, freeSpace);
930
- const generateLogitAtTheEnd = generateLogit && tokensLeftToDecode.length === 0;
931
- const nextToken = await this._context._decodeTokens({
932
- sequenceId: this._sequenceId,
933
- tokens: tokensToDecode,
934
- firstTokenSequenceIndex: this._nextTokenIndex,
935
- generateLogitAtTheEnd,
936
- evaluationPriority,
937
- tokenMeter
938
- }, !generateLogitAtTheEnd
939
- ? undefined
940
- : onDecodeDone);
941
- this._nextTokenIndex += tokensToDecode.length;
942
- this._contextTokens = this._contextTokens.concat(tokensToDecode);
943
- if (generateLogitAtTheEnd && nextToken != null)
944
- return nextToken;
1375
+ finally {
1376
+ evaluatorLock.dispose();
1377
+ }
1378
+ const replacementToken = yield yieldRes;
1379
+ // set the tokens for the next evaluation
1380
+ if (replacementToken instanceof Array)
1381
+ evalTokens = replacementToken.slice();
1382
+ else if (replacementToken != null)
1383
+ evalTokens = [replacementToken];
1384
+ else
1385
+ evalTokens = [nextToken];
1386
+ if (this._tokenPredictorOwner === tokenPredictorOwner)
1387
+ tokenPredictor.pushTokens(evalTokens);
1388
+ logitsArray = [];
1389
+ logitsStartIndex = evalTokens.length - 1;
1390
+ logitsArray[logitsStartIndex] = true;
945
1391
  }
946
- return null;
1392
+ }
1393
+ finally {
1394
+ void withLock(sampler, "sample", sampler.asyncDispose);
1395
+ if (this._tokenPredictorOwner === tokenPredictorOwner)
1396
+ tokenPredictor.stop();
1397
+ }
1398
+ }
1399
+ /** @internal */
1400
+ async _abortTokenPredictor(skipClearingPredictionsFromState = false, skipLock = false) {
1401
+ this._tokenPredictor?.stop();
1402
+ this._resetTokenPredictor = true;
1403
+ if (skipClearingPredictionsFromState)
1404
+ return;
1405
+ if (this._loadedTokenPredictions.length > 0)
1406
+ await this._eraseContextTokenRanges([{
1407
+ start: this._nextTokenIndex - this._loadedTokenPredictions.length,
1408
+ end: this._nextTokenIndex
1409
+ }], { canResetTokenPredictor: true, canRemovePredictionTokens: true, skipLock });
1410
+ }
1411
+ /** @internal */
1412
+ _resolveSamplerConfig({ temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, grammarEvaluationState, repeatPenalty, tokenBias }) {
1413
+ const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
1414
+ ? repeatPenalty.punishTokens()
1415
+ : repeatPenalty?.punishTokens;
1416
+ const maxPunishTokens = Math.max(repeatPenalty?.maxPunishTokens ?? defaultMaxPunishTokens, repeatPenaltyTokens?.length ?? 0);
1417
+ const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
1418
+ ? grammarEvaluationState()
1419
+ : grammarEvaluationState;
1420
+ if (resolvedGrammarEvaluationState != null && resolvedGrammarEvaluationState._llama !== this.model._llama)
1421
+ throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
1422
+ const { tokenBiasKeys, tokenBiasValues } = getTokenBiasesForAddon(tokenBias, this.model);
1423
+ return removeNullFields({
1424
+ temperature,
1425
+ minP,
1426
+ topK,
1427
+ topP,
1428
+ seed: Math.max(0, Number.isFinite(seed)
1429
+ ? Math.floor(seed ?? (Date.now() / 1000))
1430
+ : Math.floor(Date.now() / 1000)),
1431
+ repeatPenalty: repeatPenalty?.penalty,
1432
+ repeatPenaltyMaxTokens: maxPunishTokens,
1433
+ repeatPenaltyTokens: repeatPenaltyTokens != null
1434
+ ? Uint32Array.from(repeatPenaltyTokens)
1435
+ : undefined,
1436
+ repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty,
1437
+ repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty,
1438
+ tokenBiasKeys,
1439
+ tokenBiasValues,
1440
+ grammarEvaluationState: resolvedGrammarEvaluationState?._state
947
1441
  });
948
1442
  }
1443
+ /**
1444
+ * The caller of this function has to wrap it with a lock to ensure this function doesn't run concurrently.
1445
+ * @internal
1446
+ */
1447
+ async _decodeTokens(tokens, logits, evaluationPriority, tokenMeter, contextShiftOptions, logitDataMapper) {
1448
+ this._ensureNotDisposed();
1449
+ const tokensLeftToDecode = tokens.slice();
1450
+ const tokenLogitsLeftToDecode = logits.slice();
1451
+ let currentTokenIndex = 0;
1452
+ const res = [];
1453
+ const normalizedLogitDataMapper = (batchLogitIndex, contextStateTokenIndex) => {
1454
+ return logitDataMapper(batchLogitIndex, currentTokenIndex + (contextStateTokenIndex - this._nextTokenIndex));
1455
+ };
1456
+ while (tokensLeftToDecode.length > 0) {
1457
+ this._ensureNotDisposed();
1458
+ let freeSpace = this._context.contextSize - 1 - this._nextTokenIndex;
1459
+ if (freeSpace <= 0) {
1460
+ await this._freeUpSpaceForTokens(contextShiftOptions);
1461
+ freeSpace = this._context.contextSize - 1 - this._nextTokenIndex;
1462
+ if (freeSpace <= 0)
1463
+ throw new Error("Failed to free up space for new tokens");
1464
+ }
1465
+ const tokensToDecode = tokensLeftToDecode.splice(0, freeSpace);
1466
+ const tokensLogits = tokenLogitsLeftToDecode.slice(0, tokensToDecode.length);
1467
+ const generatedLogits = await this._context._decodeTokens({
1468
+ sequenceId: this._sequenceId,
1469
+ tokens: tokensToDecode,
1470
+ firstTokenSequenceIndex: this._nextTokenIndex,
1471
+ logits: tokensLogits,
1472
+ evaluationPriority,
1473
+ tokenMeter
1474
+ }, normalizedLogitDataMapper);
1475
+ for (const [index, value] of generatedLogits)
1476
+ res[currentTokenIndex + (index - this._nextTokenIndex)] = value;
1477
+ this._nextTokenIndex += tokensToDecode.length;
1478
+ currentTokenIndex += tokensToDecode.length;
1479
+ this._contextTokens = this._contextTokens.concat(tokensToDecode);
1480
+ }
1481
+ return res;
1482
+ }
949
1483
  /** @internal */
950
1484
  async _freeUpSpaceForTokens(contextShiftOptions) {
951
1485
  this._ensureNotDisposed();
@@ -957,7 +1491,7 @@ export class LlamaContextSequence {
957
1491
  let eraseStartIndex = 0;
958
1492
  if (this.model.tokens.bos != null && this._contextTokens[0] === this.model.tokens.bos)
959
1493
  eraseStartIndex = 1;
960
- await this.eraseContextTokenRanges([{ start: eraseStartIndex, end: size + eraseStartIndex }]);
1494
+ await this._eraseContextTokenRanges([{ start: eraseStartIndex, end: size + eraseStartIndex }], { skipLock: true });
961
1495
  }
962
1496
  else {
963
1497
  const ranges = await contextShiftOptions.strategy({
@@ -966,9 +1500,9 @@ export class LlamaContextSequence {
966
1500
  });
967
1501
  if (ranges == null)
968
1502
  throw new Error("Invalid delete ranges");
969
- await this.eraseContextTokenRanges(ranges);
970
- if (this.nextTokenIndex >= this._context.contextSize - 1)
971
- await this.eraseContextTokenRanges([{ start: 0, end: size }]);
1503
+ await this._eraseContextTokenRanges(ranges, { skipLock: true });
1504
+ if (this._nextTokenIndex >= this._context.contextSize - 1)
1505
+ await this._eraseContextTokenRanges([{ start: 0, end: size }], { skipLock: true });
972
1506
  }
973
1507
  }
974
1508
  /** @internal */
@@ -980,7 +1514,7 @@ export class LlamaContextSequence {
980
1514
  * We need this to make it impossible to manually create instances of this class outside the code of this library
981
1515
  * @internal
982
1516
  */
983
- static _create({ sequenceId, context, tokenMeter, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} }) {
1517
+ static _create({ sequenceId, context, tokenMeter, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, tokenPredictor }) {
984
1518
  return new LlamaContextSequence({
985
1519
  sequenceId,
986
1520
  context,
@@ -988,7 +1522,8 @@ export class LlamaContextSequence {
988
1522
  contextShift: {
989
1523
  size: contextShiftSize,
990
1524
  strategy: contextShiftStrategy
991
- }
1525
+ },
1526
+ tokenPredictor
992
1527
  });
993
1528
  }
994
1529
  }
@@ -1020,6 +1555,17 @@ function getTokenBiasesForAddon(tokenBias, currentModel) {
1020
1555
  tokenBiasValues: Float32Array.from(tokenBiasValues)
1021
1556
  };
1022
1557
  }
1558
/**
 * Convert a flat `[token, probability, token, probability, ...]` list into a `Map` of token to probability.
 *
 * Entries keep the order of the list. A trailing item that has no pair is ignored,
 * and when a token appears more than once, its last probability is kept.
 * @param probabilities - the flat list, or `null`/`undefined`
 * @returns the map, or `undefined` when `probabilities` is `null` or `undefined`
 */
function reviveTokenProbabilities(probabilities) {
    if (probabilities == null)
        return undefined;
    const revived = new Map();
    const pairCount = Math.floor(probabilities.length / 2);
    for (let pair = 0; pair < pairCount; pair++)
        revived.set(probabilities[pair * 2], probabilities[pair * 2 + 1]);
    return revived;
}
1023
1569
  function disposeContextIfReferenced(contextRef) {
1024
1570
  const context = contextRef.deref();
1025
1571
  if (context != null)