node-llama-cpp 3.3.2 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/dist/bindings/AddonTypes.d.ts +12 -4
- package/dist/bindings/Llama.d.ts +9 -0
- package/dist/bindings/Llama.js +52 -28
- package/dist/bindings/Llama.js.map +1 -1
- package/dist/bindings/getLlama.d.ts +2 -1
- package/dist/bindings/getLlama.js +19 -9
- package/dist/bindings/getLlama.js.map +1 -1
- package/dist/bindings/utils/asyncSome.js +2 -0
- package/dist/bindings/utils/asyncSome.js.map +1 -1
- package/dist/bindings/utils/compileLLamaCpp.d.ts +1 -1
- package/dist/bindings/utils/compileLLamaCpp.js +108 -34
- package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
- package/dist/bindings/utils/detectAvailableComputeLayers.d.ts +1 -0
- package/dist/bindings/utils/detectAvailableComputeLayers.js +4 -4
- package/dist/bindings/utils/detectAvailableComputeLayers.js.map +1 -1
- package/dist/bindings/utils/detectBuildTools.d.ts +14 -0
- package/dist/bindings/utils/detectBuildTools.js +149 -0
- package/dist/bindings/utils/detectBuildTools.js.map +1 -0
- package/dist/bindings/utils/resolveActualBindingBinaryPath.d.ts +1 -0
- package/dist/bindings/utils/resolveActualBindingBinaryPath.js +18 -0
- package/dist/bindings/utils/resolveActualBindingBinaryPath.js.map +1 -0
- package/dist/bindings/utils/testBindingBinary.d.ts +1 -1
- package/dist/bindings/utils/testBindingBinary.js +58 -5
- package/dist/bindings/utils/testBindingBinary.js.map +1 -1
- package/dist/chatWrappers/AlpacaChatWrapper.d.ts +4 -0
- package/dist/chatWrappers/AlpacaChatWrapper.js +4 -0
- package/dist/chatWrappers/AlpacaChatWrapper.js.map +1 -1
- package/dist/chatWrappers/FalconChatWrapper.d.ts +4 -0
- package/dist/chatWrappers/FalconChatWrapper.js +4 -0
- package/dist/chatWrappers/FalconChatWrapper.js.map +1 -1
- package/dist/chatWrappers/GeneralChatWrapper.d.ts +4 -0
- package/dist/chatWrappers/GeneralChatWrapper.js +4 -0
- package/dist/chatWrappers/GeneralChatWrapper.js.map +1 -1
- package/dist/chatWrappers/utils/resolveChatWrapper.d.ts +2 -0
- package/dist/chatWrappers/utils/resolveChatWrapper.js +8 -27
- package/dist/chatWrappers/utils/resolveChatWrapper.js.map +1 -1
- package/dist/cli/commands/ChatCommand.d.ts +4 -0
- package/dist/cli/commands/ChatCommand.js +155 -11
- package/dist/cli/commands/ChatCommand.js.map +1 -1
- package/dist/cli/commands/CompleteCommand.d.ts +4 -0
- package/dist/cli/commands/CompleteCommand.js +143 -10
- package/dist/cli/commands/CompleteCommand.js.map +1 -1
- package/dist/cli/commands/DebugCommand.js +5 -5
- package/dist/cli/commands/DebugCommand.js.map +1 -1
- package/dist/cli/commands/InfillCommand.d.ts +4 -0
- package/dist/cli/commands/InfillCommand.js +142 -10
- package/dist/cli/commands/InfillCommand.js.map +1 -1
- package/dist/cli/commands/OnPostInstallCommand.js +12 -2
- package/dist/cli/commands/OnPostInstallCommand.js.map +1 -1
- package/dist/cli/commands/inspect/commands/InspectEstimateCommand.d.ts +1 -0
- package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js +14 -7
- package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js.map +1 -1
- package/dist/cli/commands/inspect/commands/InspectGgufCommand.js +13 -3
- package/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map +1 -1
- package/dist/cli/commands/inspect/commands/InspectGpuCommand.js +20 -10
- package/dist/cli/commands/inspect/commands/InspectGpuCommand.js.map +1 -1
- package/dist/cli/commands/inspect/commands/InspectMeasureCommand.d.ts +2 -0
- package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js +234 -77
- package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map +1 -1
- package/dist/cli/utils/ConsoleTable.d.ts +1 -0
- package/dist/cli/utils/ConsoleTable.js +5 -1
- package/dist/cli/utils/ConsoleTable.js.map +1 -1
- package/dist/cli/utils/interactivelyAskForModel.d.ts +2 -1
- package/dist/cli/utils/interactivelyAskForModel.js +16 -13
- package/dist/cli/utils/interactivelyAskForModel.js.map +1 -1
- package/dist/cli/utils/isRunningUnderRosetta.d.ts +1 -0
- package/dist/cli/utils/isRunningUnderRosetta.js +20 -0
- package/dist/cli/utils/isRunningUnderRosetta.js.map +1 -0
- package/dist/cli/utils/printCommonInfoLines.d.ts +4 -2
- package/dist/cli/utils/printCommonInfoLines.js +67 -5
- package/dist/cli/utils/printCommonInfoLines.js.map +1 -1
- package/dist/cli/utils/resolveCommandGgufPath.d.ts +3 -1
- package/dist/cli/utils/resolveCommandGgufPath.js +6 -5
- package/dist/cli/utils/resolveCommandGgufPath.js.map +1 -1
- package/dist/cli/utils/toBytes.d.ts +1 -0
- package/dist/cli/utils/toBytes.js +5 -0
- package/dist/cli/utils/toBytes.js.map +1 -0
- package/dist/config.d.ts +3 -0
- package/dist/config.js +3 -0
- package/dist/config.js.map +1 -1
- package/dist/evaluator/LlamaChat/LlamaChat.d.ts +12 -3
- package/dist/evaluator/LlamaChat/LlamaChat.js +21 -7
- package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
- package/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts +6 -2
- package/dist/evaluator/LlamaChatSession/LlamaChatSession.js +3 -0
- package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -1
- package/dist/evaluator/LlamaCompletion.d.ts +3 -0
- package/dist/evaluator/LlamaCompletion.js +5 -0
- package/dist/evaluator/LlamaCompletion.js.map +1 -1
- package/dist/evaluator/LlamaContext/LlamaContext.d.ts +81 -38
- package/dist/evaluator/LlamaContext/LlamaContext.js +678 -132
- package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
- package/dist/evaluator/LlamaContext/TokenPredictor.d.ts +55 -0
- package/dist/evaluator/LlamaContext/TokenPredictor.js +20 -0
- package/dist/evaluator/LlamaContext/TokenPredictor.js.map +1 -0
- package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.d.ts +56 -0
- package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js +266 -0
- package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js.map +1 -0
- package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.d.ts +58 -0
- package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js +138 -0
- package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js.map +1 -0
- package/dist/evaluator/LlamaContext/types.d.ts +198 -5
- package/dist/evaluator/LlamaEmbeddingContext.d.ts +3 -0
- package/dist/evaluator/LlamaEmbeddingContext.js +3 -0
- package/dist/evaluator/LlamaEmbeddingContext.js.map +1 -1
- package/dist/evaluator/LlamaGrammar.d.ts +7 -1
- package/dist/evaluator/LlamaGrammar.js +6 -0
- package/dist/evaluator/LlamaGrammar.js.map +1 -1
- package/dist/evaluator/LlamaGrammarEvaluationState.d.ts +4 -4
- package/dist/evaluator/LlamaGrammarEvaluationState.js +16 -8
- package/dist/evaluator/LlamaGrammarEvaluationState.js.map +1 -1
- package/dist/evaluator/LlamaJsonSchemaGrammar.d.ts +5 -0
- package/dist/evaluator/LlamaJsonSchemaGrammar.js +7 -0
- package/dist/evaluator/LlamaJsonSchemaGrammar.js.map +1 -1
- package/dist/evaluator/LlamaModel/LlamaModel.d.ts +19 -11
- package/dist/evaluator/LlamaModel/LlamaModel.js +23 -29
- package/dist/evaluator/LlamaModel/LlamaModel.js.map +1 -1
- package/dist/evaluator/LlamaRankingContext.d.ts +76 -0
- package/dist/evaluator/LlamaRankingContext.js +159 -0
- package/dist/evaluator/LlamaRankingContext.js.map +1 -0
- package/dist/evaluator/TokenBias.d.ts +3 -0
- package/dist/evaluator/TokenBias.js +3 -0
- package/dist/evaluator/TokenBias.js.map +1 -1
- package/dist/evaluator/utils/chunkDocument.d.ts +86 -0
- package/dist/evaluator/utils/chunkDocument.js +212 -0
- package/dist/evaluator/utils/chunkDocument.js.map +1 -0
- package/dist/gguf/insights/GgufInsights.d.ts +3 -1
- package/dist/gguf/insights/GgufInsights.js +114 -8
- package/dist/gguf/insights/GgufInsights.js.map +1 -1
- package/dist/gguf/insights/GgufInsightsConfigurationResolver.d.ts +6 -3
- package/dist/gguf/insights/GgufInsightsConfigurationResolver.js +11 -7
- package/dist/gguf/insights/GgufInsightsConfigurationResolver.js.map +1 -1
- package/dist/gguf/insights/utils/resolveModelGpuLayersOption.d.ts +2 -1
- package/dist/gguf/insights/utils/resolveModelGpuLayersOption.js +13 -7
- package/dist/gguf/insights/utils/resolveModelGpuLayersOption.js.map +1 -1
- package/dist/gguf/parser/GgufV2Parser.js +29 -8
- package/dist/gguf/parser/GgufV2Parser.js.map +1 -1
- package/dist/gguf/parser/parseGguf.js +11 -11
- package/dist/gguf/parser/parseGguf.js.map +1 -1
- package/dist/gguf/readGgufFileInfo.js +8 -3
- package/dist/gguf/readGgufFileInfo.js.map +1 -1
- package/dist/gguf/types/GgufFileInfoTypes.d.ts +1 -0
- package/dist/gguf/types/GgufMetadataTypes.d.ts +9 -9
- package/dist/gguf/types/GgufMetadataTypes.js +1 -1
- package/dist/gguf/types/GgufMetadataTypes.js.map +1 -1
- package/dist/gguf/types/GgufTensorInfoTypes.d.ts +13 -0
- package/dist/gguf/types/GgufTensorInfoTypes.js.map +1 -1
- package/dist/index.d.ts +7 -2
- package/dist/index.js +6 -1
- package/dist/index.js.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/dist/utils/LlamaText.d.ts +4 -1
- package/dist/utils/LlamaText.js +4 -1
- package/dist/utils/LlamaText.js.map +1 -1
- package/dist/utils/cmake.js +23 -0
- package/dist/utils/cmake.js.map +1 -1
- package/dist/utils/pushAll.d.ts +1 -1
- package/dist/utils/pushAll.js.map +1 -1
- package/dist/utils/tokenizerUtils.js +1 -1
- package/dist/utils/utilTypes.d.ts +5 -0
- package/llama/CMakeLists.txt +25 -8
- package/llama/addon/AddonContext.cpp +188 -16
- package/llama/addon/AddonContext.h +1 -0
- package/llama/addon/AddonGrammar.cpp +1 -4
- package/llama/addon/AddonGrammarEvaluationState.cpp +16 -5
- package/llama/addon/AddonModel.cpp +11 -15
- package/llama/addon/AddonModel.h +0 -1
- package/llama/addon/AddonSampler.cpp +1 -6
- package/llama/addon/addon.cpp +26 -7
- package/llama/addon/globals/getGpuInfo.cpp +30 -5
- package/llama/addon/globals/getGpuInfo.h +6 -1
- package/llama/addon/globals/getMemoryInfo.cpp +63 -0
- package/llama/addon/globals/getMemoryInfo.h +4 -0
- package/llama/binariesGithubRelease.json +1 -1
- package/llama/cmake/win32.ensureNinjaPath.cmake +68 -0
- package/llama/cmake/win32.ensureNodeLib.cmake +34 -0
- package/llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake +12 -0
- package/llama/cmake/win32.llvmEnsureCmakeAr.cmake +37 -0
- package/llama/cmake/win32.llvmUseGnuModeCompilers.cmake +87 -0
- package/llama/cmake/win32.programFilesPaths.cmake +31 -0
- package/llama/gitRelease.bundle +0 -0
- package/llama/gpuInfo/vulkan-gpu-info.cpp +29 -2
- package/llama/gpuInfo/vulkan-gpu-info.h +1 -0
- package/llama/llama.cpp.info.json +1 -1
- package/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake +14 -0
- package/llama/profiles/llvm.win32.host-x64.target-arm64.cmake +14 -0
- package/llama/profiles/llvm.win32.host-x64.target-x64.cmake +14 -0
- package/llama/toolchains/llvm.win32.host-x64.target-x64.cmake +20 -0
- package/llama/toolchains/win32.host-arm64.target-arm64.cmake +21 -0
- package/llama/toolchains/win32.host-x64.target-arm64.cmake +14 -34
- package/package.json +43 -43
- package/templates/packed/electron-typescript-react.json +1 -1
- package/templates/packed/node-typescript.json +1 -1
|
@@ -1,9 +1,12 @@
|
|
|
1
|
-
import { AsyncDisposeAggregator, DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
|
|
1
|
+
import { acquireLock, AsyncDisposeAggregator, DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
|
|
2
2
|
import { removeNullFields } from "../../utils/removeNullFields.js";
|
|
3
3
|
import { compareTokens } from "../../utils/compareTokens.js";
|
|
4
4
|
import { DisposeGuard } from "../../utils/DisposeGuard.js";
|
|
5
5
|
import { TokenMeter } from "../TokenMeter.js";
|
|
6
6
|
import { UnsupportedError } from "../../utils/UnsupportedError.js";
|
|
7
|
+
import { pushAll } from "../../utils/pushAll.js";
|
|
8
|
+
import { safeEventCallback } from "../../utils/safeEventCallback.js";
|
|
9
|
+
import { GgufArchitectureType } from "../../gguf/types/GgufMetadataTypes.js";
|
|
7
10
|
import { resolveBatchItemsPrioritizationStrategy } from "./utils/resolveBatchItemsPrioritizationStrategy.js";
|
|
8
11
|
import { LlamaSampler } from "./LlamaSampler.js";
|
|
9
12
|
const defaultLoraScale = 1;
|
|
@@ -13,6 +16,7 @@ const defaultFailedCreationRemedy = {
|
|
|
13
16
|
retries: 6,
|
|
14
17
|
autoContextSizeShrink: 0.16
|
|
15
18
|
};
|
|
19
|
+
const defaultEvaluationPriority = 5;
|
|
16
20
|
export class LlamaContext {
|
|
17
21
|
/** @internal */ _llama;
|
|
18
22
|
/** @internal */ _ctx;
|
|
@@ -43,7 +47,7 @@ export class LlamaContext {
|
|
|
43
47
|
/** @internal */ _allocatedContextSize;
|
|
44
48
|
/** @internal */ _disposed = false;
|
|
45
49
|
onDispose = new EventRelay();
|
|
46
|
-
constructor({ _model }, { sequences, contextSize, batchSize, flashAttention = _model.defaultContextFlashAttention, threads, batching: { dispatchSchedule: batchingDispatchSchedule = "
|
|
50
|
+
constructor({ _model }, { sequences, contextSize, batchSize, flashAttention = _model.defaultContextFlashAttention, threads, batching: { dispatchSchedule: batchingDispatchSchedule = "nextCycle", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, performanceTracking = false, _embeddings, _ranking }) {
|
|
47
51
|
if (_model.disposed)
|
|
48
52
|
throw new DisposedError();
|
|
49
53
|
this._llama = _model._llama;
|
|
@@ -70,6 +74,7 @@ export class LlamaContext {
|
|
|
70
74
|
flashAttention: this._flashAttention,
|
|
71
75
|
threads: this._idealThreads,
|
|
72
76
|
embeddings: _embeddings,
|
|
77
|
+
ranking: _ranking,
|
|
73
78
|
performanceTracking: this._performanceTracking
|
|
74
79
|
}));
|
|
75
80
|
this._batchingOptions = {
|
|
@@ -163,7 +168,7 @@ export class LlamaContext {
|
|
|
163
168
|
* When there are no sequences left, this method will throw an error.
|
|
164
169
|
*/
|
|
165
170
|
getSequence(options = {}) {
|
|
166
|
-
const { contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, _tokenMeter } = options;
|
|
171
|
+
const { contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, tokenPredictor, _tokenMeter } = options;
|
|
167
172
|
this._ensureNotDisposed();
|
|
168
173
|
const nextSequenceId = this._popSequenceId();
|
|
169
174
|
if (nextSequenceId == null)
|
|
@@ -175,7 +180,8 @@ export class LlamaContext {
|
|
|
175
180
|
contextShift: {
|
|
176
181
|
size: contextShiftSize,
|
|
177
182
|
strategy: contextShiftStrategy
|
|
178
|
-
}
|
|
183
|
+
},
|
|
184
|
+
tokenPredictor
|
|
179
185
|
});
|
|
180
186
|
}
|
|
181
187
|
dispatchPendingBatch() {
|
|
@@ -189,6 +195,7 @@ export class LlamaContext {
|
|
|
189
195
|
this._dispatchDecodeScheduled = false;
|
|
190
196
|
this._batchDispatchPending = false;
|
|
191
197
|
let shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
|
|
198
|
+
const queuedDecodeToMappedLogits = new Map();
|
|
192
199
|
const resolvePrioritizationStrategy = () => {
|
|
193
200
|
try {
|
|
194
201
|
this._ensureNotDisposed();
|
|
@@ -205,6 +212,7 @@ export class LlamaContext {
|
|
|
205
212
|
for (const queuedDecode of this._queuedDecodes) {
|
|
206
213
|
const batchItem = {
|
|
207
214
|
tokens: queuedDecode.tokens,
|
|
215
|
+
logits: queuedDecode.logits,
|
|
208
216
|
evaluationPriority: queuedDecode.evaluationPriority
|
|
209
217
|
};
|
|
210
218
|
batchItemToQueuedDecodeMap.set(batchItem, queuedDecode);
|
|
@@ -262,15 +270,16 @@ export class LlamaContext {
|
|
|
262
270
|
if (currentBatchSize !== 0)
|
|
263
271
|
this._ctx.initBatch(currentBatchSize);
|
|
264
272
|
for (const { queuedDecode, processAmount } of batchItems) {
|
|
265
|
-
let
|
|
273
|
+
let batchLogitIndexes;
|
|
274
|
+
const tokensToProcess = queuedDecode.tokens.slice(0, processAmount);
|
|
275
|
+
const tokenIndexesWithLogitsToProcess = queuedDecode.logits.slice(0, processAmount)
|
|
276
|
+
.map((logit, index) => (logit ? index : undefined))
|
|
277
|
+
.filter((index) => index != undefined);
|
|
278
|
+
const numberOfOutputTokens = tokenIndexesWithLogitsToProcess.length;
|
|
279
|
+
TokenMeter.useTokens(queuedDecode.tokenMeter, Math.max(0, tokensToProcess.length - numberOfOutputTokens), "input");
|
|
280
|
+
TokenMeter.useTokens(queuedDecode.tokenMeter, numberOfOutputTokens, "output");
|
|
266
281
|
try {
|
|
267
|
-
|
|
268
|
-
processAmount === queuedDecode.tokens.length;
|
|
269
|
-
const tokensToProcess = queuedDecode.tokens.slice(0, processAmount);
|
|
270
|
-
const numberOfOutputTokens = shouldGenerateLogitAtTheEnd ? 1 : 0;
|
|
271
|
-
TokenMeter.useTokens(queuedDecode.tokenMeter, Math.max(0, tokensToProcess.length - numberOfOutputTokens), "input");
|
|
272
|
-
TokenMeter.useTokens(queuedDecode.tokenMeter, numberOfOutputTokens, "output");
|
|
273
|
-
batchLogitIndex = this._ctx.addToBatch(queuedDecode.sequenceId, queuedDecode.firstTokenSequenceIndex, Uint32Array.from(tokensToProcess), shouldGenerateLogitAtTheEnd);
|
|
282
|
+
batchLogitIndexes = this._ctx.addToBatch(queuedDecode.sequenceId, queuedDecode.firstTokenSequenceIndex, Uint32Array.from(tokensToProcess), Uint32Array.from(tokenIndexesWithLogitsToProcess));
|
|
274
283
|
}
|
|
275
284
|
catch (err) {
|
|
276
285
|
this._dispatchErrorForQueuedDecodesAndDequeue(new Set([queuedDecode]), err);
|
|
@@ -280,13 +289,23 @@ export class LlamaContext {
|
|
|
280
289
|
if (queuedDecode.tokens.length === processAmount) {
|
|
281
290
|
queuedDecodesToDelete.add(queuedDecode);
|
|
282
291
|
afterDecodeActions.push({
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
292
|
+
queuedDecode,
|
|
293
|
+
batchLogitIndexes,
|
|
294
|
+
batchLogitTokenIndexes: tokenIndexesWithLogitsToProcess,
|
|
295
|
+
firstTokenIndex: queuedDecode.firstTokenSequenceIndex,
|
|
296
|
+
returnResults: true
|
|
286
297
|
});
|
|
287
298
|
}
|
|
288
299
|
else {
|
|
300
|
+
if (batchLogitIndexes.length > 0)
|
|
301
|
+
afterDecodeActions.push({
|
|
302
|
+
queuedDecode,
|
|
303
|
+
batchLogitIndexes,
|
|
304
|
+
batchLogitTokenIndexes: tokenIndexesWithLogitsToProcess,
|
|
305
|
+
firstTokenIndex: queuedDecode.firstTokenSequenceIndex
|
|
306
|
+
});
|
|
289
307
|
queuedDecode.tokens = queuedDecode.tokens.slice(processAmount);
|
|
308
|
+
queuedDecode.logits = queuedDecode.logits.slice(processAmount);
|
|
290
309
|
queuedDecode.firstTokenSequenceIndex += processAmount;
|
|
291
310
|
}
|
|
292
311
|
}
|
|
@@ -315,18 +334,50 @@ export class LlamaContext {
|
|
|
315
334
|
return;
|
|
316
335
|
}
|
|
317
336
|
}
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
337
|
+
function finishAfterDecodeAction(action, mappedLogitValues) {
|
|
338
|
+
if (mappedLogitValues != null && mappedLogitValues.length > 0) {
|
|
339
|
+
if (queuedDecodeToMappedLogits.has(action.queuedDecode))
|
|
340
|
+
pushAll(queuedDecodeToMappedLogits.get(action.queuedDecode), mappedLogitValues);
|
|
341
|
+
else
|
|
342
|
+
queuedDecodeToMappedLogits.set(action.queuedDecode, mappedLogitValues);
|
|
343
|
+
}
|
|
344
|
+
if (action.returnResults != null) {
|
|
345
|
+
const [accept] = action.queuedDecode.response;
|
|
346
|
+
const mappedLogits = queuedDecodeToMappedLogits.get(action.queuedDecode) ?? [];
|
|
347
|
+
queuedDecodeToMappedLogits.delete(action.queuedDecode);
|
|
348
|
+
accept(mappedLogits);
|
|
327
349
|
}
|
|
328
|
-
accept(undefined);
|
|
329
350
|
}
|
|
351
|
+
const afterDecodeActionResults = afterDecodeActions.map((action) => {
|
|
352
|
+
if (action.batchLogitIndexes.length === 0) {
|
|
353
|
+
finishAfterDecodeAction(action);
|
|
354
|
+
return undefined;
|
|
355
|
+
}
|
|
356
|
+
const mappedLogitValues = [];
|
|
357
|
+
let promiseChain = undefined;
|
|
358
|
+
const batchLogitIndexes = action.batchLogitIndexes;
|
|
359
|
+
const batchLogitTokenIndexes = action.batchLogitTokenIndexes;
|
|
360
|
+
for (let i = 0; i < batchLogitIndexes.length; i++) {
|
|
361
|
+
const tokenIndex = batchLogitTokenIndexes[i];
|
|
362
|
+
const mappedValue = promiseChain != null
|
|
363
|
+
? promiseChain
|
|
364
|
+
.then(() => action.queuedDecode.logitDataMapper(batchLogitIndexes[i], tokenIndex + action.firstTokenIndex))
|
|
365
|
+
: action.queuedDecode.logitDataMapper(batchLogitIndexes[i], tokenIndex + action.firstTokenIndex);
|
|
366
|
+
if (mappedValue instanceof Promise) {
|
|
367
|
+
promiseChain = mappedValue;
|
|
368
|
+
mappedLogitValues.push(mappedValue
|
|
369
|
+
.then((value) => [tokenIndex + action.firstTokenIndex, value]));
|
|
370
|
+
}
|
|
371
|
+
else
|
|
372
|
+
mappedLogitValues.push([tokenIndex + action.firstTokenIndex, mappedValue]);
|
|
373
|
+
}
|
|
374
|
+
if (promiseChain != null)
|
|
375
|
+
return Promise.all(mappedLogitValues)
|
|
376
|
+
.then((resolvedMappedLogitValues) => finishAfterDecodeAction(action, resolvedMappedLogitValues));
|
|
377
|
+
finishAfterDecodeAction(action, mappedLogitValues);
|
|
378
|
+
return undefined;
|
|
379
|
+
});
|
|
380
|
+
await Promise.all(afterDecodeActionResults);
|
|
330
381
|
};
|
|
331
382
|
const prioritizationStrategy = resolvePrioritizationStrategy();
|
|
332
383
|
if (prioritizationStrategy == null)
|
|
@@ -376,17 +427,17 @@ export class LlamaContext {
|
|
|
376
427
|
await new Promise((accept) => setTimeout(accept, 0)); // wait for the logs to finish printing
|
|
377
428
|
}
|
|
378
429
|
/** @internal */
|
|
379
|
-
async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens,
|
|
430
|
+
async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, logits, evaluationPriority = defaultEvaluationPriority, tokenMeter }, logitDataMapper) {
|
|
380
431
|
return await new Promise((accept, reject) => {
|
|
381
432
|
this._queuedDecodes.push({
|
|
382
433
|
sequenceId,
|
|
383
434
|
tokens,
|
|
435
|
+
logits,
|
|
384
436
|
firstTokenSequenceIndex,
|
|
385
|
-
generateLogitAtTheEnd,
|
|
386
437
|
evaluationPriority,
|
|
387
438
|
tokenMeter,
|
|
388
439
|
response: [accept, reject],
|
|
389
|
-
|
|
440
|
+
logitDataMapper
|
|
390
441
|
});
|
|
391
442
|
this._queuedDecodeSequenceIds.add(sequenceId);
|
|
392
443
|
this._scheduleDecode();
|
|
@@ -429,10 +480,20 @@ export class LlamaContext {
|
|
|
429
480
|
const dispatchSchedule = this._batchingOptions.dispatchSchedule;
|
|
430
481
|
if (this._queuedDecodeSequenceIds.size === this._totalSequences)
|
|
431
482
|
dispatch();
|
|
432
|
-
if (dispatchSchedule === "
|
|
433
|
-
|
|
434
|
-
|
|
483
|
+
if (dispatchSchedule === "nextCycle") {
|
|
484
|
+
if (typeof setImmediate === "function")
|
|
485
|
+
setImmediate(dispatch);
|
|
486
|
+
else
|
|
487
|
+
setTimeout(dispatch, 0);
|
|
488
|
+
}
|
|
489
|
+
else if (typeof dispatchSchedule === "function")
|
|
435
490
|
dispatchSchedule(dispatch);
|
|
491
|
+
else {
|
|
492
|
+
if (typeof setImmediate === "function")
|
|
493
|
+
setImmediate(dispatch);
|
|
494
|
+
else
|
|
495
|
+
setTimeout(dispatch, 0);
|
|
496
|
+
}
|
|
436
497
|
}
|
|
437
498
|
/** @internal */
|
|
438
499
|
_dispatchErrorForQueuedDecodesAndDequeue(queuedDecodes, err) {
|
|
@@ -620,17 +681,27 @@ export class LlamaContextSequence {
|
|
|
620
681
|
/** @internal */ _gcRegistry;
|
|
621
682
|
/** @internal */ _context;
|
|
622
683
|
/** @internal */ _contextShift;
|
|
684
|
+
/** @internal */ _tokenPredictor;
|
|
623
685
|
/** @internal */ _tokenMeter;
|
|
624
686
|
/** @internal */ _disposeAggregator = new DisposeAggregator();
|
|
687
|
+
/** @internal */ _lock = {};
|
|
688
|
+
/** @internal */ _resetTokenPredictor = false;
|
|
689
|
+
/** @internal */ _tokenPredictorOwner = {};
|
|
625
690
|
/** @internal */ _contextTokens = [];
|
|
626
691
|
/** @internal */ _nextTokenIndex = 0;
|
|
692
|
+
/** @internal */ _loadedTokenPredictions = [];
|
|
693
|
+
/** @internal */ _usedTokenPredictions = 0;
|
|
694
|
+
/** @internal */ _unusedTokenPredictions = 0;
|
|
695
|
+
/** @internal */ _validatedTokenPredictions = 0;
|
|
696
|
+
/** @internal */ _refutedTokenPredictions = 0;
|
|
627
697
|
/** @internal */ _disposed = false;
|
|
628
698
|
onDispose = new EventRelay();
|
|
629
|
-
constructor({ sequenceId, context, tokenMeter, contextShift }) {
|
|
699
|
+
constructor({ sequenceId, context, tokenMeter, contextShift, tokenPredictor }) {
|
|
630
700
|
this._sequenceId = sequenceId;
|
|
631
701
|
this._context = context;
|
|
632
702
|
this._tokenMeter = tokenMeter ?? new TokenMeter();
|
|
633
703
|
this._contextShift = contextShift;
|
|
704
|
+
this._tokenPredictor = tokenPredictor;
|
|
634
705
|
this._gcRegistry = new FinalizationRegistry(this._context._reclaimUnusedSequenceId);
|
|
635
706
|
this._gcRegistry.register(this, sequenceId);
|
|
636
707
|
this._disposeAggregator.add(() => this._gcRegistry.unregister(this));
|
|
@@ -639,6 +710,8 @@ export class LlamaContextSequence {
|
|
|
639
710
|
this._disposeAggregator.add(() => {
|
|
640
711
|
this._context._reclaimUnusedSequenceId(this._sequenceId);
|
|
641
712
|
});
|
|
713
|
+
if (this._tokenPredictor != null)
|
|
714
|
+
this._disposeAggregator.add(this._tokenPredictor);
|
|
642
715
|
}
|
|
643
716
|
dispose() {
|
|
644
717
|
if (this._disposed)
|
|
@@ -660,20 +733,51 @@ export class LlamaContextSequence {
|
|
|
660
733
|
get model() {
|
|
661
734
|
return this._context.model;
|
|
662
735
|
}
|
|
736
|
+
/** The maximum number of tokens that the sequence state can hold */
|
|
737
|
+
get contextSize() {
|
|
738
|
+
return this._context.contextSize;
|
|
739
|
+
}
|
|
740
|
+
/** The index where the next evaluated token will be placed in the context */
|
|
663
741
|
get nextTokenIndex() {
|
|
664
|
-
return this._nextTokenIndex;
|
|
742
|
+
return this._nextTokenIndex - this._loadedTokenPredictions.length;
|
|
665
743
|
}
|
|
744
|
+
/** The current context state tokens */
|
|
666
745
|
get contextTokens() {
|
|
667
|
-
|
|
746
|
+
if (this._loadedTokenPredictions.length === 0)
|
|
747
|
+
return this._contextTokens.slice();
|
|
748
|
+
return this._contextTokens.slice(0, -this._loadedTokenPredictions.length);
|
|
668
749
|
}
|
|
669
750
|
get tokenMeter() {
|
|
670
751
|
return this._tokenMeter;
|
|
671
752
|
}
|
|
753
|
+
/**
|
|
754
|
+
* The token predictor used when creating this sequence.
|
|
755
|
+
*/
|
|
756
|
+
get tokenPredictor() {
|
|
757
|
+
return this._tokenPredictor;
|
|
758
|
+
}
|
|
759
|
+
/**
|
|
760
|
+
* Statistics of token predictions using the sequence's `tokenPredictor`.
|
|
761
|
+
*
|
|
762
|
+
* The statistics change only when token prediction is used in this sequence.
|
|
763
|
+
*
|
|
764
|
+
* `validated` + `refuted` = total number of evaluated predictions.
|
|
765
|
+
*
|
|
766
|
+
* Prefer using `validated` and `refuted` to evaluate the effectiveness of token prediction.
|
|
767
|
+
*/
|
|
768
|
+
get tokenPredictions() {
|
|
769
|
+
return {
|
|
770
|
+
used: this._usedTokenPredictions,
|
|
771
|
+
unused: this._unusedTokenPredictions,
|
|
772
|
+
validated: this._validatedTokenPredictions,
|
|
773
|
+
refuted: this._refutedTokenPredictions
|
|
774
|
+
};
|
|
775
|
+
}
|
|
672
776
|
get isLoadedToMemory() {
|
|
673
777
|
return !this._disposed;
|
|
674
778
|
}
|
|
675
779
|
compareContextTokens(tokens) {
|
|
676
|
-
for (let i = 0; i < this._contextTokens.length; i++) {
|
|
780
|
+
for (let i = 0; i < this._contextTokens.length - this._loadedTokenPredictions.length; i++) {
|
|
677
781
|
if (compareTokens(this._contextTokens[i], tokens[i]))
|
|
678
782
|
continue;
|
|
679
783
|
return {
|
|
@@ -681,7 +785,7 @@ export class LlamaContextSequence {
|
|
|
681
785
|
};
|
|
682
786
|
}
|
|
683
787
|
return {
|
|
684
|
-
firstDifferentIndex: this._contextTokens.length
|
|
788
|
+
firstDifferentIndex: this._contextTokens.length - this._loadedTokenPredictions.length
|
|
685
789
|
};
|
|
686
790
|
}
|
|
687
791
|
/**
|
|
@@ -695,10 +799,12 @@ export class LlamaContextSequence {
|
|
|
695
799
|
* which incurs token evaluation of the shifted tokens.
|
|
696
800
|
*/
|
|
697
801
|
async adaptStateToTokens(tokens, allowShift = true) {
|
|
698
|
-
|
|
802
|
+
const modelSupportsShifting = !this.model.fileInsights.isRecurrent &&
|
|
803
|
+
this.model.fileInfo.metadata?.general?.architecture !== GgufArchitectureType.deepseek2;
|
|
804
|
+
if (!modelSupportsShifting || !allowShift) {
|
|
699
805
|
const { firstDifferentIndex } = this.compareContextTokens(tokens);
|
|
700
|
-
if (firstDifferentIndex < this.
|
|
701
|
-
await this.
|
|
806
|
+
if (firstDifferentIndex < this.nextTokenIndex)
|
|
807
|
+
await this._eraseContextTokenRanges([{
|
|
702
808
|
start: firstDifferentIndex,
|
|
703
809
|
end: this._nextTokenIndex
|
|
704
810
|
}]);
|
|
@@ -707,7 +813,7 @@ export class LlamaContextSequence {
|
|
|
707
813
|
const eraseRanges = [];
|
|
708
814
|
let tokensIndex = 0;
|
|
709
815
|
let differentTokenIndex = undefined;
|
|
710
|
-
for (let i = 0; i < this._contextTokens.length && tokensIndex < tokens.length; i++) {
|
|
816
|
+
for (let i = 0; i < this._contextTokens.length - this._loadedTokenPredictions.length && tokensIndex < tokens.length; i++) {
|
|
711
817
|
if (compareTokens(this._contextTokens[i], tokens[tokensIndex])) {
|
|
712
818
|
if (differentTokenIndex != null) {
|
|
713
819
|
eraseRanges.push({
|
|
@@ -728,7 +834,7 @@ export class LlamaContextSequence {
|
|
|
728
834
|
end: this._nextTokenIndex
|
|
729
835
|
});
|
|
730
836
|
if (eraseRanges.length > 0)
|
|
731
|
-
await this.
|
|
837
|
+
await this._eraseContextTokenRanges(eraseRanges);
|
|
732
838
|
}
|
|
733
839
|
/**
|
|
734
840
|
* Clear the history of the sequence.
|
|
@@ -736,14 +842,18 @@ export class LlamaContextSequence {
|
|
|
736
842
|
*/
|
|
737
843
|
async clearHistory() {
|
|
738
844
|
this._ensureNotDisposed();
|
|
739
|
-
await this.
|
|
845
|
+
await this._eraseContextTokenRanges([{ start: 0, end: this._nextTokenIndex }]);
|
|
740
846
|
}
|
|
741
847
|
/**
|
|
742
848
|
* Erase context tokens in the provided ranges to free up space for new tokens to be generated.
|
|
743
849
|
* The start of each range is inclusive, and the end of each range is exclusive.
|
|
744
850
|
* For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
|
|
745
851
|
*/
|
|
746
|
-
|
|
852
|
+
eraseContextTokenRanges(ranges) {
|
|
853
|
+
return this._eraseContextTokenRanges(ranges);
|
|
854
|
+
}
|
|
855
|
+
/** @internal */
|
|
856
|
+
async _eraseContextTokenRanges(ranges, { canResetTokenPredictor = true, canRemovePredictionTokens = true, skipLock = false } = {}) {
|
|
747
857
|
this._ensureNotDisposed();
|
|
748
858
|
await withLock(this._context, "context", async () => {
|
|
749
859
|
this._ensureNotDisposed();
|
|
@@ -776,6 +886,19 @@ export class LlamaContextSequence {
|
|
|
776
886
|
ranges.push(range);
|
|
777
887
|
return ranges;
|
|
778
888
|
}, []);
|
|
889
|
+
const tokenPredictionsToRemove = (resolvedRanges.length > 0 && canRemovePredictionTokens)
|
|
890
|
+
? this._loadedTokenPredictions.length
|
|
891
|
+
: 0;
|
|
892
|
+
if (tokenPredictionsToRemove > 0) {
|
|
893
|
+
const startDeleteIndex = this._nextTokenIndex - this._loadedTokenPredictions.length;
|
|
894
|
+
const lastDeleteRange = resolvedRanges[resolvedRanges.length - 1];
|
|
895
|
+
if (lastDeleteRange.end >= startDeleteIndex)
|
|
896
|
+
lastDeleteRange.end = this._nextTokenIndex;
|
|
897
|
+
else
|
|
898
|
+
resolvedRanges.push({ start: startDeleteIndex, end: this._nextTokenIndex });
|
|
899
|
+
if (canResetTokenPredictor)
|
|
900
|
+
await this._abortTokenPredictor(true);
|
|
901
|
+
}
|
|
779
902
|
let removedTokens = 0;
|
|
780
903
|
let lastDeleteRangeEndPos = null;
|
|
781
904
|
for (const range of resolvedRanges) {
|
|
@@ -790,6 +913,8 @@ export class LlamaContextSequence {
|
|
|
790
913
|
removedTokens += range.end - range.start;
|
|
791
914
|
lastDeleteRangeEndPos = range.end;
|
|
792
915
|
}
|
|
916
|
+
if (tokenPredictionsToRemove > 0)
|
|
917
|
+
this._loadedTokenPredictions.splice(0, tokenPredictionsToRemove);
|
|
793
918
|
if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 &&
|
|
794
919
|
lastDeleteRangeEndPos !== this._nextTokenIndex) {
|
|
795
920
|
this._context._ctx.shiftSequenceTokenCells(this._sequenceId, lastDeleteRangeEndPos, this._nextTokenIndex, -removedTokens);
|
|
@@ -797,17 +922,62 @@ export class LlamaContextSequence {
|
|
|
797
922
|
this._tokenMeter.useTokens(shiftedTokens, "input");
|
|
798
923
|
}
|
|
799
924
|
this._nextTokenIndex -= removedTokens;
|
|
925
|
+
if (canResetTokenPredictor && removedTokens > 0)
|
|
926
|
+
await this._abortTokenPredictor(true);
|
|
800
927
|
if (deletionSuccessful)
|
|
801
928
|
return;
|
|
802
929
|
const newSequenceTokens = this._contextTokens.slice();
|
|
803
930
|
this._nextTokenIndex = 0;
|
|
804
931
|
this._context._ctx.disposeSequence(this._sequenceId);
|
|
805
|
-
await this.evaluateWithoutGeneratingNewTokens(newSequenceTokens);
|
|
932
|
+
await this.evaluateWithoutGeneratingNewTokens(newSequenceTokens, { _skipLock: skipLock });
|
|
806
933
|
});
|
|
807
934
|
}
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
935
|
+
/**
|
|
936
|
+
* Evaluate the provided tokens into the context sequence, and continue generating new tokens on iterator iterations.
|
|
937
|
+
*
|
|
938
|
+
* This method uses the token predictor (when provided) to generate new tokens faster.
|
|
939
|
+
*/
|
|
940
|
+
async *evaluate(tokens, options = {}) {
|
|
941
|
+
const iterator = this.evaluateWithMetadata(tokens, {}, options);
|
|
942
|
+
let iterateInput = undefined;
|
|
943
|
+
try {
|
|
944
|
+
while (true) {
|
|
945
|
+
const { value, done } = await iterator.next(iterateInput);
|
|
946
|
+
if (done)
|
|
947
|
+
return;
|
|
948
|
+
iterateInput = yield value.token;
|
|
949
|
+
}
|
|
950
|
+
}
|
|
951
|
+
finally {
|
|
952
|
+
await iterator.return();
|
|
953
|
+
}
|
|
954
|
+
}
|
|
955
|
+
/**
|
|
956
|
+
* Like {@link evaluate `.evaluate(...)`}, but with additional metadata for each generated token.
|
|
957
|
+
*
|
|
958
|
+
* Configure the additional metadata options to choose which metadata to include.
|
|
959
|
+
*/
|
|
960
|
+
evaluateWithMetadata(tokens, metadata, options = {}) {
|
|
961
|
+
const { temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = defaultEvaluationPriority, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, yieldEogToken = false, _noSampling = false } = options;
|
|
962
|
+
if (this._tokenPredictor != null && !_noSampling && tokens.length > 0)
|
|
963
|
+
return this._speculativeEvaluate(tokens, metadata, {
|
|
964
|
+
temperature,
|
|
965
|
+
minP,
|
|
966
|
+
topK,
|
|
967
|
+
topP,
|
|
968
|
+
seed,
|
|
969
|
+
grammarEvaluationState,
|
|
970
|
+
repeatPenalty,
|
|
971
|
+
tokenBias,
|
|
972
|
+
evaluationPriority,
|
|
973
|
+
contextShiftOptions: {
|
|
974
|
+
size: contextShiftSize,
|
|
975
|
+
strategy: contextShiftStrategy
|
|
976
|
+
},
|
|
977
|
+
yieldEogToken,
|
|
978
|
+
tokenPredictor: this._tokenPredictor
|
|
979
|
+
});
|
|
980
|
+
return this._evaluate(tokens, metadata, {
|
|
811
981
|
temperature,
|
|
812
982
|
minP,
|
|
813
983
|
topK,
|
|
@@ -827,82 +997,205 @@ export class LlamaContextSequence {
|
|
|
827
997
|
}
|
|
828
998
|
/**
|
|
829
999
|
* Evaluate the provided tokens into the context sequence without generating new tokens.
|
|
830
|
-
* @param tokens
|
|
831
|
-
* @param [options]
|
|
832
1000
|
*/
|
|
833
|
-
async evaluateWithoutGeneratingNewTokens(tokens,
|
|
834
|
-
const
|
|
1001
|
+
async evaluateWithoutGeneratingNewTokens(tokens, options = {}) {
|
|
1002
|
+
const { evaluationPriority = defaultEvaluationPriority, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, _skipLock = false } = options;
|
|
1003
|
+
const iterator = this._evaluate(tokens, {}, {
|
|
835
1004
|
generateNewTokens: false,
|
|
836
1005
|
evaluationPriority,
|
|
837
1006
|
contextShiftOptions: {
|
|
838
1007
|
size: contextShiftSize,
|
|
839
1008
|
strategy: contextShiftStrategy
|
|
840
|
-
}
|
|
1009
|
+
},
|
|
1010
|
+
_skipLock
|
|
841
1011
|
});
|
|
1012
|
+
const predictorAlignmentPromise = this.tokenPredictor == null
|
|
1013
|
+
? undefined
|
|
1014
|
+
: this._tokenPredictor?.reset({
|
|
1015
|
+
stateTokens: [...this._contextTokens, ...tokens],
|
|
1016
|
+
evaluateOptions: {
|
|
1017
|
+
evaluationPriority,
|
|
1018
|
+
contextShift: {
|
|
1019
|
+
size: contextShiftSize,
|
|
1020
|
+
strategy: contextShiftStrategy
|
|
1021
|
+
}
|
|
1022
|
+
},
|
|
1023
|
+
targetSequence: this
|
|
1024
|
+
});
|
|
1025
|
+
if (predictorAlignmentPromise != null) {
|
|
1026
|
+
this._tokenPredictorOwner = {};
|
|
1027
|
+
this._resetTokenPredictor = false;
|
|
1028
|
+
}
|
|
842
1029
|
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
843
1030
|
for await (const token of iterator) {
|
|
844
1031
|
// Array.from doesn't work with async generators, so we have to iterate over the generator
|
|
845
1032
|
}
|
|
1033
|
+
await iterator.return();
|
|
1034
|
+
if (predictorAlignmentPromise != null)
|
|
1035
|
+
await predictorAlignmentPromise;
|
|
1036
|
+
}
|
|
1037
|
+
/**
|
|
1038
|
+
* Evaluate the provided tokens into the context sequence with custom options for each token.
|
|
1039
|
+
*
|
|
1040
|
+
* This method allows for more precise control of the generation process.
|
|
1041
|
+
*
|
|
1042
|
+
* A next token will be generated for a given token only if any of the `generateNext` options for it are used.
|
|
1043
|
+
*
|
|
1044
|
+
* To generate more tokens after this method finishes,
|
|
1045
|
+
* use it again with token(s) you selected to add to the context from the previous evaluation.
|
|
1046
|
+
*
|
|
1047
|
+
* This method doesn't use the token predictor (when provided) since it cannot predict which tokens are actually needed.
|
|
1048
|
+
* Use the `evaluate` method when you need to use token prediction.
|
|
1049
|
+
* @returns An array where for each token in the input array, there can be an output item at the same index in the output array.
|
|
1050
|
+
* For indexes that have no output, there won't be any value at the corresponding index in the output array.
|
|
1051
|
+
*
|
|
1052
|
+
* It's recommended to iterate from `0` up to the length of the input array to check the results in the output array.
|
|
1053
|
+
*/
|
|
1054
|
+
async controlledEvaluate(input, options) {
|
|
1055
|
+
const { evaluationPriority = defaultEvaluationPriority, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {} } = options ?? {};
|
|
1056
|
+
const contextShiftOptions = {
|
|
1057
|
+
size: contextShiftSize,
|
|
1058
|
+
strategy: contextShiftStrategy
|
|
1059
|
+
};
|
|
1060
|
+
this._ensureNotDisposed();
|
|
1061
|
+
if (input.length === 0)
|
|
1062
|
+
return [];
|
|
1063
|
+
await this._abortTokenPredictor();
|
|
1064
|
+
const sampler = new LlamaSampler(this.model);
|
|
1065
|
+
const onTokenResult = safeEventCallback(options?.onTokenResult);
|
|
1066
|
+
const logitsArray = [];
|
|
1067
|
+
const resolvedTokens = input.map((item, index) => {
|
|
1068
|
+
if (item instanceof Array) {
|
|
1069
|
+
const [token, options] = item;
|
|
1070
|
+
const generateNext = options?.generateNext ?? {};
|
|
1071
|
+
if (generateNext.probabilities === true || generateNext.confidence === true || generateNext.token === true)
|
|
1072
|
+
logitsArray[index] = true;
|
|
1073
|
+
return token;
|
|
1074
|
+
}
|
|
1075
|
+
return item;
|
|
1076
|
+
});
|
|
1077
|
+
const evaluatorLock = await acquireLock(this._lock, "evaluate");
|
|
1078
|
+
try {
|
|
1079
|
+
return await this._decodeTokens(resolvedTokens, logitsArray, evaluationPriority, this._tokenMeter, contextShiftOptions, async (batchLogitIndex, tokenIndex) => {
|
|
1080
|
+
const inputToken = input[tokenIndex];
|
|
1081
|
+
const inputOptions = inputToken instanceof Array
|
|
1082
|
+
? (inputToken[1] ?? {})
|
|
1083
|
+
: {};
|
|
1084
|
+
const generateNext = inputOptions.generateNext;
|
|
1085
|
+
if (generateNext == null || ((generateNext.probabilities == null || !generateNext.probabilities) &&
|
|
1086
|
+
(generateNext.token == null || !generateNext.token) &&
|
|
1087
|
+
(generateNext.confidence == null || !generateNext.confidence)))
|
|
1088
|
+
return undefined;
|
|
1089
|
+
const sampleOptions = generateNext.options ?? {};
|
|
1090
|
+
const samplerConfig = this._resolveSamplerConfig({
|
|
1091
|
+
temperature: sampleOptions.temperature,
|
|
1092
|
+
minP: sampleOptions.minP,
|
|
1093
|
+
topK: sampleOptions.topK,
|
|
1094
|
+
topP: sampleOptions.topP,
|
|
1095
|
+
seed: sampleOptions.seed,
|
|
1096
|
+
repeatPenalty: sampleOptions.repeatPenalty,
|
|
1097
|
+
tokenBias: sampleOptions.tokenBias
|
|
1098
|
+
});
|
|
1099
|
+
return await withLock(sampler, "sample", async () => {
|
|
1100
|
+
if (sampler.disposed)
|
|
1101
|
+
return undefined;
|
|
1102
|
+
sampler.applyConfig(samplerConfig);
|
|
1103
|
+
const [token, probabilities, confidence] = await this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler, !!generateNext.probabilities, !!generateNext.confidence);
|
|
1104
|
+
const output = {
|
|
1105
|
+
next: {}
|
|
1106
|
+
};
|
|
1107
|
+
if (generateNext.token)
|
|
1108
|
+
output.next.token = token === -1
|
|
1109
|
+
? null
|
|
1110
|
+
: (token ?? null);
|
|
1111
|
+
if (confidence != null)
|
|
1112
|
+
output.next.confidence = confidence;
|
|
1113
|
+
if (probabilities != null)
|
|
1114
|
+
output.next.probabilities = reviveTokenProbabilities(probabilities);
|
|
1115
|
+
onTokenResult?.(tokenIndex, output);
|
|
1116
|
+
return output;
|
|
1117
|
+
});
|
|
1118
|
+
});
|
|
1119
|
+
}
|
|
1120
|
+
finally {
|
|
1121
|
+
evaluatorLock.dispose();
|
|
1122
|
+
void withLock(sampler, "sample", sampler.asyncDispose);
|
|
1123
|
+
}
|
|
846
1124
|
}
|
|
847
1125
|
/** @internal */
|
|
848
|
-
async *_evaluate(tokens, { temperature
|
|
1126
|
+
async *_evaluate(tokens, metadata, { temperature, minP, topK, topP, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = defaultEvaluationPriority, generateNewTokens = true, contextShiftOptions, yieldEogToken = false, _noSampling = false, _skipLock = false }) {
|
|
849
1127
|
this._ensureNotDisposed();
|
|
850
1128
|
let evalTokens = tokens;
|
|
851
1129
|
if (evalTokens.length === 0)
|
|
852
1130
|
return;
|
|
1131
|
+
await this._abortTokenPredictor(false, true);
|
|
1132
|
+
const sampleProbabilities = metadata.probabilities === true;
|
|
1133
|
+
const sampleConfidence = metadata.confidence === true;
|
|
853
1134
|
const sampler = new LlamaSampler(this.model);
|
|
854
1135
|
try {
|
|
855
1136
|
while (true) {
|
|
856
1137
|
this._ensureNotDisposed();
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
|
|
870
|
-
const { tokenBiasKeys, tokenBiasValues } = getTokenBiasesForAddon(tokenBias, this.model);
|
|
871
|
-
sampler.applyConfig(removeNullFields({
|
|
872
|
-
temperature,
|
|
873
|
-
minP,
|
|
874
|
-
topK,
|
|
875
|
-
topP,
|
|
876
|
-
seed: Math.max(0, Number.isFinite(seed)
|
|
877
|
-
? Math.floor(seed ?? (Date.now() / 1000))
|
|
878
|
-
: Math.floor(Date.now() / 1000)),
|
|
879
|
-
repeatPenalty: repeatPenalty?.penalty,
|
|
880
|
-
repeatPenaltyMaxTokens: maxPunishTokens,
|
|
881
|
-
repeatPenaltyTokens: repeatPenaltyTokens != null
|
|
882
|
-
? Uint32Array.from(repeatPenaltyTokens)
|
|
883
|
-
: undefined,
|
|
884
|
-
repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty,
|
|
885
|
-
repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty,
|
|
886
|
-
tokenBiasKeys,
|
|
887
|
-
tokenBiasValues,
|
|
888
|
-
grammarEvaluationState: resolvedGrammarEvaluationState?._state
|
|
889
|
-
}));
|
|
890
|
-
return withLock(sampler, "sample", async () => {
|
|
891
|
-
if (sampler.disposed)
|
|
1138
|
+
const evaluatorLock = _skipLock
|
|
1139
|
+
? undefined
|
|
1140
|
+
: await acquireLock(this._lock, "evaluate");
|
|
1141
|
+
let nextToken;
|
|
1142
|
+
const yieldRes = {};
|
|
1143
|
+
try {
|
|
1144
|
+
const logitsArray = [];
|
|
1145
|
+
if (generateNewTokens)
|
|
1146
|
+
logitsArray[evalTokens.length - 1] = true;
|
|
1147
|
+
// Evaluate to get the next token.
|
|
1148
|
+
const decodeResult = await this._decodeTokens(evalTokens, logitsArray, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
|
|
1149
|
+
if (_noSampling)
|
|
892
1150
|
return null;
|
|
893
|
-
|
|
1151
|
+
const samplerConfig = this._resolveSamplerConfig({
|
|
1152
|
+
temperature,
|
|
1153
|
+
minP,
|
|
1154
|
+
topK,
|
|
1155
|
+
topP,
|
|
1156
|
+
seed,
|
|
1157
|
+
grammarEvaluationState,
|
|
1158
|
+
repeatPenalty,
|
|
1159
|
+
tokenBias
|
|
1160
|
+
});
|
|
1161
|
+
return withLock(sampler, "sample", async () => {
|
|
1162
|
+
if (sampler.disposed)
|
|
1163
|
+
return null;
|
|
1164
|
+
sampler.applyConfig(samplerConfig);
|
|
1165
|
+
if (sampleProbabilities || sampleConfidence)
|
|
1166
|
+
return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler, sampleProbabilities, sampleConfidence);
|
|
1167
|
+
else
|
|
1168
|
+
return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
|
|
1169
|
+
});
|
|
894
1170
|
});
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
1171
|
+
const lastDecodeResult = decodeResult[evalTokens.length - 1];
|
|
1172
|
+
if (lastDecodeResult instanceof Array) {
|
|
1173
|
+
const [token, probabilities, confidence] = lastDecodeResult;
|
|
1174
|
+
nextToken = token;
|
|
1175
|
+
if (probabilities != null)
|
|
1176
|
+
yieldRes.probabilities = reviveTokenProbabilities(probabilities);
|
|
1177
|
+
if (confidence != null)
|
|
1178
|
+
yieldRes.confidence = confidence;
|
|
1179
|
+
}
|
|
1180
|
+
else
|
|
1181
|
+
nextToken = lastDecodeResult;
|
|
1182
|
+
if (nextToken === -1)
|
|
1183
|
+
throw new Error("Failed to sample next token");
|
|
1184
|
+
if (nextToken == null)
|
|
1185
|
+
return;
|
|
1186
|
+
// the model finished generating text
|
|
1187
|
+
if (!yieldEogToken && this._context.model.isEogToken(nextToken))
|
|
1188
|
+
break;
|
|
1189
|
+
}
|
|
1190
|
+
finally {
|
|
1191
|
+
evaluatorLock?.dispose();
|
|
1192
|
+
}
|
|
1193
|
+
yieldRes.token = nextToken;
|
|
1194
|
+
const replacementToken = yield yieldRes;
|
|
904
1195
|
// set the tokens for the next evaluation
|
|
905
|
-
if (replacementToken
|
|
1196
|
+
if (replacementToken instanceof Array)
|
|
1197
|
+
evalTokens = replacementToken.slice();
|
|
1198
|
+
else if (replacementToken != null)
|
|
906
1199
|
evalTokens = [replacementToken];
|
|
907
1200
|
else
|
|
908
1201
|
evalTokens = [nextToken];
|
|
@@ -913,39 +1206,280 @@ export class LlamaContextSequence {
|
|
|
913
1206
|
}
|
|
914
1207
|
}
|
|
915
1208
|
/** @internal */
|
|
916
|
-
async
|
|
1209
|
+
async *_speculativeEvaluate(tokens, metadata, { temperature, minP, topK, topP, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = defaultEvaluationPriority, contextShiftOptions, yieldEogToken = false, tokenPredictor }) {
|
|
917
1210
|
this._ensureNotDisposed();
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
1211
|
+
let evalTokens = tokens.slice();
|
|
1212
|
+
if (evalTokens.length === 0)
|
|
1213
|
+
return;
|
|
1214
|
+
const tokenPredictorOwner = {};
|
|
1215
|
+
this._tokenPredictorOwner = tokenPredictorOwner;
|
|
1216
|
+
await this._abortTokenPredictor();
|
|
1217
|
+
const sampleProbabilities = metadata.probabilities === true;
|
|
1218
|
+
const sampleConfidence = metadata.confidence === true;
|
|
1219
|
+
let logitsArray = [];
|
|
1220
|
+
let logitsStartIndex = evalTokens.length - 1;
|
|
1221
|
+
const validatedTokens = [];
|
|
1222
|
+
logitsArray[logitsStartIndex] = true;
|
|
1223
|
+
const sampler = new LlamaSampler(this.model);
|
|
1224
|
+
try {
|
|
1225
|
+
while (true) {
|
|
921
1226
|
this._ensureNotDisposed();
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
if (
|
|
927
|
-
|
|
1227
|
+
const evaluatorLock = await acquireLock(this._lock, "evaluate");
|
|
1228
|
+
let nextToken;
|
|
1229
|
+
const yieldRes = {};
|
|
1230
|
+
try {
|
|
1231
|
+
if (this._tokenPredictorOwner === tokenPredictorOwner &&
|
|
1232
|
+
this._loadedTokenPredictions.length > 0 &&
|
|
1233
|
+
evalTokens.length === 1 &&
|
|
1234
|
+
evalTokens[0] === this._loadedTokenPredictions[0]?.[0]) {
|
|
1235
|
+
const [token, probabilities, confidence] = this._loadedTokenPredictions.shift()[1];
|
|
1236
|
+
nextToken = token;
|
|
1237
|
+
yieldRes.token = nextToken;
|
|
1238
|
+
if (probabilities != null)
|
|
1239
|
+
yieldRes.probabilities = reviveTokenProbabilities(probabilities);
|
|
1240
|
+
if (confidence != null)
|
|
1241
|
+
yieldRes.confidence = confidence;
|
|
1242
|
+
const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
|
|
1243
|
+
? grammarEvaluationState()
|
|
1244
|
+
: grammarEvaluationState;
|
|
1245
|
+
if (resolvedGrammarEvaluationState != null)
|
|
1246
|
+
LlamaSampler._acceptTokenOnGrammarEvaluationState(this._context._llama, resolvedGrammarEvaluationState, nextToken);
|
|
1247
|
+
this._unusedTokenPredictions--;
|
|
1248
|
+
this._usedTokenPredictions++;
|
|
1249
|
+
}
|
|
1250
|
+
else if (this._tokenPredictorOwner === tokenPredictorOwner && this._loadedTokenPredictions.length > 0) {
|
|
1251
|
+
const deleteStartIndex = Math.max(0, this._nextTokenIndex - this._loadedTokenPredictions.length);
|
|
1252
|
+
await this._eraseContextTokenRanges([{ start: deleteStartIndex, end: this._nextTokenIndex }], { canResetTokenPredictor: true, canRemovePredictionTokens: true, skipLock: true });
|
|
1253
|
+
this._loadedTokenPredictions.length = 0;
|
|
1254
|
+
}
|
|
1255
|
+
if (this._resetTokenPredictor) {
|
|
1256
|
+
await tokenPredictor.reset({
|
|
1257
|
+
stateTokens: [...this._contextTokens, ...evalTokens],
|
|
1258
|
+
evaluateOptions: {
|
|
1259
|
+
temperature,
|
|
1260
|
+
minP,
|
|
1261
|
+
topK,
|
|
1262
|
+
topP,
|
|
1263
|
+
seed,
|
|
1264
|
+
grammarEvaluationState: grammarEvaluationState instanceof Function
|
|
1265
|
+
? grammarEvaluationState()?.clone()
|
|
1266
|
+
: grammarEvaluationState?.clone(),
|
|
1267
|
+
repeatPenalty,
|
|
1268
|
+
tokenBias,
|
|
1269
|
+
evaluationPriority,
|
|
1270
|
+
contextShift: contextShiftOptions,
|
|
1271
|
+
yieldEogToken: true
|
|
1272
|
+
},
|
|
1273
|
+
targetSequence: this
|
|
1274
|
+
});
|
|
1275
|
+
this._resetTokenPredictor = false;
|
|
1276
|
+
this._tokenPredictorOwner = tokenPredictorOwner;
|
|
1277
|
+
}
|
|
1278
|
+
if (nextToken == null) {
|
|
1279
|
+
if (this._tokenPredictorOwner === tokenPredictorOwner &&
|
|
1280
|
+
// prevent incurring context shifts due to token prediction validations
|
|
1281
|
+
this._nextTokenIndex + evalTokens.length < this._context.contextSize) {
|
|
1282
|
+
const testGrammarClone = grammarEvaluationState instanceof Function
|
|
1283
|
+
? grammarEvaluationState()?.clone()
|
|
1284
|
+
: grammarEvaluationState?.clone();
|
|
1285
|
+
for (const token of await tokenPredictor.predictTokens()) {
|
|
1286
|
+
if (testGrammarClone != null) {
|
|
1287
|
+
const canAddToken = LlamaSampler._canBeNextTokenForGrammarEvaluationState(this.model._llama, testGrammarClone, token);
|
|
1288
|
+
if (!canAddToken)
|
|
1289
|
+
break;
|
|
1290
|
+
}
|
|
1291
|
+
evalTokens.push(token);
|
|
1292
|
+
logitsArray[evalTokens.length - 1] = true;
|
|
1293
|
+
// prevent incurring context shifts due to token prediction validations
|
|
1294
|
+
if (this._nextTokenIndex + evalTokens.length >= this._context.contextSize)
|
|
1295
|
+
break;
|
|
1296
|
+
}
|
|
1297
|
+
}
|
|
1298
|
+
let resolvedGrammarEvaluationState = undefined;
|
|
1299
|
+
// Evaluate to get the next token.
|
|
1300
|
+
const decodeResult = await this._decodeTokens(evalTokens, logitsArray, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex, tokenIndex) => {
|
|
1301
|
+
if (tokenIndex === logitsStartIndex)
|
|
1302
|
+
resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
|
|
1303
|
+
? grammarEvaluationState()
|
|
1304
|
+
: grammarEvaluationState;
|
|
1305
|
+
else if (tokenIndex === logitsStartIndex + 1)
|
|
1306
|
+
resolvedGrammarEvaluationState = resolvedGrammarEvaluationState?.clone();
|
|
1307
|
+
const samplerConfig = this._resolveSamplerConfig({
|
|
1308
|
+
temperature,
|
|
1309
|
+
minP,
|
|
1310
|
+
topK,
|
|
1311
|
+
topP,
|
|
1312
|
+
seed,
|
|
1313
|
+
grammarEvaluationState: resolvedGrammarEvaluationState,
|
|
1314
|
+
repeatPenalty,
|
|
1315
|
+
tokenBias
|
|
1316
|
+
});
|
|
1317
|
+
return withLock(sampler, "sample", async () => {
|
|
1318
|
+
if (sampler.disposed)
|
|
1319
|
+
return null;
|
|
1320
|
+
sampler.applyConfig(samplerConfig);
|
|
1321
|
+
if (sampleProbabilities || sampleConfidence)
|
|
1322
|
+
return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler, sampleProbabilities, sampleConfidence);
|
|
1323
|
+
else
|
|
1324
|
+
return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
|
|
1325
|
+
});
|
|
1326
|
+
});
|
|
1327
|
+
for (let i = logitsStartIndex; i < evalTokens.length; i++) {
|
|
1328
|
+
const item = decodeResult[i];
|
|
1329
|
+
const [resultToken, probabilities, confidence] = item instanceof Array
|
|
1330
|
+
? item
|
|
1331
|
+
: [item];
|
|
1332
|
+
if (i === logitsStartIndex) {
|
|
1333
|
+
if (resultToken === -1)
|
|
1334
|
+
throw new Error("Failed to sample next token");
|
|
1335
|
+
if (resultToken == null)
|
|
1336
|
+
return;
|
|
1337
|
+
nextToken = resultToken;
|
|
1338
|
+
yieldRes.token = nextToken;
|
|
1339
|
+
if (probabilities != null)
|
|
1340
|
+
yieldRes.probabilities = reviveTokenProbabilities(probabilities);
|
|
1341
|
+
if (confidence != null)
|
|
1342
|
+
yieldRes.confidence = confidence;
|
|
1343
|
+
}
|
|
1344
|
+
else {
|
|
1345
|
+
if (resultToken === -1 || resultToken == null)
|
|
1346
|
+
break;
|
|
1347
|
+
const lastValidatedTokenOutput = i === logitsStartIndex + 1
|
|
1348
|
+
? nextToken
|
|
1349
|
+
: validatedTokens.at(-1)?.[1];
|
|
1350
|
+
if (lastValidatedTokenOutput != null && lastValidatedTokenOutput === evalTokens[i]) {
|
|
1351
|
+
this._loadedTokenPredictions.push([evalTokens[i], [resultToken, probabilities, confidence]]);
|
|
1352
|
+
this._validatedTokenPredictions++;
|
|
1353
|
+
this._unusedTokenPredictions++;
|
|
1354
|
+
}
|
|
1355
|
+
else {
|
|
1356
|
+
const deleteSize = Math.min(evalTokens.length - i, this.context.contextSize);
|
|
1357
|
+
this._refutedTokenPredictions += deleteSize;
|
|
1358
|
+
const deleteStartIndex = this._nextTokenIndex - deleteSize;
|
|
1359
|
+
tokenPredictor.stop(true);
|
|
1360
|
+
await this._eraseContextTokenRanges([{
|
|
1361
|
+
start: deleteStartIndex,
|
|
1362
|
+
end: this._nextTokenIndex
|
|
1363
|
+
}], { canResetTokenPredictor: false, canRemovePredictionTokens: false, skipLock: true });
|
|
1364
|
+
break; // the assumption that this token will be generated was wrong
|
|
1365
|
+
}
|
|
1366
|
+
}
|
|
1367
|
+
}
|
|
1368
|
+
}
|
|
1369
|
+
if (nextToken == null)
|
|
1370
|
+
throw new Error("Failed to generated next token");
|
|
1371
|
+
// the model finished generating text
|
|
1372
|
+
if (!yieldEogToken && this._context.model.isEogToken(nextToken))
|
|
1373
|
+
break;
|
|
928
1374
|
}
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
1375
|
+
finally {
|
|
1376
|
+
evaluatorLock.dispose();
|
|
1377
|
+
}
|
|
1378
|
+
const replacementToken = yield yieldRes;
|
|
1379
|
+
// set the tokens for the next evaluation
|
|
1380
|
+
if (replacementToken instanceof Array)
|
|
1381
|
+
evalTokens = replacementToken.slice();
|
|
1382
|
+
else if (replacementToken != null)
|
|
1383
|
+
evalTokens = [replacementToken];
|
|
1384
|
+
else
|
|
1385
|
+
evalTokens = [nextToken];
|
|
1386
|
+
if (this._tokenPredictorOwner === tokenPredictorOwner)
|
|
1387
|
+
tokenPredictor.pushTokens(evalTokens);
|
|
1388
|
+
logitsArray = [];
|
|
1389
|
+
logitsStartIndex = evalTokens.length - 1;
|
|
1390
|
+
logitsArray[logitsStartIndex] = true;
|
|
945
1391
|
}
|
|
946
|
-
|
|
1392
|
+
}
|
|
1393
|
+
finally {
|
|
1394
|
+
void withLock(sampler, "sample", sampler.asyncDispose);
|
|
1395
|
+
if (this._tokenPredictorOwner === tokenPredictorOwner)
|
|
1396
|
+
tokenPredictor.stop();
|
|
1397
|
+
}
|
|
1398
|
+
}
|
|
1399
|
+
/** @internal */
|
|
1400
|
+
async _abortTokenPredictor(skipClearingPredictionsFromState = false, skipLock = false) {
|
|
1401
|
+
this._tokenPredictor?.stop();
|
|
1402
|
+
this._resetTokenPredictor = true;
|
|
1403
|
+
if (skipClearingPredictionsFromState)
|
|
1404
|
+
return;
|
|
1405
|
+
if (this._loadedTokenPredictions.length > 0)
|
|
1406
|
+
await this._eraseContextTokenRanges([{
|
|
1407
|
+
start: this._nextTokenIndex - this._loadedTokenPredictions.length,
|
|
1408
|
+
end: this._nextTokenIndex
|
|
1409
|
+
}], { canResetTokenPredictor: true, canRemovePredictionTokens: true, skipLock });
|
|
1410
|
+
}
|
|
1411
|
+
/** @internal */
|
|
1412
|
+
_resolveSamplerConfig({ temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, grammarEvaluationState, repeatPenalty, tokenBias }) {
|
|
1413
|
+
const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
|
|
1414
|
+
? repeatPenalty.punishTokens()
|
|
1415
|
+
: repeatPenalty?.punishTokens;
|
|
1416
|
+
const maxPunishTokens = Math.max(repeatPenalty?.maxPunishTokens ?? defaultMaxPunishTokens, repeatPenaltyTokens?.length ?? 0);
|
|
1417
|
+
const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
|
|
1418
|
+
? grammarEvaluationState()
|
|
1419
|
+
: grammarEvaluationState;
|
|
1420
|
+
if (resolvedGrammarEvaluationState != null && resolvedGrammarEvaluationState._llama !== this.model._llama)
|
|
1421
|
+
throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
|
|
1422
|
+
const { tokenBiasKeys, tokenBiasValues } = getTokenBiasesForAddon(tokenBias, this.model);
|
|
1423
|
+
return removeNullFields({
|
|
1424
|
+
temperature,
|
|
1425
|
+
minP,
|
|
1426
|
+
topK,
|
|
1427
|
+
topP,
|
|
1428
|
+
seed: Math.max(0, Number.isFinite(seed)
|
|
1429
|
+
? Math.floor(seed ?? (Date.now() / 1000))
|
|
1430
|
+
: Math.floor(Date.now() / 1000)),
|
|
1431
|
+
repeatPenalty: repeatPenalty?.penalty,
|
|
1432
|
+
repeatPenaltyMaxTokens: maxPunishTokens,
|
|
1433
|
+
repeatPenaltyTokens: repeatPenaltyTokens != null
|
|
1434
|
+
? Uint32Array.from(repeatPenaltyTokens)
|
|
1435
|
+
: undefined,
|
|
1436
|
+
repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty,
|
|
1437
|
+
repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty,
|
|
1438
|
+
tokenBiasKeys,
|
|
1439
|
+
tokenBiasValues,
|
|
1440
|
+
grammarEvaluationState: resolvedGrammarEvaluationState?._state
|
|
947
1441
|
});
|
|
948
1442
|
}
|
|
1443
|
+
/**
|
|
1444
|
+
* The caller of this function has to wrap it with a lock to ensure this function doesn't run concurrently.
|
|
1445
|
+
* @internal
|
|
1446
|
+
*/
|
|
1447
|
+
async _decodeTokens(tokens, logits, evaluationPriority, tokenMeter, contextShiftOptions, logitDataMapper) {
|
|
1448
|
+
this._ensureNotDisposed();
|
|
1449
|
+
const tokensLeftToDecode = tokens.slice();
|
|
1450
|
+
const tokenLogitsLeftToDecode = logits.slice();
|
|
1451
|
+
let currentTokenIndex = 0;
|
|
1452
|
+
const res = [];
|
|
1453
|
+
const normalizedLogitDataMapper = (batchLogitIndex, contextStateTokenIndex) => {
|
|
1454
|
+
return logitDataMapper(batchLogitIndex, currentTokenIndex + (contextStateTokenIndex - this._nextTokenIndex));
|
|
1455
|
+
};
|
|
1456
|
+
while (tokensLeftToDecode.length > 0) {
|
|
1457
|
+
this._ensureNotDisposed();
|
|
1458
|
+
let freeSpace = this._context.contextSize - 1 - this._nextTokenIndex;
|
|
1459
|
+
if (freeSpace <= 0) {
|
|
1460
|
+
await this._freeUpSpaceForTokens(contextShiftOptions);
|
|
1461
|
+
freeSpace = this._context.contextSize - 1 - this._nextTokenIndex;
|
|
1462
|
+
if (freeSpace <= 0)
|
|
1463
|
+
throw new Error("Failed to free up space for new tokens");
|
|
1464
|
+
}
|
|
1465
|
+
const tokensToDecode = tokensLeftToDecode.splice(0, freeSpace);
|
|
1466
|
+
const tokensLogits = tokenLogitsLeftToDecode.slice(0, tokensToDecode.length);
|
|
1467
|
+
const generatedLogits = await this._context._decodeTokens({
|
|
1468
|
+
sequenceId: this._sequenceId,
|
|
1469
|
+
tokens: tokensToDecode,
|
|
1470
|
+
firstTokenSequenceIndex: this._nextTokenIndex,
|
|
1471
|
+
logits: tokensLogits,
|
|
1472
|
+
evaluationPriority,
|
|
1473
|
+
tokenMeter
|
|
1474
|
+
}, normalizedLogitDataMapper);
|
|
1475
|
+
for (const [index, value] of generatedLogits)
|
|
1476
|
+
res[currentTokenIndex + (index - this._nextTokenIndex)] = value;
|
|
1477
|
+
this._nextTokenIndex += tokensToDecode.length;
|
|
1478
|
+
currentTokenIndex += tokensToDecode.length;
|
|
1479
|
+
this._contextTokens = this._contextTokens.concat(tokensToDecode);
|
|
1480
|
+
}
|
|
1481
|
+
return res;
|
|
1482
|
+
}
|
|
949
1483
|
/** @internal */
|
|
950
1484
|
async _freeUpSpaceForTokens(contextShiftOptions) {
|
|
951
1485
|
this._ensureNotDisposed();
|
|
@@ -957,7 +1491,7 @@ export class LlamaContextSequence {
|
|
|
957
1491
|
let eraseStartIndex = 0;
|
|
958
1492
|
if (this.model.tokens.bos != null && this._contextTokens[0] === this.model.tokens.bos)
|
|
959
1493
|
eraseStartIndex = 1;
|
|
960
|
-
await this.
|
|
1494
|
+
await this._eraseContextTokenRanges([{ start: eraseStartIndex, end: size + eraseStartIndex }], { skipLock: true });
|
|
961
1495
|
}
|
|
962
1496
|
else {
|
|
963
1497
|
const ranges = await contextShiftOptions.strategy({
|
|
@@ -966,9 +1500,9 @@ export class LlamaContextSequence {
|
|
|
966
1500
|
});
|
|
967
1501
|
if (ranges == null)
|
|
968
1502
|
throw new Error("Invalid delete ranges");
|
|
969
|
-
await this.
|
|
970
|
-
if (this.
|
|
971
|
-
await this.
|
|
1503
|
+
await this._eraseContextTokenRanges(ranges, { skipLock: true });
|
|
1504
|
+
if (this._nextTokenIndex >= this._context.contextSize - 1)
|
|
1505
|
+
await this._eraseContextTokenRanges([{ start: 0, end: size }], { skipLock: true });
|
|
972
1506
|
}
|
|
973
1507
|
}
|
|
974
1508
|
/** @internal */
|
|
@@ -980,7 +1514,7 @@ export class LlamaContextSequence {
|
|
|
980
1514
|
* We need this to make it impossible to manually create instances of this class outside the code of this library
|
|
981
1515
|
* @internal
|
|
982
1516
|
*/
|
|
983
|
-
static _create({ sequenceId, context, tokenMeter, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} }) {
|
|
1517
|
+
static _create({ sequenceId, context, tokenMeter, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, tokenPredictor }) {
|
|
984
1518
|
return new LlamaContextSequence({
|
|
985
1519
|
sequenceId,
|
|
986
1520
|
context,
|
|
@@ -988,7 +1522,8 @@ export class LlamaContextSequence {
|
|
|
988
1522
|
contextShift: {
|
|
989
1523
|
size: contextShiftSize,
|
|
990
1524
|
strategy: contextShiftStrategy
|
|
991
|
-
}
|
|
1525
|
+
},
|
|
1526
|
+
tokenPredictor
|
|
992
1527
|
});
|
|
993
1528
|
}
|
|
994
1529
|
}
|
|
@@ -1020,6 +1555,17 @@ function getTokenBiasesForAddon(tokenBias, currentModel) {
|
|
|
1020
1555
|
tokenBiasValues: Float32Array.from(tokenBiasValues)
|
|
1021
1556
|
};
|
|
1022
1557
|
}
|
|
1558
|
+
/**
 * Convert a flat `[token, probability, token, probability, ...]` list into a `Map`
 * from each token to its probability, keeping the order of the list.
 * A trailing item that has no pair is ignored.
 * @param probabilities - the flat list, or `null`/`undefined`
 * @returns the map, or `undefined` when no list was given
 */
function reviveTokenProbabilities(probabilities) {
    if (probabilities == null)
        return undefined;
    const tokenToProbability = new Map();
    const pairCount = Math.floor(probabilities.length / 2);
    for (let pairIndex = 0; pairIndex < pairCount; pairIndex++) {
        const tokenPosition = pairIndex * 2;
        tokenToProbability.set(probabilities[tokenPosition], probabilities[tokenPosition + 1]);
    }
    return tokenToProbability;
}
|
|
1023
1569
|
function disposeContextIfReferenced(contextRef) {
|
|
1024
1570
|
const context = contextRef.deref();
|
|
1025
1571
|
if (context != null)
|