node-llama-cpp 3.0.0-beta.13 → 3.0.0-beta.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/ChatWrapper.js +4 -0
- package/dist/ChatWrapper.js.map +1 -1
- package/dist/bindings/AddonTypes.d.ts +35 -6
- package/dist/bindings/Llama.d.ts +12 -0
- package/dist/bindings/Llama.js +100 -7
- package/dist/bindings/Llama.js.map +1 -1
- package/dist/bindings/getLlama.d.ts +19 -1
- package/dist/bindings/getLlama.js +16 -6
- package/dist/bindings/getLlama.js.map +1 -1
- package/dist/bindings/types.d.ts +18 -0
- package/dist/bindings/types.js +31 -2
- package/dist/bindings/types.js.map +1 -1
- package/dist/bindings/utils/MemoryOrchestrator.d.ts +21 -0
- package/dist/bindings/utils/MemoryOrchestrator.js +49 -0
- package/dist/bindings/utils/MemoryOrchestrator.js.map +1 -0
- package/dist/bindings/utils/cloneLlamaCppRepo.d.ts +1 -1
- package/dist/bindings/utils/cloneLlamaCppRepo.js +4 -3
- package/dist/bindings/utils/cloneLlamaCppRepo.js.map +1 -1
- package/dist/bindings/utils/compileLLamaCpp.d.ts +4 -1
- package/dist/bindings/utils/compileLLamaCpp.js +133 -97
- package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
- package/dist/bindings/utils/detectAvailableComputeLayers.d.ts +3 -0
- package/dist/bindings/utils/detectAvailableComputeLayers.js +155 -13
- package/dist/bindings/utils/detectAvailableComputeLayers.js.map +1 -1
- package/dist/bindings/utils/getLlamaWithoutBackend.d.ts +5 -0
- package/dist/bindings/utils/getLlamaWithoutBackend.js +27 -0
- package/dist/bindings/utils/getLlamaWithoutBackend.js.map +1 -0
- package/dist/bindings/utils/logDistroInstallInstruction.d.ts +1 -0
- package/dist/bindings/utils/logDistroInstallInstruction.js +16 -6
- package/dist/bindings/utils/logDistroInstallInstruction.js.map +1 -1
- package/dist/bindings/utils/resolveCustomCmakeOptions.js +2 -2
- package/dist/bindings/utils/resolveCustomCmakeOptions.js.map +1 -1
- package/dist/bindings/utils/testBindingBinary.js +2 -2
- package/dist/bindings/utils/testBindingBinary.js.map +1 -1
- package/dist/bindings/utils/testCmakeBinary.d.ts +5 -0
- package/dist/bindings/utils/testCmakeBinary.js +32 -0
- package/dist/bindings/utils/testCmakeBinary.js.map +1 -0
- package/dist/chatWrappers/AlpacaChatWrapper.d.ts +2 -1
- package/dist/chatWrappers/AlpacaChatWrapper.js +9 -2
- package/dist/chatWrappers/AlpacaChatWrapper.js.map +1 -1
- package/dist/chatWrappers/ChatMLChatWrapper.js +12 -10
- package/dist/chatWrappers/ChatMLChatWrapper.js.map +1 -1
- package/dist/chatWrappers/FalconChatWrapper.d.ts +2 -1
- package/dist/chatWrappers/FalconChatWrapper.js +28 -11
- package/dist/chatWrappers/FalconChatWrapper.js.map +1 -1
- package/dist/chatWrappers/FunctionaryChatWrapper.js +59 -45
- package/dist/chatWrappers/FunctionaryChatWrapper.js.map +1 -1
- package/dist/chatWrappers/GemmaChatWrapper.js +9 -7
- package/dist/chatWrappers/GemmaChatWrapper.js.map +1 -1
- package/dist/chatWrappers/GeneralChatWrapper.d.ts +2 -1
- package/dist/chatWrappers/GeneralChatWrapper.js +35 -12
- package/dist/chatWrappers/GeneralChatWrapper.js.map +1 -1
- package/dist/chatWrappers/LlamaChatWrapper.d.ts +7 -0
- package/dist/chatWrappers/LlamaChatWrapper.js +26 -8
- package/dist/chatWrappers/LlamaChatWrapper.js.map +1 -1
- package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.d.ts +73 -0
- package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js +355 -0
- package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js.map +1 -0
- package/dist/{TemplateChatWrapper.d.ts → chatWrappers/generic/TemplateChatWrapper.d.ts} +16 -18
- package/dist/{TemplateChatWrapper.js → chatWrappers/generic/TemplateChatWrapper.js} +31 -69
- package/dist/chatWrappers/generic/TemplateChatWrapper.js.map +1 -0
- package/dist/chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.d.ts +33 -0
- package/dist/chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.js +45 -0
- package/dist/chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.js.map +1 -0
- package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.d.ts +4 -0
- package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js +206 -0
- package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js.map +1 -0
- package/dist/chatWrappers/utils/resolveChatWrapper.d.ts +67 -0
- package/dist/chatWrappers/utils/resolveChatWrapper.js +206 -0
- package/dist/chatWrappers/utils/resolveChatWrapper.js.map +1 -0
- package/dist/cli/cli.js +1 -1
- package/dist/cli/cli.js.map +1 -1
- package/dist/cli/commands/ChatCommand.d.ts +7 -4
- package/dist/cli/commands/ChatCommand.js +177 -70
- package/dist/cli/commands/ChatCommand.js.map +1 -1
- package/dist/cli/commands/ClearCommand.d.ts +1 -1
- package/dist/cli/commands/ClearCommand.js +5 -5
- package/dist/cli/commands/ClearCommand.js.map +1 -1
- package/dist/cli/commands/CompleteCommand.d.ts +3 -2
- package/dist/cli/commands/CompleteCommand.js +115 -51
- package/dist/cli/commands/CompleteCommand.js.map +1 -1
- package/dist/cli/commands/InfillCommand.d.ts +3 -2
- package/dist/cli/commands/InfillCommand.js +115 -51
- package/dist/cli/commands/InfillCommand.js.map +1 -1
- package/dist/cli/commands/OnPostInstallCommand.js +2 -0
- package/dist/cli/commands/OnPostInstallCommand.js.map +1 -1
- package/dist/cli/commands/{InspectCommand.d.ts → inspect/InspectCommand.d.ts} +1 -4
- package/dist/cli/commands/inspect/InspectCommand.js +17 -0
- package/dist/cli/commands/inspect/InspectCommand.js.map +1 -0
- package/dist/cli/commands/inspect/commands/InspectGgufCommand.d.ts +10 -0
- package/dist/cli/commands/inspect/commands/InspectGgufCommand.js +108 -0
- package/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map +1 -0
- package/dist/cli/commands/inspect/commands/InspectGpuCommand.d.ts +4 -0
- package/dist/cli/commands/inspect/commands/InspectGpuCommand.js +98 -0
- package/dist/cli/commands/inspect/commands/InspectGpuCommand.js.map +1 -0
- package/dist/cli/commands/inspect/commands/InspectMeasureCommand.d.ts +14 -0
- package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js +577 -0
- package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map +1 -0
- package/dist/cli/utils/ConsoleTable.d.ts +23 -0
- package/dist/cli/utils/ConsoleTable.js +86 -0
- package/dist/cli/utils/ConsoleTable.js.map +1 -0
- package/dist/cli/utils/printCommonInfoLines.d.ts +9 -0
- package/dist/cli/utils/printCommonInfoLines.js +70 -0
- package/dist/cli/utils/printCommonInfoLines.js.map +1 -0
- package/dist/cli/utils/printInfoLine.d.ts +10 -0
- package/dist/cli/utils/printInfoLine.js +45 -0
- package/dist/cli/utils/printInfoLine.js.map +1 -0
- package/dist/cli/utils/resolveCommandGgufPath.d.ts +1 -0
- package/dist/cli/utils/resolveCommandGgufPath.js +6 -0
- package/dist/cli/utils/resolveCommandGgufPath.js.map +1 -0
- package/dist/config.d.ts +3 -1
- package/dist/config.js +7 -1
- package/dist/config.js.map +1 -1
- package/dist/evaluator/LlamaChat/LlamaChat.js +13 -5
- package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
- package/dist/evaluator/LlamaCompletion.js +5 -3
- package/dist/evaluator/LlamaCompletion.js.map +1 -1
- package/dist/evaluator/LlamaContext/LlamaContext.d.ts +43 -9
- package/dist/evaluator/LlamaContext/LlamaContext.js +251 -60
- package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
- package/dist/evaluator/LlamaContext/types.d.ts +68 -10
- package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js.map +1 -0
- package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js.map +1 -0
- package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.d.ts +2 -0
- package/dist/evaluator/LlamaContext/utils/{resolveBatchItemsPrioritizingStrategy.js → resolveBatchItemsPrioritizationStrategy.js} +4 -4
- package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js.map +1 -0
- package/dist/evaluator/LlamaEmbeddingContext.d.ts +29 -7
- package/dist/evaluator/LlamaEmbeddingContext.js +31 -22
- package/dist/evaluator/LlamaEmbeddingContext.js.map +1 -1
- package/dist/evaluator/LlamaGrammar.js +1 -0
- package/dist/evaluator/LlamaGrammar.js.map +1 -1
- package/dist/evaluator/LlamaModel.d.ts +78 -20
- package/dist/evaluator/LlamaModel.js +385 -21
- package/dist/evaluator/LlamaModel.js.map +1 -1
- package/dist/evaluator/TokenMeter.d.ts +54 -0
- package/dist/evaluator/TokenMeter.js +86 -0
- package/dist/evaluator/TokenMeter.js.map +1 -0
- package/dist/gguf/GgufInsights.d.ts +40 -0
- package/dist/gguf/GgufInsights.js +350 -0
- package/dist/gguf/GgufInsights.js.map +1 -0
- package/dist/gguf/consts.d.ts +3 -0
- package/dist/gguf/consts.js +8 -0
- package/dist/gguf/consts.js.map +1 -0
- package/dist/gguf/errors/InvalidGgufMagicError.d.ts +3 -0
- package/dist/gguf/errors/InvalidGgufMagicError.js +6 -0
- package/dist/gguf/errors/InvalidGgufMagicError.js.map +1 -0
- package/dist/gguf/errors/UnsupportedGgufValueTypeError.d.ts +4 -0
- package/dist/gguf/errors/UnsupportedGgufValueTypeError.js +9 -0
- package/dist/gguf/errors/UnsupportedGgufValueTypeError.js.map +1 -0
- package/dist/gguf/fileReaders/GgufFileReader.d.ts +33 -0
- package/dist/gguf/fileReaders/GgufFileReader.js +76 -0
- package/dist/gguf/fileReaders/GgufFileReader.js.map +1 -0
- package/dist/gguf/fileReaders/GgufFsFileReader.d.ts +17 -0
- package/dist/gguf/fileReaders/GgufFsFileReader.js +45 -0
- package/dist/gguf/fileReaders/GgufFsFileReader.js.map +1 -0
- package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.d.ts +22 -0
- package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.js +63 -0
- package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.js.map +1 -0
- package/dist/gguf/parser/GgufV2Parser.d.ts +19 -0
- package/dist/gguf/parser/GgufV2Parser.js +115 -0
- package/dist/gguf/parser/GgufV2Parser.js.map +1 -0
- package/dist/gguf/parser/GgufV3Parser.d.ts +3 -0
- package/dist/gguf/parser/GgufV3Parser.js +4 -0
- package/dist/gguf/parser/GgufV3Parser.js.map +1 -0
- package/dist/gguf/parser/parseGguf.d.ts +8 -0
- package/dist/gguf/parser/parseGguf.js +58 -0
- package/dist/gguf/parser/parseGguf.js.map +1 -0
- package/dist/gguf/readGgufFileInfo.d.ts +30 -0
- package/dist/gguf/readGgufFileInfo.js +37 -0
- package/dist/gguf/readGgufFileInfo.js.map +1 -0
- package/dist/gguf/types/GgufFileInfoTypes.d.ts +52 -0
- package/dist/gguf/types/GgufFileInfoTypes.js +18 -0
- package/dist/gguf/types/GgufFileInfoTypes.js.map +1 -0
- package/dist/gguf/types/GgufMetadataTypes.d.ts +330 -0
- package/dist/gguf/types/GgufMetadataTypes.js +86 -0
- package/dist/gguf/types/GgufMetadataTypes.js.map +1 -0
- package/dist/gguf/types/GgufTensorInfoTypes.d.ts +37 -0
- package/dist/gguf/types/GgufTensorInfoTypes.js +33 -0
- package/dist/gguf/types/GgufTensorInfoTypes.js.map +1 -0
- package/dist/gguf/utils/GgufReadOffset.d.ts +6 -0
- package/dist/gguf/utils/GgufReadOffset.js +18 -0
- package/dist/gguf/utils/GgufReadOffset.js.map +1 -0
- package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.d.ts +5 -0
- package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.js +38 -0
- package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.js.map +1 -0
- package/dist/gguf/utils/getGgufFileTypeName.d.ts +4 -0
- package/dist/gguf/utils/getGgufFileTypeName.js +13 -0
- package/dist/gguf/utils/getGgufFileTypeName.js.map +1 -0
- package/dist/gguf/utils/getGgufMetadataArchitectureData.d.ts +3 -0
- package/dist/gguf/utils/getGgufMetadataArchitectureData.js +4 -0
- package/dist/gguf/utils/getGgufMetadataArchitectureData.js.map +1 -0
- package/dist/gguf/utils/normalizeGgufDownloadUrl.d.ts +1 -0
- package/dist/gguf/utils/normalizeGgufDownloadUrl.js +16 -0
- package/dist/gguf/utils/normalizeGgufDownloadUrl.js.map +1 -0
- package/dist/index.d.ts +13 -7
- package/dist/index.js +11 -6
- package/dist/index.js.map +1 -1
- package/dist/state.d.ts +2 -0
- package/dist/state.js +7 -0
- package/dist/state.js.map +1 -1
- package/dist/types.d.ts +1 -1
- package/dist/utils/DisposeGuard.d.ts +13 -0
- package/dist/utils/DisposeGuard.js +120 -0
- package/dist/utils/DisposeGuard.js.map +1 -0
- package/dist/utils/InsufficientMemoryError.d.ts +3 -0
- package/dist/utils/InsufficientMemoryError.js +6 -0
- package/dist/utils/InsufficientMemoryError.js.map +1 -0
- package/dist/utils/LlamaText.d.ts +25 -10
- package/dist/utils/LlamaText.js +205 -23
- package/dist/utils/LlamaText.js.map +1 -1
- package/dist/utils/StopGenerationDetector.js +3 -1
- package/dist/utils/StopGenerationDetector.js.map +1 -1
- package/dist/utils/cmake.js +1 -1
- package/dist/utils/cmake.js.map +1 -1
- package/dist/utils/findBestOption.d.ts +4 -0
- package/dist/utils/findBestOption.js +15 -0
- package/dist/utils/findBestOption.js.map +1 -0
- package/dist/utils/getConsoleLogPrefix.js +3 -2
- package/dist/utils/getConsoleLogPrefix.js.map +1 -1
- package/dist/utils/getQueuedTokensBeforeStopTrigger.js +3 -3
- package/dist/utils/getQueuedTokensBeforeStopTrigger.js.map +1 -1
- package/dist/utils/gitReleaseBundles.js +68 -1
- package/dist/utils/gitReleaseBundles.js.map +1 -1
- package/dist/utils/mergeUnionTypes.d.ts +4 -0
- package/dist/utils/parseModelFileName.d.ts +1 -0
- package/dist/utils/parseModelFileName.js +6 -1
- package/dist/utils/parseModelFileName.js.map +1 -1
- package/dist/utils/prettyPrintObject.d.ts +10 -1
- package/dist/utils/prettyPrintObject.js +57 -13
- package/dist/utils/prettyPrintObject.js.map +1 -1
- package/dist/utils/removeNullFields.d.ts +2 -2
- package/dist/utils/removeNullFields.js.map +1 -1
- package/dist/utils/spawnCommand.d.ts +11 -1
- package/dist/utils/spawnCommand.js +55 -7
- package/dist/utils/spawnCommand.js.map +1 -1
- package/dist/utils/tokenizeInput.d.ts +1 -1
- package/dist/utils/tokenizeInput.js +3 -3
- package/dist/utils/tokenizeInput.js.map +1 -1
- package/dist/utils/withOra.d.ts +1 -0
- package/dist/utils/withOra.js +2 -2
- package/dist/utils/withOra.js.map +1 -1
- package/llama/CMakeLists.txt +5 -5
- package/llama/addon.cpp +793 -88
- package/llama/binariesGithubRelease.json +1 -1
- package/llama/gitRelease.bundle +0 -0
- package/llama/gpuInfo/cuda-gpu-info.cu +21 -0
- package/llama/gpuInfo/cuda-gpu-info.h +3 -0
- package/llama/gpuInfo/metal-gpu-info.h +4 -1
- package/llama/gpuInfo/metal-gpu-info.mm +14 -1
- package/llama/gpuInfo/vulkan-gpu-info.cpp +20 -2
- package/llama/gpuInfo/vulkan-gpu-info.h +2 -0
- package/llama/grammars/json.gbnf +1 -1
- package/llama/grammars/json_arr.gbnf +1 -1
- package/llama/llama.cpp.info.json +1 -1
- package/llama/toolchains/win32.host-x64.target-arm64.cmake +41 -0
- package/llamaBins/linux-arm64/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
- package/llamaBins/linux-arm64/llama-addon.node +0 -0
- package/llamaBins/linux-armv7l/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
- package/llamaBins/linux-armv7l/llama-addon.node +0 -0
- package/llamaBins/linux-x64/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
- package/llamaBins/linux-x64/llama-addon.node +0 -0
- package/llamaBins/linux-x64-cuda/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
- package/llamaBins/linux-x64-cuda/llama-addon.node +0 -0
- package/llamaBins/linux-x64-vulkan/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
- package/llamaBins/linux-x64-vulkan/llama-addon.node +0 -0
- package/llamaBins/mac-arm64-metal/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
- package/llamaBins/mac-arm64-metal/default.metallib +0 -0
- package/llamaBins/mac-arm64-metal/llama-addon.node +0 -0
- package/llamaBins/mac-x64/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
- package/llamaBins/mac-x64/llama-addon.node +0 -0
- package/llamaBins/win-arm64/_nlcBuildMetadata.json +1 -0
- package/llamaBins/win-arm64/llama-addon.exp +0 -0
- package/llamaBins/win-arm64/llama-addon.lib +0 -0
- package/llamaBins/win-arm64/llama-addon.node +0 -0
- package/llamaBins/win-x64/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
- package/llamaBins/win-x64/llama-addon.exp +0 -0
- package/llamaBins/win-x64/llama-addon.lib +0 -0
- package/llamaBins/win-x64/llama-addon.node +0 -0
- package/llamaBins/win-x64-cuda/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
- package/llamaBins/win-x64-cuda/llama-addon.exp +0 -0
- package/llamaBins/win-x64-cuda/llama-addon.lib +0 -0
- package/llamaBins/win-x64-cuda/llama-addon.node +0 -0
- package/llamaBins/win-x64-vulkan/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
- package/llamaBins/win-x64-vulkan/llama-addon.exp +0 -0
- package/llamaBins/win-x64-vulkan/llama-addon.lib +0 -0
- package/llamaBins/win-x64-vulkan/llama-addon.node +0 -0
- package/package.json +16 -11
- package/dist/TemplateChatWrapper.js.map +0 -1
- package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.d.ts +0 -33
- package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.js +0 -49
- package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.js.map +0 -1
- package/dist/chatWrappers/resolveChatWrapperBasedOnModel.d.ts +0 -13
- package/dist/chatWrappers/resolveChatWrapperBasedOnModel.js +0 -63
- package/dist/chatWrappers/resolveChatWrapperBasedOnModel.js.map +0 -1
- package/dist/cli/commands/InspectCommand.js +0 -113
- package/dist/cli/commands/InspectCommand.js.map +0 -1
- package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizingStrategies/firstInFirstOutStrategy.js.map +0 -1
- package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizingStrategies/maximumParallelismStrategy.js.map +0 -1
- package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizingStrategy.d.ts +0 -2
- package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizingStrategy.js.map +0 -1
- package/dist/gguf/GGUFInsights.d.ts +0 -28
- package/dist/gguf/GGUFInsights.js +0 -58
- package/dist/gguf/GGUFInsights.js.map +0 -1
- package/dist/gguf/GGUFMetadata.d.ts +0 -19
- package/dist/gguf/GGUFMetadata.js +0 -38
- package/dist/gguf/GGUFMetadata.js.map +0 -1
- package/dist/gguf/errors/InvalidGGUFMagicError.d.ts +0 -3
- package/dist/gguf/errors/InvalidGGUFMagicError.js +0 -6
- package/dist/gguf/errors/InvalidGGUFMagicError.js.map +0 -1
- package/dist/gguf/errors/MetadataNotParsedYetError.d.ts +0 -3
- package/dist/gguf/errors/MetadataNotParsedYetError.js +0 -6
- package/dist/gguf/errors/MetadataNotParsedYetError.js.map +0 -1
- package/dist/gguf/errors/MissingNodeLlamaError.d.ts +0 -3
- package/dist/gguf/errors/MissingNodeLlamaError.js +0 -6
- package/dist/gguf/errors/MissingNodeLlamaError.js.map +0 -1
- package/dist/gguf/errors/ModelScore/NotEnoughVRamError.d.ts +0 -5
- package/dist/gguf/errors/ModelScore/NotEnoughVRamError.js +0 -12
- package/dist/gguf/errors/ModelScore/NotEnoughVRamError.js.map +0 -1
- package/dist/gguf/errors/UnsupportedMetadataTypeError.d.ts +0 -4
- package/dist/gguf/errors/UnsupportedMetadataTypeError.js +0 -8
- package/dist/gguf/errors/UnsupportedMetadataTypeError.js.map +0 -1
- package/dist/gguf/ggufParser/GGUFParser.d.ts +0 -18
- package/dist/gguf/ggufParser/GGUFParser.js +0 -123
- package/dist/gguf/ggufParser/GGUFParser.js.map +0 -1
- package/dist/gguf/ggufParser/GGUFTypes.d.ts +0 -257
- package/dist/gguf/ggufParser/GGUFTypes.js +0 -2
- package/dist/gguf/ggufParser/GGUFTypes.js.map +0 -1
- package/dist/gguf/ggufParser/checkArchitecture.d.ts +0 -14
- package/dist/gguf/ggufParser/checkArchitecture.js +0 -74
- package/dist/gguf/ggufParser/checkArchitecture.js.map +0 -1
- package/dist/gguf/ggufParser/stream/GGUFBaseStream.d.ts +0 -38
- package/dist/gguf/ggufParser/stream/GGUFBaseStream.js +0 -83
- package/dist/gguf/ggufParser/stream/GGUFBaseStream.js.map +0 -1
- package/dist/gguf/ggufParser/stream/GGUFFetchStream.d.ts +0 -14
- package/dist/gguf/ggufParser/stream/GGUFFetchStream.js +0 -35
- package/dist/gguf/ggufParser/stream/GGUFFetchStream.js.map +0 -1
- package/dist/gguf/ggufParser/stream/GGUFReadStream.d.ts +0 -15
- package/dist/gguf/ggufParser/stream/GGUFReadStream.js +0 -40
- package/dist/gguf/ggufParser/stream/GGUFReadStream.js.map +0 -1
- package/dist/utils/parseModelTypeDescription.d.ts +0 -6
- package/dist/utils/parseModelTypeDescription.js +0 -9
- package/dist/utils/parseModelTypeDescription.js.map +0 -1
- package/dist/utils/resolveChatWrapper.d.ts +0 -4
- package/dist/utils/resolveChatWrapper.js +0 -16
- package/dist/utils/resolveChatWrapper.js.map +0 -1
- package/llamaBins/mac-arm64-metal/ggml-metal.metal +0 -7731
- /package/dist/evaluator/LlamaContext/utils/{batchItemsPrioritizingStrategies → batchItemsPrioritizationStrategies}/firstInFirstOutStrategy.d.ts +0 -0
- /package/dist/evaluator/LlamaContext/utils/{batchItemsPrioritizingStrategies → batchItemsPrioritizationStrategies}/firstInFirstOutStrategy.js +0 -0
- /package/dist/evaluator/LlamaContext/utils/{batchItemsPrioritizingStrategies → batchItemsPrioritizationStrategies}/maximumParallelismStrategy.d.ts +0 -0
- /package/dist/evaluator/LlamaContext/utils/{batchItemsPrioritizingStrategies → batchItemsPrioritizationStrategies}/maximumParallelismStrategy.js +0 -0
|
@@ -1,21 +1,26 @@
|
|
|
1
1
|
import { EventRelay } from "lifecycle-utils";
|
|
2
2
|
import { Token } from "../../types.js";
|
|
3
|
-
import { LlamaModel } from "../LlamaModel.js";
|
|
4
3
|
import { LlamaGrammarEvaluationState } from "../LlamaGrammarEvaluationState.js";
|
|
4
|
+
import { GgufInsights } from "../../gguf/GgufInsights.js";
|
|
5
|
+
import { TokenMeter } from "../TokenMeter.js";
|
|
6
|
+
import { BuildGpu } from "../../bindings/types.js";
|
|
5
7
|
import { ContextShiftOptions, ContextTokensDeleteRange, EvaluationPriority, LlamaContextOptions, LlamaContextSequenceRepeatPenalty } from "./types.js";
|
|
8
|
+
import type { LlamaModel } from "../LlamaModel.js";
|
|
6
9
|
export declare class LlamaContext {
|
|
7
10
|
readonly onDispose: EventRelay<void>;
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
*/
|
|
11
|
-
constructor({ model, sequences, seed, contextSize, batchSize, threads, batching: { dispatchSchedule: batchingDispatchSchedule, itemsPrioritizingStrategy: batchingItemsPrioritizingStrategy }, _embedding, _noSeed }: LlamaContextOptions);
|
|
12
|
-
dispose(): void;
|
|
11
|
+
private constructor();
|
|
12
|
+
dispose(): Promise<void>;
|
|
13
13
|
/** @hidden */
|
|
14
|
-
[Symbol.
|
|
14
|
+
[Symbol.asyncDispose](): Promise<void>;
|
|
15
15
|
get disposed(): boolean;
|
|
16
16
|
get model(): LlamaModel;
|
|
17
17
|
get contextSize(): number;
|
|
18
18
|
get batchSize(): number;
|
|
19
|
+
/**
|
|
20
|
+
* The actual size of the state in the memory in bytes.
|
|
21
|
+
* This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
|
|
22
|
+
*/
|
|
23
|
+
get stateSize(): number;
|
|
19
24
|
getAllocatedContextSize(): number;
|
|
20
25
|
get totalSequences(): number;
|
|
21
26
|
get sequencesLeft(): number;
|
|
@@ -24,10 +29,15 @@ export declare class LlamaContext {
|
|
|
24
29
|
* When there are no sequences left, this method will throw an error.
|
|
25
30
|
* @param [options]
|
|
26
31
|
*/
|
|
27
|
-
getSequence({ contextShift: { size: contextShiftSize, strategy: contextShiftStrategy } }?: {
|
|
32
|
+
getSequence({ contextShift: { size: contextShiftSize, strategy: contextShiftStrategy }, _tokenMeter }?: {
|
|
28
33
|
contextShift?: ContextShiftOptions;
|
|
29
34
|
}): LlamaContextSequence;
|
|
30
35
|
dispatchPendingBatch(): void;
|
|
36
|
+
/**
|
|
37
|
+
* Print the timings of token evaluation since that last print for this context.
|
|
38
|
+
* > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
|
|
39
|
+
* it won't print anything.
|
|
40
|
+
*/
|
|
31
41
|
printTimings(): Promise<void>;
|
|
32
42
|
}
|
|
33
43
|
export declare class LlamaContextSequence {
|
|
@@ -41,6 +51,7 @@ export declare class LlamaContextSequence {
|
|
|
41
51
|
get model(): LlamaModel;
|
|
42
52
|
get nextTokenIndex(): number;
|
|
43
53
|
get contextTokens(): Token[];
|
|
54
|
+
get tokenMeter(): TokenMeter;
|
|
44
55
|
get isLoadedToMemory(): boolean;
|
|
45
56
|
compareContextTokens(tokens: Token[]): {
|
|
46
57
|
firstDifferentIndex: number;
|
|
@@ -52,7 +63,7 @@ export declare class LlamaContextSequence {
|
|
|
52
63
|
clearHistory(): Promise<void>;
|
|
53
64
|
/**
|
|
54
65
|
* Erase context tokens in the provided ranges to free up space for new tokens to be generated.
|
|
55
|
-
*
|
|
66
|
+
* The start of each range is inclusive, and the end of each range is exclusive.
|
|
56
67
|
* For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
|
|
57
68
|
*/
|
|
58
69
|
eraseContextTokenRanges(ranges: ContextTokensDeleteRange[]): Promise<void>;
|
|
@@ -107,3 +118,26 @@ export declare class LlamaContextSequence {
|
|
|
107
118
|
contextShift?: ContextShiftOptions;
|
|
108
119
|
}): Promise<void>;
|
|
109
120
|
}
|
|
121
|
+
export declare function resolveContextContextSizeOption({ contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, getVramState, llamaGpu, ignoreMemorySafetyChecks, isEmbeddingContext }: {
|
|
122
|
+
contextSize?: LlamaContextOptions["contextSize"];
|
|
123
|
+
batchSize?: LlamaContextOptions["batchSize"];
|
|
124
|
+
sequences: number;
|
|
125
|
+
modelFileInsights: GgufInsights;
|
|
126
|
+
modelGpuLayers: number;
|
|
127
|
+
modelTrainContextSize: number;
|
|
128
|
+
getVramState(): {
|
|
129
|
+
total: number;
|
|
130
|
+
free: number;
|
|
131
|
+
};
|
|
132
|
+
llamaGpu: BuildGpu;
|
|
133
|
+
ignoreMemorySafetyChecks?: boolean;
|
|
134
|
+
isEmbeddingContext?: boolean;
|
|
135
|
+
}): number;
|
|
136
|
+
export declare function getDefaultContextBatchSize({ contextSize, sequences }: {
|
|
137
|
+
contextSize: number;
|
|
138
|
+
sequences: number;
|
|
139
|
+
}): number;
|
|
140
|
+
export declare function getDefaultContextSequences(): number;
|
|
141
|
+
export declare function getDefaultModelContextSize({ trainContextSize }: {
|
|
142
|
+
trainContextSize?: number;
|
|
143
|
+
}): number;
|
|
@@ -1,11 +1,15 @@
|
|
|
1
|
-
import { DisposeAggregator, EventRelay, withLock, DisposedError } from "lifecycle-utils";
|
|
1
|
+
import { DisposeAggregator, EventRelay, withLock, DisposedError, AsyncDisposeAggregator } from "lifecycle-utils";
|
|
2
2
|
import { removeNullFields } from "../../utils/removeNullFields.js";
|
|
3
3
|
import { compareTokens } from "../../utils/compareTokens.js";
|
|
4
|
-
import {
|
|
4
|
+
import { DisposeGuard } from "../../utils/DisposeGuard.js";
|
|
5
|
+
import { minAllowedContextSizeInCalculations } from "../../config.js";
|
|
6
|
+
import { TokenMeter } from "../TokenMeter.js";
|
|
7
|
+
import { resolveBatchItemsPrioritizationStrategy } from "./utils/resolveBatchItemsPrioritizationStrategy.js";
|
|
5
8
|
export class LlamaContext {
|
|
6
9
|
/** @internal */ _llama;
|
|
7
10
|
/** @internal */ _ctx;
|
|
8
11
|
/** @internal */ _onReclaimUnusedSequenceId = new EventRelay();
|
|
12
|
+
/** @internal */ _backendContextDisposeGuard;
|
|
9
13
|
/** @internal */ _model;
|
|
10
14
|
/** @internal */ _contextSize;
|
|
11
15
|
/** @internal */ _batchSize;
|
|
@@ -14,7 +18,8 @@ export class LlamaContext {
|
|
|
14
18
|
/** @internal */ _batchingOptions;
|
|
15
19
|
/** @internal */ _queuedDecodeSequenceIds = new Set();
|
|
16
20
|
/** @internal */ _queuedDecodes = [];
|
|
17
|
-
/** @internal */ _disposeAggregator = new
|
|
21
|
+
/** @internal */ _disposeAggregator = new AsyncDisposeAggregator();
|
|
22
|
+
/** @internal */ _modelPreventDisposalHandle;
|
|
18
23
|
/** @internal */ _nextGeneratedSequenceId = 0;
|
|
19
24
|
/** @internal */ _dispatchDecodeScheduled = false;
|
|
20
25
|
/** @internal */ _batchDispatchPending = false;
|
|
@@ -22,14 +27,13 @@ export class LlamaContext {
|
|
|
22
27
|
/** @internal */ _allocatedContextSize;
|
|
23
28
|
/** @internal */ _disposed = false;
|
|
24
29
|
onDispose = new EventRelay();
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
*/
|
|
28
|
-
constructor({ model, sequences = 1, seed = null, contextSize = model.trainContextSize, batchSize = contextSize, threads = 6, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemsPrioritizingStrategy: batchingItemsPrioritizingStrategy = "maximumParallelism" } = {}, _embedding, _noSeed }) {
|
|
29
|
-
if (model.disposed)
|
|
30
|
+
constructor({ _model }, { sequences, seed = null, contextSize, batchSize, threads = 6, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, _embeddings, _noSeed }) {
|
|
31
|
+
if (_model.disposed)
|
|
30
32
|
throw new DisposedError();
|
|
31
|
-
this._llama =
|
|
32
|
-
this._model =
|
|
33
|
+
this._llama = _model._llama;
|
|
34
|
+
this._model = _model;
|
|
35
|
+
this._backendContextDisposeGuard = new DisposeGuard([this._model._backendModelDisposeGuard]);
|
|
36
|
+
this._modelPreventDisposalHandle = this._model._backendModelDisposeGuard.createPreventDisposalHandle();
|
|
33
37
|
this._totalSequences = Math.max(1, Math.floor(sequences));
|
|
34
38
|
this._contextSize = Math.max(2, contextSize);
|
|
35
39
|
this._batchSize = Math.max(batchSize, this._totalSequences);
|
|
@@ -37,30 +41,36 @@ export class LlamaContext {
|
|
|
37
41
|
seed: seed != null ? Math.max(-1, Math.floor(seed)) : undefined,
|
|
38
42
|
contextSize: this._contextSize * this._totalSequences,
|
|
39
43
|
batchSize: this._batchSize,
|
|
44
|
+
sequences: this._totalSequences,
|
|
40
45
|
threads: Math.max(0, Math.floor(threads)),
|
|
41
|
-
|
|
46
|
+
embeddings: _embeddings,
|
|
42
47
|
noSeed: _noSeed
|
|
43
48
|
}));
|
|
44
49
|
this._batchingOptions = {
|
|
45
50
|
dispatchSchedule: batchingDispatchSchedule,
|
|
46
|
-
|
|
51
|
+
itemPrioritizationStrategy: batchingItemsPrioritizationStrategy
|
|
47
52
|
};
|
|
48
53
|
this._reclaimUnusedSequenceId = this._reclaimUnusedSequenceId.bind(this);
|
|
49
|
-
this._disposeAggregator.add(this._onReclaimUnusedSequenceId);
|
|
50
|
-
this._disposeAggregator.add(this.onDispose.dispatchEvent);
|
|
51
54
|
this._disposeAggregator.add(() => {
|
|
52
|
-
this.
|
|
55
|
+
this._disposed = true;
|
|
53
56
|
});
|
|
57
|
+
this._disposeAggregator.add(this._onReclaimUnusedSequenceId);
|
|
58
|
+
this._disposeAggregator.add(this.onDispose.dispatchEvent);
|
|
54
59
|
this._disposeAggregator.add(this.model.onDispose.createListener(disposeContextIfReferenced.bind(null, new WeakRef(this))));
|
|
60
|
+
this._disposeAggregator.add(async () => {
|
|
61
|
+
await this._backendContextDisposeGuard.acquireDisposeLock();
|
|
62
|
+
await this._ctx.dispose();
|
|
63
|
+
this._modelPreventDisposalHandle.dispose();
|
|
64
|
+
});
|
|
55
65
|
}
|
|
56
|
-
dispose() {
|
|
66
|
+
async dispose() {
|
|
57
67
|
if (this._disposed)
|
|
58
68
|
return;
|
|
59
69
|
this._disposed = true;
|
|
60
|
-
this._disposeAggregator.dispose();
|
|
70
|
+
await this._disposeAggregator.dispose();
|
|
61
71
|
}
|
|
62
72
|
/** @hidden */
|
|
63
|
-
[Symbol.
|
|
73
|
+
[Symbol.asyncDispose]() {
|
|
64
74
|
return this.dispose();
|
|
65
75
|
}
|
|
66
76
|
get disposed() {
|
|
@@ -75,6 +85,14 @@ export class LlamaContext {
|
|
|
75
85
|
get batchSize() {
|
|
76
86
|
return this._batchSize;
|
|
77
87
|
}
|
|
88
|
+
/**
|
|
89
|
+
* The actual size of the state in the memory in bytes.
|
|
90
|
+
* This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
|
|
91
|
+
*/
|
|
92
|
+
get stateSize() {
|
|
93
|
+
this._ensureNotDisposed();
|
|
94
|
+
return this._ctx.getStateSize();
|
|
95
|
+
}
|
|
78
96
|
getAllocatedContextSize() {
|
|
79
97
|
this._ensureNotDisposed();
|
|
80
98
|
if (this._allocatedContextSize == null)
|
|
@@ -92,7 +110,7 @@ export class LlamaContext {
|
|
|
92
110
|
* When there are no sequences left, this method will throw an error.
|
|
93
111
|
* @param [options]
|
|
94
112
|
*/
|
|
95
|
-
getSequence({ contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} } = {}) {
|
|
113
|
+
getSequence({ contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, _tokenMeter } = {}) {
|
|
96
114
|
this._ensureNotDisposed();
|
|
97
115
|
const nextSequenceId = this._popSequenceId();
|
|
98
116
|
if (nextSequenceId == null)
|
|
@@ -100,6 +118,7 @@ export class LlamaContext {
|
|
|
100
118
|
return LlamaContextSequence._create({
|
|
101
119
|
sequenceId: nextSequenceId,
|
|
102
120
|
context: this,
|
|
121
|
+
tokenMeter: _tokenMeter,
|
|
103
122
|
contextShift: {
|
|
104
123
|
size: contextShiftSize,
|
|
105
124
|
strategy: contextShiftStrategy
|
|
@@ -116,17 +135,18 @@ export class LlamaContext {
|
|
|
116
135
|
this._currentDispatchBatchHandle = {};
|
|
117
136
|
this._dispatchDecodeScheduled = false;
|
|
118
137
|
this._batchDispatchPending = false;
|
|
119
|
-
let
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
138
|
+
let shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
|
|
139
|
+
const resolvePrioritizationStrategy = () => {
|
|
140
|
+
try {
|
|
141
|
+
this._ensureNotDisposed();
|
|
142
|
+
return resolveBatchItemsPrioritizationStrategy(this._batchingOptions.itemPrioritizationStrategy);
|
|
143
|
+
}
|
|
144
|
+
catch (err) {
|
|
145
|
+
this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
|
|
146
|
+
}
|
|
147
|
+
return null;
|
|
148
|
+
};
|
|
149
|
+
const getOrderedQueuedDecodes = (prioritizationStrategy) => {
|
|
130
150
|
const batchItemToQueuedDecodeMap = new Map();
|
|
131
151
|
const batchItemsList = [];
|
|
132
152
|
for (const queuedDecode of this._queuedDecodes) {
|
|
@@ -139,42 +159,65 @@ export class LlamaContext {
|
|
|
139
159
|
}
|
|
140
160
|
let prioritizedItems;
|
|
141
161
|
try {
|
|
142
|
-
prioritizedItems =
|
|
162
|
+
prioritizedItems = prioritizationStrategy({
|
|
143
163
|
items: batchItemsList,
|
|
144
164
|
size: this._batchSize
|
|
145
165
|
});
|
|
146
166
|
}
|
|
147
167
|
catch (err) {
|
|
148
168
|
this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
|
|
149
|
-
return;
|
|
169
|
+
return null;
|
|
150
170
|
}
|
|
151
|
-
|
|
152
|
-
const afterDecodeActions = [];
|
|
153
|
-
const queuedDecodesToDelete = new Set();
|
|
154
|
-
const currentQueuedDecodeItems = new Set();
|
|
155
|
-
const currentBatchItems = [];
|
|
156
|
-
let currentBatchSize = 0;
|
|
157
|
-
for (const prioritizedItem of prioritizedItems) {
|
|
171
|
+
return prioritizedItems.map((prioritizedItem) => {
|
|
158
172
|
const queuedDecode = batchItemToQueuedDecodeMap.get(prioritizedItem.item);
|
|
159
173
|
if (queuedDecode == null)
|
|
160
174
|
throw new Error("Received invalid batch item. Make sure you keep the original object reference " +
|
|
161
175
|
"of the batch item on `item` on `PrioritizedBatchItem` in your custom prioritization strategy");
|
|
162
|
-
|
|
163
|
-
|
|
176
|
+
return {
|
|
177
|
+
queuedDecode,
|
|
178
|
+
processAmount: prioritizedItem.processAmount
|
|
179
|
+
};
|
|
180
|
+
});
|
|
181
|
+
};
|
|
182
|
+
const fitQueuedDecodesToABatch = (queuedDecodes, batchSize) => {
|
|
183
|
+
const currentBatchItems = [];
|
|
184
|
+
let currentBatchSize = 0;
|
|
185
|
+
let batchTokenSlotsLeft = batchSize;
|
|
186
|
+
for (const { queuedDecode, processAmount } of queuedDecodes) {
|
|
187
|
+
const resolvedProcessAmount = Math.min(processAmount <= 0 ? 1 : processAmount, queuedDecode.tokens.length, batchTokenSlotsLeft);
|
|
188
|
+
if (resolvedProcessAmount <= 0) {
|
|
189
|
+
if (batchTokenSlotsLeft === 0)
|
|
190
|
+
break;
|
|
164
191
|
continue;
|
|
165
|
-
|
|
192
|
+
}
|
|
193
|
+
batchTokenSlotsLeft -= resolvedProcessAmount;
|
|
194
|
+
currentBatchSize += resolvedProcessAmount;
|
|
166
195
|
currentBatchItems.push({
|
|
167
196
|
queuedDecode,
|
|
168
|
-
processAmount
|
|
197
|
+
processAmount: resolvedProcessAmount
|
|
169
198
|
});
|
|
170
|
-
currentBatchSize += processAmount;
|
|
171
199
|
}
|
|
200
|
+
return {
|
|
201
|
+
currentBatchItems,
|
|
202
|
+
currentBatchSize
|
|
203
|
+
};
|
|
204
|
+
};
|
|
205
|
+
const decodeTokenBatchItems = async (batchItems, currentBatchSize) => {
|
|
206
|
+
const afterDecodeActions = [];
|
|
207
|
+
const queuedDecodesToDelete = new Set();
|
|
208
|
+
const currentQueuedDecodeItems = new Set();
|
|
172
209
|
if (currentBatchSize !== 0)
|
|
173
210
|
this._ctx.initBatch(currentBatchSize);
|
|
174
|
-
for (const { queuedDecode, processAmount } of
|
|
211
|
+
for (const { queuedDecode, processAmount } of batchItems) {
|
|
175
212
|
let batchLogitIndex;
|
|
176
213
|
try {
|
|
177
|
-
|
|
214
|
+
const shouldGenerateLogitAtTheEnd = queuedDecode.generateLogitAtTheEnd &&
|
|
215
|
+
processAmount === queuedDecode.tokens.length;
|
|
216
|
+
const tokensToProcess = queuedDecode.tokens.slice(0, processAmount);
|
|
217
|
+
const numberOfOutputTokens = shouldGenerateLogitAtTheEnd ? 1 : 0;
|
|
218
|
+
TokenMeter.useTokens(queuedDecode.tokenMeter, Math.max(0, tokensToProcess.length - numberOfOutputTokens), "input");
|
|
219
|
+
TokenMeter.useTokens(queuedDecode.tokenMeter, numberOfOutputTokens, "output");
|
|
220
|
+
batchLogitIndex = this._ctx.addToBatch(queuedDecode.sequenceId, queuedDecode.firstTokenSequenceIndex, Uint32Array.from(tokensToProcess), shouldGenerateLogitAtTheEnd);
|
|
178
221
|
}
|
|
179
222
|
catch (err) {
|
|
180
223
|
this._dispatchErrorForQueuedDecodesAndDequeue(new Set([queuedDecode]), err);
|
|
@@ -193,8 +236,6 @@ export class LlamaContext {
|
|
|
193
236
|
queuedDecode.tokens = queuedDecode.tokens.slice(processAmount);
|
|
194
237
|
queuedDecode.firstTokenSequenceIndex += processAmount;
|
|
195
238
|
}
|
|
196
|
-
if (batchTokenSlotsLeft === 0)
|
|
197
|
-
break;
|
|
198
239
|
}
|
|
199
240
|
for (let i = 0; i < this._queuedDecodes.length; i++) {
|
|
200
241
|
const queuedDecode = this._queuedDecodes[i];
|
|
@@ -204,7 +245,6 @@ export class LlamaContext {
|
|
|
204
245
|
i--;
|
|
205
246
|
}
|
|
206
247
|
}
|
|
207
|
-
shouldHaveAnotherBatch = this._queuedDecodes.length > 0;
|
|
208
248
|
try {
|
|
209
249
|
if (currentBatchSize !== 0)
|
|
210
250
|
await this._ctx.decodeBatch();
|
|
@@ -225,15 +265,45 @@ export class LlamaContext {
|
|
|
225
265
|
}
|
|
226
266
|
accept(undefined);
|
|
227
267
|
}
|
|
268
|
+
};
|
|
269
|
+
const prioritizationStrategy = resolvePrioritizationStrategy();
|
|
270
|
+
if (prioritizationStrategy == null)
|
|
271
|
+
return; // all queued items are rejected and dequeued when we get here
|
|
272
|
+
while (shouldHaveAnotherLoop) {
|
|
273
|
+
const orderedQueuedDecodes = getOrderedQueuedDecodes(prioritizationStrategy);
|
|
274
|
+
if (orderedQueuedDecodes == null)
|
|
275
|
+
return; // all queued items are rejected and dequeued when we get here
|
|
276
|
+
const { currentBatchItems, currentBatchSize } = fitQueuedDecodesToABatch(orderedQueuedDecodes, this._batchSize);
|
|
277
|
+
let preventDisposalHandle;
|
|
278
|
+
try {
|
|
279
|
+
preventDisposalHandle = this._backendContextDisposeGuard.createPreventDisposalHandle();
|
|
280
|
+
}
|
|
281
|
+
catch (err) {
|
|
282
|
+
this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
|
|
283
|
+
return;
|
|
284
|
+
}
|
|
285
|
+
try {
|
|
286
|
+
await decodeTokenBatchItems(currentBatchItems, currentBatchSize);
|
|
287
|
+
shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
|
|
288
|
+
}
|
|
289
|
+
finally {
|
|
290
|
+
preventDisposalHandle.dispose();
|
|
291
|
+
}
|
|
228
292
|
}
|
|
229
293
|
});
|
|
230
294
|
}
|
|
295
|
+
/**
|
|
296
|
+
* Print the timings of token evaluation since that last print for this context.
|
|
297
|
+
* > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
|
|
298
|
+
* it won't print anything.
|
|
299
|
+
*/
|
|
231
300
|
async printTimings() {
|
|
301
|
+
this._ensureNotDisposed();
|
|
232
302
|
this._ctx.printTimings();
|
|
233
303
|
await new Promise((accept) => setTimeout(accept, 0)); // wait for the logs to finish printing
|
|
234
304
|
}
|
|
235
305
|
/** @internal */
|
|
236
|
-
async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, generateLogitAtTheEnd = false, evaluationPriority = 5 }, onDone) {
|
|
306
|
+
async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, generateLogitAtTheEnd = false, evaluationPriority = 5, tokenMeter }, onDone) {
|
|
237
307
|
return await new Promise((accept, reject) => {
|
|
238
308
|
this._queuedDecodes.push({
|
|
239
309
|
sequenceId,
|
|
@@ -241,6 +311,7 @@ export class LlamaContext {
|
|
|
241
311
|
firstTokenSequenceIndex,
|
|
242
312
|
generateLogitAtTheEnd,
|
|
243
313
|
evaluationPriority,
|
|
314
|
+
tokenMeter,
|
|
244
315
|
response: [accept, reject],
|
|
245
316
|
onDone
|
|
246
317
|
});
|
|
@@ -253,6 +324,8 @@ export class LlamaContext {
|
|
|
253
324
|
if (this._disposed)
|
|
254
325
|
return;
|
|
255
326
|
void withLock(this, "context", async () => {
|
|
327
|
+
if (this._disposed)
|
|
328
|
+
return;
|
|
256
329
|
this._ctx.disposeSequence(sequenceId);
|
|
257
330
|
this._unusedSequenceIds.push(sequenceId);
|
|
258
331
|
this._onReclaimUnusedSequenceId.dispatchEvent();
|
|
@@ -312,20 +385,65 @@ export class LlamaContext {
|
|
|
312
385
|
if (this._disposed)
|
|
313
386
|
throw new DisposedError();
|
|
314
387
|
}
|
|
388
|
+
/** @internal */
|
|
389
|
+
static async _create(options, { _model }) {
|
|
390
|
+
const sequences = options.sequences ?? getDefaultContextSequences();
|
|
391
|
+
const contextSize = resolveContextContextSizeOption({
|
|
392
|
+
contextSize: options.contextSize,
|
|
393
|
+
batchSize: options.batchSize,
|
|
394
|
+
sequences: sequences,
|
|
395
|
+
modelFileInsights: _model.fileInsights,
|
|
396
|
+
modelGpuLayers: _model.gpuLayers,
|
|
397
|
+
modelTrainContextSize: _model.trainContextSize,
|
|
398
|
+
getVramState: () => _model._llama._vramOrchestrator.getMemoryState(),
|
|
399
|
+
llamaGpu: _model._llama.gpu,
|
|
400
|
+
ignoreMemorySafetyChecks: options.ignoreMemorySafetyChecks,
|
|
401
|
+
isEmbeddingContext: options._embeddings
|
|
402
|
+
});
|
|
403
|
+
const batchSize = options.batchSize ?? getDefaultContextBatchSize({ contextSize, sequences });
|
|
404
|
+
const vramRequiredEstimate = _model.fileInsights.estimateContextResourceRequirements({
|
|
405
|
+
contextSize,
|
|
406
|
+
sequences,
|
|
407
|
+
isEmbeddingContext: options._embeddings,
|
|
408
|
+
modelGpuLayers: _model.gpuLayers,
|
|
409
|
+
batchSize
|
|
410
|
+
}).gpuVram;
|
|
411
|
+
const context = new LlamaContext({ _model }, { ...options, contextSize, batchSize, sequences });
|
|
412
|
+
const { createSignal } = options;
|
|
413
|
+
const contextCreationMemoryReservation = options.ignoreMemorySafetyChecks
|
|
414
|
+
? null
|
|
415
|
+
: _model._llama._vramOrchestrator.reserveMemory(vramRequiredEstimate);
|
|
416
|
+
try {
|
|
417
|
+
const contextLoaded = await context._ctx.init();
|
|
418
|
+
if (createSignal?.aborted) {
|
|
419
|
+
if (contextLoaded)
|
|
420
|
+
await context._ctx.dispose();
|
|
421
|
+
throw createSignal.reason;
|
|
422
|
+
}
|
|
423
|
+
else if (!contextLoaded)
|
|
424
|
+
throw new Error("Failed to create context");
|
|
425
|
+
return context;
|
|
426
|
+
}
|
|
427
|
+
finally {
|
|
428
|
+
contextCreationMemoryReservation?.dispose?.();
|
|
429
|
+
}
|
|
430
|
+
}
|
|
315
431
|
}
|
|
316
432
|
export class LlamaContextSequence {
|
|
317
433
|
/** @internal */ _sequenceId;
|
|
318
434
|
/** @internal */ _gcRegistry;
|
|
319
435
|
/** @internal */ _context;
|
|
320
436
|
/** @internal */ _contextShift;
|
|
437
|
+
/** @internal */ _tokenMeter;
|
|
321
438
|
/** @internal */ _disposeAggregator = new DisposeAggregator();
|
|
322
439
|
/** @internal */ _contextTokens = [];
|
|
323
440
|
/** @internal */ _nextTokenIndex = 0;
|
|
324
441
|
/** @internal */ _disposed = false;
|
|
325
442
|
onDispose = new EventRelay();
|
|
326
|
-
constructor({ sequenceId, context, contextShift }) {
|
|
443
|
+
constructor({ sequenceId, context, tokenMeter, contextShift }) {
|
|
327
444
|
this._sequenceId = sequenceId;
|
|
328
445
|
this._context = context;
|
|
446
|
+
this._tokenMeter = tokenMeter ?? new TokenMeter();
|
|
329
447
|
this._contextShift = contextShift;
|
|
330
448
|
this._gcRegistry = new FinalizationRegistry(this._context._reclaimUnusedSequenceId);
|
|
331
449
|
this._gcRegistry.register(this, sequenceId);
|
|
@@ -362,6 +480,9 @@ export class LlamaContextSequence {
|
|
|
362
480
|
get contextTokens() {
|
|
363
481
|
return this._contextTokens.slice();
|
|
364
482
|
}
|
|
483
|
+
get tokenMeter() {
|
|
484
|
+
return this._tokenMeter;
|
|
485
|
+
}
|
|
365
486
|
get isLoadedToMemory() {
|
|
366
487
|
return !this._disposed;
|
|
367
488
|
}
|
|
@@ -387,7 +508,7 @@ export class LlamaContextSequence {
|
|
|
387
508
|
}
|
|
388
509
|
/**
|
|
389
510
|
* Erase context tokens in the provided ranges to free up space for new tokens to be generated.
|
|
390
|
-
*
|
|
511
|
+
* The start of each range is inclusive, and the end of each range is exclusive.
|
|
391
512
|
* For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
|
|
392
513
|
*/
|
|
393
514
|
async eraseContextTokenRanges(ranges) {
|
|
@@ -396,6 +517,8 @@ export class LlamaContextSequence {
|
|
|
396
517
|
this._ensureNotDisposed();
|
|
397
518
|
if (ranges.length === 0)
|
|
398
519
|
return;
|
|
520
|
+
// if the deletion fails, we'll have to dispose the sequence and fill it up again
|
|
521
|
+
let deletionSuccessful = true;
|
|
399
522
|
const resolvedRanges = ranges
|
|
400
523
|
.map(({ start, end }) => {
|
|
401
524
|
if (start === end)
|
|
@@ -425,15 +548,22 @@ export class LlamaContextSequence {
|
|
|
425
548
|
let lastDeleteRangeEndPos = null;
|
|
426
549
|
for (const range of resolvedRanges) {
|
|
427
550
|
this._contextTokens.splice(range.start - removedTokens, range.end - range.start);
|
|
428
|
-
|
|
429
|
-
|
|
551
|
+
if (deletionSuccessful)
|
|
552
|
+
deletionSuccessful &&= this._context._ctx.removeTokenCellsFromSequence(this._sequenceId, range.start, range.end);
|
|
553
|
+
if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== range.start)
|
|
430
554
|
this._context._ctx.shiftSequenceTokenCells(this._sequenceId, lastDeleteRangeEndPos, range.start, -removedTokens);
|
|
431
555
|
removedTokens += range.end - range.start;
|
|
432
556
|
lastDeleteRangeEndPos = range.end;
|
|
433
557
|
}
|
|
434
|
-
if (lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== this._nextTokenIndex)
|
|
558
|
+
if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== this._nextTokenIndex)
|
|
435
559
|
this._context._ctx.shiftSequenceTokenCells(this._sequenceId, lastDeleteRangeEndPos, this._nextTokenIndex, -removedTokens);
|
|
436
560
|
this._nextTokenIndex -= removedTokens;
|
|
561
|
+
if (deletionSuccessful)
|
|
562
|
+
return;
|
|
563
|
+
const newSequenceTokens = this._contextTokens.slice();
|
|
564
|
+
this._nextTokenIndex = 0;
|
|
565
|
+
this._context._ctx.disposeSequence(this._sequenceId);
|
|
566
|
+
await this.evaluateWithoutGeneratingNewTokens(newSequenceTokens);
|
|
437
567
|
});
|
|
438
568
|
}
|
|
439
569
|
/**
|
|
@@ -485,7 +615,7 @@ export class LlamaContextSequence {
|
|
|
485
615
|
while (true) {
|
|
486
616
|
this._ensureNotDisposed();
|
|
487
617
|
// Evaluate to get the next token.
|
|
488
|
-
const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, contextShiftOptions, (batchLogitIndex) => {
|
|
618
|
+
const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
|
|
489
619
|
const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
|
|
490
620
|
? repeatPenalty.punishTokens()
|
|
491
621
|
: repeatPenalty?.punishTokens;
|
|
@@ -519,7 +649,7 @@ export class LlamaContextSequence {
|
|
|
519
649
|
}
|
|
520
650
|
}
|
|
521
651
|
/** @internal */
|
|
522
|
-
async _decodeTokens(tokens, generateLogit, evaluationPriority, contextShiftOptions, onDecodeDone) {
|
|
652
|
+
async _decodeTokens(tokens, generateLogit, evaluationPriority, tokenMeter, contextShiftOptions, onDecodeDone) {
|
|
523
653
|
this._ensureNotDisposed();
|
|
524
654
|
const tokensLeftToDecode = tokens.slice();
|
|
525
655
|
return await withLock(this, "evaluate", async () => {
|
|
@@ -539,7 +669,8 @@ export class LlamaContextSequence {
|
|
|
539
669
|
tokens: tokensToDecode,
|
|
540
670
|
firstTokenSequenceIndex: this._nextTokenIndex,
|
|
541
671
|
generateLogitAtTheEnd,
|
|
542
|
-
evaluationPriority
|
|
672
|
+
evaluationPriority,
|
|
673
|
+
tokenMeter
|
|
543
674
|
}, !generateLogitAtTheEnd
|
|
544
675
|
? undefined
|
|
545
676
|
: onDecodeDone);
|
|
@@ -586,10 +717,11 @@ export class LlamaContextSequence {
|
|
|
586
717
|
* We need this to make it impossible to manually create instances of this class outside the code of this library
|
|
587
718
|
* @internal
|
|
588
719
|
*/
|
|
589
|
-
static _create({ sequenceId, context, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} }) {
|
|
720
|
+
static _create({ sequenceId, context, tokenMeter, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} }) {
|
|
590
721
|
return new LlamaContextSequence({
|
|
591
722
|
sequenceId,
|
|
592
723
|
context,
|
|
724
|
+
tokenMeter,
|
|
593
725
|
contextShift: {
|
|
594
726
|
size: contextShiftSize,
|
|
595
727
|
strategy: contextShiftStrategy
|
|
@@ -600,11 +732,70 @@ export class LlamaContextSequence {
|
|
|
600
732
|
function disposeContextIfReferenced(contextRef) {
|
|
601
733
|
const context = contextRef.deref();
|
|
602
734
|
if (context != null)
|
|
603
|
-
context.dispose();
|
|
735
|
+
void context.dispose();
|
|
604
736
|
}
|
|
605
737
|
function disposeContextSequenceIfReferenced(contextRef) {
|
|
606
738
|
const context = contextRef.deref();
|
|
607
739
|
if (context != null)
|
|
608
740
|
context.dispose();
|
|
609
741
|
}
|
|
742
|
+
export function resolveContextContextSizeOption({ contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, getVramState, llamaGpu, ignoreMemorySafetyChecks = false, isEmbeddingContext = false }) {
|
|
743
|
+
if (contextSize == null)
|
|
744
|
+
contextSize = "auto";
|
|
745
|
+
if (typeof contextSize === "number") {
|
|
746
|
+
const resolvedContextSize = Math.max(1, Math.floor(contextSize));
|
|
747
|
+
if (ignoreMemorySafetyChecks)
|
|
748
|
+
return resolvedContextSize;
|
|
749
|
+
const vramState = getVramState();
|
|
750
|
+
const contextVram = modelFileInsights.estimateContextResourceRequirements({
|
|
751
|
+
contextSize: resolvedContextSize,
|
|
752
|
+
batchSize: batchSize ?? getDefaultContextBatchSize({ contextSize: resolvedContextSize, sequences }),
|
|
753
|
+
modelGpuLayers: modelGpuLayers,
|
|
754
|
+
sequences,
|
|
755
|
+
isEmbeddingContext
|
|
756
|
+
}).gpuVram;
|
|
757
|
+
if (contextVram > vramState.free)
|
|
758
|
+
throw new Error(`The context size of ${resolvedContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM`);
|
|
759
|
+
return resolvedContextSize;
|
|
760
|
+
}
|
|
761
|
+
else if (contextSize === "auto" || typeof contextSize === "object") {
|
|
762
|
+
if (llamaGpu === false)
|
|
763
|
+
return modelTrainContextSize;
|
|
764
|
+
const vramState = getVramState();
|
|
765
|
+
if (vramState.total === 0)
|
|
766
|
+
return modelTrainContextSize;
|
|
767
|
+
const freeVram = vramState.free;
|
|
768
|
+
const maxContextSize = contextSize === "auto"
|
|
769
|
+
? getDefaultModelContextSize({ trainContextSize: modelTrainContextSize })
|
|
770
|
+
: Math.min(contextSize.max ?? getDefaultModelContextSize({ trainContextSize: modelTrainContextSize }), getDefaultModelContextSize({ trainContextSize: modelTrainContextSize }));
|
|
771
|
+
const minContextSize = contextSize === "auto"
|
|
772
|
+
? minAllowedContextSizeInCalculations
|
|
773
|
+
: Math.max(contextSize.min ?? minAllowedContextSizeInCalculations, minAllowedContextSizeInCalculations);
|
|
774
|
+
for (let testContextSize = maxContextSize; testContextSize >= minContextSize; testContextSize--) {
|
|
775
|
+
const contextVram = modelFileInsights.estimateContextResourceRequirements({
|
|
776
|
+
contextSize: testContextSize,
|
|
777
|
+
batchSize: batchSize ?? getDefaultContextBatchSize({ contextSize: testContextSize, sequences }),
|
|
778
|
+
modelGpuLayers: modelGpuLayers,
|
|
779
|
+
sequences,
|
|
780
|
+
isEmbeddingContext
|
|
781
|
+
}).gpuVram;
|
|
782
|
+
if (contextVram <= freeVram)
|
|
783
|
+
return testContextSize;
|
|
784
|
+
}
|
|
785
|
+
if (ignoreMemorySafetyChecks)
|
|
786
|
+
return minContextSize;
|
|
787
|
+
throw new Error(`The available VRAM is too small to fit the context size of ${maxContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""}`);
|
|
788
|
+
}
|
|
789
|
+
throw new Error(`Invalid context size: "${contextSize}"`);
|
|
790
|
+
}
|
|
791
|
+
export function getDefaultContextBatchSize({ contextSize, sequences }) {
|
|
792
|
+
return Math.min(contextSize * sequences, 512);
|
|
793
|
+
}
|
|
794
|
+
export function getDefaultContextSequences() {
|
|
795
|
+
return 1;
|
|
796
|
+
}
|
|
797
|
+
const defaultFallbackContextSize = 4096;
|
|
798
|
+
export function getDefaultModelContextSize({ trainContextSize }) {
|
|
799
|
+
return trainContextSize ?? defaultFallbackContextSize;
|
|
800
|
+
}
|
|
610
801
|
//# sourceMappingURL=LlamaContext.js.map
|