node-llama-cpp 3.0.0-beta.14 → 3.0.0-beta.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/ChatWrapper.js +4 -0
- package/dist/ChatWrapper.js.map +1 -1
- package/dist/bindings/AddonTypes.d.ts +21 -0
- package/dist/bindings/Llama.d.ts +4 -0
- package/dist/bindings/Llama.js +45 -4
- package/dist/bindings/Llama.js.map +1 -1
- package/dist/bindings/getLlama.d.ts +19 -1
- package/dist/bindings/getLlama.js +15 -5
- package/dist/bindings/getLlama.js.map +1 -1
- package/dist/bindings/types.d.ts +15 -0
- package/dist/bindings/types.js +27 -2
- package/dist/bindings/types.js.map +1 -1
- package/dist/bindings/utils/MemoryOrchestrator.d.ts +21 -0
- package/dist/bindings/utils/MemoryOrchestrator.js +49 -0
- package/dist/bindings/utils/MemoryOrchestrator.js.map +1 -0
- package/dist/bindings/utils/cloneLlamaCppRepo.d.ts +1 -1
- package/dist/bindings/utils/cloneLlamaCppRepo.js +3 -2
- package/dist/bindings/utils/cloneLlamaCppRepo.js.map +1 -1
- package/dist/bindings/utils/compileLLamaCpp.js +2 -2
- package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
- package/dist/bindings/utils/getLlamaWithoutBackend.d.ts +5 -0
- package/dist/bindings/utils/getLlamaWithoutBackend.js +27 -0
- package/dist/bindings/utils/getLlamaWithoutBackend.js.map +1 -0
- package/dist/bindings/utils/resolveCustomCmakeOptions.js +2 -2
- package/dist/bindings/utils/resolveCustomCmakeOptions.js.map +1 -1
- package/dist/chatWrappers/AlpacaChatWrapper.d.ts +2 -1
- package/dist/chatWrappers/AlpacaChatWrapper.js +9 -2
- package/dist/chatWrappers/AlpacaChatWrapper.js.map +1 -1
- package/dist/chatWrappers/ChatMLChatWrapper.js +12 -10
- package/dist/chatWrappers/ChatMLChatWrapper.js.map +1 -1
- package/dist/chatWrappers/FalconChatWrapper.d.ts +2 -1
- package/dist/chatWrappers/FalconChatWrapper.js +28 -11
- package/dist/chatWrappers/FalconChatWrapper.js.map +1 -1
- package/dist/chatWrappers/FunctionaryChatWrapper.js +59 -45
- package/dist/chatWrappers/FunctionaryChatWrapper.js.map +1 -1
- package/dist/chatWrappers/GemmaChatWrapper.js +9 -7
- package/dist/chatWrappers/GemmaChatWrapper.js.map +1 -1
- package/dist/chatWrappers/GeneralChatWrapper.d.ts +2 -1
- package/dist/chatWrappers/GeneralChatWrapper.js +35 -12
- package/dist/chatWrappers/GeneralChatWrapper.js.map +1 -1
- package/dist/chatWrappers/LlamaChatWrapper.d.ts +7 -0
- package/dist/chatWrappers/LlamaChatWrapper.js +26 -8
- package/dist/chatWrappers/LlamaChatWrapper.js.map +1 -1
- package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.d.ts +73 -0
- package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js +355 -0
- package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js.map +1 -0
- package/dist/{TemplateChatWrapper.d.ts → chatWrappers/generic/TemplateChatWrapper.d.ts} +6 -9
- package/dist/{TemplateChatWrapper.js → chatWrappers/generic/TemplateChatWrapper.js} +31 -69
- package/dist/chatWrappers/generic/TemplateChatWrapper.js.map +1 -0
- package/dist/chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.d.ts +33 -0
- package/dist/chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.js +45 -0
- package/dist/chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.js.map +1 -0
- package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.d.ts +4 -0
- package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js +206 -0
- package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js.map +1 -0
- package/dist/chatWrappers/utils/resolveChatWrapper.d.ts +67 -0
- package/dist/chatWrappers/utils/resolveChatWrapper.js +206 -0
- package/dist/chatWrappers/utils/resolveChatWrapper.js.map +1 -0
- package/dist/cli/cli.js +1 -1
- package/dist/cli/cli.js.map +1 -1
- package/dist/cli/commands/ChatCommand.d.ts +7 -4
- package/dist/cli/commands/ChatCommand.js +150 -60
- package/dist/cli/commands/ChatCommand.js.map +1 -1
- package/dist/cli/commands/ClearCommand.d.ts +1 -1
- package/dist/cli/commands/ClearCommand.js +5 -5
- package/dist/cli/commands/ClearCommand.js.map +1 -1
- package/dist/cli/commands/CompleteCommand.d.ts +3 -2
- package/dist/cli/commands/CompleteCommand.js +88 -41
- package/dist/cli/commands/CompleteCommand.js.map +1 -1
- package/dist/cli/commands/InfillCommand.d.ts +3 -2
- package/dist/cli/commands/InfillCommand.js +88 -41
- package/dist/cli/commands/InfillCommand.js.map +1 -1
- package/dist/cli/commands/{InspectCommand.d.ts → inspect/InspectCommand.d.ts} +1 -4
- package/dist/cli/commands/inspect/InspectCommand.js +17 -0
- package/dist/cli/commands/inspect/InspectCommand.js.map +1 -0
- package/dist/cli/commands/inspect/commands/InspectGgufCommand.d.ts +10 -0
- package/dist/cli/commands/inspect/commands/InspectGgufCommand.js +108 -0
- package/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map +1 -0
- package/dist/cli/commands/inspect/commands/InspectGpuCommand.d.ts +4 -0
- package/dist/cli/commands/inspect/commands/InspectGpuCommand.js +98 -0
- package/dist/cli/commands/inspect/commands/InspectGpuCommand.js.map +1 -0
- package/dist/cli/commands/inspect/commands/InspectMeasureCommand.d.ts +14 -0
- package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js +577 -0
- package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map +1 -0
- package/dist/cli/utils/ConsoleTable.d.ts +23 -0
- package/dist/cli/utils/ConsoleTable.js +86 -0
- package/dist/cli/utils/ConsoleTable.js.map +1 -0
- package/dist/cli/utils/printCommonInfoLines.d.ts +9 -0
- package/dist/cli/utils/printCommonInfoLines.js +70 -0
- package/dist/cli/utils/printCommonInfoLines.js.map +1 -0
- package/dist/cli/utils/printInfoLine.d.ts +10 -0
- package/dist/cli/utils/printInfoLine.js +45 -0
- package/dist/cli/utils/printInfoLine.js.map +1 -0
- package/dist/cli/utils/resolveCommandGgufPath.d.ts +1 -0
- package/dist/cli/utils/resolveCommandGgufPath.js +6 -0
- package/dist/cli/utils/resolveCommandGgufPath.js.map +1 -0
- package/dist/config.d.ts +2 -0
- package/dist/config.js +6 -0
- package/dist/config.js.map +1 -1
- package/dist/evaluator/LlamaChat/LlamaChat.js +13 -5
- package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
- package/dist/evaluator/LlamaCompletion.js +5 -3
- package/dist/evaluator/LlamaCompletion.js.map +1 -1
- package/dist/evaluator/LlamaContext/LlamaContext.d.ts +40 -3
- package/dist/evaluator/LlamaContext/LlamaContext.js +245 -100
- package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
- package/dist/evaluator/LlamaContext/types.d.ts +57 -6
- package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js.map +1 -0
- package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js.map +1 -0
- package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.d.ts +2 -0
- package/dist/evaluator/LlamaContext/utils/{resolveBatchItemsPrioritizingStrategy.js → resolveBatchItemsPrioritizationStrategy.js} +4 -4
- package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js.map +1 -0
- package/dist/evaluator/LlamaEmbeddingContext.d.ts +23 -2
- package/dist/evaluator/LlamaEmbeddingContext.js +4 -5
- package/dist/evaluator/LlamaEmbeddingContext.js.map +1 -1
- package/dist/evaluator/LlamaModel.d.ts +64 -6
- package/dist/evaluator/LlamaModel.js +297 -8
- package/dist/evaluator/LlamaModel.js.map +1 -1
- package/dist/evaluator/TokenMeter.d.ts +54 -0
- package/dist/evaluator/TokenMeter.js +86 -0
- package/dist/evaluator/TokenMeter.js.map +1 -0
- package/dist/gguf/GgufInsights.d.ts +40 -0
- package/dist/gguf/GgufInsights.js +350 -0
- package/dist/gguf/GgufInsights.js.map +1 -0
- package/dist/gguf/consts.d.ts +3 -0
- package/dist/gguf/consts.js +8 -0
- package/dist/gguf/consts.js.map +1 -0
- package/dist/gguf/errors/InvalidGgufMagicError.d.ts +3 -0
- package/dist/gguf/errors/InvalidGgufMagicError.js +6 -0
- package/dist/gguf/errors/InvalidGgufMagicError.js.map +1 -0
- package/dist/gguf/errors/UnsupportedGgufValueTypeError.d.ts +4 -0
- package/dist/gguf/errors/UnsupportedGgufValueTypeError.js +9 -0
- package/dist/gguf/errors/UnsupportedGgufValueTypeError.js.map +1 -0
- package/dist/gguf/fileReaders/GgufFileReader.d.ts +33 -0
- package/dist/gguf/fileReaders/GgufFileReader.js +76 -0
- package/dist/gguf/fileReaders/GgufFileReader.js.map +1 -0
- package/dist/gguf/fileReaders/GgufFsFileReader.d.ts +17 -0
- package/dist/gguf/fileReaders/GgufFsFileReader.js +45 -0
- package/dist/gguf/fileReaders/GgufFsFileReader.js.map +1 -0
- package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.d.ts +22 -0
- package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.js +63 -0
- package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.js.map +1 -0
- package/dist/gguf/parser/GgufV2Parser.d.ts +19 -0
- package/dist/gguf/parser/GgufV2Parser.js +115 -0
- package/dist/gguf/parser/GgufV2Parser.js.map +1 -0
- package/dist/gguf/parser/GgufV3Parser.d.ts +3 -0
- package/dist/gguf/parser/GgufV3Parser.js +4 -0
- package/dist/gguf/parser/GgufV3Parser.js.map +1 -0
- package/dist/gguf/parser/parseGguf.d.ts +8 -0
- package/dist/gguf/parser/parseGguf.js +58 -0
- package/dist/gguf/parser/parseGguf.js.map +1 -0
- package/dist/gguf/readGgufFileInfo.d.ts +30 -0
- package/dist/gguf/readGgufFileInfo.js +37 -0
- package/dist/gguf/readGgufFileInfo.js.map +1 -0
- package/dist/gguf/types/GgufFileInfoTypes.d.ts +52 -0
- package/dist/gguf/types/GgufFileInfoTypes.js +18 -0
- package/dist/gguf/types/GgufFileInfoTypes.js.map +1 -0
- package/dist/gguf/types/GgufMetadataTypes.d.ts +330 -0
- package/dist/gguf/types/GgufMetadataTypes.js +86 -0
- package/dist/gguf/types/GgufMetadataTypes.js.map +1 -0
- package/dist/gguf/types/GgufTensorInfoTypes.d.ts +37 -0
- package/dist/gguf/types/GgufTensorInfoTypes.js +33 -0
- package/dist/gguf/types/GgufTensorInfoTypes.js.map +1 -0
- package/dist/gguf/utils/GgufReadOffset.d.ts +6 -0
- package/dist/gguf/utils/GgufReadOffset.js +18 -0
- package/dist/gguf/utils/GgufReadOffset.js.map +1 -0
- package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.d.ts +5 -0
- package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.js +38 -0
- package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.js.map +1 -0
- package/dist/gguf/utils/getGgufFileTypeName.d.ts +4 -0
- package/dist/gguf/utils/getGgufFileTypeName.js +13 -0
- package/dist/gguf/utils/getGgufFileTypeName.js.map +1 -0
- package/dist/gguf/utils/getGgufMetadataArchitectureData.d.ts +3 -0
- package/dist/gguf/utils/getGgufMetadataArchitectureData.js +4 -0
- package/dist/gguf/utils/getGgufMetadataArchitectureData.js.map +1 -0
- package/dist/gguf/utils/normalizeGgufDownloadUrl.d.ts +1 -0
- package/dist/gguf/utils/normalizeGgufDownloadUrl.js +16 -0
- package/dist/gguf/utils/normalizeGgufDownloadUrl.js.map +1 -0
- package/dist/index.d.ts +13 -7
- package/dist/index.js +11 -6
- package/dist/index.js.map +1 -1
- package/dist/types.d.ts +1 -1
- package/dist/utils/InsufficientMemoryError.d.ts +3 -0
- package/dist/utils/InsufficientMemoryError.js +6 -0
- package/dist/utils/InsufficientMemoryError.js.map +1 -0
- package/dist/utils/LlamaText.d.ts +25 -10
- package/dist/utils/LlamaText.js +205 -23
- package/dist/utils/LlamaText.js.map +1 -1
- package/dist/utils/StopGenerationDetector.js +3 -1
- package/dist/utils/StopGenerationDetector.js.map +1 -1
- package/dist/utils/findBestOption.d.ts +4 -0
- package/dist/utils/findBestOption.js +15 -0
- package/dist/utils/findBestOption.js.map +1 -0
- package/dist/utils/getQueuedTokensBeforeStopTrigger.js +3 -3
- package/dist/utils/getQueuedTokensBeforeStopTrigger.js.map +1 -1
- package/dist/utils/gitReleaseBundles.js +68 -1
- package/dist/utils/gitReleaseBundles.js.map +1 -1
- package/dist/utils/mergeUnionTypes.d.ts +4 -0
- package/dist/utils/parseModelFileName.d.ts +1 -0
- package/dist/utils/parseModelFileName.js +6 -1
- package/dist/utils/parseModelFileName.js.map +1 -1
- package/dist/utils/prettyPrintObject.d.ts +10 -1
- package/dist/utils/prettyPrintObject.js +57 -13
- package/dist/utils/prettyPrintObject.js.map +1 -1
- package/dist/utils/spawnCommand.js.map +1 -1
- package/dist/utils/tokenizeInput.d.ts +1 -1
- package/dist/utils/tokenizeInput.js +3 -3
- package/dist/utils/tokenizeInput.js.map +1 -1
- package/dist/utils/withOra.d.ts +1 -0
- package/dist/utils/withOra.js +2 -2
- package/dist/utils/withOra.js.map +1 -1
- package/llama/CMakeLists.txt +5 -5
- package/llama/addon.cpp +117 -5
- package/llama/binariesGithubRelease.json +1 -1
- package/llama/gitRelease.bundle +0 -0
- package/llama/gpuInfo/cuda-gpu-info.cu +21 -0
- package/llama/gpuInfo/cuda-gpu-info.h +3 -0
- package/llama/gpuInfo/metal-gpu-info.h +4 -1
- package/llama/gpuInfo/metal-gpu-info.mm +14 -1
- package/llama/gpuInfo/vulkan-gpu-info.cpp +20 -2
- package/llama/gpuInfo/vulkan-gpu-info.h +2 -0
- package/llama/llama.cpp.info.json +1 -1
- package/llama/toolchains/win32.host-x64.target-arm64.cmake +41 -0
- package/llamaBins/linux-arm64/_nlcBuildMetadata.json +1 -1
- package/llamaBins/linux-arm64/llama-addon.node +0 -0
- package/llamaBins/linux-armv7l/_nlcBuildMetadata.json +1 -1
- package/llamaBins/linux-armv7l/llama-addon.node +0 -0
- package/llamaBins/linux-x64/_nlcBuildMetadata.json +1 -1
- package/llamaBins/linux-x64/llama-addon.node +0 -0
- package/llamaBins/linux-x64-cuda/_nlcBuildMetadata.json +1 -1
- package/llamaBins/linux-x64-cuda/llama-addon.node +0 -0
- package/llamaBins/linux-x64-vulkan/_nlcBuildMetadata.json +1 -1
- package/llamaBins/linux-x64-vulkan/llama-addon.node +0 -0
- package/llamaBins/mac-arm64-metal/_nlcBuildMetadata.json +1 -1
- package/llamaBins/mac-arm64-metal/default.metallib +0 -0
- package/llamaBins/mac-arm64-metal/llama-addon.node +0 -0
- package/llamaBins/mac-x64/_nlcBuildMetadata.json +1 -1
- package/llamaBins/mac-x64/llama-addon.node +0 -0
- package/llamaBins/win-arm64/_nlcBuildMetadata.json +1 -0
- package/llamaBins/win-arm64/llama-addon.exp +0 -0
- package/llamaBins/win-arm64/llama-addon.lib +0 -0
- package/llamaBins/win-arm64/llama-addon.node +0 -0
- package/llamaBins/win-x64/_nlcBuildMetadata.json +1 -1
- package/llamaBins/win-x64/llama-addon.node +0 -0
- package/llamaBins/win-x64-cuda/_nlcBuildMetadata.json +1 -1
- package/llamaBins/win-x64-cuda/llama-addon.node +0 -0
- package/llamaBins/win-x64-vulkan/_nlcBuildMetadata.json +1 -1
- package/llamaBins/win-x64-vulkan/llama-addon.node +0 -0
- package/package.json +8 -6
- package/dist/TemplateChatWrapper.js.map +0 -1
- package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.d.ts +0 -33
- package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.js +0 -49
- package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.js.map +0 -1
- package/dist/chatWrappers/resolveChatWrapperBasedOnModel.d.ts +0 -13
- package/dist/chatWrappers/resolveChatWrapperBasedOnModel.js +0 -63
- package/dist/chatWrappers/resolveChatWrapperBasedOnModel.js.map +0 -1
- package/dist/cli/commands/InspectCommand.js +0 -113
- package/dist/cli/commands/InspectCommand.js.map +0 -1
- package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizingStrategies/firstInFirstOutStrategy.js.map +0 -1
- package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizingStrategies/maximumParallelismStrategy.js.map +0 -1
- package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizingStrategy.d.ts +0 -2
- package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizingStrategy.js.map +0 -1
- package/dist/gguf/GGUFInsights.d.ts +0 -28
- package/dist/gguf/GGUFInsights.js +0 -58
- package/dist/gguf/GGUFInsights.js.map +0 -1
- package/dist/gguf/GGUFMetadata.d.ts +0 -19
- package/dist/gguf/GGUFMetadata.js +0 -38
- package/dist/gguf/GGUFMetadata.js.map +0 -1
- package/dist/gguf/errors/InvalidGGUFMagicError.d.ts +0 -3
- package/dist/gguf/errors/InvalidGGUFMagicError.js +0 -6
- package/dist/gguf/errors/InvalidGGUFMagicError.js.map +0 -1
- package/dist/gguf/errors/MetadataNotParsedYetError.d.ts +0 -3
- package/dist/gguf/errors/MetadataNotParsedYetError.js +0 -6
- package/dist/gguf/errors/MetadataNotParsedYetError.js.map +0 -1
- package/dist/gguf/errors/MissingNodeLlamaError.d.ts +0 -3
- package/dist/gguf/errors/MissingNodeLlamaError.js +0 -6
- package/dist/gguf/errors/MissingNodeLlamaError.js.map +0 -1
- package/dist/gguf/errors/ModelScore/NotEnoughVRamError.d.ts +0 -5
- package/dist/gguf/errors/ModelScore/NotEnoughVRamError.js +0 -11
- package/dist/gguf/errors/ModelScore/NotEnoughVRamError.js.map +0 -1
- package/dist/gguf/errors/UnsupportedMetadataTypeError.d.ts +0 -4
- package/dist/gguf/errors/UnsupportedMetadataTypeError.js +0 -8
- package/dist/gguf/errors/UnsupportedMetadataTypeError.js.map +0 -1
- package/dist/gguf/ggufParser/GGUFParser.d.ts +0 -18
- package/dist/gguf/ggufParser/GGUFParser.js +0 -123
- package/dist/gguf/ggufParser/GGUFParser.js.map +0 -1
- package/dist/gguf/ggufParser/GGUFTypes.d.ts +0 -257
- package/dist/gguf/ggufParser/GGUFTypes.js +0 -2
- package/dist/gguf/ggufParser/GGUFTypes.js.map +0 -1
- package/dist/gguf/ggufParser/checkArchitecture.d.ts +0 -14
- package/dist/gguf/ggufParser/checkArchitecture.js +0 -74
- package/dist/gguf/ggufParser/checkArchitecture.js.map +0 -1
- package/dist/gguf/ggufParser/stream/GGUFBaseStream.d.ts +0 -38
- package/dist/gguf/ggufParser/stream/GGUFBaseStream.js +0 -83
- package/dist/gguf/ggufParser/stream/GGUFBaseStream.js.map +0 -1
- package/dist/gguf/ggufParser/stream/GGUFFetchStream.d.ts +0 -14
- package/dist/gguf/ggufParser/stream/GGUFFetchStream.js +0 -35
- package/dist/gguf/ggufParser/stream/GGUFFetchStream.js.map +0 -1
- package/dist/gguf/ggufParser/stream/GGUFReadStream.d.ts +0 -15
- package/dist/gguf/ggufParser/stream/GGUFReadStream.js +0 -40
- package/dist/gguf/ggufParser/stream/GGUFReadStream.js.map +0 -1
- package/dist/utils/parseModelTypeDescription.d.ts +0 -6
- package/dist/utils/parseModelTypeDescription.js +0 -9
- package/dist/utils/parseModelTypeDescription.js.map +0 -1
- package/dist/utils/resolveChatWrapper.d.ts +0 -4
- package/dist/utils/resolveChatWrapper.js +0 -16
- package/dist/utils/resolveChatWrapper.js.map +0 -1
- /package/dist/evaluator/LlamaContext/utils/{batchItemsPrioritizingStrategies → batchItemsPrioritizationStrategies}/firstInFirstOutStrategy.d.ts +0 -0
- /package/dist/evaluator/LlamaContext/utils/{batchItemsPrioritizingStrategies → batchItemsPrioritizationStrategies}/firstInFirstOutStrategy.js +0 -0
- /package/dist/evaluator/LlamaContext/utils/{batchItemsPrioritizingStrategies → batchItemsPrioritizationStrategies}/maximumParallelismStrategy.d.ts +0 -0
- /package/dist/evaluator/LlamaContext/utils/{batchItemsPrioritizingStrategies → batchItemsPrioritizationStrategies}/maximumParallelismStrategy.js +0 -0
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
import { EventRelay } from "lifecycle-utils";
|
|
2
2
|
import { Token } from "../../types.js";
|
|
3
3
|
import { LlamaGrammarEvaluationState } from "../LlamaGrammarEvaluationState.js";
|
|
4
|
-
import {
|
|
4
|
+
import { GgufInsights } from "../../gguf/GgufInsights.js";
|
|
5
|
+
import { TokenMeter } from "../TokenMeter.js";
|
|
6
|
+
import { BuildGpu } from "../../bindings/types.js";
|
|
7
|
+
import { ContextShiftOptions, ContextTokensDeleteRange, EvaluationPriority, LlamaContextOptions, LlamaContextSequenceRepeatPenalty } from "./types.js";
|
|
5
8
|
import type { LlamaModel } from "../LlamaModel.js";
|
|
6
9
|
export declare class LlamaContext {
|
|
7
10
|
readonly onDispose: EventRelay<void>;
|
|
@@ -13,6 +16,11 @@ export declare class LlamaContext {
|
|
|
13
16
|
get model(): LlamaModel;
|
|
14
17
|
get contextSize(): number;
|
|
15
18
|
get batchSize(): number;
|
|
19
|
+
/**
|
|
20
|
+
* The actual size of the state in memory, in bytes.
|
|
21
|
+
* This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
|
|
22
|
+
*/
|
|
23
|
+
get stateSize(): number;
|
|
16
24
|
getAllocatedContextSize(): number;
|
|
17
25
|
get totalSequences(): number;
|
|
18
26
|
get sequencesLeft(): number;
|
|
@@ -21,10 +29,15 @@ export declare class LlamaContext {
|
|
|
21
29
|
* When there are no sequences left, this method will throw an error.
|
|
22
30
|
* @param [options]
|
|
23
31
|
*/
|
|
24
|
-
getSequence({ contextShift: { size: contextShiftSize, strategy: contextShiftStrategy } }?: {
|
|
32
|
+
getSequence({ contextShift: { size: contextShiftSize, strategy: contextShiftStrategy }, _tokenMeter }?: {
|
|
25
33
|
contextShift?: ContextShiftOptions;
|
|
26
34
|
}): LlamaContextSequence;
|
|
27
35
|
dispatchPendingBatch(): void;
|
|
36
|
+
/**
|
|
37
|
+
* Print the timings of token evaluation since the last print for this context.
|
|
38
|
+
* > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
|
|
39
|
+
* it won't print anything.
|
|
40
|
+
*/
|
|
28
41
|
printTimings(): Promise<void>;
|
|
29
42
|
}
|
|
30
43
|
export declare class LlamaContextSequence {
|
|
@@ -38,6 +51,7 @@ export declare class LlamaContextSequence {
|
|
|
38
51
|
get model(): LlamaModel;
|
|
39
52
|
get nextTokenIndex(): number;
|
|
40
53
|
get contextTokens(): Token[];
|
|
54
|
+
get tokenMeter(): TokenMeter;
|
|
41
55
|
get isLoadedToMemory(): boolean;
|
|
42
56
|
compareContextTokens(tokens: Token[]): {
|
|
43
57
|
firstDifferentIndex: number;
|
|
@@ -49,7 +63,7 @@ export declare class LlamaContextSequence {
|
|
|
49
63
|
clearHistory(): Promise<void>;
|
|
50
64
|
/**
|
|
51
65
|
* Erase context tokens in the provided ranges to free up space for new tokens to be generated.
|
|
52
|
-
*
|
|
66
|
+
* The start of each range is inclusive, and the end of each range is exclusive.
|
|
53
67
|
* For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
|
|
54
68
|
*/
|
|
55
69
|
eraseContextTokenRanges(ranges: ContextTokensDeleteRange[]): Promise<void>;
|
|
@@ -104,3 +118,26 @@ export declare class LlamaContextSequence {
|
|
|
104
118
|
contextShift?: ContextShiftOptions;
|
|
105
119
|
}): Promise<void>;
|
|
106
120
|
}
|
|
121
|
+
export declare function resolveContextContextSizeOption({ contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, getVramState, llamaGpu, ignoreMemorySafetyChecks, isEmbeddingContext }: {
|
|
122
|
+
contextSize?: LlamaContextOptions["contextSize"];
|
|
123
|
+
batchSize?: LlamaContextOptions["batchSize"];
|
|
124
|
+
sequences: number;
|
|
125
|
+
modelFileInsights: GgufInsights;
|
|
126
|
+
modelGpuLayers: number;
|
|
127
|
+
modelTrainContextSize: number;
|
|
128
|
+
getVramState(): {
|
|
129
|
+
total: number;
|
|
130
|
+
free: number;
|
|
131
|
+
};
|
|
132
|
+
llamaGpu: BuildGpu;
|
|
133
|
+
ignoreMemorySafetyChecks?: boolean;
|
|
134
|
+
isEmbeddingContext?: boolean;
|
|
135
|
+
}): number;
|
|
136
|
+
export declare function getDefaultContextBatchSize({ contextSize, sequences }: {
|
|
137
|
+
contextSize: number;
|
|
138
|
+
sequences: number;
|
|
139
|
+
}): number;
|
|
140
|
+
export declare function getDefaultContextSequences(): number;
|
|
141
|
+
export declare function getDefaultModelContextSize({ trainContextSize }: {
|
|
142
|
+
trainContextSize?: number;
|
|
143
|
+
}): number;
|
|
@@ -2,7 +2,9 @@ import { DisposeAggregator, EventRelay, withLock, DisposedError, AsyncDisposeAgg
|
|
|
2
2
|
import { removeNullFields } from "../../utils/removeNullFields.js";
|
|
3
3
|
import { compareTokens } from "../../utils/compareTokens.js";
|
|
4
4
|
import { DisposeGuard } from "../../utils/DisposeGuard.js";
|
|
5
|
-
import {
|
|
5
|
+
import { minAllowedContextSizeInCalculations } from "../../config.js";
|
|
6
|
+
import { TokenMeter } from "../TokenMeter.js";
|
|
7
|
+
import { resolveBatchItemsPrioritizationStrategy } from "./utils/resolveBatchItemsPrioritizationStrategy.js";
|
|
6
8
|
export class LlamaContext {
|
|
7
9
|
/** @internal */ _llama;
|
|
8
10
|
/** @internal */ _ctx;
|
|
@@ -25,7 +27,7 @@ export class LlamaContext {
|
|
|
25
27
|
/** @internal */ _allocatedContextSize;
|
|
26
28
|
/** @internal */ _disposed = false;
|
|
27
29
|
onDispose = new EventRelay();
|
|
28
|
-
constructor({ _model }, { sequences
|
|
30
|
+
constructor({ _model }, { sequences, seed = null, contextSize, batchSize, threads = 6, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, _embeddings, _noSeed }) {
|
|
29
31
|
if (_model.disposed)
|
|
30
32
|
throw new DisposedError();
|
|
31
33
|
this._llama = _model._llama;
|
|
@@ -39,13 +41,14 @@ export class LlamaContext {
|
|
|
39
41
|
seed: seed != null ? Math.max(-1, Math.floor(seed)) : undefined,
|
|
40
42
|
contextSize: this._contextSize * this._totalSequences,
|
|
41
43
|
batchSize: this._batchSize,
|
|
44
|
+
sequences: this._totalSequences,
|
|
42
45
|
threads: Math.max(0, Math.floor(threads)),
|
|
43
46
|
embeddings: _embeddings,
|
|
44
47
|
noSeed: _noSeed
|
|
45
48
|
}));
|
|
46
49
|
this._batchingOptions = {
|
|
47
50
|
dispatchSchedule: batchingDispatchSchedule,
|
|
48
|
-
|
|
51
|
+
itemPrioritizationStrategy: batchingItemsPrioritizationStrategy
|
|
49
52
|
};
|
|
50
53
|
this._reclaimUnusedSequenceId = this._reclaimUnusedSequenceId.bind(this);
|
|
51
54
|
this._disposeAggregator.add(() => {
|
|
@@ -82,6 +85,14 @@ export class LlamaContext {
|
|
|
82
85
|
get batchSize() {
|
|
83
86
|
return this._batchSize;
|
|
84
87
|
}
|
|
88
|
+
/**
|
|
89
|
+
* The actual size of the state in memory, in bytes.
|
|
90
|
+
* This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
|
|
91
|
+
*/
|
|
92
|
+
get stateSize() {
|
|
93
|
+
this._ensureNotDisposed();
|
|
94
|
+
return this._ctx.getStateSize();
|
|
95
|
+
}
|
|
85
96
|
getAllocatedContextSize() {
|
|
86
97
|
this._ensureNotDisposed();
|
|
87
98
|
if (this._allocatedContextSize == null)
|
|
@@ -99,7 +110,7 @@ export class LlamaContext {
|
|
|
99
110
|
* When there are no sequences left, this method will throw an error.
|
|
100
111
|
* @param [options]
|
|
101
112
|
*/
|
|
102
|
-
getSequence({ contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} } = {}) {
|
|
113
|
+
getSequence({ contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, _tokenMeter } = {}) {
|
|
103
114
|
this._ensureNotDisposed();
|
|
104
115
|
const nextSequenceId = this._popSequenceId();
|
|
105
116
|
if (nextSequenceId == null)
|
|
@@ -107,6 +118,7 @@ export class LlamaContext {
|
|
|
107
118
|
return LlamaContextSequence._create({
|
|
108
119
|
sequenceId: nextSequenceId,
|
|
109
120
|
context: this,
|
|
121
|
+
tokenMeter: _tokenMeter,
|
|
110
122
|
contextShift: {
|
|
111
123
|
size: contextShiftSize,
|
|
112
124
|
strategy: contextShiftStrategy
|
|
@@ -123,17 +135,18 @@ export class LlamaContext {
|
|
|
123
135
|
this._currentDispatchBatchHandle = {};
|
|
124
136
|
this._dispatchDecodeScheduled = false;
|
|
125
137
|
this._batchDispatchPending = false;
|
|
126
|
-
let
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
138
|
+
let shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
|
|
139
|
+
const resolvePrioritizationStrategy = () => {
|
|
140
|
+
try {
|
|
141
|
+
this._ensureNotDisposed();
|
|
142
|
+
return resolveBatchItemsPrioritizationStrategy(this._batchingOptions.itemPrioritizationStrategy);
|
|
143
|
+
}
|
|
144
|
+
catch (err) {
|
|
145
|
+
this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
|
|
146
|
+
}
|
|
147
|
+
return null;
|
|
148
|
+
};
|
|
149
|
+
const getOrderedQueuedDecodes = (prioritizationStrategy) => {
|
|
137
150
|
const batchItemToQueuedDecodeMap = new Map();
|
|
138
151
|
const batchItemsList = [];
|
|
139
152
|
for (const queuedDecode of this._queuedDecodes) {
|
|
@@ -146,101 +159,132 @@ export class LlamaContext {
|
|
|
146
159
|
}
|
|
147
160
|
let prioritizedItems;
|
|
148
161
|
try {
|
|
149
|
-
prioritizedItems =
|
|
162
|
+
prioritizedItems = prioritizationStrategy({
|
|
150
163
|
items: batchItemsList,
|
|
151
164
|
size: this._batchSize
|
|
152
165
|
});
|
|
153
166
|
}
|
|
154
167
|
catch (err) {
|
|
155
168
|
this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
|
|
156
|
-
return;
|
|
169
|
+
return null;
|
|
157
170
|
}
|
|
158
|
-
|
|
159
|
-
const afterDecodeActions = [];
|
|
160
|
-
const queuedDecodesToDelete = new Set();
|
|
161
|
-
const currentQueuedDecodeItems = new Set();
|
|
162
|
-
const currentBatchItems = [];
|
|
163
|
-
let currentBatchSize = 0;
|
|
164
|
-
for (const prioritizedItem of prioritizedItems) {
|
|
171
|
+
return prioritizedItems.map((prioritizedItem) => {
|
|
165
172
|
const queuedDecode = batchItemToQueuedDecodeMap.get(prioritizedItem.item);
|
|
166
173
|
if (queuedDecode == null)
|
|
167
174
|
throw new Error("Received invalid batch item. Make sure you keep the original object reference " +
|
|
168
175
|
"of the batch item on `item` on `PrioritizedBatchItem` in your custom prioritization strategy");
|
|
169
|
-
|
|
170
|
-
|
|
176
|
+
return {
|
|
177
|
+
queuedDecode,
|
|
178
|
+
processAmount: prioritizedItem.processAmount
|
|
179
|
+
};
|
|
180
|
+
});
|
|
181
|
+
};
|
|
182
|
+
const fitQueuedDecodesToABatch = (queuedDecodes, batchSize) => {
|
|
183
|
+
const currentBatchItems = [];
|
|
184
|
+
let currentBatchSize = 0;
|
|
185
|
+
let batchTokenSlotsLeft = batchSize;
|
|
186
|
+
for (const { queuedDecode, processAmount } of queuedDecodes) {
|
|
187
|
+
const resolvedProcessAmount = Math.min(processAmount <= 0 ? 1 : processAmount, queuedDecode.tokens.length, batchTokenSlotsLeft);
|
|
188
|
+
if (resolvedProcessAmount <= 0) {
|
|
189
|
+
if (batchTokenSlotsLeft === 0)
|
|
190
|
+
break;
|
|
171
191
|
continue;
|
|
172
|
-
|
|
192
|
+
}
|
|
193
|
+
batchTokenSlotsLeft -= resolvedProcessAmount;
|
|
194
|
+
currentBatchSize += resolvedProcessAmount;
|
|
173
195
|
currentBatchItems.push({
|
|
174
196
|
queuedDecode,
|
|
175
|
-
processAmount
|
|
197
|
+
processAmount: resolvedProcessAmount
|
|
176
198
|
});
|
|
177
|
-
currentBatchSize += processAmount;
|
|
178
199
|
}
|
|
179
|
-
|
|
200
|
+
return {
|
|
201
|
+
currentBatchItems,
|
|
202
|
+
currentBatchSize
|
|
203
|
+
};
|
|
204
|
+
};
|
|
205
|
+
/**
 * Add the given batch items to the backend batch, decode it, and settle the promises of
 * the queued decodes that were processed to their end.
 *
 * Queued decodes that are only partially processed stay in the queue with their remaining tokens.
 * A failure to add an item, or a failure of the decode itself, is dispatched to the affected
 * queued decodes via `_dispatchErrorForQueuedDecodesAndDequeue`.
 *
 * @param batchItems - `{queuedDecode, processAmount}` entries selected for this batch
 * @param currentBatchSize - total number of tokens in `batchItems`; `0` skips the backend calls
 */
const decodeTokenBatchItems = async (batchItems, currentBatchSize) => {
    const afterDecodeActions = [];
    const queuedDecodesToDelete = new Set();
    const currentQueuedDecodeItems = new Set();
    const hasTokensToDecode = currentBatchSize !== 0;

    if (hasTokensToDecode)
        this._ctx.initBatch(currentBatchSize);

    for (const { queuedDecode, processAmount } of batchItems) {
        let batchLogitIndex;
        try {
            const shouldGenerateLogitAtTheEnd = queuedDecode.generateLogitAtTheEnd &&
                processAmount === queuedDecode.tokens.length;
            const tokensToProcess = queuedDecode.tokens.slice(0, processAmount);

            // a token that produces a logit is metered as output, all the others as input
            let numberOfOutputTokens = 0;
            if (shouldGenerateLogitAtTheEnd)
                numberOfOutputTokens = 1;
            const numberOfInputTokens = Math.max(0, tokensToProcess.length - numberOfOutputTokens);
            TokenMeter.useTokens(queuedDecode.tokenMeter, numberOfInputTokens, "input");
            TokenMeter.useTokens(queuedDecode.tokenMeter, numberOfOutputTokens, "output");

            batchLogitIndex = this._ctx.addToBatch(queuedDecode.sequenceId, queuedDecode.firstTokenSequenceIndex, Uint32Array.from(tokensToProcess), shouldGenerateLogitAtTheEnd);
        }
        catch (err) {
            this._dispatchErrorForQueuedDecodesAndDequeue(new Set([queuedDecode]), err);
            continue;
        }

        currentQueuedDecodeItems.add(queuedDecode);

        if (queuedDecode.tokens.length !== processAmount) {
            // partially processed: keep the rest of the tokens queued for a later batch
            queuedDecode.tokens = queuedDecode.tokens.slice(processAmount);
            queuedDecode.firstTokenSequenceIndex += processAmount;
            continue;
        }

        // fully processed: dequeue it and settle its promise once the batch is decoded
        queuedDecodesToDelete.add(queuedDecode);
        afterDecodeActions.push({
            batchLogitIndex,
            response: queuedDecode.response,
            onDone: queuedDecode.onDone
        });
    }

    // remove the fully processed items from the queue, in queue order
    let queueIndex = 0;
    while (queueIndex < this._queuedDecodes.length) {
        const candidate = this._queuedDecodes[queueIndex];
        if (!queuedDecodesToDelete.has(candidate)) {
            queueIndex++;
            continue;
        }
        this._queuedDecodes.splice(queueIndex, 1);
        this._queuedDecodeSequenceIds.delete(candidate.sequenceId);
    }

    try {
        if (hasTokensToDecode)
            await this._ctx.decodeBatch();
    }
    catch (err) {
        this._dispatchErrorForQueuedDecodesAndDequeue(currentQueuedDecodeItems, err);
        return;
    }

    for (const action of afterDecodeActions) {
        const [accept, reject] = action.response;
        const shouldReportLogit = action.onDone != null && action.batchLogitIndex != null;
        if (shouldReportLogit) {
            try {
                accept(action.onDone(action.batchLogitIndex ?? null));
            }
            catch (err) {
                reject(err);
            }
        }
        // settles the promise when there was no `onDone` result to report
        accept(undefined);
    }
};
|
|
269
|
+
const prioritizationStrategy = resolvePrioritizationStrategy();
|
|
270
|
+
if (prioritizationStrategy == null)
|
|
271
|
+
return; // all queued items are rejected and dequeued when we get here
|
|
272
|
+
while (shouldHaveAnotherLoop) {
|
|
273
|
+
const orderedQueuedDecodes = getOrderedQueuedDecodes(prioritizationStrategy);
|
|
274
|
+
if (orderedQueuedDecodes == null)
|
|
275
|
+
return; // all queued items are rejected and dequeued when we get here
|
|
276
|
+
const { currentBatchItems, currentBatchSize } = fitQueuedDecodesToABatch(orderedQueuedDecodes, this._batchSize);
|
|
277
|
+
let preventDisposalHandle;
|
|
278
|
+
try {
|
|
279
|
+
preventDisposalHandle = this._backendContextDisposeGuard.createPreventDisposalHandle();
|
|
280
|
+
}
|
|
281
|
+
catch (err) {
|
|
282
|
+
this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
|
|
283
|
+
return;
|
|
284
|
+
}
|
|
285
|
+
try {
|
|
286
|
+
await decodeTokenBatchItems(currentBatchItems, currentBatchSize);
|
|
287
|
+
shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
|
|
244
288
|
}
|
|
245
289
|
finally {
|
|
246
290
|
preventDisposalHandle.dispose();
|
|
@@ -248,13 +292,18 @@ export class LlamaContext {
|
|
|
248
292
|
}
|
|
249
293
|
});
|
|
250
294
|
}
|
|
295
|
+
/**
|
|
296
|
+
* Print the timings of token evaluation since the last print for this context.
|
|
297
|
+
* > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
|
|
298
|
+
* it won't print anything.
|
|
299
|
+
*/
|
|
251
300
|
async printTimings() {
|
|
252
301
|
this._ensureNotDisposed();
|
|
253
302
|
this._ctx.printTimings();
|
|
254
303
|
await new Promise((accept) => setTimeout(accept, 0)); // wait for the logs to finish printing
|
|
255
304
|
}
|
|
256
305
|
/** @internal */
|
|
257
|
-
async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, generateLogitAtTheEnd = false, evaluationPriority = 5 }, onDone) {
|
|
306
|
+
async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, generateLogitAtTheEnd = false, evaluationPriority = 5, tokenMeter }, onDone) {
|
|
258
307
|
return await new Promise((accept, reject) => {
|
|
259
308
|
this._queuedDecodes.push({
|
|
260
309
|
sequenceId,
|
|
@@ -262,6 +311,7 @@ export class LlamaContext {
|
|
|
262
311
|
firstTokenSequenceIndex,
|
|
263
312
|
generateLogitAtTheEnd,
|
|
264
313
|
evaluationPriority,
|
|
314
|
+
tokenMeter,
|
|
265
315
|
response: [accept, reject],
|
|
266
316
|
onDone
|
|
267
317
|
});
|
|
@@ -337,17 +387,46 @@ export class LlamaContext {
|
|
|
337
387
|
}
|
|
338
388
|
/** @internal */
|
|
339
389
|
static async _create(options, { _model }) {
|
|
340
|
-
const
|
|
390
|
+
const sequences = options.sequences ?? getDefaultContextSequences();
|
|
391
|
+
const contextSize = resolveContextContextSizeOption({
|
|
392
|
+
contextSize: options.contextSize,
|
|
393
|
+
batchSize: options.batchSize,
|
|
394
|
+
sequences: sequences,
|
|
395
|
+
modelFileInsights: _model.fileInsights,
|
|
396
|
+
modelGpuLayers: _model.gpuLayers,
|
|
397
|
+
modelTrainContextSize: _model.trainContextSize,
|
|
398
|
+
getVramState: () => _model._llama._vramOrchestrator.getMemoryState(),
|
|
399
|
+
llamaGpu: _model._llama.gpu,
|
|
400
|
+
ignoreMemorySafetyChecks: options.ignoreMemorySafetyChecks,
|
|
401
|
+
isEmbeddingContext: options._embeddings
|
|
402
|
+
});
|
|
403
|
+
const batchSize = options.batchSize ?? getDefaultContextBatchSize({ contextSize, sequences });
|
|
404
|
+
const vramRequiredEstimate = _model.fileInsights.estimateContextResourceRequirements({
|
|
405
|
+
contextSize,
|
|
406
|
+
sequences,
|
|
407
|
+
isEmbeddingContext: options._embeddings,
|
|
408
|
+
modelGpuLayers: _model.gpuLayers,
|
|
409
|
+
batchSize
|
|
410
|
+
}).gpuVram;
|
|
411
|
+
const context = new LlamaContext({ _model }, { ...options, contextSize, batchSize, sequences });
|
|
341
412
|
const { createSignal } = options;
|
|
342
|
-
const
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
413
|
+
const contextCreationMemoryReservation = options.ignoreMemorySafetyChecks
|
|
414
|
+
? null
|
|
415
|
+
: _model._llama._vramOrchestrator.reserveMemory(vramRequiredEstimate);
|
|
416
|
+
try {
|
|
417
|
+
const contextLoaded = await context._ctx.init();
|
|
418
|
+
if (createSignal?.aborted) {
|
|
419
|
+
if (contextLoaded)
|
|
420
|
+
await context._ctx.dispose();
|
|
421
|
+
throw createSignal.reason;
|
|
422
|
+
}
|
|
423
|
+
else if (!contextLoaded)
|
|
424
|
+
throw new Error("Failed to create context");
|
|
425
|
+
return context;
|
|
426
|
+
}
|
|
427
|
+
finally {
|
|
428
|
+
contextCreationMemoryReservation?.dispose?.();
|
|
347
429
|
}
|
|
348
|
-
else if (!contextLoaded)
|
|
349
|
-
throw new Error("Failed to create context");
|
|
350
|
-
return context;
|
|
351
430
|
}
|
|
352
431
|
}
|
|
353
432
|
export class LlamaContextSequence {
|
|
@@ -355,14 +434,16 @@ export class LlamaContextSequence {
|
|
|
355
434
|
/** @internal */ _gcRegistry;
|
|
356
435
|
/** @internal */ _context;
|
|
357
436
|
/** @internal */ _contextShift;
|
|
437
|
+
/** @internal */ _tokenMeter;
|
|
358
438
|
/** @internal */ _disposeAggregator = new DisposeAggregator();
|
|
359
439
|
/** @internal */ _contextTokens = [];
|
|
360
440
|
/** @internal */ _nextTokenIndex = 0;
|
|
361
441
|
/** @internal */ _disposed = false;
|
|
362
442
|
onDispose = new EventRelay();
|
|
363
|
-
constructor({ sequenceId, context, contextShift }) {
|
|
443
|
+
constructor({ sequenceId, context, tokenMeter, contextShift }) {
|
|
364
444
|
this._sequenceId = sequenceId;
|
|
365
445
|
this._context = context;
|
|
446
|
+
this._tokenMeter = tokenMeter ?? new TokenMeter();
|
|
366
447
|
this._contextShift = contextShift;
|
|
367
448
|
this._gcRegistry = new FinalizationRegistry(this._context._reclaimUnusedSequenceId);
|
|
368
449
|
this._gcRegistry.register(this, sequenceId);
|
|
@@ -399,6 +480,9 @@ export class LlamaContextSequence {
|
|
|
399
480
|
get contextTokens() {
|
|
400
481
|
return this._contextTokens.slice();
|
|
401
482
|
}
|
|
483
|
+
get tokenMeter() {
|
|
484
|
+
return this._tokenMeter;
|
|
485
|
+
}
|
|
402
486
|
get isLoadedToMemory() {
|
|
403
487
|
return !this._disposed;
|
|
404
488
|
}
|
|
@@ -424,7 +508,7 @@ export class LlamaContextSequence {
|
|
|
424
508
|
}
|
|
425
509
|
/**
|
|
426
510
|
* Erase context tokens in the provided ranges to free up space for new tokens to be generated.
|
|
427
|
-
*
|
|
511
|
+
* The start of each range is inclusive, and the end of each range is exclusive.
|
|
428
512
|
* For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
|
|
429
513
|
*/
|
|
430
514
|
async eraseContextTokenRanges(ranges) {
|
|
@@ -531,7 +615,7 @@ export class LlamaContextSequence {
|
|
|
531
615
|
while (true) {
|
|
532
616
|
this._ensureNotDisposed();
|
|
533
617
|
// Evaluate to get the next token.
|
|
534
|
-
const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, contextShiftOptions, (batchLogitIndex) => {
|
|
618
|
+
const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
|
|
535
619
|
const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
|
|
536
620
|
? repeatPenalty.punishTokens()
|
|
537
621
|
: repeatPenalty?.punishTokens;
|
|
@@ -565,7 +649,7 @@ export class LlamaContextSequence {
|
|
|
565
649
|
}
|
|
566
650
|
}
|
|
567
651
|
/** @internal */
|
|
568
|
-
async _decodeTokens(tokens, generateLogit, evaluationPriority, contextShiftOptions, onDecodeDone) {
|
|
652
|
+
async _decodeTokens(tokens, generateLogit, evaluationPriority, tokenMeter, contextShiftOptions, onDecodeDone) {
|
|
569
653
|
this._ensureNotDisposed();
|
|
570
654
|
const tokensLeftToDecode = tokens.slice();
|
|
571
655
|
return await withLock(this, "evaluate", async () => {
|
|
@@ -585,7 +669,8 @@ export class LlamaContextSequence {
|
|
|
585
669
|
tokens: tokensToDecode,
|
|
586
670
|
firstTokenSequenceIndex: this._nextTokenIndex,
|
|
587
671
|
generateLogitAtTheEnd,
|
|
588
|
-
evaluationPriority
|
|
672
|
+
evaluationPriority,
|
|
673
|
+
tokenMeter
|
|
589
674
|
}, !generateLogitAtTheEnd
|
|
590
675
|
? undefined
|
|
591
676
|
: onDecodeDone);
|
|
@@ -632,10 +717,11 @@ export class LlamaContextSequence {
|
|
|
632
717
|
* We need this to make it impossible to manually create instances of this class outside the code of this library
|
|
633
718
|
* @internal
|
|
634
719
|
*/
|
|
635
|
-
static _create({ sequenceId, context, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} }) {
|
|
720
|
+
static _create({ sequenceId, context, tokenMeter, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} }) {
|
|
636
721
|
return new LlamaContextSequence({
|
|
637
722
|
sequenceId,
|
|
638
723
|
context,
|
|
724
|
+
tokenMeter,
|
|
639
725
|
contextShift: {
|
|
640
726
|
size: contextShiftSize,
|
|
641
727
|
strategy: contextShiftStrategy
|
|
@@ -653,4 +739,63 @@ function disposeContextSequenceIfReferenced(contextRef) {
|
|
|
653
739
|
if (context != null)
|
|
654
740
|
context.dispose();
|
|
655
741
|
}
|
|
742
|
+
/**
 * Resolve the `contextSize` option of a context into a concrete number of tokens.
 *
 * - A number is floored (minimum `1`) and, unless `ignoreMemorySafetyChecks` is set,
 *   rejected when its estimated VRAM usage exceeds the free VRAM.
 * - `"auto"` (also used when the option is nullish) or a `{min?, max?}` object picks the
 *   largest context size, capped by the model's default context size, whose estimated
 *   VRAM usage fits in the free VRAM.
 *   When there's no GPU (`llamaGpu === false`) or no VRAM at all, the largest allowed size is used.
 *
 * @param options
 * @param options.contextSize - a number, `"auto"`, a `{min?, max?}` object, or nullish
 * @param options.batchSize - explicit batch size; a default is derived per tested context size when nullish
 * @param options.sequences - number of sequences of the context
 * @param options.modelFileInsights - provides `estimateContextResourceRequirements(...)`
 * @param options.modelGpuLayers - forwarded to the resource estimation
 * @param options.modelTrainContextSize - the context size the model was trained on, if known
 * @param options.getVramState - returns an object with `total` and `free` VRAM
 * @param options.llamaGpu - `false` when no GPU is used
 * @param options.ignoreMemorySafetyChecks - skip failing on insufficient VRAM
 * @param options.isEmbeddingContext - forwarded to the resource estimation
 * @returns the resolved context size
 * @throws {Error} when the requested size doesn't fit in the free VRAM, or the option is invalid
 */
export function resolveContextContextSizeOption({ contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, getVramState, llamaGpu, ignoreMemorySafetyChecks = false, isEmbeddingContext = false }) {
    const requestedContextSize = contextSize ?? "auto";
    const sequencesDescription = sequences > 1 ? ` with ${sequences} sequences` : "";
    const estimateContextVram = (testedContextSize) => modelFileInsights.estimateContextResourceRequirements({
        contextSize: testedContextSize,
        batchSize: batchSize ?? getDefaultContextBatchSize({ contextSize: testedContextSize, sequences }),
        modelGpuLayers,
        sequences,
        isEmbeddingContext
    }).gpuVram;

    if (typeof requestedContextSize === "number") {
        const resolvedContextSize = Math.max(1, Math.floor(requestedContextSize));
        if (ignoreMemorySafetyChecks)
            return resolvedContextSize;

        const vramState = getVramState();
        if (estimateContextVram(resolvedContextSize) > vramState.free)
            throw new Error(`The context size of ${resolvedContextSize}${sequencesDescription} is too large for the available VRAM`);

        return resolvedContextSize;
    }

    if (requestedContextSize !== "auto" && typeof requestedContextSize !== "object")
        throw new Error(`Invalid context size: "${requestedContextSize}"`);

    const defaultMaxContextSize = getDefaultModelContextSize({ trainContextSize: modelTrainContextSize });
    const maxContextSize = requestedContextSize === "auto"
        ? defaultMaxContextSize
        : Math.min(requestedContextSize.max ?? defaultMaxContextSize, defaultMaxContextSize);

    // With no VRAM to fit into, use the largest allowed size. Returning `maxContextSize` rather than
    // the raw train context size honors a requested `max` and falls back to the default context size
    // when the train context size is unknown.
    if (llamaGpu === false)
        return maxContextSize;

    const vramState = getVramState();
    if (vramState.total === 0)
        return maxContextSize;

    const freeVram = vramState.free;
    const minContextSize = requestedContextSize === "auto"
        ? minAllowedContextSizeInCalculations
        : Math.max(requestedContextSize.min ?? minAllowedContextSizeInCalculations, minAllowedContextSizeInCalculations);

    // walk down from the largest allowed size until the estimated VRAM usage fits
    for (let testContextSize = maxContextSize; testContextSize >= minContextSize; testContextSize--) {
        if (estimateContextVram(testContextSize) <= freeVram)
            return testContextSize;
    }

    if (ignoreMemorySafetyChecks)
        return minContextSize;

    // every size down to `minContextSize` was tried, so that's the size that couldn't be fit
    throw new Error(`The available VRAM is too small to fit the context size of ${minContextSize}${sequencesDescription}`);
}
|
|
791
|
+
/**
 * Get the default batch size of a context: the total number of tokens across
 * all of its sequences, capped at 512.
 */
export function getDefaultContextBatchSize({ contextSize, sequences }) {
    const maxDefaultBatchSize = 512;
    const totalContextTokens = contextSize * sequences;
    return Math.min(totalContextTokens, maxDefaultBatchSize);
}
|
|
794
|
+
/**
 * Get the number of sequences a context has when the `sequences` option is not provided.
 */
export function getDefaultContextSequences() {
    const defaultSequences = 1;
    return defaultSequences;
}
|
|
797
|
+
// used as the context size when the train context size of the model is not available
const defaultFallbackContextSize = 4096;
/**
 * Get the default context size for a model: its train context size,
 * or a fallback value when the train context size is nullish.
 */
export function getDefaultModelContextSize({ trainContextSize }) {
    if (trainContextSize == null)
        return defaultFallbackContextSize;
    return trainContextSize;
}
|
|
656
801
|
//# sourceMappingURL=LlamaContext.js.map
|