node-llama-cpp 3.0.0-beta.14 → 3.0.0-beta.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. package/README.md +1 -1
  2. package/dist/ChatWrapper.js +4 -0
  3. package/dist/ChatWrapper.js.map +1 -1
  4. package/dist/bindings/AddonTypes.d.ts +21 -0
  5. package/dist/bindings/Llama.d.ts +4 -0
  6. package/dist/bindings/Llama.js +45 -4
  7. package/dist/bindings/Llama.js.map +1 -1
  8. package/dist/bindings/getLlama.d.ts +19 -1
  9. package/dist/bindings/getLlama.js +15 -5
  10. package/dist/bindings/getLlama.js.map +1 -1
  11. package/dist/bindings/types.d.ts +15 -0
  12. package/dist/bindings/types.js +27 -2
  13. package/dist/bindings/types.js.map +1 -1
  14. package/dist/bindings/utils/MemoryOrchestrator.d.ts +21 -0
  15. package/dist/bindings/utils/MemoryOrchestrator.js +49 -0
  16. package/dist/bindings/utils/MemoryOrchestrator.js.map +1 -0
  17. package/dist/bindings/utils/cloneLlamaCppRepo.d.ts +1 -1
  18. package/dist/bindings/utils/cloneLlamaCppRepo.js +3 -2
  19. package/dist/bindings/utils/cloneLlamaCppRepo.js.map +1 -1
  20. package/dist/bindings/utils/compileLLamaCpp.js +2 -2
  21. package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
  22. package/dist/bindings/utils/getLlamaWithoutBackend.d.ts +5 -0
  23. package/dist/bindings/utils/getLlamaWithoutBackend.js +27 -0
  24. package/dist/bindings/utils/getLlamaWithoutBackend.js.map +1 -0
  25. package/dist/bindings/utils/resolveCustomCmakeOptions.js +2 -2
  26. package/dist/bindings/utils/resolveCustomCmakeOptions.js.map +1 -1
  27. package/dist/chatWrappers/AlpacaChatWrapper.d.ts +2 -1
  28. package/dist/chatWrappers/AlpacaChatWrapper.js +9 -2
  29. package/dist/chatWrappers/AlpacaChatWrapper.js.map +1 -1
  30. package/dist/chatWrappers/ChatMLChatWrapper.js +12 -10
  31. package/dist/chatWrappers/ChatMLChatWrapper.js.map +1 -1
  32. package/dist/chatWrappers/FalconChatWrapper.d.ts +2 -1
  33. package/dist/chatWrappers/FalconChatWrapper.js +28 -11
  34. package/dist/chatWrappers/FalconChatWrapper.js.map +1 -1
  35. package/dist/chatWrappers/FunctionaryChatWrapper.js +59 -45
  36. package/dist/chatWrappers/FunctionaryChatWrapper.js.map +1 -1
  37. package/dist/chatWrappers/GemmaChatWrapper.js +9 -7
  38. package/dist/chatWrappers/GemmaChatWrapper.js.map +1 -1
  39. package/dist/chatWrappers/GeneralChatWrapper.d.ts +2 -1
  40. package/dist/chatWrappers/GeneralChatWrapper.js +35 -12
  41. package/dist/chatWrappers/GeneralChatWrapper.js.map +1 -1
  42. package/dist/chatWrappers/LlamaChatWrapper.d.ts +7 -0
  43. package/dist/chatWrappers/LlamaChatWrapper.js +26 -8
  44. package/dist/chatWrappers/LlamaChatWrapper.js.map +1 -1
  45. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.d.ts +73 -0
  46. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js +355 -0
  47. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js.map +1 -0
  48. package/dist/{TemplateChatWrapper.d.ts → chatWrappers/generic/TemplateChatWrapper.d.ts} +6 -9
  49. package/dist/{TemplateChatWrapper.js → chatWrappers/generic/TemplateChatWrapper.js} +31 -69
  50. package/dist/chatWrappers/generic/TemplateChatWrapper.js.map +1 -0
  51. package/dist/chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.d.ts +33 -0
  52. package/dist/chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.js +45 -0
  53. package/dist/chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.js.map +1 -0
  54. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.d.ts +4 -0
  55. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js +206 -0
  56. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js.map +1 -0
  57. package/dist/chatWrappers/utils/resolveChatWrapper.d.ts +67 -0
  58. package/dist/chatWrappers/utils/resolveChatWrapper.js +206 -0
  59. package/dist/chatWrappers/utils/resolveChatWrapper.js.map +1 -0
  60. package/dist/cli/cli.js +1 -1
  61. package/dist/cli/cli.js.map +1 -1
  62. package/dist/cli/commands/ChatCommand.d.ts +7 -4
  63. package/dist/cli/commands/ChatCommand.js +150 -60
  64. package/dist/cli/commands/ChatCommand.js.map +1 -1
  65. package/dist/cli/commands/ClearCommand.d.ts +1 -1
  66. package/dist/cli/commands/ClearCommand.js +5 -5
  67. package/dist/cli/commands/ClearCommand.js.map +1 -1
  68. package/dist/cli/commands/CompleteCommand.d.ts +3 -2
  69. package/dist/cli/commands/CompleteCommand.js +88 -41
  70. package/dist/cli/commands/CompleteCommand.js.map +1 -1
  71. package/dist/cli/commands/InfillCommand.d.ts +3 -2
  72. package/dist/cli/commands/InfillCommand.js +88 -41
  73. package/dist/cli/commands/InfillCommand.js.map +1 -1
  74. package/dist/cli/commands/{InspectCommand.d.ts → inspect/InspectCommand.d.ts} +1 -4
  75. package/dist/cli/commands/inspect/InspectCommand.js +17 -0
  76. package/dist/cli/commands/inspect/InspectCommand.js.map +1 -0
  77. package/dist/cli/commands/inspect/commands/InspectGgufCommand.d.ts +10 -0
  78. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js +108 -0
  79. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map +1 -0
  80. package/dist/cli/commands/inspect/commands/InspectGpuCommand.d.ts +4 -0
  81. package/dist/cli/commands/inspect/commands/InspectGpuCommand.js +98 -0
  82. package/dist/cli/commands/inspect/commands/InspectGpuCommand.js.map +1 -0
  83. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.d.ts +14 -0
  84. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js +577 -0
  85. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map +1 -0
  86. package/dist/cli/utils/ConsoleTable.d.ts +23 -0
  87. package/dist/cli/utils/ConsoleTable.js +86 -0
  88. package/dist/cli/utils/ConsoleTable.js.map +1 -0
  89. package/dist/cli/utils/printCommonInfoLines.d.ts +9 -0
  90. package/dist/cli/utils/printCommonInfoLines.js +70 -0
  91. package/dist/cli/utils/printCommonInfoLines.js.map +1 -0
  92. package/dist/cli/utils/printInfoLine.d.ts +10 -0
  93. package/dist/cli/utils/printInfoLine.js +45 -0
  94. package/dist/cli/utils/printInfoLine.js.map +1 -0
  95. package/dist/cli/utils/resolveCommandGgufPath.d.ts +1 -0
  96. package/dist/cli/utils/resolveCommandGgufPath.js +6 -0
  97. package/dist/cli/utils/resolveCommandGgufPath.js.map +1 -0
  98. package/dist/config.d.ts +2 -0
  99. package/dist/config.js +6 -0
  100. package/dist/config.js.map +1 -1
  101. package/dist/evaluator/LlamaChat/LlamaChat.js +13 -5
  102. package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
  103. package/dist/evaluator/LlamaCompletion.js +5 -3
  104. package/dist/evaluator/LlamaCompletion.js.map +1 -1
  105. package/dist/evaluator/LlamaContext/LlamaContext.d.ts +40 -3
  106. package/dist/evaluator/LlamaContext/LlamaContext.js +245 -100
  107. package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
  108. package/dist/evaluator/LlamaContext/types.d.ts +57 -6
  109. package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js.map +1 -0
  110. package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js.map +1 -0
  111. package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.d.ts +2 -0
  112. package/dist/evaluator/LlamaContext/utils/{resolveBatchItemsPrioritizingStrategy.js → resolveBatchItemsPrioritizationStrategy.js} +4 -4
  113. package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js.map +1 -0
  114. package/dist/evaluator/LlamaEmbeddingContext.d.ts +23 -2
  115. package/dist/evaluator/LlamaEmbeddingContext.js +4 -5
  116. package/dist/evaluator/LlamaEmbeddingContext.js.map +1 -1
  117. package/dist/evaluator/LlamaModel.d.ts +64 -6
  118. package/dist/evaluator/LlamaModel.js +297 -8
  119. package/dist/evaluator/LlamaModel.js.map +1 -1
  120. package/dist/evaluator/TokenMeter.d.ts +54 -0
  121. package/dist/evaluator/TokenMeter.js +86 -0
  122. package/dist/evaluator/TokenMeter.js.map +1 -0
  123. package/dist/gguf/GgufInsights.d.ts +40 -0
  124. package/dist/gguf/GgufInsights.js +350 -0
  125. package/dist/gguf/GgufInsights.js.map +1 -0
  126. package/dist/gguf/consts.d.ts +3 -0
  127. package/dist/gguf/consts.js +8 -0
  128. package/dist/gguf/consts.js.map +1 -0
  129. package/dist/gguf/errors/InvalidGgufMagicError.d.ts +3 -0
  130. package/dist/gguf/errors/InvalidGgufMagicError.js +6 -0
  131. package/dist/gguf/errors/InvalidGgufMagicError.js.map +1 -0
  132. package/dist/gguf/errors/UnsupportedGgufValueTypeError.d.ts +4 -0
  133. package/dist/gguf/errors/UnsupportedGgufValueTypeError.js +9 -0
  134. package/dist/gguf/errors/UnsupportedGgufValueTypeError.js.map +1 -0
  135. package/dist/gguf/fileReaders/GgufFileReader.d.ts +33 -0
  136. package/dist/gguf/fileReaders/GgufFileReader.js +76 -0
  137. package/dist/gguf/fileReaders/GgufFileReader.js.map +1 -0
  138. package/dist/gguf/fileReaders/GgufFsFileReader.d.ts +17 -0
  139. package/dist/gguf/fileReaders/GgufFsFileReader.js +45 -0
  140. package/dist/gguf/fileReaders/GgufFsFileReader.js.map +1 -0
  141. package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.d.ts +22 -0
  142. package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.js +63 -0
  143. package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.js.map +1 -0
  144. package/dist/gguf/parser/GgufV2Parser.d.ts +19 -0
  145. package/dist/gguf/parser/GgufV2Parser.js +115 -0
  146. package/dist/gguf/parser/GgufV2Parser.js.map +1 -0
  147. package/dist/gguf/parser/GgufV3Parser.d.ts +3 -0
  148. package/dist/gguf/parser/GgufV3Parser.js +4 -0
  149. package/dist/gguf/parser/GgufV3Parser.js.map +1 -0
  150. package/dist/gguf/parser/parseGguf.d.ts +8 -0
  151. package/dist/gguf/parser/parseGguf.js +58 -0
  152. package/dist/gguf/parser/parseGguf.js.map +1 -0
  153. package/dist/gguf/readGgufFileInfo.d.ts +30 -0
  154. package/dist/gguf/readGgufFileInfo.js +37 -0
  155. package/dist/gguf/readGgufFileInfo.js.map +1 -0
  156. package/dist/gguf/types/GgufFileInfoTypes.d.ts +52 -0
  157. package/dist/gguf/types/GgufFileInfoTypes.js +18 -0
  158. package/dist/gguf/types/GgufFileInfoTypes.js.map +1 -0
  159. package/dist/gguf/types/GgufMetadataTypes.d.ts +330 -0
  160. package/dist/gguf/types/GgufMetadataTypes.js +86 -0
  161. package/dist/gguf/types/GgufMetadataTypes.js.map +1 -0
  162. package/dist/gguf/types/GgufTensorInfoTypes.d.ts +37 -0
  163. package/dist/gguf/types/GgufTensorInfoTypes.js +33 -0
  164. package/dist/gguf/types/GgufTensorInfoTypes.js.map +1 -0
  165. package/dist/gguf/utils/GgufReadOffset.d.ts +6 -0
  166. package/dist/gguf/utils/GgufReadOffset.js +18 -0
  167. package/dist/gguf/utils/GgufReadOffset.js.map +1 -0
  168. package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.d.ts +5 -0
  169. package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.js +38 -0
  170. package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.js.map +1 -0
  171. package/dist/gguf/utils/getGgufFileTypeName.d.ts +4 -0
  172. package/dist/gguf/utils/getGgufFileTypeName.js +13 -0
  173. package/dist/gguf/utils/getGgufFileTypeName.js.map +1 -0
  174. package/dist/gguf/utils/getGgufMetadataArchitectureData.d.ts +3 -0
  175. package/dist/gguf/utils/getGgufMetadataArchitectureData.js +4 -0
  176. package/dist/gguf/utils/getGgufMetadataArchitectureData.js.map +1 -0
  177. package/dist/gguf/utils/normalizeGgufDownloadUrl.d.ts +1 -0
  178. package/dist/gguf/utils/normalizeGgufDownloadUrl.js +16 -0
  179. package/dist/gguf/utils/normalizeGgufDownloadUrl.js.map +1 -0
  180. package/dist/index.d.ts +13 -7
  181. package/dist/index.js +11 -6
  182. package/dist/index.js.map +1 -1
  183. package/dist/types.d.ts +1 -1
  184. package/dist/utils/InsufficientMemoryError.d.ts +3 -0
  185. package/dist/utils/InsufficientMemoryError.js +6 -0
  186. package/dist/utils/InsufficientMemoryError.js.map +1 -0
  187. package/dist/utils/LlamaText.d.ts +25 -10
  188. package/dist/utils/LlamaText.js +205 -23
  189. package/dist/utils/LlamaText.js.map +1 -1
  190. package/dist/utils/StopGenerationDetector.js +3 -1
  191. package/dist/utils/StopGenerationDetector.js.map +1 -1
  192. package/dist/utils/findBestOption.d.ts +4 -0
  193. package/dist/utils/findBestOption.js +15 -0
  194. package/dist/utils/findBestOption.js.map +1 -0
  195. package/dist/utils/getQueuedTokensBeforeStopTrigger.js +3 -3
  196. package/dist/utils/getQueuedTokensBeforeStopTrigger.js.map +1 -1
  197. package/dist/utils/gitReleaseBundles.js +68 -1
  198. package/dist/utils/gitReleaseBundles.js.map +1 -1
  199. package/dist/utils/mergeUnionTypes.d.ts +4 -0
  200. package/dist/utils/parseModelFileName.d.ts +1 -0
  201. package/dist/utils/parseModelFileName.js +6 -1
  202. package/dist/utils/parseModelFileName.js.map +1 -1
  203. package/dist/utils/prettyPrintObject.d.ts +10 -1
  204. package/dist/utils/prettyPrintObject.js +57 -13
  205. package/dist/utils/prettyPrintObject.js.map +1 -1
  206. package/dist/utils/spawnCommand.js.map +1 -1
  207. package/dist/utils/tokenizeInput.d.ts +1 -1
  208. package/dist/utils/tokenizeInput.js +3 -3
  209. package/dist/utils/tokenizeInput.js.map +1 -1
  210. package/dist/utils/withOra.d.ts +1 -0
  211. package/dist/utils/withOra.js +2 -2
  212. package/dist/utils/withOra.js.map +1 -1
  213. package/llama/CMakeLists.txt +5 -5
  214. package/llama/addon.cpp +117 -5
  215. package/llama/binariesGithubRelease.json +1 -1
  216. package/llama/gitRelease.bundle +0 -0
  217. package/llama/gpuInfo/cuda-gpu-info.cu +21 -0
  218. package/llama/gpuInfo/cuda-gpu-info.h +3 -0
  219. package/llama/gpuInfo/metal-gpu-info.h +4 -1
  220. package/llama/gpuInfo/metal-gpu-info.mm +14 -1
  221. package/llama/gpuInfo/vulkan-gpu-info.cpp +20 -2
  222. package/llama/gpuInfo/vulkan-gpu-info.h +2 -0
  223. package/llama/llama.cpp.info.json +1 -1
  224. package/llama/toolchains/win32.host-x64.target-arm64.cmake +41 -0
  225. package/llamaBins/linux-arm64/_nlcBuildMetadata.json +1 -1
  226. package/llamaBins/linux-arm64/llama-addon.node +0 -0
  227. package/llamaBins/linux-armv7l/_nlcBuildMetadata.json +1 -1
  228. package/llamaBins/linux-armv7l/llama-addon.node +0 -0
  229. package/llamaBins/linux-x64/_nlcBuildMetadata.json +1 -1
  230. package/llamaBins/linux-x64/llama-addon.node +0 -0
  231. package/llamaBins/linux-x64-cuda/_nlcBuildMetadata.json +1 -1
  232. package/llamaBins/linux-x64-cuda/llama-addon.node +0 -0
  233. package/llamaBins/linux-x64-vulkan/_nlcBuildMetadata.json +1 -1
  234. package/llamaBins/linux-x64-vulkan/llama-addon.node +0 -0
  235. package/llamaBins/mac-arm64-metal/_nlcBuildMetadata.json +1 -1
  236. package/llamaBins/mac-arm64-metal/default.metallib +0 -0
  237. package/llamaBins/mac-arm64-metal/llama-addon.node +0 -0
  238. package/llamaBins/mac-x64/_nlcBuildMetadata.json +1 -1
  239. package/llamaBins/mac-x64/llama-addon.node +0 -0
  240. package/llamaBins/win-arm64/_nlcBuildMetadata.json +1 -0
  241. package/llamaBins/win-arm64/llama-addon.exp +0 -0
  242. package/llamaBins/win-arm64/llama-addon.lib +0 -0
  243. package/llamaBins/win-arm64/llama-addon.node +0 -0
  244. package/llamaBins/win-x64/_nlcBuildMetadata.json +1 -1
  245. package/llamaBins/win-x64/llama-addon.node +0 -0
  246. package/llamaBins/win-x64-cuda/_nlcBuildMetadata.json +1 -1
  247. package/llamaBins/win-x64-cuda/llama-addon.node +0 -0
  248. package/llamaBins/win-x64-vulkan/_nlcBuildMetadata.json +1 -1
  249. package/llamaBins/win-x64-vulkan/llama-addon.node +0 -0
  250. package/package.json +8 -6
  251. package/dist/TemplateChatWrapper.js.map +0 -1
  252. package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.d.ts +0 -33
  253. package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.js +0 -49
  254. package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.js.map +0 -1
  255. package/dist/chatWrappers/resolveChatWrapperBasedOnModel.d.ts +0 -13
  256. package/dist/chatWrappers/resolveChatWrapperBasedOnModel.js +0 -63
  257. package/dist/chatWrappers/resolveChatWrapperBasedOnModel.js.map +0 -1
  258. package/dist/cli/commands/InspectCommand.js +0 -113
  259. package/dist/cli/commands/InspectCommand.js.map +0 -1
  260. package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizingStrategies/firstInFirstOutStrategy.js.map +0 -1
  261. package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizingStrategies/maximumParallelismStrategy.js.map +0 -1
  262. package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizingStrategy.d.ts +0 -2
  263. package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizingStrategy.js.map +0 -1
  264. package/dist/gguf/GGUFInsights.d.ts +0 -28
  265. package/dist/gguf/GGUFInsights.js +0 -58
  266. package/dist/gguf/GGUFInsights.js.map +0 -1
  267. package/dist/gguf/GGUFMetadata.d.ts +0 -19
  268. package/dist/gguf/GGUFMetadata.js +0 -38
  269. package/dist/gguf/GGUFMetadata.js.map +0 -1
  270. package/dist/gguf/errors/InvalidGGUFMagicError.d.ts +0 -3
  271. package/dist/gguf/errors/InvalidGGUFMagicError.js +0 -6
  272. package/dist/gguf/errors/InvalidGGUFMagicError.js.map +0 -1
  273. package/dist/gguf/errors/MetadataNotParsedYetError.d.ts +0 -3
  274. package/dist/gguf/errors/MetadataNotParsedYetError.js +0 -6
  275. package/dist/gguf/errors/MetadataNotParsedYetError.js.map +0 -1
  276. package/dist/gguf/errors/MissingNodeLlamaError.d.ts +0 -3
  277. package/dist/gguf/errors/MissingNodeLlamaError.js +0 -6
  278. package/dist/gguf/errors/MissingNodeLlamaError.js.map +0 -1
  279. package/dist/gguf/errors/ModelScore/NotEnoughVRamError.d.ts +0 -5
  280. package/dist/gguf/errors/ModelScore/NotEnoughVRamError.js +0 -11
  281. package/dist/gguf/errors/ModelScore/NotEnoughVRamError.js.map +0 -1
  282. package/dist/gguf/errors/UnsupportedMetadataTypeError.d.ts +0 -4
  283. package/dist/gguf/errors/UnsupportedMetadataTypeError.js +0 -8
  284. package/dist/gguf/errors/UnsupportedMetadataTypeError.js.map +0 -1
  285. package/dist/gguf/ggufParser/GGUFParser.d.ts +0 -18
  286. package/dist/gguf/ggufParser/GGUFParser.js +0 -123
  287. package/dist/gguf/ggufParser/GGUFParser.js.map +0 -1
  288. package/dist/gguf/ggufParser/GGUFTypes.d.ts +0 -257
  289. package/dist/gguf/ggufParser/GGUFTypes.js +0 -2
  290. package/dist/gguf/ggufParser/GGUFTypes.js.map +0 -1
  291. package/dist/gguf/ggufParser/checkArchitecture.d.ts +0 -14
  292. package/dist/gguf/ggufParser/checkArchitecture.js +0 -74
  293. package/dist/gguf/ggufParser/checkArchitecture.js.map +0 -1
  294. package/dist/gguf/ggufParser/stream/GGUFBaseStream.d.ts +0 -38
  295. package/dist/gguf/ggufParser/stream/GGUFBaseStream.js +0 -83
  296. package/dist/gguf/ggufParser/stream/GGUFBaseStream.js.map +0 -1
  297. package/dist/gguf/ggufParser/stream/GGUFFetchStream.d.ts +0 -14
  298. package/dist/gguf/ggufParser/stream/GGUFFetchStream.js +0 -35
  299. package/dist/gguf/ggufParser/stream/GGUFFetchStream.js.map +0 -1
  300. package/dist/gguf/ggufParser/stream/GGUFReadStream.d.ts +0 -15
  301. package/dist/gguf/ggufParser/stream/GGUFReadStream.js +0 -40
  302. package/dist/gguf/ggufParser/stream/GGUFReadStream.js.map +0 -1
  303. package/dist/utils/parseModelTypeDescription.d.ts +0 -6
  304. package/dist/utils/parseModelTypeDescription.js +0 -9
  305. package/dist/utils/parseModelTypeDescription.js.map +0 -1
  306. package/dist/utils/resolveChatWrapper.d.ts +0 -4
  307. package/dist/utils/resolveChatWrapper.js +0 -16
  308. package/dist/utils/resolveChatWrapper.js.map +0 -1
  309. /package/dist/evaluator/LlamaContext/utils/{batchItemsPrioritizingStrategies → batchItemsPrioritizationStrategies}/firstInFirstOutStrategy.d.ts +0 -0
  310. /package/dist/evaluator/LlamaContext/utils/{batchItemsPrioritizingStrategies → batchItemsPrioritizationStrategies}/firstInFirstOutStrategy.js +0 -0
  311. /package/dist/evaluator/LlamaContext/utils/{batchItemsPrioritizingStrategies → batchItemsPrioritizationStrategies}/maximumParallelismStrategy.d.ts +0 -0
  312. /package/dist/evaluator/LlamaContext/utils/{batchItemsPrioritizingStrategies → batchItemsPrioritizationStrategies}/maximumParallelismStrategy.js +0 -0
@@ -1,7 +1,10 @@
1
1
  import { EventRelay } from "lifecycle-utils";
2
2
  import { Token } from "../../types.js";
3
3
  import { LlamaGrammarEvaluationState } from "../LlamaGrammarEvaluationState.js";
4
- import { ContextShiftOptions, ContextTokensDeleteRange, EvaluationPriority, LlamaContextSequenceRepeatPenalty } from "./types.js";
4
+ import { GgufInsights } from "../../gguf/GgufInsights.js";
5
+ import { TokenMeter } from "../TokenMeter.js";
6
+ import { BuildGpu } from "../../bindings/types.js";
7
+ import { ContextShiftOptions, ContextTokensDeleteRange, EvaluationPriority, LlamaContextOptions, LlamaContextSequenceRepeatPenalty } from "./types.js";
5
8
  import type { LlamaModel } from "../LlamaModel.js";
6
9
  export declare class LlamaContext {
7
10
  readonly onDispose: EventRelay<void>;
@@ -13,6 +16,11 @@ export declare class LlamaContext {
13
16
  get model(): LlamaModel;
14
17
  get contextSize(): number;
15
18
  get batchSize(): number;
19
+ /**
20
+ * The actual size of the state in memory, in bytes.
21
+ * This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
22
+ */
23
+ get stateSize(): number;
16
24
  getAllocatedContextSize(): number;
17
25
  get totalSequences(): number;
18
26
  get sequencesLeft(): number;
@@ -21,10 +29,15 @@ export declare class LlamaContext {
21
29
  * When there are no sequences left, this method will throw an error.
22
30
  * @param [options]
23
31
  */
24
- getSequence({ contextShift: { size: contextShiftSize, strategy: contextShiftStrategy } }?: {
32
+ getSequence({ contextShift: { size: contextShiftSize, strategy: contextShiftStrategy }, _tokenMeter }?: {
25
33
  contextShift?: ContextShiftOptions;
26
34
  }): LlamaContextSequence;
27
35
  dispatchPendingBatch(): void;
36
+ /**
37
+ * Print the timings of token evaluation since the last print for this context.
38
+ * > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
39
+ * it won't print anything.
40
+ */
28
41
  printTimings(): Promise<void>;
29
42
  }
30
43
  export declare class LlamaContextSequence {
@@ -38,6 +51,7 @@ export declare class LlamaContextSequence {
38
51
  get model(): LlamaModel;
39
52
  get nextTokenIndex(): number;
40
53
  get contextTokens(): Token[];
54
+ get tokenMeter(): TokenMeter;
41
55
  get isLoadedToMemory(): boolean;
42
56
  compareContextTokens(tokens: Token[]): {
43
57
  firstDifferentIndex: number;
@@ -49,7 +63,7 @@ export declare class LlamaContextSequence {
49
63
  clearHistory(): Promise<void>;
50
64
  /**
51
65
  * Erase context tokens in the provided ranges to free up space for new tokens to be generated.
52
- * the start and end of each range are exclusive.
66
+ * The start of each range is inclusive, and the end of each range is exclusive.
53
67
  * For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
54
68
  */
55
69
  eraseContextTokenRanges(ranges: ContextTokensDeleteRange[]): Promise<void>;
@@ -104,3 +118,26 @@ export declare class LlamaContextSequence {
104
118
  contextShift?: ContextShiftOptions;
105
119
  }): Promise<void>;
106
120
  }
121
+ export declare function resolveContextContextSizeOption({ contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, getVramState, llamaGpu, ignoreMemorySafetyChecks, isEmbeddingContext }: {
122
+ contextSize?: LlamaContextOptions["contextSize"];
123
+ batchSize?: LlamaContextOptions["batchSize"];
124
+ sequences: number;
125
+ modelFileInsights: GgufInsights;
126
+ modelGpuLayers: number;
127
+ modelTrainContextSize: number;
128
+ getVramState(): {
129
+ total: number;
130
+ free: number;
131
+ };
132
+ llamaGpu: BuildGpu;
133
+ ignoreMemorySafetyChecks?: boolean;
134
+ isEmbeddingContext?: boolean;
135
+ }): number;
136
+ export declare function getDefaultContextBatchSize({ contextSize, sequences }: {
137
+ contextSize: number;
138
+ sequences: number;
139
+ }): number;
140
+ export declare function getDefaultContextSequences(): number;
141
+ export declare function getDefaultModelContextSize({ trainContextSize }: {
142
+ trainContextSize?: number;
143
+ }): number;
@@ -2,7 +2,9 @@ import { DisposeAggregator, EventRelay, withLock, DisposedError, AsyncDisposeAgg
2
2
  import { removeNullFields } from "../../utils/removeNullFields.js";
3
3
  import { compareTokens } from "../../utils/compareTokens.js";
4
4
  import { DisposeGuard } from "../../utils/DisposeGuard.js";
5
- import { resolveBatchItemsPrioritizingStrategy } from "./utils/resolveBatchItemsPrioritizingStrategy.js";
5
+ import { minAllowedContextSizeInCalculations } from "../../config.js";
6
+ import { TokenMeter } from "../TokenMeter.js";
7
+ import { resolveBatchItemsPrioritizationStrategy } from "./utils/resolveBatchItemsPrioritizationStrategy.js";
6
8
  export class LlamaContext {
7
9
  /** @internal */ _llama;
8
10
  /** @internal */ _ctx;
@@ -25,7 +27,7 @@ export class LlamaContext {
25
27
  /** @internal */ _allocatedContextSize;
26
28
  /** @internal */ _disposed = false;
27
29
  onDispose = new EventRelay();
28
- constructor({ _model }, { sequences = 1, seed = null, contextSize = _model.trainContextSize, batchSize = Math.min(contextSize * sequences, 512), threads = 6, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemsPrioritizingStrategy: batchingItemsPrioritizingStrategy = "maximumParallelism" } = {}, _embeddings, _noSeed }) {
30
+ constructor({ _model }, { sequences, seed = null, contextSize, batchSize, threads = 6, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, _embeddings, _noSeed }) {
29
31
  if (_model.disposed)
30
32
  throw new DisposedError();
31
33
  this._llama = _model._llama;
@@ -39,13 +41,14 @@ export class LlamaContext {
39
41
  seed: seed != null ? Math.max(-1, Math.floor(seed)) : undefined,
40
42
  contextSize: this._contextSize * this._totalSequences,
41
43
  batchSize: this._batchSize,
44
+ sequences: this._totalSequences,
42
45
  threads: Math.max(0, Math.floor(threads)),
43
46
  embeddings: _embeddings,
44
47
  noSeed: _noSeed
45
48
  }));
46
49
  this._batchingOptions = {
47
50
  dispatchSchedule: batchingDispatchSchedule,
48
- itemsPrioritizingStrategy: batchingItemsPrioritizingStrategy
51
+ itemPrioritizationStrategy: batchingItemsPrioritizationStrategy
49
52
  };
50
53
  this._reclaimUnusedSequenceId = this._reclaimUnusedSequenceId.bind(this);
51
54
  this._disposeAggregator.add(() => {
@@ -82,6 +85,14 @@ export class LlamaContext {
82
85
  get batchSize() {
83
86
  return this._batchSize;
84
87
  }
88
+ /**
89
+ * The actual size of the state in memory, in bytes.
90
+ * This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
91
+ */
92
+ get stateSize() {
93
+ this._ensureNotDisposed();
94
+ return this._ctx.getStateSize();
95
+ }
85
96
  getAllocatedContextSize() {
86
97
  this._ensureNotDisposed();
87
98
  if (this._allocatedContextSize == null)
@@ -99,7 +110,7 @@ export class LlamaContext {
99
110
  * When there are no sequences left, this method will throw an error.
100
111
  * @param [options]
101
112
  */
102
- getSequence({ contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} } = {}) {
113
+ getSequence({ contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, _tokenMeter } = {}) {
103
114
  this._ensureNotDisposed();
104
115
  const nextSequenceId = this._popSequenceId();
105
116
  if (nextSequenceId == null)
@@ -107,6 +118,7 @@ export class LlamaContext {
107
118
  return LlamaContextSequence._create({
108
119
  sequenceId: nextSequenceId,
109
120
  context: this,
121
+ tokenMeter: _tokenMeter,
110
122
  contextShift: {
111
123
  size: contextShiftSize,
112
124
  strategy: contextShiftStrategy
@@ -123,17 +135,18 @@ export class LlamaContext {
123
135
  this._currentDispatchBatchHandle = {};
124
136
  this._dispatchDecodeScheduled = false;
125
137
  this._batchDispatchPending = false;
126
- let prioritizeStrategy;
127
- try {
128
- this._ensureNotDisposed();
129
- prioritizeStrategy = resolveBatchItemsPrioritizingStrategy(this._batchingOptions.itemsPrioritizingStrategy);
130
- }
131
- catch (err) {
132
- this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
133
- return;
134
- }
135
- let shouldHaveAnotherBatch = this._queuedDecodes.length > 0;
136
- while (shouldHaveAnotherBatch) {
138
+ let shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
139
+ const resolvePrioritizationStrategy = () => {
140
+ try {
141
+ this._ensureNotDisposed();
142
+ return resolveBatchItemsPrioritizationStrategy(this._batchingOptions.itemPrioritizationStrategy);
143
+ }
144
+ catch (err) {
145
+ this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
146
+ }
147
+ return null;
148
+ };
149
+ const getOrderedQueuedDecodes = (prioritizationStrategy) => {
137
150
  const batchItemToQueuedDecodeMap = new Map();
138
151
  const batchItemsList = [];
139
152
  for (const queuedDecode of this._queuedDecodes) {
@@ -146,101 +159,132 @@ export class LlamaContext {
146
159
  }
147
160
  let prioritizedItems;
148
161
  try {
149
- prioritizedItems = prioritizeStrategy({
162
+ prioritizedItems = prioritizationStrategy({
150
163
  items: batchItemsList,
151
164
  size: this._batchSize
152
165
  });
153
166
  }
154
167
  catch (err) {
155
168
  this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
156
- return;
169
+ return null;
157
170
  }
158
- let batchTokenSlotsLeft = this._batchSize;
159
- const afterDecodeActions = [];
160
- const queuedDecodesToDelete = new Set();
161
- const currentQueuedDecodeItems = new Set();
162
- const currentBatchItems = [];
163
- let currentBatchSize = 0;
164
- for (const prioritizedItem of prioritizedItems) {
171
+ return prioritizedItems.map((prioritizedItem) => {
165
172
  const queuedDecode = batchItemToQueuedDecodeMap.get(prioritizedItem.item);
166
173
  if (queuedDecode == null)
167
174
  throw new Error("Received invalid batch item. Make sure you keep the original object reference " +
168
175
  "of the batch item on `item` on `PrioritizedBatchItem` in your custom prioritization strategy");
169
- const processAmount = Math.min(queuedDecode.tokens.length, prioritizedItem.processAmount, batchTokenSlotsLeft);
170
- if (processAmount <= 0)
176
+ return {
177
+ queuedDecode,
178
+ processAmount: prioritizedItem.processAmount
179
+ };
180
+ });
181
+ };
182
+ const fitQueuedDecodesToABatch = (queuedDecodes, batchSize) => {
183
+ const currentBatchItems = [];
184
+ let currentBatchSize = 0;
185
+ let batchTokenSlotsLeft = batchSize;
186
+ for (const { queuedDecode, processAmount } of queuedDecodes) {
187
+ const resolvedProcessAmount = Math.min(processAmount <= 0 ? 1 : processAmount, queuedDecode.tokens.length, batchTokenSlotsLeft);
188
+ if (resolvedProcessAmount <= 0) {
189
+ if (batchTokenSlotsLeft === 0)
190
+ break;
171
191
  continue;
172
- batchTokenSlotsLeft -= processAmount;
192
+ }
193
+ batchTokenSlotsLeft -= resolvedProcessAmount;
194
+ currentBatchSize += resolvedProcessAmount;
173
195
  currentBatchItems.push({
174
196
  queuedDecode,
175
- processAmount
197
+ processAmount: resolvedProcessAmount
176
198
  });
177
- currentBatchSize += processAmount;
178
199
  }
179
- let preventDisposalHandle;
200
+ return {
201
+ currentBatchItems,
202
+ currentBatchSize
203
+ };
204
+ };
205
+ const decodeTokenBatchItems = async (batchItems, currentBatchSize) => {
206
+ const afterDecodeActions = [];
207
+ const queuedDecodesToDelete = new Set();
208
+ const currentQueuedDecodeItems = new Set();
209
+ if (currentBatchSize !== 0)
210
+ this._ctx.initBatch(currentBatchSize);
211
+ for (const { queuedDecode, processAmount } of batchItems) {
212
+ let batchLogitIndex;
213
+ try {
214
+ const shouldGenerateLogitAtTheEnd = queuedDecode.generateLogitAtTheEnd &&
215
+ processAmount === queuedDecode.tokens.length;
216
+ const tokensToProcess = queuedDecode.tokens.slice(0, processAmount);
217
+ const numberOfOutputTokens = shouldGenerateLogitAtTheEnd ? 1 : 0;
218
+ TokenMeter.useTokens(queuedDecode.tokenMeter, Math.max(0, tokensToProcess.length - numberOfOutputTokens), "input");
219
+ TokenMeter.useTokens(queuedDecode.tokenMeter, numberOfOutputTokens, "output");
220
+ batchLogitIndex = this._ctx.addToBatch(queuedDecode.sequenceId, queuedDecode.firstTokenSequenceIndex, Uint32Array.from(tokensToProcess), shouldGenerateLogitAtTheEnd);
221
+ }
222
+ catch (err) {
223
+ this._dispatchErrorForQueuedDecodesAndDequeue(new Set([queuedDecode]), err);
224
+ continue;
225
+ }
226
+ currentQueuedDecodeItems.add(queuedDecode);
227
+ if (queuedDecode.tokens.length === processAmount) {
228
+ queuedDecodesToDelete.add(queuedDecode);
229
+ afterDecodeActions.push({
230
+ batchLogitIndex,
231
+ response: queuedDecode.response,
232
+ onDone: queuedDecode.onDone
233
+ });
234
+ }
235
+ else {
236
+ queuedDecode.tokens = queuedDecode.tokens.slice(processAmount);
237
+ queuedDecode.firstTokenSequenceIndex += processAmount;
238
+ }
239
+ }
240
+ for (let i = 0; i < this._queuedDecodes.length; i++) {
241
+ const queuedDecode = this._queuedDecodes[i];
242
+ if (queuedDecodesToDelete.has(queuedDecode)) {
243
+ this._queuedDecodes.splice(i, 1);
244
+ this._queuedDecodeSequenceIds.delete(queuedDecode.sequenceId);
245
+ i--;
246
+ }
247
+ }
180
248
  try {
181
- preventDisposalHandle = this._backendContextDisposeGuard.createPreventDisposalHandle();
249
+ if (currentBatchSize !== 0)
250
+ await this._ctx.decodeBatch();
182
251
  }
183
252
  catch (err) {
184
- this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
253
+ this._dispatchErrorForQueuedDecodesAndDequeue(currentQueuedDecodeItems, err);
185
254
  return;
186
255
  }
187
- try {
188
- if (currentBatchSize !== 0)
189
- this._ctx.initBatch(currentBatchSize);
190
- for (const { queuedDecode, processAmount } of currentBatchItems) {
191
- let batchLogitIndex;
256
+ for (const action of afterDecodeActions) {
257
+ const [accept, reject] = action.response;
258
+ if (action.onDone != null && action.batchLogitIndex != null) {
192
259
  try {
193
- batchLogitIndex = this._ctx.addToBatch(queuedDecode.sequenceId, queuedDecode.firstTokenSequenceIndex, Uint32Array.from(queuedDecode.tokens.slice(0, processAmount)), queuedDecode.generateLogitAtTheEnd && processAmount === queuedDecode.tokens.length);
260
+ accept(action.onDone(action.batchLogitIndex ?? null));
194
261
  }
195
262
  catch (err) {
196
- this._dispatchErrorForQueuedDecodesAndDequeue(new Set([queuedDecode]), err);
197
- continue;
198
- }
199
- currentQueuedDecodeItems.add(queuedDecode);
200
- if (queuedDecode.tokens.length === processAmount) {
201
- queuedDecodesToDelete.add(queuedDecode);
202
- afterDecodeActions.push({
203
- batchLogitIndex,
204
- response: queuedDecode.response,
205
- onDone: queuedDecode.onDone
206
- });
207
- }
208
- else {
209
- queuedDecode.tokens = queuedDecode.tokens.slice(processAmount);
210
- queuedDecode.firstTokenSequenceIndex += processAmount;
263
+ reject(err);
211
264
  }
212
- if (batchTokenSlotsLeft === 0)
213
- break;
214
- }
215
- for (let i = 0; i < this._queuedDecodes.length; i++) {
216
- const queuedDecode = this._queuedDecodes[i];
217
- if (queuedDecodesToDelete.has(queuedDecode)) {
218
- this._queuedDecodes.splice(i, 1);
219
- this._queuedDecodeSequenceIds.delete(queuedDecode.sequenceId);
220
- i--;
221
- }
222
- }
223
- shouldHaveAnotherBatch = this._queuedDecodes.length > 0;
224
- try {
225
- if (currentBatchSize !== 0)
226
- await this._ctx.decodeBatch();
227
- }
228
- catch (err) {
229
- this._dispatchErrorForQueuedDecodesAndDequeue(currentQueuedDecodeItems, err);
230
- return;
231
- }
232
- for (const action of afterDecodeActions) {
233
- const [accept, reject] = action.response;
234
- if (action.onDone != null && action.batchLogitIndex != null) {
235
- try {
236
- accept(action.onDone(action.batchLogitIndex ?? null));
237
- }
238
- catch (err) {
239
- reject(err);
240
- }
241
- }
242
- accept(undefined);
243
265
  }
266
+ accept(undefined);
267
+ }
268
+ };
269
+ const prioritizationStrategy = resolvePrioritizationStrategy();
270
+ if (prioritizationStrategy == null)
271
+ return; // all queued items are rejected and dequeued when we get here
272
+ while (shouldHaveAnotherLoop) {
273
+ const orderedQueuedDecodes = getOrderedQueuedDecodes(prioritizationStrategy);
274
+ if (orderedQueuedDecodes == null)
275
+ return; // all queued items are rejected and dequeued when we get here
276
+ const { currentBatchItems, currentBatchSize } = fitQueuedDecodesToABatch(orderedQueuedDecodes, this._batchSize);
277
+ let preventDisposalHandle;
278
+ try {
279
+ preventDisposalHandle = this._backendContextDisposeGuard.createPreventDisposalHandle();
280
+ }
281
+ catch (err) {
282
+ this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
283
+ return;
284
+ }
285
+ try {
286
+ await decodeTokenBatchItems(currentBatchItems, currentBatchSize);
287
+ shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
244
288
  }
245
289
  finally {
246
290
  preventDisposalHandle.dispose();
@@ -248,13 +292,18 @@ export class LlamaContext {
248
292
  }
249
293
  });
250
294
  }
295
+ /**
296
+ * Print the timings of token evaluation since that last print for this context.
297
+ * > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
298
+ * it won't print anything.
299
+ */
251
300
  async printTimings() {
252
301
  this._ensureNotDisposed();
253
302
  this._ctx.printTimings();
254
303
  await new Promise((accept) => setTimeout(accept, 0)); // wait for the logs to finish printing
255
304
  }
256
305
  /** @internal */
257
- async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, generateLogitAtTheEnd = false, evaluationPriority = 5 }, onDone) {
306
+ async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, generateLogitAtTheEnd = false, evaluationPriority = 5, tokenMeter }, onDone) {
258
307
  return await new Promise((accept, reject) => {
259
308
  this._queuedDecodes.push({
260
309
  sequenceId,
@@ -262,6 +311,7 @@ export class LlamaContext {
262
311
  firstTokenSequenceIndex,
263
312
  generateLogitAtTheEnd,
264
313
  evaluationPriority,
314
+ tokenMeter,
265
315
  response: [accept, reject],
266
316
  onDone
267
317
  });
@@ -337,17 +387,46 @@ export class LlamaContext {
337
387
  }
338
388
  /** @internal */
339
389
  static async _create(options, { _model }) {
340
- const context = new LlamaContext({ _model }, options);
390
+ const sequences = options.sequences ?? getDefaultContextSequences();
391
+ const contextSize = resolveContextContextSizeOption({
392
+ contextSize: options.contextSize,
393
+ batchSize: options.batchSize,
394
+ sequences: sequences,
395
+ modelFileInsights: _model.fileInsights,
396
+ modelGpuLayers: _model.gpuLayers,
397
+ modelTrainContextSize: _model.trainContextSize,
398
+ getVramState: () => _model._llama._vramOrchestrator.getMemoryState(),
399
+ llamaGpu: _model._llama.gpu,
400
+ ignoreMemorySafetyChecks: options.ignoreMemorySafetyChecks,
401
+ isEmbeddingContext: options._embeddings
402
+ });
403
+ const batchSize = options.batchSize ?? getDefaultContextBatchSize({ contextSize, sequences });
404
+ const vramRequiredEstimate = _model.fileInsights.estimateContextResourceRequirements({
405
+ contextSize,
406
+ sequences,
407
+ isEmbeddingContext: options._embeddings,
408
+ modelGpuLayers: _model.gpuLayers,
409
+ batchSize
410
+ }).gpuVram;
411
+ const context = new LlamaContext({ _model }, { ...options, contextSize, batchSize, sequences });
341
412
  const { createSignal } = options;
342
- const contextLoaded = await context._ctx.init();
343
- if (createSignal?.aborted) {
344
- if (contextLoaded)
345
- await context._ctx.dispose();
346
- throw createSignal.reason;
413
+ const contextCreationMemoryReservation = options.ignoreMemorySafetyChecks
414
+ ? null
415
+ : _model._llama._vramOrchestrator.reserveMemory(vramRequiredEstimate);
416
+ try {
417
+ const contextLoaded = await context._ctx.init();
418
+ if (createSignal?.aborted) {
419
+ if (contextLoaded)
420
+ await context._ctx.dispose();
421
+ throw createSignal.reason;
422
+ }
423
+ else if (!contextLoaded)
424
+ throw new Error("Failed to create context");
425
+ return context;
426
+ }
427
+ finally {
428
+ contextCreationMemoryReservation?.dispose?.();
347
429
  }
348
- else if (!contextLoaded)
349
- throw new Error("Failed to create context");
350
- return context;
351
430
  }
352
431
  }
353
432
  export class LlamaContextSequence {
@@ -355,14 +434,16 @@ export class LlamaContextSequence {
355
434
  /** @internal */ _gcRegistry;
356
435
  /** @internal */ _context;
357
436
  /** @internal */ _contextShift;
437
+ /** @internal */ _tokenMeter;
358
438
  /** @internal */ _disposeAggregator = new DisposeAggregator();
359
439
  /** @internal */ _contextTokens = [];
360
440
  /** @internal */ _nextTokenIndex = 0;
361
441
  /** @internal */ _disposed = false;
362
442
  onDispose = new EventRelay();
363
- constructor({ sequenceId, context, contextShift }) {
443
+ constructor({ sequenceId, context, tokenMeter, contextShift }) {
364
444
  this._sequenceId = sequenceId;
365
445
  this._context = context;
446
+ this._tokenMeter = tokenMeter ?? new TokenMeter();
366
447
  this._contextShift = contextShift;
367
448
  this._gcRegistry = new FinalizationRegistry(this._context._reclaimUnusedSequenceId);
368
449
  this._gcRegistry.register(this, sequenceId);
@@ -399,6 +480,9 @@ export class LlamaContextSequence {
399
480
  get contextTokens() {
400
481
  return this._contextTokens.slice();
401
482
  }
483
+ get tokenMeter() {
484
+ return this._tokenMeter;
485
+ }
402
486
  get isLoadedToMemory() {
403
487
  return !this._disposed;
404
488
  }
@@ -424,7 +508,7 @@ export class LlamaContextSequence {
424
508
  }
425
509
  /**
426
510
  * Erase context tokens in the provided ranges to free up space for new tokens to be generated.
427
- * the start and end of each range are exclusive.
511
+ * The start of each range is inclusive, and the end of each range is exclusive.
428
512
  * For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
429
513
  */
430
514
  async eraseContextTokenRanges(ranges) {
@@ -531,7 +615,7 @@ export class LlamaContextSequence {
531
615
  while (true) {
532
616
  this._ensureNotDisposed();
533
617
  // Evaluate to get the next token.
534
- const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, contextShiftOptions, (batchLogitIndex) => {
618
+ const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
535
619
  const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
536
620
  ? repeatPenalty.punishTokens()
537
621
  : repeatPenalty?.punishTokens;
@@ -565,7 +649,7 @@ export class LlamaContextSequence {
565
649
  }
566
650
  }
567
651
  /** @internal */
568
- async _decodeTokens(tokens, generateLogit, evaluationPriority, contextShiftOptions, onDecodeDone) {
652
+ async _decodeTokens(tokens, generateLogit, evaluationPriority, tokenMeter, contextShiftOptions, onDecodeDone) {
569
653
  this._ensureNotDisposed();
570
654
  const tokensLeftToDecode = tokens.slice();
571
655
  return await withLock(this, "evaluate", async () => {
@@ -585,7 +669,8 @@ export class LlamaContextSequence {
585
669
  tokens: tokensToDecode,
586
670
  firstTokenSequenceIndex: this._nextTokenIndex,
587
671
  generateLogitAtTheEnd,
588
- evaluationPriority
672
+ evaluationPriority,
673
+ tokenMeter
589
674
  }, !generateLogitAtTheEnd
590
675
  ? undefined
591
676
  : onDecodeDone);
@@ -632,10 +717,11 @@ export class LlamaContextSequence {
632
717
  * We need this to make it impossible to manually create instances of this class outside the code of this library
633
718
  * @internal
634
719
  */
635
- static _create({ sequenceId, context, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} }) {
720
+ static _create({ sequenceId, context, tokenMeter, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} }) {
636
721
  return new LlamaContextSequence({
637
722
  sequenceId,
638
723
  context,
724
+ tokenMeter,
639
725
  contextShift: {
640
726
  size: contextShiftSize,
641
727
  strategy: contextShiftStrategy
@@ -653,4 +739,63 @@ function disposeContextSequenceIfReferenced(contextRef) {
653
739
  if (context != null)
654
740
  context.dispose();
655
741
  }
/**
 * Resolve the `contextSize` option of a context into a concrete number.
 *
 * - A number is floored and clamped to at least `1`. Unless `ignoreMemorySafetyChecks` is set,
 *   an error is thrown when the estimated VRAM for that size exceeds the currently free VRAM.
 * - `"auto"` (also used when the option is `null`/`undefined`) or a `{min?, max?}` object selects
 *   the largest size in the allowed range whose estimated VRAM fits in the free VRAM.
 *   The upper bound is the model's train context size (or the default fallback when it is unknown),
 *   further limited by `max` when given.
 *   When there is no GPU (`llamaGpu === false`) or the reported total VRAM is `0`, no VRAM fitting
 *   is done and the upper bound of the range is returned.
 * @param {object} options
 * @param {number | "auto" | {min?: number, max?: number} | null} [options.contextSize]
 * @param {number} [options.batchSize] - when omitted, the default batch size for each tested context size is used
 * @param {number} options.sequences
 * @param {object} options.modelFileInsights - must expose `estimateContextResourceRequirements(...)` returning `{gpuVram}`
 * @param {number} options.modelGpuLayers
 * @param {number} [options.modelTrainContextSize]
 * @param {() => {total: number, free: number}} options.getVramState
 * @param {unknown} options.llamaGpu - `false` means no GPU is used
 * @param {boolean} [options.ignoreMemorySafetyChecks]
 * @param {boolean} [options.isEmbeddingContext]
 * @returns {number} the resolved context size
 * @throws {Error} when the size does not fit in the available VRAM (and safety checks are not ignored),
 * or when `contextSize` is not a valid value
 */
export function resolveContextContextSizeOption({ contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, getVramState, llamaGpu, ignoreMemorySafetyChecks = false, isEmbeddingContext = false }) {
    const requestedSize = contextSize ?? "auto";
    const sequencesSuffix = sequences > 1 ? ` with ${sequences} sequences` : "";
    const estimateContextVram = (testedContextSize) => modelFileInsights.estimateContextResourceRequirements({
        contextSize: testedContextSize,
        batchSize: batchSize ?? getDefaultContextBatchSize({ contextSize: testedContextSize, sequences }),
        modelGpuLayers,
        sequences,
        isEmbeddingContext
    }).gpuVram;

    if (typeof requestedSize === "number") {
        // `NaN` would otherwise survive `Math.max(1, Math.floor(...))` and leak into the context
        if (Number.isNaN(requestedSize))
            throw new Error(`Invalid context size: "${contextSize}"`);

        const resolvedContextSize = Math.max(1, Math.floor(requestedSize));
        if (ignoreMemorySafetyChecks)
            return resolvedContextSize;

        const vramState = getVramState();
        if (estimateContextVram(resolvedContextSize) > vramState.free)
            throw new Error(`The context size of ${resolvedContextSize}${sequencesSuffix} is too large for the available VRAM`);

        return resolvedContextSize;
    }

    if (requestedSize === "auto" || typeof requestedSize === "object") {
        const defaultMaxContextSize = getDefaultModelContextSize({ trainContextSize: modelTrainContextSize });
        const maxContextSize = requestedSize === "auto"
            ? defaultMaxContextSize
            : Math.min(requestedSize.max ?? defaultMaxContextSize, defaultMaxContextSize);

        // Without a GPU (or without any VRAM) there is nothing to fit against, so use the upper bound
        // of the allowed range. This honors a user-provided `max` and the fallback for an unknown train size.
        if (llamaGpu === false)
            return maxContextSize;

        const vramState = getVramState();
        if (vramState.total === 0)
            return maxContextSize;

        const freeVram = vramState.free;
        // `minAllowedContextSizeInCalculations` is declared outside of this block
        const minContextSize = requestedSize === "auto"
            ? minAllowedContextSizeInCalculations
            : Math.max(requestedSize.min ?? minAllowedContextSizeInCalculations, minAllowedContextSizeInCalculations);

        // scan from the largest allowed size downwards and take the first size that fits
        for (let testContextSize = maxContextSize; testContextSize >= minContextSize; testContextSize--) {
            if (estimateContextVram(testContextSize) <= freeVram)
                return testContextSize;
        }

        if (ignoreMemorySafetyChecks)
            return minContextSize;

        throw new Error(`The available VRAM is too small to fit the context size of ${maxContextSize}${sequencesSuffix}`);
    }

    throw new Error(`Invalid context size: "${contextSize}"`);
}

/**
 * The default batch size for a context: the total number of tokens across all sequences
 * (`contextSize * sequences`), capped at `512`.
 */
export function getDefaultContextBatchSize({ contextSize, sequences }) {
    return Math.min(contextSize * sequences, 512);
}

/** The default number of sequences for a context. */
export function getDefaultContextSequences() {
    return 1;
}

const defaultFallbackContextSize = 4096;

/**
 * The default context size for a model: its train context size,
 * or `4096` when the train context size is `null`/`undefined`.
 */
export function getDefaultModelContextSize({ trainContextSize }) {
    return trainContextSize ?? defaultFallbackContextSize;
}
656
801
  //# sourceMappingURL=LlamaContext.js.map