node-llama-cpp 3.0.0-beta.13 → 3.0.0-beta.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (351)
  1. package/README.md +1 -1
  2. package/dist/ChatWrapper.js +4 -0
  3. package/dist/ChatWrapper.js.map +1 -1
  4. package/dist/bindings/AddonTypes.d.ts +35 -6
  5. package/dist/bindings/Llama.d.ts +12 -0
  6. package/dist/bindings/Llama.js +100 -7
  7. package/dist/bindings/Llama.js.map +1 -1
  8. package/dist/bindings/getLlama.d.ts +19 -1
  9. package/dist/bindings/getLlama.js +16 -6
  10. package/dist/bindings/getLlama.js.map +1 -1
  11. package/dist/bindings/types.d.ts +18 -0
  12. package/dist/bindings/types.js +31 -2
  13. package/dist/bindings/types.js.map +1 -1
  14. package/dist/bindings/utils/MemoryOrchestrator.d.ts +21 -0
  15. package/dist/bindings/utils/MemoryOrchestrator.js +49 -0
  16. package/dist/bindings/utils/MemoryOrchestrator.js.map +1 -0
  17. package/dist/bindings/utils/cloneLlamaCppRepo.d.ts +1 -1
  18. package/dist/bindings/utils/cloneLlamaCppRepo.js +4 -3
  19. package/dist/bindings/utils/cloneLlamaCppRepo.js.map +1 -1
  20. package/dist/bindings/utils/compileLLamaCpp.d.ts +4 -1
  21. package/dist/bindings/utils/compileLLamaCpp.js +133 -97
  22. package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
  23. package/dist/bindings/utils/detectAvailableComputeLayers.d.ts +3 -0
  24. package/dist/bindings/utils/detectAvailableComputeLayers.js +155 -13
  25. package/dist/bindings/utils/detectAvailableComputeLayers.js.map +1 -1
  26. package/dist/bindings/utils/getLlamaWithoutBackend.d.ts +5 -0
  27. package/dist/bindings/utils/getLlamaWithoutBackend.js +27 -0
  28. package/dist/bindings/utils/getLlamaWithoutBackend.js.map +1 -0
  29. package/dist/bindings/utils/logDistroInstallInstruction.d.ts +1 -0
  30. package/dist/bindings/utils/logDistroInstallInstruction.js +16 -6
  31. package/dist/bindings/utils/logDistroInstallInstruction.js.map +1 -1
  32. package/dist/bindings/utils/resolveCustomCmakeOptions.js +2 -2
  33. package/dist/bindings/utils/resolveCustomCmakeOptions.js.map +1 -1
  34. package/dist/bindings/utils/testBindingBinary.js +2 -2
  35. package/dist/bindings/utils/testBindingBinary.js.map +1 -1
  36. package/dist/bindings/utils/testCmakeBinary.d.ts +5 -0
  37. package/dist/bindings/utils/testCmakeBinary.js +32 -0
  38. package/dist/bindings/utils/testCmakeBinary.js.map +1 -0
  39. package/dist/chatWrappers/AlpacaChatWrapper.d.ts +2 -1
  40. package/dist/chatWrappers/AlpacaChatWrapper.js +9 -2
  41. package/dist/chatWrappers/AlpacaChatWrapper.js.map +1 -1
  42. package/dist/chatWrappers/ChatMLChatWrapper.js +12 -10
  43. package/dist/chatWrappers/ChatMLChatWrapper.js.map +1 -1
  44. package/dist/chatWrappers/FalconChatWrapper.d.ts +2 -1
  45. package/dist/chatWrappers/FalconChatWrapper.js +28 -11
  46. package/dist/chatWrappers/FalconChatWrapper.js.map +1 -1
  47. package/dist/chatWrappers/FunctionaryChatWrapper.js +59 -45
  48. package/dist/chatWrappers/FunctionaryChatWrapper.js.map +1 -1
  49. package/dist/chatWrappers/GemmaChatWrapper.js +9 -7
  50. package/dist/chatWrappers/GemmaChatWrapper.js.map +1 -1
  51. package/dist/chatWrappers/GeneralChatWrapper.d.ts +2 -1
  52. package/dist/chatWrappers/GeneralChatWrapper.js +35 -12
  53. package/dist/chatWrappers/GeneralChatWrapper.js.map +1 -1
  54. package/dist/chatWrappers/LlamaChatWrapper.d.ts +7 -0
  55. package/dist/chatWrappers/LlamaChatWrapper.js +26 -8
  56. package/dist/chatWrappers/LlamaChatWrapper.js.map +1 -1
  57. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.d.ts +73 -0
  58. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js +355 -0
  59. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js.map +1 -0
  60. package/dist/{TemplateChatWrapper.d.ts → chatWrappers/generic/TemplateChatWrapper.d.ts} +16 -18
  61. package/dist/{TemplateChatWrapper.js → chatWrappers/generic/TemplateChatWrapper.js} +31 -69
  62. package/dist/chatWrappers/generic/TemplateChatWrapper.js.map +1 -0
  63. package/dist/chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.d.ts +33 -0
  64. package/dist/chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.js +45 -0
  65. package/dist/chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.js.map +1 -0
  66. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.d.ts +4 -0
  67. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js +206 -0
  68. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js.map +1 -0
  69. package/dist/chatWrappers/utils/resolveChatWrapper.d.ts +67 -0
  70. package/dist/chatWrappers/utils/resolveChatWrapper.js +206 -0
  71. package/dist/chatWrappers/utils/resolveChatWrapper.js.map +1 -0
  72. package/dist/cli/cli.js +1 -1
  73. package/dist/cli/cli.js.map +1 -1
  74. package/dist/cli/commands/ChatCommand.d.ts +7 -4
  75. package/dist/cli/commands/ChatCommand.js +177 -70
  76. package/dist/cli/commands/ChatCommand.js.map +1 -1
  77. package/dist/cli/commands/ClearCommand.d.ts +1 -1
  78. package/dist/cli/commands/ClearCommand.js +5 -5
  79. package/dist/cli/commands/ClearCommand.js.map +1 -1
  80. package/dist/cli/commands/CompleteCommand.d.ts +3 -2
  81. package/dist/cli/commands/CompleteCommand.js +115 -51
  82. package/dist/cli/commands/CompleteCommand.js.map +1 -1
  83. package/dist/cli/commands/InfillCommand.d.ts +3 -2
  84. package/dist/cli/commands/InfillCommand.js +115 -51
  85. package/dist/cli/commands/InfillCommand.js.map +1 -1
  86. package/dist/cli/commands/OnPostInstallCommand.js +2 -0
  87. package/dist/cli/commands/OnPostInstallCommand.js.map +1 -1
  88. package/dist/cli/commands/{InspectCommand.d.ts → inspect/InspectCommand.d.ts} +1 -4
  89. package/dist/cli/commands/inspect/InspectCommand.js +17 -0
  90. package/dist/cli/commands/inspect/InspectCommand.js.map +1 -0
  91. package/dist/cli/commands/inspect/commands/InspectGgufCommand.d.ts +10 -0
  92. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js +108 -0
  93. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map +1 -0
  94. package/dist/cli/commands/inspect/commands/InspectGpuCommand.d.ts +4 -0
  95. package/dist/cli/commands/inspect/commands/InspectGpuCommand.js +98 -0
  96. package/dist/cli/commands/inspect/commands/InspectGpuCommand.js.map +1 -0
  97. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.d.ts +14 -0
  98. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js +577 -0
  99. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map +1 -0
  100. package/dist/cli/utils/ConsoleTable.d.ts +23 -0
  101. package/dist/cli/utils/ConsoleTable.js +86 -0
  102. package/dist/cli/utils/ConsoleTable.js.map +1 -0
  103. package/dist/cli/utils/printCommonInfoLines.d.ts +9 -0
  104. package/dist/cli/utils/printCommonInfoLines.js +70 -0
  105. package/dist/cli/utils/printCommonInfoLines.js.map +1 -0
  106. package/dist/cli/utils/printInfoLine.d.ts +10 -0
  107. package/dist/cli/utils/printInfoLine.js +45 -0
  108. package/dist/cli/utils/printInfoLine.js.map +1 -0
  109. package/dist/cli/utils/resolveCommandGgufPath.d.ts +1 -0
  110. package/dist/cli/utils/resolveCommandGgufPath.js +6 -0
  111. package/dist/cli/utils/resolveCommandGgufPath.js.map +1 -0
  112. package/dist/config.d.ts +3 -1
  113. package/dist/config.js +7 -1
  114. package/dist/config.js.map +1 -1
  115. package/dist/evaluator/LlamaChat/LlamaChat.js +13 -5
  116. package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
  117. package/dist/evaluator/LlamaCompletion.js +5 -3
  118. package/dist/evaluator/LlamaCompletion.js.map +1 -1
  119. package/dist/evaluator/LlamaContext/LlamaContext.d.ts +43 -9
  120. package/dist/evaluator/LlamaContext/LlamaContext.js +251 -60
  121. package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
  122. package/dist/evaluator/LlamaContext/types.d.ts +68 -10
  123. package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/firstInFirstOutStrategy.js.map +1 -0
  124. package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js.map +1 -0
  125. package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.d.ts +2 -0
  126. package/dist/evaluator/LlamaContext/utils/{resolveBatchItemsPrioritizingStrategy.js → resolveBatchItemsPrioritizationStrategy.js} +4 -4
  127. package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js.map +1 -0
  128. package/dist/evaluator/LlamaEmbeddingContext.d.ts +29 -7
  129. package/dist/evaluator/LlamaEmbeddingContext.js +31 -22
  130. package/dist/evaluator/LlamaEmbeddingContext.js.map +1 -1
  131. package/dist/evaluator/LlamaGrammar.js +1 -0
  132. package/dist/evaluator/LlamaGrammar.js.map +1 -1
  133. package/dist/evaluator/LlamaModel.d.ts +78 -20
  134. package/dist/evaluator/LlamaModel.js +385 -21
  135. package/dist/evaluator/LlamaModel.js.map +1 -1
  136. package/dist/evaluator/TokenMeter.d.ts +54 -0
  137. package/dist/evaluator/TokenMeter.js +86 -0
  138. package/dist/evaluator/TokenMeter.js.map +1 -0
  139. package/dist/gguf/GgufInsights.d.ts +40 -0
  140. package/dist/gguf/GgufInsights.js +350 -0
  141. package/dist/gguf/GgufInsights.js.map +1 -0
  142. package/dist/gguf/consts.d.ts +3 -0
  143. package/dist/gguf/consts.js +8 -0
  144. package/dist/gguf/consts.js.map +1 -0
  145. package/dist/gguf/errors/InvalidGgufMagicError.d.ts +3 -0
  146. package/dist/gguf/errors/InvalidGgufMagicError.js +6 -0
  147. package/dist/gguf/errors/InvalidGgufMagicError.js.map +1 -0
  148. package/dist/gguf/errors/UnsupportedGgufValueTypeError.d.ts +4 -0
  149. package/dist/gguf/errors/UnsupportedGgufValueTypeError.js +9 -0
  150. package/dist/gguf/errors/UnsupportedGgufValueTypeError.js.map +1 -0
  151. package/dist/gguf/fileReaders/GgufFileReader.d.ts +33 -0
  152. package/dist/gguf/fileReaders/GgufFileReader.js +76 -0
  153. package/dist/gguf/fileReaders/GgufFileReader.js.map +1 -0
  154. package/dist/gguf/fileReaders/GgufFsFileReader.d.ts +17 -0
  155. package/dist/gguf/fileReaders/GgufFsFileReader.js +45 -0
  156. package/dist/gguf/fileReaders/GgufFsFileReader.js.map +1 -0
  157. package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.d.ts +22 -0
  158. package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.js +63 -0
  159. package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.js.map +1 -0
  160. package/dist/gguf/parser/GgufV2Parser.d.ts +19 -0
  161. package/dist/gguf/parser/GgufV2Parser.js +115 -0
  162. package/dist/gguf/parser/GgufV2Parser.js.map +1 -0
  163. package/dist/gguf/parser/GgufV3Parser.d.ts +3 -0
  164. package/dist/gguf/parser/GgufV3Parser.js +4 -0
  165. package/dist/gguf/parser/GgufV3Parser.js.map +1 -0
  166. package/dist/gguf/parser/parseGguf.d.ts +8 -0
  167. package/dist/gguf/parser/parseGguf.js +58 -0
  168. package/dist/gguf/parser/parseGguf.js.map +1 -0
  169. package/dist/gguf/readGgufFileInfo.d.ts +30 -0
  170. package/dist/gguf/readGgufFileInfo.js +37 -0
  171. package/dist/gguf/readGgufFileInfo.js.map +1 -0
  172. package/dist/gguf/types/GgufFileInfoTypes.d.ts +52 -0
  173. package/dist/gguf/types/GgufFileInfoTypes.js +18 -0
  174. package/dist/gguf/types/GgufFileInfoTypes.js.map +1 -0
  175. package/dist/gguf/types/GgufMetadataTypes.d.ts +330 -0
  176. package/dist/gguf/types/GgufMetadataTypes.js +86 -0
  177. package/dist/gguf/types/GgufMetadataTypes.js.map +1 -0
  178. package/dist/gguf/types/GgufTensorInfoTypes.d.ts +37 -0
  179. package/dist/gguf/types/GgufTensorInfoTypes.js +33 -0
  180. package/dist/gguf/types/GgufTensorInfoTypes.js.map +1 -0
  181. package/dist/gguf/utils/GgufReadOffset.d.ts +6 -0
  182. package/dist/gguf/utils/GgufReadOffset.js +18 -0
  183. package/dist/gguf/utils/GgufReadOffset.js.map +1 -0
  184. package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.d.ts +5 -0
  185. package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.js +38 -0
  186. package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.js.map +1 -0
  187. package/dist/gguf/utils/getGgufFileTypeName.d.ts +4 -0
  188. package/dist/gguf/utils/getGgufFileTypeName.js +13 -0
  189. package/dist/gguf/utils/getGgufFileTypeName.js.map +1 -0
  190. package/dist/gguf/utils/getGgufMetadataArchitectureData.d.ts +3 -0
  191. package/dist/gguf/utils/getGgufMetadataArchitectureData.js +4 -0
  192. package/dist/gguf/utils/getGgufMetadataArchitectureData.js.map +1 -0
  193. package/dist/gguf/utils/normalizeGgufDownloadUrl.d.ts +1 -0
  194. package/dist/gguf/utils/normalizeGgufDownloadUrl.js +16 -0
  195. package/dist/gguf/utils/normalizeGgufDownloadUrl.js.map +1 -0
  196. package/dist/index.d.ts +13 -7
  197. package/dist/index.js +11 -6
  198. package/dist/index.js.map +1 -1
  199. package/dist/state.d.ts +2 -0
  200. package/dist/state.js +7 -0
  201. package/dist/state.js.map +1 -1
  202. package/dist/types.d.ts +1 -1
  203. package/dist/utils/DisposeGuard.d.ts +13 -0
  204. package/dist/utils/DisposeGuard.js +120 -0
  205. package/dist/utils/DisposeGuard.js.map +1 -0
  206. package/dist/utils/InsufficientMemoryError.d.ts +3 -0
  207. package/dist/utils/InsufficientMemoryError.js +6 -0
  208. package/dist/utils/InsufficientMemoryError.js.map +1 -0
  209. package/dist/utils/LlamaText.d.ts +25 -10
  210. package/dist/utils/LlamaText.js +205 -23
  211. package/dist/utils/LlamaText.js.map +1 -1
  212. package/dist/utils/StopGenerationDetector.js +3 -1
  213. package/dist/utils/StopGenerationDetector.js.map +1 -1
  214. package/dist/utils/cmake.js +1 -1
  215. package/dist/utils/cmake.js.map +1 -1
  216. package/dist/utils/findBestOption.d.ts +4 -0
  217. package/dist/utils/findBestOption.js +15 -0
  218. package/dist/utils/findBestOption.js.map +1 -0
  219. package/dist/utils/getConsoleLogPrefix.js +3 -2
  220. package/dist/utils/getConsoleLogPrefix.js.map +1 -1
  221. package/dist/utils/getQueuedTokensBeforeStopTrigger.js +3 -3
  222. package/dist/utils/getQueuedTokensBeforeStopTrigger.js.map +1 -1
  223. package/dist/utils/gitReleaseBundles.js +68 -1
  224. package/dist/utils/gitReleaseBundles.js.map +1 -1
  225. package/dist/utils/mergeUnionTypes.d.ts +4 -0
  226. package/dist/utils/parseModelFileName.d.ts +1 -0
  227. package/dist/utils/parseModelFileName.js +6 -1
  228. package/dist/utils/parseModelFileName.js.map +1 -1
  229. package/dist/utils/prettyPrintObject.d.ts +10 -1
  230. package/dist/utils/prettyPrintObject.js +57 -13
  231. package/dist/utils/prettyPrintObject.js.map +1 -1
  232. package/dist/utils/removeNullFields.d.ts +2 -2
  233. package/dist/utils/removeNullFields.js.map +1 -1
  234. package/dist/utils/spawnCommand.d.ts +11 -1
  235. package/dist/utils/spawnCommand.js +55 -7
  236. package/dist/utils/spawnCommand.js.map +1 -1
  237. package/dist/utils/tokenizeInput.d.ts +1 -1
  238. package/dist/utils/tokenizeInput.js +3 -3
  239. package/dist/utils/tokenizeInput.js.map +1 -1
  240. package/dist/utils/withOra.d.ts +1 -0
  241. package/dist/utils/withOra.js +2 -2
  242. package/dist/utils/withOra.js.map +1 -1
  243. package/llama/CMakeLists.txt +5 -5
  244. package/llama/addon.cpp +793 -88
  245. package/llama/binariesGithubRelease.json +1 -1
  246. package/llama/gitRelease.bundle +0 -0
  247. package/llama/gpuInfo/cuda-gpu-info.cu +21 -0
  248. package/llama/gpuInfo/cuda-gpu-info.h +3 -0
  249. package/llama/gpuInfo/metal-gpu-info.h +4 -1
  250. package/llama/gpuInfo/metal-gpu-info.mm +14 -1
  251. package/llama/gpuInfo/vulkan-gpu-info.cpp +20 -2
  252. package/llama/gpuInfo/vulkan-gpu-info.h +2 -0
  253. package/llama/grammars/json.gbnf +1 -1
  254. package/llama/grammars/json_arr.gbnf +1 -1
  255. package/llama/llama.cpp.info.json +1 -1
  256. package/llama/toolchains/win32.host-x64.target-arm64.cmake +41 -0
  257. package/llamaBins/linux-arm64/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
  258. package/llamaBins/linux-arm64/llama-addon.node +0 -0
  259. package/llamaBins/linux-armv7l/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
  260. package/llamaBins/linux-armv7l/llama-addon.node +0 -0
  261. package/llamaBins/linux-x64/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
  262. package/llamaBins/linux-x64/llama-addon.node +0 -0
  263. package/llamaBins/linux-x64-cuda/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
  264. package/llamaBins/linux-x64-cuda/llama-addon.node +0 -0
  265. package/llamaBins/linux-x64-vulkan/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
  266. package/llamaBins/linux-x64-vulkan/llama-addon.node +0 -0
  267. package/llamaBins/mac-arm64-metal/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
  268. package/llamaBins/mac-arm64-metal/default.metallib +0 -0
  269. package/llamaBins/mac-arm64-metal/llama-addon.node +0 -0
  270. package/llamaBins/mac-x64/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
  271. package/llamaBins/mac-x64/llama-addon.node +0 -0
  272. package/llamaBins/win-arm64/_nlcBuildMetadata.json +1 -0
  273. package/llamaBins/win-arm64/llama-addon.exp +0 -0
  274. package/llamaBins/win-arm64/llama-addon.lib +0 -0
  275. package/llamaBins/win-arm64/llama-addon.node +0 -0
  276. package/llamaBins/win-x64/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
  277. package/llamaBins/win-x64/llama-addon.exp +0 -0
  278. package/llamaBins/win-x64/llama-addon.lib +0 -0
  279. package/llamaBins/win-x64/llama-addon.node +0 -0
  280. package/llamaBins/win-x64-cuda/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
  281. package/llamaBins/win-x64-cuda/llama-addon.exp +0 -0
  282. package/llamaBins/win-x64-cuda/llama-addon.lib +0 -0
  283. package/llamaBins/win-x64-cuda/llama-addon.node +0 -0
  284. package/llamaBins/win-x64-vulkan/{.buildMetadata.json → _nlcBuildMetadata.json} +1 -1
  285. package/llamaBins/win-x64-vulkan/llama-addon.exp +0 -0
  286. package/llamaBins/win-x64-vulkan/llama-addon.lib +0 -0
  287. package/llamaBins/win-x64-vulkan/llama-addon.node +0 -0
  288. package/package.json +16 -11
  289. package/dist/TemplateChatWrapper.js.map +0 -1
  290. package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.d.ts +0 -33
  291. package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.js +0 -49
  292. package/dist/bindings/utils/resolveChatWrapperBasedOnWrapperTypeName.js.map +0 -1
  293. package/dist/chatWrappers/resolveChatWrapperBasedOnModel.d.ts +0 -13
  294. package/dist/chatWrappers/resolveChatWrapperBasedOnModel.js +0 -63
  295. package/dist/chatWrappers/resolveChatWrapperBasedOnModel.js.map +0 -1
  296. package/dist/cli/commands/InspectCommand.js +0 -113
  297. package/dist/cli/commands/InspectCommand.js.map +0 -1
  298. package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizingStrategies/firstInFirstOutStrategy.js.map +0 -1
  299. package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizingStrategies/maximumParallelismStrategy.js.map +0 -1
  300. package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizingStrategy.d.ts +0 -2
  301. package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizingStrategy.js.map +0 -1
  302. package/dist/gguf/GGUFInsights.d.ts +0 -28
  303. package/dist/gguf/GGUFInsights.js +0 -58
  304. package/dist/gguf/GGUFInsights.js.map +0 -1
  305. package/dist/gguf/GGUFMetadata.d.ts +0 -19
  306. package/dist/gguf/GGUFMetadata.js +0 -38
  307. package/dist/gguf/GGUFMetadata.js.map +0 -1
  308. package/dist/gguf/errors/InvalidGGUFMagicError.d.ts +0 -3
  309. package/dist/gguf/errors/InvalidGGUFMagicError.js +0 -6
  310. package/dist/gguf/errors/InvalidGGUFMagicError.js.map +0 -1
  311. package/dist/gguf/errors/MetadataNotParsedYetError.d.ts +0 -3
  312. package/dist/gguf/errors/MetadataNotParsedYetError.js +0 -6
  313. package/dist/gguf/errors/MetadataNotParsedYetError.js.map +0 -1
  314. package/dist/gguf/errors/MissingNodeLlamaError.d.ts +0 -3
  315. package/dist/gguf/errors/MissingNodeLlamaError.js +0 -6
  316. package/dist/gguf/errors/MissingNodeLlamaError.js.map +0 -1
  317. package/dist/gguf/errors/ModelScore/NotEnoughVRamError.d.ts +0 -5
  318. package/dist/gguf/errors/ModelScore/NotEnoughVRamError.js +0 -12
  319. package/dist/gguf/errors/ModelScore/NotEnoughVRamError.js.map +0 -1
  320. package/dist/gguf/errors/UnsupportedMetadataTypeError.d.ts +0 -4
  321. package/dist/gguf/errors/UnsupportedMetadataTypeError.js +0 -8
  322. package/dist/gguf/errors/UnsupportedMetadataTypeError.js.map +0 -1
  323. package/dist/gguf/ggufParser/GGUFParser.d.ts +0 -18
  324. package/dist/gguf/ggufParser/GGUFParser.js +0 -123
  325. package/dist/gguf/ggufParser/GGUFParser.js.map +0 -1
  326. package/dist/gguf/ggufParser/GGUFTypes.d.ts +0 -257
  327. package/dist/gguf/ggufParser/GGUFTypes.js +0 -2
  328. package/dist/gguf/ggufParser/GGUFTypes.js.map +0 -1
  329. package/dist/gguf/ggufParser/checkArchitecture.d.ts +0 -14
  330. package/dist/gguf/ggufParser/checkArchitecture.js +0 -74
  331. package/dist/gguf/ggufParser/checkArchitecture.js.map +0 -1
  332. package/dist/gguf/ggufParser/stream/GGUFBaseStream.d.ts +0 -38
  333. package/dist/gguf/ggufParser/stream/GGUFBaseStream.js +0 -83
  334. package/dist/gguf/ggufParser/stream/GGUFBaseStream.js.map +0 -1
  335. package/dist/gguf/ggufParser/stream/GGUFFetchStream.d.ts +0 -14
  336. package/dist/gguf/ggufParser/stream/GGUFFetchStream.js +0 -35
  337. package/dist/gguf/ggufParser/stream/GGUFFetchStream.js.map +0 -1
  338. package/dist/gguf/ggufParser/stream/GGUFReadStream.d.ts +0 -15
  339. package/dist/gguf/ggufParser/stream/GGUFReadStream.js +0 -40
  340. package/dist/gguf/ggufParser/stream/GGUFReadStream.js.map +0 -1
  341. package/dist/utils/parseModelTypeDescription.d.ts +0 -6
  342. package/dist/utils/parseModelTypeDescription.js +0 -9
  343. package/dist/utils/parseModelTypeDescription.js.map +0 -1
  344. package/dist/utils/resolveChatWrapper.d.ts +0 -4
  345. package/dist/utils/resolveChatWrapper.js +0 -16
  346. package/dist/utils/resolveChatWrapper.js.map +0 -1
  347. package/llamaBins/mac-arm64-metal/ggml-metal.metal +0 -7731
  348. /package/dist/evaluator/LlamaContext/utils/{batchItemsPrioritizingStrategies → batchItemsPrioritizationStrategies}/firstInFirstOutStrategy.d.ts +0 -0
  349. /package/dist/evaluator/LlamaContext/utils/{batchItemsPrioritizingStrategies → batchItemsPrioritizationStrategies}/firstInFirstOutStrategy.js +0 -0
  350. /package/dist/evaluator/LlamaContext/utils/{batchItemsPrioritizingStrategies → batchItemsPrioritizationStrategies}/maximumParallelismStrategy.d.ts +0 -0
  351. /package/dist/evaluator/LlamaContext/utils/{batchItemsPrioritizingStrategies → batchItemsPrioritizationStrategies}/maximumParallelismStrategy.js +0 -0
@@ -1,21 +1,26 @@
1
1
  import { EventRelay } from "lifecycle-utils";
2
2
  import { Token } from "../../types.js";
3
- import { LlamaModel } from "../LlamaModel.js";
4
3
  import { LlamaGrammarEvaluationState } from "../LlamaGrammarEvaluationState.js";
4
+ import { GgufInsights } from "../../gguf/GgufInsights.js";
5
+ import { TokenMeter } from "../TokenMeter.js";
6
+ import { BuildGpu } from "../../bindings/types.js";
5
7
  import { ContextShiftOptions, ContextTokensDeleteRange, EvaluationPriority, LlamaContextOptions, LlamaContextSequenceRepeatPenalty } from "./types.js";
8
+ import type { LlamaModel } from "../LlamaModel.js";
6
9
  export declare class LlamaContext {
7
10
  readonly onDispose: EventRelay<void>;
8
- /**
9
- * @param options
10
- */
11
- constructor({ model, sequences, seed, contextSize, batchSize, threads, batching: { dispatchSchedule: batchingDispatchSchedule, itemsPrioritizingStrategy: batchingItemsPrioritizingStrategy }, _embedding, _noSeed }: LlamaContextOptions);
12
- dispose(): void;
11
+ private constructor();
12
+ dispose(): Promise<void>;
13
13
  /** @hidden */
14
- [Symbol.dispose](): void;
14
+ [Symbol.asyncDispose](): Promise<void>;
15
15
  get disposed(): boolean;
16
16
  get model(): LlamaModel;
17
17
  get contextSize(): number;
18
18
  get batchSize(): number;
19
+ /**
20
+ * The actual size of the state in the memory in bytes.
21
+ * This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
22
+ */
23
+ get stateSize(): number;
19
24
  getAllocatedContextSize(): number;
20
25
  get totalSequences(): number;
21
26
  get sequencesLeft(): number;
@@ -24,10 +29,15 @@ export declare class LlamaContext {
24
29
  * When there are no sequences left, this method will throw an error.
25
30
  * @param [options]
26
31
  */
27
- getSequence({ contextShift: { size: contextShiftSize, strategy: contextShiftStrategy } }?: {
32
+ getSequence({ contextShift: { size: contextShiftSize, strategy: contextShiftStrategy }, _tokenMeter }?: {
28
33
  contextShift?: ContextShiftOptions;
29
34
  }): LlamaContextSequence;
30
35
  dispatchPendingBatch(): void;
36
+ /**
37
+ * Print the timings of token evaluation since that last print for this context.
38
+ * > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
39
+ * it won't print anything.
40
+ */
31
41
  printTimings(): Promise<void>;
32
42
  }
33
43
  export declare class LlamaContextSequence {
@@ -41,6 +51,7 @@ export declare class LlamaContextSequence {
41
51
  get model(): LlamaModel;
42
52
  get nextTokenIndex(): number;
43
53
  get contextTokens(): Token[];
54
+ get tokenMeter(): TokenMeter;
44
55
  get isLoadedToMemory(): boolean;
45
56
  compareContextTokens(tokens: Token[]): {
46
57
  firstDifferentIndex: number;
@@ -52,7 +63,7 @@ export declare class LlamaContextSequence {
52
63
  clearHistory(): Promise<void>;
53
64
  /**
54
65
  * Erase context tokens in the provided ranges to free up space for new tokens to be generated.
55
- * the start and end of each range are exclusive.
66
+ * The start of each range is inclusive, and the end of each range is exclusive.
56
67
  * For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
57
68
  */
58
69
  eraseContextTokenRanges(ranges: ContextTokensDeleteRange[]): Promise<void>;
@@ -107,3 +118,26 @@ export declare class LlamaContextSequence {
107
118
  contextShift?: ContextShiftOptions;
108
119
  }): Promise<void>;
109
120
  }
121
+ export declare function resolveContextContextSizeOption({ contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, getVramState, llamaGpu, ignoreMemorySafetyChecks, isEmbeddingContext }: {
122
+ contextSize?: LlamaContextOptions["contextSize"];
123
+ batchSize?: LlamaContextOptions["batchSize"];
124
+ sequences: number;
125
+ modelFileInsights: GgufInsights;
126
+ modelGpuLayers: number;
127
+ modelTrainContextSize: number;
128
+ getVramState(): {
129
+ total: number;
130
+ free: number;
131
+ };
132
+ llamaGpu: BuildGpu;
133
+ ignoreMemorySafetyChecks?: boolean;
134
+ isEmbeddingContext?: boolean;
135
+ }): number;
136
+ export declare function getDefaultContextBatchSize({ contextSize, sequences }: {
137
+ contextSize: number;
138
+ sequences: number;
139
+ }): number;
140
+ export declare function getDefaultContextSequences(): number;
141
+ export declare function getDefaultModelContextSize({ trainContextSize }: {
142
+ trainContextSize?: number;
143
+ }): number;
@@ -1,11 +1,15 @@
1
- import { DisposeAggregator, EventRelay, withLock, DisposedError } from "lifecycle-utils";
1
+ import { DisposeAggregator, EventRelay, withLock, DisposedError, AsyncDisposeAggregator } from "lifecycle-utils";
2
2
  import { removeNullFields } from "../../utils/removeNullFields.js";
3
3
  import { compareTokens } from "../../utils/compareTokens.js";
4
- import { resolveBatchItemsPrioritizingStrategy } from "./utils/resolveBatchItemsPrioritizingStrategy.js";
4
+ import { DisposeGuard } from "../../utils/DisposeGuard.js";
5
+ import { minAllowedContextSizeInCalculations } from "../../config.js";
6
+ import { TokenMeter } from "../TokenMeter.js";
7
+ import { resolveBatchItemsPrioritizationStrategy } from "./utils/resolveBatchItemsPrioritizationStrategy.js";
5
8
  export class LlamaContext {
6
9
  /** @internal */ _llama;
7
10
  /** @internal */ _ctx;
8
11
  /** @internal */ _onReclaimUnusedSequenceId = new EventRelay();
12
+ /** @internal */ _backendContextDisposeGuard;
9
13
  /** @internal */ _model;
10
14
  /** @internal */ _contextSize;
11
15
  /** @internal */ _batchSize;
@@ -14,7 +18,8 @@ export class LlamaContext {
14
18
  /** @internal */ _batchingOptions;
15
19
  /** @internal */ _queuedDecodeSequenceIds = new Set();
16
20
  /** @internal */ _queuedDecodes = [];
17
- /** @internal */ _disposeAggregator = new DisposeAggregator();
21
+ /** @internal */ _disposeAggregator = new AsyncDisposeAggregator();
22
+ /** @internal */ _modelPreventDisposalHandle;
18
23
  /** @internal */ _nextGeneratedSequenceId = 0;
19
24
  /** @internal */ _dispatchDecodeScheduled = false;
20
25
  /** @internal */ _batchDispatchPending = false;
@@ -22,14 +27,13 @@ export class LlamaContext {
22
27
  /** @internal */ _allocatedContextSize;
23
28
  /** @internal */ _disposed = false;
24
29
  onDispose = new EventRelay();
25
- /**
26
- * @param options
27
- */
28
- constructor({ model, sequences = 1, seed = null, contextSize = model.trainContextSize, batchSize = contextSize, threads = 6, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemsPrioritizingStrategy: batchingItemsPrioritizingStrategy = "maximumParallelism" } = {}, _embedding, _noSeed }) {
29
- if (model.disposed)
30
+ constructor({ _model }, { sequences, seed = null, contextSize, batchSize, threads = 6, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, _embeddings, _noSeed }) {
31
+ if (_model.disposed)
30
32
  throw new DisposedError();
31
- this._llama = model._llama;
32
- this._model = model;
33
+ this._llama = _model._llama;
34
+ this._model = _model;
35
+ this._backendContextDisposeGuard = new DisposeGuard([this._model._backendModelDisposeGuard]);
36
+ this._modelPreventDisposalHandle = this._model._backendModelDisposeGuard.createPreventDisposalHandle();
33
37
  this._totalSequences = Math.max(1, Math.floor(sequences));
34
38
  this._contextSize = Math.max(2, contextSize);
35
39
  this._batchSize = Math.max(batchSize, this._totalSequences);
@@ -37,30 +41,36 @@ export class LlamaContext {
37
41
  seed: seed != null ? Math.max(-1, Math.floor(seed)) : undefined,
38
42
  contextSize: this._contextSize * this._totalSequences,
39
43
  batchSize: this._batchSize,
44
+ sequences: this._totalSequences,
40
45
  threads: Math.max(0, Math.floor(threads)),
41
- embedding: _embedding,
46
+ embeddings: _embeddings,
42
47
  noSeed: _noSeed
43
48
  }));
44
49
  this._batchingOptions = {
45
50
  dispatchSchedule: batchingDispatchSchedule,
46
- itemsPrioritizingStrategy: batchingItemsPrioritizingStrategy
51
+ itemPrioritizationStrategy: batchingItemsPrioritizationStrategy
47
52
  };
48
53
  this._reclaimUnusedSequenceId = this._reclaimUnusedSequenceId.bind(this);
49
- this._disposeAggregator.add(this._onReclaimUnusedSequenceId);
50
- this._disposeAggregator.add(this.onDispose.dispatchEvent);
51
54
  this._disposeAggregator.add(() => {
52
- this._ctx.dispose();
55
+ this._disposed = true;
53
56
  });
57
+ this._disposeAggregator.add(this._onReclaimUnusedSequenceId);
58
+ this._disposeAggregator.add(this.onDispose.dispatchEvent);
54
59
  this._disposeAggregator.add(this.model.onDispose.createListener(disposeContextIfReferenced.bind(null, new WeakRef(this))));
60
+ this._disposeAggregator.add(async () => {
61
+ await this._backendContextDisposeGuard.acquireDisposeLock();
62
+ await this._ctx.dispose();
63
+ this._modelPreventDisposalHandle.dispose();
64
+ });
55
65
  }
56
- dispose() {
66
+ async dispose() {
57
67
  if (this._disposed)
58
68
  return;
59
69
  this._disposed = true;
60
- this._disposeAggregator.dispose();
70
+ await this._disposeAggregator.dispose();
61
71
  }
62
72
  /** @hidden */
63
- [Symbol.dispose]() {
73
+ [Symbol.asyncDispose]() {
64
74
  return this.dispose();
65
75
  }
66
76
  get disposed() {
@@ -75,6 +85,14 @@ export class LlamaContext {
75
85
  get batchSize() {
76
86
  return this._batchSize;
77
87
  }
88
+ /**
89
+ * The actual size of the state in the memory in bytes.
90
+ * This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
91
+ */
92
+ get stateSize() {
93
+ this._ensureNotDisposed();
94
+ return this._ctx.getStateSize();
95
+ }
78
96
  getAllocatedContextSize() {
79
97
  this._ensureNotDisposed();
80
98
  if (this._allocatedContextSize == null)
@@ -92,7 +110,7 @@ export class LlamaContext {
92
110
  * When there are no sequences left, this method will throw an error.
93
111
  * @param [options]
94
112
  */
95
- getSequence({ contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} } = {}) {
113
+ getSequence({ contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(this.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {}, _tokenMeter } = {}) {
96
114
  this._ensureNotDisposed();
97
115
  const nextSequenceId = this._popSequenceId();
98
116
  if (nextSequenceId == null)
@@ -100,6 +118,7 @@ export class LlamaContext {
100
118
  return LlamaContextSequence._create({
101
119
  sequenceId: nextSequenceId,
102
120
  context: this,
121
+ tokenMeter: _tokenMeter,
103
122
  contextShift: {
104
123
  size: contextShiftSize,
105
124
  strategy: contextShiftStrategy
@@ -116,17 +135,18 @@ export class LlamaContext {
116
135
  this._currentDispatchBatchHandle = {};
117
136
  this._dispatchDecodeScheduled = false;
118
137
  this._batchDispatchPending = false;
119
- let prioritizeStrategy;
120
- try {
121
- this._ensureNotDisposed();
122
- prioritizeStrategy = resolveBatchItemsPrioritizingStrategy(this._batchingOptions.itemsPrioritizingStrategy);
123
- }
124
- catch (err) {
125
- this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
126
- return;
127
- }
128
- let shouldHaveAnotherBatch = this._queuedDecodes.length > 0;
129
- while (shouldHaveAnotherBatch) {
138
+ let shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
139
+ const resolvePrioritizationStrategy = () => {
140
+ try {
141
+ this._ensureNotDisposed();
142
+ return resolveBatchItemsPrioritizationStrategy(this._batchingOptions.itemPrioritizationStrategy);
143
+ }
144
+ catch (err) {
145
+ this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
146
+ }
147
+ return null;
148
+ };
149
+ const getOrderedQueuedDecodes = (prioritizationStrategy) => {
130
150
  const batchItemToQueuedDecodeMap = new Map();
131
151
  const batchItemsList = [];
132
152
  for (const queuedDecode of this._queuedDecodes) {
@@ -139,42 +159,65 @@ export class LlamaContext {
139
159
  }
140
160
  let prioritizedItems;
141
161
  try {
142
- prioritizedItems = prioritizeStrategy({
162
+ prioritizedItems = prioritizationStrategy({
143
163
  items: batchItemsList,
144
164
  size: this._batchSize
145
165
  });
146
166
  }
147
167
  catch (err) {
148
168
  this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
149
- return;
169
+ return null;
150
170
  }
151
- let batchTokenSlotsLeft = this._batchSize;
152
- const afterDecodeActions = [];
153
- const queuedDecodesToDelete = new Set();
154
- const currentQueuedDecodeItems = new Set();
155
- const currentBatchItems = [];
156
- let currentBatchSize = 0;
157
- for (const prioritizedItem of prioritizedItems) {
171
+ return prioritizedItems.map((prioritizedItem) => {
158
172
  const queuedDecode = batchItemToQueuedDecodeMap.get(prioritizedItem.item);
159
173
  if (queuedDecode == null)
160
174
  throw new Error("Received invalid batch item. Make sure you keep the original object reference " +
161
175
  "of the batch item on `item` on `PrioritizedBatchItem` in your custom prioritization strategy");
162
- const processAmount = Math.min(queuedDecode.tokens.length, prioritizedItem.processAmount, batchTokenSlotsLeft);
163
- if (processAmount <= 0)
176
+ return {
177
+ queuedDecode,
178
+ processAmount: prioritizedItem.processAmount
179
+ };
180
+ });
181
+ };
182
+ const fitQueuedDecodesToABatch = (queuedDecodes, batchSize) => {
183
+ const currentBatchItems = [];
184
+ let currentBatchSize = 0;
185
+ let batchTokenSlotsLeft = batchSize;
186
+ for (const { queuedDecode, processAmount } of queuedDecodes) {
187
+ const resolvedProcessAmount = Math.min(processAmount <= 0 ? 1 : processAmount, queuedDecode.tokens.length, batchTokenSlotsLeft);
188
+ if (resolvedProcessAmount <= 0) {
189
+ if (batchTokenSlotsLeft === 0)
190
+ break;
164
191
  continue;
165
- batchTokenSlotsLeft -= processAmount;
192
+ }
193
+ batchTokenSlotsLeft -= resolvedProcessAmount;
194
+ currentBatchSize += resolvedProcessAmount;
166
195
  currentBatchItems.push({
167
196
  queuedDecode,
168
- processAmount
197
+ processAmount: resolvedProcessAmount
169
198
  });
170
- currentBatchSize += processAmount;
171
199
  }
200
+ return {
201
+ currentBatchItems,
202
+ currentBatchSize
203
+ };
204
+ };
205
+ const decodeTokenBatchItems = async (batchItems, currentBatchSize) => {
206
+ const afterDecodeActions = [];
207
+ const queuedDecodesToDelete = new Set();
208
+ const currentQueuedDecodeItems = new Set();
172
209
  if (currentBatchSize !== 0)
173
210
  this._ctx.initBatch(currentBatchSize);
174
- for (const { queuedDecode, processAmount } of currentBatchItems) {
211
+ for (const { queuedDecode, processAmount } of batchItems) {
175
212
  let batchLogitIndex;
176
213
  try {
177
- batchLogitIndex = this._ctx.addToBatch(queuedDecode.sequenceId, queuedDecode.firstTokenSequenceIndex, Uint32Array.from(queuedDecode.tokens.slice(0, processAmount)), queuedDecode.generateLogitAtTheEnd && processAmount === queuedDecode.tokens.length);
214
+ const shouldGenerateLogitAtTheEnd = queuedDecode.generateLogitAtTheEnd &&
215
+ processAmount === queuedDecode.tokens.length;
216
+ const tokensToProcess = queuedDecode.tokens.slice(0, processAmount);
217
+ const numberOfOutputTokens = shouldGenerateLogitAtTheEnd ? 1 : 0;
218
+ TokenMeter.useTokens(queuedDecode.tokenMeter, Math.max(0, tokensToProcess.length - numberOfOutputTokens), "input");
219
+ TokenMeter.useTokens(queuedDecode.tokenMeter, numberOfOutputTokens, "output");
220
+ batchLogitIndex = this._ctx.addToBatch(queuedDecode.sequenceId, queuedDecode.firstTokenSequenceIndex, Uint32Array.from(tokensToProcess), shouldGenerateLogitAtTheEnd);
178
221
  }
179
222
  catch (err) {
180
223
  this._dispatchErrorForQueuedDecodesAndDequeue(new Set([queuedDecode]), err);
@@ -193,8 +236,6 @@ export class LlamaContext {
193
236
  queuedDecode.tokens = queuedDecode.tokens.slice(processAmount);
194
237
  queuedDecode.firstTokenSequenceIndex += processAmount;
195
238
  }
196
- if (batchTokenSlotsLeft === 0)
197
- break;
198
239
  }
199
240
  for (let i = 0; i < this._queuedDecodes.length; i++) {
200
241
  const queuedDecode = this._queuedDecodes[i];
@@ -204,7 +245,6 @@ export class LlamaContext {
204
245
  i--;
205
246
  }
206
247
  }
207
- shouldHaveAnotherBatch = this._queuedDecodes.length > 0;
208
248
  try {
209
249
  if (currentBatchSize !== 0)
210
250
  await this._ctx.decodeBatch();
@@ -225,15 +265,45 @@ export class LlamaContext {
225
265
  }
226
266
  accept(undefined);
227
267
  }
268
+ };
269
+ const prioritizationStrategy = resolvePrioritizationStrategy();
270
+ if (prioritizationStrategy == null)
271
+ return; // all queued items are rejected and dequeued when we get here
272
+ while (shouldHaveAnotherLoop) {
273
+ const orderedQueuedDecodes = getOrderedQueuedDecodes(prioritizationStrategy);
274
+ if (orderedQueuedDecodes == null)
275
+ return; // all queued items are rejected and dequeued when we get here
276
+ const { currentBatchItems, currentBatchSize } = fitQueuedDecodesToABatch(orderedQueuedDecodes, this._batchSize);
277
+ let preventDisposalHandle;
278
+ try {
279
+ preventDisposalHandle = this._backendContextDisposeGuard.createPreventDisposalHandle();
280
+ }
281
+ catch (err) {
282
+ this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
283
+ return;
284
+ }
285
+ try {
286
+ await decodeTokenBatchItems(currentBatchItems, currentBatchSize);
287
+ shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
288
+ }
289
+ finally {
290
+ preventDisposalHandle.dispose();
291
+ }
228
292
  }
229
293
  });
230
294
  }
295
+ /**
296
+ * Print the timings of token evaluation since that last print for this context.
297
+ * > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
298
+ * it won't print anything.
299
+ */
231
300
  async printTimings() {
301
+ this._ensureNotDisposed();
232
302
  this._ctx.printTimings();
233
303
  await new Promise((accept) => setTimeout(accept, 0)); // wait for the logs to finish printing
234
304
  }
235
305
  /** @internal */
236
- async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, generateLogitAtTheEnd = false, evaluationPriority = 5 }, onDone) {
306
+ async _decodeTokens({ sequenceId, firstTokenSequenceIndex, tokens, generateLogitAtTheEnd = false, evaluationPriority = 5, tokenMeter }, onDone) {
237
307
  return await new Promise((accept, reject) => {
238
308
  this._queuedDecodes.push({
239
309
  sequenceId,
@@ -241,6 +311,7 @@ export class LlamaContext {
241
311
  firstTokenSequenceIndex,
242
312
  generateLogitAtTheEnd,
243
313
  evaluationPriority,
314
+ tokenMeter,
244
315
  response: [accept, reject],
245
316
  onDone
246
317
  });
@@ -253,6 +324,8 @@ export class LlamaContext {
253
324
  if (this._disposed)
254
325
  return;
255
326
  void withLock(this, "context", async () => {
327
+ if (this._disposed)
328
+ return;
256
329
  this._ctx.disposeSequence(sequenceId);
257
330
  this._unusedSequenceIds.push(sequenceId);
258
331
  this._onReclaimUnusedSequenceId.dispatchEvent();
@@ -312,20 +385,65 @@ export class LlamaContext {
312
385
  if (this._disposed)
313
386
  throw new DisposedError();
314
387
  }
388
+ /** @internal */
389
+ static async _create(options, { _model }) {
390
+ const sequences = options.sequences ?? getDefaultContextSequences();
391
+ const contextSize = resolveContextContextSizeOption({
392
+ contextSize: options.contextSize,
393
+ batchSize: options.batchSize,
394
+ sequences: sequences,
395
+ modelFileInsights: _model.fileInsights,
396
+ modelGpuLayers: _model.gpuLayers,
397
+ modelTrainContextSize: _model.trainContextSize,
398
+ getVramState: () => _model._llama._vramOrchestrator.getMemoryState(),
399
+ llamaGpu: _model._llama.gpu,
400
+ ignoreMemorySafetyChecks: options.ignoreMemorySafetyChecks,
401
+ isEmbeddingContext: options._embeddings
402
+ });
403
+ const batchSize = options.batchSize ?? getDefaultContextBatchSize({ contextSize, sequences });
404
+ const vramRequiredEstimate = _model.fileInsights.estimateContextResourceRequirements({
405
+ contextSize,
406
+ sequences,
407
+ isEmbeddingContext: options._embeddings,
408
+ modelGpuLayers: _model.gpuLayers,
409
+ batchSize
410
+ }).gpuVram;
411
+ const context = new LlamaContext({ _model }, { ...options, contextSize, batchSize, sequences });
412
+ const { createSignal } = options;
413
+ const contextCreationMemoryReservation = options.ignoreMemorySafetyChecks
414
+ ? null
415
+ : _model._llama._vramOrchestrator.reserveMemory(vramRequiredEstimate);
416
+ try {
417
+ const contextLoaded = await context._ctx.init();
418
+ if (createSignal?.aborted) {
419
+ if (contextLoaded)
420
+ await context._ctx.dispose();
421
+ throw createSignal.reason;
422
+ }
423
+ else if (!contextLoaded)
424
+ throw new Error("Failed to create context");
425
+ return context;
426
+ }
427
+ finally {
428
+ contextCreationMemoryReservation?.dispose?.();
429
+ }
430
+ }
315
431
  }
316
432
  export class LlamaContextSequence {
317
433
  /** @internal */ _sequenceId;
318
434
  /** @internal */ _gcRegistry;
319
435
  /** @internal */ _context;
320
436
  /** @internal */ _contextShift;
437
+ /** @internal */ _tokenMeter;
321
438
  /** @internal */ _disposeAggregator = new DisposeAggregator();
322
439
  /** @internal */ _contextTokens = [];
323
440
  /** @internal */ _nextTokenIndex = 0;
324
441
  /** @internal */ _disposed = false;
325
442
  onDispose = new EventRelay();
326
- constructor({ sequenceId, context, contextShift }) {
443
+ constructor({ sequenceId, context, tokenMeter, contextShift }) {
327
444
  this._sequenceId = sequenceId;
328
445
  this._context = context;
446
+ this._tokenMeter = tokenMeter ?? new TokenMeter();
329
447
  this._contextShift = contextShift;
330
448
  this._gcRegistry = new FinalizationRegistry(this._context._reclaimUnusedSequenceId);
331
449
  this._gcRegistry.register(this, sequenceId);
@@ -362,6 +480,9 @@ export class LlamaContextSequence {
362
480
  get contextTokens() {
363
481
  return this._contextTokens.slice();
364
482
  }
483
+ get tokenMeter() {
484
+ return this._tokenMeter;
485
+ }
365
486
  get isLoadedToMemory() {
366
487
  return !this._disposed;
367
488
  }
@@ -387,7 +508,7 @@ export class LlamaContextSequence {
387
508
  }
388
509
  /**
389
510
  * Erase context tokens in the provided ranges to free up space for new tokens to be generated.
390
- * the start and end of each range are exclusive.
511
+ * The start of each range is inclusive, and the end of each range is exclusive.
391
512
  * For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
392
513
  */
393
514
  async eraseContextTokenRanges(ranges) {
@@ -396,6 +517,8 @@ export class LlamaContextSequence {
396
517
  this._ensureNotDisposed();
397
518
  if (ranges.length === 0)
398
519
  return;
520
+ // if the deletion fails, we'll have to dispose the sequence and fill it up again
521
+ let deletionSuccessful = true;
399
522
  const resolvedRanges = ranges
400
523
  .map(({ start, end }) => {
401
524
  if (start === end)
@@ -425,15 +548,22 @@ export class LlamaContextSequence {
425
548
  let lastDeleteRangeEndPos = null;
426
549
  for (const range of resolvedRanges) {
427
550
  this._contextTokens.splice(range.start - removedTokens, range.end - range.start);
428
- this._context._ctx.removeTokenCellsFromSequence(this._sequenceId, range.start, range.end);
429
- if (lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== range.start)
551
+ if (deletionSuccessful)
552
+ deletionSuccessful &&= this._context._ctx.removeTokenCellsFromSequence(this._sequenceId, range.start, range.end);
553
+ if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== range.start)
430
554
  this._context._ctx.shiftSequenceTokenCells(this._sequenceId, lastDeleteRangeEndPos, range.start, -removedTokens);
431
555
  removedTokens += range.end - range.start;
432
556
  lastDeleteRangeEndPos = range.end;
433
557
  }
434
- if (lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== this._nextTokenIndex)
558
+ if (deletionSuccessful && lastDeleteRangeEndPos != null && removedTokens > 0 && lastDeleteRangeEndPos !== this._nextTokenIndex)
435
559
  this._context._ctx.shiftSequenceTokenCells(this._sequenceId, lastDeleteRangeEndPos, this._nextTokenIndex, -removedTokens);
436
560
  this._nextTokenIndex -= removedTokens;
561
+ if (deletionSuccessful)
562
+ return;
563
+ const newSequenceTokens = this._contextTokens.slice();
564
+ this._nextTokenIndex = 0;
565
+ this._context._ctx.disposeSequence(this._sequenceId);
566
+ await this.evaluateWithoutGeneratingNewTokens(newSequenceTokens);
437
567
  });
438
568
  }
439
569
  /**
@@ -485,7 +615,7 @@ export class LlamaContextSequence {
485
615
  while (true) {
486
616
  this._ensureNotDisposed();
487
617
  // Evaluate to get the next token.
488
- const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, contextShiftOptions, (batchLogitIndex) => {
618
+ const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
489
619
  const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
490
620
  ? repeatPenalty.punishTokens()
491
621
  : repeatPenalty?.punishTokens;
@@ -519,7 +649,7 @@ export class LlamaContextSequence {
519
649
  }
520
650
  }
521
651
  /** @internal */
522
- async _decodeTokens(tokens, generateLogit, evaluationPriority, contextShiftOptions, onDecodeDone) {
652
+ async _decodeTokens(tokens, generateLogit, evaluationPriority, tokenMeter, contextShiftOptions, onDecodeDone) {
523
653
  this._ensureNotDisposed();
524
654
  const tokensLeftToDecode = tokens.slice();
525
655
  return await withLock(this, "evaluate", async () => {
@@ -539,7 +669,8 @@ export class LlamaContextSequence {
539
669
  tokens: tokensToDecode,
540
670
  firstTokenSequenceIndex: this._nextTokenIndex,
541
671
  generateLogitAtTheEnd,
542
- evaluationPriority
672
+ evaluationPriority,
673
+ tokenMeter
543
674
  }, !generateLogitAtTheEnd
544
675
  ? undefined
545
676
  : onDecodeDone);
@@ -586,10 +717,11 @@ export class LlamaContextSequence {
586
717
  * We need this to make it impossible to manually create instances of this class outside the code of this library
587
718
  * @internal
588
719
  */
589
- static _create({ sequenceId, context, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} }) {
720
+ static _create({ sequenceId, context, tokenMeter, contextShift: { size: contextShiftSize = Math.min(100, Math.ceil(context.contextSize / 2)), strategy: contextShiftStrategy = "eraseBeginning" } = {} }) {
590
721
  return new LlamaContextSequence({
591
722
  sequenceId,
592
723
  context,
724
+ tokenMeter,
593
725
  contextShift: {
594
726
  size: contextShiftSize,
595
727
  strategy: contextShiftStrategy
@@ -600,11 +732,70 @@ export class LlamaContextSequence {
600
732
  function disposeContextIfReferenced(contextRef) {
601
733
  const context = contextRef.deref();
602
734
  if (context != null)
603
- context.dispose();
735
+ void context.dispose();
604
736
  }
605
737
  function disposeContextSequenceIfReferenced(contextRef) {
606
738
  const context = contextRef.deref();
607
739
  if (context != null)
608
740
  context.dispose();
609
741
  }
742
+ export function resolveContextContextSizeOption({ contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, getVramState, llamaGpu, ignoreMemorySafetyChecks = false, isEmbeddingContext = false }) {
743
+ if (contextSize == null)
744
+ contextSize = "auto";
745
+ if (typeof contextSize === "number") {
746
+ const resolvedContextSize = Math.max(1, Math.floor(contextSize));
747
+ if (ignoreMemorySafetyChecks)
748
+ return resolvedContextSize;
749
+ const vramState = getVramState();
750
+ const contextVram = modelFileInsights.estimateContextResourceRequirements({
751
+ contextSize: resolvedContextSize,
752
+ batchSize: batchSize ?? getDefaultContextBatchSize({ contextSize: resolvedContextSize, sequences }),
753
+ modelGpuLayers: modelGpuLayers,
754
+ sequences,
755
+ isEmbeddingContext
756
+ }).gpuVram;
757
+ if (contextVram > vramState.free)
758
+ throw new Error(`The context size of ${resolvedContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM`);
759
+ return resolvedContextSize;
760
+ }
761
+ else if (contextSize === "auto" || typeof contextSize === "object") {
762
+ if (llamaGpu === false)
763
+ return modelTrainContextSize;
764
+ const vramState = getVramState();
765
+ if (vramState.total === 0)
766
+ return modelTrainContextSize;
767
+ const freeVram = vramState.free;
768
+ const maxContextSize = contextSize === "auto"
769
+ ? getDefaultModelContextSize({ trainContextSize: modelTrainContextSize })
770
+ : Math.min(contextSize.max ?? getDefaultModelContextSize({ trainContextSize: modelTrainContextSize }), getDefaultModelContextSize({ trainContextSize: modelTrainContextSize }));
771
+ const minContextSize = contextSize === "auto"
772
+ ? minAllowedContextSizeInCalculations
773
+ : Math.max(contextSize.min ?? minAllowedContextSizeInCalculations, minAllowedContextSizeInCalculations);
774
+ for (let testContextSize = maxContextSize; testContextSize >= minContextSize; testContextSize--) {
775
+ const contextVram = modelFileInsights.estimateContextResourceRequirements({
776
+ contextSize: testContextSize,
777
+ batchSize: batchSize ?? getDefaultContextBatchSize({ contextSize: testContextSize, sequences }),
778
+ modelGpuLayers: modelGpuLayers,
779
+ sequences,
780
+ isEmbeddingContext
781
+ }).gpuVram;
782
+ if (contextVram <= freeVram)
783
+ return testContextSize;
784
+ }
785
+ if (ignoreMemorySafetyChecks)
786
+ return minContextSize;
787
+ throw new Error(`The available VRAM is too small to fit the context size of ${maxContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""}`);
788
+ }
789
+ throw new Error(`Invalid context size: "${contextSize}"`);
790
+ }
791
+ export function getDefaultContextBatchSize({ contextSize, sequences }) {
792
+ return Math.min(contextSize * sequences, 512);
793
+ }
794
+ export function getDefaultContextSequences() {
795
+ return 1;
796
+ }
797
+ const defaultFallbackContextSize = 4096;
798
+ export function getDefaultModelContextSize({ trainContextSize }) {
799
+ return trainContextSize ?? defaultFallbackContextSize;
800
+ }
610
801
  //# sourceMappingURL=LlamaContext.js.map