node-llama-cpp 3.0.0-beta.44 → 3.0.0-beta.46

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. package/README.md +33 -21
  2. package/bins/_linux-arm64.moved.txt +1 -0
  3. package/bins/_linux-armv7l.moved.txt +1 -0
  4. package/bins/_linux-x64-vulkan.moved.txt +1 -0
  5. package/bins/_linux-x64.moved.txt +1 -0
  6. package/bins/_mac-arm64-metal.moved.txt +1 -0
  7. package/bins/_mac-x64.moved.txt +1 -0
  8. package/bins/_win-arm64.moved.txt +1 -0
  9. package/bins/_win-x64-vulkan.moved.txt +1 -0
  10. package/bins/_win-x64.moved.txt +1 -0
  11. package/dist/ChatWrapper.d.ts +11 -1
  12. package/dist/ChatWrapper.js +1 -1
  13. package/dist/ChatWrapper.js.map +1 -1
  14. package/dist/bindings/AddonTypes.d.ts +30 -19
  15. package/dist/bindings/Llama.d.ts +11 -0
  16. package/dist/bindings/Llama.js +37 -6
  17. package/dist/bindings/Llama.js.map +1 -1
  18. package/dist/bindings/consts.d.ts +1 -1
  19. package/dist/bindings/consts.js +2 -0
  20. package/dist/bindings/consts.js.map +1 -1
  21. package/dist/bindings/getLlama.d.ts +41 -5
  22. package/dist/bindings/getLlama.js +14 -3
  23. package/dist/bindings/getLlama.js.map +1 -1
  24. package/dist/bindings/types.d.ts +2 -2
  25. package/dist/bindings/types.js +2 -0
  26. package/dist/bindings/types.js.map +1 -1
  27. package/dist/bindings/utils/cloneLlamaCppRepo.js.map +1 -1
  28. package/dist/bindings/utils/compileLLamaCpp.d.ts +0 -1
  29. package/dist/bindings/utils/compileLLamaCpp.js +45 -7
  30. package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
  31. package/dist/bindings/utils/getBestComputeLayersAvailable.d.ts +0 -1
  32. package/dist/bindings/utils/getBuildFolderNameForBuildOptions.js +2 -2
  33. package/dist/bindings/utils/getBuildFolderNameForBuildOptions.js.map +1 -1
  34. package/dist/bindings/utils/getGpuTypesToUseForOption.d.ts +0 -1
  35. package/dist/bindings/utils/testCmakeBinary.d.ts +0 -1
  36. package/dist/chatWrappers/AlpacaChatWrapper.js +4 -3
  37. package/dist/chatWrappers/AlpacaChatWrapper.js.map +1 -1
  38. package/dist/chatWrappers/ChatMLChatWrapper.js +1 -1
  39. package/dist/chatWrappers/ChatMLChatWrapper.js.map +1 -1
  40. package/dist/chatWrappers/FalconChatWrapper.js +5 -4
  41. package/dist/chatWrappers/FalconChatWrapper.js.map +1 -1
  42. package/dist/chatWrappers/FunctionaryChatWrapper.d.ts +2 -2
  43. package/dist/chatWrappers/FunctionaryChatWrapper.js +200 -12
  44. package/dist/chatWrappers/FunctionaryChatWrapper.js.map +1 -1
  45. package/dist/chatWrappers/GemmaChatWrapper.js +1 -1
  46. package/dist/chatWrappers/GemmaChatWrapper.js.map +1 -1
  47. package/dist/chatWrappers/GeneralChatWrapper.js +5 -4
  48. package/dist/chatWrappers/GeneralChatWrapper.js.map +1 -1
  49. package/dist/chatWrappers/Llama2ChatWrapper.js +5 -6
  50. package/dist/chatWrappers/Llama2ChatWrapper.js.map +1 -1
  51. package/dist/chatWrappers/Llama3ChatWrapper.js +1 -1
  52. package/dist/chatWrappers/Llama3ChatWrapper.js.map +1 -1
  53. package/dist/chatWrappers/Llama3_1ChatWrapper.d.ts +13 -9
  54. package/dist/chatWrappers/Llama3_1ChatWrapper.js +92 -38
  55. package/dist/chatWrappers/Llama3_1ChatWrapper.js.map +1 -1
  56. package/dist/chatWrappers/MistralChatWrapper.d.ts +15 -0
  57. package/dist/chatWrappers/MistralChatWrapper.js +169 -0
  58. package/dist/chatWrappers/MistralChatWrapper.js.map +1 -0
  59. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.d.ts +25 -1
  60. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js +50 -12
  61. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js.map +1 -1
  62. package/dist/chatWrappers/generic/TemplateChatWrapper.d.ts +22 -16
  63. package/dist/chatWrappers/generic/TemplateChatWrapper.js +28 -24
  64. package/dist/chatWrappers/generic/TemplateChatWrapper.js.map +1 -1
  65. package/dist/chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.d.ts +1 -1
  66. package/dist/chatWrappers/utils/chunkChatItems.d.ts +10 -0
  67. package/dist/chatWrappers/utils/chunkChatItems.js +44 -0
  68. package/dist/chatWrappers/utils/chunkChatItems.js.map +1 -0
  69. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js +37 -26
  70. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js.map +1 -1
  71. package/dist/chatWrappers/utils/jsonDumps.d.ts +1 -1
  72. package/dist/chatWrappers/utils/jsonDumps.js +2 -2
  73. package/dist/chatWrappers/utils/jsonDumps.js.map +1 -1
  74. package/dist/chatWrappers/utils/resolveChatWrapper.d.ts +30 -6
  75. package/dist/chatWrappers/utils/resolveChatWrapper.js +71 -25
  76. package/dist/chatWrappers/utils/resolveChatWrapper.js.map +1 -1
  77. package/dist/cli/cli.js +2 -6
  78. package/dist/cli/cli.js.map +1 -1
  79. package/dist/cli/commands/ChatCommand.d.ts +2 -1
  80. package/dist/cli/commands/ChatCommand.js +83 -53
  81. package/dist/cli/commands/ChatCommand.js.map +1 -1
  82. package/dist/cli/commands/CompleteCommand.d.ts +2 -1
  83. package/dist/cli/commands/CompleteCommand.js +58 -30
  84. package/dist/cli/commands/CompleteCommand.js.map +1 -1
  85. package/dist/cli/commands/DebugCommand.js +1 -1
  86. package/dist/cli/commands/DebugCommand.js.map +1 -1
  87. package/dist/cli/commands/InfillCommand.d.ts +2 -1
  88. package/dist/cli/commands/InfillCommand.js +58 -30
  89. package/dist/cli/commands/InfillCommand.js.map +1 -1
  90. package/dist/cli/commands/InitCommand.js +1 -1
  91. package/dist/cli/commands/PullCommand.d.ts +2 -1
  92. package/dist/cli/commands/PullCommand.js +85 -44
  93. package/dist/cli/commands/PullCommand.js.map +1 -1
  94. package/dist/cli/commands/inspect/InspectCommand.js +5 -3
  95. package/dist/cli/commands/inspect/InspectCommand.js.map +1 -1
  96. package/dist/cli/commands/inspect/commands/InspectEstimateCommand.d.ts +12 -0
  97. package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js +225 -0
  98. package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js.map +1 -0
  99. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js +17 -4
  100. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map +1 -1
  101. package/dist/cli/commands/inspect/commands/InspectGpuCommand.js +31 -9
  102. package/dist/cli/commands/inspect/commands/InspectGpuCommand.js.map +1 -1
  103. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js +7 -4
  104. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map +1 -1
  105. package/dist/cli/commands/source/SourceCommand.d.ts +4 -0
  106. package/dist/cli/commands/source/SourceCommand.js +19 -0
  107. package/dist/cli/commands/source/SourceCommand.js.map +1 -0
  108. package/dist/cli/commands/{BuildCommand.d.ts → source/commands/BuildCommand.d.ts} +1 -2
  109. package/dist/cli/commands/{BuildCommand.js → source/commands/BuildCommand.js} +21 -19
  110. package/dist/cli/commands/source/commands/BuildCommand.js.map +1 -0
  111. package/dist/cli/commands/{ClearCommand.js → source/commands/ClearCommand.js} +6 -6
  112. package/dist/cli/commands/source/commands/ClearCommand.js.map +1 -0
  113. package/dist/cli/commands/{DownloadCommand.d.ts → source/commands/DownloadCommand.d.ts} +1 -2
  114. package/dist/cli/commands/{DownloadCommand.js → source/commands/DownloadCommand.js} +26 -22
  115. package/dist/cli/commands/source/commands/DownloadCommand.js.map +1 -0
  116. package/dist/cli/recommendedModels.js +192 -23
  117. package/dist/cli/recommendedModels.js.map +1 -1
  118. package/dist/cli/utils/ConsoleInteraction.d.ts +0 -1
  119. package/dist/cli/utils/ConsoleTable.js.map +1 -1
  120. package/dist/cli/utils/basicChooseFromListConsoleInteraction.js.map +1 -1
  121. package/dist/cli/utils/interactivelyAskForModel.js +6 -17
  122. package/dist/cli/utils/interactivelyAskForModel.js.map +1 -1
  123. package/dist/cli/utils/printCommonInfoLines.js +6 -3
  124. package/dist/cli/utils/printCommonInfoLines.js.map +1 -1
  125. package/dist/cli/utils/renderModelCompatibilityPercentageWithColors.d.ts +6 -0
  126. package/dist/cli/utils/renderModelCompatibilityPercentageWithColors.js +14 -0
  127. package/dist/cli/utils/renderModelCompatibilityPercentageWithColors.js.map +1 -0
  128. package/dist/cli/utils/resolveModelRecommendationFileOptions.d.ts +1 -1
  129. package/dist/cli/utils/withCliCommandDescriptionDocsUrl.js.map +1 -1
  130. package/dist/commands.d.ts +3 -3
  131. package/dist/commands.js +3 -3
  132. package/dist/commands.js.map +1 -1
  133. package/dist/config.d.ts +7 -3
  134. package/dist/config.js +10 -6
  135. package/dist/config.js.map +1 -1
  136. package/dist/evaluator/LlamaChat/LlamaChat.d.ts +17 -2
  137. package/dist/evaluator/LlamaChat/LlamaChat.js +24 -12
  138. package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
  139. package/dist/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js +3 -1
  140. package/dist/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js.map +1 -1
  141. package/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts +21 -13
  142. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js +15 -14
  143. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -1
  144. package/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.d.ts +1 -0
  145. package/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.js.map +1 -1
  146. package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.d.ts +7 -2
  147. package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.js +5 -0
  148. package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.js.map +1 -1
  149. package/dist/evaluator/LlamaCompletion.d.ts +18 -4
  150. package/dist/evaluator/LlamaCompletion.js +51 -22
  151. package/dist/evaluator/LlamaCompletion.js.map +1 -1
  152. package/dist/evaluator/LlamaContext/LlamaContext.d.ts +21 -0
  153. package/dist/evaluator/LlamaContext/LlamaContext.js +261 -133
  154. package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
  155. package/dist/evaluator/LlamaContext/LlamaSampler.d.ts +1 -0
  156. package/dist/evaluator/LlamaContext/LlamaSampler.js +31 -0
  157. package/dist/evaluator/LlamaContext/LlamaSampler.js.map +1 -0
  158. package/dist/evaluator/LlamaContext/types.d.ts +77 -9
  159. package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js.map +1 -1
  160. package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js +1 -1
  161. package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js.map +1 -1
  162. package/dist/evaluator/LlamaEmbedding.d.ts +21 -0
  163. package/dist/evaluator/LlamaEmbedding.js +53 -0
  164. package/dist/evaluator/LlamaEmbedding.js.map +1 -0
  165. package/dist/evaluator/LlamaEmbeddingContext.d.ts +1 -5
  166. package/dist/evaluator/LlamaEmbeddingContext.js +6 -8
  167. package/dist/evaluator/LlamaEmbeddingContext.js.map +1 -1
  168. package/dist/evaluator/LlamaGrammar.d.ts +9 -10
  169. package/dist/evaluator/LlamaGrammar.js +10 -5
  170. package/dist/evaluator/LlamaGrammar.js.map +1 -1
  171. package/dist/evaluator/LlamaGrammarEvaluationState.d.ts +7 -3
  172. package/dist/evaluator/LlamaGrammarEvaluationState.js +8 -4
  173. package/dist/evaluator/LlamaGrammarEvaluationState.js.map +1 -1
  174. package/dist/evaluator/LlamaJsonSchemaGrammar.d.ts +3 -0
  175. package/dist/evaluator/LlamaJsonSchemaGrammar.js +3 -0
  176. package/dist/evaluator/LlamaJsonSchemaGrammar.js.map +1 -1
  177. package/dist/evaluator/LlamaModel/LlamaModel.d.ts +28 -15
  178. package/dist/evaluator/LlamaModel/LlamaModel.js +66 -51
  179. package/dist/evaluator/LlamaModel/LlamaModel.js.map +1 -1
  180. package/dist/evaluator/LlamaModel/utils/TokenAttributes.d.ts +10 -10
  181. package/dist/evaluator/LlamaModel/utils/TokenAttributes.js +10 -10
  182. package/dist/evaluator/LlamaModel/utils/TokenAttributes.js.map +1 -1
  183. package/dist/evaluator/TokenBias.d.ts +20 -8
  184. package/dist/evaluator/TokenBias.js +44 -12
  185. package/dist/evaluator/TokenBias.js.map +1 -1
  186. package/dist/evaluator/TokenMeter.d.ts +3 -12
  187. package/dist/evaluator/TokenMeter.js +4 -16
  188. package/dist/evaluator/TokenMeter.js.map +1 -1
  189. package/dist/gguf/fileReaders/GgufFileReader.d.ts +0 -1
  190. package/dist/gguf/fileReaders/GgufFileReader.js.map +1 -1
  191. package/dist/gguf/fileReaders/GgufFsFileReader.d.ts +0 -2
  192. package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.d.ts +5 -3
  193. package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.js +26 -13
  194. package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.js.map +1 -1
  195. package/dist/gguf/insights/GgufInsightsConfigurationResolver.d.ts +57 -1
  196. package/dist/gguf/insights/GgufInsightsConfigurationResolver.js +86 -4
  197. package/dist/gguf/insights/GgufInsightsConfigurationResolver.js.map +1 -1
  198. package/dist/gguf/insights/utils/scoreLevels.js.map +1 -1
  199. package/dist/gguf/readGgufFileInfo.d.ts +18 -6
  200. package/dist/gguf/readGgufFileInfo.js +8 -3
  201. package/dist/gguf/readGgufFileInfo.js.map +1 -1
  202. package/dist/gguf/types/GgufMetadataTypes.d.ts +18 -2
  203. package/dist/gguf/types/GgufMetadataTypes.js +16 -1
  204. package/dist/gguf/types/GgufMetadataTypes.js.map +1 -1
  205. package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.js +2 -0
  206. package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.js.map +1 -1
  207. package/dist/gguf/utils/getGgufFileTypeName.d.ts +1 -1
  208. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.js +1 -1
  209. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.js.map +1 -1
  210. package/dist/index.d.ts +8 -4
  211. package/dist/index.js +5 -3
  212. package/dist/index.js.map +1 -1
  213. package/dist/tsconfig.tsbuildinfo +1 -0
  214. package/dist/types.d.ts +1 -0
  215. package/dist/types.js.map +1 -1
  216. package/dist/utils/LlamaText.d.ts +3 -0
  217. package/dist/utils/LlamaText.js +7 -4
  218. package/dist/utils/LlamaText.js.map +1 -1
  219. package/dist/utils/LruCache.d.ts +2 -2
  220. package/dist/utils/LruCache.js.map +1 -1
  221. package/dist/utils/OverridesObject.d.ts +7 -0
  222. package/dist/utils/OverridesObject.js +2 -0
  223. package/dist/utils/OverridesObject.js.map +1 -0
  224. package/dist/utils/StopGenerationDetector.js.map +1 -1
  225. package/dist/utils/ThreadsSplitter.d.ts +32 -0
  226. package/dist/utils/ThreadsSplitter.js +177 -0
  227. package/dist/utils/ThreadsSplitter.js.map +1 -0
  228. package/dist/utils/TokenStreamRegulator.js.map +1 -1
  229. package/dist/utils/appendUserMessageToChatHistory.d.ts +4 -0
  230. package/dist/utils/appendUserMessageToChatHistory.js +4 -0
  231. package/dist/utils/appendUserMessageToChatHistory.js.map +1 -1
  232. package/dist/utils/compareTokens.d.ts +1 -1
  233. package/dist/utils/compareTokens.js.map +1 -1
  234. package/dist/utils/createModelDownloader.d.ts +94 -6
  235. package/dist/utils/createModelDownloader.js +174 -46
  236. package/dist/utils/createModelDownloader.js.map +1 -1
  237. package/dist/utils/gbnfJson/terminals/GbnfOr.js.map +1 -1
  238. package/dist/utils/gbnfJson/utils/validateObjectAgainstGbnfSchema.js +1 -1
  239. package/dist/utils/gbnfJson/utils/validateObjectAgainstGbnfSchema.js.map +1 -1
  240. package/dist/utils/getGrammarsFolder.js +1 -1
  241. package/dist/utils/getGrammarsFolder.js.map +1 -1
  242. package/dist/utils/gitReleaseBundles.js.map +1 -1
  243. package/dist/utils/modelFileAccesTokens.d.ts +4 -0
  244. package/dist/utils/modelFileAccesTokens.js +40 -0
  245. package/dist/utils/modelFileAccesTokens.js.map +1 -0
  246. package/dist/utils/parseModelFileName.js.map +1 -1
  247. package/dist/utils/parseTextTemplate.js.map +1 -1
  248. package/dist/utils/resolveGithubRelease.d.ts +1 -1
  249. package/dist/utils/resolveLastTokens.js.map +1 -1
  250. package/dist/utils/spawnCommand.d.ts +0 -1
  251. package/dist/utils/truncateTextAndRoundToWords.js +3 -1
  252. package/dist/utils/truncateTextAndRoundToWords.js.map +1 -1
  253. package/dist/utils/withOra.js +1 -1
  254. package/dist/utils/withOra.js.map +1 -1
  255. package/dist/utils/withProgressLog.d.ts +0 -1
  256. package/dist/utils/wrapAbortSignal.d.ts +0 -1
  257. package/llama/CMakeLists.txt +20 -12
  258. package/llama/addon/AddonContext.cpp +69 -202
  259. package/llama/addon/AddonContext.h +4 -5
  260. package/llama/addon/AddonGrammar.cpp +8 -11
  261. package/llama/addon/AddonGrammar.h +4 -3
  262. package/llama/addon/AddonGrammarEvaluationState.cpp +9 -10
  263. package/llama/addon/AddonGrammarEvaluationState.h +3 -1
  264. package/llama/addon/AddonModel.cpp +6 -5
  265. package/llama/addon/AddonSampler.cpp +513 -0
  266. package/llama/addon/AddonSampler.h +65 -0
  267. package/llama/addon/RingBuffer.h +109 -0
  268. package/llama/addon/addon.cpp +7 -0
  269. package/llama/addon/globals/addonLog.cpp +2 -1
  270. package/llama/binariesGithubRelease.json +1 -1
  271. package/llama/gitRelease.bundle +0 -0
  272. package/llama/grammars/README.md +1 -1
  273. package/llama/llama.cpp.info.json +1 -1
  274. package/package.json +71 -46
  275. package/templates/packed/electron-typescript-react.json +1 -1
  276. package/templates/packed/node-typescript.json +1 -1
  277. package/bins/linux-arm64/_nlcBuildMetadata.json +0 -1
  278. package/bins/linux-arm64/libggml.so +0 -0
  279. package/bins/linux-arm64/libllama.so +0 -0
  280. package/bins/linux-arm64/llama-addon.node +0 -0
  281. package/bins/linux-armv7l/_nlcBuildMetadata.json +0 -1
  282. package/bins/linux-armv7l/libggml.so +0 -0
  283. package/bins/linux-armv7l/libllama.so +0 -0
  284. package/bins/linux-armv7l/llama-addon.node +0 -0
  285. package/bins/linux-x64/_nlcBuildMetadata.json +0 -1
  286. package/bins/linux-x64/libggml.so +0 -0
  287. package/bins/linux-x64/libllama.so +0 -0
  288. package/bins/linux-x64/llama-addon.node +0 -0
  289. package/bins/linux-x64-vulkan/_nlcBuildMetadata.json +0 -1
  290. package/bins/linux-x64-vulkan/libggml.so +0 -0
  291. package/bins/linux-x64-vulkan/libllama.so +0 -0
  292. package/bins/linux-x64-vulkan/llama-addon.node +0 -0
  293. package/bins/linux-x64-vulkan/vulkan-shaders-gen +0 -0
  294. package/bins/mac-arm64-metal/_nlcBuildMetadata.json +0 -1
  295. package/bins/mac-arm64-metal/ggml-common.h +0 -1833
  296. package/bins/mac-arm64-metal/ggml-metal.metal +0 -6168
  297. package/bins/mac-arm64-metal/libggml.dylib +0 -0
  298. package/bins/mac-arm64-metal/libllama.dylib +0 -0
  299. package/bins/mac-arm64-metal/llama-addon.node +0 -0
  300. package/bins/mac-x64/_nlcBuildMetadata.json +0 -1
  301. package/bins/mac-x64/libggml.dylib +0 -0
  302. package/bins/mac-x64/libllama.dylib +0 -0
  303. package/bins/mac-x64/llama-addon.node +0 -0
  304. package/bins/win-arm64/_nlcBuildMetadata.json +0 -1
  305. package/bins/win-arm64/ggml.dll +0 -0
  306. package/bins/win-arm64/llama-addon.exp +0 -0
  307. package/bins/win-arm64/llama-addon.lib +0 -0
  308. package/bins/win-arm64/llama-addon.node +0 -0
  309. package/bins/win-arm64/llama.dll +0 -0
  310. package/bins/win-x64/_nlcBuildMetadata.json +0 -1
  311. package/bins/win-x64/ggml.dll +0 -0
  312. package/bins/win-x64/llama-addon.exp +0 -0
  313. package/bins/win-x64/llama-addon.lib +0 -0
  314. package/bins/win-x64/llama-addon.node +0 -0
  315. package/bins/win-x64/llama.dll +0 -0
  316. package/bins/win-x64-vulkan/_nlcBuildMetadata.json +0 -1
  317. package/bins/win-x64-vulkan/ggml.dll +0 -0
  318. package/bins/win-x64-vulkan/llama-addon.exp +0 -0
  319. package/bins/win-x64-vulkan/llama-addon.lib +0 -0
  320. package/bins/win-x64-vulkan/llama-addon.node +0 -0
  321. package/bins/win-x64-vulkan/llama.dll +0 -0
  322. package/bins/win-x64-vulkan/vulkan-shaders-gen.exe +0 -0
  323. package/dist/cli/commands/BuildCommand.js.map +0 -1
  324. package/dist/cli/commands/ClearCommand.js.map +0 -1
  325. package/dist/cli/commands/DownloadCommand.js.map +0 -1
  326. package/dist/utils/DeepPartialObject.d.ts +0 -3
  327. package/dist/utils/DeepPartialObject.js +0 -2
  328. package/dist/utils/DeepPartialObject.js.map +0 -1
  329. /package/dist/cli/commands/{ClearCommand.d.ts → source/commands/ClearCommand.d.ts} +0 -0
@@ -3,8 +3,16 @@ import { removeNullFields } from "../../utils/removeNullFields.js";
3
3
  import { compareTokens } from "../../utils/compareTokens.js";
4
4
  import { DisposeGuard } from "../../utils/DisposeGuard.js";
5
5
  import { TokenMeter } from "../TokenMeter.js";
6
+ import { UnsupportedError } from "../../utils/UnsupportedError.js";
6
7
  import { resolveBatchItemsPrioritizationStrategy } from "./utils/resolveBatchItemsPrioritizationStrategy.js";
8
+ import { LlamaSampler } from "./LlamaSampler.js";
7
9
  const defaultLoraScale = 1;
10
+ const shrinkRetriesMinContextSize = 4096;
11
+ const defaultMaxPunishTokens = 64;
12
+ const defaultFailedCreationRemedy = {
13
+ retries: 6,
14
+ autoContextSizeShrink: 0.16
15
+ };
8
16
  export class LlamaContext {
9
17
  /** @internal */ _llama;
10
18
  /** @internal */ _ctx;
@@ -14,6 +22,9 @@ export class LlamaContext {
14
22
  /** @internal */ _contextSize;
15
23
  /** @internal */ _batchSize;
16
24
  /** @internal */ _flashAttention;
25
+ /** @internal */ _idealThreads;
26
+ /** @internal */ _minThreads;
27
+ /** @internal */ _performanceTracking;
17
28
  /** @internal */ _totalSequences;
18
29
  /** @internal */ _unusedSequenceIds = [];
19
30
  /** @internal */ _batchingOptions;
@@ -26,11 +37,13 @@ export class LlamaContext {
26
37
  /** @internal */ _nextGeneratedSequenceId = 0;
27
38
  /** @internal */ _dispatchDecodeScheduled = false;
28
39
  /** @internal */ _batchDispatchPending = false;
40
+ /** @internal */ _threadSplitterConsumer;
41
+ /** @internal */ _freeReservedThreadsTimeout;
29
42
  /** @internal */ _currentDispatchBatchHandle = {};
30
43
  /** @internal */ _allocatedContextSize;
31
44
  /** @internal */ _disposed = false;
32
45
  onDispose = new EventRelay();
33
- constructor({ _model }, { sequences, seed = null, contextSize, batchSize, flashAttention = _model.defaultContextFlashAttention, threads = 6, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, _embeddings, _noSeed }) {
46
+ constructor({ _model }, { sequences, contextSize, batchSize, flashAttention = _model.defaultContextFlashAttention, threads, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, performanceTracking = false, _embeddings }) {
34
47
  if (_model.disposed)
35
48
  throw new DisposedError();
36
49
  this._llama = _model._llama;
@@ -41,15 +54,23 @@ export class LlamaContext {
41
54
  this._contextSize = Math.max(2, contextSize);
42
55
  this._batchSize = Math.max(batchSize, this._totalSequences);
43
56
  this._flashAttention = flashAttention;
57
+ this._idealThreads = typeof threads === "number"
58
+ ? this._llama._threadsSplitter.normalizeThreadsValue(threads)
59
+ : this._llama._threadsSplitter.normalizeThreadsValue(threads?.ideal ?? (this._llama.maxThreads === 0
60
+ ? this._llama.cpuMathCores
61
+ : this._llama.maxThreads));
62
+ this._minThreads = Math.max(1, typeof threads === "number"
63
+ ? 1
64
+ : this._llama._threadsSplitter.normalizeThreadsValue(threads?.min ?? 1));
65
+ this._performanceTracking = !!performanceTracking;
44
66
  this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({
45
- seed: seed != null ? Math.max(-1, Math.floor(seed)) : undefined,
46
67
  contextSize: this._contextSize * this._totalSequences, // each sequence needs its own <contextSize> of cells
47
68
  batchSize: this._batchSize,
48
69
  sequences: this._totalSequences,
49
70
  flashAttention: this._flashAttention,
50
- threads: Math.max(0, Math.floor(threads)),
71
+ threads: this._idealThreads,
51
72
  embeddings: _embeddings,
52
- noSeed: _noSeed
73
+ performanceTracking: this._performanceTracking
53
74
  }));
54
75
  this._batchingOptions = {
55
76
  dispatchSchedule: batchingDispatchSchedule,
@@ -58,6 +79,7 @@ export class LlamaContext {
58
79
  this._gcRegistry = new FinalizationRegistry(this._model._removeLoraUsage);
59
80
  this._gcRegistry.register(this, this._loraAdapters);
60
81
  this._reclaimUnusedSequenceId = this._reclaimUnusedSequenceId.bind(this);
82
+ this._freeReservedThreads = this._freeReservedThreads.bind(this);
61
83
  this._disposeAggregator.add(() => {
62
84
  this._disposed = true;
63
85
  });
@@ -111,6 +133,19 @@ export class LlamaContext {
111
133
  this._ensureNotDisposed();
112
134
  return this._ctx.getStateSize();
113
135
  }
136
+ /** The number of threads currently used to evaluate tokens */
137
+ get currentThreads() {
138
+ this._ensureNotDisposed();
139
+ return this._ctx.getThreads();
140
+ }
141
+ /**
142
+ * The number of threads that are preferred to be used to evaluate tokens.
143
+ *
144
+ * The actual number of threads used may be lower when other evaluations are running in parallel.
145
+ */
146
+ get idealThreads() {
147
+ return this._idealThreads;
148
+ }
114
149
  getAllocatedContextSize() {
115
150
  this._ensureNotDisposed();
116
151
  if (this._allocatedContextSize == null)
@@ -263,13 +298,22 @@ export class LlamaContext {
263
298
  i--;
264
299
  }
265
300
  }
266
- try {
267
- if (currentBatchSize !== 0)
301
+ if (currentBatchSize !== 0) {
302
+ const allocationResult = this._threadSplitterConsumer?.getAllocationToConsume();
303
+ const [threadsToUse, consumerHandle] = allocationResult instanceof Promise
304
+ ? await allocationResult ?? []
305
+ : allocationResult ?? [];
306
+ try {
307
+ if (threadsToUse != null)
308
+ this._ctx.setThreads(threadsToUse);
268
309
  await this._ctx.decodeBatch();
269
- }
270
- catch (err) {
271
- this._dispatchErrorForQueuedDecodesAndDequeue(currentQueuedDecodeItems, err);
272
- return;
310
+ consumerHandle?.dispose();
311
+ }
312
+ catch (err) {
313
+ consumerHandle?.dispose();
314
+ this._dispatchErrorForQueuedDecodesAndDequeue(currentQueuedDecodeItems, err);
315
+ return;
316
+ }
273
317
  }
274
318
  for (const action of afterDecodeActions) {
275
319
  const [accept, reject] = action.response;
@@ -287,36 +331,47 @@ export class LlamaContext {
287
331
  const prioritizationStrategy = resolvePrioritizationStrategy();
288
332
  if (prioritizationStrategy == null)
289
333
  return; // all queued items are rejected and dequeued when we get here
290
- while (shouldHaveAnotherLoop) {
291
- const orderedQueuedDecodes = getOrderedQueuedDecodes(prioritizationStrategy);
292
- if (orderedQueuedDecodes == null)
293
- return; // all queued items are rejected and dequeued when we get here
294
- const { currentBatchItems, currentBatchSize } = fitQueuedDecodesToABatch(orderedQueuedDecodes, this._batchSize);
295
- let preventDisposalHandle;
296
- try {
297
- preventDisposalHandle = this._backendContextDisposeGuard.createPreventDisposalHandle();
298
- }
299
- catch (err) {
300
- this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
301
- return;
302
- }
303
- try {
304
- await decodeTokenBatchItems(currentBatchItems, currentBatchSize);
305
- shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
306
- }
307
- finally {
308
- preventDisposalHandle.dispose();
334
+ this._reserveThreads();
335
+ try {
336
+ while (shouldHaveAnotherLoop) {
337
+ const orderedQueuedDecodes = getOrderedQueuedDecodes(prioritizationStrategy);
338
+ if (orderedQueuedDecodes == null)
339
+ return; // all queued items are rejected and dequeued when we get here
340
+ const { currentBatchItems, currentBatchSize } = fitQueuedDecodesToABatch(orderedQueuedDecodes, this._batchSize);
341
+ let preventDisposalHandle;
342
+ try {
343
+ preventDisposalHandle = this._backendContextDisposeGuard.createPreventDisposalHandle();
344
+ }
345
+ catch (err) {
346
+ this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
347
+ return;
348
+ }
349
+ try {
350
+ await decodeTokenBatchItems(currentBatchItems, currentBatchSize);
351
+ shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
352
+ }
353
+ finally {
354
+ preventDisposalHandle.dispose();
355
+ }
309
356
  }
310
357
  }
358
+ finally {
359
+ this._scheduleToFreeReservedThreads();
360
+ }
311
361
  });
312
362
  }
313
363
  /**
314
364
  * Print the timings of token evaluation since the last print for this context.
365
+ *
366
+ * Requires the `performanceTracking` option to be enabled.
367
+ *
315
368
  * > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
316
369
  * it won't print anything.
317
370
  */
318
371
  async printTimings() {
319
372
  this._ensureNotDisposed();
373
+ if (!this._performanceTracking)
374
+ throw new UnsupportedError("Performance tracking is not enabled");
320
375
  this._ctx.printTimings();
321
376
  await new Promise((accept) => setTimeout(accept, 0)); // wait for the logs to finish printing
322
377
  }
@@ -350,14 +405,6 @@ export class LlamaContext {
350
405
  });
351
406
  }
352
407
  /** @internal */
353
- _acceptTokenOnGrammarEvaluationState(grammarEvaluationState, token) {
354
- this._ctx.acceptGrammarEvaluationStateToken(grammarEvaluationState._state, token);
355
- }
356
- /** @internal */
357
- _canBeNextTokenForGrammarEvaluationState(grammarEvaluationState, token) {
358
- return this._ctx.canBeNextTokenForGrammarEvaluationState(grammarEvaluationState._state, token);
359
- }
360
- /** @internal */
361
408
  _popSequenceId() {
362
409
  if (this._unusedSequenceIds.length > 0)
363
410
  return this._unusedSequenceIds.shift();
@@ -417,6 +464,30 @@ export class LlamaContext {
417
464
  }
418
465
  }
419
466
  /** @internal */
467
+ _reserveThreads() {
468
+ clearTimeout(this._freeReservedThreadsTimeout);
469
+ delete this._freeReservedThreadsTimeout;
470
+ if (this._threadSplitterConsumer != null)
471
+ return;
472
+ this._threadSplitterConsumer = this._llama._threadsSplitter.createConsumer(this._idealThreads, this._minThreads);
473
+ }
474
+ /** @internal */
475
+ _freeReservedThreads() {
476
+ clearTimeout(this._freeReservedThreadsTimeout);
477
+ delete this._freeReservedThreadsTimeout;
478
+ if (this._threadSplitterConsumer == null)
479
+ return;
480
+ this._threadSplitterConsumer.dispose();
481
+ delete this._threadSplitterConsumer;
482
+ }
483
+ /** @internal */
484
+ _scheduleToFreeReservedThreads() {
485
+ if (this._threadSplitterConsumer == null)
486
+ return;
487
+ clearTimeout(this._freeReservedThreadsTimeout);
488
+ this._freeReservedThreadsTimeout = setTimeout(this._freeReservedThreads, 0);
489
+ }
490
+ /** @internal */
420
491
  static async _create(options, { _model }) {
421
492
  const sequences = options.sequences ?? getDefaultContextSequences();
422
493
  const flashAttention = _model.flashAttentionSupported
@@ -425,7 +496,13 @@ export class LlamaContext {
425
496
  const loraOptions = typeof options.lora === "string"
426
497
  ? { adapters: [{ filePath: options.lora }] }
427
498
  : options.lora;
428
- const contextSize = await _model.fileInsights.configurationResolver.resolveContextContextSize(options.contextSize, {
499
+ let failedCreationRetries = options.failedCreationRemedy === false
500
+ ? 0
501
+ : Math.max(0, options.failedCreationRemedy?.retries ?? defaultFailedCreationRemedy.retries);
502
+ const failedCreationAutoContextSizeShrink = options.failedCreationRemedy === false
503
+ ? 0
504
+ : options.failedCreationRemedy?.autoContextSizeShrink ?? defaultFailedCreationRemedy.autoContextSizeShrink;
505
+ let contextSize = await _model.fileInsights.configurationResolver.resolveContextContextSize(options.contextSize, {
429
506
  batchSize: options.batchSize,
430
507
  sequences: sequences,
431
508
  modelGpuLayers: _model.gpuLayers,
@@ -436,69 +513,101 @@ export class LlamaContext {
436
513
  ignoreMemorySafetyChecks: options.ignoreMemorySafetyChecks,
437
514
  isEmbeddingContext: options._embeddings
438
515
  });
439
- const batchSize = options.batchSize ?? getDefaultContextBatchSize({ contextSize, sequences });
440
- const vramRequiredEstimate = _model.fileInsights.estimateContextResourceRequirements({
441
- contextSize,
442
- sequences,
443
- isEmbeddingContext: options._embeddings,
444
- modelGpuLayers: _model.gpuLayers,
445
- batchSize,
446
- flashAttention
447
- }).gpuVram;
448
- const context = new LlamaContext({ _model }, { ...options, contextSize, batchSize, sequences, flashAttention });
516
+ const minContextSize = options.contextSize === "auto"
517
+ ? shrinkRetriesMinContextSize
518
+ : (typeof options.contextSize === "object" && typeof options.contextSize.min === "number")
519
+ ? options.contextSize.min
520
+ : typeof options.contextSize === "number"
521
+ ? options.contextSize
522
+ : shrinkRetriesMinContextSize;
449
523
  const { createSignal } = options;
450
- const contextCreationMemoryReservation = options.ignoreMemorySafetyChecks
451
- ? null
452
- : _model._llama._vramOrchestrator.reserveMemory(vramRequiredEstimate);
453
- try {
454
- const contextLoaded = await context._ctx.init();
455
- if (createSignal?.aborted) {
456
- if (contextLoaded)
457
- await context._ctx.dispose();
458
- throw createSignal.reason;
459
- }
460
- else if (!contextLoaded)
461
- throw new Error("Failed to create context");
462
- contextCreationMemoryReservation?.dispose?.();
463
- if (loraOptions != null && loraOptions.adapters.length > 0) {
464
- let loadedAdapters = 0;
465
- for (const adapter of loraOptions.adapters) {
466
- try {
467
- await context._setLora({
468
- filePath: adapter.filePath,
469
- scale: adapter.scale
470
- });
471
- loadedAdapters++;
524
+ async function createContext(contextSize) {
525
+ const batchSize = options.batchSize ?? getDefaultContextBatchSize({ contextSize, sequences });
526
+ const vramRequiredEstimate = _model.fileInsights.estimateContextResourceRequirements({
527
+ contextSize,
528
+ sequences,
529
+ isEmbeddingContext: options._embeddings,
530
+ modelGpuLayers: _model.gpuLayers,
531
+ batchSize,
532
+ flashAttention
533
+ }).gpuVram;
534
+ const context = new LlamaContext({ _model }, { ...options, contextSize, batchSize, sequences, flashAttention });
535
+ const contextCreationMemoryReservation = options.ignoreMemorySafetyChecks
536
+ ? null
537
+ : _model._llama._vramOrchestrator.reserveMemory(vramRequiredEstimate);
538
+ try {
539
+ if (createSignal?.aborted)
540
+ throw createSignal.reason;
541
+ const contextLoaded = await context._ctx.init();
542
+ if (createSignal?.aborted) {
543
+ if (contextLoaded)
544
+ await context._ctx.dispose();
545
+ throw createSignal.reason;
546
+ }
547
+ else if (!contextLoaded)
548
+ throw new Error("Failed to create context");
549
+ contextCreationMemoryReservation?.dispose?.();
550
+ if (loraOptions != null && loraOptions.adapters.length > 0) {
551
+ let loadedAdapters = 0;
552
+ for (const adapter of loraOptions.adapters) {
472
553
  try {
473
- loraOptions.onLoadProgress?.(loadedAdapters / loraOptions.adapters.length);
554
+ await context._setLora({
555
+ filePath: adapter.filePath,
556
+ scale: adapter.scale
557
+ });
558
+ loadedAdapters++;
559
+ try {
560
+ loraOptions.onLoadProgress?.(loadedAdapters / loraOptions.adapters.length);
561
+ }
562
+ catch (err) {
563
+ console.error(err);
564
+ }
474
565
  }
475
566
  catch (err) {
476
- console.error(err);
567
+ await context.dispose();
568
+ throw err;
569
+ }
570
+ if (createSignal?.aborted) {
571
+ await context.dispose();
572
+ throw createSignal.reason;
477
573
  }
478
574
  }
479
- catch (err) {
480
- await context.dispose();
481
- throw err;
575
+ }
576
+ else if (loraOptions?.onLoadProgress != null) {
577
+ try {
578
+ loraOptions.onLoadProgress(1);
482
579
  }
483
- if (createSignal?.aborted) {
484
- await context.dispose();
485
- throw createSignal.reason;
580
+ catch (err) {
581
+ console.error(err);
486
582
  }
487
583
  }
584
+ return context;
488
585
  }
489
- else if (loraOptions?.onLoadProgress != null) {
490
- try {
491
- loraOptions.onLoadProgress(1);
492
- }
493
- catch (err) {
494
- console.error(err);
495
- }
586
+ finally {
587
+ contextCreationMemoryReservation?.dispose?.();
496
588
  }
497
- return context;
498
589
  }
499
- finally {
500
- contextCreationMemoryReservation?.dispose?.();
590
+ while (failedCreationRetries >= 0) {
591
+ try {
592
+ return await createContext(contextSize);
593
+ }
594
+ catch (err) {
595
+ if (failedCreationRetries === 0 || (createSignal?.aborted && err === createSignal.reason))
596
+ throw err;
597
+ failedCreationRetries--;
598
+ let newContextSize = typeof failedCreationAutoContextSizeShrink === "number"
599
+ ? Math.floor(contextSize * (1 - failedCreationAutoContextSizeShrink))
600
+ : Math.floor(failedCreationAutoContextSizeShrink(contextSize));
601
+ if (!Number.isFinite(newContextSize))
602
+ throw err;
603
+ if (newContextSize < minContextSize)
604
+ newContextSize = minContextSize;
605
+ if (newContextSize >= contextSize)
606
+ throw err;
607
+ contextSize = newContextSize;
608
+ }
501
609
  }
610
+ throw new Error("Failed to create context");
502
611
  }
503
612
  }
504
613
  export class LlamaContextSequence {
@@ -639,12 +748,13 @@ export class LlamaContextSequence {
639
748
  });
640
749
  }
641
750
  evaluate(tokens, options = {}) {
642
- const { temperature = 0, minP = 0, topK = 40, topP = 0.95, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, yieldEogToken = false, _noSampling = false } = options;
751
+ const { temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, yieldEogToken = false, _noSampling = false } = options;
643
752
  return this._evaluate(tokens, {
644
753
  temperature,
645
754
  minP,
646
755
  topK,
647
756
  topP,
757
+ seed,
648
758
  grammarEvaluationState,
649
759
  repeatPenalty,
650
760
  tokenBias,
@@ -677,53 +787,71 @@ export class LlamaContextSequence {
677
787
  }
678
788
  }
679
789
  /** @internal */
680
- async *_evaluate(tokens, { temperature = 0, minP = 0, topK = 40, topP = 0.95, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, generateNewTokens = true, contextShiftOptions, yieldEogToken = false, _noSampling = false }) {
790
+ async *_evaluate(tokens, { temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, generateNewTokens = true, contextShiftOptions, yieldEogToken = false, _noSampling = false }) {
681
791
  this._ensureNotDisposed();
682
792
  let evalTokens = tokens;
683
793
  if (evalTokens.length === 0)
684
794
  return;
685
- while (true) {
686
- this._ensureNotDisposed();
687
- // Evaluate to get the next token.
688
- const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
689
- if (_noSampling)
690
- return null;
691
- const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
692
- ? repeatPenalty.punishTokens()
693
- : repeatPenalty?.punishTokens;
694
- const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
695
- ? grammarEvaluationState()
696
- : grammarEvaluationState;
697
- if (resolvedGrammarEvaluationState != null && resolvedGrammarEvaluationState._llama !== this.model._llama)
698
- throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
699
- const { tokenBiasKeys, tokenBiasValues } = getTokenBiasesForAddon(tokenBias, this.model);
700
- return this._context._ctx.sampleToken(batchLogitIndex, removeNullFields({
701
- temperature,
702
- minP,
703
- topK,
704
- topP,
705
- repeatPenalty: repeatPenalty?.penalty,
706
- repeatPenaltyTokens: repeatPenaltyTokens != null
707
- ? Uint32Array.from(repeatPenaltyTokens)
708
- : undefined,
709
- repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty,
710
- repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty,
711
- tokenBiasKeys,
712
- tokenBiasValues,
713
- grammarEvaluationState: resolvedGrammarEvaluationState?._state
714
- }));
715
- });
716
- if (nextToken == null)
717
- return;
718
- // the model finished generating text
719
- if (!yieldEogToken && this._context.model.isEogToken(nextToken))
720
- break;
721
- const replacementToken = (yield nextToken);
722
- // set the tokens for the next evaluation
723
- if (replacementToken != null)
724
- evalTokens = [replacementToken];
725
- else
726
- evalTokens = [nextToken];
795
+ const sampler = new LlamaSampler(this.model);
796
+ try {
797
+ while (true) {
798
+ this._ensureNotDisposed();
799
+ // Evaluate to get the next token.
800
+ const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
801
+ if (_noSampling)
802
+ return null;
803
+ const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
804
+ ? repeatPenalty.punishTokens()
805
+ : repeatPenalty?.punishTokens;
806
+ const maxPunishTokens = Math.max(repeatPenalty?.maxPunishTokens ?? defaultMaxPunishTokens, repeatPenaltyTokens?.length ?? 0);
807
+ const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
808
+ ? grammarEvaluationState()
809
+ : grammarEvaluationState;
810
+ if (resolvedGrammarEvaluationState != null && resolvedGrammarEvaluationState._llama !== this.model._llama)
811
+ throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
812
+ const { tokenBiasKeys, tokenBiasValues } = getTokenBiasesForAddon(tokenBias, this.model);
813
+ sampler.applyConfig(removeNullFields({
814
+ temperature,
815
+ minP,
816
+ topK,
817
+ topP,
818
+ seed: Math.max(0, Number.isFinite(seed)
819
+ ? Math.floor(seed ?? (Date.now() / 1000))
820
+ : Math.floor(Date.now() / 1000)),
821
+ repeatPenalty: repeatPenalty?.penalty,
822
+ repeatPenaltyMaxTokens: maxPunishTokens,
823
+ repeatPenaltyTokens: repeatPenaltyTokens != null
824
+ ? Uint32Array.from(repeatPenaltyTokens)
825
+ : undefined,
826
+ repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty,
827
+ repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty,
828
+ tokenBiasKeys,
829
+ tokenBiasValues,
830
+ grammarEvaluationState: resolvedGrammarEvaluationState?._state
831
+ }));
832
+ return withLock(sampler, "sample", async () => {
833
+ if (sampler.disposed)
834
+ return null;
835
+ return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
836
+ });
837
+ });
838
+ if (nextToken === -1)
839
+ throw new Error("Failed to sample next token");
840
+ if (nextToken == null)
841
+ return;
842
+ // the model finished generating text
843
+ if (!yieldEogToken && this._context.model.isEogToken(nextToken))
844
+ break;
845
+ const replacementToken = (yield nextToken);
846
+ // set the tokens for the next evaluation
847
+ if (replacementToken != null)
848
+ evalTokens = [replacementToken];
849
+ else
850
+ evalTokens = [nextToken];
851
+ }
852
+ }
853
+ finally {
854
+ void withLock(sampler, "sample", sampler.asyncDispose);
727
855
  }
728
856
  }
729
857
  /** @internal */
@@ -814,7 +942,7 @@ function getTokenBiasesForAddon(tokenBias, currentModel) {
814
942
  };
815
943
  if (tokenBias instanceof Function)
816
944
  tokenBias = tokenBias();
817
- if (tokenBias._model !== currentModel)
945
+ if (tokenBias._tokenizer !== currentModel.tokenizer)
818
946
  throw new Error("This TokenBias instance was created with a different model than the one used by this context. " +
819
947
  "Make sure you use the model instance of the context sequence for the TokenBias you use it with.");
820
948
  const tokenBiasKeys = [];