node-llama-cpp 3.0.0-beta.44 → 3.0.0-beta.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329) hide show
  1. package/README.md +33 -21
  2. package/bins/_linux-arm64.moved.txt +1 -0
  3. package/bins/_linux-armv7l.moved.txt +1 -0
  4. package/bins/_linux-x64-vulkan.moved.txt +1 -0
  5. package/bins/_linux-x64.moved.txt +1 -0
  6. package/bins/_mac-arm64-metal.moved.txt +1 -0
  7. package/bins/_mac-x64.moved.txt +1 -0
  8. package/bins/_win-arm64.moved.txt +1 -0
  9. package/bins/_win-x64-vulkan.moved.txt +1 -0
  10. package/bins/_win-x64.moved.txt +1 -0
  11. package/dist/ChatWrapper.d.ts +11 -1
  12. package/dist/ChatWrapper.js +1 -1
  13. package/dist/ChatWrapper.js.map +1 -1
  14. package/dist/bindings/AddonTypes.d.ts +30 -19
  15. package/dist/bindings/Llama.d.ts +9 -0
  16. package/dist/bindings/Llama.js +33 -6
  17. package/dist/bindings/Llama.js.map +1 -1
  18. package/dist/bindings/consts.d.ts +1 -1
  19. package/dist/bindings/consts.js +2 -0
  20. package/dist/bindings/consts.js.map +1 -1
  21. package/dist/bindings/getLlama.d.ts +33 -5
  22. package/dist/bindings/getLlama.js +14 -3
  23. package/dist/bindings/getLlama.js.map +1 -1
  24. package/dist/bindings/types.d.ts +2 -2
  25. package/dist/bindings/types.js +2 -0
  26. package/dist/bindings/types.js.map +1 -1
  27. package/dist/bindings/utils/cloneLlamaCppRepo.js.map +1 -1
  28. package/dist/bindings/utils/compileLLamaCpp.d.ts +0 -1
  29. package/dist/bindings/utils/compileLLamaCpp.js +45 -7
  30. package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
  31. package/dist/bindings/utils/getBestComputeLayersAvailable.d.ts +0 -1
  32. package/dist/bindings/utils/getBuildFolderNameForBuildOptions.js +2 -2
  33. package/dist/bindings/utils/getBuildFolderNameForBuildOptions.js.map +1 -1
  34. package/dist/bindings/utils/getGpuTypesToUseForOption.d.ts +0 -1
  35. package/dist/bindings/utils/testCmakeBinary.d.ts +0 -1
  36. package/dist/chatWrappers/AlpacaChatWrapper.js +4 -3
  37. package/dist/chatWrappers/AlpacaChatWrapper.js.map +1 -1
  38. package/dist/chatWrappers/ChatMLChatWrapper.js +1 -1
  39. package/dist/chatWrappers/ChatMLChatWrapper.js.map +1 -1
  40. package/dist/chatWrappers/FalconChatWrapper.js +5 -4
  41. package/dist/chatWrappers/FalconChatWrapper.js.map +1 -1
  42. package/dist/chatWrappers/FunctionaryChatWrapper.d.ts +2 -2
  43. package/dist/chatWrappers/FunctionaryChatWrapper.js +200 -12
  44. package/dist/chatWrappers/FunctionaryChatWrapper.js.map +1 -1
  45. package/dist/chatWrappers/GemmaChatWrapper.js +1 -1
  46. package/dist/chatWrappers/GemmaChatWrapper.js.map +1 -1
  47. package/dist/chatWrappers/GeneralChatWrapper.js +5 -4
  48. package/dist/chatWrappers/GeneralChatWrapper.js.map +1 -1
  49. package/dist/chatWrappers/Llama2ChatWrapper.js +5 -6
  50. package/dist/chatWrappers/Llama2ChatWrapper.js.map +1 -1
  51. package/dist/chatWrappers/Llama3ChatWrapper.js +1 -1
  52. package/dist/chatWrappers/Llama3ChatWrapper.js.map +1 -1
  53. package/dist/chatWrappers/Llama3_1ChatWrapper.d.ts +13 -9
  54. package/dist/chatWrappers/Llama3_1ChatWrapper.js +92 -38
  55. package/dist/chatWrappers/Llama3_1ChatWrapper.js.map +1 -1
  56. package/dist/chatWrappers/MistralChatWrapper.d.ts +15 -0
  57. package/dist/chatWrappers/MistralChatWrapper.js +169 -0
  58. package/dist/chatWrappers/MistralChatWrapper.js.map +1 -0
  59. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.d.ts +25 -1
  60. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js +50 -12
  61. package/dist/chatWrappers/generic/JinjaTemplateChatWrapper.js.map +1 -1
  62. package/dist/chatWrappers/generic/TemplateChatWrapper.d.ts +22 -16
  63. package/dist/chatWrappers/generic/TemplateChatWrapper.js +28 -24
  64. package/dist/chatWrappers/generic/TemplateChatWrapper.js.map +1 -1
  65. package/dist/chatWrappers/generic/utils/chatHistoryFunctionCallMessageTemplate.d.ts +1 -1
  66. package/dist/chatWrappers/utils/chunkChatItems.d.ts +10 -0
  67. package/dist/chatWrappers/utils/chunkChatItems.js +44 -0
  68. package/dist/chatWrappers/utils/chunkChatItems.js.map +1 -0
  69. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js +37 -26
  70. package/dist/chatWrappers/utils/isJinjaTemplateEquivalentToSpecializedChatWrapper.js.map +1 -1
  71. package/dist/chatWrappers/utils/jsonDumps.d.ts +1 -1
  72. package/dist/chatWrappers/utils/jsonDumps.js +2 -2
  73. package/dist/chatWrappers/utils/jsonDumps.js.map +1 -1
  74. package/dist/chatWrappers/utils/resolveChatWrapper.d.ts +30 -6
  75. package/dist/chatWrappers/utils/resolveChatWrapper.js +71 -25
  76. package/dist/chatWrappers/utils/resolveChatWrapper.js.map +1 -1
  77. package/dist/cli/cli.js +2 -6
  78. package/dist/cli/cli.js.map +1 -1
  79. package/dist/cli/commands/ChatCommand.d.ts +2 -1
  80. package/dist/cli/commands/ChatCommand.js +83 -53
  81. package/dist/cli/commands/ChatCommand.js.map +1 -1
  82. package/dist/cli/commands/CompleteCommand.d.ts +2 -1
  83. package/dist/cli/commands/CompleteCommand.js +58 -30
  84. package/dist/cli/commands/CompleteCommand.js.map +1 -1
  85. package/dist/cli/commands/DebugCommand.js +1 -1
  86. package/dist/cli/commands/DebugCommand.js.map +1 -1
  87. package/dist/cli/commands/InfillCommand.d.ts +2 -1
  88. package/dist/cli/commands/InfillCommand.js +58 -30
  89. package/dist/cli/commands/InfillCommand.js.map +1 -1
  90. package/dist/cli/commands/InitCommand.js +1 -1
  91. package/dist/cli/commands/PullCommand.d.ts +2 -1
  92. package/dist/cli/commands/PullCommand.js +85 -44
  93. package/dist/cli/commands/PullCommand.js.map +1 -1
  94. package/dist/cli/commands/inspect/InspectCommand.js +5 -3
  95. package/dist/cli/commands/inspect/InspectCommand.js.map +1 -1
  96. package/dist/cli/commands/inspect/commands/InspectEstimateCommand.d.ts +12 -0
  97. package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js +225 -0
  98. package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js.map +1 -0
  99. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js +17 -4
  100. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map +1 -1
  101. package/dist/cli/commands/inspect/commands/InspectGpuCommand.js +31 -9
  102. package/dist/cli/commands/inspect/commands/InspectGpuCommand.js.map +1 -1
  103. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js +7 -4
  104. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map +1 -1
  105. package/dist/cli/commands/source/SourceCommand.d.ts +4 -0
  106. package/dist/cli/commands/source/SourceCommand.js +19 -0
  107. package/dist/cli/commands/source/SourceCommand.js.map +1 -0
  108. package/dist/cli/commands/{BuildCommand.d.ts → source/commands/BuildCommand.d.ts} +1 -2
  109. package/dist/cli/commands/{BuildCommand.js → source/commands/BuildCommand.js} +21 -19
  110. package/dist/cli/commands/source/commands/BuildCommand.js.map +1 -0
  111. package/dist/cli/commands/{ClearCommand.js → source/commands/ClearCommand.js} +6 -6
  112. package/dist/cli/commands/source/commands/ClearCommand.js.map +1 -0
  113. package/dist/cli/commands/{DownloadCommand.d.ts → source/commands/DownloadCommand.d.ts} +1 -2
  114. package/dist/cli/commands/{DownloadCommand.js → source/commands/DownloadCommand.js} +26 -22
  115. package/dist/cli/commands/source/commands/DownloadCommand.js.map +1 -0
  116. package/dist/cli/recommendedModels.js +192 -23
  117. package/dist/cli/recommendedModels.js.map +1 -1
  118. package/dist/cli/utils/ConsoleInteraction.d.ts +0 -1
  119. package/dist/cli/utils/ConsoleTable.js.map +1 -1
  120. package/dist/cli/utils/basicChooseFromListConsoleInteraction.js.map +1 -1
  121. package/dist/cli/utils/interactivelyAskForModel.js +6 -17
  122. package/dist/cli/utils/interactivelyAskForModel.js.map +1 -1
  123. package/dist/cli/utils/printCommonInfoLines.js +3 -0
  124. package/dist/cli/utils/printCommonInfoLines.js.map +1 -1
  125. package/dist/cli/utils/renderModelCompatibilityPercentageWithColors.d.ts +6 -0
  126. package/dist/cli/utils/renderModelCompatibilityPercentageWithColors.js +14 -0
  127. package/dist/cli/utils/renderModelCompatibilityPercentageWithColors.js.map +1 -0
  128. package/dist/cli/utils/resolveModelRecommendationFileOptions.d.ts +1 -1
  129. package/dist/cli/utils/withCliCommandDescriptionDocsUrl.js.map +1 -1
  130. package/dist/commands.d.ts +3 -3
  131. package/dist/commands.js +3 -3
  132. package/dist/commands.js.map +1 -1
  133. package/dist/config.d.ts +7 -3
  134. package/dist/config.js +10 -6
  135. package/dist/config.js.map +1 -1
  136. package/dist/evaluator/LlamaChat/LlamaChat.d.ts +17 -2
  137. package/dist/evaluator/LlamaChat/LlamaChat.js +24 -12
  138. package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
  139. package/dist/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js +3 -1
  140. package/dist/evaluator/LlamaChat/utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js.map +1 -1
  141. package/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts +21 -13
  142. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js +15 -14
  143. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -1
  144. package/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.d.ts +1 -0
  145. package/dist/evaluator/LlamaChatSession/utils/LlamaChatSessionPromptCompletionEngine.js.map +1 -1
  146. package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.d.ts +3 -0
  147. package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.js +3 -0
  148. package/dist/evaluator/LlamaChatSession/utils/defineChatSessionFunction.js.map +1 -1
  149. package/dist/evaluator/LlamaCompletion.d.ts +18 -4
  150. package/dist/evaluator/LlamaCompletion.js +51 -22
  151. package/dist/evaluator/LlamaCompletion.js.map +1 -1
  152. package/dist/evaluator/LlamaContext/LlamaContext.d.ts +21 -0
  153. package/dist/evaluator/LlamaContext/LlamaContext.js +256 -133
  154. package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
  155. package/dist/evaluator/LlamaContext/LlamaSampler.d.ts +1 -0
  156. package/dist/evaluator/LlamaContext/LlamaSampler.js +31 -0
  157. package/dist/evaluator/LlamaContext/LlamaSampler.js.map +1 -0
  158. package/dist/evaluator/LlamaContext/types.d.ts +71 -9
  159. package/dist/evaluator/LlamaContext/utils/batchItemsPrioritizationStrategies/maximumParallelismStrategy.js.map +1 -1
  160. package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js +1 -1
  161. package/dist/evaluator/LlamaContext/utils/resolveBatchItemsPrioritizationStrategy.js.map +1 -1
  162. package/dist/evaluator/LlamaEmbedding.d.ts +21 -0
  163. package/dist/evaluator/LlamaEmbedding.js +53 -0
  164. package/dist/evaluator/LlamaEmbedding.js.map +1 -0
  165. package/dist/evaluator/LlamaEmbeddingContext.d.ts +1 -5
  166. package/dist/evaluator/LlamaEmbeddingContext.js +6 -8
  167. package/dist/evaluator/LlamaEmbeddingContext.js.map +1 -1
  168. package/dist/evaluator/LlamaGrammar.d.ts +9 -10
  169. package/dist/evaluator/LlamaGrammar.js +10 -5
  170. package/dist/evaluator/LlamaGrammar.js.map +1 -1
  171. package/dist/evaluator/LlamaGrammarEvaluationState.d.ts +7 -3
  172. package/dist/evaluator/LlamaGrammarEvaluationState.js +8 -4
  173. package/dist/evaluator/LlamaGrammarEvaluationState.js.map +1 -1
  174. package/dist/evaluator/LlamaJsonSchemaGrammar.d.ts +3 -0
  175. package/dist/evaluator/LlamaJsonSchemaGrammar.js +3 -0
  176. package/dist/evaluator/LlamaJsonSchemaGrammar.js.map +1 -1
  177. package/dist/evaluator/LlamaModel/LlamaModel.d.ts +28 -15
  178. package/dist/evaluator/LlamaModel/LlamaModel.js +66 -51
  179. package/dist/evaluator/LlamaModel/LlamaModel.js.map +1 -1
  180. package/dist/evaluator/LlamaModel/utils/TokenAttributes.d.ts +10 -10
  181. package/dist/evaluator/LlamaModel/utils/TokenAttributes.js +10 -10
  182. package/dist/evaluator/LlamaModel/utils/TokenAttributes.js.map +1 -1
  183. package/dist/evaluator/TokenBias.d.ts +20 -8
  184. package/dist/evaluator/TokenBias.js +44 -12
  185. package/dist/evaluator/TokenBias.js.map +1 -1
  186. package/dist/evaluator/TokenMeter.d.ts +3 -12
  187. package/dist/evaluator/TokenMeter.js +4 -16
  188. package/dist/evaluator/TokenMeter.js.map +1 -1
  189. package/dist/gguf/fileReaders/GgufFileReader.d.ts +0 -1
  190. package/dist/gguf/fileReaders/GgufFileReader.js.map +1 -1
  191. package/dist/gguf/fileReaders/GgufFsFileReader.d.ts +0 -2
  192. package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.d.ts +5 -3
  193. package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.js +26 -13
  194. package/dist/gguf/fileReaders/GgufNetworkFetchFileReader.js.map +1 -1
  195. package/dist/gguf/insights/GgufInsightsConfigurationResolver.d.ts +57 -1
  196. package/dist/gguf/insights/GgufInsightsConfigurationResolver.js +86 -4
  197. package/dist/gguf/insights/GgufInsightsConfigurationResolver.js.map +1 -1
  198. package/dist/gguf/insights/utils/scoreLevels.js.map +1 -1
  199. package/dist/gguf/readGgufFileInfo.d.ts +18 -6
  200. package/dist/gguf/readGgufFileInfo.js +8 -3
  201. package/dist/gguf/readGgufFileInfo.js.map +1 -1
  202. package/dist/gguf/types/GgufMetadataTypes.d.ts +18 -2
  203. package/dist/gguf/types/GgufMetadataTypes.js +16 -1
  204. package/dist/gguf/types/GgufMetadataTypes.js.map +1 -1
  205. package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.js +2 -0
  206. package/dist/gguf/utils/convertMetadataKeyValueRecordToNestedObject.js.map +1 -1
  207. package/dist/gguf/utils/getGgufFileTypeName.d.ts +1 -1
  208. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.js +1 -1
  209. package/dist/gguf/utils/resolveBinarySplitGgufPartUrls.js.map +1 -1
  210. package/dist/index.d.ts +8 -4
  211. package/dist/index.js +5 -3
  212. package/dist/index.js.map +1 -1
  213. package/dist/tsconfig.tsbuildinfo +1 -0
  214. package/dist/types.d.ts +1 -0
  215. package/dist/types.js.map +1 -1
  216. package/dist/utils/LlamaText.d.ts +3 -0
  217. package/dist/utils/LlamaText.js +7 -4
  218. package/dist/utils/LlamaText.js.map +1 -1
  219. package/dist/utils/LruCache.d.ts +2 -2
  220. package/dist/utils/LruCache.js.map +1 -1
  221. package/dist/utils/OverridesObject.d.ts +7 -0
  222. package/dist/utils/OverridesObject.js +2 -0
  223. package/dist/utils/OverridesObject.js.map +1 -0
  224. package/dist/utils/StopGenerationDetector.js.map +1 -1
  225. package/dist/utils/ThreadsSplitter.d.ts +26 -0
  226. package/dist/utils/ThreadsSplitter.js +164 -0
  227. package/dist/utils/ThreadsSplitter.js.map +1 -0
  228. package/dist/utils/TokenStreamRegulator.js.map +1 -1
  229. package/dist/utils/appendUserMessageToChatHistory.d.ts +4 -0
  230. package/dist/utils/appendUserMessageToChatHistory.js +4 -0
  231. package/dist/utils/appendUserMessageToChatHistory.js.map +1 -1
  232. package/dist/utils/compareTokens.d.ts +1 -1
  233. package/dist/utils/compareTokens.js.map +1 -1
  234. package/dist/utils/createModelDownloader.d.ts +94 -6
  235. package/dist/utils/createModelDownloader.js +174 -46
  236. package/dist/utils/createModelDownloader.js.map +1 -1
  237. package/dist/utils/gbnfJson/terminals/GbnfOr.js.map +1 -1
  238. package/dist/utils/gbnfJson/utils/validateObjectAgainstGbnfSchema.js +1 -1
  239. package/dist/utils/gbnfJson/utils/validateObjectAgainstGbnfSchema.js.map +1 -1
  240. package/dist/utils/getGrammarsFolder.js +1 -1
  241. package/dist/utils/getGrammarsFolder.js.map +1 -1
  242. package/dist/utils/gitReleaseBundles.js.map +1 -1
  243. package/dist/utils/modelFileAccesTokens.d.ts +4 -0
  244. package/dist/utils/modelFileAccesTokens.js +40 -0
  245. package/dist/utils/modelFileAccesTokens.js.map +1 -0
  246. package/dist/utils/parseModelFileName.js.map +1 -1
  247. package/dist/utils/parseTextTemplate.js.map +1 -1
  248. package/dist/utils/resolveGithubRelease.d.ts +1 -1
  249. package/dist/utils/resolveLastTokens.js.map +1 -1
  250. package/dist/utils/spawnCommand.d.ts +0 -1
  251. package/dist/utils/truncateTextAndRoundToWords.js +3 -1
  252. package/dist/utils/truncateTextAndRoundToWords.js.map +1 -1
  253. package/dist/utils/withOra.js +1 -1
  254. package/dist/utils/withOra.js.map +1 -1
  255. package/dist/utils/withProgressLog.d.ts +0 -1
  256. package/dist/utils/wrapAbortSignal.d.ts +0 -1
  257. package/llama/CMakeLists.txt +20 -12
  258. package/llama/addon/AddonContext.cpp +69 -202
  259. package/llama/addon/AddonContext.h +4 -5
  260. package/llama/addon/AddonGrammar.cpp +8 -11
  261. package/llama/addon/AddonGrammar.h +4 -3
  262. package/llama/addon/AddonGrammarEvaluationState.cpp +9 -10
  263. package/llama/addon/AddonGrammarEvaluationState.h +3 -1
  264. package/llama/addon/AddonModel.cpp +6 -5
  265. package/llama/addon/AddonSampler.cpp +513 -0
  266. package/llama/addon/AddonSampler.h +65 -0
  267. package/llama/addon/RingBuffer.h +109 -0
  268. package/llama/addon/addon.cpp +7 -0
  269. package/llama/addon/globals/addonLog.cpp +2 -1
  270. package/llama/binariesGithubRelease.json +1 -1
  271. package/llama/gitRelease.bundle +0 -0
  272. package/llama/grammars/README.md +1 -1
  273. package/llama/llama.cpp.info.json +1 -1
  274. package/package.json +71 -46
  275. package/templates/packed/electron-typescript-react.json +1 -1
  276. package/templates/packed/node-typescript.json +1 -1
  277. package/bins/linux-arm64/_nlcBuildMetadata.json +0 -1
  278. package/bins/linux-arm64/libggml.so +0 -0
  279. package/bins/linux-arm64/libllama.so +0 -0
  280. package/bins/linux-arm64/llama-addon.node +0 -0
  281. package/bins/linux-armv7l/_nlcBuildMetadata.json +0 -1
  282. package/bins/linux-armv7l/libggml.so +0 -0
  283. package/bins/linux-armv7l/libllama.so +0 -0
  284. package/bins/linux-armv7l/llama-addon.node +0 -0
  285. package/bins/linux-x64/_nlcBuildMetadata.json +0 -1
  286. package/bins/linux-x64/libggml.so +0 -0
  287. package/bins/linux-x64/libllama.so +0 -0
  288. package/bins/linux-x64/llama-addon.node +0 -0
  289. package/bins/linux-x64-vulkan/_nlcBuildMetadata.json +0 -1
  290. package/bins/linux-x64-vulkan/libggml.so +0 -0
  291. package/bins/linux-x64-vulkan/libllama.so +0 -0
  292. package/bins/linux-x64-vulkan/llama-addon.node +0 -0
  293. package/bins/linux-x64-vulkan/vulkan-shaders-gen +0 -0
  294. package/bins/mac-arm64-metal/_nlcBuildMetadata.json +0 -1
  295. package/bins/mac-arm64-metal/ggml-common.h +0 -1833
  296. package/bins/mac-arm64-metal/ggml-metal.metal +0 -6168
  297. package/bins/mac-arm64-metal/libggml.dylib +0 -0
  298. package/bins/mac-arm64-metal/libllama.dylib +0 -0
  299. package/bins/mac-arm64-metal/llama-addon.node +0 -0
  300. package/bins/mac-x64/_nlcBuildMetadata.json +0 -1
  301. package/bins/mac-x64/libggml.dylib +0 -0
  302. package/bins/mac-x64/libllama.dylib +0 -0
  303. package/bins/mac-x64/llama-addon.node +0 -0
  304. package/bins/win-arm64/_nlcBuildMetadata.json +0 -1
  305. package/bins/win-arm64/ggml.dll +0 -0
  306. package/bins/win-arm64/llama-addon.exp +0 -0
  307. package/bins/win-arm64/llama-addon.lib +0 -0
  308. package/bins/win-arm64/llama-addon.node +0 -0
  309. package/bins/win-arm64/llama.dll +0 -0
  310. package/bins/win-x64/_nlcBuildMetadata.json +0 -1
  311. package/bins/win-x64/ggml.dll +0 -0
  312. package/bins/win-x64/llama-addon.exp +0 -0
  313. package/bins/win-x64/llama-addon.lib +0 -0
  314. package/bins/win-x64/llama-addon.node +0 -0
  315. package/bins/win-x64/llama.dll +0 -0
  316. package/bins/win-x64-vulkan/_nlcBuildMetadata.json +0 -1
  317. package/bins/win-x64-vulkan/ggml.dll +0 -0
  318. package/bins/win-x64-vulkan/llama-addon.exp +0 -0
  319. package/bins/win-x64-vulkan/llama-addon.lib +0 -0
  320. package/bins/win-x64-vulkan/llama-addon.node +0 -0
  321. package/bins/win-x64-vulkan/llama.dll +0 -0
  322. package/bins/win-x64-vulkan/vulkan-shaders-gen.exe +0 -0
  323. package/dist/cli/commands/BuildCommand.js.map +0 -1
  324. package/dist/cli/commands/ClearCommand.js.map +0 -1
  325. package/dist/cli/commands/DownloadCommand.js.map +0 -1
  326. package/dist/utils/DeepPartialObject.d.ts +0 -3
  327. package/dist/utils/DeepPartialObject.js +0 -2
  328. package/dist/utils/DeepPartialObject.js.map +0 -1
  329. /package/dist/cli/commands/{ClearCommand.d.ts → source/commands/ClearCommand.d.ts} +0 -0
@@ -3,8 +3,16 @@ import { removeNullFields } from "../../utils/removeNullFields.js";
3
3
  import { compareTokens } from "../../utils/compareTokens.js";
4
4
  import { DisposeGuard } from "../../utils/DisposeGuard.js";
5
5
  import { TokenMeter } from "../TokenMeter.js";
6
+ import { UnsupportedError } from "../../utils/UnsupportedError.js";
6
7
  import { resolveBatchItemsPrioritizationStrategy } from "./utils/resolveBatchItemsPrioritizationStrategy.js";
8
+ import { LlamaSampler } from "./LlamaSampler.js";
7
9
  const defaultLoraScale = 1;
10
+ const shrinkRetriesMinContextSize = 4096;
11
+ const defaultMaxPunishTokens = 64;
12
+ const defaultFailedCreationRemedy = {
13
+ retries: 6,
14
+ autoContextSizeShrink: 0.16
15
+ };
8
16
  export class LlamaContext {
9
17
  /** @internal */ _llama;
10
18
  /** @internal */ _ctx;
@@ -14,6 +22,9 @@ export class LlamaContext {
14
22
  /** @internal */ _contextSize;
15
23
  /** @internal */ _batchSize;
16
24
  /** @internal */ _flashAttention;
25
+ /** @internal */ _idealThreads;
26
+ /** @internal */ _minThreads;
27
+ /** @internal */ _performanceTracking;
17
28
  /** @internal */ _totalSequences;
18
29
  /** @internal */ _unusedSequenceIds = [];
19
30
  /** @internal */ _batchingOptions;
@@ -26,11 +37,13 @@ export class LlamaContext {
26
37
  /** @internal */ _nextGeneratedSequenceId = 0;
27
38
  /** @internal */ _dispatchDecodeScheduled = false;
28
39
  /** @internal */ _batchDispatchPending = false;
40
+ /** @internal */ _threadSplitterConsumer;
41
+ /** @internal */ _freeReservedThreadsTimeout;
29
42
  /** @internal */ _currentDispatchBatchHandle = {};
30
43
  /** @internal */ _allocatedContextSize;
31
44
  /** @internal */ _disposed = false;
32
45
  onDispose = new EventRelay();
33
- constructor({ _model }, { sequences, seed = null, contextSize, batchSize, flashAttention = _model.defaultContextFlashAttention, threads = 6, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, _embeddings, _noSeed }) {
46
+ constructor({ _model }, { sequences, contextSize, batchSize, flashAttention = _model.defaultContextFlashAttention, threads, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, performanceTracking = false, _embeddings }) {
34
47
  if (_model.disposed)
35
48
  throw new DisposedError();
36
49
  this._llama = _model._llama;
@@ -41,15 +54,21 @@ export class LlamaContext {
41
54
  this._contextSize = Math.max(2, contextSize);
42
55
  this._batchSize = Math.max(batchSize, this._totalSequences);
43
56
  this._flashAttention = flashAttention;
57
+ this._idealThreads = typeof threads === "number"
58
+ ? this._llama._threadsSplitter.normalizeThreadsValue(threads)
59
+ : this._llama._threadsSplitter.normalizeThreadsValue(threads?.ideal ?? this._llama.maxThreads);
60
+ this._minThreads = typeof threads === "number"
61
+ ? 1
62
+ : this._llama._threadsSplitter.normalizeThreadsValue(threads?.min ?? 1);
63
+ this._performanceTracking = !!performanceTracking;
44
64
  this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({
45
- seed: seed != null ? Math.max(-1, Math.floor(seed)) : undefined,
46
65
  contextSize: this._contextSize * this._totalSequences, // each sequence needs its own <contextSize> of cells
47
66
  batchSize: this._batchSize,
48
67
  sequences: this._totalSequences,
49
68
  flashAttention: this._flashAttention,
50
- threads: Math.max(0, Math.floor(threads)),
69
+ threads: this._idealThreads,
51
70
  embeddings: _embeddings,
52
- noSeed: _noSeed
71
+ performanceTracking: this._performanceTracking
53
72
  }));
54
73
  this._batchingOptions = {
55
74
  dispatchSchedule: batchingDispatchSchedule,
@@ -58,6 +77,7 @@ export class LlamaContext {
58
77
  this._gcRegistry = new FinalizationRegistry(this._model._removeLoraUsage);
59
78
  this._gcRegistry.register(this, this._loraAdapters);
60
79
  this._reclaimUnusedSequenceId = this._reclaimUnusedSequenceId.bind(this);
80
+ this._freeReservedThreads = this._freeReservedThreads.bind(this);
61
81
  this._disposeAggregator.add(() => {
62
82
  this._disposed = true;
63
83
  });
@@ -111,6 +131,19 @@ export class LlamaContext {
111
131
  this._ensureNotDisposed();
112
132
  return this._ctx.getStateSize();
113
133
  }
134
+ /** The number of threads currently used to evaluate tokens */
135
+ get currentThreads() {
136
+ this._ensureNotDisposed();
137
+ return this._ctx.getThreads();
138
+ }
139
+ /**
140
+ * The number of threads that are preferred to be used to evaluate tokens.
141
+ *
142
+ * The actual number of threads used may be lower when other evaluations are running in parallel.
143
+ */
144
+ get idealThreads() {
145
+ return this._idealThreads;
146
+ }
114
147
  getAllocatedContextSize() {
115
148
  this._ensureNotDisposed();
116
149
  if (this._allocatedContextSize == null)
@@ -263,13 +296,19 @@ export class LlamaContext {
263
296
  i--;
264
297
  }
265
298
  }
266
- try {
267
- if (currentBatchSize !== 0)
299
+ if (currentBatchSize !== 0) {
300
+ const [threadsToUse, consumerHandle] = await this._threadSplitterConsumer?.getAllocationToConsume() ?? [];
301
+ try {
302
+ if (threadsToUse != null)
303
+ this._ctx.setThreads(threadsToUse);
268
304
  await this._ctx.decodeBatch();
269
- }
270
- catch (err) {
271
- this._dispatchErrorForQueuedDecodesAndDequeue(currentQueuedDecodeItems, err);
272
- return;
305
+ consumerHandle?.dispose();
306
+ }
307
+ catch (err) {
308
+ consumerHandle?.dispose();
309
+ this._dispatchErrorForQueuedDecodesAndDequeue(currentQueuedDecodeItems, err);
310
+ return;
311
+ }
273
312
  }
274
313
  for (const action of afterDecodeActions) {
275
314
  const [accept, reject] = action.response;
@@ -287,36 +326,47 @@ export class LlamaContext {
287
326
  const prioritizationStrategy = resolvePrioritizationStrategy();
288
327
  if (prioritizationStrategy == null)
289
328
  return; // all queued items are rejected and dequeued when we get here
290
- while (shouldHaveAnotherLoop) {
291
- const orderedQueuedDecodes = getOrderedQueuedDecodes(prioritizationStrategy);
292
- if (orderedQueuedDecodes == null)
293
- return; // all queued items are rejected and dequeued when we get here
294
- const { currentBatchItems, currentBatchSize } = fitQueuedDecodesToABatch(orderedQueuedDecodes, this._batchSize);
295
- let preventDisposalHandle;
296
- try {
297
- preventDisposalHandle = this._backendContextDisposeGuard.createPreventDisposalHandle();
298
- }
299
- catch (err) {
300
- this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
301
- return;
302
- }
303
- try {
304
- await decodeTokenBatchItems(currentBatchItems, currentBatchSize);
305
- shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
306
- }
307
- finally {
308
- preventDisposalHandle.dispose();
329
+ this._reserveThreads();
330
+ try {
331
+ while (shouldHaveAnotherLoop) {
332
+ const orderedQueuedDecodes = getOrderedQueuedDecodes(prioritizationStrategy);
333
+ if (orderedQueuedDecodes == null)
334
+ return; // all queued items are rejected and dequeued when we get here
335
+ const { currentBatchItems, currentBatchSize } = fitQueuedDecodesToABatch(orderedQueuedDecodes, this._batchSize);
336
+ let preventDisposalHandle;
337
+ try {
338
+ preventDisposalHandle = this._backendContextDisposeGuard.createPreventDisposalHandle();
339
+ }
340
+ catch (err) {
341
+ this._dispatchErrorForQueuedDecodesAndDequeue(new Set(this._queuedDecodes), err);
342
+ return;
343
+ }
344
+ try {
345
+ await decodeTokenBatchItems(currentBatchItems, currentBatchSize);
346
+ shouldHaveAnotherLoop = this._queuedDecodes.length > 0;
347
+ }
348
+ finally {
349
+ preventDisposalHandle.dispose();
350
+ }
309
351
  }
310
352
  }
353
+ finally {
354
+ this._scheduleToFreeReservedThreads();
355
+ }
311
356
  });
312
357
  }
313
358
  /**
314
359
  * Print the timings of token evaluation since the last print for this context.
360
+ *
361
+ * Requires the `performanceTracking` option to be enabled.
362
+ *
315
363
  * > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
316
364
  * it won't print anything.
317
365
  */
318
366
  async printTimings() {
319
367
  this._ensureNotDisposed();
368
+ if (!this._performanceTracking)
369
+ throw new UnsupportedError("Performance tracking is not enabled");
320
370
  this._ctx.printTimings();
321
371
  await new Promise((accept) => setTimeout(accept, 0)); // wait for the logs to finish printing
322
372
  }
@@ -350,14 +400,6 @@ export class LlamaContext {
350
400
  });
351
401
  }
352
402
  /** @internal */
353
- _acceptTokenOnGrammarEvaluationState(grammarEvaluationState, token) {
354
- this._ctx.acceptGrammarEvaluationStateToken(grammarEvaluationState._state, token);
355
- }
356
- /** @internal */
357
- _canBeNextTokenForGrammarEvaluationState(grammarEvaluationState, token) {
358
- return this._ctx.canBeNextTokenForGrammarEvaluationState(grammarEvaluationState._state, token);
359
- }
360
- /** @internal */
361
403
  _popSequenceId() {
362
404
  if (this._unusedSequenceIds.length > 0)
363
405
  return this._unusedSequenceIds.shift();
@@ -417,6 +459,30 @@ export class LlamaContext {
417
459
  }
418
460
  }
419
461
  /** @internal */
462
+ _reserveThreads() {
463
+ clearTimeout(this._freeReservedThreadsTimeout);
464
+ delete this._freeReservedThreadsTimeout;
465
+ if (this._threadSplitterConsumer != null)
466
+ return;
467
+ this._threadSplitterConsumer = this._llama._threadsSplitter.createConsumer(this._idealThreads, this._minThreads);
468
+ }
469
+ /** @internal */
470
+ _freeReservedThreads() {
471
+ clearTimeout(this._freeReservedThreadsTimeout);
472
+ delete this._freeReservedThreadsTimeout;
473
+ if (this._threadSplitterConsumer == null)
474
+ return;
475
+ this._threadSplitterConsumer.dispose();
476
+ delete this._threadSplitterConsumer;
477
+ }
478
+ /** @internal */
479
+ _scheduleToFreeReservedThreads() {
480
+ if (this._threadSplitterConsumer == null)
481
+ return;
482
+ clearTimeout(this._freeReservedThreadsTimeout);
483
+ this._freeReservedThreadsTimeout = setTimeout(this._freeReservedThreads, 0);
484
+ }
485
+ /** @internal */
420
486
  static async _create(options, { _model }) {
421
487
  const sequences = options.sequences ?? getDefaultContextSequences();
422
488
  const flashAttention = _model.flashAttentionSupported
@@ -425,7 +491,13 @@ export class LlamaContext {
425
491
  const loraOptions = typeof options.lora === "string"
426
492
  ? { adapters: [{ filePath: options.lora }] }
427
493
  : options.lora;
428
- const contextSize = await _model.fileInsights.configurationResolver.resolveContextContextSize(options.contextSize, {
494
+ let failedCreationRetries = options.failedCreationRemedy === false
495
+ ? 0
496
+ : Math.max(0, options.failedCreationRemedy?.retries ?? defaultFailedCreationRemedy.retries);
497
+ const failedCreationAutoContextSizeShrink = options.failedCreationRemedy === false
498
+ ? 0
499
+ : options.failedCreationRemedy?.autoContextSizeShrink ?? defaultFailedCreationRemedy.autoContextSizeShrink;
500
+ let contextSize = await _model.fileInsights.configurationResolver.resolveContextContextSize(options.contextSize, {
429
501
  batchSize: options.batchSize,
430
502
  sequences: sequences,
431
503
  modelGpuLayers: _model.gpuLayers,
@@ -436,69 +508,101 @@ export class LlamaContext {
436
508
  ignoreMemorySafetyChecks: options.ignoreMemorySafetyChecks,
437
509
  isEmbeddingContext: options._embeddings
438
510
  });
439
- const batchSize = options.batchSize ?? getDefaultContextBatchSize({ contextSize, sequences });
440
- const vramRequiredEstimate = _model.fileInsights.estimateContextResourceRequirements({
441
- contextSize,
442
- sequences,
443
- isEmbeddingContext: options._embeddings,
444
- modelGpuLayers: _model.gpuLayers,
445
- batchSize,
446
- flashAttention
447
- }).gpuVram;
448
- const context = new LlamaContext({ _model }, { ...options, contextSize, batchSize, sequences, flashAttention });
511
+ const minContextSize = options.contextSize === "auto"
512
+ ? shrinkRetriesMinContextSize
513
+ : (typeof options.contextSize === "object" && typeof options.contextSize.min === "number")
514
+ ? options.contextSize.min
515
+ : typeof options.contextSize === "number"
516
+ ? options.contextSize
517
+ : shrinkRetriesMinContextSize;
449
518
  const { createSignal } = options;
450
- const contextCreationMemoryReservation = options.ignoreMemorySafetyChecks
451
- ? null
452
- : _model._llama._vramOrchestrator.reserveMemory(vramRequiredEstimate);
453
- try {
454
- const contextLoaded = await context._ctx.init();
455
- if (createSignal?.aborted) {
456
- if (contextLoaded)
457
- await context._ctx.dispose();
458
- throw createSignal.reason;
459
- }
460
- else if (!contextLoaded)
461
- throw new Error("Failed to create context");
462
- contextCreationMemoryReservation?.dispose?.();
463
- if (loraOptions != null && loraOptions.adapters.length > 0) {
464
- let loadedAdapters = 0;
465
- for (const adapter of loraOptions.adapters) {
466
- try {
467
- await context._setLora({
468
- filePath: adapter.filePath,
469
- scale: adapter.scale
470
- });
471
- loadedAdapters++;
519
+ async function createContext(contextSize) {
520
+ const batchSize = options.batchSize ?? getDefaultContextBatchSize({ contextSize, sequences });
521
+ const vramRequiredEstimate = _model.fileInsights.estimateContextResourceRequirements({
522
+ contextSize,
523
+ sequences,
524
+ isEmbeddingContext: options._embeddings,
525
+ modelGpuLayers: _model.gpuLayers,
526
+ batchSize,
527
+ flashAttention
528
+ }).gpuVram;
529
+ const context = new LlamaContext({ _model }, { ...options, contextSize, batchSize, sequences, flashAttention });
530
+ const contextCreationMemoryReservation = options.ignoreMemorySafetyChecks
531
+ ? null
532
+ : _model._llama._vramOrchestrator.reserveMemory(vramRequiredEstimate);
533
+ try {
534
+ if (createSignal?.aborted)
535
+ throw createSignal.reason;
536
+ const contextLoaded = await context._ctx.init();
537
+ if (createSignal?.aborted) {
538
+ if (contextLoaded)
539
+ await context._ctx.dispose();
540
+ throw createSignal.reason;
541
+ }
542
+ else if (!contextLoaded)
543
+ throw new Error("Failed to create context");
544
+ contextCreationMemoryReservation?.dispose?.();
545
+ if (loraOptions != null && loraOptions.adapters.length > 0) {
546
+ let loadedAdapters = 0;
547
+ for (const adapter of loraOptions.adapters) {
472
548
  try {
473
- loraOptions.onLoadProgress?.(loadedAdapters / loraOptions.adapters.length);
549
+ await context._setLora({
550
+ filePath: adapter.filePath,
551
+ scale: adapter.scale
552
+ });
553
+ loadedAdapters++;
554
+ try {
555
+ loraOptions.onLoadProgress?.(loadedAdapters / loraOptions.adapters.length);
556
+ }
557
+ catch (err) {
558
+ console.error(err);
559
+ }
474
560
  }
475
561
  catch (err) {
476
- console.error(err);
562
+ await context.dispose();
563
+ throw err;
564
+ }
565
+ if (createSignal?.aborted) {
566
+ await context.dispose();
567
+ throw createSignal.reason;
477
568
  }
478
569
  }
479
- catch (err) {
480
- await context.dispose();
481
- throw err;
570
+ }
571
+ else if (loraOptions?.onLoadProgress != null) {
572
+ try {
573
+ loraOptions.onLoadProgress(1);
482
574
  }
483
- if (createSignal?.aborted) {
484
- await context.dispose();
485
- throw createSignal.reason;
575
+ catch (err) {
576
+ console.error(err);
486
577
  }
487
578
  }
579
+ return context;
488
580
  }
489
- else if (loraOptions?.onLoadProgress != null) {
490
- try {
491
- loraOptions.onLoadProgress(1);
492
- }
493
- catch (err) {
494
- console.error(err);
495
- }
581
+ finally {
582
+ contextCreationMemoryReservation?.dispose?.();
496
583
  }
497
- return context;
498
584
  }
499
- finally {
500
- contextCreationMemoryReservation?.dispose?.();
585
+ while (failedCreationRetries >= 0) {
586
+ try {
587
+ return await createContext(contextSize);
588
+ }
589
+ catch (err) {
590
+ if (failedCreationRetries === 0 || (createSignal?.aborted && err === createSignal.reason))
591
+ throw err;
592
+ failedCreationRetries--;
593
+ let newContextSize = typeof failedCreationAutoContextSizeShrink === "number"
594
+ ? Math.floor(contextSize * (1 - failedCreationAutoContextSizeShrink))
595
+ : Math.floor(failedCreationAutoContextSizeShrink(contextSize));
596
+ if (!Number.isFinite(newContextSize))
597
+ throw err;
598
+ if (newContextSize < minContextSize)
599
+ newContextSize = minContextSize;
600
+ if (newContextSize >= contextSize)
601
+ throw err;
602
+ contextSize = newContextSize;
603
+ }
501
604
  }
605
+ throw new Error("Failed to create context");
502
606
  }
503
607
  }
504
608
  export class LlamaContextSequence {
@@ -639,12 +743,13 @@ export class LlamaContextSequence {
639
743
  });
640
744
  }
641
745
  evaluate(tokens, options = {}) {
642
- const { temperature = 0, minP = 0, topK = 40, topP = 0.95, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, yieldEogToken = false, _noSampling = false } = options;
746
+ const { temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, contextShift: { size: contextShiftSize = this._contextShift.size, strategy: contextShiftStrategy = this._contextShift.strategy } = {}, yieldEogToken = false, _noSampling = false } = options;
643
747
  return this._evaluate(tokens, {
644
748
  temperature,
645
749
  minP,
646
750
  topK,
647
751
  topP,
752
+ seed,
648
753
  grammarEvaluationState,
649
754
  repeatPenalty,
650
755
  tokenBias,
@@ -677,53 +782,71 @@ export class LlamaContextSequence {
677
782
  }
678
783
  }
679
784
  /** @internal */
680
- async *_evaluate(tokens, { temperature = 0, minP = 0, topK = 40, topP = 0.95, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, generateNewTokens = true, contextShiftOptions, yieldEogToken = false, _noSampling = false }) {
785
+ async *_evaluate(tokens, { temperature = 0, minP = 0, topK = 40, topP = 0.95, seed, grammarEvaluationState, repeatPenalty, tokenBias, evaluationPriority = 5, generateNewTokens = true, contextShiftOptions, yieldEogToken = false, _noSampling = false }) {
681
786
  this._ensureNotDisposed();
682
787
  let evalTokens = tokens;
683
788
  if (evalTokens.length === 0)
684
789
  return;
685
- while (true) {
686
- this._ensureNotDisposed();
687
- // Evaluate to get the next token.
688
- const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
689
- if (_noSampling)
690
- return null;
691
- const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
692
- ? repeatPenalty.punishTokens()
693
- : repeatPenalty?.punishTokens;
694
- const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
695
- ? grammarEvaluationState()
696
- : grammarEvaluationState;
697
- if (resolvedGrammarEvaluationState != null && resolvedGrammarEvaluationState._llama !== this.model._llama)
698
- throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
699
- const { tokenBiasKeys, tokenBiasValues } = getTokenBiasesForAddon(tokenBias, this.model);
700
- return this._context._ctx.sampleToken(batchLogitIndex, removeNullFields({
701
- temperature,
702
- minP,
703
- topK,
704
- topP,
705
- repeatPenalty: repeatPenalty?.penalty,
706
- repeatPenaltyTokens: repeatPenaltyTokens != null
707
- ? Uint32Array.from(repeatPenaltyTokens)
708
- : undefined,
709
- repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty,
710
- repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty,
711
- tokenBiasKeys,
712
- tokenBiasValues,
713
- grammarEvaluationState: resolvedGrammarEvaluationState?._state
714
- }));
715
- });
716
- if (nextToken == null)
717
- return;
718
- // the model finished generating text
719
- if (!yieldEogToken && this._context.model.isEogToken(nextToken))
720
- break;
721
- const replacementToken = (yield nextToken);
722
- // set the tokens for the next evaluation
723
- if (replacementToken != null)
724
- evalTokens = [replacementToken];
725
- else
726
- evalTokens = [nextToken];
790
+ const sampler = new LlamaSampler(this.model);
791
+ try {
792
+ while (true) {
793
+ this._ensureNotDisposed();
794
+ // Evaluate to get the next token.
795
+ const nextToken = await this._decodeTokens(evalTokens, generateNewTokens, evaluationPriority, this._tokenMeter, contextShiftOptions, (batchLogitIndex) => {
796
+ if (_noSampling)
797
+ return null;
798
+ const repeatPenaltyTokens = repeatPenalty?.punishTokens instanceof Function
799
+ ? repeatPenalty.punishTokens()
800
+ : repeatPenalty?.punishTokens;
801
+ const maxPunishTokens = Math.max(repeatPenalty?.maxPunishTokens ?? defaultMaxPunishTokens, repeatPenaltyTokens?.length ?? 0);
802
+ const resolvedGrammarEvaluationState = grammarEvaluationState instanceof Function
803
+ ? grammarEvaluationState()
804
+ : grammarEvaluationState;
805
+ if (resolvedGrammarEvaluationState != null && resolvedGrammarEvaluationState._llama !== this.model._llama)
806
+ throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
807
+ const { tokenBiasKeys, tokenBiasValues } = getTokenBiasesForAddon(tokenBias, this.model);
808
+ sampler.applyConfig(removeNullFields({
809
+ temperature,
810
+ minP,
811
+ topK,
812
+ topP,
813
+ seed: Math.max(0, Number.isFinite(seed)
814
+ ? Math.floor(seed ?? (Date.now() / 1000))
815
+ : Math.floor(Date.now() / 1000)),
816
+ repeatPenalty: repeatPenalty?.penalty,
817
+ repeatPenaltyMaxTokens: maxPunishTokens,
818
+ repeatPenaltyTokens: repeatPenaltyTokens != null
819
+ ? Uint32Array.from(repeatPenaltyTokens)
820
+ : undefined,
821
+ repeatPenaltyPresencePenalty: repeatPenalty?.presencePenalty,
822
+ repeatPenaltyFrequencyPenalty: repeatPenalty?.frequencyPenalty,
823
+ tokenBiasKeys,
824
+ tokenBiasValues,
825
+ grammarEvaluationState: resolvedGrammarEvaluationState?._state
826
+ }));
827
+ return withLock(sampler, "sample", async () => {
828
+ if (sampler.disposed)
829
+ return null;
830
+ return this._context._ctx.sampleToken(batchLogitIndex, sampler._sampler);
831
+ });
832
+ });
833
+ if (nextToken === -1)
834
+ throw new Error("Failed to sample next token");
835
+ if (nextToken == null)
836
+ return;
837
+ // the model finished generating text
838
+ if (!yieldEogToken && this._context.model.isEogToken(nextToken))
839
+ break;
840
+ const replacementToken = (yield nextToken);
841
+ // set the tokens for the next evaluation
842
+ if (replacementToken != null)
843
+ evalTokens = [replacementToken];
844
+ else
845
+ evalTokens = [nextToken];
846
+ }
847
+ }
848
+ finally {
849
+ void withLock(sampler, "sample", sampler.asyncDispose);
727
850
  }
728
851
  }
729
852
  /** @internal */
@@ -814,7 +937,7 @@ function getTokenBiasesForAddon(tokenBias, currentModel) {
814
937
  };
815
938
  if (tokenBias instanceof Function)
816
939
  tokenBias = tokenBias();
817
- if (tokenBias._model !== currentModel)
940
+ if (tokenBias._tokenizer !== currentModel.tokenizer)
818
941
  throw new Error("This TokenBias instance was created with a different model than the one used by this context. " +
819
942
  "Make sure you use the model instance of the context sequence for the TokenBias you use it with.");
820
943
  const tokenBiasKeys = [];