node-llama-cpp 3.3.2 → 3.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199)
  1. package/README.md +3 -2
  2. package/dist/bindings/AddonTypes.d.ts +12 -4
  3. package/dist/bindings/Llama.d.ts +9 -0
  4. package/dist/bindings/Llama.js +52 -28
  5. package/dist/bindings/Llama.js.map +1 -1
  6. package/dist/bindings/getLlama.d.ts +2 -1
  7. package/dist/bindings/getLlama.js +19 -9
  8. package/dist/bindings/getLlama.js.map +1 -1
  9. package/dist/bindings/utils/asyncSome.js +2 -0
  10. package/dist/bindings/utils/asyncSome.js.map +1 -1
  11. package/dist/bindings/utils/compileLLamaCpp.d.ts +1 -1
  12. package/dist/bindings/utils/compileLLamaCpp.js +115 -34
  13. package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
  14. package/dist/bindings/utils/detectAvailableComputeLayers.d.ts +1 -0
  15. package/dist/bindings/utils/detectAvailableComputeLayers.js +4 -4
  16. package/dist/bindings/utils/detectAvailableComputeLayers.js.map +1 -1
  17. package/dist/bindings/utils/detectBuildTools.d.ts +14 -0
  18. package/dist/bindings/utils/detectBuildTools.js +149 -0
  19. package/dist/bindings/utils/detectBuildTools.js.map +1 -0
  20. package/dist/bindings/utils/resolveActualBindingBinaryPath.d.ts +1 -0
  21. package/dist/bindings/utils/resolveActualBindingBinaryPath.js +18 -0
  22. package/dist/bindings/utils/resolveActualBindingBinaryPath.js.map +1 -0
  23. package/dist/bindings/utils/testBindingBinary.d.ts +1 -1
  24. package/dist/bindings/utils/testBindingBinary.js +58 -5
  25. package/dist/bindings/utils/testBindingBinary.js.map +1 -1
  26. package/dist/chatWrappers/AlpacaChatWrapper.d.ts +4 -0
  27. package/dist/chatWrappers/AlpacaChatWrapper.js +4 -0
  28. package/dist/chatWrappers/AlpacaChatWrapper.js.map +1 -1
  29. package/dist/chatWrappers/FalconChatWrapper.d.ts +4 -0
  30. package/dist/chatWrappers/FalconChatWrapper.js +4 -0
  31. package/dist/chatWrappers/FalconChatWrapper.js.map +1 -1
  32. package/dist/chatWrappers/GeneralChatWrapper.d.ts +4 -0
  33. package/dist/chatWrappers/GeneralChatWrapper.js +4 -0
  34. package/dist/chatWrappers/GeneralChatWrapper.js.map +1 -1
  35. package/dist/chatWrappers/utils/resolveChatWrapper.d.ts +2 -0
  36. package/dist/chatWrappers/utils/resolveChatWrapper.js +8 -27
  37. package/dist/chatWrappers/utils/resolveChatWrapper.js.map +1 -1
  38. package/dist/cli/commands/ChatCommand.d.ts +4 -0
  39. package/dist/cli/commands/ChatCommand.js +158 -13
  40. package/dist/cli/commands/ChatCommand.js.map +1 -1
  41. package/dist/cli/commands/CompleteCommand.d.ts +4 -0
  42. package/dist/cli/commands/CompleteCommand.js +143 -10
  43. package/dist/cli/commands/CompleteCommand.js.map +1 -1
  44. package/dist/cli/commands/DebugCommand.js +5 -5
  45. package/dist/cli/commands/DebugCommand.js.map +1 -1
  46. package/dist/cli/commands/InfillCommand.d.ts +4 -0
  47. package/dist/cli/commands/InfillCommand.js +142 -10
  48. package/dist/cli/commands/InfillCommand.js.map +1 -1
  49. package/dist/cli/commands/OnPostInstallCommand.js +12 -2
  50. package/dist/cli/commands/OnPostInstallCommand.js.map +1 -1
  51. package/dist/cli/commands/inspect/commands/InspectEstimateCommand.d.ts +1 -0
  52. package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js +14 -7
  53. package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js.map +1 -1
  54. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js +13 -3
  55. package/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map +1 -1
  56. package/dist/cli/commands/inspect/commands/InspectGpuCommand.js +20 -10
  57. package/dist/cli/commands/inspect/commands/InspectGpuCommand.js.map +1 -1
  58. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.d.ts +2 -0
  59. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js +234 -77
  60. package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map +1 -1
  61. package/dist/cli/recommendedModels.js +11 -1
  62. package/dist/cli/recommendedModels.js.map +1 -1
  63. package/dist/cli/utils/ConsoleTable.d.ts +1 -0
  64. package/dist/cli/utils/ConsoleTable.js +5 -1
  65. package/dist/cli/utils/ConsoleTable.js.map +1 -1
  66. package/dist/cli/utils/interactivelyAskForModel.d.ts +2 -1
  67. package/dist/cli/utils/interactivelyAskForModel.js +16 -13
  68. package/dist/cli/utils/interactivelyAskForModel.js.map +1 -1
  69. package/dist/cli/utils/isRunningUnderRosetta.d.ts +1 -0
  70. package/dist/cli/utils/isRunningUnderRosetta.js +20 -0
  71. package/dist/cli/utils/isRunningUnderRosetta.js.map +1 -0
  72. package/dist/cli/utils/printCommonInfoLines.d.ts +4 -2
  73. package/dist/cli/utils/printCommonInfoLines.js +67 -5
  74. package/dist/cli/utils/printCommonInfoLines.js.map +1 -1
  75. package/dist/cli/utils/resolveCommandGgufPath.d.ts +3 -1
  76. package/dist/cli/utils/resolveCommandGgufPath.js +6 -5
  77. package/dist/cli/utils/resolveCommandGgufPath.js.map +1 -1
  78. package/dist/cli/utils/toBytes.d.ts +1 -0
  79. package/dist/cli/utils/toBytes.js +5 -0
  80. package/dist/cli/utils/toBytes.js.map +1 -0
  81. package/dist/config.d.ts +3 -0
  82. package/dist/config.js +3 -0
  83. package/dist/config.js.map +1 -1
  84. package/dist/evaluator/LlamaChat/LlamaChat.d.ts +12 -3
  85. package/dist/evaluator/LlamaChat/LlamaChat.js +21 -7
  86. package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
  87. package/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts +6 -2
  88. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js +3 -0
  89. package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -1
  90. package/dist/evaluator/LlamaCompletion.d.ts +3 -0
  91. package/dist/evaluator/LlamaCompletion.js +5 -0
  92. package/dist/evaluator/LlamaCompletion.js.map +1 -1
  93. package/dist/evaluator/LlamaContext/LlamaContext.d.ts +81 -38
  94. package/dist/evaluator/LlamaContext/LlamaContext.js +678 -132
  95. package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
  96. package/dist/evaluator/LlamaContext/TokenPredictor.d.ts +55 -0
  97. package/dist/evaluator/LlamaContext/TokenPredictor.js +20 -0
  98. package/dist/evaluator/LlamaContext/TokenPredictor.js.map +1 -0
  99. package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.d.ts +56 -0
  100. package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js +266 -0
  101. package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js.map +1 -0
  102. package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.d.ts +58 -0
  103. package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js +138 -0
  104. package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js.map +1 -0
  105. package/dist/evaluator/LlamaContext/types.d.ts +198 -5
  106. package/dist/evaluator/LlamaEmbeddingContext.d.ts +3 -0
  107. package/dist/evaluator/LlamaEmbeddingContext.js +3 -0
  108. package/dist/evaluator/LlamaEmbeddingContext.js.map +1 -1
  109. package/dist/evaluator/LlamaGrammar.d.ts +7 -1
  110. package/dist/evaluator/LlamaGrammar.js +6 -0
  111. package/dist/evaluator/LlamaGrammar.js.map +1 -1
  112. package/dist/evaluator/LlamaGrammarEvaluationState.d.ts +4 -4
  113. package/dist/evaluator/LlamaGrammarEvaluationState.js +16 -8
  114. package/dist/evaluator/LlamaGrammarEvaluationState.js.map +1 -1
  115. package/dist/evaluator/LlamaJsonSchemaGrammar.d.ts +5 -0
  116. package/dist/evaluator/LlamaJsonSchemaGrammar.js +7 -0
  117. package/dist/evaluator/LlamaJsonSchemaGrammar.js.map +1 -1
  118. package/dist/evaluator/LlamaModel/LlamaModel.d.ts +19 -11
  119. package/dist/evaluator/LlamaModel/LlamaModel.js +23 -29
  120. package/dist/evaluator/LlamaModel/LlamaModel.js.map +1 -1
  121. package/dist/evaluator/LlamaRankingContext.d.ts +76 -0
  122. package/dist/evaluator/LlamaRankingContext.js +158 -0
  123. package/dist/evaluator/LlamaRankingContext.js.map +1 -0
  124. package/dist/evaluator/TokenBias.d.ts +3 -0
  125. package/dist/evaluator/TokenBias.js +3 -0
  126. package/dist/evaluator/TokenBias.js.map +1 -1
  127. package/dist/evaluator/utils/chunkDocument.d.ts +86 -0
  128. package/dist/evaluator/utils/chunkDocument.js +212 -0
  129. package/dist/evaluator/utils/chunkDocument.js.map +1 -0
  130. package/dist/gguf/insights/GgufInsights.d.ts +3 -1
  131. package/dist/gguf/insights/GgufInsights.js +114 -8
  132. package/dist/gguf/insights/GgufInsights.js.map +1 -1
  133. package/dist/gguf/insights/GgufInsightsConfigurationResolver.d.ts +6 -3
  134. package/dist/gguf/insights/GgufInsightsConfigurationResolver.js +11 -7
  135. package/dist/gguf/insights/GgufInsightsConfigurationResolver.js.map +1 -1
  136. package/dist/gguf/insights/utils/resolveModelGpuLayersOption.d.ts +2 -1
  137. package/dist/gguf/insights/utils/resolveModelGpuLayersOption.js +13 -7
  138. package/dist/gguf/insights/utils/resolveModelGpuLayersOption.js.map +1 -1
  139. package/dist/gguf/parser/GgufV2Parser.js +29 -8
  140. package/dist/gguf/parser/GgufV2Parser.js.map +1 -1
  141. package/dist/gguf/parser/parseGguf.js +11 -11
  142. package/dist/gguf/parser/parseGguf.js.map +1 -1
  143. package/dist/gguf/readGgufFileInfo.js +8 -3
  144. package/dist/gguf/readGgufFileInfo.js.map +1 -1
  145. package/dist/gguf/types/GgufFileInfoTypes.d.ts +1 -0
  146. package/dist/gguf/types/GgufMetadataTypes.d.ts +9 -9
  147. package/dist/gguf/types/GgufMetadataTypes.js +1 -1
  148. package/dist/gguf/types/GgufMetadataTypes.js.map +1 -1
  149. package/dist/gguf/types/GgufTensorInfoTypes.d.ts +13 -0
  150. package/dist/gguf/types/GgufTensorInfoTypes.js.map +1 -1
  151. package/dist/index.d.ts +7 -2
  152. package/dist/index.js +6 -1
  153. package/dist/index.js.map +1 -1
  154. package/dist/tsconfig.tsbuildinfo +1 -1
  155. package/dist/utils/LlamaText.d.ts +4 -1
  156. package/dist/utils/LlamaText.js +4 -1
  157. package/dist/utils/LlamaText.js.map +1 -1
  158. package/dist/utils/cmake.js +23 -0
  159. package/dist/utils/cmake.js.map +1 -1
  160. package/dist/utils/pushAll.d.ts +1 -1
  161. package/dist/utils/pushAll.js.map +1 -1
  162. package/dist/utils/tokenizerUtils.js +1 -1
  163. package/dist/utils/utilTypes.d.ts +5 -0
  164. package/llama/CMakeLists.txt +25 -8
  165. package/llama/addon/AddonContext.cpp +196 -22
  166. package/llama/addon/AddonContext.h +1 -0
  167. package/llama/addon/AddonGrammar.cpp +1 -4
  168. package/llama/addon/AddonGrammarEvaluationState.cpp +16 -5
  169. package/llama/addon/AddonModel.cpp +31 -39
  170. package/llama/addon/AddonModel.h +1 -1
  171. package/llama/addon/AddonModelLora.cpp +2 -2
  172. package/llama/addon/AddonModelLora.h +1 -1
  173. package/llama/addon/AddonSampler.cpp +7 -12
  174. package/llama/addon/addon.cpp +26 -7
  175. package/llama/addon/globals/getGpuInfo.cpp +30 -5
  176. package/llama/addon/globals/getGpuInfo.h +6 -1
  177. package/llama/addon/globals/getMemoryInfo.cpp +63 -0
  178. package/llama/addon/globals/getMemoryInfo.h +4 -0
  179. package/llama/binariesGithubRelease.json +1 -1
  180. package/llama/cmake/win32.ensureNinjaPath.cmake +68 -0
  181. package/llama/cmake/win32.ensureNodeLib.cmake +34 -0
  182. package/llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake +12 -0
  183. package/llama/cmake/win32.llvmEnsureCmakeAr.cmake +37 -0
  184. package/llama/cmake/win32.llvmUseGnuModeCompilers.cmake +87 -0
  185. package/llama/cmake/win32.programFilesPaths.cmake +35 -0
  186. package/llama/gitRelease.bundle +0 -0
  187. package/llama/gpuInfo/vulkan-gpu-info.cpp +29 -2
  188. package/llama/gpuInfo/vulkan-gpu-info.h +1 -0
  189. package/llama/llama.cpp.info.json +1 -1
  190. package/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake +14 -0
  191. package/llama/profiles/llvm.win32.host-x64.target-arm64.cmake +14 -0
  192. package/llama/profiles/llvm.win32.host-x64.target-x64.cmake +14 -0
  193. package/llama/toolchains/llvm.win32.host-x64.target-x64.cmake +20 -0
  194. package/llama/toolchains/win32.host-arm64.target-arm64.cmake +21 -0
  195. package/llama/toolchains/win32.host-x64.target-arm64.cmake +14 -34
  196. package/package.json +47 -44
  197. package/templates/README.md +1 -1
  198. package/templates/packed/electron-typescript-react.json +1 -1
  199. package/templates/packed/node-typescript.json +1 -1
@@ -2,8 +2,8 @@ import path from "path";
2
2
  import process from "process";
3
3
  import { fileURLToPath } from "url";
4
4
  import { fork } from "node:child_process";
5
+ import os from "os";
5
6
  import chalk from "chalk";
6
- import bytes from "bytes";
7
7
  import stripAnsi from "strip-ansi";
8
8
  import { readGgufFileInfo } from "../../../../gguf/readGgufFileInfo.js";
9
9
  import { resolveCommandGgufPath } from "../../../utils/resolveCommandGgufPath.js";
@@ -17,6 +17,7 @@ import { getPrettyBuildGpuName } from "../../../../bindings/consts.js";
17
17
  import { getReadablePath } from "../../../utils/getReadablePath.js";
18
18
  import { withCliCommandDescriptionDocsUrl } from "../../../utils/withCliCommandDescriptionDocsUrl.js";
19
19
  import { documentationPageUrls } from "../../../../config.js";
20
+ import { toBytes } from "../../../utils/toBytes.js";
20
21
  export const InspectMeasureCommand = {
21
22
  command: "measure [modelPath]",
22
23
  describe: withCliCommandDescriptionDocsUrl("Measure VRAM consumption of a GGUF model file with all possible combinations of gpu layers and context sizes", documentationPageUrls.CLI.Inspect.Measure),
@@ -82,6 +83,17 @@ export const InspectMeasureCommand = {
82
83
  type: "number",
83
84
  default: 10,
84
85
  description: "Number of context size measures to take for each gpu layers count"
86
+ })
87
+ .option("memory", {
88
+ type: "string",
89
+ choices: ["vram", "ram", "all"],
90
+ default: "vram",
91
+ description: "Type of memory to measure"
92
+ })
93
+ .option("noMmap", {
94
+ type: "boolean",
95
+ default: false,
96
+ description: "Disable mmap (memory-mapped file) usage"
85
97
  })
86
98
  .option("printHeaderBeforeEachLayer", {
87
99
  alias: "ph",
@@ -101,13 +113,14 @@ export const InspectMeasureCommand = {
101
113
  description: "Number of times to repeat the evaluation text before sending it for evaluation, in order to make it longer"
102
114
  });
103
115
  },
104
- async handler({ modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, measures = 10, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText }) {
116
+ async handler({ modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, measures = 10, memory: measureMemoryType, noMmap, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText }) {
105
117
  if (maxLayers === -1)
106
118
  maxLayers = undefined;
107
119
  if (maxContextSize === -1)
108
120
  maxContextSize = undefined;
109
121
  if (minLayers < 1)
110
122
  minLayers = 1;
123
+ const exitAfterEachMeasurement = measureMemoryType === "ram" || measureMemoryType === "all";
111
124
  const headers = resolveHeaderFlag(headerArg);
112
125
  // ensure a llama build is available
113
126
  const llama = gpu == null
@@ -118,17 +131,29 @@ export const InspectMeasureCommand = {
118
131
  gpu,
119
132
  logLevel: LlamaLogLevel.error
120
133
  });
121
- const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, headers);
134
+ const useMmap = !noMmap && llama.supportsMmap;
135
+ const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, headers, {
136
+ flashAttention, useMmap
137
+ });
122
138
  console.info(`${chalk.yellow("File:")} ${getReadablePath(resolvedGgufPath)}`);
123
139
  console.info(`${chalk.yellow("GPU:")} ${getPrettyBuildGpuName(llama.gpu)}${gpu == null ? chalk.gray(" (last build)") : ""}`);
140
+ console.info(chalk.yellow("mmap:") + " " + (!llama.supportsMmap
141
+ ? "unsupported"
142
+ : useMmap
143
+ ? "enabled"
144
+ : "disabled"));
145
+ if (measureMemoryType === "ram" || measureMemoryType === "all")
146
+ console.warn(chalk.yellow("RAM measurements are greatly inaccurate due to OS optimizations that prevent released memory from being immediately available"));
124
147
  console.info();
125
148
  const ggufMetadata = await readGgufFileInfo(resolvedGgufPath, {
126
149
  sourceType: "filesystem"
127
150
  });
128
151
  const ggufInsights = await GgufInsights.from(ggufMetadata, llama);
129
152
  const totalVram = (await llama.getVramState()).total;
153
+ const totalRam = os.totalmem();
130
154
  let lastGpuLayers = maxLayers ?? ggufInsights.totalLayers;
131
155
  let previousContextSizeCheck = undefined;
156
+ const measureTable = getMeasureTable(measureMemoryType);
132
157
  measureTable.logHeader({ drawRowSeparator: !printHeaderBeforeEachLayer });
133
158
  while (lastGpuLayers >= (minLayers ?? 0)) {
134
159
  let printedAlreadyWithThisProcess = false;
@@ -141,6 +166,7 @@ export const InspectMeasureCommand = {
141
166
  };
142
167
  const done = await measureModel({
143
168
  modelPath: resolvedGgufPath,
169
+ useMmap,
144
170
  gpu: gpu == null
145
171
  ? undefined
146
172
  : llama.gpu,
@@ -154,6 +180,7 @@ export const InspectMeasureCommand = {
154
180
  evaluateText: evaluateText == null
155
181
  ? undefined
156
182
  : evaluateText.repeat(repeatEvaluateText ?? 1),
183
+ exitAfterMeasurement: exitAfterEachMeasurement,
157
184
  onInfo({ gpuLayers, result }) {
158
185
  if (lastGpuLayers !== gpuLayers) {
159
186
  lastGpuLayers = gpuLayers;
@@ -196,26 +223,45 @@ export const InspectMeasureCommand = {
196
223
  else if (result.type === "success") {
197
224
  previousContextSizeCheck = result.contextSize;
198
225
  hadSuccessInThisProcess = true;
199
- const modelVramEstimation = ggufInsights.estimateModelResourceRequirements({ gpuLayers: lastGpuLayers }).gpuVram;
226
+ const modelResourceEstimation = ggufInsights.estimateModelResourceRequirements({
227
+ gpuLayers: lastGpuLayers,
228
+ useMmap
229
+ });
230
+ const modelVramEstimation = modelResourceEstimation.gpuVram;
200
231
  const modelVramEstimationDiffBytes = (modelVramEstimation < result.modelVramUsage ? "-" : "") +
201
- bytes(Math.abs(result.modelVramUsage - modelVramEstimation));
232
+ toBytes(Math.abs(result.modelVramUsage - modelVramEstimation));
202
233
  const modelVramEstimationDiffText = modelVramEstimationDiffBytes.padEnd(9, " ") + " " +
203
234
  padStartAnsi("(" + renderDiffPercentageWithColors(((modelVramEstimation / result.modelVramUsage) - 1) * 100) + ")", 9);
204
- const contextVramEstimation = previousContextSizeCheck == null
235
+ const modelRamEstimation = modelResourceEstimation.cpuRam;
236
+ const modelRamEstimationDiffBytes = (modelRamEstimation < result.modelRamUsage ? "-" : "") +
237
+ toBytes(Math.abs(result.modelRamUsage - modelRamEstimation));
238
+ const modelRamEstimationDiffText = modelRamEstimationDiffBytes.padEnd(9, " ") + " " +
239
+ padStartAnsi("(" + renderDiffPercentageWithColors(((modelRamEstimation / result.modelRamUsage) - 1) * 100) + ")", 9);
240
+ const contextResourceEstimation = previousContextSizeCheck == null
205
241
  ? undefined
206
242
  : ggufInsights.estimateContextResourceRequirements({
207
243
  contextSize: previousContextSizeCheck,
208
244
  modelGpuLayers: lastGpuLayers,
209
245
  flashAttention
210
- }).gpuVram;
246
+ });
247
+ const contextVramEstimation = contextResourceEstimation?.gpuVram;
211
248
  const contextVramEstimationDiffBytes = (result.contextVramUsage == null || contextVramEstimation == null)
212
249
  ? undefined
213
250
  : ((contextVramEstimation < result.contextVramUsage ? "-" : "") +
214
- bytes(Math.abs(result.contextVramUsage - contextVramEstimation)));
251
+ toBytes(Math.abs(result.contextVramUsage - contextVramEstimation)));
215
252
  const contextVramEstimationDiffText = (contextVramEstimation == null || contextVramEstimationDiffBytes == null || result.contextVramUsage == null)
216
253
  ? undefined
217
254
  : (contextVramEstimationDiffBytes.padEnd(9, " ") + " " +
218
255
  padStartAnsi("(" + renderDiffPercentageWithColors(((contextVramEstimation / result.contextVramUsage) - 1) * 100) + ")", 9));
256
+ const contextRamEstimation = contextResourceEstimation?.cpuRam;
257
+ const contextRamEstimationDiffBytes = (result.contextRamUsage == null || contextRamEstimation == null)
258
+ ? undefined
259
+ : ((contextRamEstimation < result.contextRamUsage ? "-" : "") +
260
+ toBytes(Math.abs(result.contextRamUsage - contextRamEstimation)));
261
+ const contextRamEstimationDiffText = (contextRamEstimation == null || contextRamEstimationDiffBytes == null || result.contextRamUsage == null)
262
+ ? undefined
263
+ : (contextRamEstimationDiffBytes.padEnd(9, " ") + " " +
264
+ padStartAnsi("(" + renderDiffPercentageWithColors(((contextRamEstimation / result.contextRamUsage) - 1) * 100) + ")", 9));
219
265
  measureTable.logLine({
220
266
  newProcess: getNewProccessValue(),
221
267
  type: previousContextSizeCheck == null
@@ -225,18 +271,30 @@ export const InspectMeasureCommand = {
225
271
  contextSize: previousContextSizeCheck != null
226
272
  ? String(previousContextSizeCheck)
227
273
  : undefined,
228
- estimatedModelVram: bytes(modelVramEstimation),
229
- actualModelVram: bytes(result.modelVramUsage),
230
- modelEstimationDiff: modelVramEstimationDiffText,
274
+ estimatedModelVram: toBytes(modelVramEstimation),
275
+ actualModelVram: toBytes(result.modelVramUsage),
276
+ modelVramEstimationDiff: modelVramEstimationDiffText,
277
+ estimatedModelRam: toBytes(modelRamEstimation),
278
+ actualModelRam: toBytes(result.modelRamUsage),
279
+ modelRamEstimationDiff: modelRamEstimationDiffText,
231
280
  estimatedContextVram: contextVramEstimation == null
232
281
  ? undefined
233
- : bytes(contextVramEstimation),
282
+ : toBytes(contextVramEstimation),
234
283
  actualContextVram: result.contextVramUsage == null
235
284
  ? undefined
236
- : bytes(result.contextVramUsage),
237
- contextEstimationDiff: contextVramEstimationDiffText,
285
+ : toBytes(result.contextVramUsage),
286
+ contextVramEstimationDiff: contextVramEstimationDiffText,
238
287
  totalVramUsage: ((result.totalVramUsage / totalVram) * 100).toFixed(2).padStart(5, "0") + "% " +
239
- chalk.gray("(" + bytes(result.totalVramUsage) + "/" + bytes(totalVram) + ")")
288
+ chalk.gray("(" + toBytes(result.totalVramUsage) + "/" + toBytes(totalVram) + ")"),
289
+ estimatedContextRam: contextRamEstimation == null
290
+ ? undefined
291
+ : toBytes(contextRamEstimation),
292
+ actualContextRam: result.contextRamUsage == null
293
+ ? undefined
294
+ : toBytes(result.contextRamUsage),
295
+ contextRamEstimationDiff: contextRamEstimationDiffText,
296
+ totalRamUsage: ((result.totalRamUsage / totalRam) * 100).toFixed(2).padStart(5, "0") + "% " +
297
+ chalk.gray("(" + toBytes(result.totalRamUsage) + "/" + toBytes(totalRam) + ")")
240
298
  });
241
299
  }
242
300
  }
@@ -246,55 +304,100 @@ export const InspectMeasureCommand = {
246
304
  }
247
305
  }
248
306
  };
249
- const measureTable = new ConsoleTable([{
250
- key: "newProcess",
251
- title: " ",
252
- width: 1
253
- }, {
254
- key: "type",
255
- title: "Type",
256
- width: Math.max("Type".length, "Model".length, "Context".length),
257
- canSpanOverEmptyColumns: true
258
- }, {
259
- key: "gpuLayers",
260
- title: "Layers",
261
- width: "Layers".length,
262
- canSpanOverEmptyColumns: true
263
- }, {
264
- key: "contextSize",
265
- title: "Context size",
266
- width: "Context size".length,
267
- canSpanOverEmptyColumns: true
268
- }, {
269
- key: "estimatedModelVram",
270
- title: "Estimated model VRAM",
271
- width: "Estimated model VRAM".length,
272
- canSpanOverEmptyColumns: true
273
- }, {
274
- key: "actualModelVram",
275
- title: "Model VRAM",
276
- width: "Model VRAM".length
277
- }, {
278
- key: "modelEstimationDiff",
279
- title: "Diff",
280
- width: Math.max("Diff".length, 9 + 1 + 9)
281
- }, {
282
- key: "estimatedContextVram",
283
- title: "Estimated context VRAM",
284
- width: "Estimated context VRAM".length
285
- }, {
286
- key: "actualContextVram",
287
- title: "Context VRAM",
288
- width: "Context VRAM".length
289
- }, {
290
- key: "contextEstimationDiff",
291
- title: "Diff",
292
- width: Math.max("Diff".length, 9 + 1 + 9)
293
- }, {
294
- key: "totalVramUsage",
295
- title: "VRAM usage",
296
- width: Math.max("VRAM usage".length, 8 + 1 + 8 + 1 + 8)
297
- }]);
307
+ function getMeasureTable(memoryType) {
308
+ return new ConsoleTable([{
309
+ key: "newProcess",
310
+ title: " ",
311
+ width: 1
312
+ }, {
313
+ key: "type",
314
+ title: "Type",
315
+ width: Math.max("Type".length, "Model".length, "Context".length),
316
+ canSpanOverEmptyColumns: true
317
+ }, {
318
+ key: "gpuLayers",
319
+ title: "Layers",
320
+ width: "Layers".length,
321
+ canSpanOverEmptyColumns: true
322
+ }, {
323
+ key: "contextSize",
324
+ title: "Context size",
325
+ width: "Context size".length,
326
+ canSpanOverEmptyColumns: true
327
+ }, {
328
+ key: "estimatedModelVram",
329
+ visible: memoryType === "vram" || memoryType === "all",
330
+ title: "Estimated model VRAM",
331
+ width: "Estimated model VRAM".length,
332
+ canSpanOverEmptyColumns: true
333
+ }, {
334
+ key: "actualModelVram",
335
+ visible: memoryType === "vram" || memoryType === "all",
336
+ title: "Model VRAM",
337
+ width: "Model VRAM".length
338
+ }, {
339
+ key: "modelVramEstimationDiff",
340
+ visible: memoryType === "vram" || memoryType === "all",
341
+ title: "Diff",
342
+ width: Math.max("Diff".length, 9 + 1 + 9)
343
+ }, {
344
+ key: "estimatedModelRam",
345
+ visible: memoryType === "ram" || memoryType === "all",
346
+ title: "Estimated model RAM",
347
+ width: "Estimated model RAM".length,
348
+ canSpanOverEmptyColumns: true
349
+ }, {
350
+ key: "actualModelRam",
351
+ visible: memoryType === "ram" || memoryType === "all",
352
+ title: "Model RAM",
353
+ width: "Model RAM".length
354
+ }, {
355
+ key: "modelRamEstimationDiff",
356
+ visible: memoryType === "ram" || memoryType === "all",
357
+ title: "Diff",
358
+ width: Math.max("Diff".length, 9 + 1 + 9)
359
+ }, {
360
+ key: "estimatedContextVram",
361
+ visible: memoryType === "vram" || memoryType === "all",
362
+ title: "Estimated context VRAM",
363
+ width: "Estimated context VRAM".length
364
+ }, {
365
+ key: "actualContextVram",
366
+ visible: memoryType === "vram" || memoryType === "all",
367
+ title: "Context VRAM",
368
+ width: "Context VRAM".length
369
+ }, {
370
+ key: "contextVramEstimationDiff",
371
+ visible: memoryType === "vram" || memoryType === "all",
372
+ title: "Diff",
373
+ width: Math.max("Diff".length, 9 + 1 + 9)
374
+ }, {
375
+ key: "totalVramUsage",
376
+ visible: memoryType === "vram" || memoryType === "all",
377
+ title: "VRAM usage",
378
+ width: Math.max("VRAM usage".length, 8 + 1 + 8 + 1 + 8)
379
+ }, {
380
+ key: "estimatedContextRam",
381
+ visible: memoryType === "ram" || memoryType === "all",
382
+ title: "Estimated context RAM",
383
+ width: "Estimated context RAM".length
384
+ }, {
385
+ key: "actualContextRam",
386
+ visible: memoryType === "ram" || memoryType === "all",
387
+ title: "Context RAM",
388
+ width: "Context RAM".length
389
+ }, {
390
+ key: "contextRamEstimationDiff",
391
+ visible: memoryType === "ram" || memoryType === "all",
392
+ title: "Diff",
393
+ width: Math.max("Diff".length, 9 + 1 + 9)
394
+ }, {
395
+ key: "totalRamUsage",
396
+ visible: memoryType === "ram" || memoryType === "all",
397
+ title: "RAM usage",
398
+ width: Math.max("RAM usage".length, 8 + 1 + 8 + 1 + 8)
399
+ }]);
400
+ }
298
401
  function renderDiffPercentageWithColors(percentage, { greenBright = 2, green = 6, yellow = 10, yellowBright = 14 } = {}) {
299
402
  const percentageText = percentage.toFixed(2).padStart(5, "0") + "%";
300
403
  const absPercentage = Math.abs(percentage);
@@ -311,7 +414,7 @@ function renderDiffPercentageWithColors(percentage, { greenBright = 2, green = 6
311
414
  const __filename = fileURLToPath(import.meta.url);
312
415
  const detectedFileName = path.basename(__filename);
313
416
  const expectedFileName = "InspectMeasureCommand";
314
- async function measureModel({ modelPath, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, flashAttention, evaluateText, onInfo }) {
417
+ async function measureModel({ modelPath, useMmap, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, flashAttention, evaluateText, exitAfterMeasurement = false, onInfo }) {
315
418
  if (!detectedFileName.startsWith(expectedFileName)) {
316
419
  console.warn(getConsoleLogPrefix() +
317
420
  `"${expectedFileName}.js" file is not independent, so running sub-process tests cannot be done with it\n` +
@@ -331,6 +434,7 @@ async function measureModel({ modelPath, gpu, tests, initialMaxContextSize, maxC
331
434
  }
332
435
  });
333
436
  let isPlannedExit = false;
437
+ let isDone = false;
334
438
  let forkSucceeded = false;
335
439
  let timeoutHandle = null;
336
440
  const processCreationTimeout = 1000 * 60 * 5;
@@ -363,8 +467,8 @@ async function measureModel({ modelPath, gpu, tests, initialMaxContextSize, maxC
363
467
  function done() {
364
468
  if (!forkSucceeded)
365
469
  reject(new Error(`Measuring a model failed to run a sub-process via file "${__filename}"`));
366
- else
367
- resolve(isPlannedExit);
470
+ else if (isPlannedExit)
471
+ resolve(isPlannedExit && isDone);
368
472
  cleanup();
369
473
  }
370
474
  subProcess.on("message", (message) => {
@@ -373,6 +477,7 @@ async function measureModel({ modelPath, gpu, tests, initialMaxContextSize, maxC
373
477
  subProcess.send({
374
478
  type: "start",
375
479
  modelPath,
480
+ useMmap,
376
481
  tests,
377
482
  initialMaxContextSize,
378
483
  maxContextSize,
@@ -380,7 +485,8 @@ async function measureModel({ modelPath, gpu, tests, initialMaxContextSize, maxC
380
485
  maxGpuLayers,
381
486
  minGpuLayers,
382
487
  flashAttention,
383
- evaluateText
488
+ evaluateText,
489
+ exitAfterMeasurement
384
490
  });
385
491
  if (timeoutHandle != null) {
386
492
  clearTimeout(timeoutHandle);
@@ -388,6 +494,11 @@ async function measureModel({ modelPath, gpu, tests, initialMaxContextSize, maxC
388
494
  }
389
495
  }
390
496
  else if (message.type === "done") {
497
+ isPlannedExit = true;
498
+ isDone = true;
499
+ subProcess.send({ type: "exit" });
500
+ }
501
+ else if (message.type === "exit") {
391
502
  isPlannedExit = true;
392
503
  subProcess.send({ type: "exit" });
393
504
  }
@@ -409,10 +520,13 @@ async function measureModel({ modelPath, gpu, tests, initialMaxContextSize, maxC
409
520
  result: {
410
521
  type: "success",
411
522
  modelVramUsage: message.modelVramUsage,
523
+ modelRamUsage: message.modelRamUsage,
412
524
  contextSize: message.contextSize,
413
525
  contextVramUsage: message.contextVramUsage,
526
+ contextRamUsage: message.contextRamUsage,
414
527
  contextStateSize: message.contextStateSize,
415
- totalVramUsage: message.totalVramUsage
528
+ totalVramUsage: message.totalVramUsage,
529
+ totalRamUsage: message.totalRamUsage
416
530
  }
417
531
  });
418
532
  }
@@ -462,7 +576,8 @@ async function runTestWorkerLogic() {
462
576
  process.exit(1);
463
577
  process.send(info);
464
578
  }
465
- async function testContextSizes({ model, modelVramUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, evaluateText }) {
579
+ async function testContextSizes({ model, modelVramUsage, modelRamUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, evaluateText, exitAfterMeasurement = false }) {
580
+ let measurementsDone = 0;
466
581
  const contextSizeCheckPlan = getContextSizesCheckPlan(maxContextSize != null
467
582
  ? Math.min(model.trainContextSize, maxContextSize)
468
583
  : model.trainContextSize, tests, minContextSize);
@@ -474,6 +589,7 @@ async function runTestWorkerLogic() {
474
589
  currentContextSizeCheck = null;
475
590
  try {
476
591
  const preContextVramUsage = (await llama.getVramState()).used;
592
+ const preContextRamUsage = getMemoryUsage(llama);
477
593
  const context = await model.createContext({
478
594
  contextSize: currentContextSizeCheck ?? (maxContextSize != null
479
595
  ? { max: maxContextSize }
@@ -487,14 +603,19 @@ async function runTestWorkerLogic() {
487
603
  await sequence.evaluateWithoutGeneratingNewTokens(model.tokenize(evaluateText));
488
604
  }
489
605
  const postContextVramUsage = (await llama.getVramState()).used;
606
+ const postContextRamUsage = getMemoryUsage(llama);
607
+ measurementsDone++;
490
608
  sendInfoBack({
491
609
  type: "stats",
492
610
  gpuLayers: model.gpuLayers,
493
611
  modelVramUsage,
612
+ modelRamUsage,
494
613
  contextSize: context.contextSize,
495
614
  contextVramUsage: postContextVramUsage - preContextVramUsage,
615
+ contextRamUsage: postContextRamUsage - preContextRamUsage,
496
616
  contextStateSize: context.stateSize,
497
- totalVramUsage: postContextVramUsage
617
+ totalVramUsage: postContextVramUsage,
618
+ totalRamUsage: postContextRamUsage
498
619
  });
499
620
  currentContextSizeCheck = context.contextSize;
500
621
  await context.dispose();
@@ -514,35 +635,46 @@ async function runTestWorkerLogic() {
514
635
  }
515
636
  }
516
637
  currentContextSizeCheck = getNextItemInCheckContextSizesPlan(contextSizeCheckPlan, currentContextSizeCheck);
638
+ if (exitAfterMeasurement)
639
+ return measurementsDone;
517
640
  }
641
+ return measurementsDone;
518
642
  }
519
- async function testWithGpuLayers({ modelPath, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, evaluateText }) {
643
+ async function testWithGpuLayers({ modelPath, useMmap, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, evaluateText, exitAfterMeasurement = false }) {
520
644
  try {
521
645
  const preModelVramUsage = (await llama.getVramState()).used;
646
+ const preModelRamUsage = getMemoryUsage(llama);
522
647
  const model = await llama.loadModel({
523
648
  modelPath,
649
+ useMmap,
524
650
  gpuLayers,
525
651
  defaultContextFlashAttention: flashAttention,
526
652
  ignoreMemorySafetyChecks: true
527
653
  });
528
654
  const postModelVramUsage = (await llama.getVramState()).used;
655
+ const postModelRamUsage = getMemoryUsage(llama);
529
656
  sendInfoBack({
530
657
  type: "stats",
531
658
  gpuLayers: model.gpuLayers,
532
659
  modelVramUsage: postModelVramUsage - preModelVramUsage,
533
- totalVramUsage: postModelVramUsage
660
+ modelRamUsage: postModelRamUsage - preModelRamUsage,
661
+ totalVramUsage: postModelVramUsage,
662
+ totalRamUsage: postModelRamUsage
534
663
  });
535
- await testContextSizes({
664
+ const measurementsDone = await testContextSizes({
536
665
  model,
537
666
  modelVramUsage: postModelVramUsage - preModelVramUsage,
667
+ modelRamUsage: postModelRamUsage - preModelRamUsage,
538
668
  startContextSize,
539
669
  maxContextSize,
540
670
  minContextSize,
541
671
  flashAttention,
542
672
  tests,
543
- evaluateText
673
+ evaluateText,
674
+ exitAfterMeasurement
544
675
  });
545
676
  await model.dispose();
677
+ return measurementsDone;
546
678
  }
547
679
  catch (err) {
548
680
  sendInfoBack({
@@ -551,12 +683,23 @@ async function runTestWorkerLogic() {
551
683
  gpuLayers: gpuLayers
552
684
  });
553
685
  }
686
+ return 0;
554
687
  }
555
688
  process.on("message", async (message) => {
556
689
  if (message.type === "start") {
557
690
  for (let gpuLayers = message.maxGpuLayers; gpuLayers >= (message.minGpuLayers ?? 0); gpuLayers--) {
558
- await testWithGpuLayers({
691
+ if (gpuLayers == message.maxGpuLayers && message.initialMaxContextSize != null) {
692
+ const ggufInsights = await GgufInsights.from(await readGgufFileInfo(message.modelPath), llama);
693
+ const contextSizeCheckPlan = getContextSizesCheckPlan(message.maxContextSize != null
694
+ ? Math.min(ggufInsights.trainContextSize ?? 4096, message.maxContextSize)
695
+ : ggufInsights.trainContextSize ?? 4096, message.tests, message.minContextSize);
696
+ const firstContextSizeCheck = getNextItemInCheckContextSizesPlan(contextSizeCheckPlan, message.initialMaxContextSize);
697
+ if (firstContextSizeCheck == null)
698
+ continue;
699
+ }
700
+ const measurementsDone = await testWithGpuLayers({
559
701
  modelPath: message.modelPath,
702
+ useMmap: message.useMmap,
560
703
  gpuLayers,
561
704
  tests: message.tests,
562
705
  startContextSize: gpuLayers == message.maxGpuLayers
@@ -565,8 +708,13 @@ async function runTestWorkerLogic() {
565
708
  maxContextSize: message.maxContextSize,
566
709
  minContextSize: message.minContextSize,
567
710
  flashAttention: message.flashAttention,
568
- evaluateText: message.evaluateText
711
+ evaluateText: message.evaluateText,
712
+ exitAfterMeasurement: message.exitAfterMeasurement
569
713
  });
714
+ if (measurementsDone > 0 && message.exitAfterMeasurement) {
715
+ sendInfoBack({ type: "exit" });
716
+ return;
717
+ }
570
718
  }
571
719
  sendInfoBack({ type: "done" });
572
720
  }
@@ -626,4 +774,13 @@ function padStartAnsi(text, length, padChar = " ") {
626
774
  const textWithoutAnsi = stripAnsi(text);
627
775
  return padChar.repeat(Math.max(0, length - textWithoutAnsi.length)) + text;
628
776
  }
777
+ function getMemoryUsage(llama) {
778
+ const totalMemoryUsage = llama._bindings.getMemoryInfo().total;
779
+ const vramUsage = llama._bindings.getGpuVramInfo();
780
+ let memoryUsage = totalMemoryUsage;
781
+ const unifiedMemoryVramUsage = Math.min(vramUsage.unifiedSize, vramUsage.used);
782
+ if (unifiedMemoryVramUsage <= memoryUsage)
783
+ memoryUsage -= unifiedMemoryVramUsage;
784
+ return memoryUsage;
785
+ }
629
786
  //# sourceMappingURL=InspectMeasureCommand.js.map