npm - node-llama-cpp - Versions diffs - 3.3.2 → 3.4.1 - Mend

node-llama-cpp 3.3.2 → 3.4.1

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (199) hide show

package/README.md +3 -2
package/dist/bindings/AddonTypes.d.ts +12 -4
package/dist/bindings/Llama.d.ts +9 -0
package/dist/bindings/Llama.js +52 -28
package/dist/bindings/Llama.js.map +1 -1
package/dist/bindings/getLlama.d.ts +2 -1
package/dist/bindings/getLlama.js +19 -9
package/dist/bindings/getLlama.js.map +1 -1
package/dist/bindings/utils/asyncSome.js +2 -0
package/dist/bindings/utils/asyncSome.js.map +1 -1
package/dist/bindings/utils/compileLLamaCpp.d.ts +1 -1
package/dist/bindings/utils/compileLLamaCpp.js +115 -34
package/dist/bindings/utils/compileLLamaCpp.js.map +1 -1
package/dist/bindings/utils/detectAvailableComputeLayers.d.ts +1 -0
package/dist/bindings/utils/detectAvailableComputeLayers.js +4 -4
package/dist/bindings/utils/detectAvailableComputeLayers.js.map +1 -1
package/dist/bindings/utils/detectBuildTools.d.ts +14 -0
package/dist/bindings/utils/detectBuildTools.js +149 -0
package/dist/bindings/utils/detectBuildTools.js.map +1 -0
package/dist/bindings/utils/resolveActualBindingBinaryPath.d.ts +1 -0
package/dist/bindings/utils/resolveActualBindingBinaryPath.js +18 -0
package/dist/bindings/utils/resolveActualBindingBinaryPath.js.map +1 -0
package/dist/bindings/utils/testBindingBinary.d.ts +1 -1
package/dist/bindings/utils/testBindingBinary.js +58 -5
package/dist/bindings/utils/testBindingBinary.js.map +1 -1
package/dist/chatWrappers/AlpacaChatWrapper.d.ts +4 -0
package/dist/chatWrappers/AlpacaChatWrapper.js +4 -0
package/dist/chatWrappers/AlpacaChatWrapper.js.map +1 -1
package/dist/chatWrappers/FalconChatWrapper.d.ts +4 -0
package/dist/chatWrappers/FalconChatWrapper.js +4 -0
package/dist/chatWrappers/FalconChatWrapper.js.map +1 -1
package/dist/chatWrappers/GeneralChatWrapper.d.ts +4 -0
package/dist/chatWrappers/GeneralChatWrapper.js +4 -0
package/dist/chatWrappers/GeneralChatWrapper.js.map +1 -1
package/dist/chatWrappers/utils/resolveChatWrapper.d.ts +2 -0
package/dist/chatWrappers/utils/resolveChatWrapper.js +8 -27
package/dist/chatWrappers/utils/resolveChatWrapper.js.map +1 -1
package/dist/cli/commands/ChatCommand.d.ts +4 -0
package/dist/cli/commands/ChatCommand.js +158 -13
package/dist/cli/commands/ChatCommand.js.map +1 -1
package/dist/cli/commands/CompleteCommand.d.ts +4 -0
package/dist/cli/commands/CompleteCommand.js +143 -10
package/dist/cli/commands/CompleteCommand.js.map +1 -1
package/dist/cli/commands/DebugCommand.js +5 -5
package/dist/cli/commands/DebugCommand.js.map +1 -1
package/dist/cli/commands/InfillCommand.d.ts +4 -0
package/dist/cli/commands/InfillCommand.js +142 -10
package/dist/cli/commands/InfillCommand.js.map +1 -1
package/dist/cli/commands/OnPostInstallCommand.js +12 -2
package/dist/cli/commands/OnPostInstallCommand.js.map +1 -1
package/dist/cli/commands/inspect/commands/InspectEstimateCommand.d.ts +1 -0
package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js +14 -7
package/dist/cli/commands/inspect/commands/InspectEstimateCommand.js.map +1 -1
package/dist/cli/commands/inspect/commands/InspectGgufCommand.js +13 -3
package/dist/cli/commands/inspect/commands/InspectGgufCommand.js.map +1 -1
package/dist/cli/commands/inspect/commands/InspectGpuCommand.js +20 -10
package/dist/cli/commands/inspect/commands/InspectGpuCommand.js.map +1 -1
package/dist/cli/commands/inspect/commands/InspectMeasureCommand.d.ts +2 -0
package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js +234 -77
package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js.map +1 -1
package/dist/cli/recommendedModels.js +11 -1
package/dist/cli/recommendedModels.js.map +1 -1
package/dist/cli/utils/ConsoleTable.d.ts +1 -0
package/dist/cli/utils/ConsoleTable.js +5 -1
package/dist/cli/utils/ConsoleTable.js.map +1 -1
package/dist/cli/utils/interactivelyAskForModel.d.ts +2 -1
package/dist/cli/utils/interactivelyAskForModel.js +16 -13
package/dist/cli/utils/interactivelyAskForModel.js.map +1 -1
package/dist/cli/utils/isRunningUnderRosetta.d.ts +1 -0
package/dist/cli/utils/isRunningUnderRosetta.js +20 -0
package/dist/cli/utils/isRunningUnderRosetta.js.map +1 -0
package/dist/cli/utils/printCommonInfoLines.d.ts +4 -2
package/dist/cli/utils/printCommonInfoLines.js +67 -5
package/dist/cli/utils/printCommonInfoLines.js.map +1 -1
package/dist/cli/utils/resolveCommandGgufPath.d.ts +3 -1
package/dist/cli/utils/resolveCommandGgufPath.js +6 -5
package/dist/cli/utils/resolveCommandGgufPath.js.map +1 -1
package/dist/cli/utils/toBytes.d.ts +1 -0
package/dist/cli/utils/toBytes.js +5 -0
package/dist/cli/utils/toBytes.js.map +1 -0
package/dist/config.d.ts +3 -0
package/dist/config.js +3 -0
package/dist/config.js.map +1 -1
package/dist/evaluator/LlamaChat/LlamaChat.d.ts +12 -3
package/dist/evaluator/LlamaChat/LlamaChat.js +21 -7
package/dist/evaluator/LlamaChat/LlamaChat.js.map +1 -1
package/dist/evaluator/LlamaChatSession/LlamaChatSession.d.ts +6 -2
package/dist/evaluator/LlamaChatSession/LlamaChatSession.js +3 -0
package/dist/evaluator/LlamaChatSession/LlamaChatSession.js.map +1 -1
package/dist/evaluator/LlamaCompletion.d.ts +3 -0
package/dist/evaluator/LlamaCompletion.js +5 -0
package/dist/evaluator/LlamaCompletion.js.map +1 -1
package/dist/evaluator/LlamaContext/LlamaContext.d.ts +81 -38
package/dist/evaluator/LlamaContext/LlamaContext.js +678 -132
package/dist/evaluator/LlamaContext/LlamaContext.js.map +1 -1
package/dist/evaluator/LlamaContext/TokenPredictor.d.ts +55 -0
package/dist/evaluator/LlamaContext/TokenPredictor.js +20 -0
package/dist/evaluator/LlamaContext/TokenPredictor.js.map +1 -0
package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.d.ts +56 -0
package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js +266 -0
package/dist/evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js.map +1 -0
package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.d.ts +58 -0
package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js +138 -0
package/dist/evaluator/LlamaContext/tokenPredictors/InputLookupTokenPredictor.js.map +1 -0
package/dist/evaluator/LlamaContext/types.d.ts +198 -5
package/dist/evaluator/LlamaEmbeddingContext.d.ts +3 -0
package/dist/evaluator/LlamaEmbeddingContext.js +3 -0
package/dist/evaluator/LlamaEmbeddingContext.js.map +1 -1
package/dist/evaluator/LlamaGrammar.d.ts +7 -1
package/dist/evaluator/LlamaGrammar.js +6 -0
package/dist/evaluator/LlamaGrammar.js.map +1 -1
package/dist/evaluator/LlamaGrammarEvaluationState.d.ts +4 -4
package/dist/evaluator/LlamaGrammarEvaluationState.js +16 -8
package/dist/evaluator/LlamaGrammarEvaluationState.js.map +1 -1
package/dist/evaluator/LlamaJsonSchemaGrammar.d.ts +5 -0
package/dist/evaluator/LlamaJsonSchemaGrammar.js +7 -0
package/dist/evaluator/LlamaJsonSchemaGrammar.js.map +1 -1
package/dist/evaluator/LlamaModel/LlamaModel.d.ts +19 -11
package/dist/evaluator/LlamaModel/LlamaModel.js +23 -29
package/dist/evaluator/LlamaModel/LlamaModel.js.map +1 -1
package/dist/evaluator/LlamaRankingContext.d.ts +76 -0
package/dist/evaluator/LlamaRankingContext.js +158 -0
package/dist/evaluator/LlamaRankingContext.js.map +1 -0
package/dist/evaluator/TokenBias.d.ts +3 -0
package/dist/evaluator/TokenBias.js +3 -0
package/dist/evaluator/TokenBias.js.map +1 -1
package/dist/evaluator/utils/chunkDocument.d.ts +86 -0
package/dist/evaluator/utils/chunkDocument.js +212 -0
package/dist/evaluator/utils/chunkDocument.js.map +1 -0
package/dist/gguf/insights/GgufInsights.d.ts +3 -1
package/dist/gguf/insights/GgufInsights.js +114 -8
package/dist/gguf/insights/GgufInsights.js.map +1 -1
package/dist/gguf/insights/GgufInsightsConfigurationResolver.d.ts +6 -3
package/dist/gguf/insights/GgufInsightsConfigurationResolver.js +11 -7
package/dist/gguf/insights/GgufInsightsConfigurationResolver.js.map +1 -1
package/dist/gguf/insights/utils/resolveModelGpuLayersOption.d.ts +2 -1
package/dist/gguf/insights/utils/resolveModelGpuLayersOption.js +13 -7
package/dist/gguf/insights/utils/resolveModelGpuLayersOption.js.map +1 -1
package/dist/gguf/parser/GgufV2Parser.js +29 -8
package/dist/gguf/parser/GgufV2Parser.js.map +1 -1
package/dist/gguf/parser/parseGguf.js +11 -11
package/dist/gguf/parser/parseGguf.js.map +1 -1
package/dist/gguf/readGgufFileInfo.js +8 -3
package/dist/gguf/readGgufFileInfo.js.map +1 -1
package/dist/gguf/types/GgufFileInfoTypes.d.ts +1 -0
package/dist/gguf/types/GgufMetadataTypes.d.ts +9 -9
package/dist/gguf/types/GgufMetadataTypes.js +1 -1
package/dist/gguf/types/GgufMetadataTypes.js.map +1 -1
package/dist/gguf/types/GgufTensorInfoTypes.d.ts +13 -0
package/dist/gguf/types/GgufTensorInfoTypes.js.map +1 -1
package/dist/index.d.ts +7 -2
package/dist/index.js +6 -1
package/dist/index.js.map +1 -1
package/dist/tsconfig.tsbuildinfo +1 -1
package/dist/utils/LlamaText.d.ts +4 -1
package/dist/utils/LlamaText.js +4 -1
package/dist/utils/LlamaText.js.map +1 -1
package/dist/utils/cmake.js +23 -0
package/dist/utils/cmake.js.map +1 -1
package/dist/utils/pushAll.d.ts +1 -1
package/dist/utils/pushAll.js.map +1 -1
package/dist/utils/tokenizerUtils.js +1 -1
package/dist/utils/utilTypes.d.ts +5 -0
package/llama/CMakeLists.txt +25 -8
package/llama/addon/AddonContext.cpp +196 -22
package/llama/addon/AddonContext.h +1 -0
package/llama/addon/AddonGrammar.cpp +1 -4
package/llama/addon/AddonGrammarEvaluationState.cpp +16 -5
package/llama/addon/AddonModel.cpp +31 -39
package/llama/addon/AddonModel.h +1 -1
package/llama/addon/AddonModelLora.cpp +2 -2
package/llama/addon/AddonModelLora.h +1 -1
package/llama/addon/AddonSampler.cpp +7 -12
package/llama/addon/addon.cpp +26 -7
package/llama/addon/globals/getGpuInfo.cpp +30 -5
package/llama/addon/globals/getGpuInfo.h +6 -1
package/llama/addon/globals/getMemoryInfo.cpp +63 -0
package/llama/addon/globals/getMemoryInfo.h +4 -0
package/llama/binariesGithubRelease.json +1 -1
package/llama/cmake/win32.ensureNinjaPath.cmake +68 -0
package/llama/cmake/win32.ensureNodeLib.cmake +34 -0
package/llama/cmake/win32.llvmApplyGnuModeAdaptations.cmake +12 -0
package/llama/cmake/win32.llvmEnsureCmakeAr.cmake +37 -0
package/llama/cmake/win32.llvmUseGnuModeCompilers.cmake +87 -0
package/llama/cmake/win32.programFilesPaths.cmake +35 -0
package/llama/gitRelease.bundle +0 -0
package/llama/gpuInfo/vulkan-gpu-info.cpp +29 -2
package/llama/gpuInfo/vulkan-gpu-info.h +1 -0
package/llama/llama.cpp.info.json +1 -1
package/llama/profiles/llvm.win32.host-arm64.target-arm64.cmake +14 -0
package/llama/profiles/llvm.win32.host-x64.target-arm64.cmake +14 -0
package/llama/profiles/llvm.win32.host-x64.target-x64.cmake +14 -0
package/llama/toolchains/llvm.win32.host-x64.target-x64.cmake +20 -0
package/llama/toolchains/win32.host-arm64.target-arm64.cmake +21 -0
package/llama/toolchains/win32.host-x64.target-arm64.cmake +14 -34
package/package.json +47 -44
package/templates/README.md +1 -1
package/templates/packed/electron-typescript-react.json +1 -1
package/templates/packed/node-typescript.json +1 -1

package/dist/cli/commands/inspect/commands/InspectMeasureCommand.js CHANGED Viewed

@@ -2,8 +2,8 @@ import path from "path";
 import process from "process";
 import { fileURLToPath } from "url";
 import { fork } from "node:child_process";
+import os from "os";
 import chalk from "chalk";
-import bytes from "bytes";
 import stripAnsi from "strip-ansi";
 import { readGgufFileInfo } from "../../../../gguf/readGgufFileInfo.js";
 import { resolveCommandGgufPath } from "../../../utils/resolveCommandGgufPath.js";
@@ -17,6 +17,7 @@ import { getPrettyBuildGpuName } from "../../../../bindings/consts.js";
 import { getReadablePath } from "../../../utils/getReadablePath.js";
 import { withCliCommandDescriptionDocsUrl } from "../../../utils/withCliCommandDescriptionDocsUrl.js";
 import { documentationPageUrls } from "../../../../config.js";
+import { toBytes } from "../../../utils/toBytes.js";
 export const InspectMeasureCommand = {
     command: "measure [modelPath]",
     describe: withCliCommandDescriptionDocsUrl("Measure VRAM consumption of a GGUF model file with all possible combinations of gpu layers and context sizes", documentationPageUrls.CLI.Inspect.Measure),
@@ -82,6 +83,17 @@ export const InspectMeasureCommand = {
             type: "number",
             default: 10,
             description: "Number of context size measures to take for each gpu layers count"
+        })
+            .option("memory", {
+            type: "string",
+            choices: ["vram", "ram", "all"],
+            default: "vram",
+            description: "Type of memory to measure"
+        })
+            .option("noMmap", {
+            type: "boolean",
+            default: false,
+            description: "Disable mmap (memory-mapped file) usage"
         })
             .option("printHeaderBeforeEachLayer", {
             alias: "ph",
@@ -101,13 +113,14 @@ export const InspectMeasureCommand = {
             description: "Number of times to repeat the evaluation text before sending it for evaluation, in order to make it longer"
         });
     },
-    async handler({ modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, measures = 10, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText }) {
+    async handler({ modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, measures = 10, memory: measureMemoryType, noMmap, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText }) {
         if (maxLayers === -1)
             maxLayers = undefined;
         if (maxContextSize === -1)
             maxContextSize = undefined;
         if (minLayers < 1)
             minLayers = 1;
+        const exitAfterEachMeasurement = measureMemoryType === "ram" || measureMemoryType === "all";
         const headers = resolveHeaderFlag(headerArg);
         // ensure a llama build is available
         const llama = gpu == null
@@ -118,17 +131,29 @@ export const InspectMeasureCommand = {
                 gpu,
                 logLevel: LlamaLogLevel.error
             });
-        const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, headers);
+        const useMmap = !noMmap && llama.supportsMmap;
+        const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, headers, {
+            flashAttention, useMmap
+        });
         console.info(`${chalk.yellow("File:")} ${getReadablePath(resolvedGgufPath)}`);
         console.info(`${chalk.yellow("GPU:")} ${getPrettyBuildGpuName(llama.gpu)}${gpu == null ? chalk.gray(" (last build)") : ""}`);
+        console.info(chalk.yellow("mmap:") + " " + (!llama.supportsMmap
+            ? "unsupported"
+            : useMmap
+                ? "enabled"
+                : "disabled"));
+        if (measureMemoryType === "ram" || measureMemoryType === "all")
+            console.warn(chalk.yellow("RAM measurements are greatly inaccurate due to OS optimizations that prevent released memory from being immediately available"));
         console.info();
         const ggufMetadata = await readGgufFileInfo(resolvedGgufPath, {
             sourceType: "filesystem"
         });
         const ggufInsights = await GgufInsights.from(ggufMetadata, llama);
         const totalVram = (await llama.getVramState()).total;
+        const totalRam = os.totalmem();
         let lastGpuLayers = maxLayers ?? ggufInsights.totalLayers;
         let previousContextSizeCheck = undefined;
+        const measureTable = getMeasureTable(measureMemoryType);
         measureTable.logHeader({ drawRowSeparator: !printHeaderBeforeEachLayer });
         while (lastGpuLayers >= (minLayers ?? 0)) {
             let printedAlreadyWithThisProcess = false;
@@ -141,6 +166,7 @@ export const InspectMeasureCommand = {
             };
             const done = await measureModel({
                 modelPath: resolvedGgufPath,
+                useMmap,
                 gpu: gpu == null
                     ? undefined
                     : llama.gpu,
@@ -154,6 +180,7 @@ export const InspectMeasureCommand = {
                 evaluateText: evaluateText == null
                     ? undefined
                     : evaluateText.repeat(repeatEvaluateText ?? 1),
+                exitAfterMeasurement: exitAfterEachMeasurement,
                 onInfo({ gpuLayers, result }) {
                     if (lastGpuLayers !== gpuLayers) {
                         lastGpuLayers = gpuLayers;
@@ -196,26 +223,45 @@ export const InspectMeasureCommand = {
                     else if (result.type === "success") {
                         previousContextSizeCheck = result.contextSize;
                         hadSuccessInThisProcess = true;
-                        const modelVramEstimation = ggufInsights.estimateModelResourceRequirements({ gpuLayers: lastGpuLayers }).gpuVram;
+                        const modelResourceEstimation = ggufInsights.estimateModelResourceRequirements({
+                            gpuLayers: lastGpuLayers,
+                            useMmap
+                        });
+                        const modelVramEstimation = modelResourceEstimation.gpuVram;
                         const modelVramEstimationDiffBytes = (modelVramEstimation < result.modelVramUsage ? "-" : "") +
-                            bytes(Math.abs(result.modelVramUsage - modelVramEstimation));
+                            toBytes(Math.abs(result.modelVramUsage - modelVramEstimation));
                         const modelVramEstimationDiffText = modelVramEstimationDiffBytes.padEnd(9, " ") + " " +
                             padStartAnsi("(" + renderDiffPercentageWithColors(((modelVramEstimation / result.modelVramUsage) - 1) * 100) + ")", 9);
-                        const contextVramEstimation = previousContextSizeCheck == null
+                        const modelRamEstimation = modelResourceEstimation.cpuRam;
+                        const modelRamEstimationDiffBytes = (modelRamEstimation < result.modelRamUsage ? "-" : "") +
+                            toBytes(Math.abs(result.modelRamUsage - modelRamEstimation));
+                        const modelRamEstimationDiffText = modelRamEstimationDiffBytes.padEnd(9, " ") + " " +
+                            padStartAnsi("(" + renderDiffPercentageWithColors(((modelRamEstimation / result.modelRamUsage) - 1) * 100) + ")", 9);
+                        const contextResourceEstimation = previousContextSizeCheck == null
                             ? undefined
                             : ggufInsights.estimateContextResourceRequirements({
                                 contextSize: previousContextSizeCheck,
                                 modelGpuLayers: lastGpuLayers,
                                 flashAttention
-                            }).gpuVram;
+                            });
+                        const contextVramEstimation = contextResourceEstimation?.gpuVram;
                         const contextVramEstimationDiffBytes = (result.contextVramUsage == null || contextVramEstimation == null)
                             ? undefined
                             : ((contextVramEstimation < result.contextVramUsage ? "-" : "") +
-                                bytes(Math.abs(result.contextVramUsage - contextVramEstimation)));
+                                toBytes(Math.abs(result.contextVramUsage - contextVramEstimation)));
                         const contextVramEstimationDiffText = (contextVramEstimation == null || contextVramEstimationDiffBytes == null || result.contextVramUsage == null)
                             ? undefined
                             : (contextVramEstimationDiffBytes.padEnd(9, " ") + " " +
                                 padStartAnsi("(" + renderDiffPercentageWithColors(((contextVramEstimation / result.contextVramUsage) - 1) * 100) + ")", 9));
+                        const contextRamEstimation = contextResourceEstimation?.cpuRam;
+                        const contextRamEstimationDiffBytes = (result.contextRamUsage == null || contextRamEstimation == null)
+                            ? undefined
+                            : ((contextRamEstimation < result.contextRamUsage ? "-" : "") +
+                                toBytes(Math.abs(result.contextRamUsage - contextRamEstimation)));
+                        const contextRamEstimationDiffText = (contextRamEstimation == null || contextRamEstimationDiffBytes == null || result.contextRamUsage == null)
+                            ? undefined
+                            : (contextRamEstimationDiffBytes.padEnd(9, " ") + " " +
+                                padStartAnsi("(" + renderDiffPercentageWithColors(((contextRamEstimation / result.contextRamUsage) - 1) * 100) + ")", 9));
                         measureTable.logLine({
                             newProcess: getNewProccessValue(),
                             type: previousContextSizeCheck == null
@@ -225,18 +271,30 @@ export const InspectMeasureCommand = {
                             contextSize: previousContextSizeCheck != null
                                 ? String(previousContextSizeCheck)
                                 : undefined,
-                            estimatedModelVram: bytes(modelVramEstimation),
-                            actualModelVram: bytes(result.modelVramUsage),
-                            modelEstimationDiff: modelVramEstimationDiffText,
+                            estimatedModelVram: toBytes(modelVramEstimation),
+                            actualModelVram: toBytes(result.modelVramUsage),
+                            modelVramEstimationDiff: modelVramEstimationDiffText,
+                            estimatedModelRam: toBytes(modelRamEstimation),
+                            actualModelRam: toBytes(result.modelRamUsage),
+                            modelRamEstimationDiff: modelRamEstimationDiffText,
                             estimatedContextVram: contextVramEstimation == null
                                 ? undefined
-                                : bytes(contextVramEstimation),
+                                : toBytes(contextVramEstimation),
                             actualContextVram: result.contextVramUsage == null
                                 ? undefined
-                                : bytes(result.contextVramUsage),
-                            contextEstimationDiff: contextVramEstimationDiffText,
+                                : toBytes(result.contextVramUsage),
+                            contextVramEstimationDiff: contextVramEstimationDiffText,
                             totalVramUsage: ((result.totalVramUsage / totalVram) * 100).toFixed(2).padStart(5, "0") + "% " +
-                                chalk.gray("(" + bytes(result.totalVramUsage) + "/" + bytes(totalVram) + ")")
+                                chalk.gray("(" + toBytes(result.totalVramUsage) + "/" + toBytes(totalVram) + ")"),
+                            estimatedContextRam: contextRamEstimation == null
+                                ? undefined
+                                : toBytes(contextRamEstimation),
+                            actualContextRam: result.contextRamUsage == null
+                                ? undefined
+                                : toBytes(result.contextRamUsage),
+                            contextRamEstimationDiff: contextRamEstimationDiffText,
+                            totalRamUsage: ((result.totalRamUsage / totalRam) * 100).toFixed(2).padStart(5, "0") + "% " +
+                                chalk.gray("(" + toBytes(result.totalRamUsage) + "/" + toBytes(totalRam) + ")")
                         });
                     }
                 }
@@ -246,55 +304,100 @@ export const InspectMeasureCommand = {
         }
     }
 };
-const measureTable = new ConsoleTable([{
-        key: "newProcess",
-        title: " ",
-        width: 1
-    }, {
-        key: "type",
-        title: "Type",
-        width: Math.max("Type".length, "Model".length, "Context".length),
-        canSpanOverEmptyColumns: true
-    }, {
-        key: "gpuLayers",
-        title: "Layers",
-        width: "Layers".length,
-        canSpanOverEmptyColumns: true
-    }, {
-        key: "contextSize",
-        title: "Context size",
-        width: "Context size".length,
-        canSpanOverEmptyColumns: true
-    }, {
-        key: "estimatedModelVram",
-        title: "Estimated model VRAM",
-        width: "Estimated model VRAM".length,
-        canSpanOverEmptyColumns: true
-    }, {
-        key: "actualModelVram",
-        title: "Model VRAM",
-        width: "Model VRAM".length
-    }, {
-        key: "modelEstimationDiff",
-        title: "Diff",
-        width: Math.max("Diff".length, 9 + 1 + 9)
-    }, {
-        key: "estimatedContextVram",
-        title: "Estimated context VRAM",
-        width: "Estimated context VRAM".length
-    }, {
-        key: "actualContextVram",
-        title: "Context VRAM",
-        width: "Context VRAM".length
-    }, {
-        key: "contextEstimationDiff",
-        title: "Diff",
-        width: Math.max("Diff".length, 9 + 1 + 9)
-    }, {
-        key: "totalVramUsage",
-        title: "VRAM usage",
-        width: Math.max("VRAM usage".length, 8 + 1 + 8 + 1 + 8)
-    }]);
+function getMeasureTable(memoryType) {
+    return new ConsoleTable([{
+            key: "newProcess",
+            title: " ",
+            width: 1
+        }, {
+            key: "type",
+            title: "Type",
+            width: Math.max("Type".length, "Model".length, "Context".length),
+            canSpanOverEmptyColumns: true
+        }, {
+            key: "gpuLayers",
+            title: "Layers",
+            width: "Layers".length,
+            canSpanOverEmptyColumns: true
+        }, {
+            key: "contextSize",
+            title: "Context size",
+            width: "Context size".length,
+            canSpanOverEmptyColumns: true
+        }, {
+            key: "estimatedModelVram",
+            visible: memoryType === "vram" || memoryType === "all",
+            title: "Estimated model VRAM",
+            width: "Estimated model VRAM".length,
+            canSpanOverEmptyColumns: true
+        }, {
+            key: "actualModelVram",
+            visible: memoryType === "vram" || memoryType === "all",
+            title: "Model VRAM",
+            width: "Model VRAM".length
+        }, {
+            key: "modelVramEstimationDiff",
+            visible: memoryType === "vram" || memoryType === "all",
+            title: "Diff",
+            width: Math.max("Diff".length, 9 + 1 + 9)
+        }, {
+            key: "estimatedModelRam",
+            visible: memoryType === "ram" || memoryType === "all",
+            title: "Estimated model RAM",
+            width: "Estimated model RAM".length,
+            canSpanOverEmptyColumns: true
+        }, {
+            key: "actualModelRam",
+            visible: memoryType === "ram" || memoryType === "all",
+            title: "Model RAM",
+            width: "Model RAM".length
+        }, {
+            key: "modelRamEstimationDiff",
+            visible: memoryType === "ram" || memoryType === "all",
+            title: "Diff",
+            width: Math.max("Diff".length, 9 + 1 + 9)
+        }, {
+            key: "estimatedContextVram",
+            visible: memoryType === "vram" || memoryType === "all",
+            title: "Estimated context VRAM",
+            width: "Estimated context VRAM".length
+        }, {
+            key: "actualContextVram",
+            visible: memoryType === "vram" || memoryType === "all",
+            title: "Context VRAM",
+            width: "Context VRAM".length
+        }, {
+            key: "contextVramEstimationDiff",
+            visible: memoryType === "vram" || memoryType === "all",
+            title: "Diff",
+            width: Math.max("Diff".length, 9 + 1 + 9)
+        }, {
+            key: "totalVramUsage",
+            visible: memoryType === "vram" || memoryType === "all",
+            title: "VRAM usage",
+            width: Math.max("VRAM usage".length, 8 + 1 + 8 + 1 + 8)
+        }, {
+            key: "estimatedContextRam",
+            visible: memoryType === "ram" || memoryType === "all",
+            title: "Estimated context RAM",
+            width: "Estimated context RAM".length
+        }, {
+            key: "actualContextRam",
+            visible: memoryType === "ram" || memoryType === "all",
+            title: "Context RAM",
+            width: "Context RAM".length
+        }, {
+            key: "contextRamEstimationDiff",
+            visible: memoryType === "ram" || memoryType === "all",
+            title: "Diff",
+            width: Math.max("Diff".length, 9 + 1 + 9)
+        }, {
+            key: "totalRamUsage",
+            visible: memoryType === "ram" || memoryType === "all",
+            title: "RAM usage",
+            width: Math.max("RAM usage".length, 8 + 1 + 8 + 1 + 8)
+        }]);
+}
 function renderDiffPercentageWithColors(percentage, { greenBright = 2, green = 6, yellow = 10, yellowBright = 14 } = {}) {
     const percentageText = percentage.toFixed(2).padStart(5, "0") + "%";
     const absPercentage = Math.abs(percentage);
@@ -311,7 +414,7 @@ function renderDiffPercentageWithColors(percentage, { greenBright = 2, green = 6
 const __filename = fileURLToPath(import.meta.url);
 const detectedFileName = path.basename(__filename);
 const expectedFileName = "InspectMeasureCommand";
-async function measureModel({ modelPath, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, flashAttention, evaluateText, onInfo }) {
+async function measureModel({ modelPath, useMmap, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, flashAttention, evaluateText, exitAfterMeasurement = false, onInfo }) {
     if (!detectedFileName.startsWith(expectedFileName)) {
         console.warn(getConsoleLogPrefix() +
             `"${expectedFileName}.js" file is not independent, so running sub-process tests cannot be done with it\n` +
@@ -331,6 +434,7 @@ async function measureModel({ modelPath, gpu, tests, initialMaxContextSize, maxC
         }
     });
     let isPlannedExit = false;
+    let isDone = false;
     let forkSucceeded = false;
     let timeoutHandle = null;
     const processCreationTimeout = 1000 * 60 * 5;
@@ -363,8 +467,8 @@ async function measureModel({ modelPath, gpu, tests, initialMaxContextSize, maxC
             function done() {
                 if (!forkSucceeded)
                     reject(new Error(`Measuring a model failed to run a sub-process via file "${__filename}"`));
-                else
-                    resolve(isPlannedExit);
+                else if (isPlannedExit)
+                    resolve(isPlannedExit && isDone);
                 cleanup();
             }
             subProcess.on("message", (message) => {
@@ -373,6 +477,7 @@ async function measureModel({ modelPath, gpu, tests, initialMaxContextSize, maxC
                     subProcess.send({
                         type: "start",
                         modelPath,
+                        useMmap,
                         tests,
                         initialMaxContextSize,
                         maxContextSize,
@@ -380,7 +485,8 @@ async function measureModel({ modelPath, gpu, tests, initialMaxContextSize, maxC
                         maxGpuLayers,
                         minGpuLayers,
                         flashAttention,
-                        evaluateText
+                        evaluateText,
+                        exitAfterMeasurement
                     });
                     if (timeoutHandle != null) {
                         clearTimeout(timeoutHandle);
@@ -388,6 +494,11 @@ async function measureModel({ modelPath, gpu, tests, initialMaxContextSize, maxC
                     }
                 }
                 else if (message.type === "done") {
+                    isPlannedExit = true;
+                    isDone = true;
+                    subProcess.send({ type: "exit" });
+                }
+                else if (message.type === "exit") {
                     isPlannedExit = true;
                     subProcess.send({ type: "exit" });
                 }
@@ -409,10 +520,13 @@ async function measureModel({ modelPath, gpu, tests, initialMaxContextSize, maxC
                         result: {
                             type: "success",
                             modelVramUsage: message.modelVramUsage,
+                            modelRamUsage: message.modelRamUsage,
                             contextSize: message.contextSize,
                             contextVramUsage: message.contextVramUsage,
+                            contextRamUsage: message.contextRamUsage,
                             contextStateSize: message.contextStateSize,
-                            totalVramUsage: message.totalVramUsage
+                            totalVramUsage: message.totalVramUsage,
+                            totalRamUsage: message.totalRamUsage
                         }
                     });
                 }
@@ -462,7 +576,8 @@ async function runTestWorkerLogic() {
             process.exit(1);
         process.send(info);
     }
-    async function testContextSizes({ model, modelVramUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, evaluateText }) {
+    async function testContextSizes({ model, modelVramUsage, modelRamUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, evaluateText, exitAfterMeasurement = false }) {
+        let measurementsDone = 0;
         const contextSizeCheckPlan = getContextSizesCheckPlan(maxContextSize != null
             ? Math.min(model.trainContextSize, maxContextSize)
             : model.trainContextSize, tests, minContextSize);
@@ -474,6 +589,7 @@ async function runTestWorkerLogic() {
                 currentContextSizeCheck = null;
             try {
                 const preContextVramUsage = (await llama.getVramState()).used;
+                const preContextRamUsage = getMemoryUsage(llama);
                 const context = await model.createContext({
                     contextSize: currentContextSizeCheck ?? (maxContextSize != null
                         ? { max: maxContextSize }
@@ -487,14 +603,19 @@ async function runTestWorkerLogic() {
                     await sequence.evaluateWithoutGeneratingNewTokens(model.tokenize(evaluateText));
                 }
                 const postContextVramUsage = (await llama.getVramState()).used;
+                const postContextRamUsage = getMemoryUsage(llama);
+                measurementsDone++;
                 sendInfoBack({
                     type: "stats",
                     gpuLayers: model.gpuLayers,
                     modelVramUsage,
+                    modelRamUsage,
                     contextSize: context.contextSize,
                     contextVramUsage: postContextVramUsage - preContextVramUsage,
+                    contextRamUsage: postContextRamUsage - preContextRamUsage,
                     contextStateSize: context.stateSize,
-                    totalVramUsage: postContextVramUsage
+                    totalVramUsage: postContextVramUsage,
+                    totalRamUsage: postContextRamUsage
                 });
                 currentContextSizeCheck = context.contextSize;
                 await context.dispose();
@@ -514,35 +635,46 @@ async function runTestWorkerLogic() {
                 }
             }
             currentContextSizeCheck = getNextItemInCheckContextSizesPlan(contextSizeCheckPlan, currentContextSizeCheck);
+            if (exitAfterMeasurement)
+                return measurementsDone;
         }
+        return measurementsDone;
     }
-    async function testWithGpuLayers({ modelPath, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, evaluateText }) {
+    async function testWithGpuLayers({ modelPath, useMmap, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, evaluateText, exitAfterMeasurement = false }) {
         try {
             const preModelVramUsage = (await llama.getVramState()).used;
+            const preModelRamUsage = getMemoryUsage(llama);
             const model = await llama.loadModel({
                 modelPath,
+                useMmap,
                 gpuLayers,
                 defaultContextFlashAttention: flashAttention,
                 ignoreMemorySafetyChecks: true
             });
             const postModelVramUsage = (await llama.getVramState()).used;
+            const postModelRamUsage = getMemoryUsage(llama);
             sendInfoBack({
                 type: "stats",
                 gpuLayers: model.gpuLayers,
                 modelVramUsage: postModelVramUsage - preModelVramUsage,
-                totalVramUsage: postModelVramUsage
+                modelRamUsage: postModelRamUsage - preModelRamUsage,
+                totalVramUsage: postModelVramUsage,
+                totalRamUsage: postModelRamUsage
             });
-            await testContextSizes({
+            const measurementsDone = await testContextSizes({
                 model,
                 modelVramUsage: postModelVramUsage - preModelVramUsage,
+                modelRamUsage: postModelRamUsage - preModelRamUsage,
                 startContextSize,
                 maxContextSize,
                 minContextSize,
                 flashAttention,
                 tests,
-                evaluateText
+                evaluateText,
+                exitAfterMeasurement
             });
             await model.dispose();
+            return measurementsDone;
         }
         catch (err) {
             sendInfoBack({
@@ -551,12 +683,23 @@ async function runTestWorkerLogic() {
                 gpuLayers: gpuLayers
             });
         }
+        return 0;
     }
     process.on("message", async (message) => {
         if (message.type === "start") {
             for (let gpuLayers = message.maxGpuLayers; gpuLayers >= (message.minGpuLayers ?? 0); gpuLayers--) {
-                await testWithGpuLayers({
+                if (gpuLayers == message.maxGpuLayers && message.initialMaxContextSize != null) {
+                    const ggufInsights = await GgufInsights.from(await readGgufFileInfo(message.modelPath), llama);
+                    const contextSizeCheckPlan = getContextSizesCheckPlan(message.maxContextSize != null
+                        ? Math.min(ggufInsights.trainContextSize ?? 4096, message.maxContextSize)
+                        : ggufInsights.trainContextSize ?? 4096, message.tests, message.minContextSize);
+                    const firstContextSizeCheck = getNextItemInCheckContextSizesPlan(contextSizeCheckPlan, message.initialMaxContextSize);
+                    if (firstContextSizeCheck == null)
+                        continue;
+                }
+                const measurementsDone = await testWithGpuLayers({
                     modelPath: message.modelPath,
+                    useMmap: message.useMmap,
                     gpuLayers,
                     tests: message.tests,
                     startContextSize: gpuLayers == message.maxGpuLayers
@@ -565,8 +708,13 @@ async function runTestWorkerLogic() {
                     maxContextSize: message.maxContextSize,
                     minContextSize: message.minContextSize,
                     flashAttention: message.flashAttention,
-                    evaluateText: message.evaluateText
+                    evaluateText: message.evaluateText,
+                    exitAfterMeasurement: message.exitAfterMeasurement
                 });
+                if (measurementsDone > 0 && message.exitAfterMeasurement) {
+                    sendInfoBack({ type: "exit" });
+                    return;
+                }
             }
             sendInfoBack({ type: "done" });
         }
@@ -626,4 +774,13 @@ function padStartAnsi(text, length, padChar = " ") {
     const textWithoutAnsi = stripAnsi(text);
     return padChar.repeat(Math.max(0, length - textWithoutAnsi.length)) + text;
 }
+function getMemoryUsage(llama) { // Returns the bindings' reported memory total, reduced by the in-use VRAM that is capped at `unifiedSize`.
+    const totalMemoryUsage = llama._bindings.getMemoryInfo().total; // `total` as reported by the native bindings; NOTE(review): presumably process memory usage in bytes - confirm
+    const vramUsage = llama._bindings.getGpuVramInfo(); // only the `unifiedSize` and `used` fields are read below
+    let memoryUsage = totalMemoryUsage; // starts from the reported total and is adjusted below
+    const unifiedMemoryVramUsage = Math.min(vramUsage.unifiedSize, vramUsage.used); // in-use VRAM, capped at `unifiedSize`
+    if (unifiedMemoryVramUsage <= memoryUsage) // subtract only when the result cannot go negative
+        memoryUsage -= unifiedMemoryVramUsage; // NOTE(review): presumably unified-memory VRAM is also counted in the total, hence removed here - confirm
+    return memoryUsage; // unchanged total when the guard above did not pass
+}
 //# sourceMappingURL=InspectMeasureCommand.js.map