npm - @realtimex/node-llama-cpp - Versions diffs - 0.1.0 - Mend

@realtimex/node-llama-cpp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (876) hide show

package/dist/gguf/insights/GgufInsights.js ADDED Viewed

@@ -0,0 +1,854 @@
+import { getLlamaWithoutBackend } from "../../bindings/utils/getLlamaWithoutBackend.js";
+import { getDefaultContextBatchSize, getDefaultContextSequences } from "../../evaluator/LlamaContext/LlamaContext.js";
+import { GgmlType } from "../types/GgufTensorInfoTypes.js";
+import { GgufArchitectureType } from "../types/GgufMetadataTypes.js";
+import { getReadablePath } from "../../cli/utils/getReadablePath.js";
+import { padSafeContextSize } from "../../evaluator/LlamaContext/utils/padSafeContextSize.js";
+import { GgufInsightsConfigurationResolver } from "./GgufInsightsConfigurationResolver.js";
+import { GgufInsightsTokens } from "./GgufInsightsTokens.js";
+export class GgufInsights {
+    /** @internal */ _llama;
+    /** @internal */ _modelSize;
+    /** @internal */ _totalFileLayers = null;
+    /** @internal */ _supportsRanking;
+    /** @internal */ _dominantTensorType;
+    /** @internal */ _ggufFileInfo;
+    /** @internal */ _configurationResolver;
+    /** @internal */ _tokens;
+    constructor(ggufFileInfo, llama) {
+        this._llama = llama;
+        this._ggufFileInfo = ggufFileInfo;
+        this._modelSize = calculateTensorsSize(ggufFileInfo.fullTensorInfo ?? [], llama, true, true);
+        this._configurationResolver = GgufInsightsConfigurationResolver._create(this);
+        this._tokens = GgufInsightsTokens._create(this);
+    }
+    /**
+     * Get warnings about the model file that would affect its usage.
+     *
+     * Most of these warnings are also generated by `llama.cpp`
+     */
+    getWarnings(modelFilePath) {
+        const warnings = [];
+        const modelFilePathText = (modelFilePath != null && modelFilePath !== "")
+            ? ` ("${getReadablePath(modelFilePath)}")`
+            : "";
+        if (this._ggufFileInfo?.metadata?.tokenizer?.ggml?.model === "gpt2" &&
+            this._ggufFileInfo?.metadata?.tokenizer?.ggml?.model == null) {
+            // equivalent to the warning in `llama.cpp` under `llm_load_vocab`: "missing pre-tokenizer type, using: 'default'"
+            warnings.push(`This model file${modelFilePathText} is missing a pre-tokenizer configuration. ` +
+                "This may cause incorrect tokenization and thus degrade the generation quality. " +
+                "Consider using a newer model or regenerating this GGUF model file");
+        }
+        return warnings;
+    }
+    /** The GGUF file info object this instance was constructed with */
+    get ggufFileInfo() {
+        return this._ggufFileInfo;
+    }
+    /** The `GgufInsightsConfigurationResolver` created for this instance in the constructor */
+    get configurationResolver() {
+        return this._configurationResolver;
+    }
+    /** The `GgufInsightsTokens` helper created for this instance in the constructor */
+    get tokens() {
+        return this._tokens;
+    }
+    /** The context size the model was trained on */
+    get trainContextSize() {
+        return this._ggufFileInfo.architectureMetadata.context_length;
+    }
+    /** The size of an embedding vector the model can produce */
+    get embeddingVectorSize() {
+        return this._ggufFileInfo.architectureMetadata.embedding_length;
+    }
+    /** The number of layers in the file plus one output layer */
+    get totalLayers() {
+        const outputLayers = 1;
+        return this._getTotalFileLayers() + outputLayers;
+    }
+    /** The model size computed once in the constructor from the file's tensor info */
+    get modelSize() {
+        return this._modelSize;
+    }
+    get flashAttentionSupported() {
+        // source: `llama_new_context_with_model` in `llama.cpp`
+        if (this._ggufFileInfo.metadata?.general?.architecture === GgufArchitectureType.grok)
+            return false;
+        else if (this._ggufFileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2)
+            return false;
+        else {
+            const nHead = this._ggufFileInfo.architectureMetadata?.attention?.head_count ?? 0;
+            const nEmbd = this._ggufFileInfo.architectureMetadata?.embedding_length ?? 0;
+            const nEmbdHeadK = this._ggufFileInfo.architectureMetadata?.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead));
+            const nEmbdHeadV = this._ggufFileInfo.architectureMetadata?.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead);
+            if (nEmbdHeadK !== nEmbdHeadV)
+                return false;
+        }
+        return true;
+    }
+    get hasEncoder() {
+        switch (this._ggufFileInfo.metadata?.general?.architecture) {
+            case GgufArchitectureType.t5:
+            case GgufArchitectureType.t5encoder:
+                return true;
+        }
+        return false;
+    }
+    get hasDecoder() {
+        switch (this._ggufFileInfo.metadata?.general?.architecture) {
+            case GgufArchitectureType.t5encoder:
+                return false;
+        }
+        return true;
+    }
+    get isRecurrent() {
+        // source: `llm_arch_is_recurrent` in `llama-arch.cpp`
+        switch (this._ggufFileInfo.metadata?.general?.architecture) {
+            case GgufArchitectureType.mamba:
+            case GgufArchitectureType.mamba2:
+            case GgufArchitectureType.rwkv6:
+            case GgufArchitectureType.rwkv6qwen2:
+            case GgufArchitectureType.rwkv7:
+            case GgufArchitectureType.arwkv7:
+                return true;
+        }
+        return false;
+    }
+    get isHybrid() {
+        // source: `llm_arch_is_hybrid` in `llama-arch.cpp`
+        switch (this._ggufFileInfo.metadata?.general?.architecture) {
+            case GgufArchitectureType.jamba:
+            case GgufArchitectureType.falconH1:
+            case GgufArchitectureType.plamo2:
+            case GgufArchitectureType.granitehybrid:
+            case GgufArchitectureType.lfm2:
+            case GgufArchitectureType.lfm2moe:
+            case GgufArchitectureType.nemotronH:
+            case GgufArchitectureType.nemotronHMoe:
+            case GgufArchitectureType.qwen3next:
+            case GgufArchitectureType.kimiLinear:
+            case GgufArchitectureType.qwen35:
+            case GgufArchitectureType.qwen35moe:
+                return true;
+        }
+        return false;
+    }
+    /**
+     * Get the dominant tensor type used in the model file
+     */
+    get dominantTensorType() {
+        if (this._dominantTensorType == null)
+            this._dominantTensorType = getDominantTensorType(this._ggufFileInfo.fullTensorInfo ?? []);
+        return this._dominantTensorType;
+    }
+    get supportsRanking() {
+        if (this._supportsRanking != null)
+            return this._supportsRanking;
+        const layers = this._ggufFileInfo.fullTensorInfo ?? [];
+        for (let i = layers.length - 1; i >= 0; i--) {
+            const tensor = layers[i];
+            if (tensor == null)
+                continue;
+            if (tensor.name === "cls.weight" || tensor.name === "cls.output.weight") {
+                this._supportsRanking = this.tokens.sepToken != null || this.tokens.eosToken != null ||
+                    isRankingTemplateValid(parseRankingTemplate(this._ggufFileInfo.metadata?.tokenizer?.["chat_template.rerank"]));
+                this._supportsRanking &&= !(this.hasEncoder && this.hasDecoder); // encoder-decoder models are not supported
+                return this._supportsRanking;
+            }
+        }
+        this._supportsRanking = false;
+        return this._supportsRanking;
+    }
+    /**
+     * The size of the SWA (Sliding Window Attention).
+     *
+     * When `undefined`, the model does not use sliding window attention.
+     */
+    get swaSize() {
+        const slidingWindow = this._ggufFileInfo?.architectureMetadata?.attention?.sliding_window;
+        if (slidingWindow == null || slidingWindow <= 0)
+            return undefined;
+        const trainContextSize = this.trainContextSize;
+        if (trainContextSize != null && slidingWindow >= trainContextSize)
+            return undefined;
+        return slidingWindow;
+    }
+    estimateModelResourceRequirements({ gpuLayers, useMmap = this._llama.supportsMmap, gpuSupportsMmap = this._llama.gpuSupportsMmap }) {
+        const { cpu, gpu } = this._getTensorResourceSplit(gpuLayers);
+        return {
+            cpuRam: calculateTensorsSize(cpu, this._llama, false),
+            gpuVram: calculateTensorsSize(gpu, this._llama, useMmap && gpuSupportsMmap)
+        };
+    }
+    /**
+     * Estimates the memory required to create a context of the given parameters based on the implementation details of `llama.cpp`.
+     * The calculation doesn't include a precise estimation of the graph overhead memory, so it uses a rough estimate for that.
+     * The estimation for the graph overhead memory will be improved in the future to be more precise, but it's good enough for now.
+     *
+     * The result is the sum of: KV cache, recurrent state, compute buffers, graph overhead and the output buffer,
+     * each split between CPU and GPU.
+     * @param options.contextSize - the context size to estimate for
+     * @param options.modelGpuLayers - number of model layers on the GPU; when nullish, all layers are assumed to be on the GPU
+     * @param options.batchSize - defaults to `getDefaultContextBatchSize({contextSize, sequences})`
+     * @param options.sequences - defaults to `getDefaultContextSequences()`
+     * @param options.isEmbeddingContext - affects whether logits and/or embeddings are counted in the output buffer
+     * @param options.includeGraphOverhead - when `false`, the graph overhead estimate is `0`
+     * @param options.flashAttention - when `true`, the graph overhead estimate is `0` and the SWA ratio is adjusted
+     * @param options.swaFullCache - when `true`, sliding window attention is not used to shrink the KV size
+     * @param options.kvCacheKeyType - GGML type of the KV cache keys
+     * @param options.kvCacheValueType - GGML type of the KV cache values
+     * @returns `{cpuRam, gpuVram}`; `gpuVram` is `0` when no layers are on the GPU
+     */
+    estimateContextResourceRequirements({ contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false, swaFullCache = false, kvCacheKeyType = GgmlType.F16, kvCacheValueType = GgmlType.F16 }) {
+        if (sequences == null)
+            sequences = getDefaultContextSequences();
+        if (batchSize == null)
+            batchSize = getDefaultContextBatchSize({ contextSize, sequences });
+        const llmData = this._ggufFileInfo.architectureMetadata;
+        const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
+        const slidingWindow = this.swaSize ?? 0;
+        const kvUnified = false;
+        // SWA only reduces the KV size when the window is smaller than both the requested and the trained context
+        const usingSWA = !swaFullCache && slidingWindow > 0 && slidingWindow < contextSize &&
+            (this.trainContextSize == null || slidingWindow < this.trainContextSize);
+        const swaPattern = getSwaPatternForArchitecture(this._ggufFileInfo.metadata?.general?.architecture, this._ggufFileInfo.architectureMetadata?.attention?.sliding_window_pattern);
+        // fraction of the KV size that is counted at the full context size rather than at the window size
+        const nonSwaPercent = swaPattern <= 1
+            ? 1
+            : (1 / (swaPattern + (flashAttention ? -0.5 : -1)));
+        // source: `llama_kv_cache_unified::get_padding` in `llama-kv-cache.cpp`
+        const kvCachePadding = 1;
+        // `kvUnified` is hard-coded to `false`, so the per-sequence padded size is the one in use
+        const actualContextSize = kvUnified
+            ? padSafeContextSize(sequences * contextSize, "up")
+            : sequences * padSafeContextSize(contextSize, "up");
+        // with SWA, blend the window-limited size and the full size according to `nonSwaPercent`
+        const kvSize = usingSWA
+            ? ((1 - nonSwaPercent) * Math.min(actualContextSize, ggmlPad(sequences * slidingWindow + batchSize, kvCachePadding)) +
+                nonSwaPercent * actualContextSize)
+            : actualContextSize;
+        const totalFileLayers = this._getTotalFileLayers();
+        const totalLayersIncludingOutput = totalFileLayers + 1;
+        // clamp the requested GPU layers: once against the count including the output layer, once against file layers only
+        const finalModelGpuLayers = Math.max(0, Math.min(modelGpuLayers ?? totalLayersIncludingOutput, totalLayersIncludingOutput));
+        const finalGpuLayers = Math.max(0, Math.min(modelGpuLayers ?? totalFileLayers, totalFileLayers));
+        const finalCpuLayers = totalFileLayers - finalGpuLayers;
+        const usingGpu = finalModelGpuLayers !== 0;
+        const { gpuKVCacheSize, cpuKVCacheSize, gpuRecurrentStateSize, cpuRecurrentStateSize } = this._estimateContextCacheMemorySplitInBytes({
+            kvSize,
+            sequences,
+            totalFileLayers,
+            finalModelGpuLayers,
+            usingGpu,
+            kvCacheKeyType,
+            kvCacheValueType
+        });
+        // falls back to the tokenizer's token list length when the architecture metadata has no vocab size
+        const vocabularySize = llmData.vocab_size ?? this._ggufFileInfo.metadata.tokenizer?.ggml?.tokens?.length ?? 0;
+        const embeddingSize = llmData.embedding_length ?? 0;
+        const floatBytes = 4; // sizeof(float)
+        const int32TBytes = 4; // sizeof(int32_t)
+        // size of the logits/embeddings output buffer plus the output ids array
+        const estimateOutput = (nOutputs) => {
+            // source: `llama_context::output_reserve` in `llama-context.cpp`
+            const nOutputsMax = Math.max(batchSize, nOutputs);
+            const isT5 = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.t5;
+            const hasLogits = isT5 || !isEmbeddingContext;
+            const hasEmbd = isT5 || isEmbeddingContext;
+            const logitsSize = hasLogits
+                ? (vocabularySize * nOutputsMax)
+                : 0;
+            const embdSize = hasEmbd
+                ? (embeddingSize * nOutputsMax)
+                : 0;
+            const outputBufferSize = (logitsSize + embdSize) * floatBytes;
+            const outputIdsArr = int32TBytes * batchSize;
+            return outputBufferSize + outputIdsArr;
+        };
+        // rough, architecture-specific estimate of the graph overhead memory
+        const estimateGraphOverheadMemory = () => {
+            const s1MB = Math.pow(1024, 2);
+            // NOTE(review): shadows the outer `tensorInfo`, which holds the same value
+            const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
+            const expertCount = llmData?.expert_count ?? 0;
+            const headCount = llmData?.attention?.head_count ?? 0;
+            const embeddingLength = llmData?.embedding_length ?? 0;
+            let defaultCalculationAdjustment = 0;
+            // NOTE(review): `batchSize` was already defaulted above, so this only triggers
+            // if `getDefaultContextBatchSize` itself returns a nullish value
+            if (batchSize == null)
+                return 0;
+            if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.llama) {
+                if (expertCount > 0) {
+                    const expertsUsedCount = this._ggufFileInfo.architectureMetadata.expert_used_count ?? 2;
+                    return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (kvSize * headCount));
+                }
+                return int32TBytes * batchSize * (embeddingLength + (kvSize * headCount));
+            }
+            else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.qwen2) {
+                if (modelGpuLayers === this.totalLayers) {
+                    defaultCalculationAdjustment -= (s1MB * 340) * (this.trainContextSize == null
+                        ? 1
+                        : kvSize / this.trainContextSize);
+                }
+                else {
+                    defaultCalculationAdjustment -= (s1MB * 250) + ((s1MB * 50) * (this.trainContextSize == null
+                        ? 1
+                        : kvSize / this.trainContextSize));
+                }
+            }
+            else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.gemma) {
+                // only works properly when all layers are on the GPU, which is why it's commented out:
+                // return int32TBytes * batchSize * ((llmData.embedding_length ?? 0));
+                if (modelGpuLayers === this.totalLayers) {
+                    defaultCalculationAdjustment += (s1MB * 40) - ((s1MB * 270) * (this.trainContextSize == null
+                        ? 1
+                        : kvSize / this.trainContextSize));
+                }
+                else {
+                    defaultCalculationAdjustment += -(s1MB * 550) + ((s1MB * 150) * (this.trainContextSize == null
+                        ? 1
+                        : Math.max(0, (1 - (kvSize / this.trainContextSize)))));
+                }
+            }
+            else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.stablelm) {
+                const headCount = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0;
+                return (int32TBytes * batchSize * kvSize * headCount) - (50 * s1MB);
+                // if (modelGpuLayers === this.totalLayers) {
+                //     defaultCalculationAdjustment += -(s1MB * 20) + (
+                //         (s1MB * 250) * (
+                //             this.trainContextSize == null
+                //                 ? 1
+                //                 : kvSize / this.trainContextSize
+                //         )
+                //     );
+                // } else {
+                //     defaultCalculationAdjustment += -(s1MB * 40) + (
+                //         (s1MB * 300) * (
+                //             this.trainContextSize == null
+                //                 ? 1
+                //                 : kvSize / this.trainContextSize
+                //         )
+                //     );
+                // }
+            }
+            else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.qwen3) {
+                return int32TBytes * batchSize * (embeddingLength + (kvSize * headCount));
+            }
+            else if (expertCount > 0) {
+                const expertsUsedCount = this._ggufFileInfo.architectureMetadata.expert_used_count ?? 2;
+                return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (kvSize * headCount));
+            }
+            // default path: sum of all tensor dimensions, or a metadata-based approximation when there is no tensor info
+            const totalElements = tensorInfo.length === 0
+                ? this.totalLayers * (((llmData.embedding_length ?? 0) +
+                    (llmData.feed_forward_length ?? 0)) / 2)
+                : tensorInfo.reduce((res, tensor) => {
+                    return res + tensor.dimensions.reduce((res, dim) => res + Number(dim), 0);
+                }, 0);
+            if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.phi3) {
+                // magic numbers for estimation. will be improved in the future
+                return (totalElements * 123 * (kvSize / 4096)) + defaultCalculationAdjustment;
+            }
+            else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.cohere2) {
+                // magic numbers for estimation. will be improved in the future
+                return (totalElements * 148 * (kvSize / 4096)) + defaultCalculationAdjustment;
+            }
+            // magic numbers for estimation. will be improved in the future
+            return (totalElements * 77.655 * (kvSize / 4096)) + defaultCalculationAdjustment;
+        };
+        // source: `llama_context::graph_max_nodes` in `llama-context.cpp`
+        const getMaxNodesMultiplier = (arch, nTokens) => {
+            if (arch === GgufArchitectureType.qwen3next)
+                return {
+                    min: nTokens * 40,
+                    multiplier: 32
+                };
+            return {
+                min: 1024,
+                multiplier: 8
+            };
+        };
+        const maxNodesMultiplier = getMaxNodesMultiplier(this._ggufFileInfo.metadata?.general?.architecture, Math.min(actualContextSize, batchSize));
+        const maxNodes = Math.max(maxNodesMultiplier.min, maxNodesMultiplier.multiplier * tensorInfo.length);
+        // CPU nodes are proportional to the share of file layers kept on the CPU; the GPU gets the remainder
+        const cpuNodes = totalFileLayers === 0
+            ? 0
+            : maxNodesMultiplier.multiplier * (tensorInfo.length * (finalCpuLayers / totalFileLayers));
+        const gpuNodes = maxNodes - cpuNodes;
+        const gpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * gpuNodes) +
+            this._llama._bindings.getGgmlGraphOverheadCustom(gpuNodes, false);
+        const cpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * cpuNodes) +
+            this._llama._bindings.getGgmlGraphOverheadCustom(cpuNodes, false);
+        // the graph overhead estimate is skipped entirely with flash attention or when the caller opts out
+        const graphOverheadMemory = (flashAttention || !includeGraphOverhead)
+            ? 0
+            : estimateGraphOverheadMemory();
+        const graphOverheadGpuSize = (usingGpu && totalFileLayers > 0)
+            ? Math.round(graphOverheadMemory * (finalGpuLayers / totalFileLayers))
+            : 0;
+        const graphOverheadCpuSize = graphOverheadMemory - graphOverheadGpuSize;
+        const outputBufferSize = estimateOutput(sequences);
+        // note: the output buffer is counted in both the GPU and the CPU totals
+        const gpuVram = gpuKVCacheSize + gpuRecurrentStateSize + gpuComputeBufferSize + graphOverheadGpuSize + outputBufferSize;
+        const cpuRam = cpuKVCacheSize + cpuRecurrentStateSize + cpuComputeBufferSize + graphOverheadCpuSize + outputBufferSize;
+        return {
+            cpuRam,
+            gpuVram: usingGpu
+                ? gpuVram
+                : 0
+        };
+    }
+    /**
+     * Get the split tensor resources for CPU and GPU based on the number of GPU layers
+     * @param gpuLayers - number of layers to place on the GPU; `0` puts every tensor on the CPU
+     * @returns `{cpu, gpu}` arrays of tensor info objects taken from the file's tensor info
+     * @internal
+     */
+    _getTensorResourceSplit(gpuLayers) {
+        const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
+        const architecture = this._ggufFileInfo.metadata?.general?.architecture;
+        if (gpuLayers === 0) {
+            return {
+                cpu: tensorInfo,
+                gpu: []
+            };
+        }
+        const fileLayers = this._getFileLayers();
+        // layers numbered from `startGpuLayer` upwards go to the GPU
+        const startGpuLayer = Math.max(0, fileLayers - gpuLayers);
+        const gpuTensors = [];
+        const cpuTensors = [];
+        let tokenEmbedLayer;
+        let mainOutputLayer;
+        for (const singleTensorInfo of tensorInfo) {
+            // remember these two tensors for the fallback after the loop
+            if (isMainOutputLayer(singleTensorInfo.name))
+                mainOutputLayer = singleTensorInfo;
+            else if (isTokenEmbedLayer(singleTensorInfo.name))
+                tokenEmbedLayer = singleTensorInfo;
+            // in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_INPUT` are always
+            // loaded with `model.dev_input`, which is always set to the CPU
+            if (isInputLayer(singleTensorInfo.name)) {
+                cpuTensors.push(singleTensorInfo);
+                continue;
+                // in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_OUTPUT` are always
+                // loaded with `model.dev_output`, which is set to the GPU only if all the layers are on the GPU
+            }
+            else if (isOutputLayer(singleTensorInfo.name)) {
+                if (gpuLayers === this.totalLayers) {
+                    gpuTensors.push(singleTensorInfo);
+                    continue;
+                }
+                else {
+                    cpuTensors.push(singleTensorInfo);
+                    continue;
+                }
+            }
+            const { layerNumber } = parseTensorName(singleTensorInfo.name);
+            // for `qwen2` and `gemma` with partial offloading, tensors without a layer number stay on the CPU
+            if (gpuLayers !== this.totalLayers) {
+                if (architecture === GgufArchitectureType.qwen2 || architecture === GgufArchitectureType.gemma) {
+                    if (layerNumber != null && layerNumber >= startGpuLayer)
+                        gpuTensors.push(singleTensorInfo);
+                    else
+                        cpuTensors.push(singleTensorInfo);
+                    continue;
+                }
+            }
+            // default: tensors without a layer number, or in a GPU-assigned layer, go to the GPU
+            if (layerNumber == null || layerNumber >= startGpuLayer)
+                gpuTensors.push(singleTensorInfo);
+            else
+                cpuTensors.push(singleTensorInfo);
+        }
+        // with no main output tensor and all layers on the GPU, the token embedding tensor is also counted on the GPU
+        if (mainOutputLayer == null && tokenEmbedLayer != null && gpuLayers === this.totalLayers && !gpuTensors.includes(tokenEmbedLayer))
+            gpuTensors.push(tokenEmbedLayer);
+        return {
+            cpu: cpuTensors,
+            gpu: gpuTensors
+        };
+    }
+    /** @internal */
+    _determineNumberOfLayersFromTensorInfo() {
+        const layerNumbers = new Set();
+        for (const singleTensorInfo of (this._ggufFileInfo.fullTensorInfo ?? [])) {
+            const { layerNumber } = parseTensorName(singleTensorInfo.name);
+            if (layerNumber != null)
+                layerNumbers.add(layerNumber);
+        }
+        return layerNumbers.size;
+    }
+    /** @internal */
+    _getFileLayers() {
+        return this._ggufFileInfo.architectureMetadata.block_count ?? this._determineNumberOfLayersFromTensorInfo();
+    }
+    /**
+     * Estimates the KV cache and recurrent state sizes of a context, split between GPU and CPU.
+     *
+     * Each file layer is classified as recurrent or attention and as GPU or CPU; attention layers
+     * contribute K/V elements, recurrent layers are counted and sized by
+     * `_estimateRecurrentStateMemorySizeInBytes`.
+     * @returns `{gpuKVCacheSize, cpuKVCacheSize, gpuRecurrentStateSize, cpuRecurrentStateSize}`;
+     * both GPU values are `0` when `usingGpu` is falsy
+     */
+    _estimateContextCacheMemorySplitInBytes({ kvSize, sequences, totalFileLayers, finalModelGpuLayers, usingGpu, kvCacheKeyType = GgmlType.F16, kvCacheValueType = GgmlType.F16 }) {
+        // source: `llama_kv_cache_init` in `llama.cpp`
+        const architecture = this._ggufFileInfo.metadata.general?.architecture;
+        const nHead = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0;
+        const nEmbd = this._ggufFileInfo.architectureMetadata.embedding_length ?? 0;
+        // per-head key/value sizes fall back to `nEmbd / nHead` when not stated in the metadata
+        const nEmbdHeadK = this._ggufFileInfo.architectureMetadata.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead));
+        const nHeadKv = this._ggufFileInfo.architectureMetadata.attention?.head_count_kv ?? nHead;
+        const nEmbdHeadV = this._ggufFileInfo.architectureMetadata.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead);
+        // the F16 type size is used when the bindings return a nullish size for the requested type
+        const keyTypeSize = this._llama._bindings.getTypeSizeForGgmlType(kvCacheKeyType) ?? this._llama._consts.ggmlTypeF16Size;
+        const valueTypeSize = this._llama._bindings.getTypeSizeForGgmlType(kvCacheValueType) ?? this._llama._consts.ggmlTypeF16Size;
+        // source: `llama_model::load_tensors` in `llama-model.cpp`
+        // repeating layers are assigned to GPU from `i_gpu_start = n_layer + 1 - n_gpu_layers`
+        const gpuRepeatingLayerStart = Math.max(0, (totalFileLayers + 1) - finalModelGpuLayers);
+        // fully recurrent architectures treat every layer as recurrent; others use an architecture-specific pattern
+        const recurrentLayersByPattern = this.isRecurrent
+            ? "all"
+            : getRecurrentLayersPattern(architecture, this._ggufFileInfo.architectureMetadata);
+        let gpuKvElementsK = 0;
+        let gpuKvElementsV = 0;
+        let cpuKvElementsK = 0;
+        let cpuKvElementsV = 0;
+        let gpuRecurrentLayers = 0;
+        let cpuRecurrentLayers = 0;
+        for (let i = 0; i < totalFileLayers; i++) {
+            const isGpuLayer = i >= gpuRepeatingLayerStart;
+            const isRecurrentLayer = isLayerRecurrent(recurrentLayersByPattern, i);
+            if (isRecurrentLayer) {
+                if (isGpuLayer)
+                    gpuRecurrentLayers++;
+                else
+                    cpuRecurrentLayers++;
+            }
+            else {
+                const nHeadKvLayer = resolveLayerHeadCountKv(nHeadKv, i, nHead);
+                const layerElementsK = nEmbdHeadK * nHeadKvLayer * kvSize;
+                const layerElementsV = nEmbdHeadV * nHeadKvLayer * kvSize;
+                if (isGpuLayer) {
+                    gpuKvElementsK += layerElementsK;
+                    gpuKvElementsV += layerElementsV;
+                }
+                else {
+                    cpuKvElementsK += layerElementsK;
+                    cpuKvElementsV += layerElementsV;
+                }
+            }
+        }
+        const gpuKVCacheSize = usingGpu
+            ? ((gpuKvElementsK * keyTypeSize) + (gpuKvElementsV * valueTypeSize))
+            : 0;
+        const cpuKVCacheSize = (cpuKvElementsK * keyTypeSize) + (cpuKvElementsV * valueTypeSize);
+        // at least one recurrent cell is counted even when `sequences` is below 1
+        const recurrentCellSize = Math.max(1, sequences);
+        const gpuRecurrentStateSize = usingGpu
+            ? this._estimateRecurrentStateMemorySizeInBytes(recurrentCellSize, gpuRecurrentLayers)
+            : 0;
+        const cpuRecurrentStateSize = this._estimateRecurrentStateMemorySizeInBytes(recurrentCellSize, cpuRecurrentLayers);
+        return {
+            gpuKVCacheSize,
+            cpuKVCacheSize,
+            gpuRecurrentStateSize,
+            cpuRecurrentStateSize
+        };
+    }
+    _estimateRecurrentStateMemorySizeInBytes(recurrentCellSize, layers) {
+        if (layers <= 0 || recurrentCellSize <= 0)
+            return 0;
+        // source: `llama_memory_recurrent` + `llama_hparams::n_embd_r` / `llama_hparams::n_embd_s` in `llama.cpp`
+        const nHead = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0;
+        const nEmbd = this._ggufFileInfo.architectureMetadata.embedding_length ?? 0;
+        const wkvHeadSize = this._ggufFileInfo.architectureMetadata.wkv?.head_size ?? 0;
+        const tokenShiftCount = this._ggufFileInfo.architectureMetadata.token_shift_count ?? 0;
+        const ssmDConv = this._ggufFileInfo.architectureMetadata.ssm?.conv_kernel ?? 0;
+        const ssmDInner = this._ggufFileInfo.architectureMetadata.ssm?.inner_size ?? 0;
+        const ssmDState = this._ggufFileInfo.architectureMetadata.ssm?.state_size ?? 0;
+        const ssmGroupCount = this._ggufFileInfo.architectureMetadata.ssm?.group_count ?? 0;
+        const shortConvLCache = this._ggufFileInfo.architectureMetadata.shortconv?.l_cache ?? 0;
+        const kdaHeadDim = this._ggufFileInfo.architectureMetadata.kda?.head_dim ?? 0;
+        const shortConvHistoryShift = 1;
+        const kdaQkvStateCount = 3;
+        const kdaDefaultConvKernel = 3;
+        const ssmBcStatePairCount = 2;
+        const nEmbdR = wkvHeadSize !== 0
+            ? tokenShiftCount * nEmbd
+            : shortConvLCache !== 0
+                ? nEmbd * Math.max(0, shortConvLCache - shortConvHistoryShift)
+                : kdaHeadDim !== 0
+                    ? (kdaQkvStateCount * (ssmDConv > 0
+                        ? ssmDConv - shortConvHistoryShift
+                        : kdaDefaultConvKernel) * (nHead * kdaHeadDim))
+                    : ((ssmDConv > 0
+                        ? (ssmDConv - shortConvHistoryShift)
+                        : 0) * (ssmDInner + (ssmBcStatePairCount * ssmGroupCount * ssmDState)));
+        const nEmbdS = wkvHeadSize !== 0
+            ? nEmbd * wkvHeadSize
+            : kdaHeadDim !== 0
+                ? kdaHeadDim * kdaHeadDim * nHead
+                : ssmDState * ssmDInner;
+        if (nEmbdR === 0 && nEmbdS === 0)
+            return 0;
+        const recurrentTypeSize = this._llama._consts.ggmlTypeF32Size;
+        const bytesPerLayer = (nEmbdR + nEmbdS) * recurrentTypeSize;
+        return layers * recurrentCellSize * bytesPerLayer;
+    }
+    /** @internal */
+    _getTotalFileLayers() {
+        if (this._totalFileLayers != null)
+            return this._totalFileLayers;
+        this._totalFileLayers = this._getFileLayers();
+        return this._totalFileLayers;
+    }
+    /**
+     * @param ggufFileInfo
+     * @param llama - If you already have a `Llama` instance, pass it to reuse it for the `GgufInsights` instance.
+     * If you don't pass a `Llama` instance, a basic `Llama` instance is created as a fallback - it's a slim instance that
+     * doesn't instantiate a `llama.cpp` backend, so it won't utilize the GPU at all, and be shared with other `GgufInsights` instances
+     * that need a fallback `Llama` instance.
+     */
+    static async from(ggufFileInfo, llama) {
+        let resolvedLlama = llama;
+        if (resolvedLlama == null)
+            resolvedLlama = await getLlamaWithoutBackend();
+        return new GgufInsights(ggufFileInfo, resolvedLlama);
+    }
+}
+function parseTensorName(tensorName) {
+    if (tensorName == null)
+        return { layerNumber: undefined };
+    const layerTensorPrefix = "blk.";
+    if (!tensorName.startsWith(layerTensorPrefix))
+        return { layerNumber: undefined };
+    const dotIndex = tensorName.indexOf(".", layerTensorPrefix.length);
+    const layerNumberString = tensorName.slice(layerTensorPrefix.length, dotIndex < 0
+        ? tensorName.length
+        : dotIndex);
+    const layerNumber = parseInt(layerNumberString);
+    if (Number.isFinite(layerNumber))
+        return { layerNumber };
+    return { layerNumber: undefined };
+}
+function calculateTensorsSize(tensorsInfo, llama, useMmap, startFromTensorDataOffset = false) {
+    if (!useMmap) {
+        let size = 0;
+        for (const tensorInfo of tensorsInfo)
+            size += calculateTensorSize(tensorInfo, llama);
+        return size;
+    }
+    const fileStats = new Map();
+    for (const tensorInfo of tensorsInfo) {
+        let stats = fileStats.get(tensorInfo.filePart);
+        if (stats == null) {
+            stats = {
+                tensorsSize: 0
+            };
+            fileStats.set(tensorInfo.filePart, stats);
+        }
+        const tensorSize = calculateTensorSize(tensorInfo, llama);
+        stats.tensorsSize += tensorSize;
+        const startOffset = tensorInfo.offset;
+        const endOffset = typeof startOffset === "number"
+            ? startOffset + tensorSize
+            : startOffset + BigInt(tensorSize);
+        if (startFromTensorDataOffset)
+            stats.startOffset = Number(BigInt(tensorInfo.fileOffset) - BigInt(tensorInfo.offset));
+        else if (stats.startOffset == null || startOffset < stats.startOffset)
+            stats.startOffset = startOffset;
+        if (stats.endOffset == null || endOffset > stats.endOffset)
+            stats.endOffset = endOffset;
+    }
+    let size = 0;
+    for (const [, stats] of fileStats) {
+        const offsetSize = (stats.endOffset == null || stats.startOffset == null)
+            ? 0
+            : Number(BigInt(stats.endOffset) - BigInt(stats.startOffset));
+        const tensorsSize = stats.tensorsSize;
+        size += Math.max(offsetSize, tensorsSize);
+    }
+    return size;
+}
+function calculateTensorSize(tensor, llama) {
+    const typeSize = llama._bindings.getTypeSizeForGgmlType(tensor.ggmlType);
+    const blockSize = llama._bindings.getBlockSizeForGgmlType(tensor.ggmlType);
+    const ggmlMaxDims = llama._consts.ggmlMaxDims;
+    if (typeSize == null || blockSize == null)
+        throw new Error("Invalid type or block size");
+    const { ne, nb } = getTensorNeAndNb(tensor, { typeSize, blockSize, ggmlMaxDims });
+    if (blockSize === 1) {
+        let totalBytes = typeSize;
+        for (let i = 0; i < ggmlMaxDims; i++) {
+            totalBytes += (ne[i] - 1) * nb[i];
+        }
+        return totalBytes;
+    }
+    else {
+        let totalBytes = Math.floor((ne[0] * nb[0]) / blockSize);
+        for (let i = 1; i < ggmlMaxDims; i++) {
+            totalBytes += (ne[i] - 1) * nb[i];
+        }
+        return totalBytes;
+    }
+}
+function getTensorNeAndNb(tensor, { typeSize, blockSize, ggmlMaxDims }) {
+    // number of elements
+    // source: `ggml_new_tensor_impl` in `ggml.c`
+    const ne = [
+        ...tensor.dimensions,
+        ...(Array(Math.max(0, ggmlMaxDims - tensor.dimensions.length)).fill(1))
+    ].slice(0, ggmlMaxDims);
+    // number of bytes
+    // source: `ggml_new_tensor_impl` in `ggml.c`
+    const nb = [
+        typeSize,
+        Math.floor(typeSize * (ne[0] / blockSize)),
+        ...Array(ggmlMaxDims - 2).fill(0)
+    ];
+    for (let i = 2; i < ggmlMaxDims; i++) {
+        nb[i] = nb[i - 1] * ne[i - 1];
+    }
+    return {
+        ne,
+        nb
+    };
+}
+function isInputLayer(layerName) {
+    const [firstPart] = layerName.split(".");
+    if (firstPart == null)
+        return false;
+    // source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` where
+    // in `llm_tensor_info_mapping` have a mapping to `LLM_TENSOR_LAYER_INPUT`
+    switch (firstPart) {
+        case "token_embd":
+        case "token_embd_norm":
+        case "token_types":
+        case "position_embd":
+            return true;
+    }
+    return false;
+}
+function isOutputLayer(layerName) {
+    const [firstPart, secondPart] = layerName.split(".");
+    if (firstPart == null)
+        return false;
+    // source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` where
+    // in `llm_tensor_info_mapping` have a mapping to `LLM_TENSOR_LAYER_INPUT`
+    switch (firstPart) {
+        case "output":
+        case "output_norm":
+        case "cls":
+            return true;
+    }
+    if (secondPart == null)
+        return false;
+    // source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` where
+    // in `llm_tensor_info_mapping` have a mapping to `LLM_TENSOR_LAYER_INPUT`
+    switch (firstPart + "." + secondPart) {
+        case "cls.output":
+        case "dec.output_norm":
+        case "enc.output_norm":
+            return true;
+    }
+    return false;
+}
+function isMainOutputLayer(layerName) {
+    const [firstPart] = layerName.split(".");
+    return firstPart === "output";
+}
+function isTokenEmbedLayer(layerName) {
+    const [firstPart] = layerName.split(".");
+    return firstPart === "token_embd";
+}
+function ggmlPad(value, padding) {
+    return ((value + padding - 1) & ~(padding - 1));
+}
+function getSwaPatternForArchitecture(architecture, slidingWindowPattern) {
+    if (typeof slidingWindowPattern === "number")
+        return slidingWindowPattern;
+    // source: `llama_model::load_hparams` in `llama-model.cpp` - calls to `hparams.set_swa_pattern`
+    switch (architecture) {
+        case GgufArchitectureType.llama4:
+            return 4;
+        case GgufArchitectureType.afmoe:
+            return 4;
+        case GgufArchitectureType.modernBert:
+            return 3;
+        case GgufArchitectureType.phi3:
+            return 1;
+        case GgufArchitectureType.plamo3:
+            return 8;
+        case GgufArchitectureType.gemma2:
+            return 2;
+        case GgufArchitectureType.gemma3:
+            return 6;
+        case GgufArchitectureType.gemma3n:
+            return 5;
+        case GgufArchitectureType.gemmaEmbedding:
+            return 6;
+        case GgufArchitectureType.cohere2:
+            return 4;
+        case GgufArchitectureType.olmo2:
+            return 4;
+        case GgufArchitectureType.exaone4:
+            return 4;
+        case GgufArchitectureType.exaoneMoe:
+            return 4;
+        case GgufArchitectureType.gptOss:
+            return 2;
+        case GgufArchitectureType.smallthinker:
+            return 4;
+    }
+    return 1;
+}
+function resolveLayerHeadCountKv(nHeadKv, layerIndex, nHead) {
+    if (typeof nHeadKv === "number")
+        return nHeadKv;
+    const layerHeadCountKv = nHeadKv[layerIndex];
+    if (layerHeadCountKv == null)
+        return nHead;
+    return layerHeadCountKv;
+}
+function getRecurrentLayersPattern(architecture, architectureMetadata) {
+    const nHeadKv = architectureMetadata?.attention?.head_count_kv;
+    const feedForwardLength = architectureMetadata?.feed_forward_length;
+    const hasRecurrentHeadCountKvEntry = Array.isArray(nHeadKv) && nHeadKv.some((value) => value === 0);
+    if (architecture === GgufArchitectureType.falconH1)
+        // source: `llama_model::load_hparams` in `llama-model.cpp`:
+        // `case LLM_ARCH_FALCON_H1` does `std::fill(..., true)` for `recurrent_layer_arr`
+        return "all";
+    if (architecture === GgufArchitectureType.nemotronH || architecture === GgufArchitectureType.nemotronHMoe) {
+        // source: `llama_model::load_hparams` in `llama-model.cpp`:
+        // `case LLM_ARCH_NEMOTRON_H / LLM_ARCH_NEMOTRON_H_MOE`:
+        // `recurrent_layer_arr[i] = (n_head_kv(i) == 0 && n_ff(i) == 0)`
+        if (Array.isArray(nHeadKv))
+            return {
+                type: "headCountKvAndFeedForward",
+                headCountKvValues: nHeadKv,
+                feedForwardLength
+            };
+        if (nHeadKv === 0) {
+            if (typeof feedForwardLength === "number")
+                return feedForwardLength === 0
+                    ? "all"
+                    : "none";
+            return "none";
+        }
+    }
+    if (typeof architectureMetadata?.full_attention_interval === "number" &&
+        Number.isFinite(architectureMetadata?.full_attention_interval) &&
+        architectureMetadata?.full_attention_interval > 0 &&
+        (
+        // source: `llama_model::load_hparams` in `llama-model.cpp`
+        // `case LLM_ARCH_QWEN3NEXT / LLM_ARCH_QWEN35 / LLM_ARCH_QWEN35MOE`:
+        // `hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0)`
+        architecture === GgufArchitectureType.qwen3next ||
+            architecture === GgufArchitectureType.qwen35 ||
+            architecture === GgufArchitectureType.qwen35moe ||
+            hasRecurrentHeadCountKvEntry))
+        return {
+            type: "fullAttentionInterval",
+            interval: Math.max(1, Math.floor(architectureMetadata?.full_attention_interval))
+        };
+    if (hasRecurrentHeadCountKvEntry)
+        return {
+            type: "headCountKvArray",
+            values: nHeadKv
+        };
+    return "none";
+}
+function isLayerRecurrent(pattern, layerIndex) {
+    if (pattern === "all")
+        return true;
+    else if (pattern === "none")
+        return false;
+    else if (pattern.type === "fullAttentionInterval")
+        return (layerIndex + 1) % pattern.interval !== 0;
+    else if (pattern.type === "headCountKvAndFeedForward")
+        return pattern.headCountKvValues[layerIndex] === 0 &&
+            resolveLayerFeedForwardLength(pattern.feedForwardLength, layerIndex) === 0;
+    return pattern.values[layerIndex] === 0;
+}
+function resolveLayerFeedForwardLength(feedForwardLength, layerIndex) {
+    if (typeof feedForwardLength === "number")
+        return feedForwardLength;
+    else if (Array.isArray(feedForwardLength))
+        return feedForwardLength[layerIndex] ?? 0;
+    return 0;
+}
+export function parseRankingTemplate(template) {
+    if (template == null)
+        return undefined;
+    return template
+        .replaceAll("{query}", "{{query}}")
+        .replaceAll("{document}", "{{document}}");
+}
+export function isRankingTemplateValid(template) {
+    return template != null && template.includes("{{query}}") && template.includes("{{document}}");
+}
+export function getDominantTensorType(tensorInfo) {
+    const tensorTypes = [];
+    for (const tensor of tensorInfo)
+        tensorTypes[tensor.ggmlType] = ((tensorTypes[tensor.ggmlType] ?? 0) +
+            tensor.dimensions.map(((dim) => Number(dim))).reduce((a, b) => a * b, 1));
+    let dominantType = undefined;
+    let maxCount = 0;
+    for (const [type, count] of tensorTypes.entries()) {
+        if (count > maxCount) {
+            maxCount = count;
+            dominantType = type;
+        }
+    }
+    return dominantType;
+}
+//# sourceMappingURL=GgufInsights.js.map