modelinfo-cli 1.3.0__tar.gz → 1.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/PKG-INFO +27 -14
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/README.md +26 -13
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/pyproject.toml +1 -1
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo/__init__.py +1 -1
- modelinfo_cli-1.4.0/src/modelinfo/calculator.py +178 -0
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo/cli.py +93 -13
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo/parsers/gguf.py +10 -6
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo/parsers/huggingface.py +36 -21
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo/ui.py +54 -11
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo_cli.egg-info/PKG-INFO +27 -14
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/tests/test_calculator.py +59 -0
- modelinfo_cli-1.3.0/src/modelinfo/calculator.py +0 -98
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/LICENSE +0 -0
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/setup.cfg +0 -0
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo/__main__.py +0 -0
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo/architecture.py +0 -0
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo/hardware.py +0 -0
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo/parsers/__init__.py +0 -0
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo/parsers/base.py +0 -0
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo/parsers/pytorch.py +0 -0
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo/parsers/safetensors.py +0 -0
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo_cli.egg-info/SOURCES.txt +0 -0
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo_cli.egg-info/dependency_links.txt +0 -0
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo_cli.egg-info/entry_points.txt +0 -0
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo_cli.egg-info/requires.txt +0 -0
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/src/modelinfo_cli.egg-info/top_level.txt +0 -0
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/tests/test_constraints.py +0 -0
- {modelinfo_cli-1.3.0 → modelinfo_cli-1.4.0}/tests/test_parsers.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: modelinfo-cli
|
|
3
|
-
Version: 1.3.0
|
|
3
|
+
Version: 1.4.0
|
|
4
4
|
Summary: A sub-100ms, zero-dependency CLI to inspect ML models (.safetensors, .gguf) locally or via Hugging Face, calculate exact VRAM footprints, and determine hardware fit.
|
|
5
5
|
Author: ModelInfo Contributors
|
|
6
6
|
License: MIT
|
|
@@ -22,25 +22,27 @@ Dynamic: license-file
|
|
|
22
22
|

|
|
23
23
|

|
|
24
24
|
|
|
25
|
-
ModelInfo
|
|
25
|
+

|
|
26
|
+
|
|
27
|
+
ModelInfo is a CLI tool that inspects machine learning model checkpoints (`.safetensors`, `.gguf`, `.pt`) and calculates hardware requirements completely offline.
|
|
26
28
|
|
|
27
29
|
It reads binary headers directly using the Python standard library. By bypassing full tensor payload loading and strictly excluding heavy ecosystems like PyTorch or HuggingFace, the tool executes in under 100 milliseconds.
|
|
28
30
|
|
|
29
31
|
## Features
|
|
30
32
|
|
|
31
|
-
- **Zero-Dependency Parsing**: Reads
|
|
32
|
-
- **Remote Hugging Face Hub Inspection**:
|
|
33
|
-
-
|
|
34
|
-
- **Dynamic VRAM
|
|
35
|
-
- **Hardware Fit Diagnostics**:
|
|
36
|
-
- **Side-by-Side Comparison**: Pass multiple models to
|
|
37
|
-
-
|
|
38
|
-
- **Secure Pickling**: Inspects legacy `.pt` files
|
|
39
|
-
-
|
|
33
|
+
- **Zero-Dependency Parsing**: Reads `.safetensors` 8-byte JSON prefixes and `.gguf` binary key-value metadata directly via `struct` and `json` (falling back to `config.json` if needed).
|
|
34
|
+
- **Remote Hugging Face Hub Inspection**: Pass a repo ID (e.g., `meta-llama/Llama-2-7b-hf`) and it uses concurrent byte-range requests to read the headers off the CDN in under 2 seconds. No need to download the checkpoint.
|
|
35
|
+
- Parses `model.safetensors.index.json` to support sharded models without crashing on partial downloads.
|
|
36
|
+
- **Dynamic VRAM & Subtractive vLLM Math**: Calculates exact VRAM limits based on the model's architecture and your target context length. If you use the `--vllm` flag, it switches to a subtractive "Serving Capacity" engine that calculates exactly how many tokens fit in the PagedAttention pool based on your `--gpu-util` ratio.
|
|
37
|
+
- **Hardware Fit Diagnostics**: Check if a model fits your cluster with `--gpu` (e.g. `--gpu RTX4090` or `--gpu auto`). It enforces Apple Silicon's 75% unified memory wire limit, and you can explicitly model multi-GPU NCCL communication penalties with `--topology` and `--strategy`.
|
|
38
|
+
- **Side-by-Side Comparison**: Pass multiple models to trigger a comparison table (parameters, data types, context lengths, VRAM footprints).
|
|
39
|
+
- Uses exact `ggml_type` mappings for GGUF formats to calculate byte-scaling coefficients, preventing VRAM under-reporting.
|
|
40
|
+
- **Secure Pickling**: Inspects legacy `.pt` files safely using a restricted `pickle.Unpickler`.
|
|
41
|
+
- The UI (built with `rich`) groups repetitive layers and color-codes VRAM heatmaps.
|
|
40
42
|
|
|
41
43
|
> [!NOTE]
|
|
42
44
|
> **A Note on Performance & Remote Fetching**
|
|
43
|
-
> Local `.gguf` and `.safetensors` files are parsed in under 100ms. However, querying remote Hugging Face repositories takes **1 to 10 seconds**.
|
|
45
|
+
> Local `.gguf` and `.safetensors` files are parsed in under 100ms. However, querying remote Hugging Face repositories takes **1 to 10 seconds**. To remain zero-dependency, `modelinfo` opens connections via Python `urllib` instead of loading PyTorch. For massive sharded models (e.g., 100+ shards), it must fetch every header individually, capped at an 8-worker thread pool to prevent Cloudflare IP bans. Waiting ~8 seconds to map a model is faster than downloading 400GB just to see if it fits your hardware.
|
|
44
46
|
|
|
45
47
|
## Installation
|
|
46
48
|
|
|
@@ -120,6 +122,12 @@ Compare multiple models side-by-side against a hardware target:
|
|
|
120
122
|
modelinfo mistralai/Mistral-7B-v0.1 Qwen/Qwen2.5-0.5B --gpu 12
|
|
121
123
|
```
|
|
122
124
|
|
|
125
|
+
Simulate exactly how many tokens you can serve using vLLM on a specific multi-GPU topology:
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
modelinfo mistralai/Mistral-7B-v0.1 --vllm --gpu 4xRTX4090 --topology pcie4 --strategy tp
|
|
129
|
+
```
|
|
130
|
+
|
|
123
131
|
### Example Output (Single Model)
|
|
124
132
|
|
|
125
133
|
```text
|
|
@@ -157,13 +165,18 @@ Qwen2.5-0.5B 494.0M BF16 8K 1.6 GB ✓
|
|
|
157
165
|
| `--gpu` | `--gpu rtx4090` | Check if the model fits. Accepts GPU names (`rtx4090`, `b200`, `rx7900xtx`), explicit VRAM limits in GB (`--gpu 24`), or local hardware auto-discovery (`--gpu auto`). |
|
|
158
166
|
| `--context` | `--context 32768` | Adjust the target KV cache length. Essential for calculating the dynamic memory footprint of long-context models. Defaults to `8192`. |
|
|
159
167
|
| `--max-vram` | `--max-vram 80` | Adjusts the color-coded heat mapping thresholds (Green/Yellow/Red) in the terminal output to match a specific hardware ceiling. |
|
|
168
|
+
| `--vllm` | `--vllm --gpu auto` | Switches from additive memory checking to a subtractive serving capacity estimation. Shows exactly how many tokens fit in the PagedAttention pool. |
|
|
169
|
+
| `--gpu-util` | `--gpu-util 0.9` | Sets the vLLM `gpu_memory_utilization` ratio. Defaults to `0.9` (reserves 10% for PyTorch context). |
|
|
170
|
+
| `--topology` | `--topology nvlink` | Set interconnect topology to calculate exact communication overhead penalties (`nvlink`, `pcie4`, `pcie3`). Defaults to `pcie4`. |
|
|
171
|
+
| `--strategy` | `--strategy tp` | Selects the parallelization strategy for multi-GPU setups (`tp` for Tensor Parallelism, `pp` for Pipeline Parallelism). Defaults to `tp`. |
|
|
172
|
+
| `--tensors` | `--tensors` | Bypasses the algorithmic speed estimation and forces the tool to fetch all remote shards, displaying an exact size breakdown of every tensor. |
|
|
160
173
|
|
|
161
174
|
## Architecture
|
|
162
175
|
|
|
163
|
-
|
|
176
|
+
Three modules:
|
|
164
177
|
|
|
165
178
|
1. **Presentation (`cli.py`, `ui.py`)**: Parses arguments and formats tables via `rich`.
|
|
166
|
-
2. **Parsing Engine (`parsers/`)**: Specialized binary readers (`safetensors.py`, `gguf.py`, `pytorch.py`)
|
|
179
|
+
2. **Parsing Engine (`parsers/`)**: Specialized binary readers (`safetensors.py`, `gguf.py`, `pytorch.py`) that use only the standard library.
|
|
167
180
|
3. **Math Engine (`calculator.py`)**: Determines total parameter counts, maps data types to byte coefficients, and calculates dynamic memory allocations based on tensor shape heuristics.
|
|
168
181
|
|
|
169
182
|
## License
|
|
@@ -4,25 +4,27 @@
|
|
|
4
4
|

|
|
5
5
|

|
|
6
6
|
|
|
7
|
-
ModelInfo
|
|
7
|
+

|
|
8
|
+
|
|
9
|
+
ModelInfo is a CLI tool that inspects machine learning model checkpoints (`.safetensors`, `.gguf`, `.pt`) and calculates hardware requirements completely offline.
|
|
8
10
|
|
|
9
11
|
It reads binary headers directly using the Python standard library. By bypassing full tensor payload loading and strictly excluding heavy ecosystems like PyTorch or HuggingFace, the tool executes in under 100 milliseconds.
|
|
10
12
|
|
|
11
13
|
## Features
|
|
12
14
|
|
|
13
|
-
- **Zero-Dependency Parsing**: Reads
|
|
14
|
-
- **Remote Hugging Face Hub Inspection**:
|
|
15
|
-
-
|
|
16
|
-
- **Dynamic VRAM
|
|
17
|
-
- **Hardware Fit Diagnostics**:
|
|
18
|
-
- **Side-by-Side Comparison**: Pass multiple models to
|
|
19
|
-
-
|
|
20
|
-
- **Secure Pickling**: Inspects legacy `.pt` files
|
|
21
|
-
-
|
|
15
|
+
- **Zero-Dependency Parsing**: Reads `.safetensors` 8-byte JSON prefixes and `.gguf` binary key-value metadata directly via `struct` and `json` (falling back to `config.json` if needed).
|
|
16
|
+
- **Remote Hugging Face Hub Inspection**: Pass a repo ID (e.g., `meta-llama/Llama-2-7b-hf`) and it uses concurrent byte-range requests to read the headers off the CDN in under 2 seconds. No need to download the checkpoint.
|
|
17
|
+
- Parses `model.safetensors.index.json` to support sharded models without crashing on partial downloads.
|
|
18
|
+
- **Dynamic VRAM & Subtractive vLLM Math**: Calculates exact VRAM limits based on the model's architecture and your target context length. If you use the `--vllm` flag, it switches to a subtractive "Serving Capacity" engine that calculates exactly how many tokens fit in the PagedAttention pool based on your `--gpu-util` ratio.
|
|
19
|
+
- **Hardware Fit Diagnostics**: Check if a model fits your cluster with `--gpu` (e.g. `--gpu RTX4090` or `--gpu auto`). It enforces Apple Silicon's 75% unified memory wire limit, and you can explicitly model multi-GPU NCCL communication penalties with `--topology` and `--strategy`.
|
|
20
|
+
- **Side-by-Side Comparison**: Pass multiple models to trigger a comparison table (parameters, data types, context lengths, VRAM footprints).
|
|
21
|
+
- Uses exact `ggml_type` mappings for GGUF formats to calculate byte-scaling coefficients, preventing VRAM under-reporting.
|
|
22
|
+
- **Secure Pickling**: Inspects legacy `.pt` files safely using a restricted `pickle.Unpickler`.
|
|
23
|
+
- The UI (built with `rich`) groups repetitive layers and color-codes VRAM heatmaps.
|
|
22
24
|
|
|
23
25
|
> [!NOTE]
|
|
24
26
|
> **A Note on Performance & Remote Fetching**
|
|
25
|
-
> Local `.gguf` and `.safetensors` files are parsed in under 100ms. However, querying remote Hugging Face repositories takes **1 to 10 seconds**.
|
|
27
|
+
> Local `.gguf` and `.safetensors` files are parsed in under 100ms. However, querying remote Hugging Face repositories takes **1 to 10 seconds**. To remain zero-dependency, `modelinfo` opens connections via Python `urllib` instead of loading PyTorch. For massive sharded models (e.g., 100+ shards), it must fetch every header individually, capped at an 8-worker thread pool to prevent Cloudflare IP bans. Waiting ~8 seconds to map a model is faster than downloading 400GB just to see if it fits your hardware.
|
|
26
28
|
|
|
27
29
|
## Installation
|
|
28
30
|
|
|
@@ -102,6 +104,12 @@ Compare multiple models side-by-side against a hardware target:
|
|
|
102
104
|
modelinfo mistralai/Mistral-7B-v0.1 Qwen/Qwen2.5-0.5B --gpu 12
|
|
103
105
|
```
|
|
104
106
|
|
|
107
|
+
Simulate exactly how many tokens you can serve using vLLM on a specific multi-GPU topology:
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
modelinfo mistralai/Mistral-7B-v0.1 --vllm --gpu 4xRTX4090 --topology pcie4 --strategy tp
|
|
111
|
+
```
|
|
112
|
+
|
|
105
113
|
### Example Output (Single Model)
|
|
106
114
|
|
|
107
115
|
```text
|
|
@@ -139,13 +147,18 @@ Qwen2.5-0.5B 494.0M BF16 8K 1.6 GB ✓
|
|
|
139
147
|
| `--gpu` | `--gpu rtx4090` | Check if the model fits. Accepts GPU names (`rtx4090`, `b200`, `rx7900xtx`), explicit VRAM limits in GB (`--gpu 24`), or local hardware auto-discovery (`--gpu auto`). |
|
|
140
148
|
| `--context` | `--context 32768` | Adjust the target KV cache length. Essential for calculating the dynamic memory footprint of long-context models. Defaults to `8192`. |
|
|
141
149
|
| `--max-vram` | `--max-vram 80` | Adjusts the color-coded heat mapping thresholds (Green/Yellow/Red) in the terminal output to match a specific hardware ceiling. |
|
|
150
|
+
| `--vllm` | `--vllm --gpu auto` | Switches from additive memory checking to a subtractive serving capacity estimation. Shows exactly how many tokens fit in the PagedAttention pool. |
|
|
151
|
+
| `--gpu-util` | `--gpu-util 0.9` | Sets the vLLM `gpu_memory_utilization` ratio. Defaults to `0.9` (reserves 10% for PyTorch context). |
|
|
152
|
+
| `--topology` | `--topology nvlink` | Set interconnect topology to calculate exact communication overhead penalties (`nvlink`, `pcie4`, `pcie3`). Defaults to `pcie4`. |
|
|
153
|
+
| `--strategy` | `--strategy tp` | Selects the parallelization strategy for multi-GPU setups (`tp` for Tensor Parallelism, `pp` for Pipeline Parallelism). Defaults to `tp`. |
|
|
154
|
+
| `--tensors` | `--tensors` | Bypasses the algorithmic speed estimation and forces the tool to fetch all remote shards, displaying an exact size breakdown of every tensor. |
|
|
142
155
|
|
|
143
156
|
## Architecture
|
|
144
157
|
|
|
145
|
-
|
|
158
|
+
Three modules:
|
|
146
159
|
|
|
147
160
|
1. **Presentation (`cli.py`, `ui.py`)**: Parses arguments and formats tables via `rich`.
|
|
148
|
-
2. **Parsing Engine (`parsers/`)**: Specialized binary readers (`safetensors.py`, `gguf.py`, `pytorch.py`)
|
|
161
|
+
2. **Parsing Engine (`parsers/`)**: Specialized binary readers (`safetensors.py`, `gguf.py`, `pytorch.py`) that use only the standard library.
|
|
149
162
|
3. **Math Engine (`calculator.py`)**: Determines total parameter counts, maps data types to byte coefficients, and calculates dynamic memory allocations based on tensor shape heuristics.
|
|
150
163
|
|
|
151
164
|
## License
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "modelinfo-cli"
|
|
7
|
-
version = "1.3.0"
|
|
7
|
+
version = "1.4.0"
|
|
8
8
|
description = "A sub-100ms, zero-dependency CLI to inspect ML models (.safetensors, .gguf) locally or via Hugging Face, calculate exact VRAM footprints, and determine hardware fit."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
import math
|
|
2
|
+
from typing import Any, Dict
|
|
3
|
+
|
|
4
|
+
from modelinfo.architecture import extract_architecture
|
|
5
|
+
|
|
6
|
+
DTYPE_BYTES = {
|
|
7
|
+
"F64": 8.0,
|
|
8
|
+
"F32": 4.0,
|
|
9
|
+
"F16": 2.0,
|
|
10
|
+
"BF16": 2.0,
|
|
11
|
+
"F8": 1.0,
|
|
12
|
+
"F8_E5M2": 1.0,
|
|
13
|
+
"F8_E4M3": 1.0,
|
|
14
|
+
"I64": 8.0,
|
|
15
|
+
"I32": 4.0,
|
|
16
|
+
"I16": 2.0,
|
|
17
|
+
"I8": 1.0,
|
|
18
|
+
"U64": 8.0,
|
|
19
|
+
"U32": 4.0,
|
|
20
|
+
"Q8_0": 1.0625,
|
|
21
|
+
"Q8_1": 1.0625,
|
|
22
|
+
"Q8_K": 1.0625,
|
|
23
|
+
"Q6_K": 0.828125,
|
|
24
|
+
"Q5_0": 0.6875,
|
|
25
|
+
"Q5_1": 0.75,
|
|
26
|
+
"Q5_K": 0.6875,
|
|
27
|
+
"Q4_0": 0.5625,
|
|
28
|
+
"Q4_1": 0.625,
|
|
29
|
+
"Q4_K": 0.59375,
|
|
30
|
+
"Q3_K": 0.4375,
|
|
31
|
+
"Q2_K": 0.34375,
|
|
32
|
+
"IQ4_NL": 0.53125,
|
|
33
|
+
"IQ4_XS": 0.53125,
|
|
34
|
+
"IQ3_S": 0.4375,
|
|
35
|
+
"IQ3_XXS": 0.385,
|
|
36
|
+
"IQ2_S": 0.3125,
|
|
37
|
+
"IQ2_XS": 0.296875,
|
|
38
|
+
"IQ2_XXS": 0.28125,
|
|
39
|
+
"IQ1_M": 0.21875,
|
|
40
|
+
"IQ1_S": 0.1953125,
|
|
41
|
+
"Q8": 1.06,
|
|
42
|
+
"Q6": 0.82,
|
|
43
|
+
"Q5": 0.68,
|
|
44
|
+
"Q4": 0.58,
|
|
45
|
+
"Q3": 0.43,
|
|
46
|
+
"Q2": 0.28,
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
def _get_bytes_per_param(dtype: str) -> float:
|
|
50
|
+
"""Return the size in bytes for a given data type."""
|
|
51
|
+
return DTYPE_BYTES.get(dtype.upper(), 2.0)
|
|
52
|
+
|
|
53
|
+
def calculate_footprint(
    tensors: Dict[str, Any],
    context_length: int = 0,
    batch_size: int = 1,
    config: Dict[str, Any] | None = None,
    gpu_count: int = 1,
    topology: str = "pcie4",
    strategy: str = "tp",
    is_vllm: bool = False,
    gpu_vram_bytes: float = 0.0,
    gpu_util: float = 0.9
) -> Dict[str, Any]:
    """
    Calculate the memory footprint of a model based on its tensors and context length.

    Args:
        tensors: Mapping of tensor name to ``{"shape": [...], "dtype": "..."}``.
            An optional ``"__metadata__"`` entry with ``lazy_fetch`` set makes
            the weight size come from its ``total_size`` instead of the shapes.
        context_length: Number of tokens used to size the KV cache.
        batch_size: Number of sequences used to size the KV cache.
        config: Optional model config forwarded to ``extract_architecture``.
        gpu_count: Number of GPUs; values above 1 enable the distributed penalty.
        topology: Interconnect (``"nvlink"``, ``"pcie3"``, anything else is
            treated as ``"pcie4"``). Compared case-insensitively.
        strategy: ``"pp"`` applies no penalty; anything else is treated as
            ``"tp"``. Compared case-insensitively.
        is_vllm: Use the subtractive serving-capacity calculation. Only takes
            effect when ``gpu_vram_bytes`` is positive.
        gpu_vram_bytes: VRAM budget in bytes for the subtractive calculation.
        gpu_util: Fraction of ``gpu_vram_bytes`` treated as usable in vLLM mode.

    Returns:
        A dict with ``total_params``, ``base_memory_bytes``, ``kv_cache_bytes``,
        ``overhead_bytes``, ``total_memory_bytes``, ``num_layers``, ``kv_dim``,
        ``primary_dtype``, ``kv_is_estimate``, ``penalty_percentage`` and
        ``vllm_metrics`` (empty unless the vLLM path ran).

    Raises:
        ValueError: In vLLM mode, when ``gpu_util`` is not in ``(0, 1]``.
    """
    total_params = 0
    base_memory_bytes = 0.0
    dtype_counts: Dict[str, int] = {}

    file_metadata = tensors.get("__metadata__", {})

    if file_metadata.get("lazy_fetch", False):
        base_memory_bytes = file_metadata.get("total_size", 0.0)
        # Assume predominantly FP16/BF16 for modern Hub architectures
        dtype_counts["BF16"] = 1
        total_params = int(base_memory_bytes / 2.0)
    else:
        for name, metadata in tensors.items():
            if name == "__metadata__":
                continue

            shape = metadata.get("shape", [])
            if not shape:
                continue

            param_count = math.prod(shape)
            total_params += param_count

            dtype = metadata.get("dtype", "F16").upper()
            dtype_counts[dtype] = dtype_counts.get(dtype, 0) + 1

            base_memory_bytes += param_count * _get_bytes_per_param(dtype)

    num_layers, kv_dim, is_estimate = extract_architecture(tensors, config)

    # Formula: 2 * Layers * (KV_Heads * Head_Dim) * Context_Length * Batch_Size * Bytes_per_param
    # Assume FP16 (2 bytes) for KV cache
    bytes_per_token = 2 * num_layers * kv_dim * 2
    kv_cache_bytes = bytes_per_token * context_length * batch_size

    # The dtype used by the largest number of tensors (not parameters).
    primary_dtype = max(dtype_counts.items(), key=lambda x: x[1])[0] if dtype_counts else "Unknown"

    # Topology & Strategy Penalties. Normalize case so "NVLink" / "PP" are not
    # silently treated as the pcie4 / tp defaults.
    strategy_key = str(strategy).strip().lower()
    topology_key = str(topology).strip().lower()
    tp_penalties = {"nvlink": 0.04, "pcie3": 0.20}

    penalty_percentage = 0.0
    if gpu_count > 1 and strategy_key != "pp":
        # Anything that is not nvlink/pcie3 is billed as pcie4.
        penalty_percentage = tp_penalties.get(topology_key, 0.12)

    # penalty_percentage is already 0.0 for a single GPU.
    distributed_overhead = base_memory_bytes * penalty_percentage

    vllm_metrics = {}
    if is_vllm and gpu_vram_bytes > 0:
        if not 0.0 < gpu_util <= 1.0:
            # A ratio above 1 would report more KV pool than the card holds.
            raise ValueError(f"gpu_util must be in the range (0, 1], got {gpu_util}")

        usable_vram = gpu_vram_bytes * gpu_util
        remaining_vram = usable_vram - (base_memory_bytes + distributed_overhead)

        max_serving_capacity = 0
        if remaining_vram > 0 and bytes_per_token > 0:
            max_serving_capacity = math.floor(remaining_vram / bytes_per_token)

        # In this mode the KV cache is reported as a pool, not added to the total.
        overhead_bytes = distributed_overhead
        total_memory_bytes = base_memory_bytes + overhead_bytes

        vllm_metrics = {
            "usable_vram": usable_vram,
            "static_weights": base_memory_bytes,
            "distributed_penalty": distributed_overhead,
            "paged_kv_pool": max(0.0, remaining_vram),
            "max_serving_capacity": max_serving_capacity
        }
    else:
        # Fixed 600 MB CUDA context allowance per GPU.
        CUDA_CONTEXT_MB = 600 * gpu_count
        overhead_bytes = (CUDA_CONTEXT_MB * 1024 * 1024) + distributed_overhead
        total_memory_bytes = base_memory_bytes + kv_cache_bytes + overhead_bytes

    return {
        "total_params": total_params,
        "base_memory_bytes": base_memory_bytes,
        "kv_cache_bytes": kv_cache_bytes,
        "overhead_bytes": overhead_bytes,
        "total_memory_bytes": total_memory_bytes,
        "num_layers": num_layers,
        "kv_dim": kv_dim,
        "primary_dtype": primary_dtype,
        "kv_is_estimate": is_estimate,
        "penalty_percentage": penalty_percentage,
        "vllm_metrics": vllm_metrics
    }
|
|
159
|
+
|
|
160
|
+
def format_bytes(size_bytes: float) -> str:
    """Format bytes into a human-readable string (e.g. GB).

    Uses 1024-based units from B up to PB and rounds to two decimals.
    Negative sizes are formatted by magnitude with a leading ``-`` instead of
    raising a math domain error.

    Args:
        size_bytes: Size in bytes.

    Returns:
        A string such as ``"1.5 KB"``; exactly ``"0 B"`` for zero.
    """
    if size_bytes == 0:
        return "0 B"

    units = ["B", "KB", "MB", "GB", "TB", "PB"]
    last = len(units) - 1
    sign = "-" if size_bytes < 0 else ""

    # Repeated division by 1024 is exact in binary floating point, so the
    # unit is picked without relying on a float logarithm.
    value = float(abs(size_bytes))
    idx = 0
    while value >= 1024 and idx < last:
        value /= 1024
        idx += 1

    scaled = round(value, 2)
    # Rounding can land on 1024.0 (e.g. 1048575 bytes); promote to the next
    # unit so the output reads "1.0 MB" rather than "1024.0 KB".
    if scaled >= 1024 and idx < last:
        scaled = round(scaled / 1024, 2)
        idx += 1

    return f"{sign}{scaled} {units[idx]}"
|
|
169
|
+
|
|
170
|
+
def format_params(count: int) -> str:
    """Format parameter count into a human-readable string (e.g. 7.2B).

    The exact, comma-grouped count is always shown; counts of 1,000 or more
    also get a one-decimal abbreviation in parentheses (K, M or B).

    Args:
        count: Number of parameters.

    Returns:
        E.g. ``"7,241,732,096 (7.2B)"``, or just ``"999"`` below one thousand.
    """
    exact = f"{count:,}"
    # Largest unit first so the first match is the natural one.
    scales = ((1_000_000_000, "B"), (1_000_000, "M"), (1_000, "K"))

    for pos, (divisor, suffix) in enumerate(scales):
        if count < divisor:
            continue
        scaled = count / divisor
        # Just under a boundary the one-decimal rounding yields "1000.0"
        # (e.g. 999,999 -> "1000.0K"); report it in the next unit up instead.
        if pos > 0 and float(f"{scaled:.1f}") >= 1000:
            divisor, suffix = scales[pos - 1]
            scaled = count / divisor
        return f"{exact} ({scaled:.1f}{suffix})"

    return exact
|
|
@@ -42,11 +42,51 @@ def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
|
|
|
42
42
|
default=None,
|
|
43
43
|
help="Target GPU hardware (e.g. 'RTX4090' or 'auto') to check if the model fits.",
|
|
44
44
|
)
|
|
45
|
+
parser.add_argument(
|
|
46
|
+
"--tensors",
|
|
47
|
+
action="store_true",
|
|
48
|
+
help="Deep dive: Fetch all remote tensor shards to display the exact tensor size breakdown.",
|
|
49
|
+
)
|
|
50
|
+
parser.add_argument(
|
|
51
|
+
"--topology",
|
|
52
|
+
type=str,
|
|
53
|
+
choices=["nvlink", "pcie4", "pcie3"],
|
|
54
|
+
default="pcie4",
|
|
55
|
+
help="Interconnect topology to calculate distributed communication overhead.",
|
|
56
|
+
)
|
|
57
|
+
parser.add_argument(
|
|
58
|
+
"--strategy",
|
|
59
|
+
type=str,
|
|
60
|
+
choices=["tp", "pp"],
|
|
61
|
+
default="tp",
|
|
62
|
+
help="Distributed parallelism strategy (Tensor vs Pipeline).",
|
|
63
|
+
)
|
|
64
|
+
parser.add_argument(
|
|
65
|
+
"--vllm",
|
|
66
|
+
action="store_true",
|
|
67
|
+
help="Enable Subtractive Math Engine: Calculate max context tokens using vLLM PagedAttention allocation.",
|
|
68
|
+
)
|
|
69
|
+
parser.add_argument(
|
|
70
|
+
"--gpu-util",
|
|
71
|
+
type=float,
|
|
72
|
+
default=0.9,
|
|
73
|
+
help="vLLM gpu_memory_utilization ratio (default 0.9). Reserves 10 percent for PyTorch context.",
|
|
74
|
+
)
|
|
45
75
|
|
|
46
76
|
return parser.parse_args(argv)
|
|
47
77
|
|
|
48
78
|
|
|
49
|
-
def analyze_model(
|
|
79
|
+
def analyze_model(
|
|
80
|
+
file_path: str,
|
|
81
|
+
context_override: int | None,
|
|
82
|
+
gpu_count: int = 1,
|
|
83
|
+
fetch_tensors: bool = False,
|
|
84
|
+
topology: str = "pcie4",
|
|
85
|
+
strategy: str = "tp",
|
|
86
|
+
is_vllm: bool = False,
|
|
87
|
+
gpu_vram_gb: float = 0.0,
|
|
88
|
+
gpu_util: float = 0.9
|
|
89
|
+
) -> dict:
|
|
50
90
|
tensors = {}
|
|
51
91
|
config = None
|
|
52
92
|
disk_size = 0.0
|
|
@@ -55,7 +95,7 @@ def analyze_model(file_path: str, context_override: int | None, gpu_count: int =
|
|
|
55
95
|
|
|
56
96
|
if not os.path.exists(file_path) and not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")):
|
|
57
97
|
from modelinfo.parsers.huggingface import fetch_huggingface_repo
|
|
58
|
-
tensors, config, format_name, disk_size = fetch_huggingface_repo(file_path)
|
|
98
|
+
tensors, config, format_name, disk_size = fetch_huggingface_repo(file_path, fetch_tensors=fetch_tensors)
|
|
59
99
|
elif file_path_lower.endswith(".safetensors") or file_path_lower.endswith(".index.json"):
|
|
60
100
|
tensors = parse_safetensors_header(file_path)
|
|
61
101
|
format_name = "SafeTensors"
|
|
@@ -92,7 +132,17 @@ def analyze_model(file_path: str, context_override: int | None, gpu_count: int =
|
|
|
92
132
|
context_length = min(8192, max_context) if max_context else 8192
|
|
93
133
|
is_default_context = True
|
|
94
134
|
|
|
95
|
-
footprint = calculate_footprint(
|
|
135
|
+
footprint = calculate_footprint(
|
|
136
|
+
tensors,
|
|
137
|
+
context_length=context_length,
|
|
138
|
+
config=config,
|
|
139
|
+
gpu_count=gpu_count,
|
|
140
|
+
topology=topology,
|
|
141
|
+
strategy=strategy,
|
|
142
|
+
is_vllm=is_vllm,
|
|
143
|
+
gpu_vram_bytes=gpu_vram_gb * 1024**3 if gpu_vram_gb else 0.0,
|
|
144
|
+
gpu_util=gpu_util
|
|
145
|
+
)
|
|
96
146
|
num_layers = footprint["num_layers"]
|
|
97
147
|
arch_name = identify_architecture_name(tensors, num_layers, config)
|
|
98
148
|
|
|
@@ -110,7 +160,14 @@ def analyze_model(file_path: str, context_override: int | None, gpu_count: int =
|
|
|
110
160
|
"context_length": context_length,
|
|
111
161
|
"is_default_context": is_default_context,
|
|
112
162
|
"tensors": tensors,
|
|
113
|
-
"max_context": max_context
|
|
163
|
+
"max_context": max_context,
|
|
164
|
+
"is_lazy": tensors.get("__metadata__", {}).get("lazy_fetch", False),
|
|
165
|
+
"gpu_count": gpu_count,
|
|
166
|
+
"topology": topology,
|
|
167
|
+
"strategy": strategy,
|
|
168
|
+
"is_vllm": is_vllm,
|
|
169
|
+
"gpu_vram_gb": gpu_vram_gb,
|
|
170
|
+
"gpu_util": gpu_util
|
|
114
171
|
}
|
|
115
172
|
|
|
116
173
|
|
|
@@ -118,20 +175,33 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|
|
118
175
|
args = parse_args(argv)
|
|
119
176
|
|
|
120
177
|
gpu_name_display = None
|
|
178
|
+
gpu_vram_gb = None
|
|
121
179
|
gpu_count = 1
|
|
122
|
-
|
|
180
|
+
|
|
181
|
+
if args.gpu or args.vllm:
|
|
182
|
+
target = args.gpu if args.gpu else "auto"
|
|
123
183
|
from modelinfo.hardware import resolve_gpu
|
|
124
|
-
|
|
125
|
-
gpu_name_display, args.max_vram, gpu_count = resolve_gpu(args.gpu)
|
|
126
|
-
except Exception as e:
|
|
127
|
-
console.print(f"[red]{e}[/red]")
|
|
128
|
-
return 1
|
|
184
|
+
gpu_name_display, gpu_vram_gb, gpu_count = resolve_gpu(target)
|
|
129
185
|
|
|
130
186
|
if len(args.file) > 1:
|
|
187
|
+
if args.vllm:
|
|
188
|
+
console.print("[red]Error: Side-by-side comparison does not currently support the subtractive --vllm engine. Compare models sequentially or remove --vllm.[/red]")
|
|
189
|
+
return 1
|
|
190
|
+
|
|
131
191
|
models = []
|
|
132
192
|
for model_path in args.file:
|
|
133
193
|
try:
|
|
134
|
-
info = analyze_model(
|
|
194
|
+
info = analyze_model(
|
|
195
|
+
model_path,
|
|
196
|
+
args.context,
|
|
197
|
+
gpu_count,
|
|
198
|
+
fetch_tensors=args.tensors,
|
|
199
|
+
topology=args.topology,
|
|
200
|
+
strategy=args.strategy,
|
|
201
|
+
is_vllm=args.vllm,
|
|
202
|
+
gpu_vram_gb=gpu_vram_gb if gpu_vram_gb else 0.0,
|
|
203
|
+
gpu_util=args.gpu_util
|
|
204
|
+
)
|
|
135
205
|
models.append((model_path.split("/")[-1], info))
|
|
136
206
|
except Exception as e:
|
|
137
207
|
console.print(f"[red]Error analyzing model '{model_path}': {e}[/red]")
|
|
@@ -143,12 +213,22 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|
|
143
213
|
file_path = args.file[0]
|
|
144
214
|
|
|
145
215
|
try:
|
|
146
|
-
info = analyze_model(
|
|
216
|
+
info = analyze_model(
|
|
217
|
+
file_path,
|
|
218
|
+
args.context,
|
|
219
|
+
gpu_count,
|
|
220
|
+
fetch_tensors=args.tensors,
|
|
221
|
+
topology=args.topology,
|
|
222
|
+
strategy=args.strategy,
|
|
223
|
+
is_vllm=args.vllm,
|
|
224
|
+
gpu_vram_gb=gpu_vram_gb if gpu_vram_gb else 0.0,
|
|
225
|
+
gpu_util=args.gpu_util
|
|
226
|
+
)
|
|
147
227
|
except Exception as e:
|
|
148
228
|
console.print(f"[red]Error: {e}[/red]")
|
|
149
229
|
return 1
|
|
150
230
|
|
|
151
|
-
print_model_info(**info, max_vram_gb=
|
|
231
|
+
print_model_info(**info, max_vram_gb=gpu_vram_gb if gpu_vram_gb else 8.0, gpu_name=gpu_name_display)
|
|
152
232
|
return 0
|
|
153
233
|
|
|
154
234
|
|
|
@@ -1,6 +1,14 @@
|
|
|
1
1
|
import struct
|
|
2
2
|
from typing import Any, Dict
|
|
3
3
|
|
|
4
|
+
GGML_TYPE_MAP = {
|
|
5
|
+
0: "F32", 1: "F16", 2: "Q4_0", 3: "Q4_1", 4: "Q4_1_O", 5: "Q4_0_O",
|
|
6
|
+
6: "Q5_0", 7: "Q5_1", 8: "Q8_0", 9: "Q8_1", 10: "Q2_K", 11: "Q3_K",
|
|
7
|
+
12: "Q4_K", 13: "Q5_K", 14: "Q6_K", 15: "Q8_K", 16: "IQ2_XXS", 17: "IQ2_XS",
|
|
8
|
+
18: "IQ3_XXS", 19: "IQ1_S", 20: "IQ4_NL", 21: "IQ3_S", 22: "IQ2_S",
|
|
9
|
+
23: "IQ4_XS", 24: "I8", 25: "I16", 26: "I32", 27: "I64", 28: "F64",
|
|
10
|
+
29: "IQ1_M", 30: "BF16", 31: "Q4_0_4_4", 32: "Q4_0_4_8", 33: "Q4_0_8_8",
|
|
11
|
+
}
|
|
4
12
|
|
|
5
13
|
def _read_gguf_value(f: Any, val_type: int) -> Any:
|
|
6
14
|
if val_type == 0:
|
|
@@ -73,12 +81,8 @@ def parse_gguf_header(path: str) -> Dict[str, Any]:
|
|
|
73
81
|
t_type = struct.unpack("<I", f.read(4))[0]
|
|
74
82
|
f.read(8) # skip offset bytes
|
|
75
83
|
|
|
76
|
-
#
|
|
77
|
-
dtype = "
|
|
78
|
-
if t_type == 1:
|
|
79
|
-
dtype = "F16"
|
|
80
|
-
elif t_type > 1:
|
|
81
|
-
dtype = "Q4" # Generic placeholder for quantized types
|
|
84
|
+
# Strict GGUF tensor type mapping
|
|
85
|
+
dtype = GGML_TYPE_MAP.get(t_type, "Unknown")
|
|
82
86
|
|
|
83
87
|
tensors[name] = {"shape": shape, "dtype": dtype}
|
|
84
88
|
|
|
@@ -76,7 +76,7 @@ def _fetch_safetensors_header(repo_id: str, filename: str) -> Dict[str, Any]:
|
|
|
76
76
|
|
|
77
77
|
return json.loads(json_bytes)
|
|
78
78
|
|
|
79
|
-
def fetch_huggingface_repo(repo_id: str) -> Tuple[Dict[str, Any], Dict[str, Any] | None, str, float]:
|
|
79
|
+
def fetch_huggingface_repo(repo_id: str, fetch_tensors: bool = False) -> Tuple[Dict[str, Any], Dict[str, Any] | None, str, float]:
|
|
80
80
|
"""
|
|
81
81
|
Fetches the metadata directly from the Hugging Face Hub over the network.
|
|
82
82
|
Returns: (tensors, config, format_name, disk_size)
|
|
@@ -110,31 +110,41 @@ def fetch_huggingface_repo(repo_id: str) -> Tuple[Dict[str, Any], Dict[str, Any]
|
|
|
110
110
|
|
|
111
111
|
total_size = index_data.get("metadata", {}).get("total_size", 0.0)
|
|
112
112
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
113
|
+
if config and not fetch_tensors and total_size > 0:
|
|
114
|
+
# Lazy Fetch Paradigm
|
|
115
|
+
for tensor_name in weight_map.keys():
|
|
116
|
+
tensors[tensor_name] = {"shape": [], "dtype": "BF16"}
|
|
117
|
+
|
|
118
|
+
tensors["__metadata__"] = {
|
|
119
|
+
"missing_shards": 0,
|
|
120
|
+
"total_shards": len(unique_shards),
|
|
121
|
+
"is_sharded": True,
|
|
122
|
+
"lazy_fetch": True,
|
|
123
|
+
"total_size": total_size
|
|
124
|
+
}
|
|
125
|
+
else:
|
|
126
|
+
def fetch_shard(shard: str):
|
|
127
|
+
return shard, _fetch_safetensors_header(repo_id, shard)
|
|
128
|
+
|
|
129
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=min(8, len(unique_shards))) as executor:
|
|
130
|
+
future_to_shard = {executor.submit(fetch_shard, shard): shard for shard in unique_shards}
|
|
131
|
+
for future in concurrent.futures.as_completed(future_to_shard):
|
|
132
|
+
shard, shard_header = future.result()
|
|
133
|
+
for k, v in shard_header.items():
|
|
134
|
+
if k != "__metadata__":
|
|
135
|
+
tensors[k] = v
|
|
136
|
+
|
|
137
|
+
tensors["__metadata__"] = {
|
|
138
|
+
"missing_shards": 0,
|
|
139
|
+
"total_shards": len(unique_shards),
|
|
140
|
+
"is_sharded": True
|
|
141
|
+
}
|
|
129
142
|
format_name = "SafeTensors"
|
|
130
143
|
|
|
131
144
|
elif "model.safetensors" in filenames:
|
|
132
145
|
# Single SafeTensors
|
|
133
|
-
header = _fetch_safetensors_header(repo_id, "model.safetensors")
|
|
134
|
-
tensors = header
|
|
135
|
-
format_name = "SafeTensors"
|
|
136
146
|
|
|
137
|
-
#
|
|
147
|
+
# Determine total size first
|
|
138
148
|
req = urllib.request.Request(f"https://huggingface.co/{repo_id}/resolve/main/model.safetensors", method="HEAD")
|
|
139
149
|
token = _get_hf_token()
|
|
140
150
|
if token:
|
|
@@ -144,7 +154,12 @@ def fetch_huggingface_repo(repo_id: str) -> Tuple[Dict[str, Any], Dict[str, Any]
|
|
|
144
154
|
total_size = int(response.headers.get("Content-Length", 0))
|
|
145
155
|
except Exception:
|
|
146
156
|
pass
|
|
157
|
+
|
|
158
|
+
header = _fetch_safetensors_header(repo_id, "model.safetensors")
|
|
159
|
+
tensors = header
|
|
147
160
|
|
|
161
|
+
format_name = "SafeTensors"
|
|
162
|
+
|
|
148
163
|
else:
|
|
149
164
|
raise ValueError(f"Repository {repo_id} does not contain SafeTensors weights.")
|
|
150
165
|
|
|
@@ -47,7 +47,14 @@ def print_model_info(
|
|
|
47
47
|
tensors: Dict[str, Any],
|
|
48
48
|
max_context: int | None = None,
|
|
49
49
|
max_vram_gb: float = 8.0,
|
|
50
|
-
gpu_name: str | None = None
|
|
50
|
+
gpu_name: str | None = None,
|
|
51
|
+
is_lazy: bool = False,
|
|
52
|
+
gpu_count: int = 1,
|
|
53
|
+
topology: str = "pcie4",
|
|
54
|
+
strategy: str = "tp",
|
|
55
|
+
is_vllm: bool = False,
|
|
56
|
+
gpu_vram_gb: float = 0.0,
|
|
57
|
+
gpu_util: float = 0.9
|
|
51
58
|
) -> None:
|
|
52
59
|
summary = Table(box=None, show_header=False, pad_edge=False, padding=(0, 2))
|
|
53
60
|
summary.add_column("Property", style="bold")
|
|
@@ -92,7 +99,11 @@ def print_model_info(
|
|
|
92
99
|
vram_display += f" ├─ KV Cache: {format_bytes(kv_cache_bytes)}{kv_note}\n"
|
|
93
100
|
|
|
94
101
|
overhead_bytes = footprint.get("overhead_bytes", 600 * 1024 * 1024)
|
|
95
|
-
|
|
102
|
+
if gpu_count > 1:
|
|
103
|
+
penalty_str = f"TP/{topology}" if strategy == "tp" else "PP"
|
|
104
|
+
vram_display += f" └─ Overhead: {format_bytes(overhead_bytes)} (CUDA Contexts + {penalty_str} Penalty)"
|
|
105
|
+
else:
|
|
106
|
+
vram_display += f" └─ Overhead: {format_bytes(overhead_bytes)} (CUDA Context + Activations)"
|
|
96
107
|
|
|
97
108
|
summary.add_row("Format:", format_name)
|
|
98
109
|
summary.add_row("Architecture:", arch_name)
|
|
@@ -100,17 +111,45 @@ def print_model_info(
|
|
|
100
111
|
summary.add_row("Parameters:", param_text)
|
|
101
112
|
summary.add_row("Dtype:", footprint["primary_dtype"])
|
|
102
113
|
summary.add_row("Disk size:", disk_text)
|
|
103
|
-
summary.add_row("VRAM (est):", vram_display)
|
|
104
114
|
|
|
105
|
-
if
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
115
|
+
if is_vllm:
|
|
116
|
+
vllm = footprint.get("vllm_metrics", {})
|
|
117
|
+
usable_vram = vllm.get("usable_vram", 0.0)
|
|
118
|
+
static_weights = vllm.get("static_weights", 0.0)
|
|
119
|
+
distributed_penalty = vllm.get("distributed_penalty", 0.0)
|
|
120
|
+
paged_kv_pool = vllm.get("paged_kv_pool", 0.0)
|
|
121
|
+
max_capacity = vllm.get("max_serving_capacity", 0)
|
|
122
|
+
|
|
123
|
+
summary.add_row("VRAM Ceiling:", f"{max_vram_gb:.1f} GB ({gpu_name if gpu_name else 'Target'})")
|
|
124
|
+
|
|
125
|
+
alloc_display = f" ├─ Usable VRAM: {format_bytes(usable_vram)} ({int(gpu_util*100)}% gpu_memory_utilization)\n"
|
|
126
|
+
alloc_display += f" ├─ Static Weights: -{format_bytes(static_weights)} ({footprint.get('primary_dtype', 'BF16')})\n"
|
|
127
|
+
if gpu_count > 1:
|
|
128
|
+
penalty_str = f"TP/{topology}" if strategy == "tp" else "PP"
|
|
129
|
+
alloc_display += f" ├─ {penalty_str} Penalty: -{format_bytes(distributed_penalty)}\n"
|
|
130
|
+
alloc_display += f" └─ Paged KV Pool: = {format_bytes(paged_kv_pool)} Available for Context"
|
|
131
|
+
|
|
132
|
+
summary.add_row("vLLM Allocation:", alloc_display)
|
|
133
|
+
summary.add_row("Max Capacity:", f"~{max_capacity:,} Tokens (Across all concurrent batches)")
|
|
134
|
+
|
|
135
|
+
if paged_kv_pool <= 0:
|
|
136
|
+
summary.add_row("Hardware Fit:", "[red]✗ No (OOM before serving any tokens)[/red]")
|
|
111
137
|
else:
|
|
112
|
-
|
|
113
|
-
|
|
138
|
+
summary.add_row("Hardware Fit:", "[green]✓ Yes[/green]")
|
|
139
|
+
|
|
140
|
+
if gpu_count > 1:
|
|
141
|
+
summary.add_row("", "[dim]*Note: Max capacity assumes perfect load balancing. Real capacity is bottlenecked by the most memory-constrained GPU in the array.[/dim]")
|
|
142
|
+
else:
|
|
143
|
+
summary.add_row("VRAM (est):", vram_display)
|
|
144
|
+
if gpu_name:
|
|
145
|
+
utilization = vram_bytes / (max_vram_gb * 1024**3) if max_vram_gb > 0 else 2.0
|
|
146
|
+
if utilization <= 0.90:
|
|
147
|
+
fit_text = f"[green]✓ Fits comfortably in {gpu_name} ({max_vram_gb:.1f} GB)[/green]"
|
|
148
|
+
elif utilization <= 0.99:
|
|
149
|
+
fit_text = f"[yellow]⚠ Warning: Extreme hardware limit on {gpu_name}. High risk of fragmentation OOM.[/yellow]"
|
|
150
|
+
else:
|
|
151
|
+
fit_text = f"[red]✗ No (Requires {format_bytes(vram_bytes)}, Hardware has {max_vram_gb:.1f} GB)[/red]"
|
|
152
|
+
summary.add_row("Hardware Fit:", fit_text)
|
|
114
153
|
|
|
115
154
|
console.print(summary)
|
|
116
155
|
|
|
@@ -122,6 +161,10 @@ def print_model_info(
|
|
|
122
161
|
|
|
123
162
|
console.print()
|
|
124
163
|
|
|
164
|
+
if is_lazy:
|
|
165
|
+
console.print("[yellow]Top Tensors omitted for speed. Run with --tensors to fetch remote shards.[/yellow]")
|
|
166
|
+
return
|
|
167
|
+
|
|
125
168
|
console.print("Top Tensors by Size:", style="bold")
|
|
126
169
|
|
|
127
170
|
grouped_tensors = group_tensors_by_size(tensors)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: modelinfo-cli
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.4.0
|
|
4
4
|
Summary: A sub-100ms, zero-dependency CLI to inspect ML models (.safetensors, .gguf) locally or via Hugging Face, calculate exact VRAM footprints, and determine hardware fit.
|
|
5
5
|
Author: ModelInfo Contributors
|
|
6
6
|
License: MIT
|
|
@@ -22,25 +22,27 @@ Dynamic: license-file
|
|
|
22
22
|

|
|
23
23
|

|
|
24
24
|
|
|
25
|
-
ModelInfo
|
|
25
|
+

|
|
26
|
+
|
|
27
|
+
ModelInfo is a CLI tool that inspects machine learning model checkpoints (`.safetensors`, `.gguf`, `.pt`) and calculates hardware requirements from header metadata alone, without loading the weights. Local files are inspected completely offline.
|
|
26
28
|
|
|
27
29
|
It reads binary headers directly using the Python standard library. By bypassing full tensor payload loading and strictly excluding heavy ecosystems like PyTorch or the Hugging Face libraries, the tool parses local files in under 100 milliseconds.
|
|
28
30
|
|
|
29
31
|
## Features
|
|
30
32
|
|
|
31
|
-
- **Zero-Dependency Parsing**: Reads
|
|
32
|
-
- **Remote Hugging Face Hub Inspection**:
|
|
33
|
-
-
|
|
34
|
-
- **Dynamic VRAM
|
|
35
|
-
- **Hardware Fit Diagnostics**:
|
|
36
|
-
- **Side-by-Side Comparison**: Pass multiple models to
|
|
37
|
-
-
|
|
38
|
-
- **Secure Pickling**: Inspects legacy `.pt` files
|
|
39
|
-
-
|
|
33
|
+
- **Zero-Dependency Parsing**: Reads `.safetensors` 8-byte JSON prefixes and `.gguf` binary key-value metadata directly via `struct` and `json` (falling back to `config.json` if needed).
|
|
34
|
+
- **Remote Hugging Face Hub Inspection**: Pass a repo ID (e.g., `meta-llama/Llama-2-7b-hf`) and it uses concurrent byte-range requests to read the headers off the CDN, typically in 1 to 10 seconds. No need to download the checkpoint.
|
|
35
|
+
- Parses `model.safetensors.index.json` to support sharded models without crashing on partial downloads.
|
|
36
|
+
- **Dynamic VRAM & Subtractive vLLM Math**: Calculates exact VRAM limits based on the model's architecture and your target context length. If you use the `--vllm` flag, it switches to a subtractive "Serving Capacity" engine that calculates exactly how many tokens fit in the PagedAttention pool based on your `--gpu-util` ratio.
|
|
37
|
+
- **Hardware Fit Diagnostics**: Check if a model fits your cluster with `--gpu` (e.g. `--gpu RTX4090` or `--gpu auto`). It enforces Apple Silicon's 75% unified memory wire limit, and you can explicitly model multi-GPU NCCL communication penalties with `--topology` and `--strategy`.
|
|
38
|
+
- **Side-by-Side Comparison**: Pass multiple models to trigger a comparison table (parameters, data types, context lengths, VRAM footprints).
|
|
39
|
+
- Uses exact `ggml_type` mappings for GGUF formats to calculate byte-scaling coefficients, preventing VRAM under-reporting.
|
|
40
|
+
- **Secure Pickling**: Inspects legacy `.pt` files safely using a restricted `pickle.Unpickler`.
|
|
41
|
+
- The UI (built with `rich`) groups repetitive layers and color-codes VRAM heatmaps.
|
|
40
42
|
|
|
41
43
|
> [!NOTE]
|
|
42
44
|
> **A Note on Performance & Remote Fetching**
|
|
43
|
-
> Local `.gguf` and `.safetensors` files are parsed in under 100ms. However, querying remote Hugging Face repositories takes **1 to 10 seconds**.
|
|
45
|
+
> Local `.gguf` and `.safetensors` files are parsed in under 100ms. However, querying remote Hugging Face repositories takes **1 to 10 seconds**. To remain zero-dependency, `modelinfo` makes its HTTP requests with Python's built-in `urllib` rather than pulling in heavier libraries such as PyTorch. For massive sharded models (e.g., 100+ shards), when exact tensor shapes are needed (for example with `--tensors`), it must fetch every shard header individually, capped at an 8-worker thread pool to prevent Cloudflare IP bans. Waiting ~8 seconds to map a model is faster than downloading 400GB just to see if it fits your hardware.
|
|
44
46
|
|
|
45
47
|
## Installation
|
|
46
48
|
|
|
@@ -120,6 +122,12 @@ Compare multiple models side-by-side against a hardware target:
|
|
|
120
122
|
modelinfo mistralai/Mistral-7B-v0.1 Qwen/Qwen2.5-0.5B --gpu 12
|
|
121
123
|
```
|
|
122
124
|
|
|
125
|
+
Simulate exactly how many tokens you can serve using vLLM on a specific multi-GPU topology:
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
modelinfo mistralai/Mistral-7B-v0.1 --vllm --gpu 4xRTX4090 --topology pcie4 --strategy tp
|
|
129
|
+
```
|
|
130
|
+
|
|
123
131
|
### Example Output (Single Model)
|
|
124
132
|
|
|
125
133
|
```text
|
|
@@ -157,13 +165,18 @@ Qwen2.5-0.5B 494.0M BF16 8K 1.6 GB ✓
|
|
|
157
165
|
| `--gpu` | `--gpu rtx4090` | Check if the model fits. Accepts GPU names (`rtx4090`, `b200`, `rx7900xtx`), explicit VRAM limits in GB (`--gpu 24`), or local hardware auto-discovery (`--gpu auto`). |
|
|
158
166
|
| `--context` | `--context 32768` | Adjust the target KV cache length. Essential for calculating the dynamic memory footprint of long-context models. Defaults to `8192`. |
|
|
159
167
|
| `--max-vram` | `--max-vram 80` | Adjusts the color-coded heat mapping thresholds (Green/Yellow/Red) in the terminal output to match a specific hardware ceiling. |
|
|
168
|
+
| `--vllm` | `--vllm --gpu auto` | Switches from additive memory checking to a subtractive serving capacity estimation. Shows exactly how many tokens fit in the PagedAttention pool. |
|
|
169
|
+
| `--gpu-util` | `--gpu-util 0.9` | Sets the vLLM `gpu_memory_utilization` ratio. Defaults to `0.9` (reserves 10% for PyTorch context). |
|
|
170
|
+
| `--topology` | `--topology nvlink` | Set interconnect topology to calculate exact communication overhead penalties (`nvlink`, `pcie4`, `pcie3`). Defaults to `pcie4`. |
|
|
171
|
+
| `--strategy` | `--strategy tp` | Selects the parallelization strategy for multi-GPU setups (`tp` for Tensor Parallelism, `pp` for Pipeline Parallelism). Defaults to `tp`. |
|
|
172
|
+
| `--tensors` | `--tensors` | Disables the fast lazy estimate used for sharded remote repositories and forces the tool to fetch the header of every remote shard (not the weights), displaying an exact size breakdown of every tensor. |
|
|
160
173
|
|
|
161
174
|
## Architecture
|
|
162
175
|
|
|
163
|
-
|
|
176
|
+
Three modules:
|
|
164
177
|
|
|
165
178
|
1. **Presentation (`cli.py`, `ui.py`)**: Parses arguments and formats tables via `rich`.
|
|
166
|
-
2. **Parsing Engine (`parsers/`)**: Specialized binary readers (`safetensors.py`, `gguf.py`, `pytorch.py`)
|
|
179
|
+
2. **Parsing Engine (`parsers/`)**: Specialized binary readers (`safetensors.py`, `gguf.py`, `pytorch.py`) that use only the standard library.
|
|
167
180
|
3. **Math Engine (`calculator.py`)**: Determines total parameter counts, maps data types to byte coefficients, and calculates dynamic memory allocations based on tensor shape heuristics.
|
|
168
181
|
|
|
169
182
|
## License
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import math
|
|
1
2
|
from modelinfo.calculator import calculate_footprint, _get_bytes_per_param
|
|
2
3
|
|
|
3
4
|
def test_quantization_byte_multipliers():
|
|
@@ -110,3 +111,61 @@ def test_framework_overhead_included():
|
|
|
110
111
|
assert "overhead_bytes" in footprint
|
|
111
112
|
assert footprint["overhead_bytes"] == 600 * 1024 * 1024
|
|
112
113
|
assert footprint["total_memory_bytes"] == footprint["base_memory_bytes"] + footprint["kv_cache_bytes"] + footprint["overhead_bytes"]
|
|
114
|
+
|
|
115
|
+
def test_explicit_gguf_quantization_byte_multipliers():
|
|
116
|
+
"""Verify that explicit ggml_type enums are exactly mapped."""
|
|
117
|
+
assert _get_bytes_per_param("Q8_0") == 1.0625
|
|
118
|
+
assert _get_bytes_per_param("Q4_K") == 0.59375
|
|
119
|
+
assert _get_bytes_per_param("IQ2_XXS") == 0.28125
|
|
120
|
+
assert _get_bytes_per_param("F8_E5M2") == 1.0
|
|
121
|
+
|
|
122
|
+
def test_topology_penalties():
|
|
123
|
+
"""Verify multi-GPU distributed overhead logic."""
|
|
124
|
+
tensors = {
|
|
125
|
+
"model.layers.0.attn.weight": {"shape": [1024, 1024], "dtype": "F16"} # Base: 2,097,152 bytes
|
|
126
|
+
}
|
|
127
|
+
# NVLink (4%)
|
|
128
|
+
fp_nvlink = calculate_footprint(tensors, gpu_count=2, topology="nvlink", strategy="tp")
|
|
129
|
+
assert fp_nvlink["penalty_percentage"] == 0.04
|
|
130
|
+
assert fp_nvlink["overhead_bytes"] == (2 * 600 * 1024 * 1024) + (2097152 * 0.04)
|
|
131
|
+
|
|
132
|
+
# PCIe3 (20%)
|
|
133
|
+
fp_pcie3 = calculate_footprint(tensors, gpu_count=4, topology="pcie3", strategy="tp")
|
|
134
|
+
assert fp_pcie3["penalty_percentage"] == 0.20
|
|
135
|
+
assert fp_pcie3["overhead_bytes"] == (4 * 600 * 1024 * 1024) + (2097152 * 0.20)
|
|
136
|
+
|
|
137
|
+
def test_strategy_pp():
|
|
138
|
+
"""Verify Pipeline Parallelism incurs 0 distributed overhead."""
|
|
139
|
+
tensors = {
|
|
140
|
+
"model.layers.0.attn.weight": {"shape": [1024, 1024], "dtype": "F16"}
|
|
141
|
+
}
|
|
142
|
+
fp_pp = calculate_footprint(tensors, gpu_count=4, topology="pcie3", strategy="pp")
|
|
143
|
+
assert fp_pp["penalty_percentage"] == 0.0
|
|
144
|
+
assert fp_pp["overhead_bytes"] == (4 * 600 * 1024 * 1024)
|
|
145
|
+
|
|
146
|
+
def test_vllm_subtractive_math():
|
|
147
|
+
"""Verify the subtractive vLLM serving capacity engine calculates exact tokens."""
|
|
148
|
+
tensors = {
|
|
149
|
+
"model.layers.0.attn.weight": {"shape": [1024, 1024], "dtype": "F16"} # Base: 2MB
|
|
150
|
+
}
|
|
151
|
+
config = {
|
|
152
|
+
"num_hidden_layers": 10,
|
|
153
|
+
"num_attention_heads": 8,
|
|
154
|
+
"num_key_value_heads": 8,
|
|
155
|
+
"hidden_size": 1024
|
|
156
|
+
}
|
|
157
|
+
# 24GB VRAM. 90% util = 21.6GB. Base weights = 2MB. Remaining = ~21.59GB.
|
|
158
|
+
# Bytes per token: 2 (FP16) * 10 (layers) * 1024 (kv_dim) * 2 = 40960 bytes
|
|
159
|
+
gpu_vram = 24.0 * 1024**3
|
|
160
|
+
|
|
161
|
+
fp_vllm = calculate_footprint(tensors, config=config, is_vllm=True, gpu_vram_bytes=gpu_vram, gpu_util=0.9, gpu_count=1)
|
|
162
|
+
|
|
163
|
+
metrics = fp_vllm["vllm_metrics"]
|
|
164
|
+
assert "usable_vram" in metrics
|
|
165
|
+
assert metrics["usable_vram"] == gpu_vram * 0.9
|
|
166
|
+
assert metrics["static_weights"] == 2097152
|
|
167
|
+
assert metrics["paged_kv_pool"] == metrics["usable_vram"] - metrics["static_weights"]
|
|
168
|
+
|
|
169
|
+
bytes_per_token = 40960
|
|
170
|
+
expected_capacity = math.floor(metrics["paged_kv_pool"] / bytes_per_token)
|
|
171
|
+
assert metrics["max_serving_capacity"] == expected_capacity
|
|
@@ -1,98 +0,0 @@
|
|
|
1
|
-
import math
|
|
2
|
-
from typing import Any, Dict
|
|
3
|
-
|
|
4
|
-
from modelinfo.architecture import extract_architecture
|
|
5
|
-
|
|
6
|
-
DTYPE_BYTES = {
|
|
7
|
-
"F64": 8,
|
|
8
|
-
"F32": 4,
|
|
9
|
-
"F16": 2,
|
|
10
|
-
"BF16": 2,
|
|
11
|
-
"F8": 1,
|
|
12
|
-
"F8_E5M2": 1,
|
|
13
|
-
"F8_E4M3": 1,
|
|
14
|
-
"I64": 8,
|
|
15
|
-
"I32": 4,
|
|
16
|
-
"I16": 2,
|
|
17
|
-
"I8": 1,
|
|
18
|
-
"U64": 8,
|
|
19
|
-
"U32": 4,
|
|
20
|
-
"Q8": 1.06,
|
|
21
|
-
"Q6": 0.82,
|
|
22
|
-
"Q5": 0.68,
|
|
23
|
-
"Q4": 0.58,
|
|
24
|
-
"Q3": 0.43,
|
|
25
|
-
"Q2": 0.28,
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
def _get_bytes_per_param(dtype: str) -> float:
|
|
29
|
-
"""Return the size in bytes for a given data type."""
|
|
30
|
-
return DTYPE_BYTES.get(dtype.upper(), 2.0)
|
|
31
|
-
|
|
32
|
-
def calculate_footprint(tensors: Dict[str, Any], context_length: int = 0, batch_size: int = 1, config: Dict[str, Any] = None, gpu_count: int = 1) -> Dict[str, Any]:
|
|
33
|
-
"""
|
|
34
|
-
Calculate the memory footprint of a model based on its tensors and context length.
|
|
35
|
-
"""
|
|
36
|
-
total_params = 0
|
|
37
|
-
base_memory_bytes = 0.0
|
|
38
|
-
dtype_counts: Dict[str, int] = {}
|
|
39
|
-
|
|
40
|
-
for name, metadata in tensors.items():
|
|
41
|
-
if name == "__metadata__":
|
|
42
|
-
continue
|
|
43
|
-
|
|
44
|
-
shape = metadata.get("shape", [])
|
|
45
|
-
if not shape:
|
|
46
|
-
continue
|
|
47
|
-
|
|
48
|
-
param_count = math.prod(shape)
|
|
49
|
-
total_params += param_count
|
|
50
|
-
|
|
51
|
-
dtype = metadata.get("dtype", "F16").upper()
|
|
52
|
-
dtype_counts[dtype] = dtype_counts.get(dtype, 0) + 1
|
|
53
|
-
|
|
54
|
-
bytes_per_param = _get_bytes_per_param(dtype)
|
|
55
|
-
base_memory_bytes += param_count * bytes_per_param
|
|
56
|
-
|
|
57
|
-
num_layers, kv_dim, is_estimate = extract_architecture(tensors, config)
|
|
58
|
-
|
|
59
|
-
# Formula: 2 * Layers * (KV_Heads * Head_Dim) * Context_Length * Batch_Size * Bytes_per_param
|
|
60
|
-
# Assume FP16 (2 bytes) for KV cache
|
|
61
|
-
kv_cache_bytes = 2 * num_layers * kv_dim * context_length * batch_size * 2
|
|
62
|
-
|
|
63
|
-
primary_dtype = max(dtype_counts.items(), key=lambda x: x[1])[0] if dtype_counts else "Unknown"
|
|
64
|
-
|
|
65
|
-
CUDA_CONTEXT_MB = 600 * gpu_count
|
|
66
|
-
overhead_bytes = CUDA_CONTEXT_MB * 1024 * 1024
|
|
67
|
-
|
|
68
|
-
return {
|
|
69
|
-
"total_params": total_params,
|
|
70
|
-
"base_memory_bytes": base_memory_bytes,
|
|
71
|
-
"kv_cache_bytes": kv_cache_bytes,
|
|
72
|
-
"overhead_bytes": overhead_bytes,
|
|
73
|
-
"total_memory_bytes": base_memory_bytes + kv_cache_bytes + overhead_bytes,
|
|
74
|
-
"num_layers": num_layers,
|
|
75
|
-
"kv_dim": kv_dim,
|
|
76
|
-
"primary_dtype": primary_dtype,
|
|
77
|
-
"kv_is_estimate": is_estimate
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
def format_bytes(size_bytes: float) -> str:
|
|
81
|
-
"""Format bytes into a human-readable string (e.g. GB)."""
|
|
82
|
-
if size_bytes == 0:
|
|
83
|
-
return "0 B"
|
|
84
|
-
units = ["B", "KB", "MB", "GB", "TB", "PB"]
|
|
85
|
-
i = max(0, min(len(units) - 1, math.floor(math.log(size_bytes, 1024))))
|
|
86
|
-
p = math.pow(1024, i)
|
|
87
|
-
s = round(size_bytes / p, 2)
|
|
88
|
-
return f"{s} {units[i]}"
|
|
89
|
-
|
|
90
|
-
def format_params(count: int) -> str:
|
|
91
|
-
"""Format parameter count into a human-readable string (e.g. 7.2B)."""
|
|
92
|
-
if count >= 1_000_000_000:
|
|
93
|
-
return f"{count:,} ({count / 1_000_000_000:.1f}B)"
|
|
94
|
-
elif count >= 1_000_000:
|
|
95
|
-
return f"{count:,} ({count / 1_000_000:.1f}M)"
|
|
96
|
-
elif count >= 1_000:
|
|
97
|
-
return f"{count:,} ({count / 1_000:.1f}K)"
|
|
98
|
-
return f"{count:,}"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|