modelinfo-cli 1.2.0__tar.gz → 1.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. modelinfo_cli-1.4.0/PKG-INFO +184 -0
  2. modelinfo_cli-1.4.0/README.md +166 -0
  3. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/pyproject.toml +3 -3
  4. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/src/modelinfo/__init__.py +1 -1
  5. modelinfo_cli-1.4.0/src/modelinfo/calculator.py +178 -0
  6. modelinfo_cli-1.4.0/src/modelinfo/cli.py +236 -0
  7. modelinfo_cli-1.4.0/src/modelinfo/hardware.py +231 -0
  8. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/src/modelinfo/parsers/gguf.py +10 -6
  9. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/src/modelinfo/parsers/huggingface.py +38 -23
  10. modelinfo_cli-1.4.0/src/modelinfo/ui.py +241 -0
  11. modelinfo_cli-1.4.0/src/modelinfo_cli.egg-info/PKG-INFO +184 -0
  12. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/src/modelinfo_cli.egg-info/SOURCES.txt +1 -0
  13. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/tests/test_calculator.py +59 -0
  14. modelinfo_cli-1.2.0/PKG-INFO +0 -115
  15. modelinfo_cli-1.2.0/README.md +0 -97
  16. modelinfo_cli-1.2.0/src/modelinfo/calculator.py +0 -98
  17. modelinfo_cli-1.2.0/src/modelinfo/cli.py +0 -116
  18. modelinfo_cli-1.2.0/src/modelinfo/ui.py +0 -139
  19. modelinfo_cli-1.2.0/src/modelinfo_cli.egg-info/PKG-INFO +0 -115
  20. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/LICENSE +0 -0
  21. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/setup.cfg +0 -0
  22. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/src/modelinfo/__main__.py +0 -0
  23. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/src/modelinfo/architecture.py +0 -0
  24. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/src/modelinfo/parsers/__init__.py +0 -0
  25. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/src/modelinfo/parsers/base.py +0 -0
  26. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/src/modelinfo/parsers/pytorch.py +0 -0
  27. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/src/modelinfo/parsers/safetensors.py +0 -0
  28. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/src/modelinfo_cli.egg-info/dependency_links.txt +0 -0
  29. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/src/modelinfo_cli.egg-info/entry_points.txt +0 -0
  30. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/src/modelinfo_cli.egg-info/requires.txt +0 -0
  31. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/src/modelinfo_cli.egg-info/top_level.txt +0 -0
  32. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/tests/test_constraints.py +0 -0
  33. {modelinfo_cli-1.2.0 → modelinfo_cli-1.4.0}/tests/test_parsers.py +0 -0
@@ -0,0 +1,184 @@
1
+ Metadata-Version: 2.4
2
+ Name: modelinfo-cli
3
+ Version: 1.4.0
4
+ Summary: A sub-100ms, zero-dependency CLI to inspect ML models (.safetensors, .gguf) locally or via Hugging Face, calculate exact VRAM footprints, and determine hardware fit.
5
+ Author: ModelInfo Contributors
6
+ License: MIT
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: rich>=13.0.0
11
+ Provides-Extra: dev
12
+ Requires-Dist: build; extra == "dev"
13
+ Requires-Dist: twine; extra == "dev"
14
+ Requires-Dist: pytest; extra == "dev"
15
+ Requires-Dist: pytest-cov; extra == "dev"
16
+ Requires-Dist: ruff; extra == "dev"
17
+ Dynamic: license-file
18
+
19
+ # ModelInfo CLI
20
+
21
+ ![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)
22
+ ![Dependencies](https://img.shields.io/badge/dependencies-rich-green.svg)
23
+ ![License](https://img.shields.io/badge/license-MIT-blue.svg)
24
+
25
+ ![ModelInfo Demo](modelinfo.gif)
26
+
27
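+ ModelInfo is a CLI tool that inspects machine learning model checkpoints (`.safetensors`, `.gguf`, `.pt`) and calculates their hardware requirements. Local files are analyzed completely offline; Hugging Face repositories are inspected by fetching only their headers over the network.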
28
+
29
+ It reads binary headers directly using the Python standard library. By bypassing full tensor payload loading and strictly excluding heavy ecosystems like PyTorch or HuggingFace, the tool executes in under 100 milliseconds.
30
+
31
+ ## Features
32
+
33
+ - **Zero-Dependency Parsing**: Reads `.safetensors` 8-byte JSON prefixes and `.gguf` binary key-value metadata directly via `struct` and `json` (falling back to `config.json` if needed).
34
+ - **Remote Hugging Face Hub Inspection**: Pass a repo ID (e.g., `meta-llama/Llama-2-7b-hf`) and it uses concurrent byte-range requests to read only the headers off the CDN, typically within a few seconds (see the performance note below). No need to download the checkpoint.
35
+ - Parses `model.safetensors.index.json` to support sharded models without crashing on partial downloads.
36
+ - **Dynamic VRAM & Subtractive vLLM Math**: Calculates exact VRAM limits based on the model's architecture and your target context length. If you use the `--vllm` flag, it switches to a subtractive "Serving Capacity" engine that calculates exactly how many tokens fit in the PagedAttention pool based on your `--gpu-util` ratio.
37
+ - **Hardware Fit Diagnostics**: Check if a model fits your cluster with `--gpu` (e.g. `--gpu RTX4090` or `--gpu auto`). It enforces Apple Silicon's 75% unified memory wire limit, and you can explicitly model multi-GPU NCCL communication penalties with `--topology` and `--strategy`.
38
+ - **Side-by-Side Comparison**: Pass multiple models to trigger a comparison table (parameters, data types, context lengths, VRAM footprints).
39
+ - Uses exact `ggml_type` mappings for GGUF formats to calculate byte-scaling coefficients, preventing VRAM under-reporting.
40
+ - **Secure Pickling**: Inspects legacy `.pt` files safely using a restricted `pickle.Unpickler`.
41
+ - The UI (built with `rich`) groups repetitive layers and color-codes VRAM heatmaps.
42
+
43
+ > [!NOTE]
44
+ > **A Note on Performance & Remote Fetching**
45
+ > Local `.gguf` and `.safetensors` files are parsed in under 100ms. However, querying remote Hugging Face repositories takes **1 to 10 seconds**. To remain zero-dependency, `modelinfo` opens connections via Python `urllib` instead of loading PyTorch. For massive sharded models (e.g., 100+ shards), it must fetch every header individually, capped at an 8-worker thread pool to prevent Cloudflare IP bans. Waiting ~8 seconds to map a model is faster than downloading 400GB just to see if it fits your hardware.
46
+
47
+ ## Installation
48
+
49
+ Install directly from PyPI:
50
+
51
+ ```bash
52
+ pip install modelinfo-cli
53
+ ```
54
+
55
+ ### Development
56
+
57
+ To install from source and run the test suite:
58
+
59
+ ```bash
60
+ git clone https://github.com/pipe1os/modelinfo-cli.git
61
+ cd modelinfo-cli
62
+ python -m venv .venv
63
+ source .venv/bin/activate
64
+ pip install -e ".[dev]"
65
+ ```
66
+
67
+ ## Testing
68
+
69
+ The testing suite enforces cross-platform structural integrity and guards the zero-dependency latency constraint. Tests are isolated against custom binary mocks in `tests/fixtures/`.
70
+
71
+ Run the test suite using pytest:
72
+
73
+ ```bash
74
+ pytest tests/ -v
75
+ ```
76
+
77
+ ## Usage
78
+
79
+ Inspect a local model checkpoint:
80
+
81
+ ```bash
82
+ modelinfo mistral-7b.safetensors
83
+ ```
84
+
85
+ Inspect a remote model directly from the Hugging Face Hub:
86
+
87
+ ```bash
88
+ modelinfo meta-llama/Llama-2-7b-hf
89
+ ```
90
+
91
+ For gated models (e.g., Llama 2), you must provide authentication by setting the `HF_TOKEN` environment variable. You can create a token in your [Hugging Face settings](https://huggingface.co/settings/tokens).
92
+
93
+ ```bash
94
+ export HF_TOKEN="hf_your_token_here"
95
+ modelinfo meta-llama/Llama-2-7b-hf
96
+ ```
97
+
98
+ Alternatively, the tool will automatically read tokens stored by the `hf auth login` command (located in `~/.cache/huggingface/token`).
99
+
100
+ Calculate the memory footprint with a specific KV cache context window:
101
+
102
+ ```bash
103
+ modelinfo mistral-7b.safetensors --context 8192
104
+ ```
105
+
106
+ Adjust the VRAM heat-mapping thresholds for your specific hardware (e.g., an 80GB card):
107
+
108
+ ```bash
109
+ modelinfo meta-llama/Llama-2-7b-hf --max-vram 80
110
+ ```
111
+
112
+ Determine if a model fits your specific hardware:
113
+
114
+ ```bash
115
+ modelinfo mistralai/Mistral-7B-v0.1 --gpu "RTX 4090"
116
+ modelinfo mistralai/Mistral-7B-v0.1 --gpu auto
117
+ ```
118
+
119
+ Compare multiple models side-by-side against a hardware target:
120
+
121
+ ```bash
122
+ modelinfo mistralai/Mistral-7B-v0.1 Qwen/Qwen2.5-0.5B --gpu 12
123
+ ```
124
+
125
+ Simulate exactly how many tokens you can serve using vLLM on a specific multi-GPU topology:
126
+
127
+ ```bash
128
+ modelinfo mistralai/Mistral-7B-v0.1 --vllm --gpu 4xRTX4090 --topology pcie4 --strategy tp
129
+ ```
130
+
131
+ ### Example Output (Single Model)
132
+
133
+ ```text
134
+ Format: SafeTensors
135
+ Architecture: MistralForCausalLM (32 layers)
136
+ Tensors: 291
137
+ Parameters: 7.2B
138
+ Dtype: BF16
139
+ Disk size: 13.49 GB
140
+ VRAM (est): ~15.07 GB Total Minimum Required
141
+ ├─ Weights: 13.49 GB
142
+ ├─ KV Cache: 1.0 GB (Default 8192 tokens. Native limit: 32,768)
143
+ └─ Overhead: 600.0 MB (CUDA Context + Activations)
144
+ Hardware Fit: ✗ No (Requires 15.07 GB, Hardware has 12.0 GB)
145
+
146
+ Top Tensors by Size:
147
+ model.embed_tokens.weight [32000 x 4096] bf16 131.1M params
148
+ 32x model.layers.[N].self_attn.q_proj.weight [4096 x 4096] bf16 16.8M params
149
+ ```
150
+
151
+ ### Example Output (Comparison)
152
+
153
+ ```text
154
+ Model Params Dtype Context VRAM Fits
155
+ Mistral-7B-v0.1 7.2B BF16 8K 15.07 GB ✗
156
+ Qwen2.5-0.5B 494.0M BF16 8K 1.6 GB ✓
157
+ ```
158
+
159
+ ## Command Reference
160
+
161
+ | Argument | Example | Description |
162
+ | :--- | :--- | :--- |
163
+ | `[files...]` | `modelinfo model.safetensors` | Inspect a single model (local path or Hugging Face repo ID). |
164
+ | `[files...]` | `modelinfo modelA modelB` | Pass multiple files/repos to automatically render a side-by-side comparison table instead of a deep-dive summary. |
165
+ | `--gpu` | `--gpu rtx4090` | Check if the model fits. Accepts GPU names (`rtx4090`, `b200`, `rx7900xtx`), explicit VRAM limits in GB (`--gpu 24`), or local hardware auto-discovery (`--gpu auto`). |
166
+ | `--context` | `--context 32768` | Adjust the target KV cache length. Essential for calculating the dynamic memory footprint of long-context models. Defaults to `8192`. |
167
+ | `--max-vram` | `--max-vram 80` | Adjusts the color-coded heat mapping thresholds (Green/Yellow/Red) in the terminal output to match a specific hardware ceiling. |
168
+ | `--vllm` | `--vllm --gpu auto` | Switches from additive memory checking to a subtractive serving capacity estimation. Shows exactly how many tokens fit in the PagedAttention pool. |
169
+ | `--gpu-util` | `--gpu-util 0.9` | Sets the vLLM `gpu_memory_utilization` ratio. Defaults to `0.9` (reserves 10% for PyTorch context). |
170
+ | `--topology` | `--topology nvlink` | Set interconnect topology to calculate exact communication overhead penalties (`nvlink`, `pcie4`, `pcie3`). Defaults to `pcie4`. |
171
+ | `--strategy` | `--strategy tp` | Selects the parallelization strategy for multi-GPU setups (`tp` for Tensor Parallelism, `pp` for Pipeline Parallelism). Defaults to `tp`. |
172
+ | `--tensors` | `--tensors` | Bypasses the algorithmic speed estimation and forces the tool to fetch all remote shards, displaying an exact size breakdown of every tensor. |
173
+
174
+ ## Architecture
175
+
176
+ Three modules:
177
+
178
+ 1. **Presentation (`cli.py`, `ui.py`)**: Parses arguments and formats tables via `rich`.
179
+ 2. **Parsing Engine (`parsers/`)**: Specialized binary readers (`safetensors.py`, `gguf.py`, `pytorch.py`) that use only the standard library.
180
+ 3. **Math Engine (`calculator.py`)**: Determines total parameter counts, maps data types to byte coefficients, and calculates dynamic memory allocations based on tensor shape heuristics.
181
+
182
+ ## License
183
+
184
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,166 @@
1
+ # ModelInfo CLI
2
+
3
+ ![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)
4
+ ![Dependencies](https://img.shields.io/badge/dependencies-rich-green.svg)
5
+ ![License](https://img.shields.io/badge/license-MIT-blue.svg)
6
+
7
+ ![ModelInfo Demo](modelinfo.gif)
8
+
9
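+ ModelInfo is a CLI tool that inspects machine learning model checkpoints (`.safetensors`, `.gguf`, `.pt`) and calculates their hardware requirements. Local files are analyzed completely offline; Hugging Face repositories are inspected by fetching only their headers over the network.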
10
+
11
+ It reads binary headers directly using the Python standard library. By bypassing full tensor payload loading and strictly excluding heavy ecosystems like PyTorch or HuggingFace, the tool executes in under 100 milliseconds.
12
+
13
+ ## Features
14
+
15
+ - **Zero-Dependency Parsing**: Reads `.safetensors` 8-byte JSON prefixes and `.gguf` binary key-value metadata directly via `struct` and `json` (falling back to `config.json` if needed).
16
+ - **Remote Hugging Face Hub Inspection**: Pass a repo ID (e.g., `meta-llama/Llama-2-7b-hf`) and it uses concurrent byte-range requests to read only the headers off the CDN, typically within a few seconds (see the performance note below). No need to download the checkpoint.
17
+ - Parses `model.safetensors.index.json` to support sharded models without crashing on partial downloads.
18
+ - **Dynamic VRAM & Subtractive vLLM Math**: Calculates exact VRAM limits based on the model's architecture and your target context length. If you use the `--vllm` flag, it switches to a subtractive "Serving Capacity" engine that calculates exactly how many tokens fit in the PagedAttention pool based on your `--gpu-util` ratio.
19
+ - **Hardware Fit Diagnostics**: Check if a model fits your cluster with `--gpu` (e.g. `--gpu RTX4090` or `--gpu auto`). It enforces Apple Silicon's 75% unified memory wire limit, and you can explicitly model multi-GPU NCCL communication penalties with `--topology` and `--strategy`.
20
+ - **Side-by-Side Comparison**: Pass multiple models to trigger a comparison table (parameters, data types, context lengths, VRAM footprints).
21
+ - Uses exact `ggml_type` mappings for GGUF formats to calculate byte-scaling coefficients, preventing VRAM under-reporting.
22
+ - **Secure Pickling**: Inspects legacy `.pt` files safely using a restricted `pickle.Unpickler`.
23
+ - The UI (built with `rich`) groups repetitive layers and color-codes VRAM heatmaps.
24
+
25
+ > [!NOTE]
26
+ > **A Note on Performance & Remote Fetching**
27
+ > Local `.gguf` and `.safetensors` files are parsed in under 100ms. However, querying remote Hugging Face repositories takes **1 to 10 seconds**. To remain zero-dependency, `modelinfo` opens connections via Python `urllib` instead of loading PyTorch. For massive sharded models (e.g., 100+ shards), it must fetch every header individually, capped at an 8-worker thread pool to prevent Cloudflare IP bans. Waiting ~8 seconds to map a model is faster than downloading 400GB just to see if it fits your hardware.
28
+
29
+ ## Installation
30
+
31
+ Install directly from PyPI:
32
+
33
+ ```bash
34
+ pip install modelinfo-cli
35
+ ```
36
+
37
+ ### Development
38
+
39
+ To install from source and run the test suite:
40
+
41
+ ```bash
42
+ git clone https://github.com/pipe1os/modelinfo-cli.git
43
+ cd modelinfo-cli
44
+ python -m venv .venv
45
+ source .venv/bin/activate
46
+ pip install -e ".[dev]"
47
+ ```
48
+
49
+ ## Testing
50
+
51
+ The testing suite enforces cross-platform structural integrity and guards the zero-dependency latency constraint. Tests are isolated against custom binary mocks in `tests/fixtures/`.
52
+
53
+ Run the test suite using pytest:
54
+
55
+ ```bash
56
+ pytest tests/ -v
57
+ ```
58
+
59
+ ## Usage
60
+
61
+ Inspect a local model checkpoint:
62
+
63
+ ```bash
64
+ modelinfo mistral-7b.safetensors
65
+ ```
66
+
67
+ Inspect a remote model directly from the Hugging Face Hub:
68
+
69
+ ```bash
70
+ modelinfo meta-llama/Llama-2-7b-hf
71
+ ```
72
+
73
+ For gated models (e.g., Llama 2), you must provide authentication by setting the `HF_TOKEN` environment variable. You can create a token in your [Hugging Face settings](https://huggingface.co/settings/tokens).
74
+
75
+ ```bash
76
+ export HF_TOKEN="hf_your_token_here"
77
+ modelinfo meta-llama/Llama-2-7b-hf
78
+ ```
79
+
80
+ Alternatively, the tool will automatically read tokens stored by the `hf auth login` command (located in `~/.cache/huggingface/token`).
81
+
82
+ Calculate the memory footprint with a specific KV cache context window:
83
+
84
+ ```bash
85
+ modelinfo mistral-7b.safetensors --context 8192
86
+ ```
87
+
88
+ Adjust the VRAM heat-mapping thresholds for your specific hardware (e.g., an 80GB card):
89
+
90
+ ```bash
91
+ modelinfo meta-llama/Llama-2-7b-hf --max-vram 80
92
+ ```
93
+
94
+ Determine if a model fits your specific hardware:
95
+
96
+ ```bash
97
+ modelinfo mistralai/Mistral-7B-v0.1 --gpu "RTX 4090"
98
+ modelinfo mistralai/Mistral-7B-v0.1 --gpu auto
99
+ ```
100
+
101
+ Compare multiple models side-by-side against a hardware target:
102
+
103
+ ```bash
104
+ modelinfo mistralai/Mistral-7B-v0.1 Qwen/Qwen2.5-0.5B --gpu 12
105
+ ```
106
+
107
+ Simulate exactly how many tokens you can serve using vLLM on a specific multi-GPU topology:
108
+
109
+ ```bash
110
+ modelinfo mistralai/Mistral-7B-v0.1 --vllm --gpu 4xRTX4090 --topology pcie4 --strategy tp
111
+ ```
112
+
113
+ ### Example Output (Single Model)
114
+
115
+ ```text
116
+ Format: SafeTensors
117
+ Architecture: MistralForCausalLM (32 layers)
118
+ Tensors: 291
119
+ Parameters: 7.2B
120
+ Dtype: BF16
121
+ Disk size: 13.49 GB
122
+ VRAM (est): ~15.07 GB Total Minimum Required
123
+ ├─ Weights: 13.49 GB
124
+ ├─ KV Cache: 1.0 GB (Default 8192 tokens. Native limit: 32,768)
125
+ └─ Overhead: 600.0 MB (CUDA Context + Activations)
126
+ Hardware Fit: ✗ No (Requires 15.07 GB, Hardware has 12.0 GB)
127
+
128
+ Top Tensors by Size:
129
+ model.embed_tokens.weight [32000 x 4096] bf16 131.1M params
130
+ 32x model.layers.[N].self_attn.q_proj.weight [4096 x 4096] bf16 16.8M params
131
+ ```
132
+
133
+ ### Example Output (Comparison)
134
+
135
+ ```text
136
+ Model Params Dtype Context VRAM Fits
137
+ Mistral-7B-v0.1 7.2B BF16 8K 15.07 GB ✗
138
+ Qwen2.5-0.5B 494.0M BF16 8K 1.6 GB ✓
139
+ ```
140
+
141
+ ## Command Reference
142
+
143
+ | Argument | Example | Description |
144
+ | :--- | :--- | :--- |
145
+ | `[files...]` | `modelinfo model.safetensors` | Inspect a single model (local path or Hugging Face repo ID). |
146
+ | `[files...]` | `modelinfo modelA modelB` | Pass multiple files/repos to automatically render a side-by-side comparison table instead of a deep-dive summary. |
147
+ | `--gpu` | `--gpu rtx4090` | Check if the model fits. Accepts GPU names (`rtx4090`, `b200`, `rx7900xtx`), explicit VRAM limits in GB (`--gpu 24`), or local hardware auto-discovery (`--gpu auto`). |
148
+ | `--context` | `--context 32768` | Adjust the target KV cache length. Essential for calculating the dynamic memory footprint of long-context models. Defaults to `8192`. |
149
+ | `--max-vram` | `--max-vram 80` | Adjusts the color-coded heat mapping thresholds (Green/Yellow/Red) in the terminal output to match a specific hardware ceiling. |
150
+ | `--vllm` | `--vllm --gpu auto` | Switches from additive memory checking to a subtractive serving capacity estimation. Shows exactly how many tokens fit in the PagedAttention pool. |
151
+ | `--gpu-util` | `--gpu-util 0.9` | Sets the vLLM `gpu_memory_utilization` ratio. Defaults to `0.9` (reserves 10% for PyTorch context). |
152
+ | `--topology` | `--topology nvlink` | Set interconnect topology to calculate exact communication overhead penalties (`nvlink`, `pcie4`, `pcie3`). Defaults to `pcie4`. |
153
+ | `--strategy` | `--strategy tp` | Selects the parallelization strategy for multi-GPU setups (`tp` for Tensor Parallelism, `pp` for Pipeline Parallelism). Defaults to `tp`. |
154
+ | `--tensors` | `--tensors` | Bypasses the algorithmic speed estimation and forces the tool to fetch all remote shards, displaying an exact size breakdown of every tensor. |
155
+
156
+ ## Architecture
157
+
158
+ Three modules:
159
+
160
+ 1. **Presentation (`cli.py`, `ui.py`)**: Parses arguments and formats tables via `rich`.
161
+ 2. **Parsing Engine (`parsers/`)**: Specialized binary readers (`safetensors.py`, `gguf.py`, `pytorch.py`) that use only the standard library.
162
+ 3. **Math Engine (`calculator.py`)**: Determines total parameter counts, maps data types to byte coefficients, and calculates dynamic memory allocations based on tensor shape heuristics.
163
+
164
+ ## License
165
+
166
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "modelinfo-cli"
7
- version = "1.2.0"
8
- description = "A sub-100ms, zero-dependency CLI tool to inspect ML model checkpoints and dynamically calculate VRAM requirements."
7
+ version = "1.4.0"
8
+ description = "A sub-100ms, zero-dependency CLI to inspect ML models (.safetensors, .gguf) locally or via Hugging Face, calculate exact VRAM footprints, and determine hardware fit."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
11
11
  license = { text = "MIT" }
@@ -30,4 +30,4 @@ modelinfo = "modelinfo.cli:main"
30
30
 
31
31
  [tool.ruff]
32
32
  line-length = 88
33
- target-version = "py310"
33
+ target-version = "py310"
@@ -2,4 +2,4 @@
2
2
  modelinfo - A high-performance CLI utility for inspecting ML model checkpoints.
3
3
  """
4
4
 
5
- __version__ = "1.2.0"
5
+ __version__ = "1.4.0"
@@ -0,0 +1,178 @@
1
+ import math
2
+ from typing import Any, Dict
3
+
4
+ from modelinfo.architecture import extract_architecture
5
+
6
# Storage cost of a single element, in bytes, keyed by upper-case dtype name.
#
# Block-quantized GGUF types store a fixed-size block for a fixed number of
# elements, so their coefficient is (bytes per block) / (elements per block);
# the trailing comments record that ratio as laid out in ggml's block structs.
DTYPE_BYTES = {
    # Plain floating-point and integer types.
    "F64": 8.0,
    "F32": 4.0,
    "F16": 2.0,
    "BF16": 2.0,
    "F8": 1.0,
    "F8_E5M2": 1.0,
    "F8_E4M3": 1.0,
    "I64": 8.0,
    "I32": 4.0,
    "I16": 2.0,
    "I8": 1.0,
    "U64": 8.0,
    "U32": 4.0,
    # Without these entries the lookup helper's 2.0-byte default would
    # over-count one-byte tensors by 2x.
    "U16": 2.0,
    "U8": 1.0,
    "BOOL": 1.0,
    # 8-bit GGUF quantization.
    "Q8_0": 1.0625,  # 34 B / 32 elements
    "Q8_1": 1.125,  # 36 B / 32 elements
    "Q8_K": 1.140625,  # 292 B / 256 elements
    # K-quants and legacy 4/5-bit quantization.
    "Q6_K": 0.8203125,  # 210 B / 256 elements
    "Q5_0": 0.6875,  # 22 B / 32 elements
    "Q5_1": 0.75,  # 24 B / 32 elements
    "Q5_K": 0.6875,  # 176 B / 256 elements
    "Q4_0": 0.5625,  # 18 B / 32 elements
    "Q4_1": 0.625,  # 20 B / 32 elements
    "Q4_K": 0.5625,  # 144 B / 256 elements
    "Q3_K": 0.4296875,  # 110 B / 256 elements
    "Q2_K": 0.328125,  # 84 B / 256 elements
    # i-quants.
    "IQ4_NL": 0.5625,  # 18 B / 32 elements
    "IQ4_XS": 0.53125,  # 136 B / 256 elements
    "IQ3_S": 0.4296875,  # 110 B / 256 elements
    "IQ3_XXS": 0.3828125,  # 98 B / 256 elements
    "IQ2_S": 0.3203125,  # 82 B / 256 elements
    "IQ2_XS": 0.2890625,  # 74 B / 256 elements
    "IQ2_XXS": 0.2578125,  # 66 B / 256 elements
    "IQ1_M": 0.21875,  # 56 B / 256 elements
    "IQ1_S": 0.1953125,  # 50 B / 256 elements
    # Coarse per-bit-width values, kept unchanged.
    # NOTE(review): presumably used when a parser reports only the
    # quantization family rather than an exact ggml type - confirm against
    # the GGUF parser.
    "Q8": 1.06,
    "Q6": 0.82,
    "Q5": 0.68,
    "Q4": 0.58,
    "Q3": 0.43,
    "Q2": 0.28,
}
48
+
49
def _get_bytes_per_param(dtype: str) -> float:
    """Look up how many bytes one element of ``dtype`` occupies.

    The name is upper-cased before it is looked up in ``DTYPE_BYTES``, so the
    match is case-insensitive. Names missing from the table fall back to
    2.0 bytes.
    """
    key = dtype.upper()
    try:
        return DTYPE_BYTES[key]
    except KeyError:
        # Unknown dtype: use the 2-byte default.
        return 2.0
52
+
53
def calculate_footprint(
    tensors: Dict[str, Any],
    context_length: int = 0,
    batch_size: int = 1,
    config: Dict[str, Any] = None,
    gpu_count: int = 1,
    topology: str = "pcie4",
    strategy: str = "tp",
    is_vllm: bool = False,
    gpu_vram_bytes: float = 0.0,
    gpu_util: float = 0.9
) -> Dict[str, Any]:
    """Estimate the memory a model needs from its tensor table.

    Args:
        tensors: Mapping of tensor name to a metadata dict that is read for
            ``"shape"`` and ``"dtype"``. A ``"__metadata__"`` entry is never
            counted as a tensor; when it sets ``"lazy_fetch"``, its
            ``"total_size"`` is used as the weight size instead of summing
            individual tensors.
        context_length: Number of tokens to size the KV cache for.
        batch_size: Multiplier applied to the KV cache size.
        config: Optional model config, forwarded to ``extract_architecture``.
        gpu_count: Number of GPUs; values above 1 enable the communication
            penalty and scale the fixed per-GPU overhead.
        topology: Interconnect name. ``"nvlink"`` and ``"pcie3"`` have their
            own penalty; any other value gets the pcie4 penalty.
        strategy: ``"pp"`` disables the penalty; any other value is treated
            as tensor parallelism.
        is_vllm: Switch to the subtractive serving-capacity calculation
            (only effective when ``gpu_vram_bytes`` is positive).
        gpu_vram_bytes: VRAM budget used by the vLLM calculation.
        gpu_util: Fraction of ``gpu_vram_bytes`` considered usable.

    Returns:
        A dict with the parameter count, weight/KV-cache/overhead/total byte
        counts, the architecture values obtained from
        ``extract_architecture``, the most frequent dtype, the applied
        penalty and, in vLLM mode, a ``"vllm_metrics"`` sub-dict (empty
        otherwise).
    """
    header = tensors.get("__metadata__", {})
    param_total = 0
    weight_bytes = 0.0
    tensors_per_dtype: Dict[str, int] = {}

    if header.get("lazy_fetch", False):
        # Only the aggregate size is known: treat everything as 2-byte
        # (FP16/BF16) weights, as is typical for modern Hub checkpoints.
        weight_bytes = header.get("total_size", 0.0)
        tensors_per_dtype["BF16"] = 1
        param_total = int(weight_bytes / 2.0)
    else:
        for tensor_name, info in tensors.items():
            if tensor_name == "__metadata__":
                continue

            dims = info.get("shape", [])
            if not dims:
                continue

            element_count = math.prod(dims)
            param_total += element_count

            dtype_name = info.get("dtype", "F16").upper()
            tensors_per_dtype[dtype_name] = tensors_per_dtype.get(dtype_name, 0) + 1

            weight_bytes += element_count * _get_bytes_per_param(dtype_name)

    num_layers, kv_dim, is_estimate = extract_architecture(tensors, config)

    # KV cache = 2 (K and V) * layers * (KV heads * head dim) * tokens * batch
    # * 2 bytes per value (FP16 assumed).
    kv_cache_bytes = 2 * num_layers * kv_dim * context_length * batch_size * 2

    # The dtype carried by the largest number of tensors; the first one seen
    # wins a tie.
    if tensors_per_dtype:
        primary_dtype = max(tensors_per_dtype, key=tensors_per_dtype.get)
    else:
        primary_dtype = "Unknown"

    # Multi-GPU communication penalty, expressed as a fraction of the weights.
    multi_gpu = gpu_count > 1
    penalty_percentage = 0.0
    if multi_gpu and strategy != "pp":
        penalty_percentage = {"nvlink": 0.04, "pcie3": 0.20}.get(topology, 0.12)
    comm_overhead = weight_bytes * penalty_percentage if multi_gpu else 0.0

    serving: Dict[str, Any] = {}
    if is_vllm and gpu_vram_bytes > 0:
        # Subtractive mode: whatever is left of the usable VRAM after the
        # weights and the penalty becomes the paged KV pool.
        budget = gpu_vram_bytes * gpu_util
        kv_pool = budget - (weight_bytes + comm_overhead)
        per_token = 2 * num_layers * kv_dim * 2

        if kv_pool > 0 and per_token > 0:
            token_capacity = math.floor(kv_pool / per_token)
        else:
            token_capacity = 0

        overhead_bytes = comm_overhead
        total_memory_bytes = weight_bytes + overhead_bytes

        serving = {
            "usable_vram": budget,
            "static_weights": weight_bytes,
            "distributed_penalty": comm_overhead,
            "paged_kv_pool": max(0.0, kv_pool),
            "max_serving_capacity": token_capacity
        }
    else:
        # Additive mode: fixed 600 MB of context overhead per GPU.
        overhead_bytes = (600 * gpu_count * 1024 * 1024) + comm_overhead
        total_memory_bytes = weight_bytes + kv_cache_bytes + overhead_bytes

    return {
        "total_params": param_total,
        "base_memory_bytes": weight_bytes,
        "kv_cache_bytes": kv_cache_bytes,
        "overhead_bytes": overhead_bytes,
        "total_memory_bytes": total_memory_bytes,
        "num_layers": num_layers,
        "kv_dim": kv_dim,
        "primary_dtype": primary_dtype,
        "kv_is_estimate": is_estimate,
        "penalty_percentage": penalty_percentage,
        "vllm_metrics": serving
    }
159
+
160
def format_bytes(size_bytes: float) -> str:
    """Format a byte count as a human-readable string such as ``"13.49 GB"``.

    Units are binary (1 KB = 1024 B) and run from B to PB; anything larger
    is still expressed in PB. The numeric part is a float rounded to two
    decimals, except for an exact zero, which is rendered as ``"0 B"``.

    Negative sizes are formatted by magnitude with a leading minus sign
    instead of raising. A value that rounds up to 1024.0 of a unit is shown
    as 1.0 of the next unit.
    """
    if size_bytes == 0:
        return "0 B"

    units = ["B", "KB", "MB", "GB", "TB", "PB"]
    last = len(units) - 1

    # Repeated division by 1024 picks the unit without relying on how a
    # floating-point logarithm rounds at exact unit boundaries, and it is
    # well-defined for negative input once the sign is set aside.
    magnitude = float(abs(size_bytes))
    idx = 0
    while magnitude >= 1024 and idx < last:
        magnitude /= 1024
        idx += 1

    value = round(magnitude, 2)
    if value >= 1024 and idx < last:
        # e.g. 1023.999 KB rounds to 1024.0 KB; show it as 1.0 MB instead.
        value = round(value / 1024, 2)
        idx += 1

    sign = "-" if size_bytes < 0 else ""
    return f"{sign}{value} {units[idx]}"
169
+
170
def format_params(count: int) -> str:
    """Format a parameter count with thousands separators and a short form.

    Counts of at least 1,000 are rendered as the exact number followed by an
    abbreviated value in parentheses, e.g. ``"7,241,732,096 (7.2B)"``.
    Smaller counts are rendered as the plain number, e.g. ``"999"``.

    A count so close to the next threshold that its abbreviation would read
    ``1000.0`` is promoted to the next suffix (``999,999,999`` is shown as
    ``1.0B`` rather than ``1000.0M``).
    """
    scales = ((1_000_000_000, "B"), (1_000_000, "M"), (1_000, "K"))

    for idx, (scale, suffix) in enumerate(scales):
        if count < scale:
            continue

        short = f"{count / scale:.1f}"
        if short == "1000.0" and idx > 0:
            # One-decimal rounding spilled over the unit boundary; use the
            # next larger suffix so the abbreviation stays below 1000.
            scale, suffix = scales[idx - 1]
            short = f"{count / scale:.1f}"
        return f"{count:,} ({short}{suffix})"

    return f"{count:,}"