modelinfo-cli 1.1.0__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. modelinfo_cli-1.3.0/PKG-INFO +171 -0
  2. modelinfo_cli-1.3.0/README.md +153 -0
  3. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/pyproject.toml +3 -3
  4. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/src/modelinfo/__init__.py +1 -1
  5. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/src/modelinfo/architecture.py +6 -2
  6. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/src/modelinfo/calculator.py +6 -2
  7. modelinfo_cli-1.3.0/src/modelinfo/cli.py +156 -0
  8. modelinfo_cli-1.3.0/src/modelinfo/hardware.py +231 -0
  9. modelinfo_cli-1.3.0/src/modelinfo/parsers/huggingface.py +151 -0
  10. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/src/modelinfo/ui.py +89 -12
  11. modelinfo_cli-1.3.0/src/modelinfo_cli.egg-info/PKG-INFO +171 -0
  12. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/src/modelinfo_cli.egg-info/SOURCES.txt +2 -0
  13. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/tests/test_calculator.py +11 -0
  14. modelinfo_cli-1.1.0/PKG-INFO +0 -108
  15. modelinfo_cli-1.1.0/README.md +0 -90
  16. modelinfo_cli-1.1.0/src/modelinfo/cli.py +0 -99
  17. modelinfo_cli-1.1.0/src/modelinfo_cli.egg-info/PKG-INFO +0 -108
  18. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/LICENSE +0 -0
  19. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/setup.cfg +0 -0
  20. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/src/modelinfo/__main__.py +0 -0
  21. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/src/modelinfo/parsers/__init__.py +0 -0
  22. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/src/modelinfo/parsers/base.py +0 -0
  23. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/src/modelinfo/parsers/gguf.py +0 -0
  24. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/src/modelinfo/parsers/pytorch.py +0 -0
  25. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/src/modelinfo/parsers/safetensors.py +0 -0
  26. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/src/modelinfo_cli.egg-info/dependency_links.txt +0 -0
  27. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/src/modelinfo_cli.egg-info/entry_points.txt +0 -0
  28. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/src/modelinfo_cli.egg-info/requires.txt +0 -0
  29. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/src/modelinfo_cli.egg-info/top_level.txt +0 -0
  30. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/tests/test_constraints.py +0 -0
  31. {modelinfo_cli-1.1.0 → modelinfo_cli-1.3.0}/tests/test_parsers.py +0 -0
@@ -0,0 +1,171 @@
1
+ Metadata-Version: 2.4
2
+ Name: modelinfo-cli
3
+ Version: 1.3.0
4
+ Summary: A sub-100ms, zero-dependency CLI to inspect ML models (.safetensors, .gguf) locally or via Hugging Face, calculate exact VRAM footprints, and determine hardware fit.
5
+ Author: ModelInfo Contributors
6
+ License: MIT
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: rich>=13.0.0
11
+ Provides-Extra: dev
12
+ Requires-Dist: build; extra == "dev"
13
+ Requires-Dist: twine; extra == "dev"
14
+ Requires-Dist: pytest; extra == "dev"
15
+ Requires-Dist: pytest-cov; extra == "dev"
16
+ Requires-Dist: ruff; extra == "dev"
17
+ Dynamic: license-file
18
+
19
+ # ModelInfo CLI
20
+
21
+ ![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)
22
+ ![Dependencies](https://img.shields.io/badge/dependencies-rich-green.svg)
23
+ ![License](https://img.shields.io/badge/license-MIT-blue.svg)
24
+
25
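+ ModelInfo is a terminal-native utility that inspects machine learning model checkpoints (`.safetensors`, `.gguf`, `.pt`) and calculates hardware requirements. Local files are analyzed completely offline; Hugging Face repositories are inspected remotely without downloading the checkpoint.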
26
+
27
+ It reads binary headers directly using the Python standard library. By bypassing full tensor payload loading and strictly excluding heavy ecosystems like PyTorch or HuggingFace, the tool executes in under 100 milliseconds.
28
+
29
+ ## Features
30
+
31
+ - **Zero-Dependency Parsing**: Reads the 8-byte length prefix and the JSON header that follows it in `.safetensors` files, and the binary key-value metadata of `.gguf` files, directly via `struct` and `json`. Falls back to an adjacent `config.json` for architecture details.
32
+ - **Remote Hugging Face Hub Inspection**: Inspect any public or gated model directly via its repo ID (e.g., `modelinfo meta-llama/Llama-2-7b-hf`) without downloading the checkpoint. Uses concurrent byte-range requests to read the binary headers directly off the CDN, typically within a few seconds (see the performance note below).
33
+ - **Sharded Model Support**: Transparently parses `model.safetensors.index.json` to detect multi-file checkpoint distributions, gracefully guarding against partial downloads without crashing.
34
+ - **Dynamic VRAM Estimation**: Extracts underlying model architecture to calculate exact VRAM limits, including dynamic KV cache footprints based on user-specified context lengths.
35
+ - **Hardware Fit Diagnostics**: Pass the `--gpu` flag (e.g. `--gpu RTX4090` or `--gpu auto`) to calculate if the model fits in your specific cluster. Defends against fragmentation OOMs using a 3-tier heuristic (Safe, Warning, Fail), calculates overhead across multi-GPU setups, and enforces Apple Silicon's 75% unified memory wire limit.
36
+ - **Side-by-Side Comparison**: Pass multiple models to automatically trigger a comparison table. Compares parameters, data types, context lengths, and VRAM footprints side-by-side to evaluate trade-offs.
37
+ - **Precise Block Quantization**: Factors in exact byte-scaling coefficients for GGUF formats (e.g., Q8, Q6, Q4) rather than naive averages, eliminating VRAM under-reporting.
38
+ - **Secure Pickling**: Inspects legacy `.pt` files without executing arbitrary code by using a highly restricted `pickle.Unpickler`.
39
+ - **Terminal UI**: Groups repetitive structural layers and color-codes VRAM heatmaps using `rich`. Breaks down memory footprints into Weights, KV Cache, and Overhead.
40
+
41
+ > [!NOTE]
42
+ > **A Note on Performance & Remote Fetching**
43
+ > Local `.gguf` and `.safetensors` files are parsed in under 100ms. However, querying remote Hugging Face repositories takes **1 to 10 seconds**. This is an intentional trade-off. To remain zero-dependency, `modelinfo` negotiates raw TCP/TLS via Python `urllib` instead of loading PyTorch. For massive sharded models (e.g., 100+ shards), it must fetch every header individually, capped at an 8-worker thread pool to prevent Cloudflare IP bans. Waiting ~8 seconds to map a model is faster than downloading 400GB just to see if it fits your hardware.
44
+
45
+ ## Installation
46
+
47
+ Install directly from PyPI:
48
+
49
+ ```bash
50
+ pip install modelinfo-cli
51
+ ```
52
+
53
+ ### Development
54
+
55
+ To install from source and run the test suite:
56
+
57
+ ```bash
58
+ git clone https://github.com/pipe1os/modelinfo-cli.git
59
+ cd modelinfo-cli
60
+ python -m venv .venv
61
+ source .venv/bin/activate
62
+ pip install -e ".[dev]"
63
+ ```
64
+
65
+ ## Testing
66
+
67
+ The testing suite enforces cross-platform structural integrity and guards the zero-dependency latency constraint. Tests are isolated against custom binary mocks in `tests/fixtures/`.
68
+
69
+ Run the test suite using pytest:
70
+
71
+ ```bash
72
+ pytest tests/ -v
73
+ ```
74
+
75
+ ## Usage
76
+
77
+ Inspect a local model checkpoint:
78
+
79
+ ```bash
80
+ modelinfo mistral-7b.safetensors
81
+ ```
82
+
83
+ Inspect a remote model directly from the Hugging Face Hub:
84
+
85
+ ```bash
86
+ modelinfo meta-llama/Llama-2-7b-hf
87
+ ```
88
+
89
+ For gated models (e.g., Llama 2), you must provide authentication by setting the `HF_TOKEN` environment variable. You can create a token in your [Hugging Face settings](https://huggingface.co/settings/tokens).
90
+
91
+ ```bash
92
+ export HF_TOKEN="hf_your_token_here"
93
+ modelinfo meta-llama/Llama-2-7b-hf
94
+ ```
95
+
96
+ Alternatively, the tool will automatically read tokens stored by the `hf auth login` command (located in `~/.cache/huggingface/token`).
97
+
98
+ Calculate the memory footprint with a specific KV cache context window:
99
+
100
+ ```bash
101
+ modelinfo mistral-7b.safetensors --context 8192
102
+ ```
103
+
104
+ Adjust the VRAM heat-mapping thresholds for your specific hardware (e.g., an 80GB card):
105
+
106
+ ```bash
107
+ modelinfo meta-llama/Llama-2-7b-hf --max-vram 80
108
+ ```
109
+
110
+ Determine if a model fits your specific hardware:
111
+
112
+ ```bash
113
+ modelinfo mistralai/Mistral-7B-v0.1 --gpu "RTX 4090"
114
+ modelinfo mistralai/Mistral-7B-v0.1 --gpu auto
115
+ ```
116
+
117
+ Compare multiple models side-by-side against a hardware target:
118
+
119
+ ```bash
120
+ modelinfo mistralai/Mistral-7B-v0.1 Qwen/Qwen2.5-0.5B --gpu 12
121
+ ```
122
+
123
+ ### Example Output (Single Model)
124
+
125
+ ```text
126
+ Format: SafeTensors
127
+ Architecture: MistralForCausalLM (32 layers)
128
+ Tensors: 291
129
+ Parameters: 7.2B
130
+ Dtype: BF16
131
+ Disk size: 13.49 GB
132
+ VRAM (est): ~15.07 GB Total Minimum Required
133
+ ├─ Weights: 13.49 GB
134
+ ├─ KV Cache: 1.0 GB (Default 8192 tokens. Native limit: 32,768)
135
+ └─ Overhead: 600.0 MB (CUDA Context + Activations)
136
+ Hardware Fit: ✗ No (Requires 15.07 GB, Hardware has 12.0 GB)
137
+
138
+ Top Tensors by Size:
139
+ model.embed_tokens.weight [32000 x 4096] bf16 131.1M params
140
+ 32x model.layers.[N].self_attn.q_proj.weight [4096 x 4096] bf16 16.8M params
141
+ ```
142
+
143
+ ### Example Output (Comparison)
144
+
145
+ ```text
146
+ Model Params Dtype Context VRAM Fits
147
+ Mistral-7B-v0.1 7.2B BF16 8K 15.07 GB ✗
148
+ Qwen2.5-0.5B 494.0M BF16 8K 1.6 GB ✓
149
+ ```
150
+
151
+ ## Command Reference
152
+
153
+ | Argument | Example | Description |
154
+ | :--- | :--- | :--- |
155
+ | `[files...]` | `modelinfo model.safetensors` | Inspect a single model (local path or Hugging Face repo ID). |
156
+ | `[files...]` | `modelinfo modelA modelB` | Pass multiple files/repos to automatically render a side-by-side comparison table instead of a deep-dive summary. |
157
+ | `--gpu` | `--gpu rtx4090` | Check if the model fits. Accepts GPU names (`rtx4090`, `b200`, `rx7900xtx`), explicit VRAM limits in GB (`--gpu 24`), or local hardware auto-discovery (`--gpu auto`). |
158
+ | `--context` | `--context 32768` | Adjust the target KV cache length. Essential for calculating the dynamic memory footprint of long-context models. Defaults to `8192`. |
159
+ | `--max-vram` | `--max-vram 80` | Adjusts the color-coded heat mapping thresholds (Green/Yellow/Red) in the terminal output to match a specific hardware ceiling. |
160
+
161
+ ## Architecture
162
+
163
+ The system operates across three modules:
164
+
165
+ 1. **Presentation (`cli.py`, `ui.py`)**: Parses arguments and formats tables via `rich`.
166
+ 2. **Parsing Engine (`parsers/`)**: Specialized binary readers (`safetensors.py`, `gguf.py`, `pytorch.py`) strictly confined to standard library operations.
167
+ 3. **Math Engine (`calculator.py`)**: Determines total parameter counts, maps data types to byte coefficients, and calculates dynamic memory allocations based on tensor shape heuristics.
168
+
169
+ ## License
170
+
171
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,153 @@
1
+ # ModelInfo CLI
2
+
3
+ ![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)
4
+ ![Dependencies](https://img.shields.io/badge/dependencies-rich-green.svg)
5
+ ![License](https://img.shields.io/badge/license-MIT-blue.svg)
6
+
7
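+ ModelInfo is a terminal-native utility that inspects machine learning model checkpoints (`.safetensors`, `.gguf`, `.pt`) and calculates hardware requirements. Local files are analyzed completely offline; Hugging Face repositories are inspected remotely without downloading the checkpoint.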
8
+
9
+ It reads binary headers directly using the Python standard library. By bypassing full tensor payload loading and strictly excluding heavy ecosystems like PyTorch or HuggingFace, the tool executes in under 100 milliseconds.
10
+
11
+ ## Features
12
+
13
+ - **Zero-Dependency Parsing**: Reads the 8-byte length prefix and the JSON header that follows it in `.safetensors` files, and the binary key-value metadata of `.gguf` files, directly via `struct` and `json`. Falls back to an adjacent `config.json` for architecture details.
14
+ - **Remote Hugging Face Hub Inspection**: Inspect any public or gated model directly via its repo ID (e.g., `modelinfo meta-llama/Llama-2-7b-hf`) without downloading the checkpoint. Uses concurrent byte-range requests to read the binary headers directly off the CDN, typically within a few seconds (see the performance note below).
15
+ - **Sharded Model Support**: Transparently parses `model.safetensors.index.json` to detect multi-file checkpoint distributions, gracefully guarding against partial downloads without crashing.
16
+ - **Dynamic VRAM Estimation**: Extracts underlying model architecture to calculate exact VRAM limits, including dynamic KV cache footprints based on user-specified context lengths.
17
+ - **Hardware Fit Diagnostics**: Pass the `--gpu` flag (e.g. `--gpu RTX4090` or `--gpu auto`) to calculate if the model fits in your specific cluster. Defends against fragmentation OOMs using a 3-tier heuristic (Safe, Warning, Fail), calculates overhead across multi-GPU setups, and enforces Apple Silicon's 75% unified memory wire limit.
18
+ - **Side-by-Side Comparison**: Pass multiple models to automatically trigger a comparison table. Compares parameters, data types, context lengths, and VRAM footprints side-by-side to evaluate trade-offs.
19
+ - **Precise Block Quantization**: Factors in exact byte-scaling coefficients for GGUF formats (e.g., Q8, Q6, Q4) rather than naive averages, eliminating VRAM under-reporting.
20
+ - **Secure Pickling**: Inspects legacy `.pt` files without executing arbitrary code by using a highly restricted `pickle.Unpickler`.
21
+ - **Terminal UI**: Groups repetitive structural layers and color-codes VRAM heatmaps using `rich`. Breaks down memory footprints into Weights, KV Cache, and Overhead.
22
+
23
+ > [!NOTE]
24
+ > **A Note on Performance & Remote Fetching**
25
+ > Local `.gguf` and `.safetensors` files are parsed in under 100ms. However, querying remote Hugging Face repositories takes **1 to 10 seconds**. This is an intentional trade-off. To remain zero-dependency, `modelinfo` negotiates raw TCP/TLS via Python `urllib` instead of loading PyTorch. For massive sharded models (e.g., 100+ shards), it must fetch every header individually, capped at an 8-worker thread pool to prevent Cloudflare IP bans. Waiting ~8 seconds to map a model is faster than downloading 400GB just to see if it fits your hardware.
26
+
27
+ ## Installation
28
+
29
+ Install directly from PyPI:
30
+
31
+ ```bash
32
+ pip install modelinfo-cli
33
+ ```
34
+
35
+ ### Development
36
+
37
+ To install from source and run the test suite:
38
+
39
+ ```bash
40
+ git clone https://github.com/pipe1os/modelinfo-cli.git
41
+ cd modelinfo-cli
42
+ python -m venv .venv
43
+ source .venv/bin/activate
44
+ pip install -e ".[dev]"
45
+ ```
46
+
47
+ ## Testing
48
+
49
+ The testing suite enforces cross-platform structural integrity and guards the zero-dependency latency constraint. Tests are isolated against custom binary mocks in `tests/fixtures/`.
50
+
51
+ Run the test suite using pytest:
52
+
53
+ ```bash
54
+ pytest tests/ -v
55
+ ```
56
+
57
+ ## Usage
58
+
59
+ Inspect a local model checkpoint:
60
+
61
+ ```bash
62
+ modelinfo mistral-7b.safetensors
63
+ ```
64
+
65
+ Inspect a remote model directly from the Hugging Face Hub:
66
+
67
+ ```bash
68
+ modelinfo meta-llama/Llama-2-7b-hf
69
+ ```
70
+
71
+ For gated models (e.g., Llama 2), you must provide authentication by setting the `HF_TOKEN` environment variable. You can create a token in your [Hugging Face settings](https://huggingface.co/settings/tokens).
72
+
73
+ ```bash
74
+ export HF_TOKEN="hf_your_token_here"
75
+ modelinfo meta-llama/Llama-2-7b-hf
76
+ ```
77
+
78
+ Alternatively, the tool will automatically read tokens stored by the `hf auth login` command (located in `~/.cache/huggingface/token`).
79
+
80
+ Calculate the memory footprint with a specific KV cache context window:
81
+
82
+ ```bash
83
+ modelinfo mistral-7b.safetensors --context 8192
84
+ ```
85
+
86
+ Adjust the VRAM heat-mapping thresholds for your specific hardware (e.g., an 80GB card):
87
+
88
+ ```bash
89
+ modelinfo meta-llama/Llama-2-7b-hf --max-vram 80
90
+ ```
91
+
92
+ Determine if a model fits your specific hardware:
93
+
94
+ ```bash
95
+ modelinfo mistralai/Mistral-7B-v0.1 --gpu "RTX 4090"
96
+ modelinfo mistralai/Mistral-7B-v0.1 --gpu auto
97
+ ```
98
+
99
+ Compare multiple models side-by-side against a hardware target:
100
+
101
+ ```bash
102
+ modelinfo mistralai/Mistral-7B-v0.1 Qwen/Qwen2.5-0.5B --gpu 12
103
+ ```
104
+
105
+ ### Example Output (Single Model)
106
+
107
+ ```text
108
+ Format: SafeTensors
109
+ Architecture: MistralForCausalLM (32 layers)
110
+ Tensors: 291
111
+ Parameters: 7.2B
112
+ Dtype: BF16
113
+ Disk size: 13.49 GB
114
+ VRAM (est): ~15.07 GB Total Minimum Required
115
+ ├─ Weights: 13.49 GB
116
+ ├─ KV Cache: 1.0 GB (Default 8192 tokens. Native limit: 32,768)
117
+ └─ Overhead: 600.0 MB (CUDA Context + Activations)
118
+ Hardware Fit: ✗ No (Requires 15.07 GB, Hardware has 12.0 GB)
119
+
120
+ Top Tensors by Size:
121
+ model.embed_tokens.weight [32000 x 4096] bf16 131.1M params
122
+ 32x model.layers.[N].self_attn.q_proj.weight [4096 x 4096] bf16 16.8M params
123
+ ```
124
+
125
+ ### Example Output (Comparison)
126
+
127
+ ```text
128
+ Model Params Dtype Context VRAM Fits
129
+ Mistral-7B-v0.1 7.2B BF16 8K 15.07 GB ✗
130
+ Qwen2.5-0.5B 494.0M BF16 8K 1.6 GB ✓
131
+ ```
132
+
133
+ ## Command Reference
134
+
135
+ | Argument | Example | Description |
136
+ | :--- | :--- | :--- |
137
+ | `[files...]` | `modelinfo model.safetensors` | Inspect a single model (local path or Hugging Face repo ID). |
138
+ | `[files...]` | `modelinfo modelA modelB` | Pass multiple files/repos to automatically render a side-by-side comparison table instead of a deep-dive summary. |
139
+ | `--gpu` | `--gpu rtx4090` | Check if the model fits. Accepts GPU names (`rtx4090`, `b200`, `rx7900xtx`), explicit VRAM limits in GB (`--gpu 24`), or local hardware auto-discovery (`--gpu auto`). |
140
+ | `--context` | `--context 32768` | Adjust the target KV cache length. Essential for calculating the dynamic memory footprint of long-context models. Defaults to `8192`. |
141
+ | `--max-vram` | `--max-vram 80` | Adjusts the color-coded heat mapping thresholds (Green/Yellow/Red) in the terminal output to match a specific hardware ceiling. |
142
+
143
+ ## Architecture
144
+
145
+ The system operates across three modules:
146
+
147
+ 1. **Presentation (`cli.py`, `ui.py`)**: Parses arguments and formats tables via `rich`.
148
+ 2. **Parsing Engine (`parsers/`)**: Specialized binary readers (`safetensors.py`, `gguf.py`, `pytorch.py`) strictly confined to standard library operations.
149
+ 3. **Math Engine (`calculator.py`)**: Determines total parameter counts, maps data types to byte coefficients, and calculates dynamic memory allocations based on tensor shape heuristics.
150
+
151
+ ## License
152
+
153
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "modelinfo-cli"
7
- version = "1.1.0"
8
- description = "A sub-100ms, zero-dependency CLI tool to inspect ML model checkpoints and dynamically calculate VRAM requirements."
7
+ version = "1.3.0"
8
+ description = "A sub-100ms, zero-dependency CLI to inspect ML models (.safetensors, .gguf) locally or via Hugging Face, calculate exact VRAM footprints, and determine hardware fit."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
11
11
  license = { text = "MIT" }
@@ -30,4 +30,4 @@ modelinfo = "modelinfo.cli:main"
30
30
 
31
31
  [tool.ruff]
32
32
  line-length = 88
33
- target-version = "py310"
33
+ target-version = "py310"
@@ -2,4 +2,4 @@
2
2
  modelinfo - A high-performance CLI utility for inspecting ML model checkpoints.
3
3
  """
4
4
 
5
- __version__ = "0.1.0"
5
+ __version__ = "1.3.0"
@@ -85,8 +85,12 @@ def extract_architecture(tensors: Dict[str, Any], config: Dict[str, Any] = None)
85
85
 
86
86
  return num_layers, kv_dim, is_estimate
87
87
 
88
- def identify_architecture_name(tensors: Dict[str, Any], num_layers: int) -> str:
89
- """Attempt to identify the architecture family based on tensor names or metadata."""
88
+ def identify_architecture_name(tensors: Dict[str, Any], num_layers: int, config: Dict[str, Any] = None) -> str:
89
+ """Attempt to identify the architecture family based on tensor names, metadata, or config.json."""
90
+ if config and "architectures" in config and config["architectures"]:
91
+ arch_title = config["architectures"][0]
92
+ return f"{arch_title} ({num_layers} layers)" if num_layers else arch_title
93
+
90
94
  metadata = tensors.get("__metadata__", {})
91
95
  gen_arch = metadata.get("general.architecture")
92
96
 
@@ -29,7 +29,7 @@ def _get_bytes_per_param(dtype: str) -> float:
29
29
  """Return the size in bytes for a given data type."""
30
30
  return DTYPE_BYTES.get(dtype.upper(), 2.0)
31
31
 
32
- def calculate_footprint(tensors: Dict[str, Any], context_length: int = 0, batch_size: int = 1, config: Dict[str, Any] = None) -> Dict[str, Any]:
32
+ def calculate_footprint(tensors: Dict[str, Any], context_length: int = 0, batch_size: int = 1, config: Dict[str, Any] = None, gpu_count: int = 1) -> Dict[str, Any]:
33
33
  """
34
34
  Calculate the memory footprint of a model based on its tensors and context length.
35
35
  """
@@ -62,11 +62,15 @@ def calculate_footprint(tensors: Dict[str, Any], context_length: int = 0, batch_
62
62
 
63
63
  primary_dtype = max(dtype_counts.items(), key=lambda x: x[1])[0] if dtype_counts else "Unknown"
64
64
 
65
+ CUDA_CONTEXT_MB = 600 * gpu_count
66
+ overhead_bytes = CUDA_CONTEXT_MB * 1024 * 1024
67
+
65
68
  return {
66
69
  "total_params": total_params,
67
70
  "base_memory_bytes": base_memory_bytes,
68
71
  "kv_cache_bytes": kv_cache_bytes,
69
- "total_memory_bytes": base_memory_bytes + kv_cache_bytes,
72
+ "overhead_bytes": overhead_bytes,
73
+ "total_memory_bytes": base_memory_bytes + kv_cache_bytes + overhead_bytes,
70
74
  "num_layers": num_layers,
71
75
  "kv_dim": kv_dim,
72
76
  "primary_dtype": primary_dtype,
@@ -0,0 +1,156 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ import sys
5
+ from typing import Sequence
6
+
7
+ from modelinfo.architecture import identify_architecture_name
8
+ from modelinfo.calculator import calculate_footprint
9
+ from modelinfo.parsers.gguf import parse_gguf_header
10
+ from modelinfo.parsers.pytorch import parse_pytorch_header
11
+ from modelinfo.parsers.safetensors import parse_safetensors_header
12
+ from modelinfo.ui import console, print_model_info, print_compare_info
13
+
14
+
15
+ def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
16
+ parser = argparse.ArgumentParser(
17
+ prog="modelinfo",
18
+ description="High-performance CLI utility to inspect ML model checkpoints and calculate VRAM requirements.",
19
+ )
20
+
21
+ parser.add_argument(
22
+ "file",
23
+ type=str,
24
+ nargs="+",
25
+ help="Path to the model checkpoint file(s) or Hugging Face repository IDs.",
26
+ )
27
+ parser.add_argument(
28
+ "--context",
29
+ type=int,
30
+ default=None,
31
+ help="Context length for dynamic KV cache footprint calculation.",
32
+ )
33
+ parser.add_argument(
34
+ "--max-vram",
35
+ type=float,
36
+ default=8.0,
37
+ help="Maximum VRAM in GB for color-coding thresholds.",
38
+ )
39
+ parser.add_argument(
40
+ "--gpu",
41
+ type=str,
42
+ default=None,
43
+ help="Target GPU hardware (e.g. 'RTX4090' or 'auto') to check if the model fits.",
44
+ )
45
+
46
+ return parser.parse_args(argv)
47
+
48
+
49
+ def analyze_model(file_path: str, context_override: int | None, gpu_count: int = 1) -> dict:
50
+ tensors = {}
51
+ config = None
52
+ disk_size = 0.0
53
+
54
+ file_path_lower = file_path.lower()
55
+
56
+ if not os.path.exists(file_path) and not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")):
57
+ from modelinfo.parsers.huggingface import fetch_huggingface_repo
58
+ tensors, config, format_name, disk_size = fetch_huggingface_repo(file_path)
59
+ elif file_path_lower.endswith(".safetensors") or file_path_lower.endswith(".index.json"):
60
+ tensors = parse_safetensors_header(file_path)
61
+ format_name = "SafeTensors"
62
+
63
+ config_path = os.path.join(os.path.dirname(file_path), "config.json")
64
+ if os.path.exists(config_path):
65
+ try:
66
+ with open(config_path, "r", encoding="utf-8") as f:
67
+ config = json.load(f)
68
+ except (json.JSONDecodeError, OSError):
69
+ pass
70
+
71
+ elif file_path_lower.endswith(".gguf"):
72
+ tensors = parse_gguf_header(file_path)
73
+ format_name = "GGUF"
74
+ elif file_path_lower.endswith(".pt") or file_path_lower.endswith(".bin"):
75
+ tensors = parse_pytorch_header(file_path)
76
+ format_name = "PyTorch"
77
+ else:
78
+ raise ValueError(f"File '{file_path}' not found locally and does not appear to be a Hugging Face repository ID.")
79
+
80
+ max_context = None
81
+ if config:
82
+ max_context = config.get("max_position_embeddings")
83
+ elif format_name == "GGUF":
84
+ metadata = tensors.get("__metadata__", {})
85
+ gen_arch = metadata.get("general.architecture")
86
+ if gen_arch:
87
+ max_context = metadata.get(f"{gen_arch}.context_length")
88
+
89
+ is_default_context = False
90
+ context_length = context_override
91
+ if context_length is None:
92
+ context_length = min(8192, max_context) if max_context else 8192
93
+ is_default_context = True
94
+
95
+ footprint = calculate_footprint(tensors, context_length=context_length, config=config, gpu_count=gpu_count)
96
+ num_layers = footprint["num_layers"]
97
+ arch_name = identify_architecture_name(tensors, num_layers, config)
98
+
99
+ if format_name != "SafeTensors" or os.path.exists(file_path):
100
+ disk_size = os.path.getsize(file_path) if os.path.exists(file_path) else 0.0
101
+
102
+ tensor_count = len([k for k in tensors.keys() if k != "__metadata__"])
103
+
104
+ return {
105
+ "format_name": format_name,
106
+ "arch_name": arch_name,
107
+ "tensor_count": tensor_count,
108
+ "footprint": footprint,
109
+ "disk_size": disk_size,
110
+ "context_length": context_length,
111
+ "is_default_context": is_default_context,
112
+ "tensors": tensors,
113
+ "max_context": max_context
114
+ }
115
+
116
+
117
def main(argv: Sequence[str] | None = None) -> int:
    """Run the CLI and return a process exit code.

    With one model argument a detailed report is printed via
    ``print_model_info``; with several, a comparison table is printed via
    ``print_compare_info``.

    Args:
        argv: Argument list forwarded to ``parse_args``.

    Returns:
        0 on success, 1 when GPU resolution or model analysis fails.
    """
    args = parse_args(argv)

    gpu_name_display = None
    gpu_count = 1
    if args.gpu:
        # Imported lazily so runs without --gpu never load the hardware module.
        from modelinfo.hardware import resolve_gpu
        try:
            # The resolved VRAM replaces any --max-vram value given by the user.
            gpu_name_display, args.max_vram, gpu_count = resolve_gpu(args.gpu)
        except Exception as e:
            console.print(f"[red]{e}[/red]")
            return 1

    if len(args.file) > 1:
        models = []
        for model_path in args.file:
            try:
                info = analyze_model(model_path, args.context, gpu_count)
            except Exception as e:
                console.print(f"[red]Error analyzing model '{model_path}': {e}[/red]")
                return 1
            # normpath drops trailing separators and, on Windows, unifies "/"
            # and "\\", so the label is the last path component in both cases.
            # Fall back to the raw argument if that component is empty.
            label = os.path.basename(os.path.normpath(model_path)) or model_path
            models.append((label, info))

        print_compare_info(models, args.max_vram, gpu_name=gpu_name_display)
        return 0

    file_path = args.file[0]

    try:
        info = analyze_model(file_path, args.context, gpu_count)
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        return 1

    print_model_info(**info, max_vram_gb=args.max_vram, gpu_name=gpu_name_display)
    return 0
153
+
154
+
155
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())