modelinfo-cli 1.0.0__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. {modelinfo_cli-1.0.0/src/modelinfo_cli.egg-info → modelinfo_cli-1.2.0}/PKG-INFO +12 -5
  2. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/README.md +11 -4
  3. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/pyproject.toml +1 -1
  4. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/src/modelinfo/__init__.py +1 -1
  5. modelinfo_cli-1.2.0/src/modelinfo/architecture.py +113 -0
  6. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/src/modelinfo/calculator.py +9 -4
  7. modelinfo_cli-1.2.0/src/modelinfo/cli.py +116 -0
  8. modelinfo_cli-1.2.0/src/modelinfo/parsers/huggingface.py +151 -0
  9. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/src/modelinfo/ui.py +33 -8
  10. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0/src/modelinfo_cli.egg-info}/PKG-INFO +12 -5
  11. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/src/modelinfo_cli.egg-info/SOURCES.txt +1 -0
  12. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/tests/test_calculator.py +47 -0
  13. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/tests/test_parsers.py +15 -0
  14. modelinfo_cli-1.0.0/src/modelinfo/architecture.py +0 -45
  15. modelinfo_cli-1.0.0/src/modelinfo/cli.py +0 -77
  16. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/LICENSE +0 -0
  17. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/setup.cfg +0 -0
  18. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/src/modelinfo/__main__.py +0 -0
  19. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/src/modelinfo/parsers/__init__.py +0 -0
  20. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/src/modelinfo/parsers/base.py +0 -0
  21. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/src/modelinfo/parsers/gguf.py +0 -0
  22. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/src/modelinfo/parsers/pytorch.py +0 -0
  23. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/src/modelinfo/parsers/safetensors.py +0 -0
  24. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/src/modelinfo_cli.egg-info/dependency_links.txt +0 -0
  25. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/src/modelinfo_cli.egg-info/entry_points.txt +0 -0
  26. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/src/modelinfo_cli.egg-info/requires.txt +0 -0
  27. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/src/modelinfo_cli.egg-info/top_level.txt +0 -0
  28. {modelinfo_cli-1.0.0 → modelinfo_cli-1.2.0}/tests/test_constraints.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: modelinfo-cli
3
- Version: 1.0.0
3
+ Version: 1.2.0
4
4
  Summary: A sub-100ms, zero-dependency CLI tool to inspect ML model checkpoints and dynamically calculate VRAM requirements.
5
5
  Author: ModelInfo Contributors
6
6
  License: MIT
@@ -28,12 +28,13 @@ It reads binary headers directly using the Python standard library. By bypassing
28
28
 
29
29
  ## Features
30
30
 
31
- - **Zero-Dependency Parsing**: Reads the 8-byte JSON prefix of `.safetensors` files and the binary key-value metadata of `.gguf` directly via `struct` and `json`.
31
+ - **Zero-Dependency Parsing**: Reads the 8-byte length prefix and the JSON header of `.safetensors` files and the binary key-value metadata of `.gguf` directly via `struct` and `json`. Seamlessly reads adjacent `config.json` for robust fallback logic.
32
+ - **Remote Hugging Face Hub Inspection**: Inspect any public or gated model directly via its repo ID (e.g., `modelinfo meta-llama/Llama-2-7b-hf`) without downloading the 15GB checkpoint. Uses concurrent byte-range requests to pluck the binary headers directly off the CDN in under 2 seconds.
32
33
  - **Sharded Model Support**: Transparently parses `model.safetensors.index.json` to detect multi-file checkpoint distributions, gracefully guarding against partial downloads without crashing.
33
- - **Dynamic VRAM Estimation**: Extracts underlying model architecture (layers, heads, dimensions) to calculate exact VRAM limits, including dynamic KV cache footprints based on user-specified context lengths.
34
+ - **Dynamic VRAM Estimation**: Extracts underlying model architecture (layers, heads, dimensions) to calculate exact VRAM limits, including dynamic KV cache footprints based on user-specified context lengths. Defaults to 8192 tokens to prevent unrealistic VRAM calculations, while still warning users if the requested context exceeds the model's native limit. Estimates include a standard 600MB CUDA context overhead.
34
35
  - **Precise Block Quantization**: Factors in exact byte-scaling coefficients for GGUF formats (e.g., Q8, Q6, Q4) rather than naive averages, eliminating VRAM under-reporting.
35
36
  - **Secure Pickling**: Inspects legacy `.pt` files without executing arbitrary code by using a highly restricted `pickle.Unpickler`.
36
- - **Terminal UI**: Groups repetitive structural layers and color-codes VRAM heatmaps using `rich`.
37
+ - **Terminal UI**: Groups repetitive structural layers and color-codes VRAM heatmaps using `rich`. Breaks down memory footprints into Weights, KV Cache, and Overhead.
37
38
 
38
39
  ## Installation
39
40
 
@@ -67,12 +68,18 @@ pytest tests/ -v
67
68
 
68
69
  ## Usage
69
70
 
70
- Inspect a model checkpoint:
71
+ Inspect a local model checkpoint:
71
72
 
72
73
  ```bash
73
74
  modelinfo mistral-7b.safetensors
74
75
  ```
75
76
 
77
+ Inspect a remote model directly from the Hugging Face Hub:
78
+
79
+ ```bash
80
+ modelinfo meta-llama/Llama-2-7b-hf
81
+ ```
82
+
76
83
  Calculate the memory footprint with a specific KV cache context window:
77
84
 
78
85
  ```bash
@@ -10,12 +10,13 @@ It reads binary headers directly using the Python standard library. By bypassing
10
10
 
11
11
  ## Features
12
12
 
13
- - **Zero-Dependency Parsing**: Reads the 8-byte JSON prefix of `.safetensors` files and the binary key-value metadata of `.gguf` directly via `struct` and `json`.
13
+ - **Zero-Dependency Parsing**: Reads the 8-byte length prefix and the JSON header of `.safetensors` files and the binary key-value metadata of `.gguf` directly via `struct` and `json`. Seamlessly reads adjacent `config.json` for robust fallback logic.
14
+ - **Remote Hugging Face Hub Inspection**: Inspect any public or gated model directly via its repo ID (e.g., `modelinfo meta-llama/Llama-2-7b-hf`) without downloading the 15GB checkpoint. Uses concurrent byte-range requests to pluck the binary headers directly off the CDN in under 2 seconds.
14
15
  - **Sharded Model Support**: Transparently parses `model.safetensors.index.json` to detect multi-file checkpoint distributions, gracefully guarding against partial downloads without crashing.
15
- - **Dynamic VRAM Estimation**: Extracts underlying model architecture (layers, heads, dimensions) to calculate exact VRAM limits, including dynamic KV cache footprints based on user-specified context lengths.
16
+ - **Dynamic VRAM Estimation**: Extracts underlying model architecture (layers, heads, dimensions) to calculate exact VRAM limits, including dynamic KV cache footprints based on user-specified context lengths. Defaults to 8192 tokens to prevent unrealistic VRAM calculations, while still warning users if the requested context exceeds the model's native limit. Estimates include a standard 600MB CUDA context overhead.
16
17
  - **Precise Block Quantization**: Factors in exact byte-scaling coefficients for GGUF formats (e.g., Q8, Q6, Q4) rather than naive averages, eliminating VRAM under-reporting.
17
18
  - **Secure Pickling**: Inspects legacy `.pt` files without executing arbitrary code by using a highly restricted `pickle.Unpickler`.
18
- - **Terminal UI**: Groups repetitive structural layers and color-codes VRAM heatmaps using `rich`.
19
+ - **Terminal UI**: Groups repetitive structural layers and color-codes VRAM heatmaps using `rich`. Breaks down memory footprints into Weights, KV Cache, and Overhead.
19
20
 
20
21
  ## Installation
21
22
 
@@ -49,12 +50,18 @@ pytest tests/ -v
49
50
 
50
51
  ## Usage
51
52
 
52
- Inspect a model checkpoint:
53
+ Inspect a local model checkpoint:
53
54
 
54
55
  ```bash
55
56
  modelinfo mistral-7b.safetensors
56
57
  ```
57
58
 
59
+ Inspect a remote model directly from the Hugging Face Hub:
60
+
61
+ ```bash
62
+ modelinfo meta-llama/Llama-2-7b-hf
63
+ ```
64
+
58
65
  Calculate the memory footprint with a specific KV cache context window:
59
66
 
60
67
  ```bash
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "modelinfo-cli"
7
- version = "1.0.0"
7
+ version = "1.2.0"
8
8
  description = "A sub-100ms, zero-dependency CLI tool to inspect ML model checkpoints and dynamically calculate VRAM requirements."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -2,4 +2,4 @@
2
2
  modelinfo - A high-performance CLI utility for inspecting ML model checkpoints.
3
3
  """
4
4
 
5
- __version__ = "0.1.0"
5
+ __version__ = "1.2.0"
@@ -0,0 +1,113 @@
1
+ import os
2
+ import json
3
+ from typing import Any, Dict, Tuple
4
+
5
def extract_architecture(tensors: Dict[str, Any], config: Dict[str, Any] = None) -> Tuple[int, int, bool]:
    """
    Extract the number of layers and the KV cache dimension (kv_heads * head_dim).

    Sources are tried in order of reliability:
      1. GGUF key-value metadata stored under ``tensors["__metadata__"]``.
      2. A Hugging Face style ``config`` dictionary (parsed ``config.json``).
      3. Guessing from tensor names and shapes.

    Args:
        tensors: Mapping of tensor name to its header entry (a dict that may
            carry a ``"shape"`` list). The optional ``"__metadata__"`` entry
            holds format-level metadata.
        config: Optional parsed ``config.json`` dictionary.

    Returns:
        ``(num_layers, kv_dim, is_estimate)``. ``is_estimate`` is True only
        when ``kv_dim`` had to be guessed from a fused QKV projection.
    """

    def _positive_int(value: Any) -> int:
        # Metadata may hold None, lists (per-layer values) or strings; treat
        # anything that is not a positive integer as "unknown".
        if isinstance(value, int) and not isinstance(value, bool) and value > 0:
            return value
        return 0

    num_layers = 0
    kv_dim = 0

    metadata = tensors.get("__metadata__", {})
    gen_arch = metadata.get("general.architecture")

    # 1. Attempt explicit GGUF metadata
    if gen_arch:
        prefix = str(gen_arch)
        num_layers = _positive_int(metadata.get(f"{prefix}.block_count"))
        q_heads = _positive_int(metadata.get(f"{prefix}.attention.head_count"))

        # A missing KV head count means plain multi-head attention, so the
        # KV heads equal the query heads.
        raw_kv_heads = metadata.get(f"{prefix}.attention.head_count_kv")
        kv_heads = q_heads if raw_kv_heads is None else _positive_int(raw_kv_heads)

        key_length = _positive_int(metadata.get(f"{prefix}.attention.key_length"))
        if not key_length and q_heads:
            embed_len = _positive_int(metadata.get(f"{prefix}.embedding_length"))
            key_length = embed_len // q_heads

        if kv_heads and key_length:
            kv_dim = kv_heads * key_length
            if num_layers:
                return num_layers, kv_dim, False

    # 2. Attempt explicit SafeTensors config.json
    if config:
        num_layers = _positive_int(config.get("num_hidden_layers"))
        q_heads = _positive_int(config.get("num_attention_heads", 1))

        # config.json may omit the key or set it to null; both mean MHA.
        raw_kv_heads = config.get("num_key_value_heads")
        kv_heads = q_heads if raw_kv_heads is None else _positive_int(raw_kv_heads)

        # Prefer an explicit head_dim: some architectures decouple it from
        # hidden_size // num_attention_heads.
        head_dim = _positive_int(config.get("head_dim"))
        if not head_dim and q_heads:
            head_dim = _positive_int(config.get("hidden_size")) // q_heads

        if kv_heads and head_dim:
            kv_dim = kv_heads * head_dim
        if num_layers and kv_dim:
            return num_layers, kv_dim, False

    # 3. Fallback to shape guessing
    layer_indices = set()
    found_fused = False
    found_k_proj = False

    for name, meta in tensors.items():
        if name == "__metadata__":
            continue

        # Layer index follows a "layers" (Llama-style) or "h" (GPT-2-style)
        # path component; only the first marker present is inspected.
        parts = name.split(".")
        for marker in ("layers", "h"):
            if marker in parts:
                idx = parts.index(marker)
                if len(parts) > idx + 1 and parts[idx + 1].isdigit():
                    layer_indices.add(int(parts[idx + 1]))
                break

        if name.endswith(("k_proj.weight", "attn.k.weight", "k_proj.w")):
            found_k_proj = True
            shape = meta.get("shape", [])
            if len(shape) >= 2:
                kv_dim = shape[0]

        if "qkv_proj.weight" in name or "c_attn.weight" in name:
            found_fused = True
            if not found_k_proj:
                shape = meta.get("shape", [])
                if len(shape) >= 2:
                    # Assumes Q, K and V are equally sized thirds of the fused
                    # projection, hence the result is flagged as an estimate.
                    kv_dim = shape[0] // 3

    num_layers = len(layer_indices)
    is_estimate = bool(found_fused and not found_k_proj and kv_dim > 0)

    return num_layers, kv_dim, is_estimate
87
+
88
def identify_architecture_name(tensors: Dict[str, Any], num_layers: int, config: Dict[str, Any] = None) -> str:
    """
    Produce a human-readable architecture label.

    Preference order: the first entry of ``config["architectures"]``, then the
    GGUF ``general.architecture`` metadata value, then a family keyword found
    in a tensor name, and finally a generic label.
    """
    declared = config.get("architectures") if config else None
    if declared:
        label = declared[0]
        if num_layers:
            return f"{label} ({num_layers} layers)"
        return label

    gguf_arch = tensors.get("__metadata__", {}).get("general.architecture")
    if gguf_arch:
        label = str(gguf_arch).title()
        if num_layers:
            return f"{label} ({num_layers} transformer layers)"
        return label

    # Keyword -> display name, checked in this order for every tensor name.
    families = (("llama", "Llama"), ("mistral", "Mistral"), ("qwen", "Qwen"))
    for tensor_name in tensors:
        if tensor_name == "__metadata__":
            continue
        lowered = tensor_name.lower()
        for keyword, family in families:
            if keyword in lowered:
                if num_layers:
                    return f"{family} ({num_layers} transformer layers)"
                return family

    if num_layers > 0:
        return f"Generic Transformer ({num_layers} layers)"
    return "Unknown Architecture"
@@ -29,7 +29,7 @@ def _get_bytes_per_param(dtype: str) -> float:
29
29
  """Return the size in bytes for a given data type."""
30
30
  return DTYPE_BYTES.get(dtype.upper(), 2.0)
31
31
 
32
- def calculate_footprint(tensors: Dict[str, Any], context_length: int = 0, batch_size: int = 1) -> Dict[str, Any]:
32
+ def calculate_footprint(tensors: Dict[str, Any], context_length: int = 0, batch_size: int = 1, config: Dict[str, Any] = None) -> Dict[str, Any]:
33
33
  """
34
34
  Calculate the memory footprint of a model based on its tensors and context length.
35
35
  """
@@ -54,7 +54,7 @@ def calculate_footprint(tensors: Dict[str, Any], context_length: int = 0, batch_
54
54
  bytes_per_param = _get_bytes_per_param(dtype)
55
55
  base_memory_bytes += param_count * bytes_per_param
56
56
 
57
- num_layers, kv_dim = extract_architecture(tensors)
57
+ num_layers, kv_dim, is_estimate = extract_architecture(tensors, config)
58
58
 
59
59
  # Formula: 2 * Layers * (KV_Heads * Head_Dim) * Context_Length * Batch_Size * Bytes_per_param
60
60
  # Assume FP16 (2 bytes) for KV cache
@@ -62,14 +62,19 @@ def calculate_footprint(tensors: Dict[str, Any], context_length: int = 0, batch_
62
62
 
63
63
  primary_dtype = max(dtype_counts.items(), key=lambda x: x[1])[0] if dtype_counts else "Unknown"
64
64
 
65
+ CUDA_CONTEXT_MB = 600
66
+ overhead_bytes = CUDA_CONTEXT_MB * 1024 * 1024
67
+
65
68
  return {
66
69
  "total_params": total_params,
67
70
  "base_memory_bytes": base_memory_bytes,
68
71
  "kv_cache_bytes": kv_cache_bytes,
69
- "total_memory_bytes": base_memory_bytes + kv_cache_bytes,
72
+ "overhead_bytes": overhead_bytes,
73
+ "total_memory_bytes": base_memory_bytes + kv_cache_bytes + overhead_bytes,
70
74
  "num_layers": num_layers,
71
75
  "kv_dim": kv_dim,
72
- "primary_dtype": primary_dtype
76
+ "primary_dtype": primary_dtype,
77
+ "kv_is_estimate": is_estimate
73
78
  }
74
79
 
75
80
  def format_bytes(size_bytes: float) -> str:
@@ -0,0 +1,116 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ import sys
5
+ from typing import Sequence
6
+
7
+ from modelinfo.architecture import identify_architecture_name
8
+ from modelinfo.calculator import calculate_footprint
9
+ from modelinfo.parsers.gguf import parse_gguf_header
10
+ from modelinfo.parsers.pytorch import parse_pytorch_header
11
+ from modelinfo.parsers.safetensors import parse_safetensors_header
12
+ from modelinfo.ui import console, print_model_info
13
+
14
+
15
+ def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
16
+ parser = argparse.ArgumentParser(
17
+ prog="modelinfo",
18
+ description="High-performance CLI utility to inspect ML model checkpoints and calculate VRAM requirements.",
19
+ )
20
+
21
+ parser.add_argument(
22
+ "file",
23
+ type=str,
24
+ help="Path to the model checkpoint file (.safetensors, .gguf, .pt)",
25
+ )
26
+ parser.add_argument(
27
+ "--context",
28
+ type=int,
29
+ default=None,
30
+ help="Context length for dynamic KV cache footprint calculation.",
31
+ )
32
+
33
+ return parser.parse_args(argv)
34
+
35
+
36
def main(argv: Sequence[str] | None = None) -> int:
    """
    Entry point of the CLI: load tensor metadata, compute the footprint, print it.

    The positional argument is treated as a Hugging Face repo ID when it does
    not exist on disk and has no recognised checkpoint extension; otherwise
    the parser is chosen by extension.

    Args:
        argv: Argument list; ``None`` defers to ``sys.argv``.

    Returns:
        Process exit code: 0 on success, 1 when the input could not be resolved.
    """
    args = parse_args(argv)

    # Lowercase only for extension matching. The existence check must use the
    # path exactly as typed, because filesystems are usually case-sensitive.
    lowered = args.file.lower()
    exists_locally = os.path.exists(args.file)
    known_suffixes = (".safetensors", ".gguf", ".pt", ".bin", ".index.json")

    tensors = {}
    config = None
    disk_size = 0.0  # always bound, even when the local file is missing
    is_remote = False

    if not exists_locally and not lowered.endswith(known_suffixes):
        from modelinfo.parsers.huggingface import fetch_huggingface_repo
        try:
            tensors, config, format_name, disk_size = fetch_huggingface_repo(args.file)
        except Exception as e:
            console.print(f"[red]Error fetching from Hugging Face: {e}[/red]")
            return 1
        is_remote = True
    elif lowered.endswith((".safetensors", ".index.json")):
        tensors = parse_safetensors_header(args.file)
        format_name = "SafeTensors"

        # Read config.json to maintain pure math engines
        config_path = os.path.join(os.path.dirname(args.file), "config.json")
        if os.path.exists(config_path):
            try:
                with open(config_path, "r", encoding="utf-8") as f:
                    config = json.load(f)
            except (json.JSONDecodeError, OSError):
                # Best effort: an unreadable config only disables the
                # config-based architecture lookup.
                pass

    elif lowered.endswith(".gguf"):
        tensors = parse_gguf_header(args.file)
        format_name = "GGUF"
    elif lowered.endswith((".pt", ".bin")):
        tensors = parse_pytorch_header(args.file)
        format_name = "PyTorch"
    else:
        # Only reachable when the path exists but has no supported extension.
        console.print(
            f"[red]Error: '{args.file}' exists locally but is not a supported checkpoint "
            f"(.safetensors, .index.json, .gguf, .pt, .bin).[/red]"
        )
        return 1

    max_context = None
    if config:
        max_context = config.get("max_position_embeddings")
    elif format_name == "GGUF":
        metadata = tensors.get("__metadata__", {})
        gen_arch = metadata.get("general.architecture")
        if gen_arch:
            max_context = metadata.get(f"{gen_arch}.context_length")

    # Determine the actual context length to use for calculation
    is_default_context = False
    context_length = args.context
    if context_length is None:
        context_length = min(8192, max_context) if max_context else 8192
        is_default_context = True

    footprint = calculate_footprint(tensors, context_length=context_length, config=config)
    num_layers = footprint["num_layers"]
    arch_name = identify_architecture_name(tensors, num_layers, config)

    # Remote repos already carry the size reported by the Hub; local inputs
    # are measured on disk.
    if not is_remote:
        disk_size = os.path.getsize(args.file) if os.path.exists(args.file) else 0.0

    tensor_count = len([k for k in tensors.keys() if k != "__metadata__"])

    print_model_info(
        format_name=format_name,
        arch_name=arch_name,
        tensor_count=tensor_count,
        footprint=footprint,
        disk_size=disk_size,
        context_length=context_length,
        is_default_context=is_default_context,
        tensors=tensors,
        max_context=max_context
    )

    return 0
113
+
114
+
115
+ if __name__ == "__main__":
116
+ sys.exit(main())
@@ -0,0 +1,151 @@
1
+ import concurrent.futures
2
+ import json
3
+ import os
4
+ import struct
5
+ import urllib.error
6
+ import urllib.request
7
+ from typing import Any, Dict, Tuple
8
+
9
+ def _get_hf_token() -> str | None:
10
+ token = os.environ.get("HF_TOKEN")
11
+ if token:
12
+ return token
13
+
14
+ cache_path = os.path.expanduser("~/.cache/huggingface/token")
15
+ if os.path.exists(cache_path):
16
+ try:
17
+ with open(cache_path, "r", encoding="utf-8") as f:
18
+ return f.read().strip()
19
+ except OSError:
20
+ pass
21
+
22
+ legacy_path = os.path.expanduser("~/.huggingface/token")
23
+ if os.path.exists(legacy_path):
24
+ try:
25
+ with open(legacy_path, "r", encoding="utf-8") as f:
26
+ return f.read().strip()
27
+ except OSError:
28
+ pass
29
+
30
+ return None
31
+
32
def _make_request(url: str, headers: Dict[str, str] = None) -> bytes:
    """
    Perform a GET request against the Hugging Face Hub and return the body.

    A bearer token from ``_get_hf_token()`` is attached when one is available.

    Args:
        url: Absolute URL to fetch.
        headers: Optional extra request headers (e.g. ``Range``). The mapping
            is copied, never modified.

    Returns:
        The raw response body.

    Raises:
        PermissionError: On HTTP 401 or 403.
        FileNotFoundError: On HTTP 404.
        urllib.error.HTTPError: On any other HTTP error status.
    """
    # Copy so the caller's dict never silently gains an Authorization entry.
    request_headers = dict(headers) if headers else {}

    token = _get_hf_token()
    if token:
        # NOTE(review): urllib appears to re-send request headers when it
        # follows redirects, so the token may also reach the CDN host - confirm.
        request_headers["Authorization"] = f"Bearer {token}"

    req = urllib.request.Request(url, headers=request_headers)
    try:
        with urllib.request.urlopen(req, timeout=10) as response:
            return response.read()
    except urllib.error.HTTPError as e:
        if e.code == 401:
            raise PermissionError(
                f"🔒 Gated Model or Invalid Token: Please set HF_TOKEN environment variable to access {url}"
            ) from e
        if e.code == 403:
            raise PermissionError(
                f"🔒 Access Denied (HTTP 403): the current token does not grant access to {url}"
            ) from e
        if e.code == 404:
            raise FileNotFoundError(f"File not found on Hugging Face Hub: {url}") from e
        raise
50
+
51
def _fetch_safetensors_header(repo_id: str, filename: str) -> Dict[str, Any]:
    """
    Download and parse only the JSON header of a remote ``.safetensors`` file.

    The file starts with an 8-byte little-endian length followed by that many
    bytes of JSON. The first ~500KB are fetched in one ranged request; a
    second ranged request is issued only when the header is larger than that.

    Args:
        repo_id: Hub repository ID, e.g. ``org/model``.
        filename: File path inside the repository.

    Returns:
        The decoded header dictionary.

    Raises:
        ValueError: If the file is too small, declares an implausible header
            size, the server returns a body of unexpected length, or the
            header is not a JSON object.
    """
    # safetensors documents a 100MB cap on header size; anything larger is
    # treated as corrupt rather than downloaded.
    max_header_bytes = 100_000_000
    url = f"https://huggingface.co/{repo_id}/resolve/main/{filename}"

    # 1. Fetch the first 500KB in a single roundtrip
    try:
        chunk = _make_request(url, headers={"Range": "bytes=0-500000"})
    except urllib.error.HTTPError as e:
        if e.code != 416:
            raise
        # Range Not Satisfiable: fall back to a plain request.
        chunk = _make_request(url)

    if len(chunk) < 8:
        raise ValueError(f"File {filename} is too small to contain a SafeTensors header.")

    header_size = struct.unpack("<Q", chunk[:8])[0]
    if header_size == 0 or header_size > max_header_bytes:
        raise ValueError(
            f"File {filename} declares an implausible SafeTensors header size ({header_size} bytes)."
        )
    header_end = 8 + header_size

    # 2. Slice locally if it fits
    if header_end <= len(chunk):
        json_bytes = chunk[8:header_end]
    else:
        # 3. Double-roundtrip only if the header is massive (>500KB)
        json_bytes = _make_request(url, headers={"Range": f"bytes=8-{header_end - 1}"})
        # A truncated body, or a server that ignored the Range header, would
        # otherwise surface as a confusing JSON decode error.
        if len(json_bytes) != header_size:
            raise ValueError(
                f"Expected {header_size} header bytes for {filename}, received {len(json_bytes)}."
            )

    header = json.loads(json_bytes)
    if not isinstance(header, dict):
        raise ValueError(f"SafeTensors header of {filename} is not a JSON object.")
    return header
78
+
79
def fetch_huggingface_repo(repo_id: str) -> Tuple[Dict[str, Any], Dict[str, Any] | None, str, float]:
    """
    Fetch model metadata directly from the Hugging Face Hub over the network.

    Only ``config.json``, the optional shard index and the SafeTensors headers
    are downloaded; tensor data is never fetched.

    Args:
        repo_id: Hub repository ID, e.g. ``org/model``.

    Returns:
        ``(tensors, config, format_name, disk_size)`` where ``tensors`` maps
        tensor names to header entries (plus ``"__metadata__"``), ``config``
        is the parsed ``config.json`` or ``None``, ``format_name`` is
        ``"SafeTensors"`` and ``disk_size`` is the reported byte size (0.0
        when unknown).

    Raises:
        PermissionError: If the repository requires authentication.
        FileNotFoundError: If the repository or one of its files is missing.
        ValueError: If the repository has no usable SafeTensors weights.
    """
    api_url = f"https://huggingface.co/api/models/{repo_id}"
    try:
        api_data = json.loads(_make_request(api_url).decode("utf-8"))
    except PermissionError as e:
        # _make_request has already turned the 401 into a PermissionError
        # mentioning the raw URL; re-raise with the friendlier repo-level text.
        raise PermissionError(
            f"🔒 Gated Model: Please set HF_TOKEN environment variable to access {repo_id}"
        ) from e

    siblings = api_data.get("siblings", [])
    filenames = {s["rfilename"] for s in siblings if isinstance(s, dict) and "rfilename" in s}

    config = None
    if "config.json" in filenames:
        config_url = f"https://huggingface.co/{repo_id}/resolve/main/config.json"
        config = json.loads(_make_request(config_url).decode("utf-8"))

    tensors = {}
    total_size = 0.0

    if "model.safetensors.index.json" in filenames:
        # Sharded SafeTensors
        index_url = f"https://huggingface.co/{repo_id}/resolve/main/model.safetensors.index.json"
        index_data = json.loads(_make_request(index_url).decode("utf-8"))

        weight_map = index_data.get("weight_map", {})
        # Sorted so the merged tensor order does not depend on set iteration
        # or on which download finishes first.
        unique_shards = sorted(set(weight_map.values()))
        if not unique_shards:
            # ThreadPoolExecutor rejects max_workers=0; fail with a clear message.
            raise ValueError(f"Repository {repo_id} has a SafeTensors index with an empty weight_map.")

        total_size = index_data.get("metadata", {}).get("total_size", 0.0)

        with concurrent.futures.ThreadPoolExecutor(max_workers=min(8, len(unique_shards))) as executor:
            # map() yields results in submission order and re-raises the
            # first worker exception.
            shard_headers = list(
                executor.map(lambda shard: _fetch_safetensors_header(repo_id, shard), unique_shards)
            )

        for shard_header in shard_headers:
            for k, v in shard_header.items():
                if k != "__metadata__":
                    tensors[k] = v

        tensors["__metadata__"] = {
            "missing_shards": 0,
            "total_shards": len(unique_shards),
            "is_sharded": True
        }
        format_name = "SafeTensors"

    elif "model.safetensors" in filenames:
        # Single SafeTensors
        tensors = _fetch_safetensors_header(repo_id, "model.safetensors")
        format_name = "SafeTensors"

        # No index to report the size, so ask the server via Content-Length.
        req = urllib.request.Request(f"https://huggingface.co/{repo_id}/resolve/main/model.safetensors", method="HEAD")
        token = _get_hf_token()
        if token:
            req.add_header("Authorization", f"Bearer {token}")
        try:
            with urllib.request.urlopen(req, timeout=10) as response:
                total_size = int(response.headers.get("Content-Length", 0))
        except Exception:
            # Deliberately best effort: the size is cosmetic, so any failure
            # here only leaves it reported as unknown.
            total_size = 0.0

    else:
        raise ValueError(f"Repository {repo_id} does not contain SafeTensors weights.")

    return tensors, config, format_name, float(total_size)
@@ -43,7 +43,9 @@ def print_model_info(
43
43
  footprint: Dict[str, Any],
44
44
  disk_size: float,
45
45
  context_length: int,
46
- tensors: Dict[str, Any]
46
+ is_default_context: bool,
47
+ tensors: Dict[str, Any],
48
+ max_context: int | None = None
47
49
  ) -> None:
48
50
  summary = Table(box=None, show_header=False, pad_edge=False, padding=(0, 2))
49
51
  summary.add_column("Property", style="bold")
@@ -57,18 +59,38 @@ def print_model_info(
57
59
  param_text = "[yellow]UNKNOWN (Missing Shards)[/yellow]"
58
60
  disk_text = "[yellow]UNKNOWN (Missing Shards)[/yellow]"
59
61
  vram_display = "[yellow]UNKNOWN (Missing Shards)[/yellow]"
62
+ elif footprint["total_memory_bytes"] == 0:
63
+ param_text = format_params(footprint["total_params"])
64
+ disk_text = format_bytes(disk_size)
65
+ vram_display = "[red]Unknown (Missing Tensor Shapes)[/red]"
60
66
  else:
61
67
  param_text = format_params(footprint["total_params"])
62
68
  disk_text = format_bytes(disk_size)
63
69
  vram_bytes = footprint["total_memory_bytes"]
64
70
  vram_color = "green" if vram_bytes < 8 * 1024**3 else "yellow" if vram_bytes < 16 * 1024**3 else "red"
65
71
 
66
- vram_text = f"~{format_bytes(vram_bytes)}"
67
- if context_length > 0:
68
- vram_text += f" ({footprint['primary_dtype']}, KV cache for {context_length} tokens)"
72
+ vram_text = f"~{format_bytes(vram_bytes)} Total Minimum Required"
73
+ vram_display = f"[{vram_color}]{vram_text}[/{vram_color}]\n"
74
+
75
+ weights_bytes = footprint["base_memory_bytes"]
76
+ vram_display += f" ├─ Weights: {format_bytes(weights_bytes)}\n"
77
+
78
+ kv_cache_bytes = footprint["kv_cache_bytes"]
79
+
80
+ if footprint.get("kv_is_estimate"):
81
+ kv_note = " (Estimated KV Cache - Missing Config)"
82
+ elif is_default_context:
83
+ if max_context and max_context > context_length:
84
+ kv_note = f" (Default {context_length} tokens. Native limit: {max_context:,})"
85
+ else:
86
+ kv_note = f" (Default {context_length} tokens)"
69
87
  else:
70
- vram_text += f" ({footprint['primary_dtype']}, no KV cache)"
71
- vram_display = f"[{vram_color}]{vram_text}[/{vram_color}]"
88
+ kv_note = f" ({context_length} tokens)"
89
+
90
+ vram_display += f" ├─ KV Cache: {format_bytes(kv_cache_bytes)}{kv_note}\n"
91
+
92
+ overhead_bytes = footprint.get("overhead_bytes", 600 * 1024 * 1024)
93
+ vram_display += f" └─ Overhead: {format_bytes(overhead_bytes)} (CUDA Context + Activations)"
72
94
 
73
95
  summary.add_row("Format:", format_name)
74
96
  summary.add_row("Architecture:", arch_name)
@@ -81,8 +103,11 @@ def print_model_info(
81
103
  console.print(summary)
82
104
 
83
105
  if missing_shards > 0:
84
- console.print(f"[bold yellow]⚠️ Partial Model: Missing {missing_shards} of {total_shards} shards on disk. Totals are incomplete.[/bold yellow]")
106
+ console.print(f"[bold yellow]WARNING: Partial Model. Missing {missing_shards} of {total_shards} shards on disk. Totals are incomplete.[/bold yellow]")
85
107
 
108
+ if context_length > 0 and max_context is not None and context_length > max_context:
109
+ console.print(f"[bold yellow]WARNING: Requested context ({context_length:,}) exceeds model's native limit ({max_context:,}).[/bold yellow]")
110
+
86
111
  console.print()
87
112
 
88
113
  console.print("Top Tensors by Size:", style="bold")
@@ -90,7 +115,7 @@ def print_model_info(
90
115
  grouped_tensors = group_tensors_by_size(tensors)
91
116
 
92
117
  tensor_table = Table(box=None, show_header=False, pad_edge=False, padding=(0, 2))
93
- tensor_table.add_column("Name")
118
+ tensor_table.add_column("Name", no_wrap=True, overflow="fold")
94
119
  tensor_table.add_column("Shape", justify="right")
95
120
  tensor_table.add_column("Dtype", justify="left")
96
121
  tensor_table.add_column("Params", justify="right")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: modelinfo-cli
3
- Version: 1.0.0
3
+ Version: 1.2.0
4
4
  Summary: A sub-100ms, zero-dependency CLI tool to inspect ML model checkpoints and dynamically calculate VRAM requirements.
5
5
  Author: ModelInfo Contributors
6
6
  License: MIT
@@ -28,12 +28,13 @@ It reads binary headers directly using the Python standard library. By bypassing
28
28
 
29
29
  ## Features
30
30
 
31
- - **Zero-Dependency Parsing**: Reads the 8-byte JSON prefix of `.safetensors` files and the binary key-value metadata of `.gguf` directly via `struct` and `json`.
31
+ - **Zero-Dependency Parsing**: Reads the 8-byte length prefix and the JSON header of `.safetensors` files and the binary key-value metadata of `.gguf` directly via `struct` and `json`. Seamlessly reads adjacent `config.json` for robust fallback logic.
32
+ - **Remote Hugging Face Hub Inspection**: Inspect any public or gated model directly via its repo ID (e.g., `modelinfo meta-llama/Llama-2-7b-hf`) without downloading the 15GB checkpoint. Uses concurrent byte-range requests to pluck the binary headers directly off the CDN in under 2 seconds.
32
33
  - **Sharded Model Support**: Transparently parses `model.safetensors.index.json` to detect multi-file checkpoint distributions, gracefully guarding against partial downloads without crashing.
33
- - **Dynamic VRAM Estimation**: Extracts underlying model architecture (layers, heads, dimensions) to calculate exact VRAM limits, including dynamic KV cache footprints based on user-specified context lengths.
34
+ - **Dynamic VRAM Estimation**: Extracts underlying model architecture (layers, heads, dimensions) to calculate exact VRAM limits, including dynamic KV cache footprints based on user-specified context lengths. Defaults to 8192 tokens to prevent unrealistic VRAM calculations, while still warning users if the requested context exceeds the model's native limit. Estimates include a standard 600MB CUDA context overhead.
34
35
  - **Precise Block Quantization**: Factors in exact byte-scaling coefficients for GGUF formats (e.g., Q8, Q6, Q4) rather than naive averages, eliminating VRAM under-reporting.
35
36
  - **Secure Pickling**: Inspects legacy `.pt` files without executing arbitrary code by using a highly restricted `pickle.Unpickler`.
36
- - **Terminal UI**: Groups repetitive structural layers and color-codes VRAM heatmaps using `rich`.
37
+ - **Terminal UI**: Groups repetitive structural layers and color-codes VRAM heatmaps using `rich`. Breaks down memory footprints into Weights, KV Cache, and Overhead.
37
38
 
38
39
  ## Installation
39
40
 
@@ -67,12 +68,18 @@ pytest tests/ -v
67
68
 
68
69
  ## Usage
69
70
 
70
- Inspect a model checkpoint:
71
+ Inspect a local model checkpoint:
71
72
 
72
73
  ```bash
73
74
  modelinfo mistral-7b.safetensors
74
75
  ```
75
76
 
77
+ Inspect a remote model directly from the Hugging Face Hub:
78
+
79
+ ```bash
80
+ modelinfo meta-llama/Llama-2-7b-hf
81
+ ```
82
+
76
83
  Calculate the memory footprint with a specific KV cache context window:
77
84
 
78
85
  ```bash
@@ -10,6 +10,7 @@ src/modelinfo/ui.py
10
10
  src/modelinfo/parsers/__init__.py
11
11
  src/modelinfo/parsers/base.py
12
12
  src/modelinfo/parsers/gguf.py
13
+ src/modelinfo/parsers/huggingface.py
13
14
  src/modelinfo/parsers/pytorch.py
14
15
  src/modelinfo/parsers/safetensors.py
15
16
  src/modelinfo_cli.egg-info/PKG-INFO
@@ -63,3 +63,50 @@ def test_dynamic_kv_cache():
63
63
  assert footprint["num_layers"] == 2
64
64
  assert footprint["kv_dim"] == 1024
65
65
  assert footprint["kv_cache_bytes"] == 8192000
66
+
67
+ def test_safetensors_config_fallback():
68
+ """Verify that architecture extraction correctly parses a config dictionary for SafeTensors."""
69
+ from modelinfo.architecture import extract_architecture
70
+
71
+ tensors = {
72
+ "model.layers.0.qkv_proj.weight": {
73
+ "shape": [6144, 4096],
74
+ "dtype": "F16"
75
+ }
76
+ }
77
+
78
+ config = {
79
+ "num_hidden_layers": 32,
80
+ "num_attention_heads": 32,
81
+ "num_key_value_heads": 8,
82
+ "hidden_size": 4096
83
+ }
84
+
85
+ num_layers, kv_dim, is_estimate = extract_architecture(tensors, config=config)
86
+
87
+ assert num_layers == 32
88
+ assert kv_dim == 1024
89
+ assert is_estimate is False
90
+
91
+ def test_kv_cache_is_fp16():
92
+ """Verify that KV cache is always calculated using 2.0 bytes per element (FP16), even for Q4 base models."""
93
+ tensors = {
94
+ "model.layers.0.attn.weight": {"shape": [4096, 4096], "dtype": "Q4"},
95
+ "model.layers.0.attn.k.weight": {"shape": [1024, 4096], "dtype": "Q4"},
96
+ }
97
+
98
+ footprint = calculate_footprint(tensors, context_length=8192)
99
+
100
+ assert footprint["kv_cache_bytes"] == 33554432
101
+ assert footprint["primary_dtype"] == "Q4"
102
+
103
+ def test_framework_overhead_included():
104
+ """Verify that CUDA context and activation overhead is correctly included."""
105
+ tensors = {
106
+ "model.layers.0.attn.weight": {"shape": [1024, 1024], "dtype": "F16"}
107
+ }
108
+ footprint = calculate_footprint(tensors)
109
+
110
+ assert "overhead_bytes" in footprint
111
+ assert footprint["overhead_bytes"] == 600 * 1024 * 1024
112
+ assert footprint["total_memory_bytes"] == footprint["base_memory_bytes"] + footprint["kv_cache_bytes"] + footprint["overhead_bytes"]
@@ -30,3 +30,18 @@ def test_missing_shard_handling():
30
30
  # it fails safely when a file truly doesn't exist.
31
31
  with pytest.raises(FileNotFoundError):
32
32
  parse_safetensors_header(os.path.join(FIXTURES_DIR, "does_not_exist.safetensors"))
33
+
34
+ def test_gguf_parser_metadata():
35
+ """Verify that the GGUF parser extracts the global metadata that architecture identification relies on."""
36
+ from modelinfo.parsers.gguf import parse_gguf_header
37
+ from modelinfo.architecture import identify_architecture_name
38
+
39
+ mock_path = os.path.join(FIXTURES_DIR, "mock_model.gguf")
40
+ tensors = parse_gguf_header(mock_path)
41
+
42
+ assert "__metadata__" in tensors
43
+ assert tensors["__metadata__"]["general.architecture"] == "qwen2"
44
+
45
+ # Verify the architecture bypass parses it to titlecase and prevents "Unknown Architecture"
46
+ arch_name = identify_architecture_name(tensors, num_layers=1)
47
+ assert arch_name == "Qwen2 (1 transformer layers)"
@@ -1,45 +0,0 @@
1
- from typing import Any, Dict, Tuple
2
-
3
- def extract_architecture(tensors: Dict[str, Any]) -> Tuple[int, int]:
4
- """
5
- Extracts the number of layers and KV cache dimension (kv_heads * head_dim)
6
- from tensor metadata.
7
- """
8
- layers = set()
9
- kv_dim = 0
10
-
11
- for name, metadata in tensors.items():
12
- if name == "__metadata__":
13
- continue
14
-
15
- parts = name.split(".")
16
-
17
- if "layers" in parts:
18
- idx = parts.index("layers")
19
- if len(parts) > idx + 1 and parts[idx+1].isdigit():
20
- layers.add(int(parts[idx+1]))
21
- elif "h" in parts:
22
- idx = parts.index("h")
23
- if len(parts) > idx + 1 and parts[idx+1].isdigit():
24
- layers.add(int(parts[idx+1]))
25
-
26
- if name.endswith("k_proj.weight") or name.endswith("attn.k.weight") or name.endswith("k_proj.w"):
27
- shape = metadata.get("shape", [])
28
- if len(shape) >= 2:
29
- # Typically [out_features, in_features], so out_features is shape[0]
30
- kv_dim = shape[0]
31
-
32
- return len(layers), kv_dim
33
-
34
- def identify_architecture_name(tensors: Dict[str, Any], num_layers: int) -> str:
35
- """Attempt to identify the architecture family based on tensor names."""
36
- for name in tensors.keys():
37
- name_lower = name.lower()
38
- if "llama" in name_lower:
39
- return f"Llama ({num_layers} transformer layers)" if num_layers else "Llama"
40
- if "mistral" in name_lower:
41
- return f"Mistral ({num_layers} transformer layers)" if num_layers else "Mistral"
42
- if "qwen" in name_lower:
43
- return f"Qwen ({num_layers} transformer layers)" if num_layers else "Qwen"
44
-
45
- return f"Generic Transformer ({num_layers} layers)" if num_layers > 0 else "Unknown Architecture"
@@ -1,77 +0,0 @@
1
- import argparse
2
- import os
3
- import sys
4
- from typing import Sequence
5
-
6
- from modelinfo.architecture import identify_architecture_name
7
- from modelinfo.calculator import calculate_footprint
8
- from modelinfo.parsers.gguf import parse_gguf_header
9
- from modelinfo.parsers.pytorch import parse_pytorch_header
10
- from modelinfo.parsers.safetensors import parse_safetensors_header
11
- from modelinfo.ui import console, print_model_info
12
-
13
-
14
- def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
15
- parser = argparse.ArgumentParser(
16
- prog="modelinfo",
17
- description="High-performance CLI utility to inspect ML model checkpoints and calculate VRAM requirements.",
18
- )
19
-
20
- parser.add_argument(
21
- "file",
22
- type=str,
23
- help="Path to the model checkpoint file (.safetensors, .gguf, .pt)",
24
- )
25
- parser.add_argument(
26
- "--context",
27
- type=int,
28
- default=0,
29
- help="Context length for dynamic KV cache footprint calculation.",
30
- )
31
-
32
- return parser.parse_args(argv)
33
-
34
-
35
- def main(argv: Sequence[str] | None = None) -> int:
36
- args = parse_args(argv)
37
-
38
- file_path = args.file.lower()
39
- tensors = {}
40
-
41
- if file_path.endswith(".safetensors") or file_path.endswith(".index.json"):
42
- tensors = parse_safetensors_header(args.file)
43
- format_name = "SafeTensors"
44
- elif file_path.endswith(".gguf"):
45
- tensors = parse_gguf_header(args.file)
46
- format_name = "GGUF"
47
- elif file_path.endswith(".pt") or file_path.endswith(".bin"):
48
- tensors = parse_pytorch_header(args.file)
49
- format_name = "PyTorch"
50
- else:
51
- console.print(
52
- f"[red]Error: Unsupported file format '{args.file}'. Supported formats are .safetensors, .gguf, .pt[/red]"
53
- )
54
- return 1
55
-
56
- footprint = calculate_footprint(tensors, context_length=args.context)
57
- num_layers = footprint["num_layers"]
58
- arch_name = identify_architecture_name(tensors, num_layers)
59
-
60
- disk_size = os.path.getsize(args.file) if os.path.exists(args.file) else 0.0
61
- tensor_count = len([k for k in tensors.keys() if k != "__metadata__"])
62
-
63
- print_model_info(
64
- format_name=format_name,
65
- arch_name=arch_name,
66
- tensor_count=tensor_count,
67
- footprint=footprint,
68
- disk_size=disk_size,
69
- context_length=args.context,
70
- tensors=tensors
71
- )
72
-
73
- return 0
74
-
75
-
76
- if __name__ == "__main__":
77
- sys.exit(main())
File without changes
File without changes