PyPI - modelinfo-cli - Versions diffs - 1.0.0__tar.gz → 1.1.0__tar.gz - Mend

modelinfo-cli 1.0.0.tar.gz → 1.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

{modelinfo_cli-1.0.0/src/modelinfo_cli.egg-info → modelinfo_cli-1.1.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: modelinfo-cli
-Version: 1.0.0
+Version: 1.1.0
 Summary: A sub-100ms, zero-dependency CLI tool to inspect ML model checkpoints and dynamically calculate VRAM requirements.
 Author: ModelInfo Contributors
 License: MIT
@@ -28,9 +28,9 @@ It reads binary headers directly using the Python standard library. By bypassing
 ## Features
-- **Zero-Dependency Parsing**: Reads the 8-byte JSON prefix of `.safetensors` files and the binary key-value metadata of `.gguf` directly via `struct` and `json`.
+- **Zero-Dependency Parsing**: Reads the 8-byte JSON prefix of `.safetensors` files and the binary key-value metadata of `.gguf` directly via `struct` and `json`. Seamlessly reads adjacent `config.json` for robust fallback logic.
 - **Sharded Model Support**: Transparently parses `model.safetensors.index.json` to detect multi-file checkpoint distributions, gracefully guarding against partial downloads without crashing.
-- **Dynamic VRAM Estimation**: Extracts underlying model architecture (layers, heads, dimensions) to calculate exact VRAM limits, including dynamic KV cache footprints based on user-specified context lengths.
+- **Dynamic VRAM Estimation**: Extracts underlying model architecture (layers, heads, dimensions) to calculate exact VRAM limits, including dynamic KV cache footprints based on user-specified context lengths. Actively warns users if requested context exceeds the model's native limit.
 - **Precise Block Quantization**: Factors in exact byte-scaling coefficients for GGUF formats (e.g., Q8, Q6, Q4) rather than naive averages, eliminating VRAM under-reporting.
 - **Secure Pickling**: Inspects legacy `.pt` files without executing arbitrary code by using a highly restricted `pickle.Unpickler`.
 - **Terminal UI**: Groups repetitive structural layers and color-codes VRAM heatmaps using `rich`.

{modelinfo_cli-1.0.0 → modelinfo_cli-1.1.0}/README.md RENAMED Viewed

@@ -10,9 +10,9 @@ It reads binary headers directly using the Python standard library. By bypassing
 ## Features
-- **Zero-Dependency Parsing**: Reads the 8-byte JSON prefix of `.safetensors` files and the binary key-value metadata of `.gguf` directly via `struct` and `json`.
+- **Zero-Dependency Parsing**: Reads the 8-byte JSON prefix of `.safetensors` files and the binary key-value metadata of `.gguf` directly via `struct` and `json`. Seamlessly reads adjacent `config.json` for robust fallback logic.
 - **Sharded Model Support**: Transparently parses `model.safetensors.index.json` to detect multi-file checkpoint distributions, gracefully guarding against partial downloads without crashing.
-- **Dynamic VRAM Estimation**: Extracts underlying model architecture (layers, heads, dimensions) to calculate exact VRAM limits, including dynamic KV cache footprints based on user-specified context lengths.
+- **Dynamic VRAM Estimation**: Extracts underlying model architecture (layers, heads, dimensions) to calculate exact VRAM limits, including dynamic KV cache footprints based on user-specified context lengths. Actively warns users if requested context exceeds the model's native limit.
 - **Precise Block Quantization**: Factors in exact byte-scaling coefficients for GGUF formats (e.g., Q8, Q6, Q4) rather than naive averages, eliminating VRAM under-reporting.
 - **Secure Pickling**: Inspects legacy `.pt` files without executing arbitrary code by using a highly restricted `pickle.Unpickler`.
 - **Terminal UI**: Groups repetitive structural layers and color-codes VRAM heatmaps using `rich`.

{modelinfo_cli-1.0.0 → modelinfo_cli-1.1.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "modelinfo-cli"
-version = "1.0.0"
+version = "1.1.0"
 description = "A sub-100ms, zero-dependency CLI tool to inspect ML model checkpoints and dynamically calculate VRAM requirements."
 readme = "README.md"
 requires-python = ">=3.10"

modelinfo_cli-1.1.0/src/modelinfo/architecture.py ADDED Viewed

@@ -0,0 +1,109 @@
+import os
+import json
+from typing import Any, Dict, Tuple
+def extract_architecture(tensors: Dict[str, Any], config: Dict[str, Any] = None) -> Tuple[int, int, bool]:
+    """
+    Extracts the number of layers and KV cache dimension (kv_heads * head_dim).
+    Returns (num_layers, kv_dim, is_estimate).
+    """
+    num_layers = 0
+    kv_dim = 0
+    is_estimate = False
+    metadata = tensors.get("__metadata__", {})
+    gen_arch = metadata.get("general.architecture")
+    # 1. Attempt explicit GGUF metadata
+    if gen_arch:
+        arch_str = str(gen_arch)
+        num_layers = metadata.get(f"{arch_str}.block_count", 0)
+        kv_heads = metadata.get(f"{arch_str}.attention.head_count_kv", 0)
+        key_length = metadata.get(f"{arch_str}.attention.key_length")
+        if not key_length:
+            embed_len = metadata.get(f"{arch_str}.embedding_length", 0)
+            q_heads = metadata.get(f"{arch_str}.attention.head_count", 1)
+            if q_heads > 0:
+                key_length = embed_len // q_heads
+            else:
+                key_length = 0
+        if kv_heads > 0 and key_length > 0:
+            kv_dim = kv_heads * key_length
+            if num_layers > 0:
+                return num_layers, kv_dim, False
+    # 2. Attempt explicit SafeTensors config.json
+    if config:
+        num_layers = config.get("num_hidden_layers", 0)
+        num_attention_heads = config.get("num_attention_heads", 1)
+        num_key_value_heads = config.get("num_key_value_heads", num_attention_heads)
+        hidden_size = config.get("hidden_size", 0)
+        if num_attention_heads > 0:
+            head_dim = hidden_size // num_attention_heads
+            kv_dim = num_key_value_heads * head_dim
+            if num_layers > 0 and kv_dim > 0:
+                return num_layers, kv_dim, False
+    # 3. Fallback to shape guessing
+    layers_set = set()
+    found_fused = False
+    found_k_proj = False
+    for name, meta in tensors.items():
+        if name == "__metadata__":
+            continue
+        parts = name.split(".")
+        if "layers" in parts:
+            idx = parts.index("layers")
+            if len(parts) > idx + 1 and parts[idx+1].isdigit():
+                layers_set.add(int(parts[idx+1]))
+        elif "h" in parts:
+            idx = parts.index("h")
+            if len(parts) > idx + 1 and parts[idx+1].isdigit():
+                layers_set.add(int(parts[idx+1]))
+        if name.endswith("k_proj.weight") or name.endswith("attn.k.weight") or name.endswith("k_proj.w"):
+            found_k_proj = True
+            shape = meta.get("shape", [])
+            if len(shape) >= 2:
+                kv_dim = shape[0]
+        if "qkv_proj.weight" in name or "c_attn.weight" in name:
+            found_fused = True
+            if not found_k_proj:
+                shape = meta.get("shape", [])
+                if len(shape) >= 2:
+                    kv_dim = shape[0] // 3
+    num_layers = len(layers_set)
+    if found_fused and not found_k_proj and kv_dim > 0:
+        is_estimate = True
+    return num_layers, kv_dim, is_estimate
+def identify_architecture_name(tensors: Dict[str, Any], num_layers: int) -> str:
+    """Attempt to identify the architecture family based on tensor names or metadata."""
+    metadata = tensors.get("__metadata__", {})
+    gen_arch = metadata.get("general.architecture")
+    if gen_arch:
+        arch_title = str(gen_arch).title()
+        return f"{arch_title} ({num_layers} transformer layers)" if num_layers else arch_title
+    for name in tensors.keys():
+        if name == "__metadata__":
+            continue
+        name_lower = name.lower()
+        if "llama" in name_lower:
+            return f"Llama ({num_layers} transformer layers)" if num_layers else "Llama"
+        if "mistral" in name_lower:
+            return f"Mistral ({num_layers} transformer layers)" if num_layers else "Mistral"
+        if "qwen" in name_lower:
+            return f"Qwen ({num_layers} transformer layers)" if num_layers else "Qwen"
+    return f"Generic Transformer ({num_layers} layers)" if num_layers > 0 else "Unknown Architecture"

{modelinfo_cli-1.0.0 → modelinfo_cli-1.1.0}/src/modelinfo/calculator.py RENAMED Viewed

@@ -29,7 +29,7 @@ def _get_bytes_per_param(dtype: str) -> float:
     """Return the size in bytes for a given data type."""
     return DTYPE_BYTES.get(dtype.upper(), 2.0)
-def calculate_footprint(tensors: Dict[str, Any], context_length: int = 0, batch_size: int = 1) -> Dict[str, Any]:
+def calculate_footprint(tensors: Dict[str, Any], context_length: int = 0, batch_size: int = 1, config: Dict[str, Any] = None) -> Dict[str, Any]:
     """
     Calculate the memory footprint of a model based on its tensors and context length.
     """
@@ -54,7 +54,7 @@ def calculate_footprint(tensors: Dict[str, Any], context_length: int = 0, batch_
         bytes_per_param = _get_bytes_per_param(dtype)
         base_memory_bytes += param_count * bytes_per_param
-    num_layers, kv_dim = extract_architecture(tensors)
+    num_layers, kv_dim, is_estimate = extract_architecture(tensors, config)
     # Formula: 2 * Layers * (KV_Heads * Head_Dim) * Context_Length * Batch_Size * Bytes_per_param
     # Assume FP16 (2 bytes) for KV cache
@@ -69,7 +69,8 @@ def calculate_footprint(tensors: Dict[str, Any], context_length: int = 0, batch_
         "total_memory_bytes": base_memory_bytes + kv_cache_bytes,
         "num_layers": num_layers,
         "kv_dim": kv_dim,
-        "primary_dtype": primary_dtype
+        "primary_dtype": primary_dtype,
+        "kv_is_estimate": is_estimate
     }
 def format_bytes(size_bytes: float) -> str:

{modelinfo_cli-1.0.0 → modelinfo_cli-1.1.0}/src/modelinfo/cli.py RENAMED Viewed

@@ -1,4 +1,5 @@
 import argparse
+import json
 import os
 import sys
 from typing import Sequence
@@ -37,10 +38,21 @@ def main(argv: Sequence[str] | None = None) -> int:
     file_path = args.file.lower()
     tensors = {}
+    config = None
     if file_path.endswith(".safetensors") or file_path.endswith(".index.json"):
         tensors = parse_safetensors_header(args.file)
         format_name = "SafeTensors"
+        # Read config.json to maintain pure math engines
+        config_path = os.path.join(os.path.dirname(args.file), "config.json")
+        if os.path.exists(config_path):
+            try:
+                with open(config_path, "r", encoding="utf-8") as f:
+                    config = json.load(f)
+            except (json.JSONDecodeError, OSError):
+                pass
     elif file_path.endswith(".gguf"):
         tensors = parse_gguf_header(args.file)
         format_name = "GGUF"
@@ -53,10 +65,19 @@ def main(argv: Sequence[str] | None = None) -> int:
         )
         return 1
-    footprint = calculate_footprint(tensors, context_length=args.context)
+    footprint = calculate_footprint(tensors, context_length=args.context, config=config)
     num_layers = footprint["num_layers"]
     arch_name = identify_architecture_name(tensors, num_layers)
+    max_context = None
+    if config:
+        max_context = config.get("max_position_embeddings")
+    else:
+        metadata = tensors.get("__metadata__", {})
+        gen_arch = metadata.get("general.architecture")
+        if gen_arch:
+            max_context = metadata.get(f"{gen_arch}.context_length")
     disk_size = os.path.getsize(args.file) if os.path.exists(args.file) else 0.0
     tensor_count = len([k for k in tensors.keys() if k != "__metadata__"])
@@ -67,7 +88,8 @@ def main(argv: Sequence[str] | None = None) -> int:
         footprint=footprint,
         disk_size=disk_size,
         context_length=args.context,
-        tensors=tensors
+        tensors=tensors,
+        max_context=max_context
     )
     return 0

{modelinfo_cli-1.0.0 → modelinfo_cli-1.1.0}/src/modelinfo/ui.py RENAMED Viewed

@@ -43,7 +43,8 @@ def print_model_info(
     footprint: Dict[str, Any],
     disk_size: float,
     context_length: int,
-    tensors: Dict[str, Any]
+    tensors: Dict[str, Any],
+    max_context: int | None = None
 ) -> None:
     summary = Table(box=None, show_header=False, pad_edge=False, padding=(0, 2))
     summary.add_column("Property", style="bold")
@@ -65,7 +66,10 @@ def print_model_info(
         vram_text = f"~{format_bytes(vram_bytes)}"
         if context_length > 0:
-            vram_text += f" ({footprint['primary_dtype']}, KV cache for {context_length} tokens)"
+            if footprint.get("kv_is_estimate"):
+                vram_text += f" ({footprint['primary_dtype']}, Estimated KV Cache - Missing Config)"
+            else:
+                vram_text += f" ({footprint['primary_dtype']}, KV cache for {context_length} tokens)"
         else:
             vram_text += f" ({footprint['primary_dtype']}, no KV cache)"
         vram_display = f"[{vram_color}]{vram_text}[/{vram_color}]"
@@ -81,8 +85,11 @@ def print_model_info(
     console.print(summary)
     if missing_shards > 0:
-        console.print(f"[bold yellow]⚠️ Partial Model: Missing {missing_shards} of {total_shards} shards on disk. Totals are incomplete.[/bold yellow]")
+        console.print(f"[bold yellow]WARNING: Partial Model. Missing {missing_shards} of {total_shards} shards on disk. Totals are incomplete.[/bold yellow]")
+    if context_length > 0 and max_context is not None and context_length > max_context:
+        console.print(f"[bold yellow]WARNING: Requested context ({context_length:,}) exceeds model's native limit ({max_context:,}).[/bold yellow]")
     console.print()
     console.print("Top Tensors by Size:", style="bold")
@@ -90,7 +97,7 @@ def print_model_info(
     grouped_tensors = group_tensors_by_size(tensors)
     tensor_table = Table(box=None, show_header=False, pad_edge=False, padding=(0, 2))
-    tensor_table.add_column("Name")
+    tensor_table.add_column("Name", no_wrap=True, overflow="fold")
     tensor_table.add_column("Shape", justify="right")
     tensor_table.add_column("Dtype", justify="left")
     tensor_table.add_column("Params", justify="right")

{modelinfo_cli-1.0.0 → modelinfo_cli-1.1.0/src/modelinfo_cli.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: modelinfo-cli
-Version: 1.0.0
+Version: 1.1.0
 Summary: A sub-100ms, zero-dependency CLI tool to inspect ML model checkpoints and dynamically calculate VRAM requirements.
 Author: ModelInfo Contributors
 License: MIT
@@ -28,9 +28,9 @@ It reads binary headers directly using the Python standard library. By bypassing
 ## Features
-- **Zero-Dependency Parsing**: Reads the 8-byte JSON prefix of `.safetensors` files and the binary key-value metadata of `.gguf` directly via `struct` and `json`.
+- **Zero-Dependency Parsing**: Reads the 8-byte JSON prefix of `.safetensors` files and the binary key-value metadata of `.gguf` directly via `struct` and `json`. Seamlessly reads adjacent `config.json` for robust fallback logic.
 - **Sharded Model Support**: Transparently parses `model.safetensors.index.json` to detect multi-file checkpoint distributions, gracefully guarding against partial downloads without crashing.
-- **Dynamic VRAM Estimation**: Extracts underlying model architecture (layers, heads, dimensions) to calculate exact VRAM limits, including dynamic KV cache footprints based on user-specified context lengths.
+- **Dynamic VRAM Estimation**: Extracts underlying model architecture (layers, heads, dimensions) to calculate exact VRAM limits, including dynamic KV cache footprints based on user-specified context lengths. Actively warns users if requested context exceeds the model's native limit.
 - **Precise Block Quantization**: Factors in exact byte-scaling coefficients for GGUF formats (e.g., Q8, Q6, Q4) rather than naive averages, eliminating VRAM under-reporting.
 - **Secure Pickling**: Inspects legacy `.pt` files without executing arbitrary code by using a highly restricted `pickle.Unpickler`.
 - **Terminal UI**: Groups repetitive structural layers and color-codes VRAM heatmaps using `rich`.

{modelinfo_cli-1.0.0 → modelinfo_cli-1.1.0}/tests/test_calculator.py RENAMED Viewed

@@ -63,3 +63,39 @@ def test_dynamic_kv_cache():
     assert footprint["num_layers"] == 2
     assert footprint["kv_dim"] == 1024
     assert footprint["kv_cache_bytes"] == 8192000
+def test_safetensors_config_fallback():
+    """Verify that architecture extraction correctly parses a config dictionary for SafeTensors."""
+    from modelinfo.architecture import extract_architecture
+    tensors = {
+        "model.layers.0.qkv_proj.weight": {
+            "shape": [6144, 4096],
+            "dtype": "F16"
+        }
+    }
+    config = {
+        "num_hidden_layers": 32,
+        "num_attention_heads": 32,
+        "num_key_value_heads": 8,
+        "hidden_size": 4096
+    }
+    num_layers, kv_dim, is_estimate = extract_architecture(tensors, config=config)
+    assert num_layers == 32
+    assert kv_dim == 1024
+    assert is_estimate is False
+def test_kv_cache_is_fp16():
+    """Verify that KV cache is always calculated using 2.0 bytes (FP16), even for Q4 base models."""
+    tensors = {
+        "model.layers.0.attn.weight": {"shape": [4096, 4096], "dtype": "Q4"},
+        "model.layers.0.attn.k.weight": {"shape": [1024, 4096], "dtype": "Q4"},
+    }
+    footprint = calculate_footprint(tensors, context_length=8192)
+    assert footprint["kv_cache_bytes"] == 33554432
+    assert footprint["primary_dtype"] == "Q4"

{modelinfo_cli-1.0.0 → modelinfo_cli-1.1.0}/tests/test_parsers.py RENAMED Viewed

@@ -30,3 +30,18 @@ def test_missing_shard_handling():
     # it fails safely when a file truly doesn't exist.
     with pytest.raises(FileNotFoundError):
         parse_safetensors_header(os.path.join(FIXTURES_DIR, "does_not_exist.safetensors"))
+def test_gguf_parser_metadata():
+    """Verify that the GGUF parser extracts the global metadata bypass."""
+    from modelinfo.parsers.gguf import parse_gguf_header
+    from modelinfo.architecture import identify_architecture_name
+    mock_path = os.path.join(FIXTURES_DIR, "mock_model.gguf")
+    tensors = parse_gguf_header(mock_path)
+    assert "__metadata__" in tensors
+    assert tensors["__metadata__"]["general.architecture"] == "qwen2"
+    # Verify the architecture bypass parses it to titlecase and prevents "Unknown Architecture"
+    arch_name = identify_architecture_name(tensors, num_layers=1)
+    assert arch_name == "Qwen2 (1 transformer layers)"

modelinfo_cli-1.0.0/src/modelinfo/architecture.py DELETED Viewed

@@ -1,45 +0,0 @@
-from typing import Any, Dict, Tuple
-def extract_architecture(tensors: Dict[str, Any]) -> Tuple[int, int]:
-    """
-    Extracts the number of layers and KV cache dimension (kv_heads * head_dim)
-    from tensor metadata.
-    """
-    layers = set()
-    kv_dim = 0
-    for name, metadata in tensors.items():
-        if name == "__metadata__":
-            continue
-        parts = name.split(".")
-        if "layers" in parts:
-            idx = parts.index("layers")
-            if len(parts) > idx + 1 and parts[idx+1].isdigit():
-                layers.add(int(parts[idx+1]))
-        elif "h" in parts:
-            idx = parts.index("h")
-            if len(parts) > idx + 1 and parts[idx+1].isdigit():
-                layers.add(int(parts[idx+1]))
-        if name.endswith("k_proj.weight") or name.endswith("attn.k.weight") or name.endswith("k_proj.w"):
-            shape = metadata.get("shape", [])
-            if len(shape) >= 2:
-                # Typically [out_features, in_features], so out_features is shape[0]
-                kv_dim = shape[0]
-    return len(layers), kv_dim
-def identify_architecture_name(tensors: Dict[str, Any], num_layers: int) -> str:
-    """Attempt to identify the architecture family based on tensor names."""
-    for name in tensors.keys():
-        name_lower = name.lower()
-        if "llama" in name_lower:
-            return f"Llama ({num_layers} transformer layers)" if num_layers else "Llama"
-        if "mistral" in name_lower:
-            return f"Mistral ({num_layers} transformer layers)" if num_layers else "Mistral"
-        if "qwen" in name_lower:
-            return f"Qwen ({num_layers} transformer layers)" if num_layers else "Qwen"
-    return f"Generic Transformer ({num_layers} layers)" if num_layers > 0 else "Unknown Architecture"