PyPI - mod-trace - Versions diffs - 0.3.2__tar.gz → 0.4.0__tar.gz - Mend

mod-trace 0.3.2 (tar.gz) → 0.4.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

{mod_trace-0.3.2 → mod_trace-0.4.0}/Cargo.lock RENAMED Viewed

@@ -16,7 +16,7 @@ checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
 [[package]]
 name = "mod-trace"
-version = "0.3.2"
+version = "0.4.0"
 dependencies = [
  "serde",
  "serde_json",

{mod_trace-0.3.2 → mod_trace-0.4.0}/Cargo.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "mod-trace"
-version = "0.3.2"
+version = "0.4.0"
 edition = "2024"
 description = "Rust CLI for inspecting ML model artifacts without loading the framework"
 license = "MIT"

{mod_trace-0.3.2 → mod_trace-0.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mod-trace
-Version: 0.3.2
+Version: 0.4.0
 Classifier: Programming Language :: Rust
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3 :: Only
@@ -25,7 +25,7 @@ mod-trace is a small Rust CLI for answering a practical question:
 What is inside this model file?
 ```
-It can inspect real artifacts such as CatBoost `.cbm` files, LightGBM `.txt`/`.lgb` text models, and ONNX `.onnx` graphs, then report structure, size, parameters, operator mix, rough inference cost, and changes between versions. CatBoost, LightGBM, and ONNX are all read natively — no Python, framework, or runtime needed (CatBoost `--deep` is the one optional exception).
+It can inspect real artifacts such as CatBoost `.cbm` files, LightGBM `.txt`/`.lgb` text models, ONNX `.onnx` graphs, and PyTorch `.pt`/`.pth` checkpoints, then report structure, size, parameters, operator mix, rough inference cost, and changes between versions. All formats are read natively — no Python, framework, or runtime needed (CatBoost `--deep` is the one optional exception). The PyTorch reader is static: it sizes/names tensors and fingerprints weights without decoding exact shapes.
 The most useful command is `explain-diff`, which says in plain English what changed between two model versions:

{mod_trace-0.3.2 → mod_trace-0.4.0}/README.md RENAMED Viewed

@@ -8,7 +8,7 @@ mod-trace is a small Rust CLI for answering a practical question:
 What is inside this model file?
 ```
-It can inspect real artifacts such as CatBoost `.cbm` files, LightGBM `.txt`/`.lgb` text models, and ONNX `.onnx` graphs, then report structure, size, parameters, operator mix, rough inference cost, and changes between versions. CatBoost, LightGBM, and ONNX are all read natively — no Python, framework, or runtime needed (CatBoost `--deep` is the one optional exception).
+It can inspect real artifacts such as CatBoost `.cbm` files, LightGBM `.txt`/`.lgb` text models, ONNX `.onnx` graphs, and PyTorch `.pt`/`.pth` checkpoints, then report structure, size, parameters, operator mix, rough inference cost, and changes between versions. All formats are read natively — no Python, framework, or runtime needed (CatBoost `--deep` is the one optional exception). The PyTorch reader is static: it sizes/names tensors and fingerprints weights without decoding exact shapes.
 The most useful command is `explain-diff`, which says in plain English what changed between two model versions:

mod_trace-0.4.0/examples/pytorch/README.md ADDED Viewed

@@ -0,0 +1,35 @@
+# PyTorch example models
+Synthetic `torch.save` artifacts for trying `mod-trace` on PyTorch with **no
+torch and no Python** — mod-trace reads the `.pt` zip (pickled structure + raw
+tensor storages) statically.
+| Files | What they show |
+|-------|----------------|
+| `mlp_v1.pt` vs `mlp_v2.pt` | Same 2-layer MLP, hidden size 32 → 64 (parameter count ~doubles, same layer names). |
+## Try it
+```bash
+mod-trace inspect      examples/pytorch/mlp_v1.pt
+mod-trace explain-diff examples/pytorch/mlp_v1.pt examples/pytorch/mlp_v2.pt
+mod-trace check --max-parameter-growth 30% examples/pytorch/mlp_v1.pt examples/pytorch/mlp_v2.pt
+mod-trace inspect --json examples/pytorch/mlp_v1.pt
+```
+## What it reads (and what it doesn't)
+Reads, statically: file size, tensor/storage count, **estimated parameter count**
+(from storage bytes ÷ dtype), **dominant dtype**, **recovered parameter/layer
+names** (`fc1.weight`, …), and fingerprints (a sampled weight fingerprint that
+changes on a retrain/finetune).
+Does **not** decode exact per-tensor shapes — that would need a full pickle
+interpreter. Same static/heuristic philosophy as the CatBoost and ONNX readers.
+## Regenerate
+```bash
+python -m pip install torch
+python examples/pytorch/generate_demo_models.py
+```

mod_trace-0.4.0/examples/pytorch/generate_demo_models.py ADDED Viewed

@@ -0,0 +1,35 @@
+"""Generate the synthetic PyTorch demo models used by the README examples.
+Fully synthetic (no real data). Run:
+    python -m pip install torch
+    python examples/pytorch/generate_demo_models.py
+Produces, in this directory:
+  mlp_v1.pt / mlp_v2.pt  -> same 2-layer MLP, different hidden size (32 vs 64)
+"""
+import os
+import torch
+import torch.nn as nn
+HERE = os.path.dirname(os.path.abspath(__file__))
+class Net(nn.Module):
+    def __init__(self, hidden):
+        super().__init__()
+        self.fc1 = nn.Linear(16, hidden)
+        self.fc2 = nn.Linear(hidden, 4)
+    def forward(self, x):
+        return self.fc2(torch.relu(self.fc1(x)))
+if __name__ == "__main__":
+    # Re-seed the global torch RNG before building each model so every run
+    # starts from the same seed; v1 and v2 deliberately use different seeds.
+    torch.manual_seed(0)
+    # Only the state_dict is saved (not the module object), next to this script.
+    torch.save(Net(32).state_dict(), os.path.join(HERE, "mlp_v1.pt"))
+    torch.manual_seed(1)
+    # Same layer names as v1, but hidden width 64 instead of 32.
+    torch.save(Net(64).state_dict(), os.path.join(HERE, "mlp_v2.pt"))
+    print("wrote mlp_v1.pt and mlp_v2.pt")

mod_trace-0.4.0/examples/pytorch/mlp_v1.pt ADDED Viewed

Binary file

mod_trace-0.4.0/examples/pytorch/mlp_v2.pt ADDED Viewed

Binary file

{mod_trace-0.3.2 → mod_trace-0.4.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "maturin"
 [project]
 name = "mod-trace"
-version = "0.3.2"
+version = "0.4.0"
 description = "Rust CLI for inspecting ML model artifacts without loading the framework"
 readme = "README.md"
 requires-python = ">=3.9"

{mod_trace-0.3.2 → mod_trace-0.4.0}/src/main.rs RENAMED Viewed

@@ -4,6 +4,7 @@ mod explain;
 mod lgbm;
 mod model;
 mod onnx;
+mod pt;
 mod tensor;
 use std::collections::BTreeSet;
@@ -54,6 +55,7 @@ fn run() -> Result<(), String> {
         Some("onnx") => onnx_cmd(&args.rest),
         Some("catboost") | Some("cbm") => catboost_cmd(&args.rest),
         Some("lightgbm") | Some("lgbm") => lgbm_cmd(&args.rest),
+        Some("pytorch") | Some("pt") => pt_cmd(&args.rest),
         Some("validate") => validate_model_cmd(&args.rest),
         Some("tensor-inspect") => inspect_model_cmd(&args.rest),
         Some("run") => run_model_cmd(&args.rest),
@@ -93,6 +95,7 @@ struct BuiltInDoctorReport {
     catboost_metadata: bool,
     lightgbm_text: bool,
     onnx_static_graph: bool,
+    pytorch_zip: bool,
     json_tensor_plans: bool,
 }
@@ -125,6 +128,7 @@ fn doctor_report() -> DoctorReport {
             catboost_metadata: true,
             lightgbm_text: true,
             onnx_static_graph: true,
+            pytorch_zip: true,
             json_tensor_plans: true,
         },
         optional_python: PythonDoctorReport {
@@ -168,6 +172,10 @@ fn print_doctor_report(report: &DoctorReport) {
         "  ONNX static graph: {}",
         availability(report.built_in.onnx_static_graph)
     );
+    println!(
+        "  PyTorch .pt/.pth (zip): {}",
+        availability(report.built_in.pytorch_zip)
+    );
     println!(
         "  JSON tensor plans: {}",
         availability(report.built_in.json_tensor_plans)
@@ -192,15 +200,15 @@ fn print_doctor_report(report: &DoctorReport) {
     println!();
     println!("Available commands:");
     println!(
-        "  inspect .cbm/.lgb/.onnx/.json: {}",
+        "  inspect .cbm/.lgb/.onnx/.pt/.json: {}",
         available_unavailable(report.commands.inspect_artifacts)
     );
     println!(
-        "  diff .cbm/.lgb/.onnx: {}",
+        "  diff .cbm/.lgb/.onnx/.pt: {}",
         available_unavailable(report.commands.diff_artifacts)
     );
     println!(
-        "  explain-diff .cbm/.lgb/.onnx: {}",
+        "  explain-diff .cbm/.lgb/.onnx/.pt: {}",
         available_unavailable(report.commands.diff_artifacts)
     );
     println!(
@@ -301,6 +309,10 @@ fn inspect_cmd(args: &[String]) -> Result<(), String> {
             Err("--deep inspection is currently only supported for CatBoost artifacts.".to_string())
         }
         ArtifactKind::Onnx => onnx_cmd(args),
+        ArtifactKind::PyTorch if deep => {
+            Err("--deep inspection is currently only supported for CatBoost artifacts.".to_string())
+        }
+        ArtifactKind::PyTorch => pt_cmd(args),
         ArtifactKind::Json if deep => {
             Err("--deep inspection is currently only supported for CatBoost artifacts.".to_string())
         }
@@ -312,7 +324,7 @@ fn inspect_cmd(args: &[String]) -> Result<(), String> {
         }
         ArtifactKind::Json => inspect_model_cmd(args),
         ArtifactKind::Unknown => Err(format!(
-            "unsupported artifact type for `{}`. Try .cbm, .lgb, .onnx, or .json.",
+            "unsupported artifact type for `{}`. Try .cbm, .lgb, .onnx, .pt/.pth, or .json.",
             target
         )),
     }
@@ -378,6 +390,13 @@ fn diff_cmd(args: &[String]) -> Result<(), String> {
             }
             diff_onnx(&paths[0], &paths[1], json)
         }
+        (ArtifactKind::PyTorch, ArtifactKind::PyTorch) => {
+            if deep {
+                println!("Note: --deep is currently only used for CatBoost artifacts.");
+                println!();
+            }
+            diff_pt(&paths[0], &paths[1], json)
+        }
         (ArtifactKind::Json, ArtifactKind::Json) => Err(
             "tensor plan diff is not supported yet. Use trace, compare, why, or validate instead."
                 .to_string(),
@@ -462,8 +481,9 @@ fn check_cmd(args: &[String]) -> Result<(), String> {
             check_lgbm(&paths[0], &paths[1], &options)
         }
         (ArtifactKind::Onnx, ArtifactKind::Onnx) => check_onnx(&paths[0], &paths[1], &options),
+        (ArtifactKind::PyTorch, ArtifactKind::PyTorch) => check_pt(&paths[0], &paths[1], &options),
         (ArtifactKind::Json, ArtifactKind::Json) => {
-            Err("check supports CatBoost, LightGBM, and ONNX artifacts, not tensor plan JSON."
+            Err("check supports CatBoost, LightGBM, ONNX, and PyTorch artifacts, not tensor plan JSON."
                 .to_string())
         }
         (left, right) => Err(format!(
@@ -715,6 +735,7 @@ fn explain_cmd(args: &[String]) -> Result<(), String> {
             ArtifactKind::Onnx => return explain_onnx_cmd(target),
             ArtifactKind::CatBoost => return explain_catboost_cmd(target),
             ArtifactKind::LightGbm => return lgbm_cmd(&[target.to_string()]),
+            ArtifactKind::PyTorch => return pt_cmd(&[target.to_string()]),
             ArtifactKind::Json => return explain_model_cmd(&[target.to_string()]),
             ArtifactKind::Unknown => {}
         }
@@ -1013,8 +1034,9 @@ fn explain_diff_cmd(args: &[String]) -> Result<(), String> {
         (ArtifactKind::Onnx, ArtifactKind::Onnx) => explain_diff_onnx(old, new),
         (ArtifactKind::LightGbm, ArtifactKind::LightGbm) => explain_diff_lgbm(old, new),
         (ArtifactKind::CatBoost, ArtifactKind::CatBoost) => explain_diff_catboost(old, new),
+        (ArtifactKind::PyTorch, ArtifactKind::PyTorch) => explain_diff_pt(old, new),
         (left, right) => Err(format!(
-            "explain-diff needs two artifacts of the same supported type (.onnx, .cbm, .lgb): {} vs {}",
+            "explain-diff needs two artifacts of the same supported type (.onnx, .cbm, .lgb, .pt): {} vs {}",
             left.label(),
             right.label()
         )),
@@ -1326,6 +1348,19 @@ fn explain_diff_catboost(old_path: &str, new_path: &str) -> Result<(), String> {
     let old = cbm::inspect(old_path)?;
     let new = cbm::inspect(new_path)?;
+    let old_features = old.feature_candidates.iter().collect::<BTreeSet<_>>();
+    let new_features = new.feature_candidates.iter().collect::<BTreeSet<_>>();
+    let added = new_features
+        .difference(&old_features)
+        .map(|name| name.as_str())
+        .collect::<Vec<_>>();
+    let removed = old_features
+        .difference(&new_features)
+        .map(|name| name.as_str())
+        .collect::<Vec<_>>();
+    let names_known = !old.feature_candidates.is_empty() || !new.feature_candidates.is_empty();
+    let config_same = catboost_training_config_same(&old, &new);
     println!("Model Change Explanation");
     println!("------------------------");
     println!("Type: CatBoost");
@@ -1334,21 +1369,68 @@ fn explain_diff_catboost(old_path: &str, new_path: &str) -> Result<(), String> {
     println!();
     println!("Architecture:");
     println!(
-        "  Trees:             {} -> {}",
+        "  Trees:             {} -> {} ({})",
         opt_num(old.iterations.map(|value| value as usize)),
-        opt_num(new.iterations.map(|value| value as usize))
+        opt_num(new.iterations.map(|value| value as usize)),
+        match (old.iterations, new.iterations) {
+            (Some(o), Some(n)) => growth_label(o as usize, n as usize),
+            _ => "unknown".to_string(),
+        }
     );
     println!(
-        "  Depth:             {} -> {}",
+        "  Depth:             {} -> {} ({})",
         opt_num(old.depth.map(|value| value as usize)),
-        opt_num(new.depth.map(|value| value as usize))
+        opt_num(new.depth.map(|value| value as usize)),
+        same_or_changed(old.depth == new.depth)
     );
+    let feature_note = if !names_known {
+        "names not recovered".to_string()
+    } else if added.is_empty() && removed.is_empty() {
+        "same set".to_string()
+    } else {
+        format!("+{} added, -{} removed", added.len(), removed.len())
+    };
     println!(
-        "  Features (recovered): {} -> {}",
+        "  Features (recovered): {} -> {} ({})",
         old.feature_candidates.len(),
-        new.feature_candidates.len()
+        new.feature_candidates.len(),
+        feature_note
     );
+    print_lgbm_feature_list("    added:  ", &added, 8);
+    print_lgbm_feature_list("    removed:", &removed, 8);
     println!();
+    println!("Training config:");
+    println!(
+        "  Loss:              {} -> {} ({})",
+        old.loss_function.as_deref().unwrap_or("unknown"),
+        new.loss_function.as_deref().unwrap_or("unknown"),
+        same_or_changed(old.loss_function == new.loss_function)
+    );
+    println!(
+        "  Eval metric:       {} -> {} ({})",
+        old.eval_metric.as_deref().unwrap_or("unknown"),
+        new.eval_metric.as_deref().unwrap_or("unknown"),
+        same_or_changed(old.eval_metric == new.eval_metric)
+    );
+    println!(
+        "  Learning rate:     {} -> {} ({})",
+        opt_float(old.learning_rate),
+        opt_float(new.learning_rate),
+        same_or_changed(old.learning_rate == new.learning_rate)
+    );
+    println!(
+        "  Grow policy:       {} -> {} ({})",
+        old.grow_policy.as_deref().unwrap_or("unknown"),
+        new.grow_policy.as_deref().unwrap_or("unknown"),
+        same_or_changed(old.grow_policy == new.grow_policy)
+    );
+    println!();
+    println!(
+        "File size: {} -> {} ({})",
+        format_bytes(old.bytes),
+        format_bytes(new.bytes),
+        growth_label(old.bytes, new.bytes)
+    );
     match (old.estimated_leaf_values(), new.estimated_leaf_values()) {
         (Some(o), Some(n)) => println!(
             "Estimated leaf-slot growth: {}",
@@ -1363,7 +1445,20 @@ fn explain_diff_catboost(old_path: &str, new_path: &str) -> Result<(), String> {
         println!("Learned state: unchanged");
     }
     println!();
-    println!("Note: CatBoost internals are summarized; run `diff --deep` for exact split/leaf changes.");
+    println!("Summary:");
+    let features_same = added.is_empty() && removed.is_empty();
+    let descriptor = if !features_same {
+        "feature set changed"
+    } else if !config_same {
+        "training config changed"
+    } else if old.iterations != new.iterations || old.depth != new.depth {
+        "same spec, retrained with different tree count/depth"
+    } else {
+        "same spec and features, retrained"
+    };
+    println!("  {descriptor}.");
+    println!();
+    println!("Note: heuristic summary; run `diff --deep` for exact split/leaf changes.");
     Ok(())
 }
@@ -1692,6 +1787,7 @@ enum ArtifactKind {
     CatBoost,
     LightGbm,
     Onnx,
+    PyTorch,
     Json,
     Unknown,
 }
@@ -1702,6 +1798,7 @@ impl ArtifactKind {
             ArtifactKind::CatBoost => "CatBoost",
             ArtifactKind::LightGbm => "LightGBM",
             ArtifactKind::Onnx => "ONNX",
+            ArtifactKind::PyTorch => "PyTorch",
             ArtifactKind::Json => "tensor plan JSON",
             ArtifactKind::Unknown => "unknown",
         }
@@ -1718,6 +1815,7 @@ fn artifact_kind(path: &str) -> ArtifactKind {
         Some("cbm") => return ArtifactKind::CatBoost,
         Some("lgb") => return ArtifactKind::LightGbm,
         Some("onnx") => return ArtifactKind::Onnx,
+        Some("pt") | Some("pth") => return ArtifactKind::PyTorch,
         Some("json") => return ArtifactKind::Json,
         _ => {}
     }
@@ -2989,6 +3087,332 @@ fn check_lgbm(old_path: &str, new_path: &str, options: &CheckOptions) -> Result<
     finish_check(&checks)
 }
+/// Entry point for `mod-trace pytorch` (alias `pt`): inspects one or more `.pt`/`.pth` files.
+///
+/// Flags: `--json` emits the report(s) as JSON (a bare object for a single
+/// path, an array otherwise); `--limit N` caps how many recovered parameter
+/// names the text report lists; `--deep` is rejected. Every other argument is
+/// taken as a model path.
+///
+/// Returns `Err` with the usage string when no path is given, or with the
+/// underlying message when a flag is malformed or `pt::inspect` fails.
+fn pt_cmd(args: &[String]) -> Result<(), String> {
+    const USAGE: &str =
+        "usage: mod-trace pytorch [--json] [--limit 20] <model.pt|model.pth> [more...]";
+    let mut as_json = false;
+    let mut limit = DEFAULT_LIMIT;
+    let mut paths: Vec<String> = Vec::new();
+    let mut remaining = args.iter();
+    while let Some(arg) = remaining.next() {
+        match arg.as_str() {
+            "--json" => as_json = true,
+            "--deep" => {
+                return Err(
+                    "--deep inspection is currently only supported for CatBoost artifacts."
+                        .to_string(),
+                );
+            }
+            "--limit" => {
+                // The flag consumes the argument that follows it.
+                let raw = remaining
+                    .next()
+                    .ok_or_else(|| "--limit needs a number".to_string())?;
+                limit = raw
+                    .parse::<usize>()
+                    .map_err(|err| format!("parse --limit: {err}"))?;
+            }
+            _ => paths.push(arg.clone()),
+        }
+    }
+    // An empty argument list also ends up here, so one check covers both usage errors.
+    if paths.is_empty() {
+        return Err(USAGE.to_string());
+    }
+    if !as_json {
+        // Text mode prints each report as soon as it is read.
+        for (index, path) in paths.iter().enumerate() {
+            if index > 0 {
+                println!();
+            }
+            let report = pt::inspect(path)?;
+            print_pt_report(&report, limit);
+        }
+        return Ok(());
+    }
+    // JSON mode reads everything first so a failure prints no partial output.
+    let reports = paths
+        .iter()
+        .map(|path| pt::inspect(path))
+        .collect::<Result<Vec<_>, _>>()?;
+    match reports.as_slice() {
+        [only] => print_json(only)?,
+        _ => print_json(&reports)?,
+    }
+    Ok(())
+}
+/// Prints the human-readable summary of one inspected PyTorch artifact to stdout.
+///
+/// At most `limit` recovered parameter names are listed; any remainder is
+/// reported as a count.
+fn print_pt_report(report: &pt::PtReport, limit: usize) {
+    let names = &report.param_names;
+    println!("PyTorch Model Summary");
+    println!("---------------------");
+    println!("Model: {}", report.path);
+    println!("Format: {}", report.format());
+    println!("File size: {}", format_bytes(report.bytes));
+    if let Some(version) = &report.torch_version {
+        println!("Serialization version: {version}");
+    }
+    println!();
+    println!("Structure:");
+    println!("  Tensors (storages): {}", report.tensor_count);
+    let parameters = format_count_human(report.estimated_parameter_count as usize);
+    println!("  Parameters (est):   {}", parameters);
+    let parameter_bytes = format_bytes(report.total_parameter_bytes as usize);
+    println!("  Parameter bytes:    {}", parameter_bytes);
+    print_optional("Dominant dtype", report.dominant_dtype.as_deref());
+    println!();
+    println!("Parameter-like Internals:");
+    println!(
+        "  Full artifact fingerprint: {}",
+        format_hex(report.file_fingerprint)
+    );
+    println!(
+        "  Learned-state fingerprint (sampled): {}",
+        format_hex(report.learned_state_fingerprint)
+    );
+    println!(
+        "  Note: parameter count is estimated from storage bytes / dtype; tensor shapes are not decoded."
+    );
+    println!();
+    println!("Recovered Parameter Names:");
+    if names.is_empty() {
+        println!("  none recovered from the pickle");
+        return;
+    }
+    println!("  found: {}", names.len());
+    names
+        .iter()
+        .take(limit)
+        .for_each(|name| println!("  {name}"));
+    // Names beyond `limit` are only counted, not printed.
+    let hidden = names.len().saturating_sub(limit);
+    if hidden > 0 {
+        println!("  ... {} more", hidden);
+    }
+}
+/// Prints a field-by-field diff of two PyTorch artifacts, as text or (with `json`) as JSON.
+///
+/// The JSON form is an object with `type`, `old` and `new` keys holding the two
+/// reports. Returns `Err` if either file cannot be inspected.
+fn diff_pt(old_path: &str, new_path: &str, json: bool) -> Result<(), String> {
+    let before = pt::inspect(old_path)?;
+    let after = pt::inspect(new_path)?;
+    if json {
+        let payload = serde_json::json!({
+            "type": "pytorch",
+            "old": before,
+            "new": after,
+        });
+        print_json(&payload)?;
+        return Ok(());
+    }
+    for line in ["Model Diff", "----------", "Type: PyTorch"] {
+        println!("{line}");
+    }
+    println!("Old: {}", before.path);
+    println!("New: {}", after.path);
+    println!();
+    println!("Structure:");
+    let (old_params, new_params) = (
+        before.estimated_parameter_count as usize,
+        after.estimated_parameter_count as usize,
+    );
+    let (old_param_bytes, new_param_bytes) = (
+        before.total_parameter_bytes as usize,
+        after.total_parameter_bytes as usize,
+    );
+    print_diff_bytes("File size", before.bytes, after.bytes);
+    print_diff_usize("Tensors (storages)", before.tensor_count, after.tensor_count);
+    print_diff_usize("Parameters (est)", old_params, new_params);
+    print_diff_bytes("Parameter bytes", old_param_bytes, new_param_bytes);
+    print_diff_opt_str(
+        "Dominant dtype",
+        before.dominant_dtype.as_deref(),
+        after.dominant_dtype.as_deref(),
+    );
+    println!();
+    println!("Parameter-like Internals:");
+    print_fingerprint_diff(
+        "Full artifact fingerprint",
+        Some(before.file_fingerprint),
+        Some(after.file_fingerprint),
+    );
+    print_fingerprint_diff(
+        "Learned-state fingerprint (sampled)",
+        Some(before.learned_state_fingerprint),
+        Some(after.learned_state_fingerprint),
+    );
+    println!();
+    println!("Parameter Names:");
+    print_diff_usize(
+        "Recovered names",
+        before.param_names.len(),
+        after.param_names.len(),
+    );
+    print_feature_name_changes(&before.param_names, &after.param_names, 12);
+    Ok(())
+}
+/// Prints a plain-English explanation of what changed between two PyTorch artifacts.
+///
+/// Both files are read with `pt::inspect`. The report compares tensor count,
+/// estimated parameter count, dominant dtype, recovered parameter names, file
+/// size and the sampled learned-state fingerprint, then closes with a one-line
+/// summary. Returns `Err` if either file cannot be inspected.
+fn explain_diff_pt(old_path: &str, new_path: &str) -> Result<(), String> {
+    let old = pt::inspect(old_path)?;
+    let new = pt::inspect(new_path)?;
+    let old_names = old.param_names.iter().collect::<BTreeSet<_>>();
+    let new_names = new.param_names.iter().collect::<BTreeSet<_>>();
+    let added = new_names
+        .difference(&old_names)
+        .map(|name| name.as_str())
+        .collect::<Vec<_>>();
+    let removed = old_names
+        .difference(&new_names)
+        .map(|name| name.as_str())
+        .collect::<Vec<_>>();
+    // Names count as "known" if either side yielded any; if neither did, the
+    // set comparison below carries no information.
+    let names_known = !old.param_names.is_empty() || !new.param_names.is_empty();
+    let names_same = added.is_empty() && removed.is_empty();
+    let weights_changed = old.learned_state_fingerprint != new.learned_state_fingerprint;
+    // Used both in the Architecture section and in the summary line.
+    let param_growth = growth_label(
+        old.estimated_parameter_count as usize,
+        new.estimated_parameter_count as usize,
+    );
+    println!("Model Change Explanation");
+    println!("------------------------");
+    println!("Type: PyTorch");
+    println!("Old: {}", old.path);
+    println!("New: {}", new.path);
+    println!();
+    println!("Architecture:");
+    println!(
+        "  Tensors:           {} -> {} ({})",
+        old.tensor_count,
+        new.tensor_count,
+        growth_label(old.tensor_count, new.tensor_count)
+    );
+    println!(
+        "  Parameters (est):  {} -> {} ({})",
+        format_count_human(old.estimated_parameter_count as usize),
+        format_count_human(new.estimated_parameter_count as usize),
+        param_growth
+    );
+    println!(
+        "  Dominant dtype:    {} -> {} ({})",
+        old.dominant_dtype.as_deref().unwrap_or("unknown"),
+        new.dominant_dtype.as_deref().unwrap_or("unknown"),
+        same_or_changed(old.dominant_dtype == new.dominant_dtype)
+    );
+    let name_note = if !names_known {
+        "names not recovered".to_string()
+    } else if names_same {
+        "same set".to_string()
+    } else {
+        format!("+{} added, -{} removed", added.len(), removed.len())
+    };
+    println!(
+        "  Param names:       {} -> {} ({})",
+        old.param_names.len(),
+        new.param_names.len(),
+        name_note
+    );
+    print_lgbm_feature_list("    added:  ", &added, 8);
+    print_lgbm_feature_list("    removed:", &removed, 8);
+    println!();
+    println!(
+        "File size: {} -> {} ({})",
+        format_bytes(old.bytes),
+        format_bytes(new.bytes),
+        growth_label(old.bytes, new.bytes)
+    );
+    println!();
+    if weights_changed {
+        println!("Learned state: CHANGED (sampled weight bytes differ - a real retrain/finetune)");
+    } else {
+        println!("Learned state: unchanged (sampled weight bytes identical)");
+    }
+    println!();
+    println!("Summary:");
+    let descriptor = if !names_known {
+        "weights compared (parameter names not recovered)"
+    } else if !names_same {
+        "parameter set changed (layers added/removed)"
+    } else if old.estimated_parameter_count != new.estimated_parameter_count {
+        "same layers, parameter count changed (resized)"
+    } else if weights_changed {
+        "same architecture, retrained/finetuned"
+    } else {
+        // Do not claim a retrain when the sampled fingerprint is identical;
+        // that would contradict the "Learned state: unchanged" line above.
+        "same architecture, no weight change detected in the sampled bytes"
+    };
+    println!("  {descriptor}; params {}.", param_growth);
+    println!();
+    println!(
+        "Note: static read of the torch.save zip (no torch); shapes are not decoded and the weight fingerprint is sampled."
+    );
+    Ok(())
+}
+/// Runs the `check` gates that apply to a pair of PyTorch artifacts.
+///
+/// ONNX-only and CatBoost/LightGBM-only options are rejected up front, before
+/// either file is read. Supported gates: file-size growth, estimated-parameter
+/// growth, and (via `--fail-on-feature-change`) equality of the recovered
+/// parameter-name lists. The result is whatever `finish_check` returns for the
+/// collected checks.
+fn check_pt(old_path: &str, new_path: &str, options: &CheckOptions) -> Result<(), String> {
+    // Ordered table: the first option that is set decides the error message.
+    let rejected_flags = [
+        (
+            options.max_ops_growth_pct.is_some(),
+            "--max-ops-growth is only supported for ONNX artifacts.",
+        ),
+        (
+            options.fail_on_new_op,
+            "--fail-on-new-op is only supported for ONNX artifacts.",
+        ),
+        (
+            options.fail_on_training_config_change,
+            "--fail-on-training-config-change is only supported for CatBoost/LightGBM artifacts.",
+        ),
+    ];
+    if let Some((_, message)) = rejected_flags.iter().find(|(is_set, _)| *is_set) {
+        return Err((*message).to_string());
+    }
+    let before = pt::inspect(old_path)?;
+    let after = pt::inspect(new_path)?;
+    let mut checks = Vec::new();
+    if let Some(max_pct) = options.max_size_growth_pct {
+        let size_gate = growth_check("file size growth", before.bytes, after.bytes, max_pct);
+        checks.push(size_gate);
+    }
+    if let Some(max_pct) = options.max_parameter_growth_pct {
+        let old_params = before.estimated_parameter_count as usize;
+        let new_params = after.estimated_parameter_count as usize;
+        checks.push(growth_check(
+            "parameter growth",
+            old_params,
+            new_params,
+            max_pct,
+        ));
+    }
+    if options.fail_on_feature_change {
+        // Vec equality: the recovered names must match in content and order.
+        let names_unchanged = before.param_names == after.param_names;
+        let detail = format!(
+            "{} -> {} recovered names",
+            before.param_names.len(),
+            after.param_names.len()
+        );
+        checks.push(boolean_check(
+            "parameter names unchanged",
+            names_unchanged,
+            detail,
+        ));
+    }
+    print_check_report("PyTorch", &before.path, &after.path, &checks);
+    finish_check(&checks)
+}
 fn onnx_cmd(args: &[String]) -> Result<(), String> {
     if args.is_empty() {
         return Err("usage: mod-trace onnx [--json] [--limit 20] <model.onnx>".to_string());
@@ -3488,13 +3912,14 @@ fn print_help() {
         "mod-trace - inspect ML model artifacts without loading the framework\n\n\
 Core usage:\n  \
 mod-trace doctor [--json]\n  \
-mod-trace inspect [--deep] [--json] [--limit 20] <model.cbm|model.lgb|model.onnx|model.json>\n  \
-mod-trace diff [--deep] [--json] <old-model> <new-model>  (.cbm, .lgb/.txt LightGBM, or .onnx)\n  \
+mod-trace inspect [--deep] [--json] [--limit 20] <model.cbm|model.lgb|model.onnx|model.pt|model.json>\n  \
+mod-trace diff [--deep] [--json] <old-model> <new-model>  (.cbm, .lgb/.txt, .onnx, or .pt/.pth)\n  \
 mod-trace explain-diff <old-model> <new-model>  (plain-English what changed: layers, params, cost, new ops)\n\n\
 mod-trace check [--max-size-growth 20%] [--max-ops-growth 25%] [--max-parameter-growth 30%] [--fail-on-feature-change] [--fail-on-training-config-change] [--fail-on-new-op] <old-model> <new-model>\n\n\
 Artifact inspectors:\n  \
 mod-trace catboost [--deep] [--json] [--limit 20] <model.cbm> [more.cbm...]\n  \
 mod-trace lightgbm [--json] [--limit 20] <model.lgb|model.txt> [more...]\n  \
+mod-trace pytorch [--json] [--limit 20] <model.pt|model.pth> [more...]\n  \
 mod-trace onnx [--json] [--limit 20] <model.onnx>\n  \
 mod-trace explain <model.onnx|model.cbm>\n\n\
 Tensor lab (secondary; see docs/tensor-lab.md):\n  \
@@ -3831,7 +4256,7 @@ mod tests {
         assert_eq!(
             check_cmd(&args),
-            Err("check supports CatBoost, LightGBM, and ONNX artifacts, not tensor plan JSON."
+            Err("check supports CatBoost, LightGBM, ONNX, and PyTorch artifacts, not tensor plan JSON."
                 .to_string())
         );
     }

mod_trace-0.4.0/src/pt.rs ADDED Viewed

@@ -0,0 +1,398 @@
+use serde::Serialize;
+use std::fs::File;
+use std::io::{Read, Seek, SeekFrom};
+use std::path::Path;
+/// A static read of a PyTorch `torch.save` artifact (`.pt`/`.pth`).
+///
+/// Modern PyTorch saves an (uncompressed) ZIP of a pickled structure
+/// (`data.pkl`) plus raw tensor storages under `.../data/N`. This reader walks
+/// that zip without running Python or torch: it sizes the storages, recovers
+/// parameter names and dtype from the pickle, and fingerprints the weights by
+/// sampling. It does not decode exact per-tensor shapes (that needs a pickle VM).
+///
+/// Field order is significant: it is the order in which the derived
+/// `Serialize` implementation emits the fields.
+#[derive(Debug, Clone, Serialize)]
+pub struct PtReport {
+    /// Final component of the inspected path (the file name, not the full path).
+    pub path: String,
+    /// Size of the whole file on disk, in bytes.
+    pub bytes: usize,
+    /// `true` for the zip container, `false` for a legacy bare pickle.
+    pub is_zip: bool,
+    /// Number of `.../data/<digits>` storage entries found (0 for legacy files).
+    pub tensor_count: usize,
+    /// Sum of the uncompressed sizes of all storage entries (0 for legacy files).
+    pub total_parameter_bytes: u64,
+    /// `total_parameter_bytes` divided by the element size of the dominant
+    /// dtype (4 bytes is assumed when no dtype was recognized).
+    pub estimated_parameter_count: u64,
+    /// Dtype whose storage token occurs most often in the pickle, if any.
+    pub dominant_dtype: Option<String>,
+    /// Strings from the pickle that look like state_dict keys, deduplicated,
+    /// in first-seen order.
+    pub param_names: Vec<String>,
+    /// Trimmed contents of the archive's `version` entry, when present.
+    pub torch_version: Option<String>,
+    /// FNV-1a hash over the pickle plus a hash of entry names and sizes
+    /// (over the whole file for legacy pickles).
+    pub file_fingerprint: u64,
+    /// FNV-1a hash over a bounded sample of each storage plus its size
+    /// (over the whole file for legacy pickles).
+    pub learned_state_fingerprint: u64,
+}
+impl PtReport {
+    /// Human-readable label for the container format this report was read from.
+    pub fn format(&self) -> &'static str {
+        match self.is_zip {
+            true => "PyTorch (torch.save zip)",
+            false => "PyTorch (legacy pickle)",
+        }
+    }
+}
+/// Sniff a file's leading bytes: `true` when they start with the zip local
+/// header signature `PK\x03\x04` or with the byte `0x80`.
+pub fn looks_like_pt(head: &[u8]) -> bool {
+    match head {
+        [b'P', b'K', 0x03, 0x04, ..] => true,
+        [first, ..] => *first == 0x80,
+        [] => false,
+    }
+}
+/// Upper bound (1 MiB) on the bytes hashed from each tensor storage for the weight fingerprint.
+const SAMPLE_PER_STORAGE: u64 = 1024 * 1024;
+/// Inspect a PyTorch artifact at `path` and build a [`PtReport`] for it.
+///
+/// The first bytes select the reader: the zip signature `PK\x03\x04` goes to
+/// the zip walker, a leading `0x80` goes to the legacy-pickle scanner.
+///
+/// # Errors
+/// Returns a message when the file cannot be opened, stat'ed, or read, when
+/// it starts with neither marker, or when the selected reader fails.
+pub fn inspect(path: &str) -> Result<PtReport, String> {
+    let mut file = File::open(path).map_err(|err| format!("open {path}: {err}"))?;
+    let total = file
+        .metadata()
+        .map_err(|err| format!("stat {path}: {err}"))?
+        .len();
+    // A single `read` call may return fewer bytes than are available, which
+    // would misclassify a valid file. `take` + `read_to_end` keeps reading
+    // until four bytes arrive or the file ends, and retries on interruption.
+    let mut magic = Vec::with_capacity(4);
+    file.by_ref()
+        .take(4)
+        .read_to_end(&mut magic)
+        .map_err(|err| format!("read {path}: {err}"))?;
+    if magic.starts_with(b"PK\x03\x04") {
+        inspect_zip(&mut file, path, total)
+    } else if magic.first() == Some(&0x80) {
+        inspect_legacy(&mut file, path, total)
+    } else {
+        Err(format!(
+            "{path} does not look like a PyTorch file (no zip `PK` or pickle marker)"
+        ))
+    }
+}
+/// Read exactly `len` bytes starting at absolute `offset`.
+///
+/// `len` frequently comes straight from zip headers, so the request is checked
+/// against the bytes actually remaining in the file before any buffer is
+/// allocated; a corrupt size yields an error instead of a huge allocation.
+///
+/// # Errors
+/// Returns `seek: …` when seeking fails and `read: …` when the file cannot be
+/// stat'ed, is too short for the request, or the read itself fails.
+fn read_at(file: &mut File, offset: u64, len: usize) -> Result<Vec<u8>, String> {
+    file.seek(SeekFrom::Start(offset))
+        .map_err(|err| format!("seek: {err}"))?;
+    let remaining = file
+        .metadata()
+        .map_err(|err| format!("read: {err}"))?
+        .len()
+        .saturating_sub(offset);
+    if len as u64 > remaining {
+        // Same wording `read_exact` uses when it hits end-of-file early.
+        return Err("read: failed to fill whole buffer".to_string());
+    }
+    let mut buf = vec![0u8; len];
+    file.read_exact(&mut buf).map_err(|err| format!("read: {err}"))?;
+    Ok(buf)
+}
+/// Decode a little-endian `u16` from `b` at byte offset `o`; panics if fewer than two bytes remain.
+fn le_u16(b: &[u8], o: usize) -> u16 {
+    let mut raw = [0u8; 2];
+    raw.copy_from_slice(&b[o..o + 2]);
+    u16::from_le_bytes(raw)
+}
+/// Decode a little-endian `u32` from `b` at byte offset `o`; panics if fewer than four bytes remain.
+fn le_u32(b: &[u8], o: usize) -> u32 {
+    let mut raw = [0u8; 4];
+    raw.copy_from_slice(&b[o..o + 4]);
+    u32::from_le_bytes(raw)
+}
+/// Decode a little-endian `u64` from `b` at byte offset `o`; panics if fewer than eight bytes remain.
+fn le_u64(b: &[u8], o: usize) -> u64 {
+    let mut raw = [0u8; 8];
+    raw.copy_from_slice(&b[o..o + 8]);
+    u64::from_le_bytes(raw)
+}
+/// Offset where an entry's data begins, by reading its local file header.
+///
+/// The fixed part of a local header is 30 bytes; the name and extra field
+/// (whose lengths sit at header offsets 26 and 28) follow it, and the entry
+/// data starts right after them.
+fn entry_data_start(file: &mut File, local_offset: u64) -> Result<u64, String> {
+    const FIXED_HEADER_LEN: u64 = 30;
+    let header = read_at(file, local_offset, 30)?;
+    if !header.starts_with(b"PK\x03\x04") {
+        return Err("bad local header".to_string());
+    }
+    let variable_len = u64::from(le_u16(&header, 26)) + u64::from(le_u16(&header, 28));
+    Ok(local_offset + FIXED_HEADER_LEN + variable_len)
+}
+/// Index of the last occurrence of the 4-byte signature `sig` in `buf`, if any.
+fn find_sig_rev(buf: &[u8], sig: &[u8; 4]) -> Option<usize> {
+    // `windows(4)` yields nothing for buffers shorter than the signature.
+    buf.windows(4).rposition(|window| window == sig.as_slice())
+}
+/// Inspect a zip-format `torch.save` archive through its central directory.
+///
+/// Locates the end-of-central-directory (EOCD) record in the file tail, follows
+/// the ZIP64 locator when the 32-bit fields are saturated, then walks every
+/// central-directory entry: `data.pkl` is read in full, `version` is read (up
+/// to 32 bytes), and each `.../data/<digits>` storage is counted, sized, and
+/// sampled into the weight fingerprint. Entry data is read as raw stored
+/// bytes; no decompression is attempted.
+///
+/// # Errors
+/// Returns a message when the EOCD record or `data.pkl` is missing, when the
+/// central directory or one of its entries does not fit inside the file, or
+/// when a seek/read fails. Malformed archives are reported this way rather
+/// than panicking on an out-of-range slice.
+fn inspect_zip(file: &mut File, path: &str, total: u64) -> Result<PtReport, String> {
+    // Torch zips use data descriptors, so sizes live in the central directory.
+    let tail_len = total.min(65_557) as usize;
+    let tail = read_at(file, total - tail_len as u64, tail_len)?;
+    // The fixed EOCD record is 22 bytes, so its 4-byte signature can start no
+    // later than 22 bytes before the end. Ignoring later matches keeps the
+    // field reads below in bounds for truncated files.
+    let searchable = &tail[..tail.len().saturating_sub(18)];
+    let eocd_rel = find_sig_rev(searchable, b"PK\x05\x06")
+        .ok_or_else(|| format!("{path}: no zip EOCD record"))?;
+    let eocd = &tail[eocd_rel..];
+    let mut cd_size = le_u32(eocd, 12) as u64;
+    let mut cd_offset = le_u32(eocd, 16) as u64;
+    // ZIP64 when fields are saturated (models > 4 GiB).
+    if (cd_offset == 0xFFFF_FFFF || cd_size == 0xFFFF_FFFF) && eocd_rel >= 20 {
+        // The 20-byte ZIP64 locator sits immediately before the EOCD record.
+        let loc = &tail[eocd_rel - 20..];
+        if &loc[0..4] == b"PK\x06\x07" {
+            let z64_off = le_u64(loc, 8);
+            let z64 = read_at(file, z64_off, 56)?;
+            if &z64[0..4] == b"PK\x06\x06" {
+                cd_size = le_u64(&z64, 40);
+                cd_offset = le_u64(&z64, 48);
+            }
+        }
+    }
+    // Both values come from the file itself; refuse a directory that cannot
+    // fit inside it instead of allocating a buffer of arbitrary size.
+    let cd_in_file = cd_offset
+        .checked_add(cd_size)
+        .is_some_and(|end| end <= total);
+    if !cd_in_file {
+        return Err(format!("{path}: zip central directory lies outside the file"));
+    }
+    let cd_len = usize::try_from(cd_size)
+        .map_err(|_| format!("{path}: zip central directory too large for this platform"))?;
+    let cd = read_at(file, cd_offset, cd_len)?;
+    let mut pkl = Vec::new();
+    let mut torch_version = None;
+    let mut tensor_count = 0usize;
+    let mut total_parameter_bytes = 0u64;
+    let mut struct_hash = FNV_OFFSET;
+    let mut weight_hash = FNV_OFFSET;
+    let mut o = 0usize;
+    while o + 46 <= cd.len() && &cd[o..o + 4] == b"PK\x01\x02" {
+        let comp32 = le_u32(&cd, o + 20);
+        let mut uncomp_size = le_u32(&cd, o + 24) as u64;
+        let name_len = le_u16(&cd, o + 28) as usize;
+        let extra_len = le_u16(&cd, o + 30) as usize;
+        let comment_len = le_u16(&cd, o + 32) as usize;
+        let mut local_offset = le_u32(&cd, o + 42) as u64;
+        let name_start = o + 46;
+        let extra_start = name_start + name_len;
+        let extra_end = extra_start + extra_len;
+        // The declared name/extra lengths must stay inside the directory.
+        if extra_end > cd.len() {
+            return Err(format!("{path}: truncated zip central directory entry"));
+        }
+        let name = String::from_utf8_lossy(&cd[name_start..extra_start]).to_string();
+        let extra = &cd[extra_start..extra_end];
+        // ZIP64 extended info: present fields appear in order uncomp, comp, offset.
+        if uncomp_size == 0xFFFF_FFFF || local_offset == 0xFFFF_FFFF {
+            let mut e = 0usize;
+            while e + 4 <= extra.len() {
+                let id = le_u16(extra, e);
+                let sz = le_u16(extra, e + 2) as usize;
+                if id == 0x0001 {
+                    let mut f = e + 4;
+                    if uncomp_size == 0xFFFF_FFFF && f + 8 <= extra.len() {
+                        uncomp_size = le_u64(extra, f);
+                        f += 8;
+                    }
+                    if comp32 == 0xFFFF_FFFF && f + 8 <= extra.len() {
+                        f += 8; // skip compressed size
+                    }
+                    if local_offset == 0xFFFF_FFFF && f + 8 <= extra.len() {
+                        local_offset = le_u64(extra, f);
+                    }
+                    break;
+                }
+                e += 4 + sz;
+            }
+        }
+        let base = name.rsplit('/').next().unwrap_or(&name);
+        // Structure hash: every entry's name and size, in directory order.
+        struct_hash = fnv_update(struct_hash, name.as_bytes());
+        struct_hash = fnv_update(struct_hash, &uncomp_size.to_le_bytes());
+        if name.ends_with("data.pkl") {
+            // A stored entry can never be larger than the archive holding it.
+            if uncomp_size > total {
+                return Err(format!("{path}: data.pkl entry is larger than the file"));
+            }
+            let start = entry_data_start(file, local_offset)?;
+            pkl = read_at(file, start, uncomp_size as usize)?;
+        } else if name.ends_with("/version") || name == "version" {
+            let start = entry_data_start(file, local_offset)?;
+            let v = read_at(file, start, uncomp_size.min(32) as usize)?;
+            torch_version = Some(String::from_utf8_lossy(&v).trim().to_string());
+        } else if name.contains("/data/")
+            && !base.is_empty()
+            && base.bytes().all(|b| b.is_ascii_digit())
+        {
+            tensor_count += 1;
+            // Sizes are file-controlled; report an overflow instead of wrapping.
+            total_parameter_bytes = total_parameter_bytes
+                .checked_add(uncomp_size)
+                .ok_or_else(|| format!("{path}: tensor storage sizes overflow"))?;
+            let sample = uncomp_size.min(SAMPLE_PER_STORAGE) as usize;
+            if sample > 0 {
+                let start = entry_data_start(file, local_offset)?;
+                let bytes = read_at(file, start, sample)?;
+                weight_hash = fnv_update(weight_hash, &bytes);
+            }
+            weight_hash = fnv_update(weight_hash, &uncomp_size.to_le_bytes());
+        }
+        o = extra_end + comment_len;
+    }
+    if pkl.is_empty() {
+        return Err(format!("{path}: no data.pkl found inside the archive"));
+    }
+    let param_names = recover_param_names(&pkl);
+    let (dtype, elem_size) = dominant_dtype(&pkl);
+    let estimated_parameter_count = total_parameter_bytes / elem_size;
+    let file_fingerprint = fnv_update(fnv_update(FNV_OFFSET, &pkl), &struct_hash.to_le_bytes());
+    Ok(PtReport {
+        path: file_name(path),
+        bytes: total as usize,
+        is_zip: true,
+        tensor_count,
+        total_parameter_bytes,
+        estimated_parameter_count,
+        dominant_dtype: dtype,
+        param_names,
+        torch_version,
+        file_fingerprint,
+        learned_state_fingerprint: weight_hash,
+    })
+}
+/// Inspect a legacy (non-zip) pickle by scanning the raw bytes.
+///
+/// No tensor storages can be located in this layout, so the tensor and
+/// parameter counts are reported as zero and both fingerprints are the FNV-1a
+/// hash of the entire file.
+fn inspect_legacy(file: &mut File, path: &str, total: u64) -> Result<PtReport, String> {
+    // Legacy (non-zip) pickle: we can only scan strings + fingerprint the bytes.
+    let bytes = read_at(file, 0, total as usize)?;
+    let param_names = recover_param_names(&bytes);
+    let (dtype, _) = dominant_dtype(&bytes);
+    // Both fingerprints cover the same bytes; hash the file once and reuse it.
+    let fingerprint = fnv_update(FNV_OFFSET, &bytes);
+    Ok(PtReport {
+        path: file_name(path),
+        bytes: total as usize,
+        is_zip: false,
+        tensor_count: 0,
+        total_parameter_bytes: 0,
+        estimated_parameter_count: 0,
+        dominant_dtype: dtype,
+        param_names,
+        torch_version: None,
+        file_fingerprint: fingerprint,
+        learned_state_fingerprint: fingerprint,
+    })
+}
+/// Final component of `path` as an owned string; falls back to `path` itself
+/// when there is no file-name component (for example `""` or `".."`).
+fn file_name(path: &str) -> String {
+    let leaf = match Path::new(path).file_name() {
+        Some(os_name) => os_name.to_str(),
+        None => None,
+    };
+    String::from(leaf.unwrap_or(path))
+}
+/// Pull UTF-8 strings out of pickle BINUNICODE (0x58) / SHORT_BINUNICODE (0x8c)
+/// opcodes and keep the ones that look like state_dict keys.
+///
+/// This is a byte scan, not a pickle VM: any `0x58`/`0x8c` byte is treated as
+/// a candidate opcode. A candidate whose declared payload fits in the buffer
+/// is skipped over as a whole (even if it is not valid UTF-8); otherwise the
+/// scan advances by one byte. Names are returned deduplicated, in first-seen
+/// order.
+fn recover_param_names(pkl: &[u8]) -> Vec<String> {
+    let mut names = Vec::new();
+    // Borrowing from `pkl` lets the dedup set avoid one allocation per candidate.
+    let mut seen: std::collections::BTreeSet<&str> = std::collections::BTreeSet::new();
+    let mut i = 0usize;
+    while i < pkl.len() {
+        // (opcode + length-prefix width, declared payload length)
+        let framing = match pkl[i] {
+            0x8c => pkl.get(i + 1).map(|&n| (2usize, usize::from(n))),
+            0x58 => pkl
+                .get(i + 1..i + 5)
+                .map(|raw| (5usize, le_u32(raw, 0) as usize)),
+            _ => None,
+        };
+        // `checked_add` + `get` keep a huge declared length from wrapping
+        // around on 32-bit targets and slicing out of range.
+        let payload = framing.and_then(|(header, n)| {
+            let start = i + header;
+            let end = start.checked_add(n)?;
+            Some((pkl.get(start..end)?, end))
+        });
+        let Some((raw, next)) = payload else {
+            i += 1;
+            continue;
+        };
+        i = next;
+        if let Ok(text) = std::str::from_utf8(raw) {
+            if looks_like_param_name(text) && seen.insert(text) {
+                names.push(text.to_string());
+            }
+        }
+    }
+    names
+}
+/// Heuristic filter for state_dict-style keys: 2–200 bytes, only ASCII
+/// alphanumerics, `_` and `.`, and either containing a dot or ending in a
+/// common parameter/buffer suffix.
+fn looks_like_param_name(value: &str) -> bool {
+    const SUFFIXES: [&str; 4] = ["weight", "bias", "running_mean", "running_var"];
+    let length_ok = (2..=200).contains(&value.len());
+    let charset_ok = value
+        .bytes()
+        .all(|b| matches!(b, b'_' | b'.') || b.is_ascii_alphanumeric());
+    let shape_ok =
+        value.contains('.') || SUFFIXES.iter().any(|suffix| value.ends_with(suffix));
+    length_ok && charset_ok && shape_ok
+}
+/// Guess the dominant tensor dtype by counting legacy typed-storage class
+/// names (`FloatStorage`, `HalfStorage`, …) in the pickle bytes.
+///
+/// Returns the dtype name with the highest token count together with its
+/// element size in bytes. Ties go to the earlier table entry. When no token is
+/// found the result is `(None, 4)`, i.e. a 4-byte element is assumed.
+fn dominant_dtype(pkl: &[u8]) -> (Option<String>, u64) {
+    // (storage token, dtype name, element size)
+    // `ShortStorage`/`CharStorage` are appended last so that tie-breaking
+    // among the original eight entries is unchanged.
+    let candidates: [(&[u8], &str, u64); 10] = [
+        (b"DoubleStorage", "float64", 8),
+        (b"BFloat16Storage", "bfloat16", 2),
+        (b"HalfStorage", "float16", 2),
+        (b"FloatStorage", "float32", 4),
+        (b"LongStorage", "int64", 8),
+        (b"IntStorage", "int32", 4),
+        (b"ByteStorage", "uint8", 1),
+        (b"BoolStorage", "bool", 1),
+        (b"ShortStorage", "int16", 2),
+        (b"CharStorage", "int8", 1),
+    ];
+    let mut best: Option<(&str, u64, usize)> = None;
+    for (token, name, size) in candidates {
+        let count = count_occurrences(pkl, token);
+        // Strictly-greater comparison: the first entry to reach a count wins ties.
+        if count > 0 && best.map_or(true, |(_, _, top)| count > top) {
+            best = Some((name, size, count));
+        }
+    }
+    match best {
+        Some((name, size, _)) => (Some(name.to_string()), size),
+        None => (None, 4),
+    }
+}
+/// Count non-overlapping occurrences of `needle` in `haystack`, scanning left
+/// to right; an empty needle counts as zero.
+fn count_occurrences(haystack: &[u8], needle: &[u8]) -> usize {
+    let width = needle.len();
+    if width == 0 {
+        return 0;
+    }
+    let mut rest = haystack;
+    let mut hits = 0;
+    while rest.len() >= width {
+        if rest.starts_with(needle) {
+            hits += 1;
+            // Jump past the whole match so matches never overlap.
+            rest = &rest[width..];
+        } else {
+            rest = &rest[1..];
+        }
+    }
+    hits
+}
+/// 64-bit FNV-1a offset basis (the hash of empty input).
+const FNV_OFFSET: u64 = 0xcbf29ce484222325;
+/// 64-bit FNV prime.
+const FNV_PRIME: u64 = 0x100000001b3;
+/// Fold `data` into a running 64-bit FNV-1a `hash` (xor each byte, then
+/// multiply by the prime) and return the updated value.
+fn fnv_update(hash: u64, data: &[u8]) -> u64 {
+    data.iter()
+        .fold(hash, |acc, &byte| (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME))
+}
+#[cfg(test)]
+mod tests {
+    use super::{count_occurrences, looks_like_param_name, recover_param_names};
+    /// Dotted state_dict keys pass; bare words and strings with spaces do not.
+    #[test]
+    fn param_name_filter() {
+        for accepted in ["fc1.weight", "encoder.layers.3.attn.bias", "classifier.weight"] {
+            assert!(looks_like_param_name(accepted));
+        }
+        for rejected in ["cpu", "storage", "has space"] {
+            assert!(!looks_like_param_name(rejected));
+        }
+    }
+    /// Both string opcodes are decoded and names come back in stream order.
+    #[test]
+    fn recovers_binunicode_names() {
+        // SHORT_BINUNICODE(0x8c) len=10 "fc1.weight", then BINUNICODE(0x58) len=8 "fc1.bias"
+        let short_name = b"fc1.weight";
+        let long_name = b"fc1.bias";
+        let mut pkl = Vec::new();
+        pkl.push(0x8c);
+        pkl.push(10);
+        pkl.extend_from_slice(short_name);
+        pkl.push(0x58);
+        pkl.extend_from_slice(&8u32.to_le_bytes());
+        pkl.extend_from_slice(long_name);
+        let recovered = recover_param_names(&pkl);
+        assert_eq!(recovered, vec!["fc1.weight", "fc1.bias"]);
+    }
+    /// Adjacent tokens are each counted once; absent tokens count as zero.
+    #[test]
+    fn counts_tokens() {
+        let token = b"FloatStorage";
+        assert_eq!(count_occurrences(b"FloatStorageFloatStorage", token), 2);
+        assert_eq!(count_occurrences(b"none here", token), 0);
+    }
+}