npm - open-agents-ai - Versions diffs - 0.185.29 → 0.185.30 - Mend

open-agents-ai 0.185.29 → 0.185.30

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.js +65 -0
package/package.json +1 -1
package/voices/personaplex/quantize-weights.py +167 -0

package/dist/index.js CHANGED Viewed

@@ -41297,6 +41297,18 @@ function detectPersonaPlexCapability() {
     const [gpuName, vramMB] = nvsmi.split("\n")[0].split(", ");
     const vramGB = parseInt(vramMB ?? "0", 10) / 1024;
     if (vramGB < 16) {
+      const isJetson = /orin|tegra|jetson/i.test(gpuName ?? "");
+      if (isJetson) {
+        try {
+          const memInfo = execSync27("grep MemTotal /proc/meminfo", { encoding: "utf8", timeout: 3e3, stdio: "pipe" });
+          const memKB = parseInt(memInfo.match(/(\d+)/)?.[1] ?? "0", 10);
+          const totalGB = memKB / 1024 / 1024;
+          if (totalGB >= 32) {
+            return { supported: true, reason: `Jetson unified memory (${totalGB.toFixed(0)}GB total)`, gpuName: gpuName ?? "", vramGB: totalGB };
+          }
+        } catch {
+        }
+      }
       return { supported: false, reason: `GPU has ${vramGB.toFixed(1)}GB VRAM (need \u226516GB)`, gpuName: gpuName ?? "", vramGB };
     }
     try {
@@ -41352,6 +41364,14 @@ async function installPersonaPlex(onInfo) {
   }
   const pip = process.platform === "win32" ? join54(venvDir, "Scripts", "pip.exe") : join54(venvDir, "bin", "pip");
   const python = process.platform === "win32" ? join54(venvDir, "Scripts", "python.exe") : join54(venvDir, "bin", "python3");
+  let arch2 = "";
+  try {
+    arch2 = execSync27("uname -m", { encoding: "utf8", timeout: 3e3, stdio: "pipe" }).trim();
+  } catch {
+  }
+  const isAarch64 = arch2 === "aarch64" || arch2 === "arm64";
+  if (isAarch64)
+    log(`Detected ARM64 platform (${arch2}) \u2014 Jetson/ARM install path`);
   log("Checking system dependencies (libopus)...");
   try {
     if (process.platform === "linux") {
@@ -41361,12 +41381,43 @@ async function installPersonaPlex(onInfo) {
     }
   } catch {
   }
+  if (isAarch64) {
+    log("ARM64: Checking Rust toolchain for sphn build...");
+    try {
+      execSync27("rustc --version", { timeout: 5e3, stdio: "pipe" });
+    } catch {
+      log("ARM64: Installing Rust toolchain (needed for sphn audio codec)...");
+      try {
+        execSync27("curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y", { timeout: 12e4, stdio: "pipe" });
+      } catch (e) {
+        log(`Rust install failed: ${e instanceof Error ? e.message : String(e)}`);
+        log("Install Rust manually: curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh");
+        return false;
+      }
+    }
+    try {
+      execSync27(`"${pip}" install --quiet maturin`, { timeout: 6e4, stdio: "pipe" });
+    } catch {
+    }
+  }
   log("Installing PersonaPlex (moshi package)...");
   const repoDir = join54(PERSONAPLEX_DIR, "personaplex-repo");
   try {
     if (!existsSync37(repoDir)) {
       execSync27(`git clone https://github.com/NVIDIA/personaplex.git "${repoDir}"`, { timeout: 12e4, stdio: "pipe" });
     }
+    if (isAarch64) {
+      log("ARM64: Building sphn from source (Opus codec bindings)...");
+      try {
+        const rustEnv = `export PATH="$HOME/.cargo/bin:$PATH" &&`;
+        execSync27(`${rustEnv} "${pip}" install --quiet --no-binary sphn sphn`, { timeout: 3e5, stdio: "pipe", shell: "/bin/bash" });
+        log("ARM64: sphn built successfully");
+      } catch (e) {
+        log(`ARM64: sphn build failed \u2014 ${e instanceof Error ? e.message : String(e)}`);
+        log("Ensure Rust, libopus-dev, and cmake are installed.");
+        return false;
+      }
+    }
     execSync27(`"${pip}" install --quiet "${join54(repoDir, "moshi")}/."`, { timeout: 3e5, stdio: "pipe" });
   } catch (err) {
     log(`Moshi install failed: ${err instanceof Error ? err.message : String(err)}`);
@@ -41394,7 +41445,21 @@ async function installPersonaPlex(onInfo) {
     }
   } catch {
   }
+  if (isAarch64) {
+    log("ARM64: Installing bitsandbytes for INT4 inference...");
+    try {
+      execSync27(`"${pip}" install --quiet bitsandbytes`, { timeout: 12e4, stdio: "pipe" });
+    } catch {
+    }
+  }
+  try {
+    execSync27(`"${pip}" install --quiet pyloudnorm noisereduce torchaudio`, { timeout: 12e4, stdio: "pipe" });
+  } catch {
+  }
   log("PersonaPlex installed. Model will download on first launch (~14GB).");
+  if (isAarch64) {
+    log("ARM64: On first run, weights will load in INT4 mode for real-time performance.");
+  }
   writeFileSync16(join54(PERSONAPLEX_DIR, "model_ready"), (/* @__PURE__ */ new Date()).toISOString());
   log("PersonaPlex installed successfully.");
   return true;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "open-agents-ai",
-  "version": "0.185.29",
+  "version": "0.185.30",
   "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
   "type": "module",
   "main": "./dist/index.js",

package/voices/personaplex/quantize-weights.py ADDED Viewed

@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+"""
+quantize-weights.py — Quantize PersonaPlex 7B weights to INT4 (NF4) for edge devices.
+Creates a ~3.5GB quantized checkpoint from the ~14GB bf16 weights.
+The quantized model runs 3-4x faster on memory-bandwidth-limited devices
+like Jetson AGX Orin while maintaining voice quality.
+Usage:
+  python quantize-weights.py [--device cuda] [--output personaplex-7b-nf4.safetensors]
+Requirements:
+  pip install bitsandbytes safetensors torch
+"""
+import argparse
+import os
+import sys
+import logging
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+log = logging.getLogger(__name__)
+def quantize_model(device: str = "cuda", output_path: str = None):
+    """Quantize PersonaPlex 7B to NF4 (4-bit Normal Float)"""
+    import torch
+    from huggingface_hub import hf_hub_download
+    from safetensors.torch import load_file, save_file
+    hf_repo = "nvidia/personaplex-7b-v1"
+    # 1) Download original weights
+    log.info("Downloading PersonaPlex 7B weights...")
+    weight_path = hf_hub_download(hf_repo, "model.safetensors")
+    log.info(f"  Weights: {weight_path}")
+    log.info(f"  Size: {os.path.getsize(weight_path) / 1024**3:.1f} GB")
+    # 2) Load state dict
+    log.info("Loading state dict...")
+    state_dict = load_file(weight_path, device="cpu")
+    log.info(f"  Loaded {len(state_dict)} tensors")
+    # 3) Quantize each weight tensor to INT4 using block-wise NF4
+    try:
+        import bitsandbytes as bnb
+        from bitsandbytes.functional import quantize_nf4, dequantize_nf4
+        HAS_BNB = True
+    except ImportError:
+        HAS_BNB = False
+        log.info("  bitsandbytes not available — using manual INT4 quantization")
+    quantized_state = {}
+    quant_meta = {}  # Store quantization parameters for dequantization
+    total_original = 0
+    total_quantized = 0
+    skipped = 0
+    for name, tensor in state_dict.items():
+        original_bytes = tensor.numel() * tensor.element_size()
+        total_original += original_bytes
+        # Only quantize large weight matrices (≥1024 elements, 2D)
+        # Skip biases, norms, embeddings, small tensors
+        should_quantize = (
+            tensor.ndim >= 2
+            and tensor.numel() >= 1024
+            and not any(skip in name for skip in [
+                "norm", "bias", "embed", "positional", "rope",
+                "depformer_emb", "depformer_in",
+            ])
+        )
+        if not should_quantize:
+            quantized_state[name] = tensor.to(torch.float16).contiguous()
+            total_quantized += tensor.numel() * 2  # fp16
+            skipped += 1
+            continue
+        # Reshape to 2D for quantization
+        orig_shape = tensor.shape
+        flat = tensor.reshape(-1).float()
+        if HAS_BNB:
+            # Use bitsandbytes NF4 quantization
+            quant_tensor, quant_state = bnb.functional.quantize_4bit(
+                flat, quant_type="nf4", compress_statistics=True,
+            )
+            # Store the quantized bytes + metadata for reconstruction
+            quantized_state[name] = quant_tensor.contiguous()
+            quant_meta[f"{name}.__quant_state__"] = torch.tensor(
+                list(orig_shape) + [0] * (4 - len(orig_shape)),
+                dtype=torch.int64,
+            )
+            # Store absmax for dequantization
+            if hasattr(quant_state, 'absmax'):
+                quantized_state[f"{name}.__absmax__"] = quant_state.absmax.contiguous()
+            if hasattr(quant_state, 'quant_map'):
+                quantized_state[f"{name}.__quant_map__"] = quant_state.quant_map.contiguous()
+            total_quantized += quant_tensor.numel()
+        else:
+            # Manual symmetric INT4 quantization (no bitsandbytes)
+            # Block size 64 for good accuracy
+            block_size = 64
+            n_blocks = (flat.numel() + block_size - 1) // block_size
+            padded = torch.zeros(n_blocks * block_size)
+            padded[:flat.numel()] = flat
+            blocks = padded.reshape(n_blocks, block_size)
+            scales = blocks.abs().max(dim=1).values / 7.0  # INT4 range: -8 to 7
+            scales = scales.clamp(min=1e-8)
+            # Quantize to INT4 (stored as INT8 pairs)
+            quantized_blocks = torch.round(blocks / scales.unsqueeze(1)).clamp(-8, 7).to(torch.int8)
+            # Pack two INT4 values into one INT8
+            packed = torch.zeros(n_blocks, block_size // 2, dtype=torch.uint8)
+            for i in range(block_size // 2):
+                low = (quantized_blocks[:, 2 * i] + 8).to(torch.uint8)
+                high = (quantized_blocks[:, 2 * i + 1] + 8).to(torch.uint8)
+                packed[:, i] = low | (high << 4)
+            quantized_state[name] = packed.reshape(-1).contiguous()
+            quantized_state[f"{name}.__scales__"] = scales.to(torch.float16).contiguous()
+            quant_meta[f"{name}.__quant_state__"] = torch.tensor(
+                list(orig_shape) + [0] * (4 - len(orig_shape)) + [block_size, flat.numel()],
+                dtype=torch.int64,
+            )
+            total_quantized += packed.numel() + scales.numel() * 2
+    # Add metadata tensors
+    quantized_state.update(quant_meta)
+    # 4) Save quantized weights
+    if output_path is None:
+        output_path = os.path.join(os.path.dirname(weight_path), "model-nf4.safetensors")
+    log.info(f"\nSaving quantized weights to: {output_path}")
+    save_file(quantized_state, output_path)
+    final_size = os.path.getsize(output_path)
+    compression = total_original / max(final_size, 1)
+    log.info(f"\nQuantization complete!")
+    log.info(f"  Original: {total_original / 1024**3:.1f} GB (bf16)")
+    log.info(f"  Quantized: {final_size / 1024**3:.1f} GB (NF4)")
+    log.info(f"  Compression: {compression:.1f}x")
+    log.info(f"  Tensors quantized: {len(state_dict) - skipped}/{len(state_dict)}")
+    log.info(f"  Tensors kept fp16: {skipped} (norms, biases, embeddings)")
+    log.info(f"\nUse --quantized flag with PersonaPlex server for INT4 inference")
+    return output_path
+def main():
+    parser = argparse.ArgumentParser(description="Quantize PersonaPlex 7B to INT4 NF4")
+    parser.add_argument("--device", default="cuda", help="Device for quantization")
+    parser.add_argument("--output", "-o", default=None, help="Output path for quantized weights")
+    args = parser.parse_args()
+    import torch
+    with torch.no_grad():
+        quantize_model(device=args.device, output_path=args.output)
+if __name__ == "__main__":
+    main()