open-agents-ai 0.185.29 → 0.185.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -41270,6 +41270,8 @@ __export(personaplex_exports, {
41270
41270
  clonePersonaPlexVoice: () => clonePersonaPlexVoice,
41271
41271
  detectPersonaPlexCapability: () => detectPersonaPlexCapability,
41272
41272
  getPersonaPlexWSUrl: () => getPersonaPlexWSUrl,
41273
+ getWeightRepoInfo: () => getWeightRepoInfo,
41274
+ getWeightTier: () => getWeightTier,
41273
41275
  installPersonaPlex: () => installPersonaPlex,
41274
41276
  isPersonaPlexInstalled: () => isPersonaPlexInstalled,
41275
41277
  isPersonaPlexRunning: () => isPersonaPlexRunning,
@@ -41284,6 +41286,13 @@ import { join as join54, dirname as dirname18 } from "node:path";
41284
41286
  import { homedir as homedir13 } from "node:os";
41285
41287
  import { execSync as execSync27, spawn as spawn19 } from "node:child_process";
41286
41288
  import { fileURLToPath as fileURLToPath11 } from "node:url";
41289
/**
 * Map an amount of GPU memory to the PersonaPlex weight tier to use.
 *
 * @param {number} vramGB - Available GPU memory in gigabytes.
 * @returns {"original"|"nf4"|"turbo2bit"} "original" at 48 GB and above,
 *   "nf4" at 16 GB and above, otherwise "turbo2bit".
 */
function selectWeightTier(vramGB) {
  // Ordered from the most demanding tier down; the first satisfied minimum wins.
  const tierMinimums = [
    [48, "original"],
    [16, "nf4"],
  ];
  for (const [minGB, tierName] of tierMinimums) {
    if (vramGB >= minGB) {
      return tierName;
    }
  }
  // Anything below the smallest minimum (including NaN) lands on the smallest tier.
  return "turbo2bit";
}
41287
41296
  function detectPersonaPlexCapability() {
41288
41297
  try {
41289
41298
  const nvsmi = execSync27("nvidia-smi --query-gpu=name,memory.total --format=csv,noheader,nounits", {
@@ -41292,12 +41301,23 @@ function detectPersonaPlexCapability() {
41292
41301
  stdio: "pipe"
41293
41302
  }).trim();
41294
41303
  if (!nvsmi) {
41295
- return { supported: false, reason: "No NVIDIA GPU detected", gpuName: "", vramGB: 0 };
41304
+ return { supported: false, reason: "No NVIDIA GPU detected", gpuName: "", vramGB: 0, weightTier: "turbo2bit", needsHfToken: false };
41296
41305
  }
41297
41306
  const [gpuName, vramMB] = nvsmi.split("\n")[0].split(", ");
41298
- const vramGB = parseInt(vramMB ?? "0", 10) / 1024;
41299
- if (vramGB < 16) {
41300
- return { supported: false, reason: `GPU has ${vramGB.toFixed(1)}GB VRAM (need \u226516GB)`, gpuName: gpuName ?? "", vramGB };
41307
+ let vramGB = parseInt(vramMB ?? "0", 10) / 1024;
41308
+ const isJetson = /orin|tegra|jetson/i.test(gpuName ?? "");
41309
+ if (isJetson) {
41310
+ try {
41311
+ const memInfo = execSync27("grep MemTotal /proc/meminfo", { encoding: "utf8", timeout: 3e3, stdio: "pipe" });
41312
+ const memKB = parseInt(memInfo.match(/(\d+)/)?.[1] ?? "0", 10);
41313
+ const totalGB = memKB / 1024 / 1024;
41314
+ if (totalGB > vramGB)
41315
+ vramGB = totalGB;
41316
+ } catch {
41317
+ }
41318
+ }
41319
+ if (vramGB < 8) {
41320
+ return { supported: false, reason: `GPU has ${vramGB.toFixed(1)}GB VRAM (need \u22658GB for 2-bit weights)`, gpuName: gpuName ?? "", vramGB, weightTier: "turbo2bit", needsHfToken: false };
41301
41321
  }
41302
41322
  try {
41303
41323
  execSync27('python3 -c "import torch; assert torch.cuda.is_available()"', {
@@ -41305,11 +41325,22 @@ function detectPersonaPlexCapability() {
41305
41325
  stdio: "pipe"
41306
41326
  });
41307
41327
  } catch {
41308
- return { supported: false, reason: "PyTorch CUDA not available", gpuName: gpuName ?? "", vramGB };
41328
+ const tier2 = selectWeightTier(vramGB);
41329
+ return { supported: false, reason: "PyTorch CUDA not available", gpuName: gpuName ?? "", vramGB, weightTier: tier2, needsHfToken: WEIGHT_REPOS[tier2].needsToken };
41309
41330
  }
41310
- return { supported: true, reason: "OK", gpuName: gpuName ?? "", vramGB };
41331
+ const tier = selectWeightTier(vramGB);
41332
+ const hasHfToken = !!(process.env["HF_TOKEN"] || process.env["HUGGING_FACE_HUB_TOKEN"]);
41333
+ const effectiveTier = tier === "original" && !hasHfToken ? "nf4" : tier;
41334
+ return {
41335
+ supported: true,
41336
+ reason: `OK \u2014 ${effectiveTier} weights (${WEIGHT_REPOS[effectiveTier].sizeGB}GB)`,
41337
+ gpuName: gpuName ?? "",
41338
+ vramGB,
41339
+ weightTier: effectiveTier,
41340
+ needsHfToken: WEIGHT_REPOS[effectiveTier].needsToken
41341
+ };
41311
41342
  } catch {
41312
- return { supported: false, reason: "nvidia-smi not found", gpuName: "", vramGB: 0 };
41343
+ return { supported: false, reason: "nvidia-smi not found", gpuName: "", vramGB: 0, weightTier: "turbo2bit", needsHfToken: false };
41313
41344
  }
41314
41345
  }
41315
41346
  function isPersonaPlexRunning() {
@@ -41336,7 +41367,19 @@ function getPersonaPlexWSUrl() {
41336
41367
  function isPersonaPlexInstalled() {
41337
41368
  return existsSync37(join54(PERSONAPLEX_DIR, "model_ready"));
41338
41369
  }
41339
- async function installPersonaPlex(onInfo) {
41370
/**
 * Resolve the weight tier to use.
 *
 * Reads the `weight_tier` file in PERSONAPLEX_DIR; when its trimmed contents
 * name one of WEIGHT_REPOS' own keys, that value is returned. Otherwise the
 * tier reported by detectPersonaPlexCapability() is returned.
 *
 * @returns {string} A key of WEIGHT_REPOS.
 */
function getWeightTier() {
  const tierFile = join54(PERSONAPLEX_DIR, "weight_tier");
  let saved = null;
  try {
    saved = readFileSync28(tierFile, "utf8").trim();
  } catch {
    // A missing or unreadable tier file just means no tier was persisted;
    // fall through to hardware detection instead of throwing.
  }
  // Own-property check: the `in` operator would also accept inherited names
  // such as "constructor" or "toString" from a corrupted tier file.
  if (saved !== null && Object.prototype.hasOwnProperty.call(WEIGHT_REPOS, saved)) {
    return saved;
  }
  return detectPersonaPlexCapability().weightTier;
}
41379
/**
 * Look up the WEIGHT_REPOS entry for a weight tier.
 *
 * @param {string} tier - Key into WEIGHT_REPOS.
 * @returns {object|undefined} The entry stored under `tier`, or undefined
 *   when WEIGHT_REPOS has no such key.
 */
function getWeightRepoInfo(tier) {
  const { [tier]: repoInfo } = WEIGHT_REPOS;
  return repoInfo;
}
41382
+ async function installPersonaPlex(onInfo, weightTier) {
41340
41383
  const log = onInfo ?? (() => {
41341
41384
  });
41342
41385
  mkdirSync15(PERSONAPLEX_DIR, { recursive: true });
@@ -41352,6 +41395,14 @@ async function installPersonaPlex(onInfo) {
41352
41395
  }
41353
41396
  const pip = process.platform === "win32" ? join54(venvDir, "Scripts", "pip.exe") : join54(venvDir, "bin", "pip");
41354
41397
  const python = process.platform === "win32" ? join54(venvDir, "Scripts", "python.exe") : join54(venvDir, "bin", "python3");
41398
+ let arch2 = "";
41399
+ try {
41400
+ arch2 = execSync27("uname -m", { encoding: "utf8", timeout: 3e3, stdio: "pipe" }).trim();
41401
+ } catch {
41402
+ }
41403
+ const isAarch64 = arch2 === "aarch64" || arch2 === "arm64";
41404
+ if (isAarch64)
41405
+ log(`Detected ARM64 platform (${arch2}) \u2014 Jetson/ARM install path`);
41355
41406
  log("Checking system dependencies (libopus)...");
41356
41407
  try {
41357
41408
  if (process.platform === "linux") {
@@ -41361,12 +41412,43 @@ async function installPersonaPlex(onInfo) {
41361
41412
  }
41362
41413
  } catch {
41363
41414
  }
41415
+ if (isAarch64) {
41416
+ log("ARM64: Checking Rust toolchain for sphn build...");
41417
+ try {
41418
+ execSync27("rustc --version", { timeout: 5e3, stdio: "pipe" });
41419
+ } catch {
41420
+ log("ARM64: Installing Rust toolchain (needed for sphn audio codec)...");
41421
+ try {
41422
+ execSync27("curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y", { timeout: 12e4, stdio: "pipe" });
41423
+ } catch (e) {
41424
+ log(`Rust install failed: ${e instanceof Error ? e.message : String(e)}`);
41425
+ log("Install Rust manually: curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh");
41426
+ return false;
41427
+ }
41428
+ }
41429
+ try {
41430
+ execSync27(`"${pip}" install --quiet maturin`, { timeout: 6e4, stdio: "pipe" });
41431
+ } catch {
41432
+ }
41433
+ }
41364
41434
  log("Installing PersonaPlex (moshi package)...");
41365
41435
  const repoDir = join54(PERSONAPLEX_DIR, "personaplex-repo");
41366
41436
  try {
41367
41437
  if (!existsSync37(repoDir)) {
41368
41438
  execSync27(`git clone https://github.com/NVIDIA/personaplex.git "${repoDir}"`, { timeout: 12e4, stdio: "pipe" });
41369
41439
  }
41440
+ if (isAarch64) {
41441
+ log("ARM64: Building sphn from source (Opus codec bindings)...");
41442
+ try {
41443
+ const rustEnv = `export PATH="$HOME/.cargo/bin:$PATH" &&`;
41444
+ execSync27(`${rustEnv} "${pip}" install --quiet --no-binary sphn sphn`, { timeout: 3e5, stdio: "pipe", shell: "/bin/bash" });
41445
+ log("ARM64: sphn built successfully");
41446
+ } catch (e) {
41447
+ log(`ARM64: sphn build failed \u2014 ${e instanceof Error ? e.message : String(e)}`);
41448
+ log("Ensure Rust, libopus-dev, and cmake are installed.");
41449
+ return false;
41450
+ }
41451
+ }
41370
41452
  execSync27(`"${pip}" install --quiet "${join54(repoDir, "moshi")}/."`, { timeout: 3e5, stdio: "pipe" });
41371
41453
  } catch (err) {
41372
41454
  log(`Moshi install failed: ${err instanceof Error ? err.message : String(err)}`);
@@ -41394,9 +41476,75 @@ async function installPersonaPlex(onInfo) {
41394
41476
  }
41395
41477
  } catch {
41396
41478
  }
41397
- log("PersonaPlex installed. Model will download on first launch (~14GB).");
41479
+ if (isAarch64) {
41480
+ log("ARM64: Installing bitsandbytes for INT4 inference...");
41481
+ try {
41482
+ execSync27(`"${pip}" install --quiet bitsandbytes`, { timeout: 12e4, stdio: "pipe" });
41483
+ } catch {
41484
+ }
41485
+ }
41486
+ try {
41487
+ execSync27(`"${pip}" install --quiet pyloudnorm noisereduce torchaudio`, { timeout: 12e4, stdio: "pipe" });
41488
+ } catch {
41489
+ }
41490
+ const tier = weightTier ?? detectPersonaPlexCapability().weightTier;
41491
+ const repoInfo = WEIGHT_REPOS[tier];
41492
+ log(`Weight tier: ${tier} (${repoInfo.sizeGB}GB) \u2014 ${repoInfo.needsToken ? "requires HF_TOKEN" : "public, no token needed"}`);
41493
+ log(`Downloading PersonaPlex weights (${repoInfo.sizeGB}GB)...`);
41494
+ try {
41495
+ const tokenArg = repoInfo.needsToken ? "" : "--token ''";
41496
+ const dlCmd = `"${python}" -c "from huggingface_hub import hf_hub_download; f=hf_hub_download('${repoInfo.repo}', '${repoInfo.file}'${repoInfo.needsToken ? "" : ", token=False"}); print(f)"`;
41497
+ const weightPath = execSync27(dlCmd, {
41498
+ encoding: "utf8",
41499
+ timeout: 6e5,
41500
+ stdio: "pipe",
41501
+ env: { ...process.env }
41502
+ }).trim();
41503
+ log(`Weights downloaded: ${repoInfo.file}`);
41504
+ if (tier !== "original") {
41505
+ log("Downloading Mimi codec and tokenizer...");
41506
+ try {
41507
+ const hasToken = !!(process.env["HF_TOKEN"] || process.env["HUGGING_FACE_HUB_TOKEN"]);
41508
+ if (hasToken) {
41509
+ execSync27(`"${python}" -c "from huggingface_hub import hf_hub_download; hf_hub_download('nvidia/personaplex-7b-v1', 'tokenizer_spm_32k_3.model'); hf_hub_download('nvidia/personaplex-7b-v1', 'tokenizer-e351c8d8-checkpoint125.safetensors')"`, {
41510
+ timeout: 3e5,
41511
+ stdio: "pipe"
41512
+ });
41513
+ log("Codec + tokenizer downloaded.");
41514
+ } else {
41515
+ log("Note: Mimi codec needs HF_TOKEN on first run (set HF_TOKEN env var).");
41516
+ log("Weights themselves are public \u2014 no token needed for the model.");
41517
+ }
41518
+ } catch {
41519
+ }
41520
+ }
41521
+ } catch (err) {
41522
+ const msg = err instanceof Error ? err.message : String(err);
41523
+ if (repoInfo.needsToken && /401|403|gated|unauthorized/i.test(msg)) {
41524
+ log(`HF_TOKEN required for ${tier} weights. Set HF_TOKEN or accept license at https://huggingface.co/${repoInfo.repo}`);
41525
+ if (tier === "original") {
41526
+ log("Auto-downgrading to INT4 weights (no token required)...");
41527
+ const nf4 = WEIGHT_REPOS["nf4"];
41528
+ try {
41529
+ execSync27(`"${python}" -c "from huggingface_hub import hf_hub_download; hf_hub_download('${nf4.repo}', '${nf4.file}', token=False)"`, {
41530
+ timeout: 6e5,
41531
+ stdio: "pipe"
41532
+ });
41533
+ writeFileSync16(join54(PERSONAPLEX_DIR, "weight_tier"), "nf4");
41534
+ log(`Downloaded INT4 weights instead (${nf4.sizeGB}GB, public).`);
41535
+ } catch {
41536
+ log("Weight download failed.");
41537
+ return false;
41538
+ }
41539
+ }
41540
+ } else {
41541
+ log(`Weight download failed: ${msg}`);
41542
+ log("Weights will download on first server launch.");
41543
+ }
41544
+ }
41545
+ writeFileSync16(join54(PERSONAPLEX_DIR, "weight_tier"), tier);
41398
41546
  writeFileSync16(join54(PERSONAPLEX_DIR, "model_ready"), (/* @__PURE__ */ new Date()).toISOString());
41399
- log("PersonaPlex installed successfully.");
41547
+ log(`PersonaPlex installed (${tier} tier). Use /call to start voice session.`);
41400
41548
  return true;
41401
41549
  }
41402
41550
  async function startPersonaPlexDaemon(onInfo) {
@@ -41719,10 +41867,11 @@ async function autoSetupPersonaPlex(onInfo) {
41719
41867
  log(`PersonaPlex not available: ${caps.reason}`);
41720
41868
  return null;
41721
41869
  }
41722
- log(`GPU: ${caps.gpuName} (${caps.vramGB.toFixed(0)}GB) \u2014 PersonaPlex compatible`);
41870
+ const tierInfo = WEIGHT_REPOS[caps.weightTier];
41871
+ log(`GPU: ${caps.gpuName} (${caps.vramGB.toFixed(0)}GB) \u2192 ${caps.weightTier} weights (${tierInfo.sizeGB}GB${caps.needsHfToken ? "" : ", no HF token needed"})`);
41723
41872
  if (!isPersonaPlexInstalled()) {
41724
41873
  log("Installing PersonaPlex (first time setup)...");
41725
- const ok = await installPersonaPlex(log);
41874
+ const ok = await installPersonaPlex(log, caps.weightTier);
41726
41875
  if (!ok) {
41727
41876
  log("PersonaPlex installation failed.");
41728
41877
  return null;
@@ -41742,11 +41891,16 @@ async function autoSetupPersonaPlex(onInfo) {
41742
41891
  }
41743
41892
  return await startPersonaPlexDaemon(log);
41744
41893
  }
41745
- var PERSONAPLEX_DIR, PID_FILE, PORT_FILE, LOG_FILE, CUSTOM_VOICES_DIR;
41894
+ var WEIGHT_REPOS, PERSONAPLEX_DIR, PID_FILE, PORT_FILE, LOG_FILE, CUSTOM_VOICES_DIR;
41746
41895
  var init_personaplex = __esm({
41747
41896
  "packages/cli/dist/tui/personaplex.js"() {
41748
41897
  "use strict";
41749
41898
  init_render();
41899
+ WEIGHT_REPOS = {
41900
+ original: { repo: "nvidia/personaplex-7b-v1", file: "model.safetensors", sizeGB: 15.6, needsToken: true },
41901
+ nf4: { repo: "cudabenchmarktest/personaplex-7b-nf4", file: "model-nf4.safetensors", sizeGB: 4.1, needsToken: false },
41902
+ turbo2bit: { repo: "cudabenchmarktest/personaplex-7b-turbo2bit", file: "model-turbo2bit.safetensors", sizeGB: 2.1, needsToken: false }
41903
+ };
41750
41904
  PERSONAPLEX_DIR = join54(homedir13(), ".open-agents", "voice", "personaplex");
41751
41905
  PID_FILE = join54(PERSONAPLEX_DIR, "daemon.pid");
41752
41906
  PORT_FILE = join54(PERSONAPLEX_DIR, "daemon.port");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "open-agents-ai",
3
- "version": "0.185.29",
3
+ "version": "0.185.31",
4
4
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -0,0 +1,167 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ quantize-weights.py — Quantize PersonaPlex 7B weights to INT4 (NF4) for edge devices.
4
+
5
+ Creates a ~3.5GB quantized checkpoint from the ~14GB bf16 weights.
6
+ The quantized model runs 3-4x faster on memory-bandwidth-limited devices
7
+ like Jetson AGX Orin while maintaining voice quality.
8
+
9
+ Usage:
10
+ python quantize-weights.py [--device cuda] [--output personaplex-7b-nf4.safetensors]
11
+
12
+ Requirements:
13
+ pip install bitsandbytes safetensors torch
14
+ """
15
+
16
+ import argparse
17
+ import os
18
+ import sys
19
+ import logging
20
+
21
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
22
+ log = logging.getLogger(__name__)
23
+
24
+
25
def quantize_model(device: str = "cuda", output_path: "str | None" = None):
    """Quantize the PersonaPlex 7B checkpoint to 4-bit and save it as safetensors.

    Downloads ``model.safetensors`` from ``nvidia/personaplex-7b-v1``, quantizes
    every tensor that has at least 2 dimensions and at least 1024 elements and
    whose name contains none of the skip markers, and stores all other tensors
    as fp16. Uses bitsandbytes NF4 when bitsandbytes can be imported; otherwise
    falls back to a manual symmetric INT4 scheme with block size 64.

    Args:
        device: Torch device on which the per-tensor quantization runs. When a
            CUDA device is requested but CUDA is unavailable, CPU is used.
        output_path: Destination file. Defaults to ``model-nf4.safetensors``
            next to the downloaded checkpoint.

    Returns:
        The path the quantized checkpoint was written to.
    """
    import torch
    from huggingface_hub import hf_hub_download
    from safetensors.torch import load_file, save_file

    hf_repo = "nvidia/personaplex-7b-v1"

    # Honor the requested device, but degrade to CPU instead of crashing on a
    # machine without CUDA.
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        log.info("CUDA requested but not available — quantizing on CPU")
        device = "cpu"

    # 1) Download original weights
    log.info("Downloading PersonaPlex 7B weights...")
    weight_path = hf_hub_download(hf_repo, "model.safetensors")
    log.info(f" Weights: {weight_path}")
    log.info(f" Size: {os.path.getsize(weight_path) / 1024**3:.1f} GB")

    # 2) Load state dict (kept on CPU; tensors are moved to `device` one at a time)
    log.info("Loading state dict...")
    state_dict = load_file(weight_path, device="cpu")
    log.info(f" Loaded {len(state_dict)} tensors")

    # 3) Pick the quantization backend
    try:
        import bitsandbytes as bnb
        has_bnb = True
    except ImportError:
        has_bnb = False
        log.info(" bitsandbytes not available — using manual INT4 quantization")
    scheme = "NF4" if has_bnb else "INT4"

    # Name fragments marking tensors that stay in fp16 (biases, norms, embeddings, ...).
    skip_markers = (
        "norm", "bias", "embed", "positional", "rope",
        "depformer_emb", "depformer_in",
    )

    quantized_state = {}
    quant_meta = {}  # Shape/block metadata, merged into the output after the loop
    total_original = 0
    skipped = 0

    for name, tensor in state_dict.items():
        total_original += tensor.numel() * tensor.element_size()

        should_quantize = (
            tensor.ndim >= 2
            and tensor.numel() >= 1024
            and not any(marker in name for marker in skip_markers)
        )
        if not should_quantize:
            quantized_state[name] = tensor.to(torch.float16).contiguous()
            skipped += 1
            continue

        orig_shape = list(tensor.shape)
        # Original shape, zero-padded to at least 4 entries.
        shape_meta = orig_shape + [0] * (4 - len(orig_shape))
        flat = tensor.reshape(-1).float().to(device)

        if has_bnb:
            # NOTE(review): compress_statistics=True requests nested quantization
            # of the scaling statistics in bitsandbytes; only the `absmax` and
            # `quant_map` attributes are persisted below — confirm the loader
            # can reconstruct the weights from these alone.
            quant_tensor, quant_state = bnb.functional.quantize_4bit(
                flat, quant_type="nf4", compress_statistics=True,
            )
            # Results are moved back to CPU so device memory is released per tensor.
            quantized_state[name] = quant_tensor.cpu().contiguous()
            quant_meta[f"{name}.__quant_state__"] = torch.tensor(
                shape_meta, dtype=torch.int64,
            )
            if hasattr(quant_state, "absmax"):
                quantized_state[f"{name}.__absmax__"] = quant_state.absmax.cpu().contiguous()
            # NOTE(review): verify the installed bitsandbytes exposes `quant_map`
            # on the returned state; if it does not, nothing is stored here.
            if hasattr(quant_state, "quant_map"):
                quantized_state[f"{name}.__quant_map__"] = quant_state.quant_map.cpu().contiguous()
        else:
            # Manual symmetric INT4 quantization in blocks of 64 values.
            block_size = 64
            n_blocks = (flat.numel() + block_size - 1) // block_size
            padded = torch.zeros(n_blocks * block_size, dtype=flat.dtype, device=flat.device)
            padded[:flat.numel()] = flat

            blocks = padded.reshape(n_blocks, block_size)
            # One scale per block; INT4 range is -8..7, and the clamp avoids
            # dividing by zero for all-zero blocks.
            scales = (blocks.abs().max(dim=1).values / 7.0).clamp(min=1e-8)
            quantized_blocks = (
                torch.round(blocks / scales.unsqueeze(1)).clamp(-8, 7).to(torch.int8)
            )

            # Shift to 0..15 and pack adjacent pairs into one byte:
            # even column -> low nibble, odd column -> high nibble.
            nibbles = (quantized_blocks + 8).to(torch.uint8)
            packed = nibbles[:, 0::2] | (nibbles[:, 1::2] << 4)

            quantized_state[name] = packed.reshape(-1).cpu().contiguous()
            quantized_state[f"{name}.__scales__"] = scales.to(torch.float16).cpu().contiguous()
            quant_meta[f"{name}.__quant_state__"] = torch.tensor(
                shape_meta + [block_size, flat.numel()],
                dtype=torch.int64,
            )

    # Add metadata tensors
    quantized_state.update(quant_meta)

    # 4) Save quantized weights
    if output_path is None:
        output_path = os.path.join(os.path.dirname(weight_path), "model-nf4.safetensors")

    log.info(f"\nSaving quantized weights to: {output_path}")
    save_file(quantized_state, output_path)

    final_size = os.path.getsize(output_path)
    compression = total_original / max(final_size, 1)

    log.info("\nQuantization complete!")
    log.info(f" Original: {total_original / 1024**3:.1f} GB (bf16)")
    # Report the scheme that actually ran rather than always claiming NF4.
    log.info(f" Quantized: {final_size / 1024**3:.1f} GB ({scheme})")
    log.info(f" Compression: {compression:.1f}x")
    log.info(f" Tensors quantized: {len(state_dict) - skipped}/{len(state_dict)}")
    log.info(f" Tensors kept fp16: {skipped} (norms, biases, embeddings)")
    log.info("\nUse --quantized flag with PersonaPlex server for INT4 inference")

    return output_path
153
+
154
+
155
def main():
    """Parse the command-line flags and run the quantization with autograd disabled."""
    cli = argparse.ArgumentParser(description="Quantize PersonaPlex 7B to INT4 NF4")
    cli.add_argument("--device", default="cuda", help="Device for quantization")
    cli.add_argument("--output", "-o", default=None, help="Output path for quantized weights")
    opts = cli.parse_args()

    # torch is imported only after argument parsing, so flag handling
    # (including --help) does not depend on torch being importable.
    import torch

    with torch.no_grad():
        quantize_model(device=opts.device, output_path=opts.output)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()