open-agents-ai 0.185.29 → 0.185.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +167 -13
- package/package.json +1 -1
- package/voices/personaplex/quantize-weights.py +167 -0
package/dist/index.js
CHANGED
|
@@ -41270,6 +41270,8 @@ __export(personaplex_exports, {
|
|
|
41270
41270
|
clonePersonaPlexVoice: () => clonePersonaPlexVoice,
|
|
41271
41271
|
detectPersonaPlexCapability: () => detectPersonaPlexCapability,
|
|
41272
41272
|
getPersonaPlexWSUrl: () => getPersonaPlexWSUrl,
|
|
41273
|
+
getWeightRepoInfo: () => getWeightRepoInfo,
|
|
41274
|
+
getWeightTier: () => getWeightTier,
|
|
41273
41275
|
installPersonaPlex: () => installPersonaPlex,
|
|
41274
41276
|
isPersonaPlexInstalled: () => isPersonaPlexInstalled,
|
|
41275
41277
|
isPersonaPlexRunning: () => isPersonaPlexRunning,
|
|
@@ -41284,6 +41286,13 @@ import { join as join54, dirname as dirname18 } from "node:path";
|
|
|
41284
41286
|
import { homedir as homedir13 } from "node:os";
|
|
41285
41287
|
import { execSync as execSync27, spawn as spawn19 } from "node:child_process";
|
|
41286
41288
|
import { fileURLToPath as fileURLToPath11 } from "node:url";
|
|
41289
|
+
/**
 * Pick the PersonaPlex weight variant for a given amount of GPU memory.
 *
 * The thresholds are inclusive lower bounds: 48 GB and up selects the
 * "original" checkpoint, 16 GB and up selects "nf4", and everything else
 * selects "turbo2bit".
 *
 * @param {number} vramGB - Available GPU memory in gigabytes.
 * @returns {"original" | "nf4" | "turbo2bit"} Tier name (a WEIGHT_REPOS key).
 */
function selectWeightTier(vramGB) {
  if (vramGB >= 48) {
    return "original";
  }
  if (vramGB >= 16) {
    return "nf4";
  }
  // Comparisons against NaN are false, so an unparseable VRAM figure also
  // ends up on the smallest tier.
  return "turbo2bit";
}
|
|
41287
41296
|
function detectPersonaPlexCapability() {
|
|
41288
41297
|
try {
|
|
41289
41298
|
const nvsmi = execSync27("nvidia-smi --query-gpu=name,memory.total --format=csv,noheader,nounits", {
|
|
@@ -41292,12 +41301,23 @@ function detectPersonaPlexCapability() {
|
|
|
41292
41301
|
stdio: "pipe"
|
|
41293
41302
|
}).trim();
|
|
41294
41303
|
if (!nvsmi) {
|
|
41295
|
-
return { supported: false, reason: "No NVIDIA GPU detected", gpuName: "", vramGB: 0 };
|
|
41304
|
+
return { supported: false, reason: "No NVIDIA GPU detected", gpuName: "", vramGB: 0, weightTier: "turbo2bit", needsHfToken: false };
|
|
41296
41305
|
}
|
|
41297
41306
|
const [gpuName, vramMB] = nvsmi.split("\n")[0].split(", ");
|
|
41298
|
-
|
|
41299
|
-
|
|
41300
|
-
|
|
41307
|
+
let vramGB = parseInt(vramMB ?? "0", 10) / 1024;
|
|
41308
|
+
const isJetson = /orin|tegra|jetson/i.test(gpuName ?? "");
|
|
41309
|
+
if (isJetson) {
|
|
41310
|
+
try {
|
|
41311
|
+
const memInfo = execSync27("grep MemTotal /proc/meminfo", { encoding: "utf8", timeout: 3e3, stdio: "pipe" });
|
|
41312
|
+
const memKB = parseInt(memInfo.match(/(\d+)/)?.[1] ?? "0", 10);
|
|
41313
|
+
const totalGB = memKB / 1024 / 1024;
|
|
41314
|
+
if (totalGB > vramGB)
|
|
41315
|
+
vramGB = totalGB;
|
|
41316
|
+
} catch {
|
|
41317
|
+
}
|
|
41318
|
+
}
|
|
41319
|
+
if (vramGB < 8) {
|
|
41320
|
+
return { supported: false, reason: `GPU has ${vramGB.toFixed(1)}GB VRAM (need \u22658GB for 2-bit weights)`, gpuName: gpuName ?? "", vramGB, weightTier: "turbo2bit", needsHfToken: false };
|
|
41301
41321
|
}
|
|
41302
41322
|
try {
|
|
41303
41323
|
execSync27('python3 -c "import torch; assert torch.cuda.is_available()"', {
|
|
@@ -41305,11 +41325,22 @@ function detectPersonaPlexCapability() {
|
|
|
41305
41325
|
stdio: "pipe"
|
|
41306
41326
|
});
|
|
41307
41327
|
} catch {
|
|
41308
|
-
|
|
41328
|
+
const tier2 = selectWeightTier(vramGB);
|
|
41329
|
+
return { supported: false, reason: "PyTorch CUDA not available", gpuName: gpuName ?? "", vramGB, weightTier: tier2, needsHfToken: WEIGHT_REPOS[tier2].needsToken };
|
|
41309
41330
|
}
|
|
41310
|
-
|
|
41331
|
+
const tier = selectWeightTier(vramGB);
|
|
41332
|
+
const hasHfToken = !!(process.env["HF_TOKEN"] || process.env["HUGGING_FACE_HUB_TOKEN"]);
|
|
41333
|
+
const effectiveTier = tier === "original" && !hasHfToken ? "nf4" : tier;
|
|
41334
|
+
return {
|
|
41335
|
+
supported: true,
|
|
41336
|
+
reason: `OK \u2014 ${effectiveTier} weights (${WEIGHT_REPOS[effectiveTier].sizeGB}GB)`,
|
|
41337
|
+
gpuName: gpuName ?? "",
|
|
41338
|
+
vramGB,
|
|
41339
|
+
weightTier: effectiveTier,
|
|
41340
|
+
needsHfToken: WEIGHT_REPOS[effectiveTier].needsToken
|
|
41341
|
+
};
|
|
41311
41342
|
} catch {
|
|
41312
|
-
return { supported: false, reason: "nvidia-smi not found", gpuName: "", vramGB: 0 };
|
|
41343
|
+
return { supported: false, reason: "nvidia-smi not found", gpuName: "", vramGB: 0, weightTier: "turbo2bit", needsHfToken: false };
|
|
41313
41344
|
}
|
|
41314
41345
|
}
|
|
41315
41346
|
function isPersonaPlexRunning() {
|
|
@@ -41336,7 +41367,19 @@ function getPersonaPlexWSUrl() {
|
|
|
41336
41367
|
function isPersonaPlexInstalled() {
|
|
41337
41368
|
return existsSync37(join54(PERSONAPLEX_DIR, "model_ready"));
|
|
41338
41369
|
}
|
|
41339
|
-
|
|
41370
|
+
/**
 * Resolve which weight tier this installation should use.
 *
 * The tier recorded in `<PERSONAPLEX_DIR>/weight_tier` wins when the file
 * exists and names a known tier. Otherwise the tier reported by
 * detectPersonaPlexCapability() is returned.
 *
 * @returns {string} Tier name.
 */
function getWeightTier() {
  const tierFile = join54(PERSONAPLEX_DIR, "weight_tier");
  if (existsSync37(tierFile)) {
    const saved = readFileSync28(tierFile, "utf8").trim();
    // Own-property check: a bare `in` would also accept inherited names such
    // as "toString" or "constructor", which are not tiers.
    if (Object.prototype.hasOwnProperty.call(WEIGHT_REPOS, saved)) {
      return saved;
    }
  }
  return detectPersonaPlexCapability().weightTier;
}
|
|
41379
|
+
/**
 * Look up the download metadata for a weight tier.
 *
 * @param {string} tier - Tier name, e.g. "original", "nf4" or "turbo2bit".
 * @returns {{repo: string, file: string, sizeGB: number, needsToken: boolean} | undefined}
 *   The matching WEIGHT_REPOS entry, or undefined when the tier is unknown.
 */
function getWeightRepoInfo(tier) {
  // Only own keys count; a plain index would hand back Object.prototype
  // members for names like "constructor".
  if (!Object.prototype.hasOwnProperty.call(WEIGHT_REPOS, tier)) {
    return undefined;
  }
  return WEIGHT_REPOS[tier];
}
|
|
41382
|
+
async function installPersonaPlex(onInfo, weightTier) {
|
|
41340
41383
|
const log = onInfo ?? (() => {
|
|
41341
41384
|
});
|
|
41342
41385
|
mkdirSync15(PERSONAPLEX_DIR, { recursive: true });
|
|
@@ -41352,6 +41395,14 @@ async function installPersonaPlex(onInfo) {
|
|
|
41352
41395
|
}
|
|
41353
41396
|
const pip = process.platform === "win32" ? join54(venvDir, "Scripts", "pip.exe") : join54(venvDir, "bin", "pip");
|
|
41354
41397
|
const python = process.platform === "win32" ? join54(venvDir, "Scripts", "python.exe") : join54(venvDir, "bin", "python3");
|
|
41398
|
+
let arch2 = "";
|
|
41399
|
+
try {
|
|
41400
|
+
arch2 = execSync27("uname -m", { encoding: "utf8", timeout: 3e3, stdio: "pipe" }).trim();
|
|
41401
|
+
} catch {
|
|
41402
|
+
}
|
|
41403
|
+
const isAarch64 = arch2 === "aarch64" || arch2 === "arm64";
|
|
41404
|
+
if (isAarch64)
|
|
41405
|
+
log(`Detected ARM64 platform (${arch2}) \u2014 Jetson/ARM install path`);
|
|
41355
41406
|
log("Checking system dependencies (libopus)...");
|
|
41356
41407
|
try {
|
|
41357
41408
|
if (process.platform === "linux") {
|
|
@@ -41361,12 +41412,43 @@ async function installPersonaPlex(onInfo) {
|
|
|
41361
41412
|
}
|
|
41362
41413
|
} catch {
|
|
41363
41414
|
}
|
|
41415
|
+
if (isAarch64) {
|
|
41416
|
+
log("ARM64: Checking Rust toolchain for sphn build...");
|
|
41417
|
+
try {
|
|
41418
|
+
execSync27("rustc --version", { timeout: 5e3, stdio: "pipe" });
|
|
41419
|
+
} catch {
|
|
41420
|
+
log("ARM64: Installing Rust toolchain (needed for sphn audio codec)...");
|
|
41421
|
+
try {
|
|
41422
|
+
execSync27("curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y", { timeout: 12e4, stdio: "pipe" });
|
|
41423
|
+
} catch (e) {
|
|
41424
|
+
log(`Rust install failed: ${e instanceof Error ? e.message : String(e)}`);
|
|
41425
|
+
log("Install Rust manually: curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh");
|
|
41426
|
+
return false;
|
|
41427
|
+
}
|
|
41428
|
+
}
|
|
41429
|
+
try {
|
|
41430
|
+
execSync27(`"${pip}" install --quiet maturin`, { timeout: 6e4, stdio: "pipe" });
|
|
41431
|
+
} catch {
|
|
41432
|
+
}
|
|
41433
|
+
}
|
|
41364
41434
|
log("Installing PersonaPlex (moshi package)...");
|
|
41365
41435
|
const repoDir = join54(PERSONAPLEX_DIR, "personaplex-repo");
|
|
41366
41436
|
try {
|
|
41367
41437
|
if (!existsSync37(repoDir)) {
|
|
41368
41438
|
execSync27(`git clone https://github.com/NVIDIA/personaplex.git "${repoDir}"`, { timeout: 12e4, stdio: "pipe" });
|
|
41369
41439
|
}
|
|
41440
|
+
if (isAarch64) {
|
|
41441
|
+
log("ARM64: Building sphn from source (Opus codec bindings)...");
|
|
41442
|
+
try {
|
|
41443
|
+
const rustEnv = `export PATH="$HOME/.cargo/bin:$PATH" &&`;
|
|
41444
|
+
execSync27(`${rustEnv} "${pip}" install --quiet --no-binary sphn sphn`, { timeout: 3e5, stdio: "pipe", shell: "/bin/bash" });
|
|
41445
|
+
log("ARM64: sphn built successfully");
|
|
41446
|
+
} catch (e) {
|
|
41447
|
+
log(`ARM64: sphn build failed \u2014 ${e instanceof Error ? e.message : String(e)}`);
|
|
41448
|
+
log("Ensure Rust, libopus-dev, and cmake are installed.");
|
|
41449
|
+
return false;
|
|
41450
|
+
}
|
|
41451
|
+
}
|
|
41370
41452
|
execSync27(`"${pip}" install --quiet "${join54(repoDir, "moshi")}/."`, { timeout: 3e5, stdio: "pipe" });
|
|
41371
41453
|
} catch (err) {
|
|
41372
41454
|
log(`Moshi install failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
@@ -41394,9 +41476,75 @@ async function installPersonaPlex(onInfo) {
|
|
|
41394
41476
|
}
|
|
41395
41477
|
} catch {
|
|
41396
41478
|
}
|
|
41397
|
-
|
|
41479
|
+
if (isAarch64) {
|
|
41480
|
+
log("ARM64: Installing bitsandbytes for INT4 inference...");
|
|
41481
|
+
try {
|
|
41482
|
+
execSync27(`"${pip}" install --quiet bitsandbytes`, { timeout: 12e4, stdio: "pipe" });
|
|
41483
|
+
} catch {
|
|
41484
|
+
}
|
|
41485
|
+
}
|
|
41486
|
+
try {
|
|
41487
|
+
execSync27(`"${pip}" install --quiet pyloudnorm noisereduce torchaudio`, { timeout: 12e4, stdio: "pipe" });
|
|
41488
|
+
} catch {
|
|
41489
|
+
}
|
|
41490
|
+
const tier = weightTier ?? detectPersonaPlexCapability().weightTier;
|
|
41491
|
+
const repoInfo = WEIGHT_REPOS[tier];
|
|
41492
|
+
log(`Weight tier: ${tier} (${repoInfo.sizeGB}GB) \u2014 ${repoInfo.needsToken ? "requires HF_TOKEN" : "public, no token needed"}`);
|
|
41493
|
+
log(`Downloading PersonaPlex weights (${repoInfo.sizeGB}GB)...`);
|
|
41494
|
+
try {
|
|
41495
|
+
const tokenArg = repoInfo.needsToken ? "" : "--token ''";
|
|
41496
|
+
const dlCmd = `"${python}" -c "from huggingface_hub import hf_hub_download; f=hf_hub_download('${repoInfo.repo}', '${repoInfo.file}'${repoInfo.needsToken ? "" : ", token=False"}); print(f)"`;
|
|
41497
|
+
const weightPath = execSync27(dlCmd, {
|
|
41498
|
+
encoding: "utf8",
|
|
41499
|
+
timeout: 6e5,
|
|
41500
|
+
stdio: "pipe",
|
|
41501
|
+
env: { ...process.env }
|
|
41502
|
+
}).trim();
|
|
41503
|
+
log(`Weights downloaded: ${repoInfo.file}`);
|
|
41504
|
+
if (tier !== "original") {
|
|
41505
|
+
log("Downloading Mimi codec and tokenizer...");
|
|
41506
|
+
try {
|
|
41507
|
+
const hasToken = !!(process.env["HF_TOKEN"] || process.env["HUGGING_FACE_HUB_TOKEN"]);
|
|
41508
|
+
if (hasToken) {
|
|
41509
|
+
execSync27(`"${python}" -c "from huggingface_hub import hf_hub_download; hf_hub_download('nvidia/personaplex-7b-v1', 'tokenizer_spm_32k_3.model'); hf_hub_download('nvidia/personaplex-7b-v1', 'tokenizer-e351c8d8-checkpoint125.safetensors')"`, {
|
|
41510
|
+
timeout: 3e5,
|
|
41511
|
+
stdio: "pipe"
|
|
41512
|
+
});
|
|
41513
|
+
log("Codec + tokenizer downloaded.");
|
|
41514
|
+
} else {
|
|
41515
|
+
log("Note: Mimi codec needs HF_TOKEN on first run (set HF_TOKEN env var).");
|
|
41516
|
+
log("Weights themselves are public \u2014 no token needed for the model.");
|
|
41517
|
+
}
|
|
41518
|
+
} catch {
|
|
41519
|
+
}
|
|
41520
|
+
}
|
|
41521
|
+
} catch (err) {
|
|
41522
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
41523
|
+
if (repoInfo.needsToken && /401|403|gated|unauthorized/i.test(msg)) {
|
|
41524
|
+
log(`HF_TOKEN required for ${tier} weights. Set HF_TOKEN or accept license at https://huggingface.co/${repoInfo.repo}`);
|
|
41525
|
+
if (tier === "original") {
|
|
41526
|
+
log("Auto-downgrading to INT4 weights (no token required)...");
|
|
41527
|
+
const nf4 = WEIGHT_REPOS["nf4"];
|
|
41528
|
+
try {
|
|
41529
|
+
execSync27(`"${python}" -c "from huggingface_hub import hf_hub_download; hf_hub_download('${nf4.repo}', '${nf4.file}', token=False)"`, {
|
|
41530
|
+
timeout: 6e5,
|
|
41531
|
+
stdio: "pipe"
|
|
41532
|
+
});
|
|
41533
|
+
writeFileSync16(join54(PERSONAPLEX_DIR, "weight_tier"), "nf4");
|
|
41534
|
+
log(`Downloaded INT4 weights instead (${nf4.sizeGB}GB, public).`);
|
|
41535
|
+
} catch {
|
|
41536
|
+
log("Weight download failed.");
|
|
41537
|
+
return false;
|
|
41538
|
+
}
|
|
41539
|
+
}
|
|
41540
|
+
} else {
|
|
41541
|
+
log(`Weight download failed: ${msg}`);
|
|
41542
|
+
log("Weights will download on first server launch.");
|
|
41543
|
+
}
|
|
41544
|
+
}
|
|
41545
|
+
writeFileSync16(join54(PERSONAPLEX_DIR, "weight_tier"), tier);
|
|
41398
41546
|
writeFileSync16(join54(PERSONAPLEX_DIR, "model_ready"), (/* @__PURE__ */ new Date()).toISOString());
|
|
41399
|
-
log(
|
|
41547
|
+
log(`PersonaPlex installed (${tier} tier). Use /call to start voice session.`);
|
|
41400
41548
|
return true;
|
|
41401
41549
|
}
|
|
41402
41550
|
async function startPersonaPlexDaemon(onInfo) {
|
|
@@ -41719,10 +41867,11 @@ async function autoSetupPersonaPlex(onInfo) {
|
|
|
41719
41867
|
log(`PersonaPlex not available: ${caps.reason}`);
|
|
41720
41868
|
return null;
|
|
41721
41869
|
}
|
|
41722
|
-
|
|
41870
|
+
const tierInfo = WEIGHT_REPOS[caps.weightTier];
|
|
41871
|
+
log(`GPU: ${caps.gpuName} (${caps.vramGB.toFixed(0)}GB) \u2192 ${caps.weightTier} weights (${tierInfo.sizeGB}GB${caps.needsHfToken ? "" : ", no HF token needed"})`);
|
|
41723
41872
|
if (!isPersonaPlexInstalled()) {
|
|
41724
41873
|
log("Installing PersonaPlex (first time setup)...");
|
|
41725
|
-
const ok = await installPersonaPlex(log);
|
|
41874
|
+
const ok = await installPersonaPlex(log, caps.weightTier);
|
|
41726
41875
|
if (!ok) {
|
|
41727
41876
|
log("PersonaPlex installation failed.");
|
|
41728
41877
|
return null;
|
|
@@ -41742,11 +41891,16 @@ async function autoSetupPersonaPlex(onInfo) {
|
|
|
41742
41891
|
}
|
|
41743
41892
|
return await startPersonaPlexDaemon(log);
|
|
41744
41893
|
}
|
|
41745
|
-
var PERSONAPLEX_DIR, PID_FILE, PORT_FILE, LOG_FILE, CUSTOM_VOICES_DIR;
|
|
41894
|
+
var WEIGHT_REPOS, PERSONAPLEX_DIR, PID_FILE, PORT_FILE, LOG_FILE, CUSTOM_VOICES_DIR;
|
|
41746
41895
|
var init_personaplex = __esm({
|
|
41747
41896
|
"packages/cli/dist/tui/personaplex.js"() {
|
|
41748
41897
|
"use strict";
|
|
41749
41898
|
init_render();
|
|
41899
|
+
WEIGHT_REPOS = {
|
|
41900
|
+
original: { repo: "nvidia/personaplex-7b-v1", file: "model.safetensors", sizeGB: 15.6, needsToken: true },
|
|
41901
|
+
nf4: { repo: "cudabenchmarktest/personaplex-7b-nf4", file: "model-nf4.safetensors", sizeGB: 4.1, needsToken: false },
|
|
41902
|
+
turbo2bit: { repo: "cudabenchmarktest/personaplex-7b-turbo2bit", file: "model-turbo2bit.safetensors", sizeGB: 2.1, needsToken: false }
|
|
41903
|
+
};
|
|
41750
41904
|
PERSONAPLEX_DIR = join54(homedir13(), ".open-agents", "voice", "personaplex");
|
|
41751
41905
|
PID_FILE = join54(PERSONAPLEX_DIR, "daemon.pid");
|
|
41752
41906
|
PORT_FILE = join54(PERSONAPLEX_DIR, "daemon.port");
|
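package/package.json
CHANGED
(hunk not shown here; +1 -1 per the summary above)
package/voices/personaplex/quantize-weights.py
ADDED
|
@@ -0,0 +1,167 @@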
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
quantize-weights.py — Quantize PersonaPlex 7B weights to INT4 (NF4) for edge devices.
|
|
4
|
+
|
|
5
|
+
Creates a ~3.5GB quantized checkpoint from the ~14GB bf16 weights.
|
|
6
|
+
The quantized model runs 3-4x faster on memory-bandwidth-limited devices
|
|
7
|
+
like Jetson AGX Orin while maintaining voice quality.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python quantize-weights.py [--device cuda] [--output personaplex-7b-nf4.safetensors]
|
|
11
|
+
|
|
12
|
+
Requirements:
|
|
13
|
+
pip install bitsandbytes safetensors torch
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import os
|
|
18
|
+
import sys
|
|
19
|
+
import logging
|
|
20
|
+
|
|
21
|
+
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
|
22
|
+
log = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def quantize_model(device: str = "cuda", output_path: str = None):
    """Quantize the PersonaPlex 7B checkpoint to 4 bits and save it as safetensors.

    Downloads ``model.safetensors`` from ``nvidia/personaplex-7b-v1``, packs every
    large weight matrix to 4 bits (bitsandbytes NF4 when importable, otherwise a
    manual symmetric INT4 scheme with block size 64) and stores all remaining
    tensors as fp16.

    Args:
        device: Torch device the bitsandbytes kernels run on. Falls back to
            ``"cpu"`` when a CUDA device is requested but none is available.
            The manual INT4 path always runs on the CPU.
        output_path: Destination file. Defaults to ``model-nf4.safetensors``
            next to the downloaded checkpoint.

    Returns:
        The path the quantized checkpoint was written to.
    """
    import torch
    from huggingface_hub import hf_hub_download
    from safetensors.torch import load_file, save_file

    hf_repo = "nvidia/personaplex-7b-v1"

    # Degrade to CPU rather than fail on hosts without a usable GPU.
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        log.info(" CUDA not available — running quantization on CPU")
        device = "cpu"

    # 1) Download original weights
    log.info("Downloading PersonaPlex 7B weights...")
    weight_path = hf_hub_download(hf_repo, "model.safetensors")
    log.info(f" Weights: {weight_path}")
    log.info(f" Size: {os.path.getsize(weight_path) / 1024**3:.1f} GB")

    # 2) Load state dict
    log.info("Loading state dict...")
    state_dict = load_file(weight_path, device="cpu")
    log.info(f" Loaded {len(state_dict)} tensors")

    # 3) Quantize each weight tensor to INT4 using block-wise NF4.
    # Only the package itself is imported: pulling extra names from
    # bitsandbytes.functional could raise ImportError on a version that lacks
    # them and silently disable the bitsandbytes path.
    try:
        import bitsandbytes as bnb
        HAS_BNB = True
    except ImportError:
        HAS_BNB = False
        log.info(" bitsandbytes not available — using manual INT4 quantization")

    quantized_state = {}
    quant_meta = {}  # Shape/layout records needed to undo the packing
    total_original = 0
    skipped = 0
    skip_markers = (
        "norm", "bias", "embed", "positional", "rope",
        "depformer_emb", "depformer_in",
    )

    for name, tensor in state_dict.items():
        total_original += tensor.numel() * tensor.element_size()

        # Only quantize large weight matrices (>=1024 elements, at least 2-D).
        # Skip biases, norms, embeddings, small tensors.
        should_quantize = (
            tensor.ndim >= 2
            and tensor.numel() >= 1024
            and not any(marker in name for marker in skip_markers)
        )

        if not should_quantize:
            quantized_state[name] = tensor.to(torch.float16).contiguous()
            skipped += 1
            continue

        # Quantization works on the flattened tensor; the original shape is
        # recorded separately so it can be restored.
        orig_shape = tensor.shape
        flat = tensor.reshape(-1).float()
        # Shape padded with zeros up to four dimensions.
        padded_shape = list(orig_shape) + [0] * (4 - len(orig_shape))

        if HAS_BNB:
            # Run the bitsandbytes kernel on the requested device, then bring
            # everything back to the CPU for serialization.
            # NOTE(review): with compress_statistics=True bitsandbytes is
            # understood to quantize absmax a second time; that nested state
            # is not persisted here — confirm the loader can rebuild it.
            quant_tensor, quant_state = bnb.functional.quantize_4bit(
                flat.to(device), quant_type="nf4", compress_statistics=True,
            )
            quantized_state[name] = quant_tensor.cpu().contiguous()
            quant_meta[f"{name}.__quant_state__"] = torch.tensor(
                padded_shape, dtype=torch.int64,
            )

            absmax = getattr(quant_state, "absmax", None)
            if absmax is not None:
                quantized_state[f"{name}.__absmax__"] = absmax.cpu().contiguous()

            # NOTE(review): newer bitsandbytes releases appear to expose the
            # lookup table as `code` rather than `quant_map` — confirm.
            quant_map = getattr(quant_state, "quant_map", None)
            if quant_map is None:
                quant_map = getattr(quant_state, "code", None)
            if quant_map is not None:
                # clone() so each saved entry owns its storage; safetensors is
                # understood to reject tensors that share memory.
                quantized_state[f"{name}.__quant_map__"] = quant_map.cpu().clone().contiguous()
        else:
            # Manual symmetric INT4 quantization (no bitsandbytes)
            # Block size 64 for good accuracy
            block_size = 64
            n_blocks = (flat.numel() + block_size - 1) // block_size
            padded = torch.zeros(n_blocks * block_size)
            padded[:flat.numel()] = flat

            blocks = padded.reshape(n_blocks, block_size)
            scales = blocks.abs().max(dim=1).values / 7.0  # INT4 range: -8 to 7
            scales = scales.clamp(min=1e-8)

            # Quantize to INT4, then shift to 0..15 so each value fits a nibble.
            quantized_blocks = torch.round(blocks / scales.unsqueeze(1)).clamp(-8, 7).to(torch.int8)
            nibbles = (quantized_blocks + 8).to(torch.uint8)

            # Pack two INT4 values into one byte: even columns become the low
            # nibble, odd columns the high nibble.
            packed = nibbles[:, 0::2] | (nibbles[:, 1::2] << 4)

            quantized_state[name] = packed.reshape(-1).contiguous()
            quantized_state[f"{name}.__scales__"] = scales.to(torch.float16).contiguous()
            quant_meta[f"{name}.__quant_state__"] = torch.tensor(
                padded_shape + [block_size, flat.numel()],
                dtype=torch.int64,
            )

    # Add metadata tensors
    quantized_state.update(quant_meta)

    # 4) Save quantized weights
    if output_path is None:
        output_path = os.path.join(os.path.dirname(weight_path), "model-nf4.safetensors")

    log.info(f"\nSaving quantized weights to: {output_path}")
    save_file(quantized_state, output_path)

    final_size = os.path.getsize(output_path)
    compression = total_original / max(final_size, 1)

    log.info("\nQuantization complete!")
    log.info(f" Original: {total_original / 1024**3:.1f} GB (bf16)")
    log.info(f" Quantized: {final_size / 1024**3:.1f} GB (NF4)")
    log.info(f" Compression: {compression:.1f}x")
    log.info(f" Tensors quantized: {len(state_dict) - skipped}/{len(state_dict)}")
    log.info(f" Tensors kept fp16: {skipped} (norms, biases, embeddings)")
    log.info("\nUse --quantized flag with PersonaPlex server for INT4 inference")

    return output_path
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def main():
    """Command-line entry point: parse ``--device``/``--output`` and run the quantizer."""
    cli = argparse.ArgumentParser(description="Quantize PersonaPlex 7B to INT4 NF4")
    cli.add_argument("--device", default="cuda", help="Device for quantization")
    cli.add_argument("--output", "-o", default=None, help="Output path for quantized weights")
    options = cli.parse_args()

    # torch is imported only after the arguments have been parsed.
    import torch

    # Run the whole conversion with gradient tracking disabled.
    with torch.no_grad():
        quantize_model(device=options.device, output_path=options.output)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
if __name__ == "__main__":
|
|
167
|
+
main()
|