vlora-dev 0.2.1__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/.gitignore +1 -3
- vlora_dev-0.3.0/CHANGELOG.md +54 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/PKG-INFO +86 -10
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/README.md +84 -8
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/docs/index.md +2 -2
- vlora_dev-0.3.0/examples/qlora_pipeline.py +154 -0
- vlora_dev-0.3.0/icon.png +0 -0
- vlora_dev-0.3.0/logo.png +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/pyproject.toml +2 -2
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/__init__.py +10 -1
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/integrations/huggingface.py +68 -5
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/io.py +11 -4
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/merge.py +2 -2
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/model.py +87 -6
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/ops.py +181 -14
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/subspace.py +398 -38
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_cli.py +57 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_compression.py +61 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_huggingface.py +79 -6
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_incremental.py +44 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_model.py +43 -0
- vlora_dev-0.3.0/tests/test_ops.py +256 -0
- vlora_dev-0.3.0/tests/test_subspace.py +303 -0
- vlora_dev-0.2.1/logo.png +0 -0
- vlora_dev-0.2.1/tests/test_ops.py +0 -121
- vlora_dev-0.2.1/tests/test_subspace.py +0 -146
- vlora_dev-0.2.1/website/.firebase/hosting.ZGlzdA.cache +0 -5
- vlora_dev-0.2.1/website/.firebaserc +0 -15
- vlora_dev-0.2.1/website/astro.config.mjs +0 -11
- vlora_dev-0.2.1/website/firebase.json +0 -37
- vlora_dev-0.2.1/website/package-lock.json +0 -6124
- vlora_dev-0.2.1/website/package.json +0 -17
- vlora_dev-0.2.1/website/public/favicon.png +0 -0
- vlora_dev-0.2.1/website/public/logo.png +0 -0
- vlora_dev-0.2.1/website/public/og-card.png +0 -0
- vlora_dev-0.2.1/website/src/components/Algorithm.astro +0 -80
- vlora_dev-0.2.1/website/src/components/Benchmarks.astro +0 -120
- vlora_dev-0.2.1/website/src/components/CodeExample.astro +0 -56
- vlora_dev-0.2.1/website/src/components/Features.astro +0 -99
- vlora_dev-0.2.1/website/src/components/Footer.astro +0 -27
- vlora_dev-0.2.1/website/src/components/Header.astro +0 -98
- vlora_dev-0.2.1/website/src/components/Hero.astro +0 -98
- vlora_dev-0.2.1/website/src/layouts/BaseLayout.astro +0 -50
- vlora_dev-0.2.1/website/src/pages/index.astro +0 -22
- vlora_dev-0.2.1/website/src/styles/global.css +0 -45
- vlora_dev-0.2.1/website/tsconfig.json +0 -5
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/.github/workflows/ci.yml +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/.github/workflows/release.yml +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/.pre-commit-config.yaml +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/LICENSE +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/docs/api.md +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/docs/guide_ollama.md +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/docs/guide_tgi.md +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/docs/guide_vllm.md +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/docs/launch_post.md +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/docs/migration_from_peft.md +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/docs/quickstart.md +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/examples/axolotl_config.yml +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/examples/basic_pipeline.py +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/examples/hf_trainer_subspace.py +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/examples/quickstart.ipynb +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/examples/real_adapters.py +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/mkdocs.yml +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/_validate.py +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/analysis.py +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/cli.py +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/integrations/__init__.py +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/pipeline.py +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/router.py +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/training.py +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/__init__.py +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_analysis.py +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_backlog.py +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_io.py +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_merge.py +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_pipeline.py +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_router.py +0 -0
- {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_training.py +0 -0
vlora_dev-0.3.0/CHANGELOG.md ADDED

@@ -0,0 +1,54 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+Format follows [Keep a Changelog](https://keepachangelog.com/).
+
+## [0.3.0] - 2026-03-30
+
+### Added
+- **NF4 quantization** — 4-bit NormalFloat quantization from QLoRA (Dettmers et al., 2023). `subspace.quantize(method="nf4")` uses 16 quantile levels optimized for normally-distributed weights, with per-block absmax scaling. Lower error than symmetric int4.
+- **Double quantization** — quantize per-block NF4 scales to FP8 via `double_quant=True`, reducing scale overhead from 0.5 to ~0.127 bits/param.
+- **NF4 packed storage** — `subspace.save_quantized()` packs components as uint8 (two 4-bit indices per byte) for ~7x disk savings. `SharedSubspace.load()` auto-detects the format.
+- **QLoRA-aware VLoRAModel** — `compute_dtype` parameter for mixed-precision LoRA computation with quantized base models; `qlora_info` property for base-model introspection.
+- **`full_stack_compression()`** — reports combined base-model quantization + adapter compression savings.
+- **`quantize_loadings` parameter** — optionally quantize per-task loadings (not just components).
+- **`nf4_pack` / `nf4_unpack`** — low-level ops for 4-bit packing to uint8.
+- **Layer shapes stored in metadata** — `reconstruct()` uses stored shapes instead of deriving them from `numel() // rank`, supporting per-layer rank configs.
+- **`__repr__` on core objects** — `SharedSubspace`, `TaskProjection`, and `LoRAWeights` now print useful info.
+- **`adaptive_k` preserved through `absorb()`** — subspaces built with `adaptive_k=True` retain that setting after absorption.
+- QLoRA + vLoRA pipeline example (`examples/qlora_pipeline.py`).
+
+### Fixed
+- **`absorb_incremental` re-projection bug** — existing tasks had their loadings padded or truncated instead of being properly re-projected when the basis rotated. Now reconstructs from the old basis and projects onto the updated basis.
+- **`VLoRACallback` was a no-op** — the HF Trainer callback created an optimizer but never stepped it. It now registers differentiable forward hooks so the Trainer's backward pass produces gradients on loadings, and steps the optimizer in `on_step_end`.
+- **TIES merge normalization** — `n / contributor_count` over-scaled the output when elements were trimmed. Fixed to `1 / contributor_count`.
+- **`__version__` mismatch** — `__init__.py` said 0.1.0 while `pyproject.toml` said 0.2.1.
+- **`check_tensor_health` never called** — imported but unused; now wired up after SVD in `from_adapters`.
+- **Task ID collision** — `absorb()` and `absorb_incremental()` now warn when overwriting an existing task ID.
+- **Filesystem-unsafe task IDs** — `save()` now sanitizes task IDs for filenames (handles `/`, `:`, spaces) and keeps a mapping in metadata for lossless round-trips.
+- **`from_adapters_streaming` missing validation** — now checks `len(task_ids) == len(adapter_paths)`.
+
+### Changed
+- **`gram_schmidt` uses QR factorization** — replaced the O(k^2 * D) inner loop with `torch.linalg.qr` for better performance and numerical stability.
+- **VLoRAModel caches module handles** — `_apply_hooks` no longer scans all `named_modules()` on every task switch.
+- **VLoRAModel inference hooks wrapped in `torch.no_grad()`** — prevents unnecessary autograd tracking.
+- **NF4 quantization uses `torch.bucketize`** — replaced the O(N*16) distance broadcast with binary search, reducing memory from O(N*16) to O(N).
+- **`_LORA_KEY_RE` handles the multi-adapter PEFT format** — supports `base_model.model.{layer}.lora_A.{adapter_name}.weight`.
+- **`save_adapter` no longer hardcodes `CAUSAL_LM`** — the task type is left for PEFT to infer.
+- Repo URL updated to `github.com/vlora-dev/vlora`.
+
+## [0.2.1] - 2026-02-10
+
+Initial public release on PyPI as `vlora-dev`.
+
+### Added
+- `SharedSubspace` — 3-step algorithm: from_adapters, project, absorb
+- `VLoRAModel` — inference wrapper with forward hooks
+- `SubspaceTrainer` — loadings-only training
+- `TaskRouter` — per-input adapter routing
+- `task_arithmetic`, `ties_merge`, `dare_merge` — adapter merging
+- Analysis tools: similarity matrix, clustering, outlier detection
+- CLI with 9 commands
+- HuggingFace Trainer integration via `VLoRACallback`
+- Streaming and incremental subspace construction
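The NF4 entries above are easiest to see in code. Below is a minimal, self-contained sketch of block-wise NF4 quantization with per-block absmax scaling and a `torch.bucketize` lookup, as the changelog describes; it illustrates the technique only, and the function names, block size, and rounded level values are assumptions here, not vlora's actual implementation. The double-quantization figure also checks out: with 64-element blocks, an FP32 scale costs 32/64 = 0.5 bits per parameter, while FP8 scales grouped 256 at a time under one FP32 meta-scale cost 8/64 + 32/(64·256) ≈ 0.127.

```python
import torch

# The 16 NF4 levels (rounded) from the QLoRA paper: quantiles of a standard
# normal, rescaled to [-1, 1], with an exact zero level.
NF4_LEVELS = torch.tensor([
    -1.0000, -0.6962, -0.5251, -0.3949, -0.2844, -0.1848, -0.0911, 0.0000,
     0.0796, 0.1609, 0.2461, 0.3379, 0.4407, 0.5626, 0.7230, 1.0000,
])
# Nearest-level assignment via the midpoints between adjacent levels.
NF4_BOUNDARIES = (NF4_LEVELS[:-1] + NF4_LEVELS[1:]) / 2


def nf4_quantize(x: torch.Tensor, block_size: int = 64):
    """Quantize a flat float tensor to 4-bit indices plus per-block scales."""
    blocks = x.reshape(-1, block_size)
    scales = blocks.abs().amax(dim=1, keepdim=True).clamp_min(1e-12)
    normed = blocks / scales  # every block now lies in [-1, 1]
    # Binary search against 15 boundaries replaces an O(N*16) distance broadcast.
    idx = torch.bucketize(normed.contiguous(), NF4_BOUNDARIES)
    return idx.to(torch.uint8), scales


def nf4_dequantize(idx: torch.Tensor, scales: torch.Tensor, shape):
    return (NF4_LEVELS[idx.long()] * scales).reshape(shape)


x = torch.randn(8, 64)
idx, scales = nf4_quantize(x.flatten())
err = (nf4_dequantize(idx, scales, x.shape) - x).abs().mean()
print(f"mean abs reconstruction error: {err:.4f}")
```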
{vlora_dev-0.2.1 → vlora_dev-0.3.0}/PKG-INFO

@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: vlora-dev
-Version: 0.2.1
-Summary:
+Version: 0.3.0
+Summary: Various LoRA adapters. One shared basis. Up to 122x compression at scale.
 Project-URL: Homepage, https://github.com/tveseli/vlora
 Project-URL: Repository, https://github.com/tveseli/vlora
 Author: Tim Veseli

@@ -39,10 +39,10 @@ Description-Content-Type: text/markdown
 </p>
 
 <p align="center">
-  <strong>
+  <strong>Various LoRA adapters. One shared basis.</strong>
 </p>
 
-
+Your adapters share more structure than you think. vLoRA finds the common basis and stores each adapter as a tiny coefficient vector — up to 122× compression at scale. Based on the [Share paper](https://arxiv.org/abs/2602.06043).
 
 ## Install
 

@@ -52,7 +52,7 @@ pip install vlora-dev
 
 Or from source:
 ```bash
-git clone https://github.com/
+git clone https://github.com/vlora-dev/vlora.git
 cd vlora
 pip install -e ".[dev]"
 ```

@@ -137,6 +137,77 @@ output = model(input_ids)
 print(model.available_tasks) # ["task_0", "task_1", ...]
 ```
 
+## QLoRA Support
+
+vLoRA has first-class support for [QLoRA](https://arxiv.org/abs/2305.14314) workflows. QLoRA compresses the **base model** (FP16 → 4-bit NF4), while vLoRA compresses the **adapter space** — these are orthogonal and stack multiplicatively.
+
+### NF4 Quantization
+
+Quantize subspace components using the same NF4 data type from QLoRA — 16 quantile levels optimized for normally-distributed weights:
+
+```python
+# NF4 quantization (better than symmetric int4 for normal-ish weights)
+subspace.quantize(method="nf4")
+
+# With double quantization (quantize the per-block scales too)
+subspace.quantize(method="nf4", double_quant=True)
+
+# Also quantize loadings (effective when loadings are approximately normal)
+subspace.quantize(method="nf4", quantize_loadings=True)
+```
+
+### Packed NF4 Storage
+
+Save the subspace in packed 4-bit format for ~7× disk savings:
+
+```python
+# Save: packs components as uint8 (two 4-bit values per byte)
+subspace.save_quantized("shared_subspace/")
+
+# Load: auto-detects format, dequantizes on the fly
+subspace = SharedSubspace.load("shared_subspace/")
+```
+
+### QLoRA Base Model
+
+`VLoRAModel` works with quantized base models loaded via bitsandbytes:
+
+```python
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+from vlora import VLoRAModel, SharedSubspace
+
+# Load 4-bit base model
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
+)
+base_model = AutoModelForCausalLM.from_pretrained("model-name", quantization_config=bnb_config)
+
+# Wrap with vLoRA — compute_dtype ensures LoRA math runs in BF16
+subspace = SharedSubspace.load("shared_subspace/")
+model = VLoRAModel(base_model, subspace, compute_dtype=torch.bfloat16)
+
+print(model.qlora_info)  # {'quantized': True, 'method': 'nf4', ...}
+model.set_task("task_0")
+output = model(input_ids)
+```
+
+### Full-Stack Compression
+
+Report combined savings across base-model quantization and adapter compression:
+
+```python
+stats = subspace.full_stack_compression(
+    base_model_params=7_000_000_000,  # 7B model
+    base_model_bits=16,               # original FP16
+    quantized_bits=4,                 # QLoRA NF4
+)
+# → {'total_compression_ratio': 4.0, 'total_original_bytes': 14.0 GB, ...}
+```
+
+See [`examples/qlora_pipeline.py`](examples/qlora_pipeline.py) for a complete end-to-end example.
+
 ## Training in the Subspace
 
 Train only the loadings vector (k params per layer) instead of full LoRA matrices — 100×+ parameter reduction:

@@ -219,8 +290,10 @@ merged = dare_merge(adapters, drop_rate=0.5, seed=42)
 # Adaptive k: different components per layer based on explained variance
 subspace = SharedSubspace.from_adapters(adapters, adaptive_k=True, variance_threshold=0.9)
 
-# Quantize components
-subspace.quantize(bits=8)
+# Quantize components — symmetric (int8/int4) or NF4
+subspace.quantize(bits=8)  # symmetric int8
+subspace.quantize(method="nf4")  # NF4 4-bit (better for normal weights)
+subspace.quantize(method="nf4", double_quant=True)  # + quantize the scales
 
 # Check compression stats
 stats = subspace.compression_stats()

@@ -267,14 +340,16 @@ subspace.to(device="cuda", dtype=torch.float16)
 - `.absorb(adapter, task_id)` — Incorporate + recompute (full SVD)
 - `.absorb_incremental(adapter, task_id)` — Fast incremental update
 - `.get_trainable_params(task_id)` — For training integration
-- `.quantize(bits=8)` — Quantize components (int8/int4)
+- `.quantize(bits=8, method="symmetric")` — Quantize components (int8/int4/NF4)
 - `.compression_stats()` — Compression ratio and parameter counts
+- `.full_stack_compression(base_model_params)` — Combined base + adapter stats
 - `.to(device, dtype)` — Move tensors to device/dtype
-- `.save(path)` / `.load(path)` — Serialization
+- `.save(path)` / `.save_quantized(path)` / `.load(path)` — Serialization (NF4-packed auto-detected)
 
 ### Model Integration
 
-- **`VLoRAModel(base_model, subspace, lora_alpha=None)`** — Inference wrapper with forward hooks
+- **`VLoRAModel(base_model, subspace, lora_alpha=None, compute_dtype=None)`** — Inference wrapper with forward hooks
+- `.qlora_info` — Base model quantization metadata
 - `.set_task(task_id)` — Switch adapter (cached)
 - `.clear_task()` — Remove adapter
 - `.available_tasks` — List task IDs

@@ -325,6 +400,7 @@ subspace.to(device="cuda", dtype=torch.float16)
 - `compute_svd`, `project_onto_subspace`, `reconstruct_from_subspace`
 - `gram_schmidt`, `explained_variance_ratio`, `select_num_components`
 - `incremental_svd_update`
+- `nf4_quantize_dequantize`, `nf4_pack`, `nf4_unpack` — NF4 quantization (QLoRA)
 
 ## Benchmarks — Real-World Adapters
 
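An aside on the packed storage listed above: packing two 4-bit indices per uint8 takes 32-bit float components down to 4 bits plus roughly half a bit of scale overhead, which is where the ~7× disk figure comes from. A minimal sketch of the nibble packing follows, assuming flat uint8 index tensors; vlora's actual `nf4_pack`/`nf4_unpack` signatures may differ, so the helper names here are illustrative only.

```python
import torch


def pack_nibbles(idx: torch.Tensor) -> torch.Tensor:
    """Pack a flat uint8 tensor of 4-bit values (0..15), two per byte."""
    if idx.numel() % 2:  # pad odd lengths with a zero nibble
        idx = torch.cat([idx, idx.new_zeros(1)])
    pairs = idx.reshape(-1, 2)
    return ((pairs[:, 0] << 4) | pairs[:, 1]).to(torch.uint8)


def unpack_nibbles(packed: torch.Tensor, numel: int) -> torch.Tensor:
    """Inverse of pack_nibbles; `numel` restores the original (possibly odd) length."""
    high, low = packed >> 4, packed & 0x0F
    return torch.stack([high, low], dim=1).flatten()[:numel]


idx = torch.randint(0, 16, (1001,), dtype=torch.uint8)
assert torch.equal(unpack_nibbles(pack_nibbles(idx), idx.numel()), idx)
```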
{vlora_dev-0.2.1 → vlora_dev-0.3.0}/README.md

@@ -3,10 +3,10 @@
 </p>
 
 <p align="center">
-  <strong>
+  <strong>Various LoRA adapters. One shared basis.</strong>
 </p>
 
-
+Your adapters share more structure than you think. vLoRA finds the common basis and stores each adapter as a tiny coefficient vector — up to 122× compression at scale. Based on the [Share paper](https://arxiv.org/abs/2602.06043).
 
 ## Install
 

@@ -16,7 +16,7 @@ pip install vlora-dev
 
 Or from source:
 ```bash
-git clone https://github.com/
+git clone https://github.com/vlora-dev/vlora.git
 cd vlora
 pip install -e ".[dev]"
 ```

@@ -101,6 +101,77 @@ output = model(input_ids)
 print(model.available_tasks) # ["task_0", "task_1", ...]
 ```
 
+## QLoRA Support
+
+vLoRA has first-class support for [QLoRA](https://arxiv.org/abs/2305.14314) workflows. QLoRA compresses the **base model** (FP16 → 4-bit NF4), while vLoRA compresses the **adapter space** — these are orthogonal and stack multiplicatively.
+
+### NF4 Quantization
+
+Quantize subspace components using the same NF4 data type from QLoRA — 16 quantile levels optimized for normally-distributed weights:
+
+```python
+# NF4 quantization (better than symmetric int4 for normal-ish weights)
+subspace.quantize(method="nf4")
+
+# With double quantization (quantize the per-block scales too)
+subspace.quantize(method="nf4", double_quant=True)
+
+# Also quantize loadings (effective when loadings are approximately normal)
+subspace.quantize(method="nf4", quantize_loadings=True)
+```
+
+### Packed NF4 Storage
+
+Save the subspace in packed 4-bit format for ~7× disk savings:
+
+```python
+# Save: packs components as uint8 (two 4-bit values per byte)
+subspace.save_quantized("shared_subspace/")
+
+# Load: auto-detects format, dequantizes on the fly
+subspace = SharedSubspace.load("shared_subspace/")
+```
+
+### QLoRA Base Model
+
+`VLoRAModel` works with quantized base models loaded via bitsandbytes:
+
+```python
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+from vlora import VLoRAModel, SharedSubspace
+
+# Load 4-bit base model
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
+)
+base_model = AutoModelForCausalLM.from_pretrained("model-name", quantization_config=bnb_config)
+
+# Wrap with vLoRA — compute_dtype ensures LoRA math runs in BF16
+subspace = SharedSubspace.load("shared_subspace/")
+model = VLoRAModel(base_model, subspace, compute_dtype=torch.bfloat16)
+
+print(model.qlora_info)  # {'quantized': True, 'method': 'nf4', ...}
+model.set_task("task_0")
+output = model(input_ids)
+```
+
+### Full-Stack Compression
+
+Report combined savings across base-model quantization and adapter compression:
+
+```python
+stats = subspace.full_stack_compression(
+    base_model_params=7_000_000_000,  # 7B model
+    base_model_bits=16,               # original FP16
+    quantized_bits=4,                 # QLoRA NF4
+)
+# → {'total_compression_ratio': 4.0, 'total_original_bytes': 14.0 GB, ...}
+```
+
+See [`examples/qlora_pipeline.py`](examples/qlora_pipeline.py) for a complete end-to-end example.
+
 ## Training in the Subspace
 
 Train only the loadings vector (k params per layer) instead of full LoRA matrices — 100×+ parameter reduction:

@@ -183,8 +254,10 @@ merged = dare_merge(adapters, drop_rate=0.5, seed=42)
 # Adaptive k: different components per layer based on explained variance
 subspace = SharedSubspace.from_adapters(adapters, adaptive_k=True, variance_threshold=0.9)
 
-# Quantize components
-subspace.quantize(bits=8)
+# Quantize components — symmetric (int8/int4) or NF4
+subspace.quantize(bits=8)  # symmetric int8
+subspace.quantize(method="nf4")  # NF4 4-bit (better for normal weights)
+subspace.quantize(method="nf4", double_quant=True)  # + quantize the scales
 
 # Check compression stats
 stats = subspace.compression_stats()

@@ -231,14 +304,16 @@ subspace.to(device="cuda", dtype=torch.float16)
 - `.absorb(adapter, task_id)` — Incorporate + recompute (full SVD)
 - `.absorb_incremental(adapter, task_id)` — Fast incremental update
 - `.get_trainable_params(task_id)` — For training integration
-- `.quantize(bits=8)` — Quantize components (int8/int4)
+- `.quantize(bits=8, method="symmetric")` — Quantize components (int8/int4/NF4)
 - `.compression_stats()` — Compression ratio and parameter counts
+- `.full_stack_compression(base_model_params)` — Combined base + adapter stats
 - `.to(device, dtype)` — Move tensors to device/dtype
-- `.save(path)` / `.load(path)` — Serialization
+- `.save(path)` / `.save_quantized(path)` / `.load(path)` — Serialization (NF4-packed auto-detected)
 
 ### Model Integration
 
-- **`VLoRAModel(base_model, subspace, lora_alpha=None)`** — Inference wrapper with forward hooks
+- **`VLoRAModel(base_model, subspace, lora_alpha=None, compute_dtype=None)`** — Inference wrapper with forward hooks
+- `.qlora_info` — Base model quantization metadata
 - `.set_task(task_id)` — Switch adapter (cached)
 - `.clear_task()` — Remove adapter
 - `.available_tasks` — List task IDs

@@ -289,6 +364,7 @@ subspace.to(device="cuda", dtype=torch.float16)
 - `compute_svd`, `project_onto_subspace`, `reconstruct_from_subspace`
 - `gram_schmidt`, `explained_variance_ratio`, `select_num_components`
 - `incremental_svd_update`
+- `nf4_quantize_dequantize`, `nf4_pack`, `nf4_unpack` — NF4 quantization (QLoRA)
 
 ## Benchmarks — Real-World Adapters
 
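The `full_stack_compression` numbers shown in both README copies above are plain byte accounting, worth spelling out once: a 7B-parameter FP16 base model is 7e9 × 2 bytes = 14.0 GB, and at 4 bits it is 3.5 GB, a 4.0× ratio before the (comparatively tiny) adapter terms are added. A sketch of that arithmetic, with field names mirroring the example output rather than the library's real return schema:

```python
def full_stack_sketch(base_params: int, base_bits: int, quant_bits: int,
                      adapters_orig_bytes: float = 0.0,
                      adapters_comp_bytes: float = 0.0) -> dict:
    """Illustrative byte accounting only, not vlora's implementation."""
    base_orig = base_params * base_bits / 8   # 7e9 * 16 / 8 = 14.0 GB
    base_comp = base_params * quant_bits / 8  # 7e9 *  4 / 8 =  3.5 GB
    total_orig = base_orig + adapters_orig_bytes
    total_comp = base_comp + adapters_comp_bytes
    return {
        "total_original_bytes": total_orig,
        "total_compressed_bytes": total_comp,
        "total_compression_ratio": total_orig / total_comp,
    }


print(full_stack_sketch(7_000_000_000, 16, 4))
# {'total_original_bytes': 14000000000.0, 'total_compressed_bytes': 3500000000.0,
#  'total_compression_ratio': 4.0}
```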
{vlora_dev-0.2.1 → vlora_dev-0.3.0}/docs/index.md

@@ -1,8 +1,8 @@
 # vlora
 
-**
+**Various LoRA adapters. One shared basis.**
 
-
+Your adapters share more structure than you think. vLoRA finds the common basis and stores each adapter as a tiny coefficient vector — up to 122× compression at scale. Based on the [Share paper](https://arxiv.org/abs/2602.06043).
 
 ## Install
 
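The tagline's claim (one shared basis, one tiny coefficient vector per adapter) is ordinary truncated SVD at heart. A minimal numeric sketch in plain PyTorch, not vlora's API, shows why the storage ratio grows with the number of adapters:

```python
import torch

N, D, k = 100, 8192, 16  # adapters, flattened params per adapter, basis size
torch.manual_seed(0)
hidden = torch.randn(k, D)              # structure the adapters secretly share
adapters = torch.randn(N, k) @ hidden   # N adapters, D params each (rank <= k)

# Top-k right singular vectors of the stacked adapters form the shared basis.
U, S, Vh = torch.linalg.svd(adapters, full_matrices=False)
basis = Vh[:k]                          # (k, D) shared basis
loadings = adapters @ basis.T           # (N, k) per-adapter coefficients

recon = loadings @ basis
print(f"max reconstruction error: {(recon - adapters).abs().max():.2e}")
print(f"storage ratio: {(N * D) / (k * D + N * k):.1f}x")  # approaches D/k as N grows
```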
vlora_dev-0.3.0/examples/qlora_pipeline.py ADDED

@@ -0,0 +1,154 @@
+"""QLoRA + vLoRA: End-to-end pipeline for efficient multi-adapter serving.
+
+This example shows the full workflow:
+1. Load a QLoRA-quantized base model (4-bit NF4)
+2. Load multiple LoRA adapters (produced by QLoRA fine-tuning)
+3. Build a shared subspace with NF4 quantization
+4. Serve with instant task switching via VLoRAModel
+
+Requirements:
+    pip install vlora-dev[hub] transformers bitsandbytes accelerate
+
+The pipeline combines two orthogonal compression techniques:
+- QLoRA: compresses the base model (FP16 -> NF4, ~4x savings)
+- vLoRA: compresses the adapter space (N adapters -> shared subspace, ~122x)
+Together they enable serving hundreds of task-specific adapters on a single GPU.
+"""
+
+from __future__ import annotations
+
+import torch
+
+# ── Step 0: Configuration ──────────────────────────────────────────────
+BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Small model for demo
+ADAPTER_REPOS = [
+    # Replace with your QLoRA adapter repos from HuggingFace Hub
+    # "username/adapter-task-a",
+    # "username/adapter-task-b",
+]
+NUM_COMPONENTS = 4  # Subspace dimension
+USE_NF4_STORAGE = True  # Save subspace in packed NF4 format
+
+
+def main():
+    # ── Step 1: Load QLoRA base model ──────────────────────────────────
+    # In production, load with 4-bit quantization:
+    #
+    # from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+    # bnb_config = BitsAndBytesConfig(
+    #     load_in_4bit=True,
+    #     bnb_4bit_quant_type="nf4",
+    #     bnb_4bit_compute_dtype=torch.bfloat16,
+    # )
+    # base_model = AutoModelForCausalLM.from_pretrained(
+    #     BASE_MODEL, quantization_config=bnb_config
+    # )
+    #
+    # For this demo, we simulate with synthetic data:
+    print("=== QLoRA + vLoRA Pipeline Demo ===\n")
+
+    # ── Step 2: Load adapters ──────────────────────────────────────────
+    from vlora import LoRAWeights, SharedSubspace, VLoRAModel
+
+    print("Creating synthetic adapters (replace with load_adapter_from_hub)...")
+    layers = [
+        "model.layers.0.self_attn.q_proj",
+        "model.layers.0.self_attn.v_proj",
+        "model.layers.1.self_attn.q_proj",
+        "model.layers.1.self_attn.v_proj",
+    ]
+    rank = 8
+    dim = 512
+    n_adapters = 10
+
+    # Create correlated adapters (simulates real LoRA adapters sharing structure)
+    torch.manual_seed(42)
+    shared_basis = {l: torch.randn(5, rank * dim) for l in layers}
+    adapters = []
+    task_ids = []
+    for i in range(n_adapters):
+        lora_a = {l: (torch.randn(5) @ shared_basis[l]).reshape(rank, dim) for l in layers}
+        lora_b = {l: torch.randn(dim, rank) * 0.01 for l in layers}
+        adapters.append(LoRAWeights(layer_names=layers, lora_a=lora_a, lora_b=lora_b, rank=rank))
+        task_ids.append(f"task_{i}")
+    print(f" Loaded {n_adapters} adapters, rank={rank}, {len(layers)} layers\n")
+
+    # ── Step 3: Build shared subspace ──────────────────────────────────
+    print("Building shared subspace...")
+    subspace = SharedSubspace.from_adapters(
+        adapters,
+        task_ids=task_ids,
+        num_components=NUM_COMPONENTS,
+    )
+
+    stats = subspace.compression_stats()
+    print(f" Components: {subspace.num_components}")
+    print(f" Compression: {stats['compression_ratio']:.1f}x")
+    print(f" Original params: {stats['total_params_original']:,}")
+    print(f" Compressed params: {stats['total_params_compressed']:,}\n")
+
+    # ── Step 4: Apply NF4 quantization to subspace ─────────────────────
+    print("Quantizing subspace with NF4...")
+    subspace.quantize(method="nf4", quantize_loadings=True)
+    print(" Done (components + loadings quantized)\n")
+
+    # ── Step 5: Save with packed NF4 storage ───────────────────────────
+    import tempfile
+    from pathlib import Path
+
+    save_dir = Path(tempfile.mkdtemp()) / "subspace"
+
+    if USE_NF4_STORAGE:
+        print("Saving with NF4-packed format...")
+        subspace.save_quantized(save_dir)
+    else:
+        print("Saving with float32 format...")
+        subspace.save(save_dir)
+
+    # Compare file sizes
+    total_bytes = sum(f.stat().st_size for f in save_dir.rglob("*") if f.is_file())
+    print(f" Saved to: {save_dir}")
+    print(f" Total size: {total_bytes / 1024:.1f} KB\n")
+
+    # ── Step 6: Load and serve ─────────────────────────────────────────
+    print("Loading subspace (auto-detects format)...")
+    loaded = SharedSubspace.load(save_dir)
+    print(f" {loaded!r}\n")
+
+    # Full-stack compression stats (with hypothetical QLoRA base model)
+    full_stats = loaded.full_stack_compression(
+        base_model_params=1_100_000_000,  # TinyLlama 1.1B
+        base_model_bits=16,
+        quantized_bits=4,
+    )
+    if "total_compression_ratio" in full_stats:
+        print("Full-stack compression (QLoRA base + vLoRA adapters):")
+        print(f" Base model: {full_stats['base_model']['compression_ratio']:.1f}x (FP16->NF4)")
+        print(f" Adapters: {stats['compression_ratio']:.1f}x ({n_adapters} adapters)")
+        print(f" Total: {full_stats['total_original_bytes']/1e9:.1f} GB -> "
+              f"{full_stats['total_compressed_bytes']/1e9:.2f} GB")
+        print(f" Combined: {full_stats['total_compression_ratio']:.1f}x\n")
+
+    # In production with a real base model:
+    #
+    # model = VLoRAModel(base_model, loaded, compute_dtype=torch.bfloat16)
+    # print(f"QLoRA info: {model.qlora_info}")
+    #
+    # # Instant task switching
+    # model.set_task("task_0")
+    # output = model(input_ids)
+    #
+    # model.set_task("task_5")  # microseconds to switch
+    # output = model(input_ids)
+
+    # Demonstrate reconstruction
+    print("Reconstructing adapters from subspace...")
+    for tid in ["task_0", "task_5", "task_9"]:
+        recon = loaded.reconstruct(tid)
+        print(f" {tid}: {recon!r}")
+
+    print("\nDone!")
+
+
+if __name__ == "__main__":
+    main()
vlora_dev-0.3.0/icon.png ADDED (binary file)

vlora_dev-0.3.0/logo.png ADDED (binary file)
{vlora_dev-0.2.1 → vlora_dev-0.3.0}/pyproject.toml

@@ -4,8 +4,8 @@ build-backend = "hatchling.build"
 
 [project]
 name = "vlora-dev"
-version = "0.2.1"
-description = "
+version = "0.3.0"
+description = "Various LoRA adapters. One shared basis. Up to 122x compression at scale."
 readme = "README.md"
 license = "Apache-2.0"
 requires-python = ">=3.9"
{vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/__init__.py

@@ -5,13 +5,17 @@ share a common low-rank subspace. Instead of storing N separate adapters,
 maintain one shared basis and per-task coefficient vectors.
 """
 
-__version__ = "0.1.0"
+__version__ = "0.3.0"
 
 from vlora.io import LoRAWeights, load_adapter, load_adapter_from_hub, save_adapter
 from vlora.ops import (
+    NF4_QUANT_TABLE,
     compute_svd,
     explained_variance_ratio,
     gram_schmidt,
+    nf4_pack,
+    nf4_quantize_dequantize,
+    nf4_unpack,
     project_onto_subspace,
     reconstruct_from_subspace,
     select_num_components,

@@ -51,6 +55,11 @@ __all__ = [
     "gram_schmidt",
     "explained_variance_ratio",
     "select_num_components",
+    # NF4 quantization (QLoRA-style)
+    "NF4_QUANT_TABLE",
+    "nf4_quantize_dequantize",
+    "nf4_pack",
+    "nf4_unpack",
     # Analysis
     "compute_similarity_matrix",
     "find_clusters",