vlora-dev 0.2.1__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/.gitignore +1 -3
  2. vlora_dev-0.3.0/CHANGELOG.md +54 -0
  3. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/PKG-INFO +86 -10
  4. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/README.md +84 -8
  5. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/docs/index.md +2 -2
  6. vlora_dev-0.3.0/examples/qlora_pipeline.py +154 -0
  7. vlora_dev-0.3.0/icon.png +0 -0
  8. vlora_dev-0.3.0/logo.png +0 -0
  9. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/pyproject.toml +2 -2
  10. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/__init__.py +10 -1
  11. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/integrations/huggingface.py +68 -5
  12. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/io.py +11 -4
  13. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/merge.py +2 -2
  14. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/model.py +87 -6
  15. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/ops.py +181 -14
  16. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/subspace.py +398 -38
  17. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_cli.py +57 -0
  18. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_compression.py +61 -0
  19. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_huggingface.py +79 -6
  20. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_incremental.py +44 -0
  21. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_model.py +43 -0
  22. vlora_dev-0.3.0/tests/test_ops.py +256 -0
  23. vlora_dev-0.3.0/tests/test_subspace.py +303 -0
  24. vlora_dev-0.2.1/logo.png +0 -0
  25. vlora_dev-0.2.1/tests/test_ops.py +0 -121
  26. vlora_dev-0.2.1/tests/test_subspace.py +0 -146
  27. vlora_dev-0.2.1/website/.firebase/hosting.ZGlzdA.cache +0 -5
  28. vlora_dev-0.2.1/website/.firebaserc +0 -15
  29. vlora_dev-0.2.1/website/astro.config.mjs +0 -11
  30. vlora_dev-0.2.1/website/firebase.json +0 -37
  31. vlora_dev-0.2.1/website/package-lock.json +0 -6124
  32. vlora_dev-0.2.1/website/package.json +0 -17
  33. vlora_dev-0.2.1/website/public/favicon.png +0 -0
  34. vlora_dev-0.2.1/website/public/logo.png +0 -0
  35. vlora_dev-0.2.1/website/public/og-card.png +0 -0
  36. vlora_dev-0.2.1/website/src/components/Algorithm.astro +0 -80
  37. vlora_dev-0.2.1/website/src/components/Benchmarks.astro +0 -120
  38. vlora_dev-0.2.1/website/src/components/CodeExample.astro +0 -56
  39. vlora_dev-0.2.1/website/src/components/Features.astro +0 -99
  40. vlora_dev-0.2.1/website/src/components/Footer.astro +0 -27
  41. vlora_dev-0.2.1/website/src/components/Header.astro +0 -98
  42. vlora_dev-0.2.1/website/src/components/Hero.astro +0 -98
  43. vlora_dev-0.2.1/website/src/layouts/BaseLayout.astro +0 -50
  44. vlora_dev-0.2.1/website/src/pages/index.astro +0 -22
  45. vlora_dev-0.2.1/website/src/styles/global.css +0 -45
  46. vlora_dev-0.2.1/website/tsconfig.json +0 -5
  47. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/.github/workflows/ci.yml +0 -0
  48. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/.github/workflows/release.yml +0 -0
  49. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/.pre-commit-config.yaml +0 -0
  50. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/LICENSE +0 -0
  51. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/docs/api.md +0 -0
  52. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/docs/guide_ollama.md +0 -0
  53. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/docs/guide_tgi.md +0 -0
  54. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/docs/guide_vllm.md +0 -0
  55. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/docs/launch_post.md +0 -0
  56. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/docs/migration_from_peft.md +0 -0
  57. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/docs/quickstart.md +0 -0
  58. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/examples/axolotl_config.yml +0 -0
  59. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/examples/basic_pipeline.py +0 -0
  60. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/examples/hf_trainer_subspace.py +0 -0
  61. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/examples/quickstart.ipynb +0 -0
  62. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/examples/real_adapters.py +0 -0
  63. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/mkdocs.yml +0 -0
  64. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/_validate.py +0 -0
  65. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/analysis.py +0 -0
  66. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/cli.py +0 -0
  67. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/integrations/__init__.py +0 -0
  68. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/pipeline.py +0 -0
  69. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/router.py +0 -0
  70. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/src/vlora/training.py +0 -0
  71. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/__init__.py +0 -0
  72. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_analysis.py +0 -0
  73. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_backlog.py +0 -0
  74. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_io.py +0 -0
  75. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_merge.py +0 -0
  76. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_pipeline.py +0 -0
  77. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_router.py +0 -0
  78. {vlora_dev-0.2.1 → vlora_dev-0.3.0}/tests/test_training.py +0 -0
@@ -18,6 +18,4 @@ env/
  *.bin
  .DS_Store
  .claude/
- website/node_modules/
- website/dist/
- website/.astro/
+ website/
@@ -0,0 +1,54 @@
+ # Changelog
+
+ All notable changes to this project will be documented in this file.
+
+ Format follows [Keep a Changelog](https://keepachangelog.com/).
+
+ ## [0.3.0] - 2026-03-30
+
+ ### Added
+ - **NF4 quantization** — 4-bit NormalFloat quantization from QLoRA (Dettmers et al., 2023). `subspace.quantize(method="nf4")` uses 16 quantile levels optimized for normally-distributed weights, with per-block absmax scaling. Lower error than symmetric int4.
+ - **Double quantization** — quantize per-block NF4 scales to FP8 via `double_quant=True`, reducing scale overhead from 0.5 to ~0.127 bits/param.
+ - **NF4 packed storage** — `subspace.save_quantized()` packs components as uint8 (two 4-bit indices per byte) for ~7x disk savings. `SharedSubspace.load()` auto-detects the format.
+ - **QLoRA-aware VLoRAModel** — `compute_dtype` parameter for mixed-precision LoRA computation with quantized base models; `qlora_info` property for base model introspection.
+ - **`full_stack_compression()`** — report combined base model quantization + adapter compression savings.
+ - **`quantize_loadings` parameter** — optionally quantize per-task loadings (not just components).
+ - **`nf4_pack` / `nf4_unpack`** — low-level ops for 4-bit packing to uint8.
+ - **Layer shapes stored in metadata** — `reconstruct()` uses stored shapes instead of deriving from `numel() // rank`, supporting per-layer rank configs.
+ - **`__repr__` on core objects** — `SharedSubspace`, `TaskProjection`, `LoRAWeights` now print useful info.
+ - **`adaptive_k` preserved through `absorb()`** — subspaces built with `adaptive_k=True` retain that setting after absorption.
+ - QLoRA + vLoRA pipeline example (`examples/qlora_pipeline.py`).
+
+ ### Fixed
+ - **`absorb_incremental` re-projection bug** — existing tasks had their loadings padded/truncated instead of properly re-projected when the basis rotated. Now reconstructs from the old basis and projects onto the updated basis.
+ - **`VLoRACallback` was a no-op** — the HF Trainer callback created an optimizer but never stepped it. Now registers differentiable forward hooks so the Trainer's backward pass produces gradients on loadings, and steps the optimizer in `on_step_end`.
+ - **TIES merge normalization** — `n / contributor_count` over-scaled output when elements were trimmed. Fixed to `1 / contributor_count`.
+ - **`__version__` mismatch** — `__init__.py` said 0.1.0 while `pyproject.toml` said 0.2.1.
+ - **`check_tensor_health` never called** — imported but unused; now wired up after SVD in `from_adapters`.
+ - **Task ID collision** — `absorb()` and `absorb_incremental()` now warn when overwriting an existing task ID.
+ - **Filesystem-unsafe task IDs** — `save()` now sanitizes task IDs for filenames (handles `/`, `:`, spaces) with a mapping in metadata for lossless round-trip.
+ - **`from_adapters_streaming` missing validation** — now checks `len(task_ids) == len(adapter_paths)`.
+
+ ### Changed
+ - **`gram_schmidt` uses QR factorization** — replaced the O(k^2 * D) inner loop with `torch.linalg.qr` for better performance and numerical stability.
+ - **VLoRAModel caches module handles** — `_apply_hooks` no longer scans all `named_modules()` on every task switch.
+ - **VLoRAModel inference hooks wrapped in `torch.no_grad()`** — prevents unnecessary autograd tracking.
+ - **NF4 quantization uses `torch.bucketize`** — replaced the O(N*16) distance broadcast with binary search, reducing memory from O(N*16) to O(N).
+ - **`_LORA_KEY_RE` handles multi-adapter PEFT format** — supports `base_model.model.{layer}.lora_A.{adapter_name}.weight`.
+ - **`save_adapter` no longer hardcodes `CAUSAL_LM`** — task type is left for PEFT to infer.
+ - Repo URL updated to `github.com/vlora-dev/vlora`.
+
+ ## [0.2.1] - 2026-02-10
+
+ Initial public release on PyPI as `vlora-dev`.
+
+ ### Added
+ - `SharedSubspace` — 3-step algorithm: from_adapters, project, absorb
+ - `VLoRAModel` — inference wrapper with forward hooks
+ - `SubspaceTrainer` — loadings-only training
+ - `TaskRouter` — per-input adapter routing
+ - `task_arithmetic`, `ties_merge`, `dare_merge` — adapter merging
+ - Analysis tools: similarity matrix, clustering, outlier detection
+ - CLI with 9 commands
+ - HuggingFace Trainer integration via `VLoRACallback`
+ - Streaming and incremental subspace construction
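The NF4 entries in the changelog above (quantile code book, per-block absmax scaling, and the `torch.bucketize` lookup) fit in a few lines. The sketch below is illustrative rather than the library's implementation: it assumes the published 16-level NF4 code book from the QLoRA paper, and `nf4_roundtrip_sketch` is a hypothetical name, not vlora's API.

```python
import torch

# The 16 NF4 levels from QLoRA (Dettmers et al., 2023): quantiles of N(0, 1),
# rescaled so the code book spans [-1, 1]. Values rounded to 4 decimals here.
NF4_LEVELS = torch.tensor([
    -1.0, -0.6962, -0.5251, -0.3949, -0.2844, -0.1848, -0.0911, 0.0,
    0.0796, 0.1609, 0.2461, 0.3379, 0.4407, 0.5626, 0.7230, 1.0,
])

def nf4_roundtrip_sketch(x: torch.Tensor, block_size: int = 64) -> torch.Tensor:
    """Quantize-dequantize a tensor through NF4 with per-block absmax scaling."""
    flat = x.flatten()
    pad = (-flat.numel()) % block_size             # pad so blocks divide evenly
    flat = torch.nn.functional.pad(flat, (0, pad))
    blocks = flat.view(-1, block_size)
    scales = blocks.abs().amax(dim=1, keepdim=True).clamp_min(1e-12)
    normed = blocks / scales                       # each block now lies in [-1, 1]
    # Binary search against the level midpoints gives nearest-level rounding in
    # O(N) memory, versus the O(N*16) distance broadcast the changelog replaced.
    midpoints = (NF4_LEVELS[1:] + NF4_LEVELS[:-1]) / 2
    idx = torch.bucketize(normed, midpoints)       # index into the code book
    deq = NF4_LEVELS[idx] * scales                 # dequantize back to real scale
    return deq.flatten()[: x.numel()].view_as(x)

x = torch.randn(10_000)
print((x - nf4_roundtrip_sketch(x)).abs().mean())  # small for normal-ish inputs
```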
@@ -1,7 +1,7 @@
  Metadata-Version: 2.4
  Name: vlora-dev
- Version: 0.2.1
- Summary: Shared low-rank subspaces for efficient LoRA adapter management
+ Version: 0.3.0
+ Summary: Various LoRA adapters. One shared basis. Up to 122x compression at scale.
  Project-URL: Homepage, https://github.com/tveseli/vlora
  Project-URL: Repository, https://github.com/tveseli/vlora
  Author: Tim Veseli
@@ -39,10 +39,10 @@ Description-Content-Type: text/markdown
  </p>

  <p align="center">
- <strong>Shared low-rank subspaces for efficient LoRA adapter management.</strong>
+ <strong>Various LoRA adapters. One shared basis.</strong>
  </p>

- Based on the [Share paper](https://arxiv.org/abs/2602.06043): LoRA adapters across tasks share a common low-rank subspace. Instead of storing *N* separate adapters, maintain **one shared basis** and **per-task coefficient vectors**achieving up to 122× compression at scale.
+ Your adapters share more structure than you think. vLoRA finds the common basis and stores each adapter as a tiny coefficient vector — up to 122× compression at scale. Based on the [Share paper](https://arxiv.org/abs/2602.06043).

  ## Install

@@ -52,7 +52,7 @@ pip install vlora-dev

  Or from source:
  ```bash
- git clone https://github.com/tveseli/vlora.git
+ git clone https://github.com/vlora-dev/vlora.git
  cd vlora
  pip install -e ".[dev]"
  ```
@@ -137,6 +137,78 @@ output = model(input_ids)
  print(model.available_tasks) # ["task_0", "task_1", ...]
  ```

+ ## QLoRA Support
+
+ vLoRA has first-class support for [QLoRA](https://arxiv.org/abs/2305.14314) workflows. QLoRA compresses the **base model** (FP16 → 4-bit NF4), while vLoRA compresses the **adapter space** — these are orthogonal and stack multiplicatively.
+
+ ### NF4 Quantization
+
+ Quantize subspace components using the same NF4 data type from QLoRA — 16 quantile levels optimized for normally-distributed weights:
+
+ ```python
+ # NF4 quantization (better than symmetric int4 for normal-ish weights)
+ subspace.quantize(method="nf4")
+
+ # With double quantization (quantize the per-block scales too)
+ subspace.quantize(method="nf4", double_quant=True)
+
+ # Also quantize loadings (effective when loadings are approximately normal)
+ subspace.quantize(method="nf4", quantize_loadings=True)
+ ```
+
+ ### Packed NF4 Storage
+
+ Save the subspace in packed 4-bit format for ~7× disk savings:
+
+ ```python
+ # Save: packs components as uint8 (two 4-bit values per byte)
+ subspace.save_quantized("shared_subspace/")
+
+ # Load: auto-detects format, dequantizes on the fly
+ subspace = SharedSubspace.load("shared_subspace/")
+ ```
+
+ ### QLoRA Base Model
+
+ `VLoRAModel` works with quantized base models loaded via bitsandbytes:
+
+ ```python
+ import torch
+ from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+ from vlora import VLoRAModel, SharedSubspace
+
+ # Load 4-bit base model
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16,
+ )
+ base_model = AutoModelForCausalLM.from_pretrained("model-name", quantization_config=bnb_config)
+
+ # Wrap with vLoRA — compute_dtype ensures LoRA math runs in BF16
+ subspace = SharedSubspace.load("shared_subspace/")
+ model = VLoRAModel(base_model, subspace, compute_dtype=torch.bfloat16)
+
+ print(model.qlora_info) # {'quantized': True, 'method': 'nf4', ...}
+ model.set_task("task_0")
+ output = model(input_ids)
+ ```
+
+ ### Full-Stack Compression
+
+ Report combined savings across base model quantization and adapter compression:
+
+ ```python
+ stats = subspace.full_stack_compression(
+     base_model_params=7_000_000_000, # 7B model
+     base_model_bits=16, # original FP16
+     quantized_bits=4, # QLoRA NF4
+ )
+ # → {'total_compression_ratio': 4.0, 'total_original_bytes': 14.0 GB, ...}
+ ```
+
+ See [`examples/qlora_pipeline.py`](examples/qlora_pipeline.py) for a complete end-to-end example.
+
  ## Training in the Subspace

  Train only the loadings vector (k params per layer) instead of full LoRA matrices — 100×+ parameter reduction:
@@ -219,8 +291,10 @@ merged = dare_merge(adapters, drop_rate=0.5, seed=42)
  # Adaptive k: different components per layer based on explained variance
  subspace = SharedSubspace.from_adapters(adapters, adaptive_k=True, variance_threshold=0.9)

- # Quantize components for smaller memory footprint
- subspace.quantize(bits=8) # or bits=4
+ # Quantize components: symmetric (int8/int4) or NF4
+ subspace.quantize(bits=8) # symmetric int8
+ subspace.quantize(method="nf4") # NF4 4-bit (better for normal weights)
+ subspace.quantize(method="nf4", double_quant=True) # + quantize the scales

  # Check compression stats
  stats = subspace.compression_stats()
@@ -267,14 +341,16 @@ subspace.to(device="cuda", dtype=torch.float16)
  - `.absorb(adapter, task_id)` — Incorporate + recompute (full SVD)
  - `.absorb_incremental(adapter, task_id)` — Fast incremental update
  - `.get_trainable_params(task_id)` — For training integration
- - `.quantize(bits=8)` — Quantize components (int8/int4)
+ - `.quantize(bits=8, method="symmetric")` — Quantize components (int8/int4/NF4)
  - `.compression_stats()` — Compression ratio and parameter counts
+ - `.full_stack_compression(base_model_params)` — Combined base + adapter stats
  - `.to(device, dtype)` — Move tensors to device/dtype
- - `.save(path)` / `.load(path)` — Serialization
+ - `.save(path)` / `.save_quantized(path)` / `.load(path)` — Serialization (NF4-packed auto-detected)

  ### Model Integration

- - **`VLoRAModel(base_model, subspace, lora_alpha=None)`** — Inference wrapper with forward hooks
+ - **`VLoRAModel(base_model, subspace, lora_alpha=None, compute_dtype=None)`** — Inference wrapper with forward hooks
+ - `.qlora_info` — Base model quantization metadata
  - `.set_task(task_id)` — Switch adapter (cached)
  - `.clear_task()` — Remove adapter
  - `.available_tasks` — List task IDs
@@ -325,6 +401,7 @@ subspace.to(device="cuda", dtype=torch.float16)
  - `compute_svd`, `project_onto_subspace`, `reconstruct_from_subspace`
  - `gram_schmidt`, `explained_variance_ratio`, `select_num_components`
  - `incremental_svd_update`
+ - `nf4_quantize_dequantize`, `nf4_pack`, `nf4_unpack` — NF4 quantization (QLoRA)

  ## Benchmarks — Real-World Adapters

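The packed storage described in the QLoRA Support section above stores two 4-bit code-book indices per byte. A minimal sketch of that packing, assuming quantization has already produced integer indices in 0..15 (the `_sketch` names are hypothetical, not the package's `nf4_pack`/`nf4_unpack`):

```python
import torch

def nf4_pack_sketch(idx: torch.Tensor) -> torch.Tensor:
    """Pack 4-bit indices (values 0..15) two per byte into a uint8 tensor."""
    flat = idx.to(torch.uint8).flatten()
    if flat.numel() % 2:                        # pad to an even count of nibbles
        flat = torch.cat([flat, flat.new_zeros(1)])
    pairs = flat.view(-1, 2)
    return (pairs[:, 0] << 4) | pairs[:, 1]     # high nibble first

def nf4_unpack_sketch(packed: torch.Tensor, numel: int) -> torch.Tensor:
    """Invert nf4_pack_sketch; numel trims the possible padding nibble."""
    high = (packed >> 4) & 0x0F
    low = packed & 0x0F
    return torch.stack([high, low], dim=1).flatten()[:numel]

idx = torch.randint(0, 16, (1001,))
assert torch.equal(nf4_unpack_sketch(nf4_pack_sketch(idx), 1001), idx.to(torch.uint8))
```

The ~7x disk figure is consistent with float32 components dropping to 4 bits (a nominal 8x) minus per-block scales and metadata.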
@@ -3,10 +3,10 @@
  </p>

  <p align="center">
- <strong>Shared low-rank subspaces for efficient LoRA adapter management.</strong>
+ <strong>Various LoRA adapters. One shared basis.</strong>
  </p>

- Based on the [Share paper](https://arxiv.org/abs/2602.06043): LoRA adapters across tasks share a common low-rank subspace. Instead of storing *N* separate adapters, maintain **one shared basis** and **per-task coefficient vectors**achieving up to 122× compression at scale.
+ Your adapters share more structure than you think. vLoRA finds the common basis and stores each adapter as a tiny coefficient vector — up to 122× compression at scale. Based on the [Share paper](https://arxiv.org/abs/2602.06043).

  ## Install

@@ -16,7 +16,7 @@ pip install vlora-dev

  Or from source:
  ```bash
- git clone https://github.com/tveseli/vlora.git
+ git clone https://github.com/vlora-dev/vlora.git
  cd vlora
  pip install -e ".[dev]"
  ```
@@ -101,6 +101,78 @@ output = model(input_ids)
  print(model.available_tasks) # ["task_0", "task_1", ...]
  ```

+ ## QLoRA Support
+
+ vLoRA has first-class support for [QLoRA](https://arxiv.org/abs/2305.14314) workflows. QLoRA compresses the **base model** (FP16 → 4-bit NF4), while vLoRA compresses the **adapter space** — these are orthogonal and stack multiplicatively.
+
+ ### NF4 Quantization
+
+ Quantize subspace components using the same NF4 data type from QLoRA — 16 quantile levels optimized for normally-distributed weights:
+
+ ```python
+ # NF4 quantization (better than symmetric int4 for normal-ish weights)
+ subspace.quantize(method="nf4")
+
+ # With double quantization (quantize the per-block scales too)
+ subspace.quantize(method="nf4", double_quant=True)
+
+ # Also quantize loadings (effective when loadings are approximately normal)
+ subspace.quantize(method="nf4", quantize_loadings=True)
+ ```
+
+ ### Packed NF4 Storage
+
+ Save the subspace in packed 4-bit format for ~7× disk savings:
+
+ ```python
+ # Save: packs components as uint8 (two 4-bit values per byte)
+ subspace.save_quantized("shared_subspace/")
+
+ # Load: auto-detects format, dequantizes on the fly
+ subspace = SharedSubspace.load("shared_subspace/")
+ ```
+
+ ### QLoRA Base Model
+
+ `VLoRAModel` works with quantized base models loaded via bitsandbytes:
+
+ ```python
+ import torch
+ from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+ from vlora import VLoRAModel, SharedSubspace
+
+ # Load 4-bit base model
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16,
+ )
+ base_model = AutoModelForCausalLM.from_pretrained("model-name", quantization_config=bnb_config)
+
+ # Wrap with vLoRA — compute_dtype ensures LoRA math runs in BF16
+ subspace = SharedSubspace.load("shared_subspace/")
+ model = VLoRAModel(base_model, subspace, compute_dtype=torch.bfloat16)
+
+ print(model.qlora_info) # {'quantized': True, 'method': 'nf4', ...}
+ model.set_task("task_0")
+ output = model(input_ids)
+ ```
+
+ ### Full-Stack Compression
+
+ Report combined savings across base model quantization and adapter compression:
+
+ ```python
+ stats = subspace.full_stack_compression(
+     base_model_params=7_000_000_000, # 7B model
+     base_model_bits=16, # original FP16
+     quantized_bits=4, # QLoRA NF4
+ )
+ # → {'total_compression_ratio': 4.0, 'total_original_bytes': 14.0 GB, ...}
+ ```
+
+ See [`examples/qlora_pipeline.py`](examples/qlora_pipeline.py) for a complete end-to-end example.
+
  ## Training in the Subspace

  Train only the loadings vector (k params per layer) instead of full LoRA matrices — 100×+ parameter reduction:
@@ -183,8 +255,10 @@ merged = dare_merge(adapters, drop_rate=0.5, seed=42)
  # Adaptive k: different components per layer based on explained variance
  subspace = SharedSubspace.from_adapters(adapters, adaptive_k=True, variance_threshold=0.9)

- # Quantize components for smaller memory footprint
- subspace.quantize(bits=8) # or bits=4
+ # Quantize components: symmetric (int8/int4) or NF4
+ subspace.quantize(bits=8) # symmetric int8
+ subspace.quantize(method="nf4") # NF4 4-bit (better for normal weights)
+ subspace.quantize(method="nf4", double_quant=True) # + quantize the scales

  # Check compression stats
  stats = subspace.compression_stats()
@@ -231,14 +305,16 @@ subspace.to(device="cuda", dtype=torch.float16)
  - `.absorb(adapter, task_id)` — Incorporate + recompute (full SVD)
  - `.absorb_incremental(adapter, task_id)` — Fast incremental update
  - `.get_trainable_params(task_id)` — For training integration
- - `.quantize(bits=8)` — Quantize components (int8/int4)
+ - `.quantize(bits=8, method="symmetric")` — Quantize components (int8/int4/NF4)
  - `.compression_stats()` — Compression ratio and parameter counts
+ - `.full_stack_compression(base_model_params)` — Combined base + adapter stats
  - `.to(device, dtype)` — Move tensors to device/dtype
- - `.save(path)` / `.load(path)` — Serialization
+ - `.save(path)` / `.save_quantized(path)` / `.load(path)` — Serialization (NF4-packed auto-detected)

  ### Model Integration

- - **`VLoRAModel(base_model, subspace, lora_alpha=None)`** — Inference wrapper with forward hooks
+ - **`VLoRAModel(base_model, subspace, lora_alpha=None, compute_dtype=None)`** — Inference wrapper with forward hooks
+ - `.qlora_info` — Base model quantization metadata
  - `.set_task(task_id)` — Switch adapter (cached)
  - `.clear_task()` — Remove adapter
  - `.available_tasks` — List task IDs
@@ -289,6 +365,7 @@ subspace.to(device="cuda", dtype=torch.float16)
  - `compute_svd`, `project_onto_subspace`, `reconstruct_from_subspace`
  - `gram_schmidt`, `explained_variance_ratio`, `select_num_components`
  - `incremental_svd_update`
+ - `nf4_quantize_dequantize`, `nf4_pack`, `nf4_unpack` — NF4 quantization (QLoRA)

  ## Benchmarks — Real-World Adapters

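The `total_compression_ratio` of 4.0 shown in the Full-Stack Compression snippet is dominated by the base model. A back-of-envelope check, assuming the stats simply compare total bytes before and after; the adapter fleet below (10 adapters of 20M parameters) is illustrative, not from the package:

```python
base_params = 7_000_000_000
base_fp16 = base_params * 16 / 8          # 14.0 GB of FP16 weights
base_nf4 = base_params * 4 / 8            # 3.5 GB after NF4 (scale overhead ignored)

adapters_fp32 = 10 * 20_000_000 * 4       # 10 adapters x 20M params x 4 bytes
subspace_bytes = adapters_fp32 / 122      # the headline 122x adapter compression

ratio = (base_fp16 + adapters_fp32) / (base_nf4 + subspace_bytes)
print(f"{ratio:.1f}x")                    # ~4.2x; with zero adapters it is exactly 16/4 = 4.0
```

The adapter term grows linearly with fleet size in the numerator but barely moves the denominator, which is where vLoRA's contribution shows up at scale.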
@@ -1,8 +1,8 @@
  # vlora

- **Shared low-rank subspaces for efficient LoRA adapter management.**
+ **Various LoRA adapters. One shared basis.**

- Based on the [Share paper](https://arxiv.org/abs/2602.06043): LoRA adapters across tasks share a common low-rank subspace. Instead of storing *N* separate adapters, maintain **one shared basis** and **per-task coefficient vectors**achieving up to 122× compression at scale.
+ Your adapters share more structure than you think. vLoRA finds the common basis and stores each adapter as a tiny coefficient vector — up to 122× compression at scale. Based on the [Share paper](https://arxiv.org/abs/2602.06043).

  ## Install

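The `absorb_incremental` fix noted in the changelog is worth seeing concretely, since it is plain linear algebra. A sketch with made-up shapes: padding the loadings is only valid when the old basis is a prefix of the new one, while re-projection stays correct under any rotation of the basis.

```python
import torch

D, k_old, k_new = 4096, 8, 10
U_old = torch.linalg.qr(torch.randn(D, k_old)).Q     # old orthonormal basis
c_old = torch.randn(k_old)                           # an existing task's loadings

# Simulate an update that rotates the old directions and adds two new ones.
rot = torch.linalg.qr(torch.randn(k_old, k_old)).Q
U_new = torch.linalg.qr(
    torch.cat([U_old @ rot, torch.randn(D, k_new - k_old)], dim=1)
).Q                                                  # updated basis (D, k_new)

w = U_old @ c_old                                    # reconstruct from the old basis

# Pre-0.3.0 behaviour: pad the loadings to the new width (wrong once the basis rotates).
c_padded = torch.nn.functional.pad(c_old, (0, k_new - k_old))
# 0.3.0 fix: project the reconstruction onto the updated basis.
c_new = U_new.T @ w

print((U_new @ c_padded - w).norm())  # large: padding ignored the rotation
print((U_new @ c_new - w).norm())     # ~0: w lies in the updated span
```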
@@ -0,0 +1,154 @@
+ """QLoRA + vLoRA: End-to-end pipeline for efficient multi-adapter serving.
+
+ This example shows the full workflow:
+ 1. Load a QLoRA-quantized base model (4-bit NF4)
+ 2. Load multiple LoRA adapters (produced by QLoRA fine-tuning)
+ 3. Build a shared subspace with NF4 quantization
+ 4. Serve with instant task switching via VLoRAModel
+
+ Requirements:
+     pip install vlora-dev[hub] transformers bitsandbytes accelerate
+
+ The pipeline combines two orthogonal compression techniques:
+ - QLoRA: compresses the base model (FP16 -> NF4, ~4x savings)
+ - vLoRA: compresses the adapter space (N adapters -> shared subspace, ~122x)
+ Together they enable serving hundreds of task-specific adapters on a single GPU.
+ """
+
+ from __future__ import annotations
+
+ import torch
+
+ # ── Step 0: Configuration ──────────────────────────────────────────────
+ BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Small model for demo
+ ADAPTER_REPOS = [
+     # Replace with your QLoRA adapter repos from HuggingFace Hub
+     # "username/adapter-task-a",
+     # "username/adapter-task-b",
+ ]
+ NUM_COMPONENTS = 4  # Subspace dimension
+ USE_NF4_STORAGE = True  # Save subspace in packed NF4 format
+
+
+ def main():
+     # ── Step 1: Load QLoRA base model ──────────────────────────────────
+     # In production, load with 4-bit quantization:
+     #
+     # from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+     # bnb_config = BitsAndBytesConfig(
+     #     load_in_4bit=True,
+     #     bnb_4bit_quant_type="nf4",
+     #     bnb_4bit_compute_dtype=torch.bfloat16,
+     # )
+     # base_model = AutoModelForCausalLM.from_pretrained(
+     #     BASE_MODEL, quantization_config=bnb_config
+     # )
+     #
+     # For this demo, we simulate with synthetic data:
+     print("=== QLoRA + vLoRA Pipeline Demo ===\n")
+
+     # ── Step 2: Load adapters ──────────────────────────────────────────
+     from vlora import LoRAWeights, SharedSubspace, VLoRAModel
+
+     print("Creating synthetic adapters (replace with load_adapter_from_hub)...")
+     layers = [
+         "model.layers.0.self_attn.q_proj",
+         "model.layers.0.self_attn.v_proj",
+         "model.layers.1.self_attn.q_proj",
+         "model.layers.1.self_attn.v_proj",
+     ]
+     rank = 8
+     dim = 512
+     n_adapters = 10
+
+     # Create correlated adapters (simulates real LoRA adapters sharing structure)
+     torch.manual_seed(42)
+     shared_basis = {l: torch.randn(5, rank * dim) for l in layers}
+     adapters = []
+     task_ids = []
+     for i in range(n_adapters):
+         lora_a = {l: (torch.randn(5) @ shared_basis[l]).reshape(rank, dim) for l in layers}
+         lora_b = {l: torch.randn(dim, rank) * 0.01 for l in layers}
+         adapters.append(LoRAWeights(layer_names=layers, lora_a=lora_a, lora_b=lora_b, rank=rank))
+         task_ids.append(f"task_{i}")
+     print(f" Loaded {n_adapters} adapters, rank={rank}, {len(layers)} layers\n")
+
+     # ── Step 3: Build shared subspace ──────────────────────────────────
+     print("Building shared subspace...")
+     subspace = SharedSubspace.from_adapters(
+         adapters,
+         task_ids=task_ids,
+         num_components=NUM_COMPONENTS,
+     )
+
+     stats = subspace.compression_stats()
+     print(f" Components: {subspace.num_components}")
+     print(f" Compression: {stats['compression_ratio']:.1f}x")
+     print(f" Original params: {stats['total_params_original']:,}")
+     print(f" Compressed params: {stats['total_params_compressed']:,}\n")
+
+     # ── Step 4: Apply NF4 quantization to subspace ─────────────────────
+     print("Quantizing subspace with NF4...")
+     subspace.quantize(method="nf4", quantize_loadings=True)
+     print(" Done (components + loadings quantized)\n")
+
+     # ── Step 5: Save with packed NF4 storage ───────────────────────────
+     import tempfile
+     from pathlib import Path
+
+     save_dir = Path(tempfile.mkdtemp()) / "subspace"
+
+     if USE_NF4_STORAGE:
+         print("Saving with NF4-packed format...")
+         subspace.save_quantized(save_dir)
+     else:
+         print("Saving with float32 format...")
+         subspace.save(save_dir)
+
+     # Compare file sizes
+     total_bytes = sum(f.stat().st_size for f in save_dir.rglob("*") if f.is_file())
+     print(f" Saved to: {save_dir}")
+     print(f" Total size: {total_bytes / 1024:.1f} KB\n")
+
+     # ── Step 6: Load and serve ─────────────────────────────────────────
+     print("Loading subspace (auto-detects format)...")
+     loaded = SharedSubspace.load(save_dir)
+     print(f" {loaded!r}\n")
+
+     # Full-stack compression stats (with hypothetical QLoRA base model)
+     full_stats = loaded.full_stack_compression(
+         base_model_params=1_100_000_000,  # TinyLlama 1.1B
+         base_model_bits=16,
+         quantized_bits=4,
+     )
+     if "total_compression_ratio" in full_stats:
+         print("Full-stack compression (QLoRA base + vLoRA adapters):")
+         print(f" Base model: {full_stats['base_model']['compression_ratio']:.1f}x (FP16->NF4)")
+         print(f" Adapters: {stats['compression_ratio']:.1f}x ({n_adapters} adapters)")
+         print(f" Total: {full_stats['total_original_bytes']/1e9:.1f} GB -> "
+               f"{full_stats['total_compressed_bytes']/1e9:.2f} GB")
+         print(f" Combined: {full_stats['total_compression_ratio']:.1f}x\n")
+
+     # In production with a real base model:
+     #
+     # model = VLoRAModel(base_model, loaded, compute_dtype=torch.bfloat16)
+     # print(f"QLoRA info: {model.qlora_info}")
+     #
+     # # Instant task switching
+     # model.set_task("task_0")
+     # output = model(input_ids)
+     #
+     # model.set_task("task_5")  # microseconds to switch
+     # output = model(input_ids)
+
+     # Demonstrate reconstruction
+     print("Reconstructing adapters from subspace...")
+     for tid in ["task_0", "task_5", "task_9"]:
+         recon = loaded.reconstruct(tid)
+         print(f" {tid}: {recon!r}")
+
+     print("\nDone!")
+
+
+ if __name__ == "__main__":
+     main()
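The double-quantization figure in the changelog (scale overhead falling from 0.5 to ~0.127 bits per parameter) checks out with quick arithmetic, assuming QLoRA's defaults of one scale per 64-element block and a second-level FP32 scale per 256 blocks:

```python
block = 64
fp32_scales = 32 / block               # 0.5 bits/param without double quantization
fp8_scales = 8 / block                 # 0.125 bits/param once scales are FP8
second_level = 32 / (block * 256)      # ~0.002 bits/param of residual FP32 metadata
print(fp8_scales + second_level)       # 0.12695..., i.e. the ~0.127 in the changelog
```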
Binary file (icon.png)
Binary file (logo.png)
@@ -4,8 +4,8 @@ build-backend = "hatchling.build"

  [project]
  name = "vlora-dev"
- version = "0.2.1"
- description = "Shared low-rank subspaces for efficient LoRA adapter management"
+ version = "0.3.0"
+ description = "Various LoRA adapters. One shared basis. Up to 122x compression at scale."
  readme = "README.md"
  license = "Apache-2.0"
  requires-python = ">=3.9"
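The TIES normalization fix in the changelog concerns the merge's final averaging step. Below is a generic TIES-style disjoint mean showing where the corrected `1 / contributor_count` lands; this is a sketch, not vlora's `ties_merge`, and the trimming details (`keep_frac`, per-task top-k by magnitude) are assumptions:

```python
import torch

def ties_disjoint_mean_sketch(deltas: list, keep_frac: float = 0.2) -> torch.Tensor:
    """Trim, elect a sign per element, then average the agreeing survivors."""
    stacked = torch.stack(deltas)                       # (n, ...) task vectors
    n_el = stacked[0].numel()
    k = max(1, int(keep_frac * n_el))
    # Trim: keep each task's top-k magnitudes, zero the rest.
    thresh = stacked.abs().flatten(1).kthvalue(n_el - k + 1, dim=1).values
    mask = stacked.abs() >= thresh.view(-1, *([1] * (stacked.dim() - 1)))
    trimmed = stacked * mask
    # Elect the majority sign per element; only agreeing values contribute.
    sign = torch.sign(trimmed.sum(dim=0))
    agree = (torch.sign(trimmed) == sign) & mask
    contributors = agree.sum(dim=0).clamp_min(1)
    # The 0.3.0 fix: normalize by 1 / contributor_count (a true per-element mean).
    # The previous n / contributor_count factor over-scaled elements where
    # trimming removed most contributors, which is the bug the changelog fixes.
    return (trimmed * agree).sum(dim=0) / contributors

merged = ties_disjoint_mean_sketch([torch.randn(100) for _ in range(4)])
```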
@@ -5,13 +5,17 @@ share a common low-rank subspace. Instead of storing N separate adapters,
  maintain one shared basis and per-task coefficient vectors.
  """

- __version__ = "0.1.0"
+ __version__ = "0.3.0"

  from vlora.io import LoRAWeights, load_adapter, load_adapter_from_hub, save_adapter
  from vlora.ops import (
+     NF4_QUANT_TABLE,
      compute_svd,
      explained_variance_ratio,
      gram_schmidt,
+     nf4_pack,
+     nf4_quantize_dequantize,
+     nf4_unpack,
      project_onto_subspace,
      reconstruct_from_subspace,
      select_num_components,
@@ -51,6 +55,11 @@ __all__ = [
      "gram_schmidt",
      "explained_variance_ratio",
      "select_num_components",
+     # NF4 quantization (QLoRA-style)
+     "NF4_QUANT_TABLE",
+     "nf4_quantize_dequantize",
+     "nf4_pack",
+     "nf4_unpack",
      # Analysis
      "compute_similarity_matrix",
      "find_clusters",