vespaembed 0.0.1__tar.gz → 0.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. vespaembed-0.0.2/PKG-INFO +325 -0
  2. vespaembed-0.0.2/README.md +285 -0
  3. {vespaembed-0.0.1 → vespaembed-0.0.2}/pyproject.toml +23 -9
  4. vespaembed-0.0.2/src/vespaembed/__init__.py +1 -0
  5. vespaembed-0.0.2/src/vespaembed/cli/__init__.py +17 -0
  6. vespaembed-0.0.2/src/vespaembed/cli/commands/__init__.py +7 -0
  7. vespaembed-0.0.2/src/vespaembed/cli/commands/evaluate.py +85 -0
  8. vespaembed-0.0.2/src/vespaembed/cli/commands/export.py +86 -0
  9. vespaembed-0.0.2/src/vespaembed/cli/commands/info.py +52 -0
  10. vespaembed-0.0.2/src/vespaembed/cli/commands/serve.py +49 -0
  11. vespaembed-0.0.2/src/vespaembed/cli/commands/train.py +267 -0
  12. vespaembed-0.0.2/src/vespaembed/cli/vespaembed.py +55 -0
  13. vespaembed-0.0.2/src/vespaembed/core/__init__.py +2 -0
  14. vespaembed-0.0.2/src/vespaembed/core/config.py +164 -0
  15. vespaembed-0.0.2/src/vespaembed/core/registry.py +158 -0
  16. vespaembed-0.0.2/src/vespaembed/core/trainer.py +573 -0
  17. vespaembed-0.0.2/src/vespaembed/datasets/__init__.py +3 -0
  18. vespaembed-0.0.2/src/vespaembed/datasets/formats/__init__.py +5 -0
  19. vespaembed-0.0.2/src/vespaembed/datasets/formats/csv.py +15 -0
  20. vespaembed-0.0.2/src/vespaembed/datasets/formats/huggingface.py +34 -0
  21. vespaembed-0.0.2/src/vespaembed/datasets/formats/jsonl.py +26 -0
  22. vespaembed-0.0.2/src/vespaembed/datasets/loader.py +80 -0
  23. vespaembed-0.0.2/src/vespaembed/db.py +176 -0
  24. vespaembed-0.0.2/src/vespaembed/enums.py +58 -0
  25. vespaembed-0.0.2/src/vespaembed/evaluation/__init__.py +3 -0
  26. vespaembed-0.0.2/src/vespaembed/evaluation/factory.py +86 -0
  27. vespaembed-0.0.2/src/vespaembed/models/__init__.py +4 -0
  28. vespaembed-0.0.2/src/vespaembed/models/export.py +89 -0
  29. vespaembed-0.0.2/src/vespaembed/models/loader.py +25 -0
  30. vespaembed-0.0.2/src/vespaembed/static/css/styles.css +1800 -0
  31. vespaembed-0.0.2/src/vespaembed/static/js/app.js +1485 -0
  32. vespaembed-0.0.2/src/vespaembed/tasks/__init__.py +23 -0
  33. vespaembed-0.0.2/src/vespaembed/tasks/base.py +144 -0
  34. vespaembed-0.0.2/src/vespaembed/tasks/pairs.py +91 -0
  35. vespaembed-0.0.2/src/vespaembed/tasks/similarity.py +84 -0
  36. vespaembed-0.0.2/src/vespaembed/tasks/triplets.py +90 -0
  37. vespaembed-0.0.2/src/vespaembed/tasks/tsdae.py +102 -0
  38. vespaembed-0.0.2/src/vespaembed/templates/index.html +544 -0
  39. vespaembed-0.0.2/src/vespaembed/utils/__init__.py +3 -0
  40. vespaembed-0.0.2/src/vespaembed/utils/logging.py +69 -0
  41. vespaembed-0.0.2/src/vespaembed/web/__init__.py +1 -0
  42. vespaembed-0.0.2/src/vespaembed/web/api/__init__.py +1 -0
  43. vespaembed-0.0.2/src/vespaembed/web/app.py +605 -0
  44. vespaembed-0.0.2/src/vespaembed/worker.py +313 -0
  45. vespaembed-0.0.2/src/vespaembed.egg-info/PKG-INFO +325 -0
  46. vespaembed-0.0.2/src/vespaembed.egg-info/SOURCES.txt +57 -0
  47. vespaembed-0.0.2/src/vespaembed.egg-info/requires.txt +31 -0
  48. vespaembed-0.0.2/tests/test_api.py +343 -0
  49. vespaembed-0.0.2/tests/test_cli.py +113 -0
  50. vespaembed-0.0.2/tests/test_config.py +350 -0
  51. vespaembed-0.0.2/tests/test_db.py +163 -0
  52. vespaembed-0.0.2/tests/test_e2e.py +526 -0
  53. vespaembed-0.0.2/tests/test_registry.py +362 -0
  54. vespaembed-0.0.2/tests/test_trainer.py +174 -0
  55. vespaembed-0.0.1/PKG-INFO +0 -20
  56. vespaembed-0.0.1/README.md +0 -0
  57. vespaembed-0.0.1/src/vespaembed/__init__.py +0 -1
  58. vespaembed-0.0.1/src/vespaembed.egg-info/PKG-INFO +0 -20
  59. vespaembed-0.0.1/src/vespaembed.egg-info/SOURCES.txt +0 -10
  60. vespaembed-0.0.1/src/vespaembed.egg-info/requires.txt +0 -11
  61. {vespaembed-0.0.1 → vespaembed-0.0.2}/LICENSE +0 -0
  62. {vespaembed-0.0.1 → vespaembed-0.0.2}/setup.cfg +0 -0
  63. {vespaembed-0.0.1 → vespaembed-0.0.2}/src/vespaembed.egg-info/dependency_links.txt +0 -0
  64. {vespaembed-0.0.1 → vespaembed-0.0.2}/src/vespaembed.egg-info/entry_points.txt +0 -0
  65. {vespaembed-0.0.1 → vespaembed-0.0.2}/src/vespaembed.egg-info/top_level.txt +0 -0
@@ -0,0 +1,325 @@
1
+ Metadata-Version: 2.4
2
+ Name: vespaembed
3
+ Version: 0.0.2
4
+ Summary: vespaembed: no-code training for embedding models
5
+ Author: Abhishek Thakur
6
+ License: Apache 2.0
7
+ Project-URL: Homepage, https://github.com/vespaai-playground/vespaembed
8
+ Requires-Python: >=3.11
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: sentence-transformers>=3.0.0
12
+ Requires-Dist: transformers>=4.40.0
13
+ Requires-Dist: accelerate>=0.26.0
14
+ Requires-Dist: torch>=2.0.0
15
+ Requires-Dist: datasets>=2.18.0
16
+ Requires-Dist: pandas>=2.0.0
17
+ Requires-Dist: pydantic>=2.0.0
18
+ Requires-Dist: rich>=13.0.0
19
+ Requires-Dist: fastapi>=0.111.0
20
+ Requires-Dist: uvicorn>=0.30.0
21
+ Requires-Dist: python-multipart>=0.0.9
22
+ Requires-Dist: websockets>=12.0
23
+ Requires-Dist: jinja2>=3.1.0
24
+ Requires-Dist: pyyaml>=6.0.0
25
+ Requires-Dist: tensorboard>=2.15.0
26
+ Requires-Dist: peft>=0.18.1
27
+ Requires-Dist: unsloth>=2026.1.4
28
+ Provides-Extra: unsloth
29
+ Requires-Dist: unsloth; extra == "unsloth"
30
+ Provides-Extra: onnx
31
+ Requires-Dist: onnx>=1.14.0; extra == "onnx"
32
+ Requires-Dist: onnxruntime>=1.23.2; extra == "onnx"
33
+ Provides-Extra: dev
34
+ Requires-Dist: black==26.1.0; extra == "dev"
35
+ Requires-Dist: isort==7.0.0; extra == "dev"
36
+ Requires-Dist: flake8==7.3.0; extra == "dev"
37
+ Requires-Dist: pytest>=9.0.2; extra == "dev"
38
+ Requires-Dist: pytest-cov>=7.0.0; extra == "dev"
39
+ Dynamic: license-file
40
+
41
+ # VespaEmbed
42
+
43
+ No-code training for embedding models. Train custom embedding models with a web UI or CLI.
44
+
45
+ ## Features
46
+
47
+ - **Web UI** - Visual interface for configuring and monitoring training
48
+ - **CLI** - Command-line interface for scripting and automation
49
+ - **Multiple Tasks** - Support for pairs, triplets, similarity scoring, and unsupervised learning
50
+ - **Loss Variants** - Choose from multiple loss functions per task
51
+ - **Matryoshka Embeddings** - Train multi-dimensional embeddings for flexible retrieval
52
+ - **LoRA Support** - Parameter-efficient fine-tuning with LoRA adapters
53
+ - **Unsloth Integration** - Faster training with Unsloth optimizations
54
+ - **HuggingFace Integration** - Load datasets and models from the HuggingFace Hub, and push trained models to the Hub
55
+
56
+ ## Installation
57
+
58
+ > **Note:** VespaEmbed is in an experimental phase. Install from source.
59
+
60
+ ```bash
61
+ git clone https://github.com/vespaai-playground/vespaembed.git
62
+ cd vespaembed
63
+ uv sync
64
+ ```
65
+
66
+ ### Optional Dependencies
67
+
68
+ ```bash
69
+ # For Unsloth acceleration (requires NVIDIA/AMD GPU)
70
+ uv sync --extra unsloth
71
+
72
+ # For ONNX export
73
+ uv sync --extra onnx
74
+
75
+ # For development
76
+ uv sync --extra dev
77
+ ```
78
+
79
+ ## Quick Start
80
+
81
+ ### Web UI
82
+
83
+ Launch the web interface:
84
+
85
+ ```bash
86
+ vespaembed
87
+ ```
88
+
89
+ Open http://localhost:8000 in your browser. The UI lets you:
90
+ - Upload training data (CSV or JSONL)
91
+ - Select task type and base model
92
+ - Configure hyperparameters
93
+ - Monitor training progress
94
+ - Download trained models
95
+
96
+ ### CLI
97
+
98
+ Train a model from the command line:
99
+
100
+ ```bash
101
+ vespaembed train \
102
+ --data examples/data/pairs.csv \
103
+ --task pairs \
104
+ --base-model sentence-transformers/all-MiniLM-L6-v2 \
105
+ --epochs 3
106
+ ```
107
+
108
+ Or use a YAML config file:
109
+
110
+ ```bash
111
+ vespaembed train --config config.yaml
112
+ ```
113
+
114
+ ## Tasks
115
+
116
+ VespaEmbed supports 4 training tasks based on your data format:
117
+
118
+ ### Pairs
119
+
120
+ Text pairs for semantic search. Use when you have query-document pairs without explicit negatives.
121
+
122
+ **Data format:**
123
+ ```csv
124
+ anchor,positive
125
+ What is machine learning?,Machine learning is a subset of AI...
126
+ How does photosynthesis work?,Photosynthesis converts sunlight...
127
+ ```
128
+
129
+ **Loss variants:** `mnr` (default), `mnr_symmetric`, `gist`, `cached_mnr`, `cached_gist`
130
+
131
+ ### Triplets
132
+
133
+ Text triplets with hard negatives. Use when you have explicit negative examples.
134
+
135
+ **Data format:**
136
+ ```csv
137
+ anchor,positive,negative
138
+ What is Python?,Python is a programming language...,A python is a large snake...
139
+ ```
140
+
141
+ **Loss variants:** `mnr` (default), `mnr_symmetric`, `gist`, `cached_mnr`, `cached_gist`
142
+
143
+ ### Similarity
144
+
145
+ Text pairs with similarity scores (STS-style). Use when you have continuous similarity labels.
146
+
147
+ **Data format:**
148
+ ```csv
149
+ sentence1,sentence2,score
150
+ A man is playing guitar,A person plays music,0.85
151
+ The cat is sleeping,A dog is running,0.12
152
+ ```
153
+
154
+ **Loss variants:** `cosine` (default), `cosent`, `angle`
155
+
156
+ ### TSDAE
157
+
158
+ Unsupervised learning with a denoising auto-encoder. Use when you only have unlabeled text for domain adaptation.
159
+
160
+ **Data format:**
161
+ ```csv
162
+ text
163
+ Machine learning is transforming how we analyze data.
164
+ Natural language processing enables computers to understand human language.
165
+ ```
166
+
167
+ ## Configuration
168
+
169
+ ### CLI Arguments
170
+
171
+ ```bash
172
+ vespaembed train \
173
+ --data <path> # Training data (CSV, JSONL, or HF dataset)
174
+ --task <task> # Task type: pairs, triplets, similarity, tsdae
175
+ --base-model <model> # Base model name or path
176
+ --project <name> # Project name (optional)
177
+ --eval-data <path> # Evaluation data (optional)
178
+ --epochs <n> # Number of epochs (default: 3)
179
+ --batch-size <n> # Batch size (default: 32)
180
+ --learning-rate <lr> # Learning rate (default: 2e-5)
181
+ --optimizer <opt> # Optimizer (default: adamw_torch)
182
+ --scheduler <sched> # LR scheduler (default: linear)
183
+ --matryoshka # Enable Matryoshka embeddings
184
+ --matryoshka-dims <dims> # Dimensions (default: 768,512,256,128,64)
185
+ --unsloth # Use Unsloth for faster training
186
+ --subset <name> # HuggingFace dataset subset
187
+ --split <name> # HuggingFace dataset split
188
+ ```
189
+
190
+ ### Optimizers
191
+
192
+ | Option | Description |
193
+ |--------|-------------|
194
+ | `adamw_torch` | AdamW (default) |
195
+ | `adamw_torch_fused` | Fused AdamW (faster on CUDA) |
196
+ | `adamw_8bit` | 8-bit AdamW (memory efficient) |
197
+ | `adafactor` | Adafactor (memory efficient, no momentum) |
198
+ | `sgd` | SGD with momentum |
199
+
200
+ ### Schedulers
201
+
202
+ | Option | Description |
203
+ |--------|-------------|
204
+ | `linear` | Linear decay (default) |
205
+ | `cosine` | Cosine annealing |
206
+ | `cosine_with_restarts` | Cosine with warm restarts |
207
+ | `constant` | Constant learning rate |
208
+ | `constant_with_warmup` | Constant after warmup |
209
+ | `polynomial` | Polynomial decay |
210
+
211
+ ### YAML Configuration
212
+
213
+ ```yaml
214
+ task: pairs
215
+ base_model: sentence-transformers/all-MiniLM-L6-v2
216
+
217
+ data:
218
+ train: train.csv
219
+ eval: eval.csv # optional
220
+
221
+ training:
222
+ epochs: 3
223
+ batch_size: 32
224
+ learning_rate: 2e-5
225
+ warmup_ratio: 0.1
226
+ weight_decay: 0.01
227
+ fp16: true
228
+ eval_steps: 500
229
+ save_steps: 500
230
+ logging_steps: 100
231
+ optimizer: adamw_torch # adamw_torch, adamw_8bit, adafactor, sgd
232
+ scheduler: linear # linear, cosine, constant, polynomial
233
+
234
+ output:
235
+ dir: ./output
236
+ push_to_hub: false
237
+ hf_username: null
238
+
239
+ # Optional: LoRA configuration
240
+ lora:
241
+ enabled: false
242
+ r: 64
243
+ alpha: 128
244
+ dropout: 0.1
245
+ target_modules: [query, key, value, dense]
246
+
247
+ # Optional: Matryoshka dimensions
248
+ matryoshka_dims: [768, 512, 256, 128, 64]
249
+
250
+ # Optional: Loss variant (uses task default if not specified)
251
+ loss_variant: mnr
252
+ ```
253
+
254
+ ### HuggingFace Datasets
255
+
256
+ Load datasets directly from HuggingFace Hub:
257
+
258
+ ```bash
259
+ vespaembed train \
260
+ --data sentence-transformers/all-nli \
261
+ --subset triplet \
262
+ --split train \
263
+ --task triplets \
264
+ --base-model sentence-transformers/all-MiniLM-L6-v2
265
+ ```
266
+
267
+ ## CLI Commands
268
+
269
+ | Command | Description |
270
+ |---------|-------------|
271
+ | `vespaembed` | Launch web UI (default) |
272
+ | `vespaembed serve` | Launch web UI |
273
+ | `vespaembed train` | Train a model |
274
+ | `vespaembed evaluate` | Evaluate a model |
275
+ | `vespaembed export` | Export model to ONNX |
276
+ | `vespaembed info` | Show task information |
277
+
278
+ ## Output
279
+
280
+ Trained models are saved to `~/.vespaembed/projects/<project-name>/`:
281
+
282
+ ```
283
+ ~/.vespaembed/projects/my-project/
284
+ ├── final/ # Final trained model
285
+ ├── checkpoint-500/ # Training checkpoints
286
+ ├── checkpoint-1000/
287
+ └── logs/ # TensorBoard logs
288
+ ```
289
+
290
+ ## Column Aliases
291
+
292
+ VespaEmbed automatically recognizes common column name variations:
293
+
294
+ | Task | Expected | Also Accepts |
295
+ |------|----------|--------------|
296
+ | pairs | `anchor` | `query`, `question`, `sent1`, `sentence1`, `text1` |
297
+ | pairs | `positive` | `document`, `answer`, `pos`, `sent2`, `sentence2`, `text2` |
298
+ | triplets | `negative` | `neg`, `hard_negative`, `sent3`, `sentence3`, `text3` |
299
+ | similarity | `sentence1` | `sent1`, `text1`, `anchor`, `query` |
300
+ | similarity | `sentence2` | `sent2`, `text2`, `positive`, `document` |
301
+ | similarity | `score` | `similarity`, `label`, `sim_score` |
302
+ | tsdae | `text` | `sentence`, `sentences`, `content`, `input` |
303
+
304
+ ## Development
305
+
306
+ ```bash
307
+ # Install dev dependencies
308
+ uv sync --extra dev
309
+
310
+ # Run tests
311
+ uv run pytest tests/
312
+
313
+ # Run tests with coverage
314
+ uv run pytest tests/ --cov=vespaembed
315
+
316
+ # Format code
317
+ make format
318
+
319
+ # Lint
320
+ make lint
321
+ ```
322
+
323
+ ## License
324
+
325
+ Apache 2.0
@@ -0,0 +1,285 @@
1
+ # VespaEmbed
2
+
3
+ No-code training for embedding models. Train custom embedding models with a web UI or CLI.
4
+
5
+ ## Features
6
+
7
+ - **Web UI** - Visual interface for configuring and monitoring training
8
+ - **CLI** - Command-line interface for scripting and automation
9
+ - **Multiple Tasks** - Support for pairs, triplets, similarity scoring, and unsupervised learning
10
+ - **Loss Variants** - Choose from multiple loss functions per task
11
+ - **Matryoshka Embeddings** - Train multi-dimensional embeddings for flexible retrieval
12
+ - **LoRA Support** - Parameter-efficient fine-tuning with LoRA adapters
13
+ - **Unsloth Integration** - Faster training with Unsloth optimizations
14
+ - **HuggingFace Integration** - Load datasets and models from the HuggingFace Hub, and push trained models to the Hub
15
+
16
+ ## Installation
17
+
18
+ > **Note:** VespaEmbed is in an experimental phase. Install from source.
19
+
20
+ ```bash
21
+ git clone https://github.com/vespaai-playground/vespaembed.git
22
+ cd vespaembed
23
+ uv sync
24
+ ```
25
+
26
+ ### Optional Dependencies
27
+
28
+ ```bash
29
+ # For Unsloth acceleration (requires NVIDIA/AMD GPU)
30
+ uv sync --extra unsloth
31
+
32
+ # For ONNX export
33
+ uv sync --extra onnx
34
+
35
+ # For development
36
+ uv sync --extra dev
37
+ ```
38
+
39
+ ## Quick Start
40
+
41
+ ### Web UI
42
+
43
+ Launch the web interface:
44
+
45
+ ```bash
46
+ vespaembed
47
+ ```
48
+
49
+ Open http://localhost:8000 in your browser. The UI lets you:
50
+ - Upload training data (CSV or JSONL)
51
+ - Select task type and base model
52
+ - Configure hyperparameters
53
+ - Monitor training progress
54
+ - Download trained models
55
+
56
+ ### CLI
57
+
58
+ Train a model from the command line:
59
+
60
+ ```bash
61
+ vespaembed train \
62
+ --data examples/data/pairs.csv \
63
+ --task pairs \
64
+ --base-model sentence-transformers/all-MiniLM-L6-v2 \
65
+ --epochs 3
66
+ ```
67
+
68
+ Or use a YAML config file:
69
+
70
+ ```bash
71
+ vespaembed train --config config.yaml
72
+ ```
73
+
74
+ ## Tasks
75
+
76
+ VespaEmbed supports 4 training tasks based on your data format:
77
+
78
+ ### Pairs
79
+
80
+ Text pairs for semantic search. Use when you have query-document pairs without explicit negatives.
81
+
82
+ **Data format:**
83
+ ```csv
84
+ anchor,positive
85
+ What is machine learning?,Machine learning is a subset of AI...
86
+ How does photosynthesis work?,Photosynthesis converts sunlight...
87
+ ```
88
+
89
+ **Loss variants:** `mnr` (default), `mnr_symmetric`, `gist`, `cached_mnr`, `cached_gist`
90
+
91
+ ### Triplets
92
+
93
+ Text triplets with hard negatives. Use when you have explicit negative examples.
94
+
95
+ **Data format:**
96
+ ```csv
97
+ anchor,positive,negative
98
+ What is Python?,Python is a programming language...,A python is a large snake...
99
+ ```
100
+
101
+ **Loss variants:** `mnr` (default), `mnr_symmetric`, `gist`, `cached_mnr`, `cached_gist`
102
+
103
+ ### Similarity
104
+
105
+ Text pairs with similarity scores (STS-style). Use when you have continuous similarity labels.
106
+
107
+ **Data format:**
108
+ ```csv
109
+ sentence1,sentence2,score
110
+ A man is playing guitar,A person plays music,0.85
111
+ The cat is sleeping,A dog is running,0.12
112
+ ```
113
+
114
+ **Loss variants:** `cosine` (default), `cosent`, `angle`
115
+
116
+ ### TSDAE
117
+
118
+ Unsupervised learning with a denoising auto-encoder. Use when you only have unlabeled text for domain adaptation.
119
+
120
+ **Data format:**
121
+ ```csv
122
+ text
123
+ Machine learning is transforming how we analyze data.
124
+ Natural language processing enables computers to understand human language.
125
+ ```
126
+
127
+ ## Configuration
128
+
129
+ ### CLI Arguments
130
+
131
+ ```bash
132
+ vespaembed train \
133
+ --data <path> # Training data (CSV, JSONL, or HF dataset)
134
+ --task <task> # Task type: pairs, triplets, similarity, tsdae
135
+ --base-model <model> # Base model name or path
136
+ --project <name> # Project name (optional)
137
+ --eval-data <path> # Evaluation data (optional)
138
+ --epochs <n> # Number of epochs (default: 3)
139
+ --batch-size <n> # Batch size (default: 32)
140
+ --learning-rate <lr> # Learning rate (default: 2e-5)
141
+ --optimizer <opt> # Optimizer (default: adamw_torch)
142
+ --scheduler <sched> # LR scheduler (default: linear)
143
+ --matryoshka # Enable Matryoshka embeddings
144
+ --matryoshka-dims <dims> # Dimensions (default: 768,512,256,128,64)
145
+ --unsloth # Use Unsloth for faster training
146
+ --subset <name> # HuggingFace dataset subset
147
+ --split <name> # HuggingFace dataset split
148
+ ```
149
+
150
+ ### Optimizers
151
+
152
+ | Option | Description |
153
+ |--------|-------------|
154
+ | `adamw_torch` | AdamW (default) |
155
+ | `adamw_torch_fused` | Fused AdamW (faster on CUDA) |
156
+ | `adamw_8bit` | 8-bit AdamW (memory efficient) |
157
+ | `adafactor` | Adafactor (memory efficient, no momentum) |
158
+ | `sgd` | SGD with momentum |
159
+
160
+ ### Schedulers
161
+
162
+ | Option | Description |
163
+ |--------|-------------|
164
+ | `linear` | Linear decay (default) |
165
+ | `cosine` | Cosine annealing |
166
+ | `cosine_with_restarts` | Cosine with warm restarts |
167
+ | `constant` | Constant learning rate |
168
+ | `constant_with_warmup` | Constant after warmup |
169
+ | `polynomial` | Polynomial decay |
170
+
171
+ ### YAML Configuration
172
+
173
+ ```yaml
174
+ task: pairs
175
+ base_model: sentence-transformers/all-MiniLM-L6-v2
176
+
177
+ data:
178
+ train: train.csv
179
+ eval: eval.csv # optional
180
+
181
+ training:
182
+ epochs: 3
183
+ batch_size: 32
184
+ learning_rate: 2e-5
185
+ warmup_ratio: 0.1
186
+ weight_decay: 0.01
187
+ fp16: true
188
+ eval_steps: 500
189
+ save_steps: 500
190
+ logging_steps: 100
191
+ optimizer: adamw_torch # adamw_torch, adamw_8bit, adafactor, sgd
192
+ scheduler: linear # linear, cosine, constant, polynomial
193
+
194
+ output:
195
+ dir: ./output
196
+ push_to_hub: false
197
+ hf_username: null
198
+
199
+ # Optional: LoRA configuration
200
+ lora:
201
+ enabled: false
202
+ r: 64
203
+ alpha: 128
204
+ dropout: 0.1
205
+ target_modules: [query, key, value, dense]
206
+
207
+ # Optional: Matryoshka dimensions
208
+ matryoshka_dims: [768, 512, 256, 128, 64]
209
+
210
+ # Optional: Loss variant (uses task default if not specified)
211
+ loss_variant: mnr
212
+ ```
213
+
214
+ ### HuggingFace Datasets
215
+
216
+ Load datasets directly from HuggingFace Hub:
217
+
218
+ ```bash
219
+ vespaembed train \
220
+ --data sentence-transformers/all-nli \
221
+ --subset triplet \
222
+ --split train \
223
+ --task triplets \
224
+ --base-model sentence-transformers/all-MiniLM-L6-v2
225
+ ```
226
+
227
+ ## CLI Commands
228
+
229
+ | Command | Description |
230
+ |---------|-------------|
231
+ | `vespaembed` | Launch web UI (default) |
232
+ | `vespaembed serve` | Launch web UI |
233
+ | `vespaembed train` | Train a model |
234
+ | `vespaembed evaluate` | Evaluate a model |
235
+ | `vespaembed export` | Export model to ONNX |
236
+ | `vespaembed info` | Show task information |
237
+
238
+ ## Output
239
+
240
+ Trained models are saved to `~/.vespaembed/projects/<project-name>/`:
241
+
242
+ ```
243
+ ~/.vespaembed/projects/my-project/
244
+ ├── final/ # Final trained model
245
+ ├── checkpoint-500/ # Training checkpoints
246
+ ├── checkpoint-1000/
247
+ └── logs/ # TensorBoard logs
248
+ ```
249
+
250
+ ## Column Aliases
251
+
252
+ VespaEmbed automatically recognizes common column name variations:
253
+
254
+ | Task | Expected | Also Accepts |
255
+ |------|----------|--------------|
256
+ | pairs | `anchor` | `query`, `question`, `sent1`, `sentence1`, `text1` |
257
+ | pairs | `positive` | `document`, `answer`, `pos`, `sent2`, `sentence2`, `text2` |
258
+ | triplets | `negative` | `neg`, `hard_negative`, `sent3`, `sentence3`, `text3` |
259
+ | similarity | `sentence1` | `sent1`, `text1`, `anchor`, `query` |
260
+ | similarity | `sentence2` | `sent2`, `text2`, `positive`, `document` |
261
+ | similarity | `score` | `similarity`, `label`, `sim_score` |
262
+ | tsdae | `text` | `sentence`, `sentences`, `content`, `input` |
263
+
264
+ ## Development
265
+
266
+ ```bash
267
+ # Install dev dependencies
268
+ uv sync --extra dev
269
+
270
+ # Run tests
271
+ uv run pytest tests/
272
+
273
+ # Run tests with coverage
274
+ uv run pytest tests/ --cov=vespaembed
275
+
276
+ # Format code
277
+ make format
278
+
279
+ # Lint
280
+ make lint
281
+ ```
282
+
283
+ ## License
284
+
285
+ Apache 2.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "vespaembed"
3
- version = "0.0.1"
3
+ version = "0.0.2"
4
4
  description = "vespaembed: no-code training for embedding models"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -9,11 +9,28 @@ authors = [
9
9
  {name = "Abhishek Thakur"}
10
10
  ]
11
11
  dependencies = [
12
- "pandas>=3.0.0",
12
+ "sentence-transformers>=3.0.0",
13
+ "transformers>=4.40.0",
14
+ "accelerate>=0.26.0",
15
+ "torch>=2.0.0",
16
+ "datasets>=2.18.0",
17
+ "pandas>=2.0.0",
18
+ "pydantic>=2.0.0",
19
+ "rich>=13.0.0",
20
+ "fastapi>=0.111.0",
21
+ "uvicorn>=0.30.0",
22
+ "python-multipart>=0.0.9",
23
+ "websockets>=12.0",
24
+ "jinja2>=3.1.0",
25
+ "pyyaml>=6.0.0",
26
+ "tensorboard>=2.15.0",
27
+ "peft>=0.18.1",
28
+ "unsloth>=2026.1.4",
13
29
  ]
14
30
 
15
31
  [project.optional-dependencies]
16
- onnxruntime = ["onnxruntime>=1.23.2"]
32
+ unsloth = ["unsloth"]
33
+ onnx = ["onnx>=1.14.0", "onnxruntime>=1.23.2"]
17
34
  dev = [
18
35
  "black==26.1.0",
19
36
  "isort==7.0.0",
@@ -50,17 +67,14 @@ exclude = '''
50
67
  | env
51
68
  | build
52
69
  | dist
70
+ | unsloth_examples
53
71
  )/
54
72
  '''
55
73
 
56
74
  [tool.isort]
57
- ensure_newline_before_comments = true
58
- force_grid_wrap = 0
59
- include_trailing_comma = true
75
+ profile = "black"
60
76
  line_length = 119
61
- lines_after_imports = 2
62
- multi_line_output = 3
63
- use_parentheses = true
77
+ skip = [".venv", "venv", ".env", "env", "build", "dist", "unsloth_examples"]
64
78
 
65
79
  [tool.flake8]
66
80
  ignore = ["E203", "E501", "W503"]
@@ -0,0 +1 @@
1
+ __version__ = "0.0.2"
@@ -0,0 +1,17 @@
1
+ from abc import ABC, abstractmethod
2
+ from argparse import ArgumentParser
3
+
4
+
5
+ class BaseCommand(ABC):
6
+ """Base class for all CLI commands."""
7
+
8
+ @staticmethod
9
+ @abstractmethod
10
+ def register_subcommand(parser: ArgumentParser):
11
+ """Register the subcommand with argparse."""
12
+ raise NotImplementedError
13
+
14
+ @abstractmethod
15
+ def execute(self):
16
+ """Execute the command."""
17
+ raise NotImplementedError
@@ -0,0 +1,7 @@
1
+ from vespaembed.cli.commands.evaluate import EvaluateCommand
2
+ from vespaembed.cli.commands.export import ExportCommand
3
+ from vespaembed.cli.commands.info import InfoCommand
4
+ from vespaembed.cli.commands.serve import ServeCommand
5
+ from vespaembed.cli.commands.train import TrainCommand
6
+
7
+ __all__ = ["EvaluateCommand", "ExportCommand", "InfoCommand", "ServeCommand", "TrainCommand"]