vespaembed 0.0.1__tar.gz → 0.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vespaembed-0.0.3/PKG-INFO +325 -0
- vespaembed-0.0.3/README.md +285 -0
- {vespaembed-0.0.1 → vespaembed-0.0.3}/pyproject.toml +23 -9
- vespaembed-0.0.3/src/vespaembed/__init__.py +1 -0
- vespaembed-0.0.3/src/vespaembed/cli/__init__.py +17 -0
- vespaembed-0.0.3/src/vespaembed/cli/commands/__init__.py +7 -0
- vespaembed-0.0.3/src/vespaembed/cli/commands/evaluate.py +85 -0
- vespaembed-0.0.3/src/vespaembed/cli/commands/export.py +86 -0
- vespaembed-0.0.3/src/vespaembed/cli/commands/info.py +52 -0
- vespaembed-0.0.3/src/vespaembed/cli/commands/serve.py +49 -0
- vespaembed-0.0.3/src/vespaembed/cli/commands/train.py +267 -0
- vespaembed-0.0.3/src/vespaembed/cli/vespaembed.py +55 -0
- vespaembed-0.0.3/src/vespaembed/core/__init__.py +2 -0
- vespaembed-0.0.3/src/vespaembed/core/config.py +164 -0
- vespaembed-0.0.3/src/vespaembed/core/registry.py +158 -0
- vespaembed-0.0.3/src/vespaembed/core/trainer.py +573 -0
- vespaembed-0.0.3/src/vespaembed/datasets/__init__.py +3 -0
- vespaembed-0.0.3/src/vespaembed/datasets/formats/__init__.py +5 -0
- vespaembed-0.0.3/src/vespaembed/datasets/formats/csv.py +15 -0
- vespaembed-0.0.3/src/vespaembed/datasets/formats/huggingface.py +34 -0
- vespaembed-0.0.3/src/vespaembed/datasets/formats/jsonl.py +26 -0
- vespaembed-0.0.3/src/vespaembed/datasets/loader.py +80 -0
- vespaembed-0.0.3/src/vespaembed/db.py +176 -0
- vespaembed-0.0.3/src/vespaembed/enums.py +58 -0
- vespaembed-0.0.3/src/vespaembed/evaluation/__init__.py +3 -0
- vespaembed-0.0.3/src/vespaembed/evaluation/factory.py +86 -0
- vespaembed-0.0.3/src/vespaembed/models/__init__.py +4 -0
- vespaembed-0.0.3/src/vespaembed/models/export.py +89 -0
- vespaembed-0.0.3/src/vespaembed/models/loader.py +25 -0
- vespaembed-0.0.3/src/vespaembed/static/css/styles.css +1800 -0
- vespaembed-0.0.3/src/vespaembed/static/js/app.js +1485 -0
- vespaembed-0.0.3/src/vespaembed/tasks/__init__.py +23 -0
- vespaembed-0.0.3/src/vespaembed/tasks/base.py +144 -0
- vespaembed-0.0.3/src/vespaembed/tasks/pairs.py +91 -0
- vespaembed-0.0.3/src/vespaembed/tasks/similarity.py +84 -0
- vespaembed-0.0.3/src/vespaembed/tasks/triplets.py +90 -0
- vespaembed-0.0.3/src/vespaembed/tasks/tsdae.py +102 -0
- vespaembed-0.0.3/src/vespaembed/templates/index.html +544 -0
- vespaembed-0.0.3/src/vespaembed/utils/__init__.py +3 -0
- vespaembed-0.0.3/src/vespaembed/utils/logging.py +69 -0
- vespaembed-0.0.3/src/vespaembed/web/__init__.py +1 -0
- vespaembed-0.0.3/src/vespaembed/web/api/__init__.py +1 -0
- vespaembed-0.0.3/src/vespaembed/web/app.py +605 -0
- vespaembed-0.0.3/src/vespaembed/worker.py +313 -0
- vespaembed-0.0.3/src/vespaembed.egg-info/PKG-INFO +325 -0
- vespaembed-0.0.3/src/vespaembed.egg-info/SOURCES.txt +57 -0
- vespaembed-0.0.3/src/vespaembed.egg-info/requires.txt +31 -0
- vespaembed-0.0.3/tests/test_api.py +343 -0
- vespaembed-0.0.3/tests/test_cli.py +113 -0
- vespaembed-0.0.3/tests/test_config.py +350 -0
- vespaembed-0.0.3/tests/test_db.py +163 -0
- vespaembed-0.0.3/tests/test_e2e.py +526 -0
- vespaembed-0.0.3/tests/test_registry.py +362 -0
- vespaembed-0.0.3/tests/test_trainer.py +174 -0
- vespaembed-0.0.1/PKG-INFO +0 -20
- vespaembed-0.0.1/README.md +0 -0
- vespaembed-0.0.1/src/vespaembed/__init__.py +0 -1
- vespaembed-0.0.1/src/vespaembed.egg-info/PKG-INFO +0 -20
- vespaembed-0.0.1/src/vespaembed.egg-info/SOURCES.txt +0 -10
- vespaembed-0.0.1/src/vespaembed.egg-info/requires.txt +0 -11
- {vespaembed-0.0.1 → vespaembed-0.0.3}/LICENSE +0 -0
- {vespaembed-0.0.1 → vespaembed-0.0.3}/setup.cfg +0 -0
- {vespaembed-0.0.1 → vespaembed-0.0.3}/src/vespaembed.egg-info/dependency_links.txt +0 -0
- {vespaembed-0.0.1 → vespaembed-0.0.3}/src/vespaembed.egg-info/entry_points.txt +0 -0
- {vespaembed-0.0.1 → vespaembed-0.0.3}/src/vespaembed.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vespaembed
|
|
3
|
+
Version: 0.0.3
|
|
4
|
+
Summary: vespaembed: no-code training for embedding models
|
|
5
|
+
Author: Abhishek Thakur
|
|
6
|
+
License: Apache 2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/vespaai-playground/vespaembed
|
|
8
|
+
Requires-Python: >=3.11
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: sentence-transformers>=3.0.0
|
|
12
|
+
Requires-Dist: transformers>=4.40.0
|
|
13
|
+
Requires-Dist: accelerate>=0.26.0
|
|
14
|
+
Requires-Dist: torch>=2.0.0
|
|
15
|
+
Requires-Dist: datasets>=2.18.0
|
|
16
|
+
Requires-Dist: pandas>=2.0.0
|
|
17
|
+
Requires-Dist: pydantic>=2.0.0
|
|
18
|
+
Requires-Dist: rich>=13.0.0
|
|
19
|
+
Requires-Dist: fastapi>=0.111.0
|
|
20
|
+
Requires-Dist: uvicorn>=0.30.0
|
|
21
|
+
Requires-Dist: python-multipart>=0.0.9
|
|
22
|
+
Requires-Dist: websockets>=12.0
|
|
23
|
+
Requires-Dist: jinja2>=3.1.0
|
|
24
|
+
Requires-Dist: pyyaml>=6.0.0
|
|
25
|
+
Requires-Dist: tensorboard>=2.15.0
|
|
26
|
+
Requires-Dist: peft>=0.18.1
|
|
27
|
+
Requires-Dist: unsloth>=2026.1.4
|
|
28
|
+
Provides-Extra: unsloth
|
|
29
|
+
Requires-Dist: unsloth; extra == "unsloth"
|
|
30
|
+
Provides-Extra: onnx
|
|
31
|
+
Requires-Dist: onnx>=1.14.0; extra == "onnx"
|
|
32
|
+
Requires-Dist: onnxruntime>=1.23.2; extra == "onnx"
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: black==26.1.0; extra == "dev"
|
|
35
|
+
Requires-Dist: isort==7.0.0; extra == "dev"
|
|
36
|
+
Requires-Dist: flake8==7.3.0; extra == "dev"
|
|
37
|
+
Requires-Dist: pytest>=9.0.2; extra == "dev"
|
|
38
|
+
Requires-Dist: pytest-cov>=7.0.0; extra == "dev"
|
|
39
|
+
Dynamic: license-file
|
|
40
|
+
|
|
41
|
+
# VespaEmbed
|
|
42
|
+
|
|
43
|
+
No-code training for embedding models. Train custom embedding models with a web UI or CLI.
|
|
44
|
+
|
|
45
|
+
## Features
|
|
46
|
+
|
|
47
|
+
- **Web UI** - Visual interface for configuring and monitoring training
|
|
48
|
+
- **CLI** - Command-line interface for scripting and automation
|
|
49
|
+
- **Multiple Tasks** - Support for pairs, triplets, similarity scoring, and unsupervised learning
|
|
50
|
+
- **Loss Variants** - Choose from multiple loss functions per task
|
|
51
|
+
- **Matryoshka Embeddings** - Train multi-dimensional embeddings for flexible retrieval
|
|
52
|
+
- **LoRA Support** - Parameter-efficient fine-tuning with LoRA adapters
|
|
53
|
+
- **Unsloth Integration** - Faster training with Unsloth optimizations
|
|
54
|
+
- **HuggingFace Integration** - Load datasets and models from the HuggingFace Hub, and push models to the Hub
|
|
55
|
+
|
|
56
|
+
## Installation
|
|
57
|
+
|
|
58
|
+
> **Note:** VespaEmbed is in an experimental phase. Install it from source.
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
git clone https://github.com/vespaai-playground/vespaembed.git
|
|
62
|
+
cd vespaembed
|
|
63
|
+
uv sync
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Optional Dependencies
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
# For Unsloth acceleration (requires NVIDIA/AMD GPU)
|
|
70
|
+
uv sync --extra unsloth
|
|
71
|
+
|
|
72
|
+
# For ONNX export
|
|
73
|
+
uv sync --extra onnx
|
|
74
|
+
|
|
75
|
+
# For development
|
|
76
|
+
uv sync --extra dev
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Quick Start
|
|
80
|
+
|
|
81
|
+
### Web UI
|
|
82
|
+
|
|
83
|
+
Launch the web interface:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
vespaembed
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Open http://localhost:8000 in your browser. The UI lets you:
|
|
90
|
+
- Upload training data (CSV or JSONL)
|
|
91
|
+
- Select task type and base model
|
|
92
|
+
- Configure hyperparameters
|
|
93
|
+
- Monitor training progress
|
|
94
|
+
- Download trained models
|
|
95
|
+
|
|
96
|
+
### CLI
|
|
97
|
+
|
|
98
|
+
Train a model from the command line:
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
vespaembed train \
|
|
102
|
+
--data examples/data/pairs.csv \
|
|
103
|
+
--task pairs \
|
|
104
|
+
--base-model sentence-transformers/all-MiniLM-L6-v2 \
|
|
105
|
+
--epochs 3
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Or use a YAML config file:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
vespaembed train --config config.yaml
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Tasks
|
|
115
|
+
|
|
116
|
+
VespaEmbed supports 4 training tasks based on your data format:
|
|
117
|
+
|
|
118
|
+
### Pairs
|
|
119
|
+
|
|
120
|
+
Text pairs for semantic search. Use when you have query-document pairs without explicit negatives.
|
|
121
|
+
|
|
122
|
+
**Data format:**
|
|
123
|
+
```csv
|
|
124
|
+
anchor,positive
|
|
125
|
+
What is machine learning?,Machine learning is a subset of AI...
|
|
126
|
+
How does photosynthesis work?,Photosynthesis converts sunlight...
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
**Loss variants:** `mnr` (default), `mnr_symmetric`, `gist`, `cached_mnr`, `cached_gist`
|
|
130
|
+
|
|
131
|
+
### Triplets
|
|
132
|
+
|
|
133
|
+
Text triplets with hard negatives. Use when you have explicit negative examples.
|
|
134
|
+
|
|
135
|
+
**Data format:**
|
|
136
|
+
```csv
|
|
137
|
+
anchor,positive,negative
|
|
138
|
+
What is Python?,Python is a programming language...,A python is a large snake...
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
**Loss variants:** `mnr` (default), `mnr_symmetric`, `gist`, `cached_mnr`, `cached_gist`
|
|
142
|
+
|
|
143
|
+
### Similarity
|
|
144
|
+
|
|
145
|
+
Text pairs with similarity scores (STS-style). Use when you have continuous similarity labels.
|
|
146
|
+
|
|
147
|
+
**Data format:**
|
|
148
|
+
```csv
|
|
149
|
+
sentence1,sentence2,score
|
|
150
|
+
A man is playing guitar,A person plays music,0.85
|
|
151
|
+
The cat is sleeping,A dog is running,0.12
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
**Loss variants:** `cosine` (default), `cosent`, `angle`
|
|
155
|
+
|
|
156
|
+
### TSDAE
|
|
157
|
+
|
|
158
|
+
Unsupervised learning with a denoising auto-encoder. Use when you only have unlabeled text for domain adaptation.
|
|
159
|
+
|
|
160
|
+
**Data format:**
|
|
161
|
+
```csv
|
|
162
|
+
text
|
|
163
|
+
Machine learning is transforming how we analyze data.
|
|
164
|
+
Natural language processing enables computers to understand human language.
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## Configuration
|
|
168
|
+
|
|
169
|
+
### CLI Arguments
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
vespaembed train \
|
|
173
|
+
--data <path> # Training data (CSV, JSONL, or HF dataset)
|
|
174
|
+
--task <task> # Task type: pairs, triplets, similarity, tsdae
|
|
175
|
+
--base-model <model> # Base model name or path
|
|
176
|
+
--project <name> # Project name (optional)
|
|
177
|
+
--eval-data <path> # Evaluation data (optional)
|
|
178
|
+
--epochs <n> # Number of epochs (default: 3)
|
|
179
|
+
--batch-size <n> # Batch size (default: 32)
|
|
180
|
+
--learning-rate <lr> # Learning rate (default: 2e-5)
|
|
181
|
+
--optimizer <opt> # Optimizer (default: adamw_torch)
|
|
182
|
+
--scheduler <sched> # LR scheduler (default: linear)
|
|
183
|
+
--matryoshka # Enable Matryoshka embeddings
|
|
184
|
+
--matryoshka-dims <dims> # Dimensions (default: 768,512,256,128,64)
|
|
185
|
+
--unsloth # Use Unsloth for faster training
|
|
186
|
+
--subset <name> # HuggingFace dataset subset
|
|
187
|
+
--split <name> # HuggingFace dataset split
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### Optimizers
|
|
191
|
+
|
|
192
|
+
| Option | Description |
|
|
193
|
+
|--------|-------------|
|
|
194
|
+
| `adamw_torch` | AdamW (default) |
|
|
195
|
+
| `adamw_torch_fused` | Fused AdamW (faster on CUDA) |
|
|
196
|
+
| `adamw_8bit` | 8-bit AdamW (memory efficient) |
|
|
197
|
+
| `adafactor` | Adafactor (memory efficient, no momentum) |
|
|
198
|
+
| `sgd` | SGD with momentum |
|
|
199
|
+
|
|
200
|
+
### Schedulers
|
|
201
|
+
|
|
202
|
+
| Option | Description |
|
|
203
|
+
|--------|-------------|
|
|
204
|
+
| `linear` | Linear decay (default) |
|
|
205
|
+
| `cosine` | Cosine annealing |
|
|
206
|
+
| `cosine_with_restarts` | Cosine with warm restarts |
|
|
207
|
+
| `constant` | Constant learning rate |
|
|
208
|
+
| `constant_with_warmup` | Constant after warmup |
|
|
209
|
+
| `polynomial` | Polynomial decay |
|
|
210
|
+
|
|
211
|
+
### YAML Configuration
|
|
212
|
+
|
|
213
|
+
```yaml
|
|
214
|
+
task: pairs
|
|
215
|
+
base_model: sentence-transformers/all-MiniLM-L6-v2
|
|
216
|
+
|
|
217
|
+
data:
|
|
218
|
+
train: train.csv
|
|
219
|
+
eval: eval.csv # optional
|
|
220
|
+
|
|
221
|
+
training:
|
|
222
|
+
epochs: 3
|
|
223
|
+
batch_size: 32
|
|
224
|
+
learning_rate: 2e-5
|
|
225
|
+
warmup_ratio: 0.1
|
|
226
|
+
weight_decay: 0.01
|
|
227
|
+
fp16: true
|
|
228
|
+
eval_steps: 500
|
|
229
|
+
save_steps: 500
|
|
230
|
+
logging_steps: 100
|
|
231
|
+
optimizer: adamw_torch # adamw_torch, adamw_torch_fused, adamw_8bit, adafactor, sgd
|
|
232
|
+
scheduler: linear # linear, cosine, cosine_with_restarts, constant, constant_with_warmup, polynomial
|
|
233
|
+
|
|
234
|
+
output:
|
|
235
|
+
dir: ./output
|
|
236
|
+
push_to_hub: false
|
|
237
|
+
hf_username: null
|
|
238
|
+
|
|
239
|
+
# Optional: LoRA configuration
|
|
240
|
+
lora:
|
|
241
|
+
enabled: false
|
|
242
|
+
r: 64
|
|
243
|
+
alpha: 128
|
|
244
|
+
dropout: 0.1
|
|
245
|
+
target_modules: [query, key, value, dense]
|
|
246
|
+
|
|
247
|
+
# Optional: Matryoshka dimensions
|
|
248
|
+
matryoshka_dims: [768, 512, 256, 128, 64]
|
|
249
|
+
|
|
250
|
+
# Optional: Loss variant (uses task default if not specified)
|
|
251
|
+
loss_variant: mnr
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
### HuggingFace Datasets
|
|
255
|
+
|
|
256
|
+
Load datasets directly from HuggingFace Hub:
|
|
257
|
+
|
|
258
|
+
```bash
|
|
259
|
+
vespaembed train \
|
|
260
|
+
--data sentence-transformers/all-nli \
|
|
261
|
+
--subset triplet \
|
|
262
|
+
--split train \
|
|
263
|
+
--task triplets \
|
|
264
|
+
--base-model sentence-transformers/all-MiniLM-L6-v2
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
## CLI Commands
|
|
268
|
+
|
|
269
|
+
| Command | Description |
|
|
270
|
+
|---------|-------------|
|
|
271
|
+
| `vespaembed` | Launch web UI (default) |
|
|
272
|
+
| `vespaembed serve` | Launch web UI |
|
|
273
|
+
| `vespaembed train` | Train a model |
|
|
274
|
+
| `vespaembed evaluate` | Evaluate a model |
|
|
275
|
+
| `vespaembed export` | Export model to ONNX |
|
|
276
|
+
| `vespaembed info` | Show task information |
|
|
277
|
+
|
|
278
|
+
## Output
|
|
279
|
+
|
|
280
|
+
Trained models are saved to `~/.vespaembed/projects/<project-name>/`:
|
|
281
|
+
|
|
282
|
+
```
|
|
283
|
+
~/.vespaembed/projects/my-project/
|
|
284
|
+
├── final/ # Final trained model
|
|
285
|
+
├── checkpoint-500/ # Training checkpoints
|
|
286
|
+
├── checkpoint-1000/
|
|
287
|
+
└── logs/ # TensorBoard logs
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
## Column Aliases
|
|
291
|
+
|
|
292
|
+
VespaEmbed automatically recognizes common column name variations:
|
|
293
|
+
|
|
294
|
+
| Task | Expected | Also Accepts |
|
|
295
|
+
|------|----------|--------------|
|
|
296
|
+
| pairs | `anchor` | `query`, `question`, `sent1`, `sentence1`, `text1` |
|
|
297
|
+
| pairs | `positive` | `document`, `answer`, `pos`, `sent2`, `sentence2`, `text2` |
|
|
298
|
+
| triplets | `negative` | `neg`, `hard_negative`, `sent3`, `sentence3`, `text3` |
|
|
299
|
+
| similarity | `sentence1` | `sent1`, `text1`, `anchor`, `query` |
|
|
300
|
+
| similarity | `sentence2` | `sent2`, `text2`, `positive`, `document` |
|
|
301
|
+
| similarity | `score` | `similarity`, `label`, `sim_score` |
|
|
302
|
+
| tsdae | `text` | `sentence`, `sentences`, `content`, `input` |
|
|
303
|
+
|
|
304
|
+
## Development
|
|
305
|
+
|
|
306
|
+
```bash
|
|
307
|
+
# Install dev dependencies
|
|
308
|
+
uv sync --extra dev
|
|
309
|
+
|
|
310
|
+
# Run tests
|
|
311
|
+
uv run pytest tests/
|
|
312
|
+
|
|
313
|
+
# Run tests with coverage
|
|
314
|
+
uv run pytest tests/ --cov=vespaembed
|
|
315
|
+
|
|
316
|
+
# Format code
|
|
317
|
+
make format
|
|
318
|
+
|
|
319
|
+
# Lint
|
|
320
|
+
make lint
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
## License
|
|
324
|
+
|
|
325
|
+
Apache 2.0
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
# VespaEmbed
|
|
2
|
+
|
|
3
|
+
No-code training for embedding models. Train custom embedding models with a web UI or CLI.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Web UI** - Visual interface for configuring and monitoring training
|
|
8
|
+
- **CLI** - Command-line interface for scripting and automation
|
|
9
|
+
- **Multiple Tasks** - Support for pairs, triplets, similarity scoring, and unsupervised learning
|
|
10
|
+
- **Loss Variants** - Choose from multiple loss functions per task
|
|
11
|
+
- **Matryoshka Embeddings** - Train multi-dimensional embeddings for flexible retrieval
|
|
12
|
+
- **LoRA Support** - Parameter-efficient fine-tuning with LoRA adapters
|
|
13
|
+
- **Unsloth Integration** - Faster training with Unsloth optimizations
|
|
14
|
+
- **HuggingFace Integration** - Load datasets and models from the HuggingFace Hub, and push models to the Hub
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
> **Note:** VespaEmbed is in an experimental phase. Install it from source.
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
git clone https://github.com/vespaai-playground/vespaembed.git
|
|
22
|
+
cd vespaembed
|
|
23
|
+
uv sync
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### Optional Dependencies
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# For Unsloth acceleration (requires NVIDIA/AMD GPU)
|
|
30
|
+
uv sync --extra unsloth
|
|
31
|
+
|
|
32
|
+
# For ONNX export
|
|
33
|
+
uv sync --extra onnx
|
|
34
|
+
|
|
35
|
+
# For development
|
|
36
|
+
uv sync --extra dev
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Quick Start
|
|
40
|
+
|
|
41
|
+
### Web UI
|
|
42
|
+
|
|
43
|
+
Launch the web interface:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
vespaembed
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Open http://localhost:8000 in your browser. The UI lets you:
|
|
50
|
+
- Upload training data (CSV or JSONL)
|
|
51
|
+
- Select task type and base model
|
|
52
|
+
- Configure hyperparameters
|
|
53
|
+
- Monitor training progress
|
|
54
|
+
- Download trained models
|
|
55
|
+
|
|
56
|
+
### CLI
|
|
57
|
+
|
|
58
|
+
Train a model from the command line:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
vespaembed train \
|
|
62
|
+
--data examples/data/pairs.csv \
|
|
63
|
+
--task pairs \
|
|
64
|
+
--base-model sentence-transformers/all-MiniLM-L6-v2 \
|
|
65
|
+
--epochs 3
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Or use a YAML config file:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
vespaembed train --config config.yaml
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Tasks
|
|
75
|
+
|
|
76
|
+
VespaEmbed supports 4 training tasks based on your data format:
|
|
77
|
+
|
|
78
|
+
### Pairs
|
|
79
|
+
|
|
80
|
+
Text pairs for semantic search. Use when you have query-document pairs without explicit negatives.
|
|
81
|
+
|
|
82
|
+
**Data format:**
|
|
83
|
+
```csv
|
|
84
|
+
anchor,positive
|
|
85
|
+
What is machine learning?,Machine learning is a subset of AI...
|
|
86
|
+
How does photosynthesis work?,Photosynthesis converts sunlight...
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
**Loss variants:** `mnr` (default), `mnr_symmetric`, `gist`, `cached_mnr`, `cached_gist`
|
|
90
|
+
|
|
91
|
+
### Triplets
|
|
92
|
+
|
|
93
|
+
Text triplets with hard negatives. Use when you have explicit negative examples.
|
|
94
|
+
|
|
95
|
+
**Data format:**
|
|
96
|
+
```csv
|
|
97
|
+
anchor,positive,negative
|
|
98
|
+
What is Python?,Python is a programming language...,A python is a large snake...
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
**Loss variants:** `mnr` (default), `mnr_symmetric`, `gist`, `cached_mnr`, `cached_gist`
|
|
102
|
+
|
|
103
|
+
### Similarity
|
|
104
|
+
|
|
105
|
+
Text pairs with similarity scores (STS-style). Use when you have continuous similarity labels.
|
|
106
|
+
|
|
107
|
+
**Data format:**
|
|
108
|
+
```csv
|
|
109
|
+
sentence1,sentence2,score
|
|
110
|
+
A man is playing guitar,A person plays music,0.85
|
|
111
|
+
The cat is sleeping,A dog is running,0.12
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
**Loss variants:** `cosine` (default), `cosent`, `angle`
|
|
115
|
+
|
|
116
|
+
### TSDAE
|
|
117
|
+
|
|
118
|
+
Unsupervised learning with a denoising auto-encoder. Use when you only have unlabeled text for domain adaptation.
|
|
119
|
+
|
|
120
|
+
**Data format:**
|
|
121
|
+
```csv
|
|
122
|
+
text
|
|
123
|
+
Machine learning is transforming how we analyze data.
|
|
124
|
+
Natural language processing enables computers to understand human language.
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## Configuration
|
|
128
|
+
|
|
129
|
+
### CLI Arguments
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
vespaembed train \
|
|
133
|
+
--data <path> # Training data (CSV, JSONL, or HF dataset)
|
|
134
|
+
--task <task> # Task type: pairs, triplets, similarity, tsdae
|
|
135
|
+
--base-model <model> # Base model name or path
|
|
136
|
+
--project <name> # Project name (optional)
|
|
137
|
+
--eval-data <path> # Evaluation data (optional)
|
|
138
|
+
--epochs <n> # Number of epochs (default: 3)
|
|
139
|
+
--batch-size <n> # Batch size (default: 32)
|
|
140
|
+
--learning-rate <lr> # Learning rate (default: 2e-5)
|
|
141
|
+
--optimizer <opt> # Optimizer (default: adamw_torch)
|
|
142
|
+
--scheduler <sched> # LR scheduler (default: linear)
|
|
143
|
+
--matryoshka # Enable Matryoshka embeddings
|
|
144
|
+
--matryoshka-dims <dims> # Dimensions (default: 768,512,256,128,64)
|
|
145
|
+
--unsloth # Use Unsloth for faster training
|
|
146
|
+
--subset <name> # HuggingFace dataset subset
|
|
147
|
+
--split <name> # HuggingFace dataset split
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### Optimizers
|
|
151
|
+
|
|
152
|
+
| Option | Description |
|
|
153
|
+
|--------|-------------|
|
|
154
|
+
| `adamw_torch` | AdamW (default) |
|
|
155
|
+
| `adamw_torch_fused` | Fused AdamW (faster on CUDA) |
|
|
156
|
+
| `adamw_8bit` | 8-bit AdamW (memory efficient) |
|
|
157
|
+
| `adafactor` | Adafactor (memory efficient, no momentum) |
|
|
158
|
+
| `sgd` | SGD with momentum |
|
|
159
|
+
|
|
160
|
+
### Schedulers
|
|
161
|
+
|
|
162
|
+
| Option | Description |
|
|
163
|
+
|--------|-------------|
|
|
164
|
+
| `linear` | Linear decay (default) |
|
|
165
|
+
| `cosine` | Cosine annealing |
|
|
166
|
+
| `cosine_with_restarts` | Cosine with warm restarts |
|
|
167
|
+
| `constant` | Constant learning rate |
|
|
168
|
+
| `constant_with_warmup` | Constant after warmup |
|
|
169
|
+
| `polynomial` | Polynomial decay |
|
|
170
|
+
|
|
171
|
+
### YAML Configuration
|
|
172
|
+
|
|
173
|
+
```yaml
|
|
174
|
+
task: pairs
|
|
175
|
+
base_model: sentence-transformers/all-MiniLM-L6-v2
|
|
176
|
+
|
|
177
|
+
data:
|
|
178
|
+
train: train.csv
|
|
179
|
+
eval: eval.csv # optional
|
|
180
|
+
|
|
181
|
+
training:
|
|
182
|
+
epochs: 3
|
|
183
|
+
batch_size: 32
|
|
184
|
+
learning_rate: 2e-5
|
|
185
|
+
warmup_ratio: 0.1
|
|
186
|
+
weight_decay: 0.01
|
|
187
|
+
fp16: true
|
|
188
|
+
eval_steps: 500
|
|
189
|
+
save_steps: 500
|
|
190
|
+
logging_steps: 100
|
|
191
|
+
optimizer: adamw_torch # adamw_torch, adamw_torch_fused, adamw_8bit, adafactor, sgd
|
|
192
|
+
scheduler: linear # linear, cosine, cosine_with_restarts, constant, constant_with_warmup, polynomial
|
|
193
|
+
|
|
194
|
+
output:
|
|
195
|
+
dir: ./output
|
|
196
|
+
push_to_hub: false
|
|
197
|
+
hf_username: null
|
|
198
|
+
|
|
199
|
+
# Optional: LoRA configuration
|
|
200
|
+
lora:
|
|
201
|
+
enabled: false
|
|
202
|
+
r: 64
|
|
203
|
+
alpha: 128
|
|
204
|
+
dropout: 0.1
|
|
205
|
+
target_modules: [query, key, value, dense]
|
|
206
|
+
|
|
207
|
+
# Optional: Matryoshka dimensions
|
|
208
|
+
matryoshka_dims: [768, 512, 256, 128, 64]
|
|
209
|
+
|
|
210
|
+
# Optional: Loss variant (uses task default if not specified)
|
|
211
|
+
loss_variant: mnr
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
### HuggingFace Datasets
|
|
215
|
+
|
|
216
|
+
Load datasets directly from HuggingFace Hub:
|
|
217
|
+
|
|
218
|
+
```bash
|
|
219
|
+
vespaembed train \
|
|
220
|
+
--data sentence-transformers/all-nli \
|
|
221
|
+
--subset triplet \
|
|
222
|
+
--split train \
|
|
223
|
+
--task triplets \
|
|
224
|
+
--base-model sentence-transformers/all-MiniLM-L6-v2
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
## CLI Commands
|
|
228
|
+
|
|
229
|
+
| Command | Description |
|
|
230
|
+
|---------|-------------|
|
|
231
|
+
| `vespaembed` | Launch web UI (default) |
|
|
232
|
+
| `vespaembed serve` | Launch web UI |
|
|
233
|
+
| `vespaembed train` | Train a model |
|
|
234
|
+
| `vespaembed evaluate` | Evaluate a model |
|
|
235
|
+
| `vespaembed export` | Export model to ONNX |
|
|
236
|
+
| `vespaembed info` | Show task information |
|
|
237
|
+
|
|
238
|
+
## Output
|
|
239
|
+
|
|
240
|
+
Trained models are saved to `~/.vespaembed/projects/<project-name>/`:
|
|
241
|
+
|
|
242
|
+
```
|
|
243
|
+
~/.vespaembed/projects/my-project/
|
|
244
|
+
├── final/ # Final trained model
|
|
245
|
+
├── checkpoint-500/ # Training checkpoints
|
|
246
|
+
├── checkpoint-1000/
|
|
247
|
+
└── logs/ # TensorBoard logs
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
## Column Aliases
|
|
251
|
+
|
|
252
|
+
VespaEmbed automatically recognizes common column name variations:
|
|
253
|
+
|
|
254
|
+
| Task | Expected | Also Accepts |
|
|
255
|
+
|------|----------|--------------|
|
|
256
|
+
| pairs | `anchor` | `query`, `question`, `sent1`, `sentence1`, `text1` |
|
|
257
|
+
| pairs | `positive` | `document`, `answer`, `pos`, `sent2`, `sentence2`, `text2` |
|
|
258
|
+
| triplets | `negative` | `neg`, `hard_negative`, `sent3`, `sentence3`, `text3` |
|
|
259
|
+
| similarity | `sentence1` | `sent1`, `text1`, `anchor`, `query` |
|
|
260
|
+
| similarity | `sentence2` | `sent2`, `text2`, `positive`, `document` |
|
|
261
|
+
| similarity | `score` | `similarity`, `label`, `sim_score` |
|
|
262
|
+
| tsdae | `text` | `sentence`, `sentences`, `content`, `input` |
|
|
263
|
+
|
|
264
|
+
## Development
|
|
265
|
+
|
|
266
|
+
```bash
|
|
267
|
+
# Install dev dependencies
|
|
268
|
+
uv sync --extra dev
|
|
269
|
+
|
|
270
|
+
# Run tests
|
|
271
|
+
uv run pytest tests/
|
|
272
|
+
|
|
273
|
+
# Run tests with coverage
|
|
274
|
+
uv run pytest tests/ --cov=vespaembed
|
|
275
|
+
|
|
276
|
+
# Format code
|
|
277
|
+
make format
|
|
278
|
+
|
|
279
|
+
# Lint
|
|
280
|
+
make lint
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
## License
|
|
284
|
+
|
|
285
|
+
Apache 2.0
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "vespaembed"
|
|
3
|
-
version = "0.0.1"
|
|
3
|
+
version = "0.0.3"
|
|
4
4
|
description = "vespaembed: no-code training for embedding models"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.11"
|
|
@@ -9,11 +9,28 @@ authors = [
|
|
|
9
9
|
{name = "Abhishek Thakur"}
|
|
10
10
|
]
|
|
11
11
|
dependencies = [
|
|
12
|
-
"
|
|
12
|
+
"sentence-transformers>=3.0.0",
|
|
13
|
+
"transformers>=4.40.0",
|
|
14
|
+
"accelerate>=0.26.0",
|
|
15
|
+
"torch>=2.0.0",
|
|
16
|
+
"datasets>=2.18.0",
|
|
17
|
+
"pandas>=2.0.0",
|
|
18
|
+
"pydantic>=2.0.0",
|
|
19
|
+
"rich>=13.0.0",
|
|
20
|
+
"fastapi>=0.111.0",
|
|
21
|
+
"uvicorn>=0.30.0",
|
|
22
|
+
"python-multipart>=0.0.9",
|
|
23
|
+
"websockets>=12.0",
|
|
24
|
+
"jinja2>=3.1.0",
|
|
25
|
+
"pyyaml>=6.0.0",
|
|
26
|
+
"tensorboard>=2.15.0",
|
|
27
|
+
"peft>=0.18.1",
|
|
28
|
+
"unsloth>=2026.1.4",
|
|
13
29
|
]
|
|
14
30
|
|
|
15
31
|
[project.optional-dependencies]
|
|
16
|
-
|
|
32
|
+
unsloth = ["unsloth"]
|
|
33
|
+
onnx = ["onnx>=1.14.0", "onnxruntime>=1.23.2"]
|
|
17
34
|
dev = [
|
|
18
35
|
"black==26.1.0",
|
|
19
36
|
"isort==7.0.0",
|
|
@@ -50,17 +67,14 @@ exclude = '''
|
|
|
50
67
|
| env
|
|
51
68
|
| build
|
|
52
69
|
| dist
|
|
70
|
+
| unsloth_examples
|
|
53
71
|
)/
|
|
54
72
|
'''
|
|
55
73
|
|
|
56
74
|
[tool.isort]
|
|
57
|
-
|
|
58
|
-
force_grid_wrap = 0
|
|
59
|
-
include_trailing_comma = true
|
|
75
|
+
profile = "black"
|
|
60
76
|
line_length = 119
|
|
61
|
-
|
|
62
|
-
multi_line_output = 3
|
|
63
|
-
use_parentheses = true
|
|
77
|
+
skip = [".venv", "venv", ".env", "env", "build", "dist", "unsloth_examples"]
|
|
64
78
|
|
|
65
79
|
[tool.flake8]
|
|
66
80
|
ignore = ["E203", "E501", "W503"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.0.3"
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from argparse import ArgumentParser
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class BaseCommand(ABC):
|
|
6
|
+
"""Base class for all CLI commands."""
|
|
7
|
+
|
|
8
|
+
@staticmethod
|
|
9
|
+
@abstractmethod
|
|
10
|
+
def register_subcommand(parser: ArgumentParser):
|
|
11
|
+
"""Register the subcommand with argparse."""
|
|
12
|
+
raise NotImplementedError
|
|
13
|
+
|
|
14
|
+
@abstractmethod
|
|
15
|
+
def execute(self):
|
|
16
|
+
"""Execute the command."""
|
|
17
|
+
raise NotImplementedError
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
from vespaembed.cli.commands.evaluate import EvaluateCommand
|
|
2
|
+
from vespaembed.cli.commands.export import ExportCommand
|
|
3
|
+
from vespaembed.cli.commands.info import InfoCommand
|
|
4
|
+
from vespaembed.cli.commands.serve import ServeCommand
|
|
5
|
+
from vespaembed.cli.commands.train import TrainCommand
|
|
6
|
+
|
|
7
|
+
__all__ = ["EvaluateCommand", "ExportCommand", "InfoCommand", "ServeCommand", "TrainCommand"]
|