vespaembed 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. vespaembed/__init__.py +1 -1
  2. vespaembed/cli/__init__.py +17 -0
  3. vespaembed/cli/commands/__init__.py +7 -0
  4. vespaembed/cli/commands/evaluate.py +85 -0
  5. vespaembed/cli/commands/export.py +86 -0
  6. vespaembed/cli/commands/info.py +52 -0
  7. vespaembed/cli/commands/serve.py +49 -0
  8. vespaembed/cli/commands/train.py +267 -0
  9. vespaembed/cli/vespaembed.py +55 -0
  10. vespaembed/core/__init__.py +2 -0
  11. vespaembed/core/config.py +164 -0
  12. vespaembed/core/registry.py +158 -0
  13. vespaembed/core/trainer.py +573 -0
  14. vespaembed/datasets/__init__.py +3 -0
  15. vespaembed/datasets/formats/__init__.py +5 -0
  16. vespaembed/datasets/formats/csv.py +15 -0
  17. vespaembed/datasets/formats/huggingface.py +34 -0
  18. vespaembed/datasets/formats/jsonl.py +26 -0
  19. vespaembed/datasets/loader.py +80 -0
  20. vespaembed/db.py +176 -0
  21. vespaembed/enums.py +58 -0
  22. vespaembed/evaluation/__init__.py +3 -0
  23. vespaembed/evaluation/factory.py +86 -0
  24. vespaembed/models/__init__.py +4 -0
  25. vespaembed/models/export.py +89 -0
  26. vespaembed/models/loader.py +25 -0
  27. vespaembed/static/css/styles.css +1800 -0
  28. vespaembed/static/js/app.js +1485 -0
  29. vespaembed/tasks/__init__.py +23 -0
  30. vespaembed/tasks/base.py +144 -0
  31. vespaembed/tasks/pairs.py +91 -0
  32. vespaembed/tasks/similarity.py +84 -0
  33. vespaembed/tasks/triplets.py +90 -0
  34. vespaembed/tasks/tsdae.py +102 -0
  35. vespaembed/templates/index.html +544 -0
  36. vespaembed/utils/__init__.py +3 -0
  37. vespaembed/utils/logging.py +69 -0
  38. vespaembed/web/__init__.py +1 -0
  39. vespaembed/web/api/__init__.py +1 -0
  40. vespaembed/web/app.py +605 -0
  41. vespaembed/worker.py +313 -0
  42. vespaembed-0.0.3.dist-info/METADATA +325 -0
  43. vespaembed-0.0.3.dist-info/RECORD +47 -0
  44. {vespaembed-0.0.1.dist-info → vespaembed-0.0.3.dist-info}/WHEEL +1 -1
  45. vespaembed-0.0.1.dist-info/METADATA +0 -20
  46. vespaembed-0.0.1.dist-info/RECORD +0 -7
  47. {vespaembed-0.0.1.dist-info → vespaembed-0.0.3.dist-info}/entry_points.txt +0 -0
  48. {vespaembed-0.0.1.dist-info → vespaembed-0.0.3.dist-info}/licenses/LICENSE +0 -0
  49. {vespaembed-0.0.1.dist-info → vespaembed-0.0.3.dist-info}/top_level.txt +0 -0
vespaembed/core/config.py
@@ -0,0 +1,164 @@
+ from typing import Literal, Optional
+
+ from pydantic import BaseModel, Field
+
+ from vespaembed.enums import LossVariant, TaskType
+
+ # Optimizer choices (most common ones from HuggingFace)
+ OptimizerType = Literal[
+     "adamw_torch",  # Default AdamW
+     "adamw_torch_fused",  # Fused AdamW (faster on CUDA)
+     "adamw_8bit",  # 8-bit AdamW (memory efficient)
+     "adafactor",  # Adafactor (memory efficient, no momentum)
+     "sgd",  # SGD with momentum
+ ]
+
+ # Scheduler choices
+ SchedulerType = Literal[
+     "linear",  # Linear decay (default)
+     "cosine",  # Cosine annealing
+     "cosine_with_restarts",  # Cosine with warm restarts
+     "constant",  # Constant learning rate
+     "constant_with_warmup",  # Constant after warmup
+     "polynomial",  # Polynomial decay
+ ]
+
+
+ class DataConfig(BaseModel):
+     """Data configuration."""
+
+     train: str = Field(..., description="Path to training data (CSV, JSONL, or HF dataset)")
+     eval: Optional[str] = Field(None, description="Path to evaluation data (or HF dataset name)")
+     subset: Optional[str] = Field(None, description="HuggingFace dataset subset")
+     split: Optional[str] = Field(None, description="HuggingFace dataset split for training")
+     eval_split: Optional[str] = Field(None, description="HuggingFace dataset split for evaluation")
+
+
+ class LoraConfig(BaseModel):
+     """LoRA/PEFT configuration - works with both standard and Unsloth training."""
+
+     enabled: bool = Field(False, description="Enable LoRA training")
+     r: int = Field(64, description="LoRA rank (common values: 8, 16, 32, 64, 128)", ge=1)
+     alpha: int = Field(128, description="LoRA alpha (typically 2x rank)")
+     dropout: float = Field(0.1, description="LoRA dropout (use 0 for Unsloth optimization)", ge=0, le=1)
+     target_modules: list[str] = Field(
+         default=["query", "key", "value", "dense"],
+         description="Target modules for LoRA",
+     )
+
+
+ class UnslothConfig(BaseModel):
+     """Unsloth-specific configuration for faster training."""
+
+     enabled: bool = Field(False, description="Enable Unsloth for faster training")
+     save_method: Literal["lora", "merged_16bit", "merged_4bit"] = Field(
+         "merged_16bit",
+         description="How to save the model (lora=adapters only, merged=full model)",
+     )
+
+
+ class TrainingHyperparameters(BaseModel):
+     """Training hyperparameters."""
+
+     epochs: int = Field(3, description="Number of training epochs", ge=1)
+     batch_size: int = Field(32, description="Batch size", ge=1)
+     learning_rate: float = Field(2e-5, description="Learning rate", gt=0)
+     warmup_ratio: float = Field(0.1, description="Warmup ratio", ge=0, le=1)
+     weight_decay: float = Field(0.01, description="Weight decay", ge=0)
+     fp16: bool = Field(True, description="Use FP16 training")
+     bf16: bool = Field(False, description="Use BF16 training")
+     eval_steps: int = Field(500, description="Evaluate every N steps", ge=1)
+     save_steps: int = Field(500, description="Save checkpoint every N steps", ge=1)
+     logging_steps: int = Field(100, description="Log every N steps", ge=1)
+     gradient_accumulation_steps: int = Field(1, description="Gradient accumulation steps", ge=1)
+
+     # Optimizer and scheduler
+     optimizer: OptimizerType = Field(
+         "adamw_torch",
+         description="Optimizer type (adamw_torch, adamw_torch_fused, adamw_8bit, adafactor, sgd)",
+     )
+     scheduler: SchedulerType = Field(
+         "linear",
+         description="Learning rate scheduler (linear, cosine, cosine_with_restarts, constant, constant_with_warmup, polynomial)",
+     )
+
+
+ class OutputConfig(BaseModel):
+     """Output configuration."""
+
+     dir: str = Field("./output", description="Output directory")
+     save_total_limit: int = Field(3, description="Maximum checkpoints to keep", ge=1)
+     push_to_hub: bool = Field(False, description="Push model to HuggingFace Hub")
+     hf_username: Optional[str] = Field(None, description="HuggingFace username")
+
+
+ class TrainingConfig(BaseModel):
+     """Complete training configuration."""
+
+     # Required
+     task: TaskType = Field(..., description="Training task type")
+     base_model: str = Field(..., description="Base model name or path")
+     data: DataConfig = Field(..., description="Data configuration")
+
+     # Loss variant (optional - uses task default if not specified)
+     loss_variant: Optional[LossVariant] = Field(
+         None,
+         description="Loss function variant (task-specific, uses default if not specified)",
+     )
+
+     # Optional
+     training: TrainingHyperparameters = Field(
+         default_factory=TrainingHyperparameters,
+         description="Training hyperparameters",
+     )
+     output: OutputConfig = Field(
+         default_factory=OutputConfig,
+         description="Output configuration",
+     )
+
+     # LoRA/PEFT configuration
+     lora: LoraConfig = Field(
+         default_factory=LoraConfig,
+         description="LoRA/PEFT configuration",
+     )
+
+     # Unsloth configuration
+     unsloth: UnslothConfig = Field(
+         default_factory=UnslothConfig,
+         description="Unsloth configuration for faster training",
+     )
+
+     # Model configuration
+     max_seq_length: Optional[int] = Field(
+         None,
+         description="Maximum sequence length (auto-detect from model if not specified)",
+         ge=1,
+     )
+     gradient_checkpointing: bool = Field(
+         False,
+         description="Enable gradient checkpointing (saves VRAM, uses Unsloth optimization when Unsloth is enabled)",
+     )
+
+     # Matryoshka dimensions (optional)
+     matryoshka_dims: Optional[list[int]] = Field(
+         None,
+         description="Matryoshka embedding dimensions (e.g., [768, 512, 256, 128])",
+     )
+
+     class Config:
+         use_enum_values = True
+
+
+ def load_config_from_yaml(path: str) -> TrainingConfig:
+     """Load configuration from a YAML file."""
+     import yaml
+
+     with open(path) as f:
+         data = yaml.safe_load(f)
+
+     return TrainingConfig(**data)
+
+
+ def load_config_from_dict(data: dict) -> TrainingConfig:
+     """Load configuration from a dictionary."""
+     return TrainingConfig(**data)
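
For orientation, here is a minimal usage sketch of the config helpers added above. It is not part of the diff: the task value "triplets", the model name, and the file paths are illustrative assumptions; the real TaskType values live in vespaembed/enums.py, which is not shown in this excerpt.

    # Hypothetical usage sketch, not part of the package diff.
    from vespaembed.core.config import load_config_from_dict

    config = load_config_from_dict(
        {
            "task": "triplets",  # assumed TaskType value; see vespaembed/enums.py
            "base_model": "sentence-transformers/all-MiniLM-L6-v2",
            "data": {"train": "train.jsonl", "eval": "eval.jsonl"},
            "training": {"epochs": 1, "batch_size": 16, "optimizer": "adamw_torch"},
            "lora": {"enabled": True, "r": 16, "alpha": 32},
        }
    )
    print(config.output.dir)  # "./output", the OutputConfig default

Unspecified sections fall back to their defaults via default_factory, so a config needs only task, base_model, and data; load_config_from_yaml does the same after parsing a YAML file.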
vespaembed/core/registry.py
@@ -0,0 +1,158 @@
+ from typing import TYPE_CHECKING, Type
+
+ if TYPE_CHECKING:
+     from vespaembed.tasks.base import BaseTask
+
+
+ # Default hyperparameters for all tasks
+ DEFAULT_HYPERPARAMETERS = {
+     "epochs": 3,
+     "batch_size": 32,
+     "learning_rate": 2e-5,
+     "warmup_ratio": 0.1,
+     "weight_decay": 0.01,
+     "fp16": True,
+     "bf16": False,
+     "eval_steps": 500,
+     "save_steps": 500,
+     "logging_steps": 100,
+     "gradient_accumulation_steps": 1,
+     "optimizer": "adamw_torch",
+     "scheduler": "linear",
+ }
+
+ # Available optimizer options
+ OPTIMIZER_OPTIONS = {
+     "adamw_torch": "AdamW (default)",
+     "adamw_torch_fused": "AdamW Fused (faster on CUDA)",
+     "adamw_8bit": "AdamW 8-bit (memory efficient)",
+     "adafactor": "Adafactor (memory efficient)",
+     "sgd": "SGD with momentum",
+ }
+
+ # Available scheduler options
+ SCHEDULER_OPTIONS = {
+     "linear": "Linear decay (default)",
+     "cosine": "Cosine annealing",
+     "cosine_with_restarts": "Cosine with warm restarts",
+     "constant": "Constant learning rate",
+     "constant_with_warmup": "Constant after warmup",
+     "polynomial": "Polynomial decay",
+ }
+
+ # Task-specific parameter definitions (currently none - matryoshka is now a global option)
+ TASK_SPECIFIC_PARAMS = {}
+
+ # Sample data for each task
+ TASK_SAMPLE_DATA = {
+     "pairs": [
+         {
+             "anchor": "What is machine learning?",
+             "positive": "Machine learning is a subset of AI that enables systems to learn from data.",
+         },
+         {
+             "anchor": "How does photosynthesis work?",
+             "positive": "Photosynthesis converts sunlight into chemical energy in plants.",
+         },
+     ],
+     "triplets": [
+         {
+             "anchor": "What is Python?",
+             "positive": "Python is a programming language known for its simple syntax.",
+             "negative": "A python is a large non-venomous snake.",
+         },
+         {
+             "anchor": "Apple stock price",
+             "positive": "AAPL shares are trading on NASDAQ.",
+             "negative": "Apples are nutritious fruits that grow on trees.",
+         },
+     ],
+     "similarity": [
+         {"sentence1": "A man is playing guitar", "sentence2": "A person plays a musical instrument", "score": 0.85},
+         {"sentence1": "A dog is running", "sentence2": "The cat sleeps peacefully", "score": 0.12},
+     ],
+     "tsdae": [
+         {"text": "Machine learning is transforming how we analyze data."},
+         {"text": "Natural language processing enables computers to understand human language."},
+     ],
+ }
+
+
+ class Registry:
+     """Central registry for tasks."""
+
+     _tasks: dict[str, Type["BaseTask"]] = {}
+
+     @classmethod
+     def register_task(cls, name: str):
+         """Decorator to register a task.
+
+         Usage:
+             @Registry.register_task("mnr")
+             class MNRTask(BaseTask):
+                 ...
+         """
+
+         def decorator(task_cls: Type["BaseTask"]):
+             cls._tasks[name] = task_cls
+             return task_cls
+
+         return decorator
+
+     @classmethod
+     def get_task(cls, name: str) -> Type["BaseTask"]:
+         """Get a task class by name.
+
+         Args:
+             name: Task name (e.g., "mnr", "triplet")
+
+         Returns:
+             Task class
+
+         Raises:
+             ValueError: If task is not found
+         """
+         if name not in cls._tasks:
+             available = ", ".join(sorted(cls._tasks.keys()))
+             raise ValueError(f"Unknown task: '{name}'. Available tasks: {available}")
+         return cls._tasks[name]
+
+     @classmethod
+     def list_tasks(cls) -> list[str]:
+         """List all registered task names."""
+         return sorted(cls._tasks.keys())
+
+     @classmethod
+     def get_task_info(cls, name: str = None) -> dict | list[dict]:
+         """Get information about registered tasks.
+
+         Args:
+             name: If provided, get info for a specific task. Otherwise, get all tasks.
+
+         Returns:
+             Task info dict or list of task info dicts
+         """
+         if name:
+             if name not in cls._tasks:
+                 available = ", ".join(sorted(cls._tasks.keys()))
+                 raise ValueError(f"Unknown task: '{name}'. Available tasks: {available}")
+             return cls._build_task_info(name, cls._tasks[name])
+
+         return [cls._build_task_info(task_name, task_cls) for task_name, task_cls in sorted(cls._tasks.items())]
+
+     @classmethod
+     def _build_task_info(cls, name: str, task_cls: Type["BaseTask"]) -> dict:
+         """Build task info dictionary."""
+         info = {
+             "name": name,
+             "description": getattr(task_cls, "description", ""),
+             "expected_columns": getattr(task_cls, "expected_columns", []),
+             "optional_columns": getattr(task_cls, "optional_columns", []),
+             "column_aliases": getattr(task_cls, "column_aliases", {}),
+             "loss_options": getattr(task_cls, "loss_options", []),
+             "default_loss": getattr(task_cls, "default_loss", ""),
+             "hyperparameters": DEFAULT_HYPERPARAMETERS.copy(),
+             "task_specific_params": TASK_SPECIFIC_PARAMS.get(name, {}),
+             "sample_data": TASK_SAMPLE_DATA.get(name, []),
+         }
+         return info
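
A minimal sketch of the registration flow implied by the code above, mirroring the docstring of Registry.register_task. It is not part of the diff: the task name "pairs" and the PairsTask attributes are illustrative, and BaseTask's actual interface is defined in vespaembed/tasks/base.py, which this excerpt does not show.

    # Hypothetical usage sketch, not part of the package diff.
    from vespaembed.core.registry import Registry
    from vespaembed.tasks.base import BaseTask  # interface assumed from the file list

    @Registry.register_task("pairs")
    class PairsTask(BaseTask):
        description = "Anchor/positive pair training"   # illustrative metadata
        expected_columns = ["anchor", "positive"]

    print(Registry.list_tasks())            # ["pairs"]
    task_cls = Registry.get_task("pairs")   # raises ValueError for unknown names
    info = Registry.get_task_info("pairs")  # dict with columns, losses, defaults, sample data

Registering a task only stores the class in Registry._tasks; the web UI and CLI can then surface the per-task defaults and sample data that _build_task_info assembles.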