vespaembed 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. vespaembed/__init__.py +1 -1
  2. vespaembed/cli/__init__.py +17 -0
  3. vespaembed/cli/commands/__init__.py +7 -0
  4. vespaembed/cli/commands/evaluate.py +85 -0
  5. vespaembed/cli/commands/export.py +86 -0
  6. vespaembed/cli/commands/info.py +52 -0
  7. vespaembed/cli/commands/serve.py +49 -0
  8. vespaembed/cli/commands/train.py +267 -0
  9. vespaembed/cli/vespaembed.py +55 -0
  10. vespaembed/core/__init__.py +2 -0
  11. vespaembed/core/config.py +164 -0
  12. vespaembed/core/registry.py +158 -0
  13. vespaembed/core/trainer.py +573 -0
  14. vespaembed/datasets/__init__.py +3 -0
  15. vespaembed/datasets/formats/__init__.py +5 -0
  16. vespaembed/datasets/formats/csv.py +15 -0
  17. vespaembed/datasets/formats/huggingface.py +34 -0
  18. vespaembed/datasets/formats/jsonl.py +26 -0
  19. vespaembed/datasets/loader.py +80 -0
  20. vespaembed/db.py +176 -0
  21. vespaembed/enums.py +58 -0
  22. vespaembed/evaluation/__init__.py +3 -0
  23. vespaembed/evaluation/factory.py +86 -0
  24. vespaembed/models/__init__.py +4 -0
  25. vespaembed/models/export.py +89 -0
  26. vespaembed/models/loader.py +25 -0
  27. vespaembed/static/css/styles.css +1800 -0
  28. vespaembed/static/js/app.js +1485 -0
  29. vespaembed/tasks/__init__.py +23 -0
  30. vespaembed/tasks/base.py +144 -0
  31. vespaembed/tasks/pairs.py +91 -0
  32. vespaembed/tasks/similarity.py +84 -0
  33. vespaembed/tasks/triplets.py +90 -0
  34. vespaembed/tasks/tsdae.py +102 -0
  35. vespaembed/templates/index.html +544 -0
  36. vespaembed/utils/__init__.py +3 -0
  37. vespaembed/utils/logging.py +69 -0
  38. vespaembed/web/__init__.py +1 -0
  39. vespaembed/web/api/__init__.py +1 -0
  40. vespaembed/web/app.py +605 -0
  41. vespaembed/worker.py +313 -0
  42. vespaembed-0.0.3.dist-info/METADATA +325 -0
  43. vespaembed-0.0.3.dist-info/RECORD +47 -0
  44. {vespaembed-0.0.1.dist-info → vespaembed-0.0.3.dist-info}/WHEEL +1 -1
  45. vespaembed-0.0.1.dist-info/METADATA +0 -20
  46. vespaembed-0.0.1.dist-info/RECORD +0 -7
  47. {vespaembed-0.0.1.dist-info → vespaembed-0.0.3.dist-info}/entry_points.txt +0 -0
  48. {vespaembed-0.0.1.dist-info → vespaembed-0.0.3.dist-info}/licenses/LICENSE +0 -0
  49. {vespaembed-0.0.1.dist-info → vespaembed-0.0.3.dist-info}/top_level.txt +0 -0
vespaembed/core/config.py
@@ -0,0 +1,164 @@
+ from typing import Literal, Optional
+
+ from pydantic import BaseModel, Field
+
+ from vespaembed.enums import LossVariant, TaskType
+
+ # Optimizer choices (most common ones from HuggingFace)
+ OptimizerType = Literal[
+     "adamw_torch",  # Default AdamW
+     "adamw_torch_fused",  # Fused AdamW (faster on CUDA)
+     "adamw_8bit",  # 8-bit AdamW (memory efficient)
+     "adafactor",  # Adafactor (memory efficient, no momentum)
+     "sgd",  # SGD with momentum
+ ]
+
+ # Scheduler choices
+ SchedulerType = Literal[
+     "linear",  # Linear decay (default)
+     "cosine",  # Cosine annealing
+     "cosine_with_restarts",  # Cosine with warm restarts
+     "constant",  # Constant learning rate
+     "constant_with_warmup",  # Constant after warmup
+     "polynomial",  # Polynomial decay
+ ]
+
+
+ class DataConfig(BaseModel):
+     """Data configuration."""
+
+     train: str = Field(..., description="Path to training data (CSV, JSONL, or HF dataset)")
+     eval: Optional[str] = Field(None, description="Path to evaluation data (or HF dataset name)")
+     subset: Optional[str] = Field(None, description="HuggingFace dataset subset")
+     split: Optional[str] = Field(None, description="HuggingFace dataset split for training")
+     eval_split: Optional[str] = Field(None, description="HuggingFace dataset split for evaluation")
+
+
+ class LoraConfig(BaseModel):
+     """LoRA/PEFT configuration - works with both standard and Unsloth training."""
+
+     enabled: bool = Field(False, description="Enable LoRA training")
+     r: int = Field(64, description="LoRA rank (common values: 8, 16, 32, 64, 128)", ge=1)
+     alpha: int = Field(128, description="LoRA alpha (typically 2x rank)")
+     dropout: float = Field(0.1, description="LoRA dropout (use 0 for Unsloth optimization)", ge=0, le=1)
+     target_modules: list[str] = Field(
+         default=["query", "key", "value", "dense"],
+         description="Target modules for LoRA",
+     )
+
+
+ class UnslothConfig(BaseModel):
+     """Unsloth-specific configuration for faster training."""
+
+     enabled: bool = Field(False, description="Enable Unsloth for faster training")
+     save_method: Literal["lora", "merged_16bit", "merged_4bit"] = Field(
+         "merged_16bit",
+         description="How to save the model (lora=adapters only, merged=full model)",
+     )
+
+
+ class TrainingHyperparameters(BaseModel):
+     """Training hyperparameters."""
+
+     epochs: int = Field(3, description="Number of training epochs", ge=1)
+     batch_size: int = Field(32, description="Batch size", ge=1)
+     learning_rate: float = Field(2e-5, description="Learning rate", gt=0)
+     warmup_ratio: float = Field(0.1, description="Warmup ratio", ge=0, le=1)
+     weight_decay: float = Field(0.01, description="Weight decay", ge=0)
+     fp16: bool = Field(True, description="Use FP16 training")
+     bf16: bool = Field(False, description="Use BF16 training")
+     eval_steps: int = Field(500, description="Evaluate every N steps", ge=1)
+     save_steps: int = Field(500, description="Save checkpoint every N steps", ge=1)
+     logging_steps: int = Field(100, description="Log every N steps", ge=1)
+     gradient_accumulation_steps: int = Field(1, description="Gradient accumulation steps", ge=1)
+
+     # Optimizer and scheduler
+     optimizer: OptimizerType = Field(
+         "adamw_torch",
+         description="Optimizer type (adamw_torch, adamw_torch_fused, adamw_8bit, adafactor, sgd)",
+     )
+     scheduler: SchedulerType = Field(
+         "linear",
+         description="Learning rate scheduler (linear, cosine, cosine_with_restarts, constant, constant_with_warmup, polynomial)",
+     )
+
+
+ class OutputConfig(BaseModel):
+     """Output configuration."""
+
+     dir: str = Field("./output", description="Output directory")
+     save_total_limit: int = Field(3, description="Maximum checkpoints to keep", ge=1)
+     push_to_hub: bool = Field(False, description="Push model to HuggingFace Hub")
+     hf_username: Optional[str] = Field(None, description="HuggingFace username")
+
+
+ class TrainingConfig(BaseModel):
+     """Complete training configuration."""
+
+     # Required
+     task: TaskType = Field(..., description="Training task type")
+     base_model: str = Field(..., description="Base model name or path")
+     data: DataConfig = Field(..., description="Data configuration")
+
+     # Loss variant (optional - uses task default if not specified)
+     loss_variant: Optional[LossVariant] = Field(
+         None,
+         description="Loss function variant (task-specific, uses default if not specified)",
+     )
+
+     # Optional
+     training: TrainingHyperparameters = Field(
+         default_factory=TrainingHyperparameters,
+         description="Training hyperparameters",
+     )
+     output: OutputConfig = Field(
+         default_factory=OutputConfig,
+         description="Output configuration",
+     )
+
+     # LoRA/PEFT configuration
+     lora: LoraConfig = Field(
+         default_factory=LoraConfig,
+         description="LoRA/PEFT configuration",
+     )
+
+     # Unsloth configuration
+     unsloth: UnslothConfig = Field(
+         default_factory=UnslothConfig,
+         description="Unsloth configuration for faster training",
+     )
+
+     # Model configuration
+     max_seq_length: Optional[int] = Field(
+         None,
+         description="Maximum sequence length (auto-detect from model if not specified)",
+         ge=1,
+     )
+     gradient_checkpointing: bool = Field(
+         False,
+         description="Enable gradient checkpointing (saves VRAM, uses Unsloth optimization when Unsloth is enabled)",
+     )
+
+     # Matryoshka dimensions (optional)
+     matryoshka_dims: Optional[list[int]] = Field(
+         None,
+         description="Matryoshka embedding dimensions (e.g., [768, 512, 256, 128])",
+     )
+
+     class Config:
+         use_enum_values = True
+
+
+ def load_config_from_yaml(path: str) -> TrainingConfig:
+     """Load configuration from a YAML file."""
+     import yaml
+
+     with open(path) as f:
+         data = yaml.safe_load(f)
+
+     return TrainingConfig(**data)
+
+
+ def load_config_from_dict(data: dict) -> TrainingConfig:
+     """Load configuration from a dictionary."""
+     return TrainingConfig(**data)
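
For orientation, here is a minimal usage sketch of the config helpers added above. It is not part of the diff: the task value "triplets", the model name, and the file paths are illustrative assumptions; the real TaskType values live in vespaembed/enums.py, which is not shown in this excerpt.

    # Hypothetical usage sketch, not part of the package diff.
    from vespaembed.core.config import load_config_from_dict

    config = load_config_from_dict(
        {
            "task": "triplets",  # assumed TaskType value; see vespaembed/enums.py
            "base_model": "sentence-transformers/all-MiniLM-L6-v2",
            "data": {"train": "train.jsonl", "eval": "eval.jsonl"},
            "training": {"epochs": 1, "batch_size": 16, "optimizer": "adamw_torch"},
            "lora": {"enabled": True, "r": 16, "alpha": 32},
        }
    )
    print(config.output.dir)  # "./output", the OutputConfig default

Unspecified sections fall back to their defaults via default_factory, so a config needs only task, base_model, and data; load_config_from_yaml does the same after parsing a YAML file.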
vespaembed/core/registry.py
@@ -0,0 +1,158 @@
+ from typing import TYPE_CHECKING, Type
+
+ if TYPE_CHECKING:
+     from vespaembed.tasks.base import BaseTask
+
+
+ # Default hyperparameters for all tasks
+ DEFAULT_HYPERPARAMETERS = {
+     "epochs": 3,
+     "batch_size": 32,
+     "learning_rate": 2e-5,
+     "warmup_ratio": 0.1,
+     "weight_decay": 0.01,
+     "fp16": True,
+     "bf16": False,
+     "eval_steps": 500,
+     "save_steps": 500,
+     "logging_steps": 100,
+     "gradient_accumulation_steps": 1,
+     "optimizer": "adamw_torch",
+     "scheduler": "linear",
+ }
+
+ # Available optimizer options
+ OPTIMIZER_OPTIONS = {
+     "adamw_torch": "AdamW (default)",
+     "adamw_torch_fused": "AdamW Fused (faster on CUDA)",
+     "adamw_8bit": "AdamW 8-bit (memory efficient)",
+     "adafactor": "Adafactor (memory efficient)",
+     "sgd": "SGD with momentum",
+ }
+
+ # Available scheduler options
+ SCHEDULER_OPTIONS = {
+     "linear": "Linear decay (default)",
+     "cosine": "Cosine annealing",
+     "cosine_with_restarts": "Cosine with warm restarts",
+     "constant": "Constant learning rate",
+     "constant_with_warmup": "Constant after warmup",
+     "polynomial": "Polynomial decay",
+ }
+
+ # Task-specific parameter definitions (currently none - matryoshka is now a global option)
+ TASK_SPECIFIC_PARAMS = {}
+
+ # Sample data for each task
+ TASK_SAMPLE_DATA = {
+     "pairs": [
+         {
+             "anchor": "What is machine learning?",
+             "positive": "Machine learning is a subset of AI that enables systems to learn from data.",
+         },
+         {
+             "anchor": "How does photosynthesis work?",
+             "positive": "Photosynthesis converts sunlight into chemical energy in plants.",
+         },
+     ],
+     "triplets": [
+         {
+             "anchor": "What is Python?",
+             "positive": "Python is a programming language known for its simple syntax.",
+             "negative": "A python is a large non-venomous snake.",
+         },
+         {
+             "anchor": "Apple stock price",
+             "positive": "AAPL shares are trading on NASDAQ.",
+             "negative": "Apples are nutritious fruits that grow on trees.",
+         },
+     ],
+     "similarity": [
+         {"sentence1": "A man is playing guitar", "sentence2": "A person plays a musical instrument", "score": 0.85},
+         {"sentence1": "A dog is running", "sentence2": "The cat sleeps peacefully", "score": 0.12},
+     ],
+     "tsdae": [
+         {"text": "Machine learning is transforming how we analyze data."},
+         {"text": "Natural language processing enables computers to understand human language."},
+     ],
+ }
+
+
+ class Registry:
+     """Central registry for tasks."""
+
+     _tasks: dict[str, Type["BaseTask"]] = {}
+
+     @classmethod
+     def register_task(cls, name: str):
+         """Decorator to register a task.
+
+         Usage:
+             @Registry.register_task("mnr")
+             class MNRTask(BaseTask):
+                 ...
+         """
+
+         def decorator(task_cls: Type["BaseTask"]):
+             cls._tasks[name] = task_cls
+             return task_cls
+
+         return decorator
+
+     @classmethod
+     def get_task(cls, name: str) -> Type["BaseTask"]:
+         """Get a task class by name.
+
+         Args:
+             name: Task name (e.g., "mnr", "triplet")
+
+         Returns:
+             Task class
+
+         Raises:
+             ValueError: If task is not found
+         """
+         if name not in cls._tasks:
+             available = ", ".join(sorted(cls._tasks.keys()))
+             raise ValueError(f"Unknown task: '{name}'. Available tasks: {available}")
+         return cls._tasks[name]
+
+     @classmethod
+     def list_tasks(cls) -> list[str]:
+         """List all registered task names."""
+         return sorted(cls._tasks.keys())
+
+     @classmethod
+     def get_task_info(cls, name: str = None) -> dict | list[dict]:
+         """Get information about registered tasks.
+
+         Args:
+             name: If provided, get info for a specific task. Otherwise, get all tasks.
+
+         Returns:
+             Task info dict or list of task info dicts
+         """
+         if name:
+             if name not in cls._tasks:
+                 available = ", ".join(sorted(cls._tasks.keys()))
+                 raise ValueError(f"Unknown task: '{name}'. Available tasks: {available}")
+             return cls._build_task_info(name, cls._tasks[name])
+
+         return [cls._build_task_info(task_name, task_cls) for task_name, task_cls in sorted(cls._tasks.items())]
+
+     @classmethod
+     def _build_task_info(cls, name: str, task_cls: Type["BaseTask"]) -> dict:
+         """Build task info dictionary."""
+         info = {
+             "name": name,
+             "description": getattr(task_cls, "description", ""),
+             "expected_columns": getattr(task_cls, "expected_columns", []),
+             "optional_columns": getattr(task_cls, "optional_columns", []),
+             "column_aliases": getattr(task_cls, "column_aliases", {}),
+             "loss_options": getattr(task_cls, "loss_options", []),
+             "default_loss": getattr(task_cls, "default_loss", ""),
+             "hyperparameters": DEFAULT_HYPERPARAMETERS.copy(),
+             "task_specific_params": TASK_SPECIFIC_PARAMS.get(name, {}),
+             "sample_data": TASK_SAMPLE_DATA.get(name, []),
+         }
+         return info
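
A minimal sketch of the registration flow implied by the code above, mirroring the docstring of Registry.register_task. It is not part of the diff: the task name "pairs" and the PairsTask attributes are illustrative, and BaseTask's actual interface is defined in vespaembed/tasks/base.py, which this excerpt does not show.

    # Hypothetical usage sketch, not part of the package diff.
    from vespaembed.core.registry import Registry
    from vespaembed.tasks.base import BaseTask  # interface assumed from the file list

    @Registry.register_task("pairs")
    class PairsTask(BaseTask):
        description = "Anchor/positive pair training"   # illustrative metadata
        expected_columns = ["anchor", "positive"]

    print(Registry.list_tasks())            # ["pairs"]
    task_cls = Registry.get_task("pairs")   # raises ValueError for unknown names
    info = Registry.get_task_info("pairs")  # dict with columns, losses, defaults, sample data

Registering a task only stores the class in Registry._tasks; the web UI and CLI can then surface the per-task defaults and sample data that _build_task_info assembles.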