PyPI - embed-train - Versions diffs - 1.0.0__tar.gz - Mend

embed-train 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

embed_train-1.0.0/.gitignore +38 -0
embed_train-1.0.0/.gitlab-ci.yml +110 -0
embed_train-1.0.0/.pre-commit-config.yaml +48 -0
embed_train-1.0.0/.releaserc.json +28 -0
embed_train-1.0.0/AGENTS.md +187 -0
embed_train-1.0.0/CHANGELOG.md +6 -0
embed_train-1.0.0/Makefile +87 -0
embed_train-1.0.0/PKG-INFO +283 -0
embed_train-1.0.0/README.md +270 -0
embed_train-1.0.0/codecov.yml +30 -0
embed_train-1.0.0/commitlint.config.cjs +22 -0
embed_train-1.0.0/pyproject.toml +84 -0
embed_train-1.0.0/src/embed_train/__init__.py +37 -0
embed_train-1.0.0/src/embed_train/constants.py +3 -0
embed_train-1.0.0/src/embed_train/exceptions.py +31 -0
embed_train-1.0.0/src/embed_train/models/__init__.py +61 -0
embed_train-1.0.0/src/embed_train/push_to_hf/__init__.py +131 -0
embed_train-1.0.0/src/embed_train/py.typed +0 -0
embed_train-1.0.0/src/embed_train/settings.py +202 -0
embed_train-1.0.0/src/embed_train/train/__init__.py +17 -0
embed_train-1.0.0/src/embed_train/train/dataset/__init__.py +109 -0
embed_train-1.0.0/src/embed_train/train/dataset/collate.py +52 -0
embed_train-1.0.0/src/embed_train/train/dataset/sampling/__init__.py +46 -0
embed_train-1.0.0/src/embed_train/train/dataset/sampling/samplers.py +36 -0
embed_train-1.0.0/src/embed_train/train/dataset/torch_datasets.py +71 -0
embed_train-1.0.0/src/embed_train/train/trainers/__init__.py +22 -0
embed_train-1.0.0/src/embed_train/train/trainers/hf/__init__.py +158 -0
embed_train-1.0.0/src/embed_train/train/trainers/torch/__init__.py +226 -0
embed_train-1.0.0/src/embed_train/train/trainers/torch/loss.py +99 -0
embed_train-1.0.0/src/embed_train/utils.py +80 -0
embed_train-1.0.0/tests/__init__.py +0 -0
embed_train-1.0.0/tests/conftest.py +33 -0
embed_train-1.0.0/tests/fixtures/__init__.py +0 -0
embed_train-1.0.0/tests/fixtures/components.py +477 -0
embed_train-1.0.0/tests/fixtures/data.py +40 -0
embed_train-1.0.0/tests/integration/__init__.py +0 -0
embed_train-1.0.0/tests/integration/test_dataset/__init__.py +0 -0
embed_train-1.0.0/tests/integration/test_dataset/test_to_hf_dataset.py +14 -0
embed_train-1.0.0/tests/integration/test_train_runner/__init__.py +0 -0
embed_train-1.0.0/tests/integration/test_train_runner/test_train_runner_flow.py +19 -0
embed_train-1.0.0/tests/unit/__init__.py +0 -0
embed_train-1.0.0/tests/unit/test_abstract_guards.py +60 -0
embed_train-1.0.0/tests/unit/test_embed_train.py +36 -0
embed_train-1.0.0/tests/unit/test_exceptions.py +17 -0
embed_train-1.0.0/tests/unit/test_models.py +65 -0
embed_train-1.0.0/tests/unit/test_push_to_hf.py +207 -0
embed_train-1.0.0/tests/unit/test_settings.py +83 -0
embed_train-1.0.0/tests/unit/test_train/__init__.py +0 -0
embed_train-1.0.0/tests/unit/test_train/test_collate.py +42 -0
embed_train-1.0.0/tests/unit/test_train/test_dataset.py +64 -0
embed_train-1.0.0/tests/unit/test_train/test_hf_trainer.py +153 -0
embed_train-1.0.0/tests/unit/test_train/test_loss.py +39 -0
embed_train-1.0.0/tests/unit/test_train/test_runner.py +19 -0
embed_train-1.0.0/tests/unit/test_train/test_samplers.py +19 -0
embed_train-1.0.0/tests/unit/test_train/test_sampling.py +9 -0
embed_train-1.0.0/tests/unit/test_train/test_torch_datasets.py +29 -0
embed_train-1.0.0/tests/unit/test_train/test_torch_trainer.py +127 -0
embed_train-1.0.0/tests/unit/test_train/test_trainers.py +16 -0
embed_train-1.0.0/tests/unit/test_utils.py +98 -0
embed_train-1.0.0/uv.lock +2572 -0

embed_train-1.0.0/.gitignore ADDED Viewed

@@ -0,0 +1,38 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+# Virtual environments
+.venv
+# env files
+.env
+# Pycharm
+.idea/
+# VSCode
+.vscode/
+# Testing
+.coverage
+coverage.xml
+htmlcov/
+.pytest_cache/
+.mypy_cache/
+# Ruff
+.ruff_cache/
+# Pre-commit
+.pre-commit-cache/
+# MacOS
+.DS_Store
+# Claude
+.claude/

embed_train-1.0.0/.gitlab-ci.yml ADDED Viewed

@@ -0,0 +1,110 @@
+# Default container image; jobs that set their own `image` (release, conventional-commits) override it.
+image: python:3.12
+# Create pipelines only for merge requests targeting `dev` and for commits on `dev`;
+# the trailing `when: never` rejects every other trigger.
+workflow:
+  rules:
+    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == "dev"'
+    - if: '$CI_COMMIT_BRANCH == "dev"'
+    - when: never
+stages:
+  - checks
+  - test
+  - deploy
+variables:
+  UV_SYSTEM_PYTHON: "1"
+  PIP_DISABLE_PIP_VERSION_CHECK: "1"
+  # Depth 0 requests a full clone instead of a shallow one.
+  GIT_DEPTH: 0
+# The cache key is derived from uv.lock, so the cache is rebuilt when the lockfile changes.
+cache:
+  key:
+    files:
+      - uv.lock
+  paths:
+    - .venv/
+    # NOTE(review): GitLab documents cache paths as relative to the project directory;
+    # `~/.cache/uv` lies outside it and is probably never cached. Consider pointing
+    # UV_CACHE_DIR at a directory inside the workspace — confirm linters would skip it.
+    - ~/.cache/uv
+# Runs before every job's script unless the job overrides `before_script`.
+before_script:
+  - pip install uv
+  - uv sync --group dev --all-extras
+# Lint and verify formatting; `format --check` reports differences without rewriting files.
+ruff:
+  stage: checks
+  script:
+    - uv run ruff check .
+    - uv run ruff format --check .
+# Static type checking with ty over the whole repository.
+ty:
+  stage: checks
+  script:
+    - uv run ty check .
+# Security scan of the repository tree using the bandit settings stored in pyproject.toml.
+bandit:
+  stage: checks
+  script:
+    - uv run bandit -c pyproject.toml -r .
+# Merge-request pipelines run the suite with coverage and an 80% threshold;
+# other pipelines run the plain test suite.
+pytest:
+  stage: test
+  script:
+    - |
+      if [ "$CI_PIPELINE_SOURCE" = "merge_request_event" ]; then
+        uv run pytest --cov=src --cov-report=xml --cov-report=term --cov-fail-under=80
+      else
+        uv run pytest
+      fi
+  # NOTE(review): coverage.xml is only written by the merge-request branch of the
+  # script above, so this artifact will be missing on non-MR pipelines.
+  artifacts:
+    paths:
+      - coverage.xml
+# Build the distribution and confirm the resulting wheel installs with pip.
+check-package:
+  stage: test
+  needs: ["pytest"]
+  script:
+    - uv build
+    - pip install dist/*.whl
+# Upload coverage to Codecov on merge-request pipelines, after the `pytest` job has finished.
+codecov:
+  stage: test
+  needs: ["pytest"]
+  script:
+    # -f aborts on an HTTP error instead of saving the error page as the "binary";
+    # -S still prints the failure reason while -s keeps progress output quiet.
+    - curl -fsSO https://cli.codecov.io/latest/linux/codecov
+    - chmod +x codecov
+    # Quote the token so an empty or unusual value cannot be word-split by the shell.
+    - ./codecov upload-process -t "$CODECOV_TOKEN"
+  rules:
+    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
+# Publish a release with semantic-release; runs only for commits on `dev`,
+# once `check-package` has succeeded.
+release:
+  stage: deploy
+  image: node:22
+  needs: ["check-package"]
+  # Replaces the global before_script: the Node image needs Python, pip and uv installed first.
+  before_script:
+    - apt-get update && apt-get install -y python3 python3-pip
+    - pip install --break-system-packages uv
+  rules:
+    - if: '$CI_COMMIT_BRANCH == "dev"'
+  script:
+    - npm install -g semantic-release @semantic-release/gitlab @semantic-release/changelog @semantic-release/git @semantic-release/exec
+    - semantic-release
+# Validate the merge request title against the commitlint configuration (merge requests only).
+conventional-commits:
+  stage: checks
+  image: node:20
+  rules:
+    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
+  # Empty list disables the global before_script (no uv/Python setup in this job).
+  before_script: []
+  script:
+    - npm install -g @commitlint/cli @commitlint/config-conventional
+    # The title is written to a file because commitlint reads the message via `--edit <file>`.
+    - echo "$CI_MERGE_REQUEST_TITLE" > pr_title.txt
+    - |
+      npx commitlint --config commitlint.config.cjs --edit pr_title.txt || {
+        echo "❌ PR title must follow Conventional Commits format (e.g. feat:, fix:)"
+        exit 1
+      }
+# Defaults applied to every job: interruptible, retried once on failure, limited to 10 minutes.
+default:
+  interruptible: true
+  retry: 1
+  timeout: 10m

embed_train-1.0.0/.pre-commit-config.yaml ADDED Viewed

@@ -0,0 +1,48 @@
+repos:
+  # Ruff for linting and formatting
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.13.3
+    hooks:
+      # Run the linter
+      - id: ruff
+        args: [--fix]
+      # Run the formatter
+      - id: ruff-format
+  # Type checking with ty (disabled in pre-commit, enabled in CI)
+  # - repo: https://github.com/astral-sh/ty-pre-commit
+  #   rev: v0.0.17
+  #   hooks:
+  #     - id: ty
+  #       exclude: ^(tests/|alembic/)
+  # Conventional commits enforcement
+  - repo: https://github.com/compilerla/conventional-pre-commit
+    rev: v3.6.0
+    hooks:
+      - id: conventional-pre-commit
+        stages: [commit-msg]
+  # General file checks
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-added-large-files
+        args: [--maxkb=1000]
+      - id: check-json
+      - id: check-toml
+      - id: detect-private-key
+      - id: mixed-line-ending
+      - id: check-merge-conflict
+  # Security checks with bandit
+  - repo: https://github.com/PyCQA/bandit
+    rev: 1.9.4
+    hooks:
+      - id: bandit
+        args: [-c, pyproject.toml]
+        additional_dependencies: ["bandit[toml]"]
+        exclude: ^tests/

embed_train-1.0.0/.releaserc.json ADDED Viewed

@@ -0,0 +1,28 @@
+{
+    "branches": [
+        "dev"
+    ],
+    "plugins": [
+        "@semantic-release/commit-analyzer",
+        "@semantic-release/release-notes-generator",
+        "@semantic-release/changelog",
+        [
+            "@semantic-release/exec",
+            {
+                "prepareCmd": "uv version ${nextRelease.version}",
+                "publishCmd": "uv build && uv publish --token $PYPI_TOKEN"
+            }
+        ],
+        [
+            "@semantic-release/git",
+            {
+                "assets": [
+                    "CHANGELOG.md",
+                    "pyproject.toml"
+                ],
+                "message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}"
+            }
+        ],
+        "@semantic-release/gitlab"
+    ]
+}

embed_train-1.0.0/AGENTS.md ADDED Viewed

@@ -0,0 +1,187 @@
+# AGENTS.md
+## Purpose
+This file gives coding agents project-specific guidance for working in `embed-train`.
+Use it together with the repository code and tests, not as a substitute for reading the implementation.
+## Project Summary
+`embed-train` is a config-driven Python library for:
+- training embedding models with a custom PyTorch loop
+- training embedding models with SentenceTransformers
+- loading components dynamically from `module_path`
+- checkpointing models
+- packaging and optionally pushing model repos to Hugging Face Hub
+The codebase is built around small abstractions connected through typed settings objects in `src/embed_train/settings.py`.
+## Source Layout
+Main library code lives under `src/embed_train/`.
+Key modules:
+- `src/embed_train/__init__.py`
+  Top-level `Runner` abstraction and YAML bootstrap via `Runner.get_runner(...)`.
+- `src/embed_train/settings.py`
+  Central configuration contract for runners, trainers, datasets, collate functions, losses, and HF publishing.
+- `src/embed_train/models/__init__.py`
+  Base `Model` wrapper abstraction with `save`, `to`, and `from_checkpoint`.
+- `src/embed_train/train/__init__.py`
+  `TrainRunner`, which instantiates a configured trainer and executes `train()`.
+- `src/embed_train/train/trainers/torch/__init__.py`
+  Custom PyTorch training loop with checkpointing, train/val split, TensorBoard logging, and pluggable loss/collate/dataset pieces.
+- `src/embed_train/train/trainers/hf/__init__.py`
+  SentenceTransformers-based training path with `InformationRetrievalEvaluator`.
+- `src/embed_train/train/dataset/__init__.py`
+  Base `TorchDataset` and `CollateFn` abstractions.
+- `src/embed_train/train/dataset/collate.py`
+  Built-in collate functions for in-batch positive training.
+- `src/embed_train/train/dataset/torch_datasets.py`
+  Built-in grouped and flattened query/positive dataset views.
+- `src/embed_train/train/trainers/torch/loss.py`
+  Built-in contrastive losses.
+- `src/embed_train/push_to_hf/__init__.py`
+  `PushToHFRunner` for checkpoint restore, local repo export, and HF upload.
+- `src/embed_train/utils.py`
+  Dynamic class loading and checkpoint loading utilities.
+Tests live in `tests/` and are the best executable reference for intended behavior.
+## Working Style For This Repo
+- Read the relevant module and its tests before changing behavior.
+- Treat `settings.py` as the public config contract.
+- Prefer extending existing abstractions instead of bypassing them.
+- Keep the config-driven architecture intact. Avoid hardcoding project-specific classes where `module_path` is the current pattern.
+- Preserve typed settings and validation rules when adding new configuration.
+- Keep changes small and local unless the task clearly requires a cross-cutting refactor.
+## What To Check Before Editing
+For most changes, inspect the matching implementation and tests together:
+- runner changes:
+  `src/embed_train/__init__.py`, `src/embed_train/train/__init__.py`, `src/embed_train/push_to_hf/__init__.py`
+- settings changes:
+  `src/embed_train/settings.py`, `tests/unit/test_settings.py`
+- model wrapper changes:
+  `src/embed_train/models/__init__.py`, `tests/unit/test_models.py`
+- custom PyTorch trainer changes:
+  `src/embed_train/train/trainers/torch/__init__.py`, `tests/unit/test_train/test_torch_trainer.py`
+- dataset or collate changes:
+  `src/embed_train/train/dataset/`, `tests/unit/test_train/test_dataset.py`, `tests/unit/test_train/test_collate.py`
+- loss changes:
+  `src/embed_train/train/trainers/torch/loss.py`, `tests/unit/test_train/test_loss.py`
+- HF publishing changes:
+  `src/embed_train/push_to_hf/__init__.py`, `tests/unit/test_push_to_hf.py`
+## Change Guidelines
+### Configuration Changes
+- Add new fields in `src/embed_train/settings.py`.
+- Prefer Pydantic validation for cross-field rules.
+- Keep names explicit and aligned with existing settings models.
+- If behavior depends on a new config field, add or update tests that validate both valid and invalid cases.
+### Trainer Changes
+- Maintain separation between:
+  - dataset loading
+  - collate/tokenization
+  - embedding computation
+  - loss calculation
+  - checkpointing/logging
+- Keep output paths consistent with the current `data_dir/checkpoints/...` and TensorBoard structure unless migration is intentional.
+### Dataset and Collate Changes
+- Preserve the row contracts expected by built-in collate functions:
+  - grouped format: `{"query": str, "positives": list[str]}`
+  - flattened format: `{"query": str, "positive": str}`
+- When changing row formats, update the collate functions and tests together.
+- Keep tokenizer behavior explicit through settings rather than hidden defaults.
+### Model and Checkpoint Changes
+- `Model.to_hf_model()` should keep returning a Hugging Face-compatible model object.
+- Preserve support for loading checkpoints from:
+  - `.pt`-style files
+  - `.safetensors`
+  - directories containing `model.safetensors`
+- Be careful with checkpoint loading and save format changes because they affect training and HF publishing flows.
+### Hugging Face Publishing Changes
+- `PushToHFRunner` currently assumes custom model source files may need to be copied into the exported repo.
+- If you change the file discovery logic, keep the supported patterns aligned with the current intent:
+  - `modeling_*.py`
+  - `configuration_*.py`
+  - `vllm_modeling_*.py`
+  - `vllm_configuration_*.py`
+- Avoid triggering remote side effects implicitly. Keep `push` behavior explicit.
+## Documentation Expectations
+When changing public behavior, update docs in the same task when appropriate:
+- `README.md` for library usage, structure, or workflows
+- inline docstrings only when they add real clarity
+Do not add generic documentation that is not supported by the code.
+## Testing Expectations
+Use the Makefile targets when possible.
+Common commands:
+```bash
+make test
+make lint
+make type-check
+make ci
+```
+Minimum expectations by change type:
+- docs-only changes:
+  usually no tests required
+- settings or validation changes:
+  run affected unit tests, ideally `make test`
+- trainer, dataset, loss, or runner changes:
+  run relevant unit tests and any affected integration tests
+- broad behavioral changes:
+  run `make ci` if feasible
+If you cannot run tests, say so explicitly in the final handoff.
+## Repo Conventions
+- Python version target is defined in `pyproject.toml`.
+- Formatting and linting use Ruff.
+- Type checking uses `ty`.
+- Tests use Pytest.
+- The project may rely on `retrievalbase` types and mixins for configuration and dataset/processor integration.
+## Agent Pitfalls To Avoid
+- Do not assume this is a CLI-first project. The main contract is the Python library plus config-driven orchestration.
+- Do not invent public APIs that are not present in `src/embed_train/`.
+- Do not document package publishing flows unless they actually exist in the repo.
+- Do not bypass dynamic imports with direct hardcoded references unless the task is specifically to remove that pattern.
+- Do not change config names casually; they are effectively part of the library interface.
+- Do not remove or weaken validation without a clear reason and test coverage.
+## Good Agent Outcomes
+A good change in this repo usually has these properties:
+- it respects the current abstraction boundaries
+- it updates tests with behavior changes
+- it keeps configuration explicit
+- it improves the library without making it more project-specific
+- it leaves README and docs more accurate, not more generic

embed_train-1.0.0/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,6 @@
+# 1.0.0 (2026-04-21)
+### Features
+* initial release ([0ef0776](https://gitlab.com/efysent/agentic-core/embed-train/commit/0ef07764d7279d95e149fb795348e6637e4f52a3))

embed_train-1.0.0/Makefile ADDED Viewed

@@ -0,0 +1,87 @@
+# All targets listed here are command names, not files, so they are declared phony.
+# NOTE(review): `coverage-diff` and `run` are declared but no rule for them is visible
+# in this part of the file — confirm they exist further down or remove them.
+.PHONY: help install dev-install setup-hooks \
+        format lint type-check security \
+        test test-cov coverage-diff \
+        ci ci-fast clean run
+# ------------------------
+# 📚 Help
+# ------------------------
+# Print the list of available targets, one per line (a single printf emits every row).
+help:
+	@printf '%s\n' \
+		"Available commands:" \
+		"  make install         - Install production dependencies" \
+		"  make dev-install     - Install all dependencies + hooks" \
+		"  make setup-hooks     - Install pre-commit hooks" \
+		"  make format          - Format code (ruff)" \
+		"  make lint            - Lint code (ruff)" \
+		"  make type-check      - Run type checking (ty)" \
+		"  make security        - Run security checks (bandit)" \
+		"  make test            - Run tests" \
+		"  make test-cov        - Run tests with coverage" \
+		"  make ci              - Full CI locally" \
+		"  make ci-fast         - CI without slow steps" \
+		"  make clean           - Clean cache files"
+# ------------------------
+# 📦 Install
+# ------------------------
+# Sync the environment without the dev group, enabling all optional extras.
+install:
+	uv sync --no-dev --all-extras
+# Sync the environment with the dev group and all extras, then install the git hooks.
+# Hook installation is delegated to `setup-hooks` so the two recipes cannot drift apart;
+# $(MAKE) is used (not a prerequisite) because the hooks need the synced environment first.
+dev-install:
+	uv sync --group dev --all-extras
+	@$(MAKE) --no-print-directory setup-hooks
+	@echo "✅ Dev environment ready"
+# Install the pre-commit hooks: the default hook type first, then the commit-msg hook type.
+setup-hooks:
+	uv run pre-commit install
+	uv run pre-commit install --hook-type commit-msg
+# ------------------------
+# 🧹 Code quality
+# ------------------------
+# Apply lint auto-fixes, then format.
+# The fixer runs first because its rewrites may themselves need reformatting;
+# running the formatter last leaves the tree in its final formatted state.
+format:
+	uv run ruff check --fix .
+	uv run ruff format .
+# Lint and verify formatting without modifying files.
+# Mirrors the two commands of the CI `ruff` job so `make ci` catches the same failures.
+lint:
+	uv run ruff check .
+	uv run ruff format --check .
+# Run the ty type checker over the whole repository.
+type-check:
+	uv run ty check .
+# Run bandit recursively from the repository root, configured via pyproject.toml.
+security:
+	uv run bandit -c pyproject.toml -r .
+# ------------------------
+# 🧪 Tests
+# ------------------------
+# Run the test suite with pytest (no coverage collection).
+test:
+	uv run pytest
+# Run the tests measuring coverage of `src`: print a terminal report, write coverage.xml,
+# and fail when total coverage is below 80%.
+test-cov:
+	uv run pytest --cov=src --cov-report=term \
+		--cov-report=xml:coverage.xml --cov-fail-under=80
+# ------------------------
+# 🚀 CI equivalent
+# ------------------------
+# Full local check: lint, type-check, security scan and coverage-gated tests,
+# followed by a blank line and a success banner.
+ci: lint type-check security test-cov
+	@printf '\n%s\n' "✅ Full CI checks passed"
+# Quicker variant: skips the bandit scan and runs the plain `test` target instead of `test-cov`.
+ci-fast: lint type-check test
+	@printf '\n%s\n' "⚡ Fast CI checks passed"
+# ------------------------
+# 🧹 Cleanup
+# ------------------------
+clean:
+	find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
+	find . -type f -name "*.pyc" -delete 2>/dev/null || true
+	rm -rf .pytest_cache .ruff_cache .ty_cache htmlcov coverage.xml 2>/dev/null || true