embed-train 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. embed_train-1.0.0/.gitignore +38 -0
  2. embed_train-1.0.0/.gitlab-ci.yml +110 -0
  3. embed_train-1.0.0/.pre-commit-config.yaml +48 -0
  4. embed_train-1.0.0/.releaserc.json +28 -0
  5. embed_train-1.0.0/AGENTS.md +187 -0
  6. embed_train-1.0.0/CHANGELOG.md +6 -0
  7. embed_train-1.0.0/Makefile +87 -0
  8. embed_train-1.0.0/PKG-INFO +283 -0
  9. embed_train-1.0.0/README.md +270 -0
  10. embed_train-1.0.0/codecov.yml +30 -0
  11. embed_train-1.0.0/commitlint.config.cjs +22 -0
  12. embed_train-1.0.0/pyproject.toml +84 -0
  13. embed_train-1.0.0/src/embed_train/__init__.py +37 -0
  14. embed_train-1.0.0/src/embed_train/constants.py +3 -0
  15. embed_train-1.0.0/src/embed_train/exceptions.py +31 -0
  16. embed_train-1.0.0/src/embed_train/models/__init__.py +61 -0
  17. embed_train-1.0.0/src/embed_train/push_to_hf/__init__.py +131 -0
  18. embed_train-1.0.0/src/embed_train/py.typed +0 -0
  19. embed_train-1.0.0/src/embed_train/settings.py +202 -0
  20. embed_train-1.0.0/src/embed_train/train/__init__.py +17 -0
  21. embed_train-1.0.0/src/embed_train/train/dataset/__init__.py +109 -0
  22. embed_train-1.0.0/src/embed_train/train/dataset/collate.py +52 -0
  23. embed_train-1.0.0/src/embed_train/train/dataset/sampling/__init__.py +46 -0
  24. embed_train-1.0.0/src/embed_train/train/dataset/sampling/samplers.py +36 -0
  25. embed_train-1.0.0/src/embed_train/train/dataset/torch_datasets.py +71 -0
  26. embed_train-1.0.0/src/embed_train/train/trainers/__init__.py +22 -0
  27. embed_train-1.0.0/src/embed_train/train/trainers/hf/__init__.py +158 -0
  28. embed_train-1.0.0/src/embed_train/train/trainers/torch/__init__.py +226 -0
  29. embed_train-1.0.0/src/embed_train/train/trainers/torch/loss.py +99 -0
  30. embed_train-1.0.0/src/embed_train/utils.py +80 -0
  31. embed_train-1.0.0/tests/__init__.py +0 -0
  32. embed_train-1.0.0/tests/conftest.py +33 -0
  33. embed_train-1.0.0/tests/fixtures/__init__.py +0 -0
  34. embed_train-1.0.0/tests/fixtures/components.py +477 -0
  35. embed_train-1.0.0/tests/fixtures/data.py +40 -0
  36. embed_train-1.0.0/tests/integration/__init__.py +0 -0
  37. embed_train-1.0.0/tests/integration/test_dataset/__init__.py +0 -0
  38. embed_train-1.0.0/tests/integration/test_dataset/test_to_hf_dataset.py +14 -0
  39. embed_train-1.0.0/tests/integration/test_train_runner/__init__.py +0 -0
  40. embed_train-1.0.0/tests/integration/test_train_runner/test_train_runner_flow.py +19 -0
  41. embed_train-1.0.0/tests/unit/__init__.py +0 -0
  42. embed_train-1.0.0/tests/unit/test_abstract_guards.py +60 -0
  43. embed_train-1.0.0/tests/unit/test_embed_train.py +36 -0
  44. embed_train-1.0.0/tests/unit/test_exceptions.py +17 -0
  45. embed_train-1.0.0/tests/unit/test_models.py +65 -0
  46. embed_train-1.0.0/tests/unit/test_push_to_hf.py +207 -0
  47. embed_train-1.0.0/tests/unit/test_settings.py +83 -0
  48. embed_train-1.0.0/tests/unit/test_train/__init__.py +0 -0
  49. embed_train-1.0.0/tests/unit/test_train/test_collate.py +42 -0
  50. embed_train-1.0.0/tests/unit/test_train/test_dataset.py +64 -0
  51. embed_train-1.0.0/tests/unit/test_train/test_hf_trainer.py +153 -0
  52. embed_train-1.0.0/tests/unit/test_train/test_loss.py +39 -0
  53. embed_train-1.0.0/tests/unit/test_train/test_runner.py +19 -0
  54. embed_train-1.0.0/tests/unit/test_train/test_samplers.py +19 -0
  55. embed_train-1.0.0/tests/unit/test_train/test_sampling.py +9 -0
  56. embed_train-1.0.0/tests/unit/test_train/test_torch_datasets.py +29 -0
  57. embed_train-1.0.0/tests/unit/test_train/test_torch_trainer.py +127 -0
  58. embed_train-1.0.0/tests/unit/test_train/test_trainers.py +16 -0
  59. embed_train-1.0.0/tests/unit/test_utils.py +98 -0
  60. embed_train-1.0.0/uv.lock +2572 -0
@@ -0,0 +1,38 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+ # env files
13
+ .env
14
+
15
+ # Pycharm
16
+ .idea/
17
+
18
+ # VSCode
19
+ .vscode/
20
+
21
+ # Testing
22
+ .coverage
23
+ coverage.xml
24
+ htmlcov/
25
+ .pytest_cache/
26
+ .mypy_cache/
27
+
28
+ # Ruff
29
+ .ruff_cache/
30
+
31
+ # Pre-commit
32
+ .pre-commit-cache/
33
+
34
+ # MacOS
35
+ .DS_Store
36
+
37
+ # Claude
38
+ .claude/
@@ -0,0 +1,110 @@
1
+ image: python:3.12
2
+
3
+ workflow:
4
+ rules:
5
+ - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == "dev"'
6
+ - if: '$CI_COMMIT_BRANCH == "dev"'
7
+ - when: never
8
+
9
+ stages:
10
+ - checks
11
+ - test
12
+ - deploy
13
+
14
+ variables:
15
+ UV_SYSTEM_PYTHON: "1"
16
+ PIP_DISABLE_PIP_VERSION_CHECK: "1"
17
+ GIT_DEPTH: 0
18
+
19
+
20
+ cache:
21
+ key:
22
+ files:
23
+ - uv.lock
24
+ paths:
25
+ - .venv/
26
+ - ~/.cache/uv
27
+
28
+
29
+ before_script:
30
+ - pip install uv
31
+ - uv sync --group dev --all-extras
32
+
33
+ ruff:
34
+ stage: checks
35
+ script:
36
+ - uv run ruff check .
37
+ - uv run ruff format --check .
38
+
39
+ ty:
40
+ stage: checks
41
+ script:
42
+ - uv run ty check .
43
+
44
+ bandit:
45
+ stage: checks
46
+ script:
47
+ - uv run bandit -c pyproject.toml -r .
48
+
49
+ pytest:
50
+ stage: test
51
+ script:
52
+ - |
53
+ if [ "$CI_PIPELINE_SOURCE" = "merge_request_event" ]; then
54
+ uv run pytest --cov=src --cov-report=xml --cov-report=term --cov-fail-under=80
55
+ else
56
+ uv run pytest
57
+ fi
58
+ artifacts:
59
+ paths:
60
+ - coverage.xml
61
+
62
+ check-package:
63
+ stage: test
64
+ needs: ["pytest"]
65
+ script:
66
+ - uv build
67
+ - pip install dist/*.whl
68
+
69
+ codecov:
70
+ stage: test
71
+ needs: ["pytest"]
72
+ script:
73
+ - curl -Os https://cli.codecov.io/latest/linux/codecov
74
+ - chmod +x codecov
75
+ - ./codecov upload-process -t $CODECOV_TOKEN
76
+ rules:
77
+ - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
78
+
79
+ release:
80
+ stage: deploy
81
+ image: node:22
82
+ needs: ["check-package"]
83
+ before_script:
84
+ - apt-get update && apt-get install -y python3 python3-pip
85
+ - pip install --break-system-packages uv
86
+ rules:
87
+ - if: '$CI_COMMIT_BRANCH == "dev"'
88
+ script:
89
+ - npm install -g semantic-release @semantic-release/gitlab @semantic-release/changelog @semantic-release/git @semantic-release/exec
90
+ - semantic-release
91
+
92
+ conventional-commits:
93
+ stage: checks
94
+ image: node:20
95
+ rules:
96
+ - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
97
+ before_script: []
98
+ script:
99
+ - npm install -g @commitlint/cli @commitlint/config-conventional
100
+ - echo "$CI_MERGE_REQUEST_TITLE" > pr_title.txt
101
+ - |
102
+ npx commitlint --config commitlint.config.cjs --edit pr_title.txt || {
103
+ echo "❌ PR title must follow Conventional Commits format (e.g. feat:, fix:)"
104
+ exit 1
105
+ }
106
+
107
+ default:
108
+ interruptible: true
109
+ retry: 1
110
+ timeout: 10m
@@ -0,0 +1,48 @@
1
+ repos:
2
+ # Ruff for linting and formatting
3
+ - repo: https://github.com/astral-sh/ruff-pre-commit
4
+ rev: v0.13.3
5
+ hooks:
6
+ # Run the linter
7
+ - id: ruff
8
+ args: [--fix]
9
+ # Run the formatter
10
+ - id: ruff-format
11
+
12
+ # Type checking with ty (disabled in pre-commit, enabled in CI)
13
+ # - repo: https://github.com/astral-sh/ty-pre-commit
14
+ # rev: v0.0.17
15
+ # hooks:
16
+ # - id: ty
17
+ # exclude: ^(tests/|alembic/)
18
+
19
+ # Conventional commits enforcement
20
+ - repo: https://github.com/compilerla/conventional-pre-commit
21
+ rev: v3.6.0
22
+ hooks:
23
+ - id: conventional-pre-commit
24
+ stages: [commit-msg]
25
+
26
+ # General file checks
27
+ - repo: https://github.com/pre-commit/pre-commit-hooks
28
+ rev: v5.0.0
29
+ hooks:
30
+ - id: trailing-whitespace
31
+ - id: end-of-file-fixer
32
+ - id: check-yaml
33
+ - id: check-added-large-files
34
+ args: [--maxkb=1000]
35
+ - id: check-json
36
+ - id: check-toml
37
+ - id: detect-private-key
38
+ - id: mixed-line-ending
39
+ - id: check-merge-conflict
40
+
41
+ # Security checks with bandit
42
+ - repo: https://github.com/PyCQA/bandit
43
+ rev: 1.9.4
44
+ hooks:
45
+ - id: bandit
46
+ args: [-c, pyproject.toml]
47
+ additional_dependencies: ["bandit[toml]"]
48
+ exclude: ^tests/
@@ -0,0 +1,28 @@
1
+ {
2
+ "branches": [
3
+ "dev"
4
+ ],
5
+ "plugins": [
6
+ "@semantic-release/commit-analyzer",
7
+ "@semantic-release/release-notes-generator",
8
+ "@semantic-release/changelog",
9
+ [
10
+ "@semantic-release/exec",
11
+ {
12
+ "prepareCmd": "uv version ${nextRelease.version}",
13
+ "publishCmd": "uv build && uv publish --token $PYPI_TOKEN"
14
+ }
15
+ ],
16
+ [
17
+ "@semantic-release/git",
18
+ {
19
+ "assets": [
20
+ "CHANGELOG.md",
21
+ "pyproject.toml"
22
+ ],
23
+ "message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}"
24
+ }
25
+ ],
26
+ "@semantic-release/gitlab"
27
+ ]
28
+ }
@@ -0,0 +1,187 @@
1
+ # AGENTS.md
2
+
3
+ ## Purpose
4
+
5
+ This file gives coding agents project-specific guidance for working in `embed-train`.
6
+ Use it together with the repository code and tests, not as a substitute for reading the implementation.
7
+
8
+ ## Project Summary
9
+
10
+ `embed-train` is a config-driven Python library for:
11
+
12
+ - training embedding models with a custom PyTorch loop
13
+ - training embedding models with SentenceTransformers
14
+ - loading components dynamically from `module_path`
15
+ - checkpointing models
16
+ - packaging and optionally pushing model repos to Hugging Face Hub
17
+
18
+ The codebase is built around small abstractions connected through typed settings objects in `src/embed_train/settings.py`.
19
+
20
+ ## Source Layout
21
+
22
+ Main library code lives under `src/embed_train/`.
23
+
24
+ Key modules:
25
+
26
+ - `src/embed_train/__init__.py`
27
+ Top-level `Runner` abstraction and YAML bootstrap via `Runner.get_runner(...)`.
28
+ - `src/embed_train/settings.py`
29
+ Central configuration contract for runners, trainers, datasets, collate functions, losses, and HF publishing.
30
+ - `src/embed_train/models/__init__.py`
31
+ Base `Model` wrapper abstraction with `save`, `to`, and `from_checkpoint`.
32
+ - `src/embed_train/train/__init__.py`
33
+ `TrainRunner`, which instantiates a configured trainer and executes `train()`.
34
+ - `src/embed_train/train/trainers/torch/__init__.py`
35
+ Custom PyTorch training loop with checkpointing, train/val split, TensorBoard logging, and pluggable loss/collate/dataset pieces.
36
+ - `src/embed_train/train/trainers/hf/__init__.py`
37
+ SentenceTransformers-based training path with `InformationRetrievalEvaluator`.
38
+ - `src/embed_train/train/dataset/__init__.py`
39
+ Base `TorchDataset` and `CollateFn` abstractions.
40
+ - `src/embed_train/train/dataset/collate.py`
41
+ Built-in collate functions for in-batch positive training.
42
+ - `src/embed_train/train/dataset/torch_datasets.py`
43
+ Built-in grouped and flattened query/positive dataset views.
44
+ - `src/embed_train/train/trainers/torch/loss.py`
45
+ Built-in contrastive losses.
46
+ - `src/embed_train/push_to_hf/__init__.py`
47
+ `PushToHFRunner` for checkpoint restore, local repo export, and HF upload.
48
+ - `src/embed_train/utils.py`
49
+ Dynamic class loading and checkpoint loading utilities.
50
+
51
+ Tests live in `tests/` and are the best executable reference for intended behavior.
52
+
53
+ ## Working Style For This Repo
54
+
55
+ - Read the relevant module and its tests before changing behavior.
56
+ - Treat `settings.py` as the public config contract.
57
+ - Prefer extending existing abstractions instead of bypassing them.
58
+ - Keep the config-driven architecture intact. Avoid hardcoding project-specific classes where `module_path` is the current pattern.
59
+ - Preserve typed settings and validation rules when adding new configuration.
60
+ - Keep changes small and local unless the task clearly requires a cross-cutting refactor.
61
+
62
+ ## What To Check Before Editing
63
+
64
+ For most changes, inspect the matching implementation and tests together:
65
+
66
+ - runner changes:
67
+ `src/embed_train/__init__.py`, `src/embed_train/train/__init__.py`, `src/embed_train/push_to_hf/__init__.py`
68
+ - settings changes:
69
+ `src/embed_train/settings.py`, `tests/unit/test_settings.py`
70
+ - model wrapper changes:
71
+ `src/embed_train/models/__init__.py`, `tests/unit/test_models.py`
72
+ - custom PyTorch trainer changes:
73
+ `src/embed_train/train/trainers/torch/__init__.py`, `tests/unit/test_train/test_torch_trainer.py`
74
+ - dataset or collate changes:
75
+ `src/embed_train/train/dataset/`, `tests/unit/test_train/test_dataset.py`, `tests/unit/test_train/test_collate.py`
76
+ - loss changes:
77
+ `src/embed_train/train/trainers/torch/loss.py`, `tests/unit/test_train/test_loss.py`
78
+ - HF publishing changes:
79
+ `src/embed_train/push_to_hf/__init__.py`, `tests/unit/test_push_to_hf.py`
80
+
81
+ ## Change Guidelines
82
+
83
+ ### Configuration Changes
84
+
85
+ - Add new fields in `src/embed_train/settings.py`.
86
+ - Prefer Pydantic validation for cross-field rules.
87
+ - Keep names explicit and aligned with existing settings models.
88
+ - If behavior depends on a new config field, add or update tests that validate both valid and invalid cases.
89
+
90
+ ### Trainer Changes
91
+
92
+ - Maintain separation between:
93
+ - dataset loading
94
+ - collate/tokenization
95
+ - embedding computation
96
+ - loss calculation
97
+ - checkpointing/logging
98
+ - Keep output paths consistent with the current `data_dir/checkpoints/...` and TensorBoard structure unless migration is intentional.
99
+
100
+ ### Dataset and Collate Changes
101
+
102
+ - Preserve the row contracts expected by built-in collate functions:
103
+ - grouped format: `{"query": str, "positives": list[str]}`
104
+ - flattened format: `{"query": str, "positive": str}`
105
+ - When changing row formats, update the collate functions and tests together.
106
+ - Keep tokenizer behavior explicit through settings rather than hidden defaults.
107
+
108
+ ### Model and Checkpoint Changes
109
+
110
+ - `Model.to_hf_model()` should keep returning a Hugging Face-compatible model object.
111
+ - Preserve support for loading checkpoints from:
112
+ - `.pt`-style files
113
+ - `.safetensors`
114
+ - directories containing `model.safetensors`
115
+ - Be careful with checkpoint loading and save format changes because they affect training and HF publishing flows.
116
+
117
+ ### Hugging Face Publishing Changes
118
+
119
+ - `PushToHFRunner` currently assumes custom model source files may need to be copied into the exported repo.
120
+ - If you change the file discovery logic, keep the supported patterns aligned with the current intent:
121
+ - `modeling_*.py`
122
+ - `configuration_*.py`
123
+ - `vllm_modeling_*.py`
124
+ - `vllm_configuration_*.py`
125
+ - Avoid triggering remote side effects implicitly. Keep `push` behavior explicit.
126
+
127
+ ## Documentation Expectations
128
+
129
+ When changing public behavior, update docs in the same task when appropriate:
130
+
131
+ - `README.md` for library usage, structure, or workflows
132
+ - inline docstrings only when they add real clarity
133
+
134
+ Do not add generic documentation that is not supported by the code.
135
+
136
+ ## Testing Expectations
137
+
138
+ Use the Makefile targets when possible.
139
+
140
+ Common commands:
141
+
142
+ ```bash
143
+ make test
144
+ make lint
145
+ make type-check
146
+ make ci
147
+ ```
148
+
149
+ Minimum expectations by change type:
150
+
151
+ - docs-only changes:
152
+ usually no tests required
153
+ - settings or validation changes:
154
+ run affected unit tests, ideally `make test`
155
+ - trainer, dataset, loss, or runner changes:
156
+ run relevant unit tests and any affected integration tests
157
+ - broad behavioral changes:
158
+ run `make ci` if feasible
159
+
160
+ If you cannot run tests, say so explicitly in the final handoff.
161
+
162
+ ## Repo Conventions
163
+
164
+ - Python version target is defined in `pyproject.toml`.
165
+ - Formatting and linting use Ruff.
166
+ - Type checking uses `ty`.
167
+ - Tests use Pytest.
168
+ - The project may rely on `retrievalbase` types and mixins for configuration and dataset/processor integration.
169
+
170
+ ## Agent Pitfalls To Avoid
171
+
172
+ - Do not assume this is a CLI-first project. The main contract is the Python library plus config-driven orchestration.
173
+ - Do not invent public APIs that are not present in `src/embed_train/`.
174
+ - Do not document package publishing flows unless they actually exist in the repo.
175
+ - Do not bypass dynamic imports with direct hardcoded references unless the task is specifically to remove that pattern.
176
+ - Do not change config names casually; they are effectively part of the library interface.
177
+ - Do not remove or weaken validation without a clear reason and test coverage.
178
+
179
+ ## Good Agent Outcomes
180
+
181
+ A good change in this repo usually has these properties:
182
+
183
+ - it respects the current abstraction boundaries
184
+ - it updates tests with behavior changes
185
+ - it keeps configuration explicit
186
+ - it improves the library without making it more project-specific
187
+ - it leaves README and docs more accurate, not more generic
@@ -0,0 +1,6 @@
1
+ # 1.0.0 (2026-04-21)
2
+
3
+
4
+ ### Features
5
+
6
+ * initial release ([0ef0776](https://gitlab.com/efysent/agentic-core/embed-train/commit/0ef07764d7279d95e149fb795348e6637e4f52a3))
@@ -0,0 +1,87 @@
1
+ .PHONY: help install dev-install setup-hooks \
2
+ format lint type-check security \
3
+ test test-cov coverage-diff \
4
+ ci ci-fast clean run
5
+
6
+ # ------------------------
7
+ # 📚 Help
8
+ # ------------------------
9
+ help:
10
+ @echo "Available commands:"
11
+ @echo " make install - Install production dependencies"
12
+ @echo " make dev-install - Install all dependencies + hooks"
13
+ @echo " make setup-hooks - Install pre-commit hooks"
14
+ @echo " make format - Format code (ruff)"
15
+ @echo " make lint - Lint code (ruff)"
16
+ @echo " make type-check - Run type checking (ty)"
17
+ @echo " make security - Run security checks (bandit)"
18
+ @echo " make test - Run tests"
19
+ @echo " make test-cov - Run tests with coverage"
20
+ @echo " make ci - Full CI locally"
21
+ @echo " make ci-fast - CI without slow steps"
22
+ @echo " make clean - Clean cache files"
23
+
24
+ # ------------------------
25
+ # 📦 Install
26
+ # ------------------------
27
+ install:
28
+ uv sync --no-dev --all-extras
29
+
30
+ dev-install:
31
+ uv sync --group dev --all-extras
32
+ uv run pre-commit install
33
+ uv run pre-commit install --hook-type commit-msg
34
+ @echo "✅ Dev environment ready"
35
+
36
+ setup-hooks:
37
+ uv run pre-commit install
38
+ uv run pre-commit install --hook-type commit-msg
39
+
40
+ # ------------------------
41
+ # 🧹 Code quality
42
+ # ------------------------
43
+ format:
44
+ uv run ruff format .
45
+ uv run ruff check --fix .
46
+
47
+ lint:
48
+ uv run ruff check .
49
+
50
+ type-check:
51
+ uv run ty check .
52
+
53
+ security:
54
+ uv run bandit -c pyproject.toml -r .
55
+
56
+ # ------------------------
57
+ # 🧪 Tests
58
+ # ------------------------
59
+ test:
60
+ uv run pytest
61
+
62
+ test-cov:
63
+ uv run pytest \
64
+ --cov=src \
65
+ --cov-report=term \
66
+ --cov-report=xml:coverage.xml \
67
+ --cov-fail-under=80
68
+
69
+ # ------------------------
70
+ # 🚀 CI equivalent
71
+ # ------------------------
72
+ ci: lint type-check security test-cov
73
+ @echo ""
74
+ @echo "✅ Full CI checks passed"
75
+
76
+ # Faster version (no bandit blocking)
77
+ ci-fast: lint type-check test
78
+ @echo ""
79
+ @echo "⚡ Fast CI checks passed"
80
+
81
+ # ------------------------
82
+ # 🧹 Cleanup
83
+ # ------------------------
84
+ clean:
85
+ find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
86
+ find . -type f -name "*.pyc" -delete 2>/dev/null || true
87
+ rm -rf .pytest_cache .ruff_cache .ty_cache htmlcov coverage.xml 2>/dev/null || true