lmxlab 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lmxlab-0.2.0/.github/dependabot.yml +11 -0
- lmxlab-0.2.0/.github/pull_request_template.md +13 -0
- lmxlab-0.2.0/.github/workflows/ci.yml +42 -0
- lmxlab-0.2.0/.github/workflows/dependabot-auto-merge.yml +27 -0
- lmxlab-0.2.0/.github/workflows/docs.yml +30 -0
- lmxlab-0.2.0/.github/workflows/publish.yml +25 -0
- lmxlab-0.2.0/.gitignore +42 -0
- lmxlab-0.2.0/.pre-commit-config.yaml +29 -0
- lmxlab-0.2.0/CHANGELOG.md +92 -0
- lmxlab-0.2.0/CONTRIBUTING.md +153 -0
- lmxlab-0.2.0/LICENSE +21 -0
- lmxlab-0.2.0/PKG-INFO +212 -0
- lmxlab-0.2.0/README.md +169 -0
- lmxlab-0.2.0/docs/api/core.md +109 -0
- lmxlab-0.2.0/docs/api/data.md +61 -0
- lmxlab-0.2.0/docs/api/eval.md +56 -0
- lmxlab-0.2.0/docs/api/experiments.md +43 -0
- lmxlab-0.2.0/docs/api/index.md +15 -0
- lmxlab-0.2.0/docs/api/inference.md +50 -0
- lmxlab-0.2.0/docs/api/models.md +82 -0
- lmxlab-0.2.0/docs/api/training.md +63 -0
- lmxlab-0.2.0/docs/architecture/compiled-training.md +186 -0
- lmxlab-0.2.0/docs/architecture/configurable-block.md +211 -0
- lmxlab-0.2.0/docs/architecture/mlx-idioms.md +215 -0
- lmxlab-0.2.0/docs/architecture/overview.md +167 -0
- lmxlab-0.2.0/docs/architecture/production-optimizations.md +431 -0
- lmxlab-0.2.0/docs/architecture/unified-memory.md +202 -0
- lmxlab-0.2.0/docs/data/index.md +253 -0
- lmxlab-0.2.0/docs/devlog/index.md +562 -0
- lmxlab-0.2.0/docs/experiments/methodology.md +245 -0
- lmxlab-0.2.0/docs/getting-started/first-training-run.md +367 -0
- lmxlab-0.2.0/docs/getting-started/installation.md +79 -0
- lmxlab-0.2.0/docs/getting-started/quickstart.md +307 -0
- lmxlab-0.2.0/docs/index.md +99 -0
- lmxlab-0.2.0/docs/inference/index.md +193 -0
- lmxlab-0.2.0/docs/models/index.md +490 -0
- lmxlab-0.2.0/docs/recipes/index.md +414 -0
- lmxlab-0.2.0/docs/training/index.md +249 -0
- lmxlab-0.2.0/mkdocs.yml +71 -0
- lmxlab-0.2.0/pyproject.toml +90 -0
- lmxlab-0.2.0/recipes/ablation_gpt_to_llama.py +594 -0
- lmxlab-0.2.0/recipes/ablation_hyp001b.py +529 -0
- lmxlab-0.2.0/recipes/ablation_hyp001c.py +815 -0
- lmxlab-0.2.0/recipes/ablation_hyp001d.py +828 -0
- lmxlab-0.2.0/recipes/advanced_sampling.py +171 -0
- lmxlab-0.2.0/recipes/analyze_experiments.py +204 -0
- lmxlab-0.2.0/recipes/benchmark_compile.py +174 -0
- lmxlab-0.2.0/recipes/checkpoint_resume.py +154 -0
- lmxlab-0.2.0/recipes/compare_architectures.py +123 -0
- lmxlab-0.2.0/recipes/compare_kv_cache.py +294 -0
- lmxlab-0.2.0/recipes/compare_optimizers.py +369 -0
- lmxlab-0.2.0/recipes/compare_schedules.py +185 -0
- lmxlab-0.2.0/recipes/compare_training.py +141 -0
- lmxlab-0.2.0/recipes/distill_model.py +278 -0
- lmxlab-0.2.0/recipes/evaluate_model.py +163 -0
- lmxlab-0.2.0/recipes/finetune_lora.py +191 -0
- lmxlab-0.2.0/recipes/finetune_qlora.py +166 -0
- lmxlab-0.2.0/recipes/interactive_generate.py +129 -0
- lmxlab-0.2.0/recipes/load_pretrained.py +108 -0
- lmxlab-0.2.0/recipes/mup_coordinate_check.py +379 -0
- lmxlab-0.2.0/recipes/profile_models.py +124 -0
- lmxlab-0.2.0/recipes/quantize_and_generate.py +171 -0
- lmxlab-0.2.0/recipes/run_experiment.py +243 -0
- lmxlab-0.2.0/recipes/speculative_decoding.py +204 -0
- lmxlab-0.2.0/recipes/sweep_learning_rate.py +241 -0
- lmxlab-0.2.0/recipes/train_curriculum.py +254 -0
- lmxlab-0.2.0/recipes/train_deltanet.py +164 -0
- lmxlab-0.2.0/recipes/train_dpo.py +165 -0
- lmxlab-0.2.0/recipes/train_grpo.py +202 -0
- lmxlab-0.2.0/recipes/train_llama_shakespeare.py +145 -0
- lmxlab-0.2.0/recipes/train_moe.py +152 -0
- lmxlab-0.2.0/recipes/train_mtp.py +201 -0
- lmxlab-0.2.0/recipes/train_tiny_gpt.py +102 -0
- lmxlab-0.2.0/recipes/train_with_callbacks.py +163 -0
- lmxlab-0.2.0/recipes/train_with_datasets.py +169 -0
- lmxlab-0.2.0/src/lmxlab/__init__.py +20 -0
- lmxlab-0.2.0/src/lmxlab/__main__.py +5 -0
- lmxlab-0.2.0/src/lmxlab/cli.py +233 -0
- lmxlab-0.2.0/src/lmxlab/core/__init__.py +63 -0
- lmxlab-0.2.0/src/lmxlab/core/attention.py +541 -0
- lmxlab-0.2.0/src/lmxlab/core/block.py +147 -0
- lmxlab-0.2.0/src/lmxlab/core/config.py +162 -0
- lmxlab-0.2.0/src/lmxlab/core/deltanet.py +295 -0
- lmxlab-0.2.0/src/lmxlab/core/ffn.py +131 -0
- lmxlab-0.2.0/src/lmxlab/core/lora.py +311 -0
- lmxlab-0.2.0/src/lmxlab/core/mamba2.py +630 -0
- lmxlab-0.2.0/src/lmxlab/core/mamba3.py +459 -0
- lmxlab-0.2.0/src/lmxlab/core/mla.py +181 -0
- lmxlab-0.2.0/src/lmxlab/core/moe.py +403 -0
- lmxlab-0.2.0/src/lmxlab/core/norm.py +36 -0
- lmxlab-0.2.0/src/lmxlab/core/position.py +143 -0
- lmxlab-0.2.0/src/lmxlab/core/qlora.py +211 -0
- lmxlab-0.2.0/src/lmxlab/core/quantize.py +99 -0
- lmxlab-0.2.0/src/lmxlab/core/registry.py +106 -0
- lmxlab-0.2.0/src/lmxlab/core/sparse_attention.py +304 -0
- lmxlab-0.2.0/src/lmxlab/data/__init__.py +21 -0
- lmxlab-0.2.0/src/lmxlab/data/batching.py +66 -0
- lmxlab-0.2.0/src/lmxlab/data/dataset.py +166 -0
- lmxlab-0.2.0/src/lmxlab/data/tokenizer.py +248 -0
- lmxlab-0.2.0/src/lmxlab/eval/__init__.py +15 -0
- lmxlab-0.2.0/src/lmxlab/eval/metrics.py +168 -0
- lmxlab-0.2.0/src/lmxlab/experiments/__init__.py +54 -0
- lmxlab-0.2.0/src/lmxlab/experiments/analysis.py +174 -0
- lmxlab-0.2.0/src/lmxlab/experiments/flops.py +103 -0
- lmxlab-0.2.0/src/lmxlab/experiments/mlflow.py +206 -0
- lmxlab-0.2.0/src/lmxlab/experiments/profiling.py +231 -0
- lmxlab-0.2.0/src/lmxlab/experiments/runner.py +123 -0
- lmxlab-0.2.0/src/lmxlab/experiments/sweep.py +83 -0
- lmxlab-0.2.0/src/lmxlab/experiments/tracking.py +129 -0
- lmxlab-0.2.0/src/lmxlab/inference/__init__.py +17 -0
- lmxlab-0.2.0/src/lmxlab/inference/beam_search.py +110 -0
- lmxlab-0.2.0/src/lmxlab/inference/reward_model.py +45 -0
- lmxlab-0.2.0/src/lmxlab/inference/sampling.py +142 -0
- lmxlab-0.2.0/src/lmxlab/inference/speculative.py +101 -0
- lmxlab-0.2.0/src/lmxlab/models/__init__.py +108 -0
- lmxlab-0.2.0/src/lmxlab/models/bamba.py +135 -0
- lmxlab-0.2.0/src/lmxlab/models/base.py +188 -0
- lmxlab-0.2.0/src/lmxlab/models/convert.py +616 -0
- lmxlab-0.2.0/src/lmxlab/models/deepseek.py +206 -0
- lmxlab-0.2.0/src/lmxlab/models/falcon.py +137 -0
- lmxlab-0.2.0/src/lmxlab/models/gemma.py +68 -0
- lmxlab-0.2.0/src/lmxlab/models/gemma3.py +107 -0
- lmxlab-0.2.0/src/lmxlab/models/generate.py +266 -0
- lmxlab-0.2.0/src/lmxlab/models/glm.py +82 -0
- lmxlab-0.2.0/src/lmxlab/models/gpt.py +87 -0
- lmxlab-0.2.0/src/lmxlab/models/gpt_oss.py +77 -0
- lmxlab-0.2.0/src/lmxlab/models/grok.py +87 -0
- lmxlab-0.2.0/src/lmxlab/models/jamba.py +164 -0
- lmxlab-0.2.0/src/lmxlab/models/kimi.py +117 -0
- lmxlab-0.2.0/src/lmxlab/models/llama.py +92 -0
- lmxlab-0.2.0/src/lmxlab/models/llama4.py +203 -0
- lmxlab-0.2.0/src/lmxlab/models/mistral.py +81 -0
- lmxlab-0.2.0/src/lmxlab/models/mixtral.py +82 -0
- lmxlab-0.2.0/src/lmxlab/models/nemotron.py +324 -0
- lmxlab-0.2.0/src/lmxlab/models/olmo.py +78 -0
- lmxlab-0.2.0/src/lmxlab/models/qwen.py +146 -0
- lmxlab-0.2.0/src/lmxlab/models/qwen35.py +111 -0
- lmxlab-0.2.0/src/lmxlab/models/qwen_next.py +78 -0
- lmxlab-0.2.0/src/lmxlab/models/smollm.py +106 -0
- lmxlab-0.2.0/src/lmxlab/py.typed +0 -0
- lmxlab-0.2.0/src/lmxlab/training/__init__.py +56 -0
- lmxlab-0.2.0/src/lmxlab/training/callbacks.py +226 -0
- lmxlab-0.2.0/src/lmxlab/training/checkpoints.py +85 -0
- lmxlab-0.2.0/src/lmxlab/training/config.py +41 -0
- lmxlab-0.2.0/src/lmxlab/training/curriculum.py +113 -0
- lmxlab-0.2.0/src/lmxlab/training/distillation.py +144 -0
- lmxlab-0.2.0/src/lmxlab/training/dpo.py +76 -0
- lmxlab-0.2.0/src/lmxlab/training/grpo.py +64 -0
- lmxlab-0.2.0/src/lmxlab/training/grpo_trainer.py +244 -0
- lmxlab-0.2.0/src/lmxlab/training/mtp.py +233 -0
- lmxlab-0.2.0/src/lmxlab/training/optimizers.py +169 -0
- lmxlab-0.2.0/src/lmxlab/training/trainer.py +375 -0
- lmxlab-0.2.0/tests/__init__.py +0 -0
- lmxlab-0.2.0/tests/conftest.py +27 -0
- lmxlab-0.2.0/tests/test_advanced_training.py +265 -0
- lmxlab-0.2.0/tests/test_architectures.py +610 -0
- lmxlab-0.2.0/tests/test_behavioral.py +394 -0
- lmxlab-0.2.0/tests/test_cli.py +173 -0
- lmxlab-0.2.0/tests/test_convert.py +269 -0
- lmxlab-0.2.0/tests/test_core.py +295 -0
- lmxlab-0.2.0/tests/test_cross_reference.py +3120 -0
- lmxlab-0.2.0/tests/test_data.py +305 -0
- lmxlab-0.2.0/tests/test_eval.py +146 -0
- lmxlab-0.2.0/tests/test_experiments.py +368 -0
- lmxlab-0.2.0/tests/test_flops.py +178 -0
- lmxlab-0.2.0/tests/test_grpo_trainer.py +172 -0
- lmxlab-0.2.0/tests/test_inference.py +557 -0
- lmxlab-0.2.0/tests/test_lora.py +207 -0
- lmxlab-0.2.0/tests/test_lora_save_load.py +176 -0
- lmxlab-0.2.0/tests/test_models.py +1671 -0
- lmxlab-0.2.0/tests/test_mup.py +582 -0
- lmxlab-0.2.0/tests/test_profiling.py +194 -0
- lmxlab-0.2.0/tests/test_qlora.py +179 -0
- lmxlab-0.2.0/tests/test_quantize.py +316 -0
- lmxlab-0.2.0/tests/test_sampling.py +208 -0
- lmxlab-0.2.0/tests/test_training.py +332 -0
- lmxlab-0.2.0/uv.lock +3414 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
## Summary
|
|
2
|
+
|
|
3
|
+
<!-- 1-3 sentences describing what this PR does and why -->
|
|
4
|
+
|
|
5
|
+
## Changes
|
|
6
|
+
|
|
7
|
+
-
|
|
8
|
+
|
|
9
|
+
## Checklist
|
|
10
|
+
|
|
11
|
+
- [ ] Tests added/updated
|
|
12
|
+
- [ ] Lint passes (`uv run ruff check src/ tests/ recipes/`)
|
|
13
|
+
- [ ] Format passes (`uv run ruff format --check src/ tests/ recipes/`)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v6
|
|
14
|
+
- uses: astral-sh/setup-uv@v7
|
|
15
|
+
with:
|
|
16
|
+
enable-cache: true
|
|
17
|
+
cache-dependency-glob: "uv.lock"
|
|
18
|
+
- run: uv sync --locked --extra dev
|
|
19
|
+
- run: uv run ruff check src/ tests/ recipes/
|
|
20
|
+
- run: uv run ruff format --check src/ tests/ recipes/
|
|
21
|
+
|
|
22
|
+
docs:
|
|
23
|
+
runs-on: ubuntu-latest
|
|
24
|
+
steps:
|
|
25
|
+
- uses: actions/checkout@v6
|
|
26
|
+
- uses: astral-sh/setup-uv@v7
|
|
27
|
+
with:
|
|
28
|
+
enable-cache: true
|
|
29
|
+
cache-dependency-glob: "uv.lock"
|
|
30
|
+
- run: uv sync --locked --extra dev
|
|
31
|
+
- run: uv run mkdocs build --strict
|
|
32
|
+
|
|
33
|
+
test:
|
|
34
|
+
runs-on: macos-14
|
|
35
|
+
steps:
|
|
36
|
+
- uses: actions/checkout@v6
|
|
37
|
+
- uses: astral-sh/setup-uv@v7
|
|
38
|
+
with:
|
|
39
|
+
enable-cache: true
|
|
40
|
+
cache-dependency-glob: "uv.lock"
|
|
41
|
+
- run: uv sync --locked --extra dev
|
|
42
|
+
- run: uv run pytest tests/ -v -m "not slow"
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
name: Dependabot Auto-Merge
|
|
2
|
+
|
|
3
|
+
on: pull_request
|
|
4
|
+
|
|
5
|
+
permissions:
|
|
6
|
+
contents: write
|
|
7
|
+
pull-requests: write
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
auto-merge:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
if: github.actor == 'dependabot[bot]'
|
|
13
|
+
steps:
|
|
14
|
+
- name: Fetch Dependabot metadata
|
|
15
|
+
id: metadata
|
|
16
|
+
uses: dependabot/fetch-metadata@v2
|
|
17
|
+
with:
|
|
18
|
+
github-token: "${{ secrets.GITHUB_TOKEN }}"
|
|
19
|
+
|
|
20
|
+
- name: Auto-merge minor and patch updates
|
|
21
|
+
if: >-
|
|
22
|
+
steps.metadata.outputs.update-type == 'version-update:semver-minor' ||
|
|
23
|
+
steps.metadata.outputs.update-type == 'version-update:semver-patch'
|
|
24
|
+
run: gh pr merge --auto --squash "$PR_URL"
|
|
25
|
+
env:
|
|
26
|
+
PR_URL: ${{ github.event.pull_request.html_url }}
|
|
27
|
+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
name: Docs
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
contents: read
|
|
9
|
+
pages: write
|
|
10
|
+
id-token: write
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
deploy:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
environment:
|
|
16
|
+
name: github-pages
|
|
17
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v6
|
|
20
|
+
- uses: astral-sh/setup-uv@v7
|
|
21
|
+
with:
|
|
22
|
+
enable-cache: true
|
|
23
|
+
cache-dependency-glob: "uv.lock"
|
|
24
|
+
- run: uv sync --locked --extra dev
|
|
25
|
+
- run: uv run mkdocs build
|
|
26
|
+
- uses: actions/upload-pages-artifact@v4
|
|
27
|
+
with:
|
|
28
|
+
path: site
|
|
29
|
+
- id: deployment
|
|
30
|
+
uses: actions/deploy-pages@v4
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
id-token: write
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
publish:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
environment: pypi
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v6
|
|
16
|
+
|
|
17
|
+
- uses: astral-sh/setup-uv@v7
|
|
18
|
+
with:
|
|
19
|
+
enable-cache: true
|
|
20
|
+
cache-dependency-glob: "uv.lock"
|
|
21
|
+
|
|
22
|
+
- run: uv build
|
|
23
|
+
|
|
24
|
+
- name: Publish to PyPI
|
|
25
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
lmxlab-0.2.0/.gitignore
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.pyc
|
|
4
|
+
*.pyo
|
|
5
|
+
dist/
|
|
6
|
+
*.egg-info/
|
|
7
|
+
|
|
8
|
+
# Tool caches
|
|
9
|
+
.ruff_cache/
|
|
10
|
+
.mypy_cache/
|
|
11
|
+
.pytest_cache/
|
|
12
|
+
|
|
13
|
+
# Virtual environment
|
|
14
|
+
.venv/
|
|
15
|
+
|
|
16
|
+
# Claude Code (local-only)
|
|
17
|
+
.claude/
|
|
18
|
+
CLAUDE.md
|
|
19
|
+
|
|
20
|
+
# Secrets
|
|
21
|
+
.env
|
|
22
|
+
.env.*
|
|
23
|
+
credentials.json
|
|
24
|
+
|
|
25
|
+
# Data (downloaded by recipes)
|
|
26
|
+
/data/
|
|
27
|
+
|
|
28
|
+
# Experiment results
|
|
29
|
+
results.jsonl
|
|
30
|
+
|
|
31
|
+
# Agent team output (temporary)
|
|
32
|
+
.team-output/
|
|
33
|
+
|
|
34
|
+
# Checkpoints
|
|
35
|
+
checkpoints/
|
|
36
|
+
*.safetensors
|
|
37
|
+
|
|
38
|
+
# Build artifacts
|
|
39
|
+
site/
|
|
40
|
+
|
|
41
|
+
# OS
|
|
42
|
+
.DS_Store
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
3
|
+
rev: v5.0.0
|
|
4
|
+
hooks:
|
|
5
|
+
- id: trailing-whitespace
|
|
6
|
+
- id: end-of-file-fixer
|
|
7
|
+
- id: check-yaml
|
|
8
|
+
- id: check-toml
|
|
9
|
+
- id: check-added-large-files
|
|
10
|
+
args: ["--maxkb=500"]
|
|
11
|
+
|
|
12
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
13
|
+
rev: v0.11.3
|
|
14
|
+
hooks:
|
|
15
|
+
- id: ruff
|
|
16
|
+
args: [--fix]
|
|
17
|
+
- id: ruff-format
|
|
18
|
+
|
|
19
|
+
- repo: https://github.com/astral-sh/uv-pre-commit
|
|
20
|
+
rev: 0.7.12
|
|
21
|
+
hooks:
|
|
22
|
+
- id: uv-lock
|
|
23
|
+
|
|
24
|
+
- repo: https://github.com/compilerla/conventional-pre-commit
|
|
25
|
+
rev: v4.0.0
|
|
26
|
+
hooks:
|
|
27
|
+
- id: conventional-pre-commit
|
|
28
|
+
stages: [commit-msg]
|
|
29
|
+
args: [feat, fix, refactor, test, docs, chore, ci, build, perf]
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to lmxlab will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.2.0] - 2026-03-14
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- **16 new architecture config factories**: DeepSeek V3 (MLA + MoE),
|
|
15
|
+
Nemotron (hybrid Mamba-Transformer MoE), Llama 4 Scout/Maverick
|
|
16
|
+
(iRoPE + chunked attention + MoE), Mistral Small (sliding window),
|
|
17
|
+
OLMo 2 (QK-norm), GPT-OSS (QK-norm), Grok (SharedExpertMoE),
|
|
18
|
+
Kimi K2.5 (DeltaNet + MoE), Qwen-Next (gated attention),
|
|
19
|
+
SmolLM3 (iRoPE), Qwen 3 MoE, Falcon H1 (hybrid Mamba-2),
|
|
20
|
+
Jamba (Mamba-2 + MoE), Bamba (hybrid Mamba-2), GLM-4.5 (MLA NoPE)
|
|
21
|
+
- **Mamba-2 SSD**: structured state-space sequence mixer with chunked
|
|
22
|
+
parallel scan and recurrent inference paths
|
|
23
|
+
- **Mamba-3**: trapezoidal discretization, BCNorm, complex A
|
|
24
|
+
(data-dependent RoPE on B/C)
|
|
25
|
+
- **QK-norm**: per-head RMSNorm on Q and K projections (OLMo 2 style)
|
|
26
|
+
- **GatedGQA**: sigmoid output gating on attention
|
|
27
|
+
(arXiv:2505.06708)
|
|
28
|
+
- **ChunkedGQA**: fixed-size local attention with per-chunk RoPE
|
|
29
|
+
(Llama 4 iRoPE pattern)
|
|
30
|
+
- **LatentMoE**: down-project before routing for many-expert MoE
|
|
31
|
+
(arXiv:2601.18089)
|
|
32
|
+
- **SharedExpertMoE**: shared expert alongside routed experts
|
|
33
|
+
(DeepSeek V3 style)
|
|
34
|
+
- **ReluSquaredFFN**: squared ReLU activation (Primer / Nemotron)
|
|
35
|
+
- **muP parameterization**: width-independent hyperparameter transfer
|
|
36
|
+
- **Dropout support**: configurable dropout in attention and FFN
|
|
37
|
+
- **SparseGQA (DSA)**: DeepSeek Sparse Attention with compressed tokens,
|
|
38
|
+
selected tokens, and sliding window (arXiv:2512.02556)
|
|
39
|
+
- **GRPOTrainer**: full GRPO training loop with group sampling, reward
|
|
40
|
+
scoring, and clipped surrogate objective (arXiv:2501.12948)
|
|
41
|
+
- **Beam search**: standard beam search with optional custom scoring
|
|
42
|
+
- **RewardModel**: language model + scalar head for reward scoring
|
|
43
|
+
|
|
44
|
+
## [0.1.0] - 2026-03-11
|
|
45
|
+
|
|
46
|
+
Initial release.
|
|
47
|
+
|
|
48
|
+
### Added
|
|
49
|
+
|
|
50
|
+
- **8 architecture config factories**: GPT, LLaMA, Gemma, Qwen, Mixtral (MoE),
|
|
51
|
+
DeepSeek V2 (MLA), Gemma 3 (sliding window), Qwen 3.5 (hybrid DeltaNet)
|
|
52
|
+
- **Core components**: MHA, GQA, MLA, GatedDeltaNet, SlidingWindowGQA,
|
|
53
|
+
StandardFFN, GatedFFN, MoEFFN, SharedExpertMoEFFN, RMSNorm, LayerNorm,
|
|
54
|
+
RoPE, ALiBi, sinusoidal positional encoding
|
|
55
|
+
- **ConfigurableBlock** with typed Registry pattern for component resolution
|
|
56
|
+
- **LanguageModel** base class with tied/untied embeddings and KV cache
|
|
57
|
+
- **Compiled training** with `mx.compile`, `nn.value_and_grad`, gradient
|
|
58
|
+
clipping, and cosine/linear/warmup learning rate schedules
|
|
59
|
+
- **Optimizers**: AdamW, Lion, Adafactor, SGD with momentum
|
|
60
|
+
- **Advanced training**: DPO, GRPO, multi-token prediction, curriculum
|
|
61
|
+
learning, knowledge distillation
|
|
62
|
+
- **LoRA and QLoRA**: parameter-efficient fine-tuning with optional 4-bit
|
|
63
|
+
quantization
|
|
64
|
+
- **Post-training quantization**: 4-bit and 8-bit via MLX native quantization,
|
|
65
|
+
with dequantization support
|
|
66
|
+
- **Inference**: autoregressive generation with KV cache, streaming generation,
|
|
67
|
+
top-k/top-p/temperature sampling, repetition penalty, stop tokens
|
|
68
|
+
- **Advanced inference**: best-of-N sampling, majority vote, speculative
|
|
69
|
+
decoding
|
|
70
|
+
- **HuggingFace integration**: load pretrained weights (`load_from_hf`),
|
|
71
|
+
tokenizer wrapper (`HFTokenizer`), streaming dataset (`HFDataset`)
|
|
72
|
+
- **Data pipeline**: CharTokenizer, TiktokenTokenizer, TextDataset,
|
|
73
|
+
TokenDataset, batch iterator
|
|
74
|
+
- **Evaluation**: perplexity, bits-per-byte, pass@k for code generation
|
|
75
|
+
- **Experiment framework**: ExperimentRunner with time budgets, ExperimentLog,
|
|
76
|
+
grid/random hyperparameter sweeps, statistical analysis (confidence
|
|
77
|
+
intervals, Cohen's d, experiment comparison)
|
|
78
|
+
- **MLX profiling**: benchmark_fn, memory_estimate, profile_forward,
|
|
79
|
+
profile_generation, count_parameters_by_module
|
|
80
|
+
- **CLI**: `lmxlab list`, `lmxlab info`, `lmxlab count`
|
|
81
|
+
- **Callbacks**: MetricsLogger, EarlyStopping, ThroughputMonitor
|
|
82
|
+
- **Checkpointing**: save/load via safetensors with JSON metadata
|
|
83
|
+
- **31 recipe scripts** covering training, fine-tuning, DPO, GRPO, MTP,
|
|
84
|
+
distillation, curriculum learning, architecture comparison, ablation
|
|
85
|
+
studies, quantization, speculative decoding, and more
|
|
86
|
+
- **Documentation site** with MkDocs Material: architecture guides, MLX
|
|
87
|
+
idioms, model comparison, API reference, recipes index, devlog
|
|
88
|
+
- **PyPI publish workflow** via trusted publishing (OIDC)
|
|
89
|
+
|
|
90
|
+
[Unreleased]: https://github.com/michaelellis003/lmxlab/compare/v0.2.0...HEAD
|
|
91
|
+
[0.2.0]: https://github.com/michaelellis003/lmxlab/compare/v0.1.0...v0.2.0
|
|
92
|
+
[0.1.0]: https://github.com/michaelellis003/lmxlab/releases/tag/v0.1.0
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# Contributing to lmxlab
|
|
2
|
+
|
|
3
|
+
Thanks for your interest in contributing! lmxlab is a research platform,
|
|
4
|
+
so clarity and rapid iteration are valued over production optimization.
|
|
5
|
+
|
|
6
|
+
## Setup
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
git clone https://github.com/michaelellis003/lmxlab.git
|
|
10
|
+
cd lmxlab
|
|
11
|
+
uv sync --extra dev
|
|
12
|
+
uv run pre-commit install
|
|
13
|
+
uv run pre-commit install --hook-type commit-msg
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
The pre-commit hooks will automatically:
|
|
17
|
+
- Run ruff lint (with auto-fix) and ruff format on staged files
|
|
18
|
+
- Verify `uv.lock` stays in sync with `pyproject.toml`
|
|
19
|
+
- Enforce conventional commit message format
|
|
20
|
+
|
|
21
|
+
## Development workflow
|
|
22
|
+
|
|
23
|
+
1. Create a feature branch from `main`:
|
|
24
|
+
```bash
|
|
25
|
+
git checkout -b feat/my-feature
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
2. Write tests first (TDD). Tests go in `tests/`:
|
|
29
|
+
```bash
|
|
30
|
+
uv run pytest tests/test_my_module.py -v
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
3. Implement the feature in `src/lmxlab/`.
|
|
34
|
+
|
|
35
|
+
4. Verify everything passes locally before pushing:
|
|
36
|
+
```bash
|
|
37
|
+
uv run pytest # All tests
|
|
38
|
+
uv run ruff check src/ tests/ recipes/ # Lint
|
|
39
|
+
uv run ruff format --check src/ tests/ recipes/ # Formatting
|
|
40
|
+
uv run mkdocs build --strict # Docs build
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
5. Open a PR against `main`. CI must pass before merging.
|
|
44
|
+
|
|
45
|
+
## CI pipeline
|
|
46
|
+
|
|
47
|
+
Every PR runs three jobs:
|
|
48
|
+
|
|
49
|
+
- **lint** (ubuntu): ruff check + format on `src/`, `tests/`, `recipes/`
|
|
50
|
+
- **docs** (ubuntu): `mkdocs build --strict` catches broken links/refs
|
|
51
|
+
- **test** (macos-14): pytest on Apple Silicon (MLX requires M-series); tests marked `slow` are skipped
|
|
52
|
+
|
|
53
|
+
All three must pass before merging. Do not bypass CI by merging with `gh pr merge --admin`.
|
|
54
|
+
|
|
55
|
+
## Keeping `uv.lock` in sync
|
|
56
|
+
|
|
57
|
+
If you change `pyproject.toml` (add/remove/update dependencies), you must
|
|
58
|
+
regenerate the lockfile:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
uv lock
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
The `uv-lock` pre-commit hook catches this automatically. If CI fails with
|
|
65
|
+
"lockfile needs to be updated", run `uv lock` and commit the result.
|
|
66
|
+
|
|
67
|
+
## Branch naming
|
|
68
|
+
|
|
69
|
+
- `feat/` — new features
|
|
70
|
+
- `fix/` — bug fixes
|
|
71
|
+
- `docs/` — documentation changes
|
|
72
|
+
- `refactor/` — code restructuring
|
|
73
|
+
- `test/` — test additions
|
|
74
|
+
|
|
75
|
+
## Commit messages
|
|
76
|
+
|
|
77
|
+
Follow the `type: description` format (enforced by pre-commit hook):
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
feat: add sliding window attention
|
|
81
|
+
fix: correct RoPE dimension calculation
|
|
82
|
+
docs: expand installation guide
|
|
83
|
+
test: add CLI command tests
|
|
84
|
+
refactor: simplify MoE routing logic
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Code style
|
|
88
|
+
|
|
89
|
+
- **Line length:** 79 characters
|
|
90
|
+
- **Quotes:** double quotes (enforced by ruff)
|
|
91
|
+
- **Docstrings:** Google style
|
|
92
|
+
- **Type annotations:** required on all public functions
|
|
93
|
+
- **Imports:** sorted by ruff (stdlib, third-party, local)
|
|
94
|
+
|
|
95
|
+
Ruff handles formatting and linting:
|
|
96
|
+
```bash
|
|
97
|
+
uv run ruff check --fix src/ tests/ recipes/ # Auto-fix lint issues
|
|
98
|
+
uv run ruff format src/ tests/ recipes/ # Auto-format
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Testing
|
|
102
|
+
|
|
103
|
+
- Use **behavioral tests** for ML code:
|
|
104
|
+
- Shape tests: output dimensions are correct
|
|
105
|
+
- Invariance tests: same input + seed = same output
|
|
106
|
+
- Directional tests: loss decreases after training
|
|
107
|
+
- Minimum functionality: no NaN, no Inf
|
|
108
|
+
- Keep tests fast: use tiny model configs (`gpt_tiny()`, `llama_tiny()`)
|
|
109
|
+
- Mark slow tests with `@pytest.mark.slow`
|
|
110
|
+
|
|
111
|
+
## Architecture guidelines
|
|
112
|
+
|
|
113
|
+
- **Config factories, not subclasses.** New architectures should be
|
|
114
|
+
config factory functions, not new model classes.
|
|
115
|
+
- **Registry pattern.** New attention/FFN/norm types should register
|
|
116
|
+
themselves in the appropriate registry.
|
|
117
|
+
- **Simplicity bias.** When two approaches achieve similar results,
|
|
118
|
+
prefer the simpler one.
|
|
119
|
+
- **Clarity.** Comments should explain *why*, not just *what*.
|
|
120
|
+
|
|
121
|
+
## Citations and attribution
|
|
122
|
+
|
|
123
|
+
Every new building block (attention, FFN, position encoding, SSM,
|
|
124
|
+
etc.) must cite its source paper with an arXiv ID in the module
|
|
125
|
+
docstring:
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
"""My new attention variant.
|
|
129
|
+
|
|
130
|
+
Reference: Author (Year, arXiv:XXXX.XXXXX)
|
|
131
|
+
"""
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Code adapted from reference implementations must note the source:
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
# Cross-references:
|
|
138
|
+
# - org/repo filename.py (canonical implementation)
|
|
139
|
+
# - HuggingFace transformers modeling_xxx.py
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
Use `Reference:` for the originating paper and `Cross-references:`
|
|
143
|
+
for implementation sources consulted during development.
|
|
144
|
+
|
|
145
|
+
## Adding a new architecture
|
|
146
|
+
|
|
147
|
+
1. Create `src/lmxlab/models/myarch.py` with a config factory function
|
|
148
|
+
2. Register any new components in the appropriate registry
|
|
149
|
+
3. Add a `myarch_tiny()` config for tests
|
|
150
|
+
4. Add tests in `tests/test_architectures.py`
|
|
151
|
+
5. Update `src/lmxlab/models/__init__.py` exports
|
|
152
|
+
6. Add to CLI in `src/lmxlab/cli.py`
|
|
153
|
+
7. Document in `docs/models/index.md`
|
lmxlab-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Michael Ellis
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|