freesolo-chalk 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- freesolo_chalk-0.1.0/.github/workflows/ci.yml +43 -0
- freesolo_chalk-0.1.0/.github/workflows/main-source-guard.yml +20 -0
- freesolo_chalk-0.1.0/.gitignore +28 -0
- freesolo_chalk-0.1.0/.pre-commit-config.yaml +10 -0
- freesolo_chalk-0.1.0/LICENSE +25 -0
- freesolo_chalk-0.1.0/Makefile +41 -0
- freesolo_chalk-0.1.0/NOTICE +26 -0
- freesolo_chalk-0.1.0/PKG-INFO +104 -0
- freesolo_chalk-0.1.0/README.md +56 -0
- freesolo_chalk-0.1.0/benchmark/README.md +20 -0
- freesolo_chalk-0.1.0/benchmark/__init__.py +0 -0
- freesolo_chalk-0.1.0/benchmark/scripts/.gitkeep +0 -0
- freesolo_chalk-0.1.0/dev/fmt-requirements.txt +1 -0
- freesolo_chalk-0.1.0/pyproject.toml +84 -0
- freesolo_chalk-0.1.0/setup.cfg +4 -0
- freesolo_chalk-0.1.0/setup.py +102 -0
- freesolo_chalk-0.1.0/src/chalk/__init__.py +0 -0
- freesolo_chalk-0.1.0/src/chalk/ops/__init__.py +12 -0
- freesolo_chalk-0.1.0/src/chalk/ops/embedding.py +353 -0
- freesolo_chalk-0.1.0/src/chalk/ops/fp8_base.py +349 -0
- freesolo_chalk-0.1.0/src/chalk/ops/lora.py +608 -0
- freesolo_chalk-0.1.0/src/chalk/ops/mlp.py +947 -0
- freesolo_chalk-0.1.0/src/chalk/ops/qkv.py +636 -0
- freesolo_chalk-0.1.0/src/chalk/ops/rope.py +455 -0
- freesolo_chalk-0.1.0/src/chalk/transformers/__init__.py +38 -0
- freesolo_chalk-0.1.0/src/chalk/transformers/apply.py +160 -0
- freesolo_chalk-0.1.0/src/chalk/transformers/embedding.py +5 -0
- freesolo_chalk-0.1.0/src/chalk/transformers/fp8_base.py +5 -0
- freesolo_chalk-0.1.0/src/chalk/transformers/lora.py +5 -0
- freesolo_chalk-0.1.0/src/chalk/transformers/mlp.py +6 -0
- freesolo_chalk-0.1.0/src/chalk/transformers/qkv.py +5 -0
- freesolo_chalk-0.1.0/src/chalk/transformers/rope.py +5 -0
- freesolo_chalk-0.1.0/src/chalk/utils.py +35 -0
- freesolo_chalk-0.1.0/src/freesolo_chalk.egg-info/PKG-INFO +104 -0
- freesolo_chalk-0.1.0/src/freesolo_chalk.egg-info/SOURCES.txt +57 -0
- freesolo_chalk-0.1.0/src/freesolo_chalk.egg-info/dependency_links.txt +1 -0
- freesolo_chalk-0.1.0/src/freesolo_chalk.egg-info/requires.txt +11 -0
- freesolo_chalk-0.1.0/src/freesolo_chalk.egg-info/top_level.txt +1 -0
- freesolo_chalk-0.1.0/test/__init__.py +0 -0
- freesolo_chalk-0.1.0/test/conftest.py +31 -0
- freesolo_chalk-0.1.0/test/ops/__init__.py +0 -0
- freesolo_chalk-0.1.0/test/ops/test_embedding.py +16 -0
- freesolo_chalk-0.1.0/test/ops/test_fp8_base.py +81 -0
- freesolo_chalk-0.1.0/test/ops/test_lora.py +82 -0
- freesolo_chalk-0.1.0/test/ops/test_mlp.py +337 -0
- freesolo_chalk-0.1.0/test/ops/test_ops_contract.py +23 -0
- freesolo_chalk-0.1.0/test/ops/test_qkv.py +16 -0
- freesolo_chalk-0.1.0/test/ops/test_rope.py +105 -0
- freesolo_chalk-0.1.0/test/test_helpers.py +67 -0
- freesolo_chalk-0.1.0/test/test_package.py +68 -0
- freesolo_chalk-0.1.0/test/test_smoke.py +38 -0
- freesolo_chalk-0.1.0/test/test_utils.py +86 -0
- freesolo_chalk-0.1.0/test/transformers/__init__.py +0 -0
- freesolo_chalk-0.1.0/test/transformers/test_apply_aggregator.py +220 -0
- freesolo_chalk-0.1.0/test/transformers/test_installer_noop.py +63 -0
- freesolo_chalk-0.1.0/test/transformers/test_kernel_install.py +97 -0
- freesolo_chalk-0.1.0/test/transformers/test_rope_install.py +75 -0
- freesolo_chalk-0.1.0/test/transformers/test_transformers_contract.py +33 -0
- freesolo_chalk-0.1.0/test/utils.py +170 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
name: ci
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
workflow_dispatch:
|
|
5
|
+
push:
|
|
6
|
+
branches: [main, dev]
|
|
7
|
+
pull_request:
|
|
8
|
+
branches: [main, dev]
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
checkstyle:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
- uses: actions/setup-python@v5
|
|
16
|
+
with:
|
|
17
|
+
python-version: "3.11"
|
|
18
|
+
- name: Install ruff
|
|
19
|
+
run: pip install -r dev/fmt-requirements.txt
|
|
20
|
+
- name: Lint
|
|
21
|
+
run: ruff check --output-format=concise .
|
|
22
|
+
- name: Format check
|
|
23
|
+
run: ruff format --check --diff .
|
|
24
|
+
|
|
25
|
+
test:
|
|
26
|
+
runs-on: ubuntu-latest
|
|
27
|
+
strategy:
|
|
28
|
+
matrix:
|
|
29
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
30
|
+
steps:
|
|
31
|
+
- uses: actions/checkout@v4
|
|
32
|
+
- uses: actions/setup-python@v5
|
|
33
|
+
with:
|
|
34
|
+
python-version: ${{ matrix.python-version }}
|
|
35
|
+
# CPU runner: `pip install -e .` also pulls torch/triton via setup.py's install_requires. GPU-only
|
|
36
|
+
# kernel correctness tests self-skip when torch/triton/CUDA are unavailable.
|
|
37
|
+
- name: Install
|
|
38
|
+
run: |
|
|
39
|
+
python -m pip install --upgrade pip
|
|
40
|
+
pip install -e .
|
|
41
|
+
pip install pytest pytest-cov
|
|
42
|
+
- name: Test (CPU)
|
|
43
|
+
run: python -m pytest --disable-warnings test/
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
name: Main source guard
|
|
2
|
+
on:
|
|
3
|
+
pull_request:
|
|
4
|
+
branches: [main]
|
|
5
|
+
permissions:
|
|
6
|
+
contents: read
|
|
7
|
+
jobs:
|
|
8
|
+
source-is-dev:
|
|
9
|
+
name: Source branch is dev
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- name: Require PRs into main to originate from dev
|
|
13
|
+
env:
|
|
14
|
+
HEAD_REF: ${{ github.head_ref }}
|
|
15
|
+
run: |
|
|
16
|
+
if [ "$HEAD_REF" != "dev" ]; then
|
|
17
|
+
echo "::error::PRs into main must come from 'dev' (got '$HEAD_REF'). Merge into dev, then promote dev -> main."
|
|
18
|
+
exit 1
|
|
19
|
+
fi
|
|
20
|
+
echo "Source branch '$HEAD_REF' is allowed."
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.egg-info/
|
|
3
|
+
site/
|
|
4
|
+
.cache/
|
|
5
|
+
.venv/
|
|
6
|
+
venv/
|
|
7
|
+
.ipynb_checkpoints/
|
|
8
|
+
.vscode/
|
|
9
|
+
.idea/
|
|
10
|
+
|
|
11
|
+
# Misc
|
|
12
|
+
.DS_Store
|
|
13
|
+
|
|
14
|
+
# Build
|
|
15
|
+
build/
|
|
16
|
+
dist/
|
|
17
|
+
|
|
18
|
+
# Lockfiles
|
|
19
|
+
uv.lock
|
|
20
|
+
|
|
21
|
+
# Benchmark images
|
|
22
|
+
benchmark/visualizations
|
|
23
|
+
|
|
24
|
+
# Coverage
|
|
25
|
+
.coverage
|
|
26
|
+
htmlcov/
|
|
27
|
+
.pytest_cache/
|
|
28
|
+
.ruff_cache/
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
BSD 2-CLAUSE LICENSE
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Freesolo, Inc.
|
|
4
|
+
All rights reserved.
|
|
5
|
+
|
|
6
|
+
Redistribution and use in source and binary forms, with or without
|
|
7
|
+
modification, are permitted provided that the following conditions are met:
|
|
8
|
+
|
|
9
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
10
|
+
list of conditions and the following disclaimer.
|
|
11
|
+
|
|
12
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
13
|
+
this list of conditions and the following disclaimer in the documentation
|
|
14
|
+
and/or other materials provided with the distribution.
|
|
15
|
+
|
|
16
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
17
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
18
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
19
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
20
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
21
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
22
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
23
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
24
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
25
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
.PHONY: test checkstyle coverage run-benchmarks all
|
|
2
|
+
|
|
3
|
+
all: checkstyle test
|
|
4
|
+
|
|
5
|
+
# Command to run pytest for correctness tests
|
|
6
|
+
test:
|
|
7
|
+
python -m pytest --disable-warnings \
|
|
8
|
+
--cov=src/chalk \
|
|
9
|
+
--cov-report=term-missing \
|
|
10
|
+
test/
|
|
11
|
+
|
|
12
|
+
# Command to run coverage report
|
|
13
|
+
coverage:
|
|
14
|
+
coverage report -m
|
|
15
|
+
|
|
16
|
+
# Command to run ruff for linting and formatting code
|
|
17
|
+
checkstyle:
|
|
18
|
+
ruff check --output-format=concise .; ruff_check_status=$$?; \
|
|
19
|
+
ruff format --check --diff .; ruff_format_status=$$?; \
|
|
20
|
+
ruff check . --fix; \
|
|
21
|
+
ruff format .; \
|
|
22
|
+
if [ $$ruff_check_status -ne 0 ] || [ $$ruff_format_status -ne 0 ]; then \
|
|
23
|
+
exit 1; \
|
|
24
|
+
fi
|
|
25
|
+
|
|
26
|
+
# Command to run all benchmark scripts and update benchmarking data file
|
|
27
|
+
# By default this doesn't overwrite existing data for the same benchmark experiment
|
|
28
|
+
# run with `make run-benchmarks OVERWRITE=1` to overwrite existing benchmark data
|
|
29
|
+
BENCHMARK_DIR = benchmark/scripts
|
|
30
|
+
BENCHMARK_SCRIPTS = $(wildcard $(BENCHMARK_DIR)/benchmark_*.py)
|
|
31
|
+
OVERWRITE ?= 0
|
|
32
|
+
|
|
33
|
+
run-benchmarks:
|
|
34
|
+
@for script in $(BENCHMARK_SCRIPTS); do \
|
|
35
|
+
echo "Running benchmark: $$script"; \
|
|
36
|
+
if [ $(OVERWRITE) -eq 1 ]; then \
|
|
37
|
+
python $$script --overwrite; \
|
|
38
|
+
else \
|
|
39
|
+
python $$script; \
|
|
40
|
+
fi; \
|
|
41
|
+
done
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
Copyright 2026 Freesolo, Inc.
|
|
2
|
+
All Rights Reserved.
|
|
3
|
+
|
|
4
|
+
Licensed under the BSD 2-Clause License (the "License"). See LICENSE in the
|
|
5
|
+
project root for license information.
|
|
6
|
+
|
|
7
|
+
Chalk provides custom Triton/CUDA kernels designed to complement Liger Kernel
|
|
8
|
+
(https://github.com/linkedin/Liger-Kernel, BSD 2-Clause). Its repository layout,
|
|
9
|
+
tooling, and conventions intentionally mirror Liger Kernel.
|
|
10
|
+
|
|
11
|
+
This product may contain code derived from the following open source projects:
|
|
12
|
+
|
|
13
|
+
1. Liger Kernel
|
|
14
|
+
Copyright (c) 2024 LinkedIn Corporation
|
|
15
|
+
Licensed under the BSD 2-Clause License
|
|
16
|
+
Source: https://github.com/linkedin/Liger-Kernel
|
|
17
|
+
|
|
18
|
+
Repository structure (src/ layout, ops/transformers split), benchmarking
|
|
19
|
+
harness conventions, and test scaffolding were referenced from this project.
|
|
20
|
+
|
|
21
|
+
2. Triton
|
|
22
|
+
Copyright (c) 2023 OpenAI
|
|
23
|
+
Licensed under the MIT License
|
|
24
|
+
Source: https://github.com/openai/triton
|
|
25
|
+
|
|
26
|
+
For full license texts, please refer to the respective project repositories.
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: freesolo-chalk
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Custom Triton/CUDA kernels that complement Liger Kernel for LLM post-training
|
|
5
|
+
License: BSD 2-CLAUSE LICENSE
|
|
6
|
+
|
|
7
|
+
Copyright (c) 2026 Freesolo, Inc.
|
|
8
|
+
All rights reserved.
|
|
9
|
+
|
|
10
|
+
Redistribution and use in source and binary forms, with or without
|
|
11
|
+
modification, are permitted provided that the following conditions are met:
|
|
12
|
+
|
|
13
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
14
|
+
list of conditions and the following disclaimer.
|
|
15
|
+
|
|
16
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
17
|
+
this list of conditions and the following disclaimer in the documentation
|
|
18
|
+
and/or other materials provided with the distribution.
|
|
19
|
+
|
|
20
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
21
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
22
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
23
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
24
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
25
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
26
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
27
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
28
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
29
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
30
|
+
|
|
31
|
+
Project-URL: Homepage, https://github.com/freesolo-co/chalk
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
License-File: LICENSE
|
|
34
|
+
License-File: NOTICE
|
|
35
|
+
Requires-Dist: torch>=2.1.2
|
|
36
|
+
Requires-Dist: triton>=2.3.1
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Requires-Dist: transformers>=4.52.0; extra == "dev"
|
|
39
|
+
Requires-Dist: matplotlib>=3.7.2; extra == "dev"
|
|
40
|
+
Requires-Dist: ruff>=0.12.0; extra == "dev"
|
|
41
|
+
Requires-Dist: pytest>=7.1.2; extra == "dev"
|
|
42
|
+
Requires-Dist: pytest-xdist; extra == "dev"
|
|
43
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
44
|
+
Requires-Dist: datasets>=2.19.2; extra == "dev"
|
|
45
|
+
Dynamic: license-file
|
|
46
|
+
Dynamic: provides-extra
|
|
47
|
+
Dynamic: requires-dist
|
|
48
|
+
|
|
49
|
+
# Chalk
|
|
50
|
+
|
|
51
|
+
**Custom Triton/CUDA kernels that complement [Liger Kernel](https://github.com/linkedin/Liger-Kernel).**
|
|
52
|
+
|
|
53
|
+
`pip install freesolo-chalk`
|
|
54
|
+
|
|
55
|
+
Liger fuses the cross-entropy, activation, and RMSNorm paths. Chalk fills the gaps that
|
|
56
|
+
matter for Freesolo's [Flash](https://github.com/freesolo-co/flash) post-training stack —
|
|
57
|
+
fused GEMMs, the LoRA-delta matmuls, the QKV norm+RoPE epilogue, embedding gather, and
|
|
58
|
+
FP8 frozen-base GEMMs — each behind a documented, benchmarked, opt-in entry point.
|
|
59
|
+
|
|
60
|
+
Chalk's repository layout and conventions intentionally mirror Liger Kernel.
|
|
61
|
+
|
|
62
|
+
## Layout
|
|
63
|
+
|
|
64
|
+
```
|
|
65
|
+
src/chalk/
|
|
66
|
+
ops/ # raw Triton/CUDA kernels + autograd.Function wrappers
|
|
67
|
+
transformers/ # model-level installers that monkeypatch kernels into HF modules
|
|
68
|
+
utils.py # device detection helpers
|
|
69
|
+
test/ # correctness + gating tests (mirrors test/ops, test/transformers)
|
|
70
|
+
benchmark/ # speed + fp32-correctness A/B harness
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Design principles
|
|
74
|
+
|
|
75
|
+
- **Worker-side kernel library.** Like Liger, chalk depends on `torch` + `triton` — it is
|
|
76
|
+
meant to be installed where kernels actually run (the GPU worker), so consumers should
|
|
77
|
+
depend on it from a `gpu` extra rather than their base install. Importing the top-level
|
|
78
|
+
`chalk` package is still cheap (kernels lazy-load), so probing `chalk.utils.infer_device()`
|
|
79
|
+
never forces a heavy import.
|
|
80
|
+
- **Complements, not replaces, Liger.** Liger fuses CE / activation / RMSNorm; chalk fuses
|
|
81
|
+
the GEMMs, LoRA delta, QKV epilogue, embedding, and FP8 base.
|
|
82
|
+
- **Safe fallback.** Every installer is arch-gated, runs a numeric self-test on install,
|
|
83
|
+
patches only frozen `nn.Linear` layers (never trainable / PEFT-wrapped layers), and
|
|
84
|
+
silently falls back to the eager / Liger path on any import / compile / self-test failure.
|
|
85
|
+
- **Opt-in & evidence-based.** Kernels are off unless explicitly enabled, and every kept
|
|
86
|
+
kernel has end-to-end loss-curve evidence — not just a microbenchmark.
|
|
87
|
+
|
|
88
|
+
## Development
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
pip install -e '.[dev]'
|
|
92
|
+
make checkstyle # ruff check + format
|
|
93
|
+
make test # pytest with coverage
|
|
94
|
+
make run-benchmarks
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Status
|
|
98
|
+
|
|
99
|
+
Intentionally minimal to start — kernels are landed one at a time under `chalk/ops` +
|
|
100
|
+
`chalk/transformers`, each with correctness tests and benchmark evidence.
|
|
101
|
+
|
|
102
|
+
## License
|
|
103
|
+
|
|
104
|
+
BSD-2-Clause. See [LICENSE](LICENSE) and [NOTICE](NOTICE).
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# Chalk
|
|
2
|
+
|
|
3
|
+
**Custom Triton/CUDA kernels that complement [Liger Kernel](https://github.com/linkedin/Liger-Kernel).**
|
|
4
|
+
|
|
5
|
+
`pip install freesolo-chalk`
|
|
6
|
+
|
|
7
|
+
Liger fuses the cross-entropy, activation, and RMSNorm paths. Chalk fills the gaps that
|
|
8
|
+
matter for Freesolo's [Flash](https://github.com/freesolo-co/flash) post-training stack —
|
|
9
|
+
fused GEMMs, the LoRA-delta matmuls, the QKV norm+RoPE epilogue, embedding gather, and
|
|
10
|
+
FP8 frozen-base GEMMs — each behind a documented, benchmarked, opt-in entry point.
|
|
11
|
+
|
|
12
|
+
Chalk's repository layout and conventions intentionally mirror Liger Kernel.
|
|
13
|
+
|
|
14
|
+
## Layout
|
|
15
|
+
|
|
16
|
+
```
|
|
17
|
+
src/chalk/
|
|
18
|
+
ops/ # raw Triton/CUDA kernels + autograd.Function wrappers
|
|
19
|
+
transformers/ # model-level installers that monkeypatch kernels into HF modules
|
|
20
|
+
utils.py # device detection helpers
|
|
21
|
+
test/ # correctness + gating tests (mirrors test/ops, test/transformers)
|
|
22
|
+
benchmark/ # speed + fp32-correctness A/B harness
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Design principles
|
|
26
|
+
|
|
27
|
+
- **Worker-side kernel library.** Like Liger, chalk depends on `torch` + `triton` — it is
|
|
28
|
+
meant to be installed where kernels actually run (the GPU worker), so consumers should
|
|
29
|
+
depend on it from a `gpu` extra rather than their base install. Importing the top-level
|
|
30
|
+
`chalk` package is still cheap (kernels lazy-load), so probing `chalk.utils.infer_device()`
|
|
31
|
+
never forces a heavy import.
|
|
32
|
+
- **Complements, not replaces, Liger.** Liger fuses CE / activation / RMSNorm; chalk fuses
|
|
33
|
+
the GEMMs, LoRA delta, QKV epilogue, embedding, and FP8 base.
|
|
34
|
+
- **Safe fallback.** Every installer is arch-gated, runs a numeric self-test on install,
|
|
35
|
+
patches only frozen `nn.Linear` layers (never trainable / PEFT-wrapped layers), and
|
|
36
|
+
silently falls back to the eager / Liger path on any import / compile / self-test failure.
|
|
37
|
+
- **Opt-in & evidence-based.** Kernels are off unless explicitly enabled, and every kept
|
|
38
|
+
kernel has end-to-end loss-curve evidence — not just a microbenchmark.
|
|
39
|
+
|
|
40
|
+
## Development
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install -e '.[dev]'
|
|
44
|
+
make checkstyle # ruff check + format
|
|
45
|
+
make test # pytest with coverage
|
|
46
|
+
make run-benchmarks
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Status
|
|
50
|
+
|
|
51
|
+
Intentionally minimal to start — kernels are landed one at a time under `chalk/ops` +
|
|
52
|
+
`chalk/transformers`, each with correctness tests and benchmark evidence.
|
|
53
|
+
|
|
54
|
+
## License
|
|
55
|
+
|
|
56
|
+
BSD-2-Clause. See [LICENSE](LICENSE) and [NOTICE](NOTICE).
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
## Benchmarking Chalk Kernels
|
|
2
|
+
|
|
3
|
+
Chalk's benchmarking harness mirrors [Liger Kernel's](https://github.com/linkedin/Liger-Kernel/tree/main/benchmark):
|
|
4
|
+
each kernel ships a `benchmark/scripts/benchmark_<kernel>.py` that A/Bs the chalk kernel
|
|
5
|
+
against the eager / Liger / `torch` baseline across model configs and sequence lengths,
|
|
6
|
+
reporting both speed and an fp32-correctness check.
|
|
7
|
+
|
|
8
|
+
### Running
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
make run-benchmarks # run every benchmark/scripts/benchmark_*.py
|
|
12
|
+
make run-benchmarks OVERWRITE=1 # overwrite existing recorded data
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
### Conventions
|
|
16
|
+
|
|
17
|
+
* Every kept kernel must have **end-to-end** evidence (a real LoRA-SFT / GRPO loss-curve
|
|
18
|
+
A/B), not just a microbenchmark — a microbenchmark win that disappears E2E is not a win.
|
|
19
|
+
* Record the GPU (A40 / A6000 / H100 / H200) the numbers were measured on; cuBLAS strength
|
|
20
|
+
differs enough across SKUs that a win on one can vanish on another.
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ruff>=0.1.6
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
[build-system]
# setuptools only reads the [project] table below from version 61 onward;
# older releases would silently ignore the name/version declared there.
requires = ["setuptools>=61", "wheel", "setuptools-scm"]
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
name = "freesolo-chalk"
version = "0.1.0"
description = "Custom Triton/CUDA kernels that complement Liger Kernel for LLM post-training"
urls = { "Homepage" = "https://github.com/freesolo-co/chalk" }
readme = { file = "README.md", content-type = "text/markdown" }
license = { file = "LICENSE" }
# Every field that setup.py supplies must be listed as dynamic, otherwise
# setuptools ignores the setup.py value. `classifiers` is passed to setup()
# in setup.py, so it has to appear here to reach the built metadata.
dynamic = ["dependencies", "optional-dependencies", "classifiers"]
|
|
13
|
+
|
|
14
|
+
[tool.setuptools]
|
|
15
|
+
package-dir = {"" = "src"}
|
|
16
|
+
|
|
17
|
+
[tool.setuptools.packages.find]
|
|
18
|
+
where = ["src"]
|
|
19
|
+
include = ["chalk*"]
|
|
20
|
+
namespaces = false
|
|
21
|
+
|
|
22
|
+
[tool.pytest.ini_options]
|
|
23
|
+
pythonpath = ["src", "."]
|
|
24
|
+
addopts = [
|
|
25
|
+
"--cov=src/chalk",
|
|
26
|
+
"--cov-report=term-missing",
|
|
27
|
+
"--cov-report=html",
|
|
28
|
+
"--cov-config=pyproject.toml",
|
|
29
|
+
"--durations=0"
|
|
30
|
+
]
|
|
31
|
+
python_files = "test_*.py"
|
|
32
|
+
testpaths = ["test/"]
|
|
33
|
+
markers = [
|
|
34
|
+
"gpu: requires a CUDA GPU + torch/triton (skipped on CPU CI)",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[tool.coverage.run]
|
|
38
|
+
branch = true
|
|
39
|
+
parallel = true
|
|
40
|
+
source = ["src/chalk"]
|
|
41
|
+
concurrency = ["multiprocessing"]
|
|
42
|
+
|
|
43
|
+
[tool.coverage.paths]
|
|
44
|
+
chalk = [
|
|
45
|
+
"src/chalk",
|
|
46
|
+
"*/site-packages/chalk"
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
[tool.coverage.report]
|
|
50
|
+
omit = ["test/*"]
|
|
51
|
+
show_missing = true
|
|
52
|
+
skip_covered = false
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
[tool.ruff]
|
|
56
|
+
line-length = 120
|
|
57
|
+
target-version = "py310"
|
|
58
|
+
respect-gitignore = true
|
|
59
|
+
src = ["src"]
|
|
60
|
+
|
|
61
|
+
[tool.ruff.lint]
|
|
62
|
+
select = [
|
|
63
|
+
"E", # pycodestyle errors
|
|
64
|
+
"F", # pyflakes
|
|
65
|
+
"I", # isort
|
|
66
|
+
]
|
|
67
|
+
ignore = ["E501", "B006", "E731", "A002", "E203"]
|
|
68
|
+
|
|
69
|
+
exclude = [
|
|
70
|
+
".git",
|
|
71
|
+
"__pycache__",
|
|
72
|
+
".venv",
|
|
73
|
+
]
|
|
74
|
+
|
|
75
|
+
[tool.ruff.format]
|
|
76
|
+
quote-style = "double"
|
|
77
|
+
indent-style = "space"
|
|
78
|
+
skip-magic-trailing-comma = false
|
|
79
|
+
line-ending = "auto"
|
|
80
|
+
|
|
81
|
+
[tool.ruff.lint.isort]
|
|
82
|
+
known-first-party = ["chalk"]
|
|
83
|
+
force-single-line = true
|
|
84
|
+
lines-between-types = 1
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# setup.py
|
|
2
|
+
|
|
3
|
+
import subprocess
|
|
4
|
+
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
from setuptools import find_packages
|
|
8
|
+
from setuptools import setup
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_default_dependencies():
    """Return the install requirements for the platform detected at build time.

    The platform string comes from ``get_platform()``. A platform with no
    entry in the table yields ``None``.
    """
    detected = get_platform()

    # CUDA hosts and CPU-only hosts share one requirement set.
    nvidia_or_cpu = ["torch>=2.1.2", "triton>=2.3.1"]
    requirements_by_platform = {
        "cuda": nvidia_or_cpu,
        "cpu": nvidia_or_cpu,
        "rocm": ["triton>=3.0.0"],
        "xpu": ["torch>=2.6.0"],
    }
    return requirements_by_platform.get(detected)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_optional_dependencies():
    """Build the extras mapping handed to ``extras_require``.

    Only one extra exists: ``dev``, the tooling used for linting, testing
    and benchmarking.
    """
    return {
        "dev": [
            "transformers>=4.52.0",
            "matplotlib>=3.7.2",
            "ruff>=0.12.0",
            "pytest>=7.1.2",
            "pytest-xdist",
            "pytest-cov",
            "datasets>=2.19.2",
        ],
    }
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def is_xpu_available():
    """Check if an Intel XPU is available.

    Two probes are tried in order:

    1. ``xpu-smi`` -- a zero exit status counts as "XPU present".
    2. ``sycl-ls`` -- the device counts as present when its output
       mentions ``level_zero:gpu``.

    Returns:
        bool: ``True`` if either probe succeeds, ``False`` otherwise. A probe
        that is missing, not executable, or exits non-zero is treated as
        "not found" rather than as an error.
    """
    # OSError covers FileNotFoundError (tool not installed) and also
    # PermissionError and similar launch failures, so a broken tool on PATH
    # cannot abort the build.
    probe_errors = (subprocess.SubprocessError, OSError)

    try:
        subprocess.run(["xpu-smi"], check=True)
        return True
    except probe_errors:
        pass

    try:
        # Argument list instead of a shell string: no shell is needed to
        # launch a single executable.
        result = subprocess.run(["sycl-ls"], check=True, capture_output=True)
    except probe_errors:
        return False

    # Match on bytes so undecodable tool output cannot raise.
    return b"level_zero:gpu" in result.stdout
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def get_platform() -> Literal["cuda", "rocm", "cpu", "xpu"]:
    """Detect the accelerator platform without a torch dependency.

    Vendor tools are probed in priority order: ``nvidia-smi`` (NVIDIA),
    then ``rocm-smi`` (AMD), then ``is_xpu_available()`` (Intel). The first
    probe that succeeds decides the result, and a one-line message naming
    the detected platform is printed.

    Returns:
        ``"cuda"``, ``"rocm"`` or ``"xpu"`` for the first accelerator found,
        or ``"cpu"`` when none is detected.
    """
    smi_probes = (
        ("nvidia-smi", "cuda", "NVIDIA GPU detected"),
        ("rocm-smi", "rocm", "ROCm GPU detected"),
    )
    for tool, platform, message in smi_probes:
        try:
            subprocess.run([tool], check=True)
        except (subprocess.SubprocessError, OSError):
            # Missing, non-executable or failing tool: try the next vendor.
            # OSError also covers PermissionError, which would otherwise
            # abort the build.
            continue
        print(message)
        return platform

    if is_xpu_available():
        print("Intel GPU detected")
        return "xpu"

    print("No GPU detected")
    return "cpu"
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# Package entry point for setuptools. pyproject.toml declares `dependencies`
# and `optional-dependencies` as dynamic, so they are computed here:
# install_requires depends on the hardware detected when this script runs.
setup(
    name="freesolo-chalk",
    # src/ layout: importable packages live under src/.
    package_dir={"": "src"},
    packages=find_packages(where="src"),
    install_requires=get_default_dependencies(),
    extras_require=get_optional_dependencies(),
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Intended Audience :: Science/Research",
        "Programming Language :: Python :: 3",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Software Development :: Libraries :: Python Modules",
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
    ],
)
|
|
File without changes
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Chalk operators — raw Triton/CUDA kernels and their ``torch.autograd.Function`` wrappers.
|
|
3
|
+
|
|
4
|
+
Mirrors Liger Kernel's layout: ``chalk.ops`` holds the low-level kernel implementations
|
|
5
|
+
(``@triton.jit`` functions, autograd Functions, FP8 GEMM helpers), while ``chalk.transformers``
|
|
6
|
+
holds the model-level installers that monkeypatch these kernels into HuggingFace modules.
|
|
7
|
+
|
|
8
|
+
This namespace starts empty by design — kernels are landed one at a time, each with its own
|
|
9
|
+
benchmark evidence.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
# Deliberately empty: `from chalk.ops import *` exports no names.
__all__: list[str] = []
|