turboquant-ml 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- turboquant_ml-0.1.0/.claude/settings.local.json +95 -0
- turboquant_ml-0.1.0/.github/workflows/ci.yml +71 -0
- turboquant_ml-0.1.0/.github/workflows/docs.yml +48 -0
- turboquant_ml-0.1.0/.gitignore +69 -0
- turboquant_ml-0.1.0/.pre-commit-config.yaml +16 -0
- turboquant_ml-0.1.0/CHANGELOG.md +23 -0
- turboquant_ml-0.1.0/LICENSE +21 -0
- turboquant_ml-0.1.0/PKG-INFO +300 -0
- turboquant_ml-0.1.0/README.md +231 -0
- turboquant_ml-0.1.0/benchmarks/results/.gitkeep +0 -0
- turboquant_ml-0.1.0/benchmarks/results/README.md +23 -0
- turboquant_ml-0.1.0/benchmarks/results/gpt2.json +48 -0
- turboquant_ml-0.1.0/benchmarks/results/gpt2.png +0 -0
- turboquant_ml-0.1.0/benchmarks/results/smollm2_135m.json +48 -0
- turboquant_ml-0.1.0/benchmarks/results/smollm2_135m.png +0 -0
- turboquant_ml-0.1.0/benchmarks/scripts/sweep_cpu.py +170 -0
- turboquant_ml-0.1.0/benchmarks/scripts/sweep_llm.py +63 -0
- turboquant_ml-0.1.0/docs/CONTRIBUTING.md +58 -0
- turboquant_ml-0.1.0/docs/benchmarks.md +54 -0
- turboquant_ml-0.1.0/docs/index.md +79 -0
- turboquant_ml-0.1.0/docs/pruning.md +54 -0
- turboquant_ml-0.1.0/docs/quantization.md +101 -0
- turboquant_ml-0.1.0/examples/bench_compare.py +50 -0
- turboquant_ml-0.1.0/examples/prune_resnet.py +49 -0
- turboquant_ml-0.1.0/examples/quantize_llm_gptq.py +39 -0
- turboquant_ml-0.1.0/examples/quantize_llm_int4.py +44 -0
- turboquant_ml-0.1.0/mkdocs.yml +73 -0
- turboquant_ml-0.1.0/pyproject.toml +99 -0
- turboquant_ml-0.1.0/src/turboquant/__init__.py +29 -0
- turboquant_ml-0.1.0/src/turboquant/benchmark/__init__.py +21 -0
- turboquant_ml-0.1.0/src/turboquant/benchmark/accuracy.py +77 -0
- turboquant_ml-0.1.0/src/turboquant/benchmark/compare.py +104 -0
- turboquant_ml-0.1.0/src/turboquant/benchmark/latency.py +74 -0
- turboquant_ml-0.1.0/src/turboquant/benchmark/memory.py +61 -0
- turboquant_ml-0.1.0/src/turboquant/cli.py +249 -0
- turboquant_ml-0.1.0/src/turboquant/export/__init__.py +6 -0
- turboquant_ml-0.1.0/src/turboquant/export/onnx_export.py +77 -0
- turboquant_ml-0.1.0/src/turboquant/export/tensorrt_export.py +128 -0
- turboquant_ml-0.1.0/src/turboquant/models/__init__.py +51 -0
- turboquant_ml-0.1.0/src/turboquant/pruning/__init__.py +57 -0
- turboquant_ml-0.1.0/src/turboquant/pruning/magnitude.py +76 -0
- turboquant_ml-0.1.0/src/turboquant/pruning/nm_sparsity.py +49 -0
- turboquant_ml-0.1.0/src/turboquant/pruning/structured.py +119 -0
- turboquant_ml-0.1.0/src/turboquant/quantization/__init__.py +103 -0
- turboquant_ml-0.1.0/src/turboquant/quantization/awq_int.py +77 -0
- turboquant_ml-0.1.0/src/turboquant/quantization/bnb.py +144 -0
- turboquant_ml-0.1.0/src/turboquant/quantization/dtype_cast.py +36 -0
- turboquant_ml-0.1.0/src/turboquant/quantization/gptq_int.py +128 -0
- turboquant_ml-0.1.0/src/turboquant/quantization/int8_dynamic.py +42 -0
- turboquant_ml-0.1.0/src/turboquant/quantization/int8_static.py +89 -0
- turboquant_ml-0.1.0/src/turboquant/quantization/observers.py +80 -0
- turboquant_ml-0.1.0/src/turboquant/utils.py +52 -0
- turboquant_ml-0.1.0/tests/__init__.py +0 -0
- turboquant_ml-0.1.0/tests/conftest.py +56 -0
- turboquant_ml-0.1.0/tests/test_benchmark.py +55 -0
- turboquant_ml-0.1.0/tests/test_cli.py +25 -0
- turboquant_ml-0.1.0/tests/test_export.py +37 -0
- turboquant_ml-0.1.0/tests/test_pruning.py +63 -0
- turboquant_ml-0.1.0/tests/test_quantization.py +72 -0
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(find E:/turboquant -type f -not -path '*/\\\\.*')",
|
|
5
|
+
"Bash(python -c \"import ast, pathlib; [ast.parse\\(p.read_text\\(encoding='utf-8'\\)\\) for p in pathlib.Path\\('src'\\).rglob\\('*.py'\\)]; print\\('src OK'\\)\")",
|
|
6
|
+
"Bash(python -m py_compile src/turboquant/__init__.py)",
|
|
7
|
+
"Bash(python -c \"import ast, pathlib; [ast.parse\\(p.read_text\\(encoding='utf-8'\\)\\) for p in list\\(pathlib.Path\\('tests'\\).rglob\\('*.py'\\)\\) + list\\(pathlib.Path\\('examples'\\).rglob\\('*.py'\\)\\) + list\\(pathlib.Path\\('benchmarks'\\).rglob\\('*.py'\\)\\)]; print\\('tests+examples+benchmarks OK'\\)\")",
|
|
8
|
+
"Bash(git --version)",
|
|
9
|
+
"Bash(gh --version)",
|
|
10
|
+
"Bash(git config *)",
|
|
11
|
+
"Bash(rtk git *)",
|
|
12
|
+
"Bash(git init *)",
|
|
13
|
+
"Bash(git add *)",
|
|
14
|
+
"Bash(git commit -m ' *)",
|
|
15
|
+
"Bash(git remote *)",
|
|
16
|
+
"Bash(git ls-remote *)",
|
|
17
|
+
"Bash(git push *)",
|
|
18
|
+
"Bash(python -c \"import torch; print\\('torch', torch.__version__, 'cuda', torch.cuda.is_available\\(\\)\\)\")",
|
|
19
|
+
"Bash(python -m venv .venv)",
|
|
20
|
+
"Bash(.venv/Scripts/python.exe -m pip install --upgrade pip wheel)",
|
|
21
|
+
"Bash(.venv/Scripts/python.exe -m pip install --index-url https://download.pytorch.org/whl/cpu torch)",
|
|
22
|
+
"Bash(.venv/Scripts/python.exe -m pip install transformers numpy psutil typer rich pyyaml accelerate)",
|
|
23
|
+
"Bash(.venv/Scripts/python.exe -m pip install -e . --no-deps)",
|
|
24
|
+
"Bash(.venv/Scripts/python.exe -c ' *)",
|
|
25
|
+
"Bash(.venv/Scripts/python.exe -m pip install pytest)",
|
|
26
|
+
"Bash(.venv/Scripts/python.exe -m pytest -m \"not slow and not gpu\" -x --tb=short)",
|
|
27
|
+
"Bash(.venv/Scripts/python.exe -m pip install matplotlib)",
|
|
28
|
+
"Bash(.venv/Scripts/python.exe benchmarks/scripts/sweep_cpu.py --model-id HuggingFaceTB/SmolLM2-135M --methods fp32,fp16,bf16,int8-dynamic --warmup 2 --iters 8 --max-new-tokens 24 --out benchmarks/results/smollm2_135m.json --plot benchmarks/results/smollm2_135m.png)",
|
|
29
|
+
"Bash(HF_HUB_DISABLE_XET=1 .venv/Scripts/python.exe -c ' *)",
|
|
30
|
+
"Bash(.venv/Scripts/python.exe -m pip install \"huggingface_hub<1.0\")",
|
|
31
|
+
"Bash(.venv/Scripts/python.exe -m pip install \"transformers<5\")",
|
|
32
|
+
"Bash(CERT='E:\\\\turboquant\\\\.venv\\\\Lib\\\\site-packages\\\\certifi\\\\cacert.pem' SSL_CERT_FILE='E:\\\\turboquant\\\\.venv\\\\Lib\\\\site-packages\\\\certifi\\\\cacert.pem' REQUESTS_CA_BUNDLE='E:\\\\turboquant\\\\.venv\\\\Lib\\\\site-packages\\\\certifi\\\\cacert.pem' CURL_CA_BUNDLE='E:\\\\turboquant\\\\.venv\\\\Lib\\\\site-packages\\\\certifi\\\\cacert.pem' .venv/Scripts/python.exe -c ' *)",
|
|
33
|
+
"Bash(.venv/Scripts/python.exe -m pip install truststore)",
|
|
34
|
+
"Bash(.venv/Scripts/python.exe benchmarks/scripts/sweep_cpu.py --model-id gpt2 --methods fp32,fp16,bf16,int8-dynamic --warmup 2 --iters 8 --max-new-tokens 24 --out benchmarks/results/gpt2.json --plot benchmarks/results/gpt2.png)",
|
|
35
|
+
"Bash(PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe benchmarks/scripts/sweep_cpu.py --model-id gpt2 --methods fp32,fp16,bf16,int8-dynamic --warmup 2 --iters 8 --max-new-tokens 24 --out benchmarks/results/gpt2.json --plot benchmarks/results/gpt2.png)",
|
|
36
|
+
"Bash(env)",
|
|
37
|
+
"Bash(curl --version)",
|
|
38
|
+
"Bash(curl -s -o /dev/null -w \"%{http_code}\\\\n\" https://Ademo93.github.io/turboquant/)",
|
|
39
|
+
"Bash(.venv/Scripts/python.exe -m pip install mkdocs-material pymdown-extensions)",
|
|
40
|
+
"Bash(.venv/Scripts/python.exe -m mkdocs build --strict)",
|
|
41
|
+
"Bash(curl -sS \"https://api.github.com/repos/Ademo93/turboquant/actions/workflows\")",
|
|
42
|
+
"Bash(curl -sS -o /dev/null -w \"HTTP %{http_code} \\(final URL: %{url_effective}\\)\\\\n\" -L https://Ademo93.github.io/turboquant/)",
|
|
43
|
+
"Bash(curl -sS -o /dev/null -w \"HTTP %{http_code}\\\\n\" https://api.github.com/repos/Ademo93/turboquant/pages)",
|
|
44
|
+
"Bash(curl -sS \"https://api.github.com/repos/Ademo93/turboquant/actions/runs?per_page=5\")",
|
|
45
|
+
"Bash(curl -sS --ssl-no-revoke \"https://api.github.com/repos/Ademo93/turboquant/actions/workflows\")",
|
|
46
|
+
"Bash(curl -sS --ssl-no-revoke -o /dev/null -w \"HTTP %{http_code} \\(final URL: %{url_effective}\\)\\\\n\" -L https://Ademo93.github.io/turboquant/)",
|
|
47
|
+
"Bash(curl -sS --ssl-no-revoke -o /dev/null -w \"HTTP %{http_code}\\\\n\" https://api.github.com/repos/Ademo93/turboquant/pages)",
|
|
48
|
+
"Bash(curl -sS --ssl-no-revoke \"https://api.github.com/repos/Ademo93/turboquant/actions/runs?per_page=5\")",
|
|
49
|
+
"Bash(curl -sS --ssl-no-revoke https://api.github.com/repos/Ademo93/turboquant/actions/runs/27765595606/jobs)",
|
|
50
|
+
"Bash(curl -sS --ssl-no-revoke https://api.github.com/repos/Ademo93/turboquant/actions/runs/27766724363/jobs)",
|
|
51
|
+
"Bash(curl -sS --ssl-no-revoke \"https://api.github.com/repos/Ademo93/turboquant/actions/runs/27766724363/jobs\")",
|
|
52
|
+
"Bash(python -c ' *)",
|
|
53
|
+
"Bash(curl -sS --ssl-no-revoke \"https://api.github.com/repos/Ademo93/turboquant/actions/runs/27765595606/jobs\")",
|
|
54
|
+
"Bash(.venv/Scripts/python.exe -m pip install ruff)",
|
|
55
|
+
"Bash(.venv/Scripts/python.exe -m ruff check src tests)",
|
|
56
|
+
"Bash(.venv/Scripts/python.exe -m ruff format --check src tests)",
|
|
57
|
+
"Bash(curl -sSL --ssl-no-revoke -o /tmp/test_log.zip \"https://api.github.com/repos/Ademo93/turboquant/actions/jobs/82155364717/logs\")",
|
|
58
|
+
"Bash(.venv/Scripts/python.exe -m ruff check --fix src tests)",
|
|
59
|
+
"Bash(.venv/Scripts/python.exe -m ruff format src tests)",
|
|
60
|
+
"Bash(.venv/Scripts/python.exe -m ruff check --output-format=concise src tests)",
|
|
61
|
+
"Bash(PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe -m pytest -m \"not slow and not gpu\" --tb=short)",
|
|
62
|
+
"Bash(.venv/Scripts/python.exe -m pip install onnx onnxruntime onnxslim)",
|
|
63
|
+
"Bash(PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe -m pytest tests/test_export.py -x --tb=long)",
|
|
64
|
+
"Bash(.venv/Scripts/python.exe -m pip install onnxscript)",
|
|
65
|
+
"Bash(PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe -m pytest tests/test_export.py::test_onnx_dynamic_int8 --tb=long)",
|
|
66
|
+
"Bash(PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe -m pytest tests/test_export.py::test_onnx_dynamic_int8 --tb=long --no-header)",
|
|
67
|
+
"Bash(git rm *)",
|
|
68
|
+
"Bash(gh auth *)",
|
|
69
|
+
"Bash(curl -sS --ssl-no-revoke -o /dev/null -w \"HTTP %{http_code}\\\\n\" -L https://Ademo93.github.io/turboquant/)",
|
|
70
|
+
"Bash(curl -sS --ssl-no-revoke \"https://api.github.com/repos/Ademo93/turboquant/actions/runs?per_page=6\")",
|
|
71
|
+
"Bash(curl -sS --ssl-no-revoke https://api.github.com/users/Ademo93)",
|
|
72
|
+
"Bash(curl -sS --ssl-no-revoke \"https://api.github.com/users/Ademo93/repos?per_page=100&sort=pushed&direction=desc\")",
|
|
73
|
+
"Bash(curl -sS --ssl-no-revoke -i https://api.github.com/users/Ademo93)",
|
|
74
|
+
"Bash(curl -sS --ssl-no-revoke -A \"Mozilla/5.0\" \"https://github.com/Ademo93?tab=repositories\")",
|
|
75
|
+
"Bash(curl -sS --ssl-no-revoke -A \"Mozilla/5.0\" \"https://github.com/Ademo93?page=2&tab=repositories\")",
|
|
76
|
+
"Bash(curl -sS --ssl-no-revoke -A \"Mozilla/5.0\" \"https://github.com/Ademo93?page=3&tab=repositories\")",
|
|
77
|
+
"Bash(curl -sS --ssl-no-revoke -A \"Mozilla/5.0\" -o /tmp/profile.html \"https://github.com/Ademo93?tab=repositories\")",
|
|
78
|
+
"Read(//tmp/**)",
|
|
79
|
+
"Bash(cp /tmp/profile.html E:/turboquant/.venv/profile.html)",
|
|
80
|
+
"Bash(E:/turboquant/.venv/Scripts/python.exe -c ' *)",
|
|
81
|
+
"Bash(git reset *)",
|
|
82
|
+
"Bash(mkdir -p E:/Ademo93-profile)",
|
|
83
|
+
"Bash(mv E:/turboquant/profile-readme-staging/README.md E:/Ademo93-profile/README.md)",
|
|
84
|
+
"Read(//e//**)",
|
|
85
|
+
"Bash(git tag -a v0.1.0 -m 'v0.1.0 — first release *)",
|
|
86
|
+
"Bash(.venv/Scripts/python.exe -m pip install build twine)",
|
|
87
|
+
"Bash(.venv/Scripts/python.exe -m build)",
|
|
88
|
+
"Bash(.venv/Scripts/python.exe -m twine check dist/*)",
|
|
89
|
+
"Bash(curl -sS --ssl-no-revoke \"https://api.github.com/repos/Ademo93/turboquant/git/refs/tags\")",
|
|
90
|
+
"Bash(curl -sS --ssl-no-revoke -A \"Mozilla/5.0\" https://github.com/Ademo93/turboquant/releases)",
|
|
91
|
+
"Bash(curl -sS --ssl-no-revoke \"https://api.github.com/repos/Ademo93/turboquant/git/refs/tags/v0.1.0\")",
|
|
92
|
+
"Bash(curl *)"
|
|
93
|
+
]
|
|
94
|
+
}
|
|
95
|
+
}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
- uses: actions/setup-python@v5
|
|
15
|
+
with:
|
|
16
|
+
python-version: "3.11"
|
|
17
|
+
cache: pip
|
|
18
|
+
- name: Install dev tooling
|
|
19
|
+
run: |
|
|
20
|
+
python -m pip install --upgrade pip
|
|
21
|
+
pip install ruff mypy
|
|
22
|
+
- name: Ruff
|
|
23
|
+
run: ruff check src tests
|
|
24
|
+
- name: Ruff format check
|
|
25
|
+
run: ruff format --check src tests
|
|
26
|
+
- name: Mypy (non-blocking)
|
|
27
|
+
continue-on-error: true
|
|
28
|
+
run: mypy src/turboquant
|
|
29
|
+
|
|
30
|
+
test:
|
|
31
|
+
runs-on: ${{ matrix.os }}
|
|
32
|
+
strategy:
|
|
33
|
+
fail-fast: false
|
|
34
|
+
matrix:
|
|
35
|
+
os: [ubuntu-latest]
|
|
36
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
37
|
+
steps:
|
|
38
|
+
- uses: actions/checkout@v4
|
|
39
|
+
- uses: actions/setup-python@v5
|
|
40
|
+
with:
|
|
41
|
+
python-version: ${{ matrix.python-version }}
|
|
42
|
+
cache: pip
|
|
43
|
+
- name: Install
|
|
44
|
+
run: |
|
|
45
|
+
python -m pip install --upgrade pip
|
|
46
|
+
pip install -e ".[dev,onnx,viz]"
|
|
47
|
+
- name: Run tests
|
|
48
|
+
run: pytest -m "not slow and not gpu" --cov=turboquant --cov-report=xml
|
|
49
|
+
- name: Upload coverage
|
|
50
|
+
if: matrix.python-version == '3.11'
|
|
51
|
+
uses: codecov/codecov-action@v4
|
|
52
|
+
with:
|
|
53
|
+
files: coverage.xml
|
|
54
|
+
fail_ci_if_error: false
|
|
55
|
+
|
|
56
|
+
build:
|
|
57
|
+
runs-on: ubuntu-latest
|
|
58
|
+
needs: [lint, test]
|
|
59
|
+
steps:
|
|
60
|
+
- uses: actions/checkout@v4
|
|
61
|
+
- uses: actions/setup-python@v5
|
|
62
|
+
with:
|
|
63
|
+
python-version: "3.11"
|
|
64
|
+
- name: Build wheel
|
|
65
|
+
run: |
|
|
66
|
+
python -m pip install --upgrade pip build
|
|
67
|
+
python -m build
|
|
68
|
+
- uses: actions/upload-artifact@v4
|
|
69
|
+
with:
|
|
70
|
+
name: dist
|
|
71
|
+
path: dist/
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
name: docs
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
paths:
|
|
7
|
+
- docs/**
|
|
8
|
+
- mkdocs.yml
|
|
9
|
+
- .github/workflows/docs.yml
|
|
10
|
+
workflow_dispatch:
|
|
11
|
+
|
|
12
|
+
permissions:
|
|
13
|
+
contents: read
|
|
14
|
+
pages: write
|
|
15
|
+
id-token: write
|
|
16
|
+
|
|
17
|
+
concurrency:
|
|
18
|
+
group: pages
|
|
19
|
+
cancel-in-progress: false
|
|
20
|
+
|
|
21
|
+
jobs:
|
|
22
|
+
build:
|
|
23
|
+
runs-on: ubuntu-latest
|
|
24
|
+
steps:
|
|
25
|
+
- uses: actions/checkout@v4
|
|
26
|
+
- uses: actions/setup-python@v5
|
|
27
|
+
with:
|
|
28
|
+
python-version: "3.11"
|
|
29
|
+
cache: pip
|
|
30
|
+
- name: Install MkDocs Material
|
|
31
|
+
run: |
|
|
32
|
+
python -m pip install --upgrade pip
|
|
33
|
+
pip install mkdocs-material mkdocs-glightbox pymdown-extensions
|
|
34
|
+
- name: Build site
|
|
35
|
+
run: mkdocs build --strict
|
|
36
|
+
- uses: actions/upload-pages-artifact@v3
|
|
37
|
+
with:
|
|
38
|
+
path: site
|
|
39
|
+
|
|
40
|
+
deploy:
|
|
41
|
+
needs: build
|
|
42
|
+
runs-on: ubuntu-latest
|
|
43
|
+
environment:
|
|
44
|
+
name: github-pages
|
|
45
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
|
46
|
+
steps:
|
|
47
|
+
- id: deployment
|
|
48
|
+
uses: actions/deploy-pages@v4
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
|
|
7
|
+
# Distribution / packaging
|
|
8
|
+
.Python
|
|
9
|
+
build/
|
|
10
|
+
dist/
|
|
11
|
+
*.egg-info/
|
|
12
|
+
*.egg
|
|
13
|
+
wheels/
|
|
14
|
+
pip-wheel-metadata/
|
|
15
|
+
|
|
16
|
+
# Virtual environments
|
|
17
|
+
.venv/
|
|
18
|
+
venv/
|
|
19
|
+
env/
|
|
20
|
+
ENV/
|
|
21
|
+
|
|
22
|
+
# IDEs
|
|
23
|
+
.vscode/
|
|
24
|
+
.idea/
|
|
25
|
+
*.swp
|
|
26
|
+
.DS_Store
|
|
27
|
+
|
|
28
|
+
# Tooling
|
|
29
|
+
.pytest_cache/
|
|
30
|
+
.mypy_cache/
|
|
31
|
+
.ruff_cache/
|
|
32
|
+
.coverage
|
|
33
|
+
htmlcov/
|
|
34
|
+
|
|
35
|
+
# Notebooks
|
|
36
|
+
.ipynb_checkpoints/
|
|
37
|
+
|
|
38
|
+
# Models, datasets, weights
|
|
39
|
+
*.bin
|
|
40
|
+
*.safetensors
|
|
41
|
+
*.pt
|
|
42
|
+
*.pth
|
|
43
|
+
*.onnx
|
|
44
|
+
*.engine
|
|
45
|
+
*.gguf
|
|
46
|
+
models_cache/
|
|
47
|
+
hf_cache/
|
|
48
|
+
.cache/
|
|
49
|
+
|
|
50
|
+
# Benchmark output (gitignored by default; reference results are allowlisted below)
|
|
51
|
+
benchmarks/results/*.json
|
|
52
|
+
benchmarks/results/*.csv
|
|
53
|
+
benchmarks/results/*.png
|
|
54
|
+
!benchmarks/results/.gitkeep
|
|
55
|
+
!benchmarks/results/smollm2_135m.json
|
|
56
|
+
!benchmarks/results/smollm2_135m.png
|
|
57
|
+
!benchmarks/results/gpt2.json
|
|
58
|
+
!benchmarks/results/gpt2.png
|
|
59
|
+
|
|
60
|
+
# MkDocs build output
|
|
61
|
+
site/
|
|
62
|
+
|
|
63
|
+
# Logs
|
|
64
|
+
*.log
|
|
65
|
+
logs/
|
|
66
|
+
|
|
67
|
+
# Env files
|
|
68
|
+
.env
|
|
69
|
+
.env.local
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
3
|
+
rev: v0.6.9
|
|
4
|
+
hooks:
|
|
5
|
+
- id: ruff
|
|
6
|
+
args: [--fix]
|
|
7
|
+
- id: ruff-format
|
|
8
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
9
|
+
rev: v4.6.0
|
|
10
|
+
hooks:
|
|
11
|
+
- id: trailing-whitespace
|
|
12
|
+
- id: end-of-file-fixer
|
|
13
|
+
- id: check-yaml
|
|
14
|
+
- id: check-toml
|
|
15
|
+
- id: check-added-large-files
|
|
16
|
+
args: [--maxkb=512]
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. Format loosely
|
|
4
|
+
follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres
|
|
5
|
+
to [Semantic Versioning](https://semver.org).
|
|
6
|
+
|
|
7
|
+
## [Unreleased]
|
|
8
|
+
|
|
9
|
+
## [0.1.0] — 2026-06-18 — first release
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
- Unified `quantize(model, method=...)` dispatcher with FP16/BF16, INT8 dynamic
|
|
13
|
+
& static, BitsAndBytes (INT8 / NF4 / FP4), GPTQ, AWQ backends.
|
|
14
|
+
- Unified `prune(model, strategy=...)` dispatcher with magnitude, L1/L2
|
|
15
|
+
structured channel pruning, and 2:4 N:M sparsity.
|
|
16
|
+
- ONNX export with optional `onnxslim` graph optimization and ORT dynamic INT8
|
|
17
|
+
weight quantization.
|
|
18
|
+
- TensorRT engine builder with FP16 / INT8 calibration support.
|
|
19
|
+
- Benchmark helpers: latency (CUDA events + p95/p99), peak GPU/CPU memory,
|
|
20
|
+
serialized model size, sliding-window perplexity, top-k accuracy.
|
|
21
|
+
- Typer CLI with `quantize`, `prune`, `export`, `bench`, `methods` subcommands.
|
|
22
|
+
- pytest suite covering quantization, pruning, benchmark and CLI smoke paths.
|
|
23
|
+
- GitHub Actions CI (lint + tests on Python 3.10–3.12 + wheel build).
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 TurboQuant Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: turboquant-ml
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: TurboQuant — model quantization and optimization toolkit for edge and resource-constrained deployment.
|
|
5
|
+
Project-URL: Homepage, https://github.com/Ademo93/turboquant
|
|
6
|
+
Project-URL: Repository, https://github.com/Ademo93/turboquant
|
|
7
|
+
Project-URL: Issues, https://github.com/Ademo93/turboquant/issues
|
|
8
|
+
Author: TurboQuant Contributors
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: awq,bitsandbytes,edge-ai,gptq,llm,model-optimization,onnx,pruning,quantization,tensorrt
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Requires-Dist: accelerate>=0.30
|
|
23
|
+
Requires-Dist: datasets>=2.18
|
|
24
|
+
Requires-Dist: numpy>=1.24
|
|
25
|
+
Requires-Dist: psutil>=5.9
|
|
26
|
+
Requires-Dist: pyyaml>=6.0
|
|
27
|
+
Requires-Dist: rich>=13.7
|
|
28
|
+
Requires-Dist: torch>=2.2
|
|
29
|
+
Requires-Dist: transformers>=4.40
|
|
30
|
+
Requires-Dist: typer>=0.12
|
|
31
|
+
Provides-Extra: all
|
|
32
|
+
Requires-Dist: auto-gptq>=0.7.1; extra == 'all'
|
|
33
|
+
Requires-Dist: autoawq>=0.2.5; extra == 'all'
|
|
34
|
+
Requires-Dist: bitsandbytes>=0.43; extra == 'all'
|
|
35
|
+
Requires-Dist: evaluate>=0.4; extra == 'all'
|
|
36
|
+
Requires-Dist: lm-eval>=0.4.3; extra == 'all'
|
|
37
|
+
Requires-Dist: matplotlib>=3.8; extra == 'all'
|
|
38
|
+
Requires-Dist: onnx>=1.16; extra == 'all'
|
|
39
|
+
Requires-Dist: onnxruntime>=1.18; extra == 'all'
|
|
40
|
+
Requires-Dist: onnxscript>=0.1; extra == 'all'
|
|
41
|
+
Requires-Dist: onnxslim>=0.1; extra == 'all'
|
|
42
|
+
Requires-Dist: optimum>=1.20; extra == 'all'
|
|
43
|
+
Requires-Dist: pandas>=2.2; extra == 'all'
|
|
44
|
+
Provides-Extra: awq
|
|
45
|
+
Requires-Dist: autoawq>=0.2.5; extra == 'awq'
|
|
46
|
+
Provides-Extra: bnb
|
|
47
|
+
Requires-Dist: bitsandbytes>=0.43; extra == 'bnb'
|
|
48
|
+
Provides-Extra: dev
|
|
49
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
50
|
+
Requires-Dist: pre-commit>=3.7; extra == 'dev'
|
|
51
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
52
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
53
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
54
|
+
Provides-Extra: eval
|
|
55
|
+
Requires-Dist: evaluate>=0.4; extra == 'eval'
|
|
56
|
+
Requires-Dist: lm-eval>=0.4.3; extra == 'eval'
|
|
57
|
+
Provides-Extra: gptq
|
|
58
|
+
Requires-Dist: auto-gptq>=0.7.1; extra == 'gptq'
|
|
59
|
+
Requires-Dist: optimum>=1.20; extra == 'gptq'
|
|
60
|
+
Provides-Extra: onnx
|
|
61
|
+
Requires-Dist: onnx>=1.16; extra == 'onnx'
|
|
62
|
+
Requires-Dist: onnxruntime>=1.18; extra == 'onnx'
|
|
63
|
+
Requires-Dist: onnxscript>=0.1; extra == 'onnx'
|
|
64
|
+
Requires-Dist: onnxslim>=0.1; extra == 'onnx'
|
|
65
|
+
Provides-Extra: viz
|
|
66
|
+
Requires-Dist: matplotlib>=3.8; extra == 'viz'
|
|
67
|
+
Requires-Dist: pandas>=2.2; extra == 'viz'
|
|
68
|
+
Description-Content-Type: text/markdown
|
|
69
|
+
|
|
70
|
+
<h1 align="center">TurboQuant</h1>
|
|
71
|
+
|
|
72
|
+
<p align="center">
|
|
73
|
+
<strong>Model quantization & optimization toolkit for edge and resource-constrained deployment.</strong><br>
|
|
74
|
+
INT4 · INT8 · FP16 · GPTQ · AWQ · BitsAndBytes · Structured pruning · ONNX & TensorRT export
|
|
75
|
+
</p>
|
|
76
|
+
|
|
77
|
+
<p align="center">
|
|
78
|
+
<a href="#"><img alt="Python" src="https://img.shields.io/badge/python-3.10%2B-blue"></a>
|
|
79
|
+
<a href="#"><img alt="PyTorch" src="https://img.shields.io/badge/pytorch-2.2%2B-ee4c2c"></a>
|
|
80
|
+
<a href="#"><img alt="License" src="https://img.shields.io/badge/license-MIT-green"></a>
|
|
81
|
+
<a href="https://github.com/Ademo93/turboquant/actions/workflows/ci.yml"><img alt="CI" src="https://github.com/Ademo93/turboquant/actions/workflows/ci.yml/badge.svg"></a>
|
|
82
|
+
<a href="https://Ademo93.github.io/turboquant/"><img alt="Docs" src="https://img.shields.io/badge/docs-mkdocs--material-blue"></a>
|
|
83
|
+
<a href="#"><img alt="Status" src="https://img.shields.io/badge/status-beta-orange"></a>
|
|
84
|
+
</p>
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## Why TurboQuant?
|
|
89
|
+
|
|
90
|
+
Modern open-source models are powerful but expensive to serve. Shipping a 7B-parameter LLM in FP16 demands ~14 GB of VRAM; a vision transformer that fits comfortably on a workstation may blow up on a Jetson Orin or a phone. **TurboQuant gives you a single, consistent interface to compress, quantize, prune, export, and benchmark models** — so you can ship them on the hardware you actually have.
|
|
91
|
+
|
|
92
|
+
It is built around three principles:
|
|
93
|
+
|
|
94
|
+
1. **One API, many backends.** Wrap `bitsandbytes`, `auto-gptq`, `autoawq`, native PyTorch quantization, and ONNX/TensorRT export behind a uniform `quantize(model, method=...)` interface.
|
|
95
|
+
2. **Reproducible benchmarks.** Latency, peak memory, model size, and task accuracy (perplexity, classification top-1, etc.) are first-class citizens — every example ships with a comparable benchmark.
|
|
96
|
+
3. **No magic.** Each technique is implemented as a small, readable module so it doubles as a reference for how the methods work.
|
|
97
|
+
|
|
98
|
+
## Features
|
|
99
|
+
|
|
100
|
+
| Category | Techniques |
|
|
101
|
+
|---|---|
|
|
102
|
+
| **Weight quantization** | INT8 dynamic & static PTQ, FP16/BF16 casting, INT4 (bitsandbytes NF4 / FP4), GPTQ, AWQ |
|
|
103
|
+
| **Pruning** | Magnitude (unstructured), L1 structured (channel/filter), N:M sparsity helpers |
|
|
104
|
+
| **Export** | ONNX (with `onnxslim` graph optimization), TensorRT engine builder, ORT quantization |
|
|
105
|
+
| **Calibration** | Per-tensor & per-channel, MinMax / Entropy / Percentile observers |
|
|
106
|
+
| **Benchmark** | Latency (warmup + median + p95), peak GPU/CPU memory, throughput, model size, perplexity, top-k accuracy |
|
|
107
|
+
| **CLI** | `turboquant quantize`, `turboquant prune`, `turboquant export`, `turboquant bench` |
|
|
108
|
+
|
|
109
|
+
## Installation
|
|
110
|
+
|
|
111
|
+
The PyPI package is named **`turboquant-ml`** (the unsuffixed `turboquant`
|
|
112
|
+
name was taken by an unrelated project). The Python import and CLI are still
|
|
113
|
+
just `turboquant` / `tq`:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
# Core install
|
|
117
|
+
pip install turboquant-ml
|
|
118
|
+
|
|
119
|
+
# With ONNX export
|
|
120
|
+
pip install "turboquant-ml[onnx]"
|
|
121
|
+
|
|
122
|
+
# Full LLM compression stack (GPTQ + AWQ + bitsandbytes)
|
|
123
|
+
pip install "turboquant-ml[gptq,awq,bnb,eval]"
|
|
124
|
+
|
|
125
|
+
# Everything
|
|
126
|
+
pip install "turboquant-ml[all]"
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
import turboquant # import name unchanged
|
|
131
|
+
from turboquant import quantize # same API
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
> **Note** — `bitsandbytes`, `auto-gptq`, `autoawq` and `tensorrt` are heavy native dependencies. They are deliberately optional; TurboQuant degrades gracefully when they are missing.
|
|
135
|
+
|
|
136
|
+
## Quick start
|
|
137
|
+
|
|
138
|
+
### Python API
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from turboquant import quantize, benchmark
|
|
142
|
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
143
|
+
|
|
144
|
+
model_id = "meta-llama/Llama-3.2-1B"
|
|
145
|
+
tok = AutoTokenizer.from_pretrained(model_id)
|
|
146
|
+
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
|
|
147
|
+
|
|
148
|
+
# One-line INT4 weight-only quantization via bitsandbytes
|
|
149
|
+
qmodel = quantize(model, method="bnb-nf4")
|
|
150
|
+
|
|
151
|
+
# Benchmark side-by-side
|
|
152
|
+
report = benchmark.compare(
|
|
153
|
+
baseline=model,
|
|
154
|
+
candidate=qmodel,
|
|
155
|
+
tokenizer=tok,
|
|
156
|
+
prompts=["Explain quantization in one sentence."],
|
|
157
|
+
metrics=["latency", "memory", "size", "perplexity"],
|
|
158
|
+
)
|
|
159
|
+
print(report.as_table())
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### CLI
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
# Quantize a Hugging Face model to INT4 with GPTQ (W4A16: 4-bit weights, 16-bit activations)
|
|
166
|
+
tq quantize meta-llama/Llama-3.2-1B \
|
|
167
|
+
--method gptq \
|
|
168
|
+
--bits 4 \
|
|
169
|
+
--group-size 128 \
|
|
170
|
+
--calib-dataset wikitext \
|
|
171
|
+
--out ./outputs/llama-3.2-1b-gptq
|
|
172
|
+
|
|
173
|
+
# Structured prune a vision model and re-evaluate
|
|
174
|
+
tq prune microsoft/resnet-50 \
|
|
175
|
+
--strategy l1-channel \
|
|
176
|
+
--sparsity 0.30 \
|
|
177
|
+
--eval imagenet-val \
|
|
178
|
+
--out ./outputs/resnet50-pruned
|
|
179
|
+
|
|
180
|
+
# Export to ONNX with INT8 dynamic quantization
|
|
181
|
+
tq export ./outputs/resnet50-pruned \
|
|
182
|
+
--format onnx \
|
|
183
|
+
--quant int8-dynamic \
|
|
184
|
+
--opset 17
|
|
185
|
+
|
|
186
|
+
# Benchmark FP16 vs INT8 vs INT4 on a model
|
|
187
|
+
tq bench meta-llama/Llama-3.2-1B --methods fp16,int8-dynamic,bnb-nf4 \
|
|
188
|
+
--report ./benchmarks/results/llama32-1b.json
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## Supported methods at a glance
|
|
192
|
+
|
|
193
|
+
| Method | Bits | Backend | Calibration | Typical use case |
|
|
194
|
+
|---|---|---|---|---|
|
|
195
|
+
| `fp16` / `bf16` | 16 | PyTorch | none | Fast, lossless-ish baseline |
|
|
196
|
+
| `int8-dynamic` | 8 | PyTorch | none | CPU inference, transformers |
|
|
197
|
+
| `int8-static` | 8 | PyTorch | required | CNNs, edge CPUs |
|
|
198
|
+
| `bnb-int8` | 8 | bitsandbytes | none | LLM training & serving on GPU |
|
|
199
|
+
| `bnb-nf4` / `bnb-fp4` | 4 | bitsandbytes | none | LLM inference, QLoRA |
|
|
200
|
+
| `gptq` | 2–8 | auto-gptq | required | LLM weight-only, best accuracy/bit |
|
|
201
|
+
| `awq` | 4 | autoawq | required | LLM weight-only, fast inference |
|
|
202
|
+
|
|
203
|
+
## Reference benchmarks
|
|
204
|
+
|
|
205
|
+
### SmolLM2-135M on CPU (real measured numbers)
|
|
206
|
+
|
|
207
|
+
`python benchmarks/scripts/sweep_cpu.py --model-id HuggingFaceTB/SmolLM2-135M --methods fp32,fp16,bf16,int8-dynamic`
|
|
208
|
+
|
|
209
|
+
| Method | Size (MB) | Forward latency (ms) | Generation throughput (tok/s) |
|
|
210
|
+
|---|---:|---:|---:|
|
|
211
|
+
| FP32 (baseline) | 513.2 | 31.3 | 32.6 |
|
|
212
|
+
| FP16 | 256.7 | 57.2 | 47.5 |
|
|
213
|
+
| BF16 | 256.7 | 55.4 | **48.9** |
|
|
214
|
+
| INT8 dynamic | **236.6** | **30.7** | 30.0 |
|
|
215
|
+
|
|
216
|
+
Read this carefully — the result is realistic, not flattering:
|
|
217
|
+
|
|
218
|
+
- **FP16/BF16 cut size in half**, and *generation* throughput goes **up ~50%**
|
|
219
|
+
(smaller KV cache wins), but the per-step forward pass is **~1.8× slower**
|
|
220
|
+
because consumer CPUs have no fast FP16 matmul kernel. On a Tensor-Core GPU
|
|
221
|
+
these numbers flip.
|
|
222
|
+
- **INT8 dynamic is the smallest** (≈54 % off) and matches FP32 forward
|
|
223
|
+
latency, but generation throughput is no better than FP32 here — the small
|
|
224
|
+
hidden size of a 135 M model limits how much INT8 GEMM kernels can help.
|
|
225
|
+
- The right baseline matters: comparing INT8 to a poorly-quantizable
|
|
226
|
+
reference (e.g. GPT-2, which uses `transformers.Conv1D` instead of
|
|
227
|
+
`nn.Linear`) makes INT8 look bad. Always check what your method actually
|
|
228
|
+
rewrites — `tq methods` plus `print(model)` will tell you.
|
|
229
|
+
|
|
230
|
+

|
|
231
|
+
|
|
232
|
+
### Reproduce
|
|
233
|
+
|
|
234
|
+
```bash
|
|
235
|
+
pip install -e ".[viz]" truststore
|
|
236
|
+
python benchmarks/scripts/sweep_cpu.py \
|
|
237
|
+
--model-id HuggingFaceTB/SmolLM2-135M \
|
|
238
|
+
--methods fp32,fp16,bf16,int8-dynamic \
|
|
239
|
+
--out benchmarks/results/smollm2_135m.json \
|
|
240
|
+
--plot benchmarks/results/smollm2_135m.png
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
GPU sweeps (Llama-class models with GPTQ / AWQ / NF4) will land here once a CUDA
|
|
244
|
+
runner is added to CI — contributions welcome.
|
|
245
|
+
|
|
246
|
+
## Architecture
|
|
247
|
+
|
|
248
|
+
```
|
|
249
|
+
turboquant/
|
|
250
|
+
├── quantization/ # Algorithms: int8, fp16, gptq, awq, bnb, observers
|
|
251
|
+
├── pruning/ # Magnitude + structured (L1, L2, taylor) + N:M
|
|
252
|
+
├── export/ # ONNX, TensorRT, ORT quantization
|
|
253
|
+
├── benchmark/ # Latency, memory, perplexity, classification, plot
|
|
254
|
+
├── calibration/ # Datasets, dataloaders, observer fitting (not shipped in 0.1.0; observers live in quantization/observers.py)
|
|
255
|
+
├── models/ # Convenience loaders + registry
|
|
256
|
+
└── cli.py # Typer-based CLI
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
Each algorithm lives in a single, readable file with a `quantize_*` / `prune_*` function and a short docstring referencing the original paper.
|
|
260
|
+
|
|
261
|
+
## Roadmap
|
|
262
|
+
|
|
263
|
+
- [x] INT8 dynamic & static PTQ (PyTorch native)
|
|
264
|
+
- [x] FP16/BF16 casting
|
|
265
|
+
- [x] BitsAndBytes INT8 / NF4 / FP4 wrappers
|
|
266
|
+
- [x] GPTQ & AWQ integration
|
|
267
|
+
- [x] L1 structured & magnitude pruning
|
|
268
|
+
- [x] ONNX export with `onnxslim`
|
|
269
|
+
- [x] Latency / memory / perplexity benchmarks
|
|
270
|
+
- [ ] TensorRT INT8 calibration cache
|
|
271
|
+
- [ ] SmoothQuant W8A8
|
|
272
|
+
- [ ] HQQ (Half-Quadratic Quantization)
|
|
273
|
+
- [ ] Distillation-aware quantization
|
|
274
|
+
- [ ] Mobile export (CoreML / TFLite)
|
|
275
|
+
- [ ] Web dashboard for benchmark comparison
|
|
276
|
+
|
|
277
|
+
## Citing & related work
|
|
278
|
+
|
|
279
|
+
TurboQuant stands on the shoulders of giants. If you use it in research, please also cite the underlying algorithms:
|
|
280
|
+
|
|
281
|
+
- **GPTQ** — Frantar et al., 2023 (arXiv:2210.17323)
|
|
282
|
+
- **AWQ** — Lin et al., 2023 (arXiv:2306.00978)
|
|
283
|
+
- **LLM.int8()** / **QLoRA** — Dettmers et al., 2022 / 2023 (arXiv:2208.07339, 2305.14314)
|
|
284
|
+
- **SmoothQuant** — Xiao et al., 2022 (arXiv:2211.10438)
|
|
285
|
+
|
|
286
|
+
## Contributing
|
|
287
|
+
|
|
288
|
+
Contributions are very welcome — see [`CONTRIBUTING.md`](docs/CONTRIBUTING.md). Good first issues are tagged on the issue tracker.
|
|
289
|
+
|
|
290
|
+
```bash
|
|
291
|
+
git clone https://github.com/Ademo93/turboquant
|
|
292
|
+
cd turboquant
|
|
293
|
+
pip install -e ".[dev,all]"
|
|
294
|
+
pre-commit install
|
|
295
|
+
pytest
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
## License
|
|
299
|
+
|
|
300
|
+
[MIT](LICENSE) — do whatever you like, just keep the copyright notice.
|