turboquant-ml 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. turboquant_ml-0.1.0/.claude/settings.local.json +95 -0
  2. turboquant_ml-0.1.0/.github/workflows/ci.yml +71 -0
  3. turboquant_ml-0.1.0/.github/workflows/docs.yml +48 -0
  4. turboquant_ml-0.1.0/.gitignore +69 -0
  5. turboquant_ml-0.1.0/.pre-commit-config.yaml +16 -0
  6. turboquant_ml-0.1.0/CHANGELOG.md +23 -0
  7. turboquant_ml-0.1.0/LICENSE +21 -0
  8. turboquant_ml-0.1.0/PKG-INFO +300 -0
  9. turboquant_ml-0.1.0/README.md +231 -0
  10. turboquant_ml-0.1.0/benchmarks/results/.gitkeep +0 -0
  11. turboquant_ml-0.1.0/benchmarks/results/README.md +23 -0
  12. turboquant_ml-0.1.0/benchmarks/results/gpt2.json +48 -0
  13. turboquant_ml-0.1.0/benchmarks/results/gpt2.png +0 -0
  14. turboquant_ml-0.1.0/benchmarks/results/smollm2_135m.json +48 -0
  15. turboquant_ml-0.1.0/benchmarks/results/smollm2_135m.png +0 -0
  16. turboquant_ml-0.1.0/benchmarks/scripts/sweep_cpu.py +170 -0
  17. turboquant_ml-0.1.0/benchmarks/scripts/sweep_llm.py +63 -0
  18. turboquant_ml-0.1.0/docs/CONTRIBUTING.md +58 -0
  19. turboquant_ml-0.1.0/docs/benchmarks.md +54 -0
  20. turboquant_ml-0.1.0/docs/index.md +79 -0
  21. turboquant_ml-0.1.0/docs/pruning.md +54 -0
  22. turboquant_ml-0.1.0/docs/quantization.md +101 -0
  23. turboquant_ml-0.1.0/examples/bench_compare.py +50 -0
  24. turboquant_ml-0.1.0/examples/prune_resnet.py +49 -0
  25. turboquant_ml-0.1.0/examples/quantize_llm_gptq.py +39 -0
  26. turboquant_ml-0.1.0/examples/quantize_llm_int4.py +44 -0
  27. turboquant_ml-0.1.0/mkdocs.yml +73 -0
  28. turboquant_ml-0.1.0/pyproject.toml +99 -0
  29. turboquant_ml-0.1.0/src/turboquant/__init__.py +29 -0
  30. turboquant_ml-0.1.0/src/turboquant/benchmark/__init__.py +21 -0
  31. turboquant_ml-0.1.0/src/turboquant/benchmark/accuracy.py +77 -0
  32. turboquant_ml-0.1.0/src/turboquant/benchmark/compare.py +104 -0
  33. turboquant_ml-0.1.0/src/turboquant/benchmark/latency.py +74 -0
  34. turboquant_ml-0.1.0/src/turboquant/benchmark/memory.py +61 -0
  35. turboquant_ml-0.1.0/src/turboquant/cli.py +249 -0
  36. turboquant_ml-0.1.0/src/turboquant/export/__init__.py +6 -0
  37. turboquant_ml-0.1.0/src/turboquant/export/onnx_export.py +77 -0
  38. turboquant_ml-0.1.0/src/turboquant/export/tensorrt_export.py +128 -0
  39. turboquant_ml-0.1.0/src/turboquant/models/__init__.py +51 -0
  40. turboquant_ml-0.1.0/src/turboquant/pruning/__init__.py +57 -0
  41. turboquant_ml-0.1.0/src/turboquant/pruning/magnitude.py +76 -0
  42. turboquant_ml-0.1.0/src/turboquant/pruning/nm_sparsity.py +49 -0
  43. turboquant_ml-0.1.0/src/turboquant/pruning/structured.py +119 -0
  44. turboquant_ml-0.1.0/src/turboquant/quantization/__init__.py +103 -0
  45. turboquant_ml-0.1.0/src/turboquant/quantization/awq_int.py +77 -0
  46. turboquant_ml-0.1.0/src/turboquant/quantization/bnb.py +144 -0
  47. turboquant_ml-0.1.0/src/turboquant/quantization/dtype_cast.py +36 -0
  48. turboquant_ml-0.1.0/src/turboquant/quantization/gptq_int.py +128 -0
  49. turboquant_ml-0.1.0/src/turboquant/quantization/int8_dynamic.py +42 -0
  50. turboquant_ml-0.1.0/src/turboquant/quantization/int8_static.py +89 -0
  51. turboquant_ml-0.1.0/src/turboquant/quantization/observers.py +80 -0
  52. turboquant_ml-0.1.0/src/turboquant/utils.py +52 -0
  53. turboquant_ml-0.1.0/tests/__init__.py +0 -0
  54. turboquant_ml-0.1.0/tests/conftest.py +56 -0
  55. turboquant_ml-0.1.0/tests/test_benchmark.py +55 -0
  56. turboquant_ml-0.1.0/tests/test_cli.py +25 -0
  57. turboquant_ml-0.1.0/tests/test_export.py +37 -0
  58. turboquant_ml-0.1.0/tests/test_pruning.py +63 -0
  59. turboquant_ml-0.1.0/tests/test_quantization.py +72 -0
@@ -0,0 +1,95 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(find E:/turboquant -type f -not -path '*/\\\\.*')",
5
+ "Bash(python -c \"import ast, pathlib; [ast.parse\\(p.read_text\\(encoding='utf-8'\\)\\) for p in pathlib.Path\\('src'\\).rglob\\('*.py'\\)]; print\\('src OK'\\)\")",
6
+ "Bash(python -m py_compile src/turboquant/__init__.py)",
7
+ "Bash(python -c \"import ast, pathlib; [ast.parse\\(p.read_text\\(encoding='utf-8'\\)\\) for p in list\\(pathlib.Path\\('tests'\\).rglob\\('*.py'\\)\\) + list\\(pathlib.Path\\('examples'\\).rglob\\('*.py'\\)\\) + list\\(pathlib.Path\\('benchmarks'\\).rglob\\('*.py'\\)\\)]; print\\('tests+examples+benchmarks OK'\\)\")",
8
+ "Bash(git --version)",
9
+ "Bash(gh --version)",
10
+ "Bash(git config *)",
11
+ "Bash(rtk git *)",
12
+ "Bash(git init *)",
13
+ "Bash(git add *)",
14
+ "Bash(git commit -m ' *)",
15
+ "Bash(git remote *)",
16
+ "Bash(git ls-remote *)",
17
+ "Bash(git push *)",
18
+ "Bash(python -c \"import torch; print\\('torch', torch.__version__, 'cuda', torch.cuda.is_available\\(\\)\\)\")",
19
+ "Bash(python -m venv .venv)",
20
+ "Bash(.venv/Scripts/python.exe -m pip install --upgrade pip wheel)",
21
+ "Bash(.venv/Scripts/python.exe -m pip install --index-url https://download.pytorch.org/whl/cpu torch)",
22
+ "Bash(.venv/Scripts/python.exe -m pip install transformers numpy psutil typer rich pyyaml accelerate)",
23
+ "Bash(.venv/Scripts/python.exe -m pip install -e . --no-deps)",
24
+ "Bash(.venv/Scripts/python.exe -c ' *)",
25
+ "Bash(.venv/Scripts/python.exe -m pip install pytest)",
26
+ "Bash(.venv/Scripts/python.exe -m pytest -m \"not slow and not gpu\" -x --tb=short)",
27
+ "Bash(.venv/Scripts/python.exe -m pip install matplotlib)",
28
+ "Bash(.venv/Scripts/python.exe benchmarks/scripts/sweep_cpu.py --model-id HuggingFaceTB/SmolLM2-135M --methods fp32,fp16,bf16,int8-dynamic --warmup 2 --iters 8 --max-new-tokens 24 --out benchmarks/results/smollm2_135m.json --plot benchmarks/results/smollm2_135m.png)",
29
+ "Bash(HF_HUB_DISABLE_XET=1 .venv/Scripts/python.exe -c ' *)",
30
+ "Bash(.venv/Scripts/python.exe -m pip install \"huggingface_hub<1.0\")",
31
+ "Bash(.venv/Scripts/python.exe -m pip install \"transformers<5\")",
32
+ "Bash(CERT='E:\\\\turboquant\\\\.venv\\\\Lib\\\\site-packages\\\\certifi\\\\cacert.pem' SSL_CERT_FILE='E:\\\\turboquant\\\\.venv\\\\Lib\\\\site-packages\\\\certifi\\\\cacert.pem' REQUESTS_CA_BUNDLE='E:\\\\turboquant\\\\.venv\\\\Lib\\\\site-packages\\\\certifi\\\\cacert.pem' CURL_CA_BUNDLE='E:\\\\turboquant\\\\.venv\\\\Lib\\\\site-packages\\\\certifi\\\\cacert.pem' .venv/Scripts/python.exe -c ' *)",
33
+ "Bash(.venv/Scripts/python.exe -m pip install truststore)",
34
+ "Bash(.venv/Scripts/python.exe benchmarks/scripts/sweep_cpu.py --model-id gpt2 --methods fp32,fp16,bf16,int8-dynamic --warmup 2 --iters 8 --max-new-tokens 24 --out benchmarks/results/gpt2.json --plot benchmarks/results/gpt2.png)",
35
+ "Bash(PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe benchmarks/scripts/sweep_cpu.py --model-id gpt2 --methods fp32,fp16,bf16,int8-dynamic --warmup 2 --iters 8 --max-new-tokens 24 --out benchmarks/results/gpt2.json --plot benchmarks/results/gpt2.png)",
36
+ "Bash(env)",
37
+ "Bash(curl --version)",
38
+ "Bash(curl -s -o /dev/null -w \"%{http_code}\\\\n\" https://Ademo93.github.io/turboquant/)",
39
+ "Bash(.venv/Scripts/python.exe -m pip install mkdocs-material pymdown-extensions)",
40
+ "Bash(.venv/Scripts/python.exe -m mkdocs build --strict)",
41
+ "Bash(curl -sS \"https://api.github.com/repos/Ademo93/turboquant/actions/workflows\")",
42
+ "Bash(curl -sS -o /dev/null -w \"HTTP %{http_code} \\(final URL: %{url_effective}\\)\\\\n\" -L https://Ademo93.github.io/turboquant/)",
43
+ "Bash(curl -sS -o /dev/null -w \"HTTP %{http_code}\\\\n\" https://api.github.com/repos/Ademo93/turboquant/pages)",
44
+ "Bash(curl -sS \"https://api.github.com/repos/Ademo93/turboquant/actions/runs?per_page=5\")",
45
+ "Bash(curl -sS --ssl-no-revoke \"https://api.github.com/repos/Ademo93/turboquant/actions/workflows\")",
46
+ "Bash(curl -sS --ssl-no-revoke -o /dev/null -w \"HTTP %{http_code} \\(final URL: %{url_effective}\\)\\\\n\" -L https://Ademo93.github.io/turboquant/)",
47
+ "Bash(curl -sS --ssl-no-revoke -o /dev/null -w \"HTTP %{http_code}\\\\n\" https://api.github.com/repos/Ademo93/turboquant/pages)",
48
+ "Bash(curl -sS --ssl-no-revoke \"https://api.github.com/repos/Ademo93/turboquant/actions/runs?per_page=5\")",
49
+ "Bash(curl -sS --ssl-no-revoke https://api.github.com/repos/Ademo93/turboquant/actions/runs/27765595606/jobs)",
50
+ "Bash(curl -sS --ssl-no-revoke https://api.github.com/repos/Ademo93/turboquant/actions/runs/27766724363/jobs)",
51
+ "Bash(curl -sS --ssl-no-revoke \"https://api.github.com/repos/Ademo93/turboquant/actions/runs/27766724363/jobs\")",
52
+ "Bash(python -c ' *)",
53
+ "Bash(curl -sS --ssl-no-revoke \"https://api.github.com/repos/Ademo93/turboquant/actions/runs/27765595606/jobs\")",
54
+ "Bash(.venv/Scripts/python.exe -m pip install ruff)",
55
+ "Bash(.venv/Scripts/python.exe -m ruff check src tests)",
56
+ "Bash(.venv/Scripts/python.exe -m ruff format --check src tests)",
57
+ "Bash(curl -sSL --ssl-no-revoke -o /tmp/test_log.zip \"https://api.github.com/repos/Ademo93/turboquant/actions/jobs/82155364717/logs\")",
58
+ "Bash(.venv/Scripts/python.exe -m ruff check --fix src tests)",
59
+ "Bash(.venv/Scripts/python.exe -m ruff format src tests)",
60
+ "Bash(.venv/Scripts/python.exe -m ruff check --output-format=concise src tests)",
61
+ "Bash(PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe -m pytest -m \"not slow and not gpu\" --tb=short)",
62
+ "Bash(.venv/Scripts/python.exe -m pip install onnx onnxruntime onnxslim)",
63
+ "Bash(PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe -m pytest tests/test_export.py -x --tb=long)",
64
+ "Bash(.venv/Scripts/python.exe -m pip install onnxscript)",
65
+ "Bash(PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe -m pytest tests/test_export.py::test_onnx_dynamic_int8 --tb=long)",
66
+ "Bash(PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe -m pytest tests/test_export.py::test_onnx_dynamic_int8 --tb=long --no-header)",
67
+ "Bash(git rm *)",
68
+ "Bash(gh auth *)",
69
+ "Bash(curl -sS --ssl-no-revoke -o /dev/null -w \"HTTP %{http_code}\\\\n\" -L https://Ademo93.github.io/turboquant/)",
70
+ "Bash(curl -sS --ssl-no-revoke \"https://api.github.com/repos/Ademo93/turboquant/actions/runs?per_page=6\")",
71
+ "Bash(curl -sS --ssl-no-revoke https://api.github.com/users/Ademo93)",
72
+ "Bash(curl -sS --ssl-no-revoke \"https://api.github.com/users/Ademo93/repos?per_page=100&sort=pushed&direction=desc\")",
73
+ "Bash(curl -sS --ssl-no-revoke -i https://api.github.com/users/Ademo93)",
74
+ "Bash(curl -sS --ssl-no-revoke -A \"Mozilla/5.0\" \"https://github.com/Ademo93?tab=repositories\")",
75
+ "Bash(curl -sS --ssl-no-revoke -A \"Mozilla/5.0\" \"https://github.com/Ademo93?page=2&tab=repositories\")",
76
+ "Bash(curl -sS --ssl-no-revoke -A \"Mozilla/5.0\" \"https://github.com/Ademo93?page=3&tab=repositories\")",
77
+ "Bash(curl -sS --ssl-no-revoke -A \"Mozilla/5.0\" -o /tmp/profile.html \"https://github.com/Ademo93?tab=repositories\")",
78
+ "Read(//tmp/**)",
79
+ "Bash(cp /tmp/profile.html E:/turboquant/.venv/profile.html)",
80
+ "Bash(E:/turboquant/.venv/Scripts/python.exe -c ' *)",
81
+ "Bash(git reset *)",
82
+ "Bash(mkdir -p E:/Ademo93-profile)",
83
+ "Bash(mv E:/turboquant/profile-readme-staging/README.md E:/Ademo93-profile/README.md)",
84
+ "Read(//e//**)",
85
+ "Bash(git tag -a v0.1.0 -m 'v0.1.0 — first release *)",
86
+ "Bash(.venv/Scripts/python.exe -m pip install build twine)",
87
+ "Bash(.venv/Scripts/python.exe -m build)",
88
+ "Bash(.venv/Scripts/python.exe -m twine check dist/*)",
89
+ "Bash(curl -sS --ssl-no-revoke \"https://api.github.com/repos/Ademo93/turboquant/git/refs/tags\")",
90
+ "Bash(curl -sS --ssl-no-revoke -A \"Mozilla/5.0\" https://github.com/Ademo93/turboquant/releases)",
91
+ "Bash(curl -sS --ssl-no-revoke \"https://api.github.com/repos/Ademo93/turboquant/git/refs/tags/v0.1.0\")",
92
+ "Bash(curl *)"
93
+ ]
94
+ }
95
+ }
@@ -0,0 +1,71 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ lint:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ - uses: actions/setup-python@v5
15
+ with:
16
+ python-version: "3.11"
17
+ cache: pip
18
+ - name: Install dev tooling
19
+ run: |
20
+ python -m pip install --upgrade pip
21
+ pip install ruff mypy
22
+ - name: Ruff
23
+ run: ruff check src tests
24
+ - name: Ruff format check
25
+ run: ruff format --check src tests
26
+ - name: Mypy (non-blocking)
27
+ continue-on-error: true
28
+ run: mypy src/turboquant
29
+
30
+ test:
31
+ runs-on: ${{ matrix.os }}
32
+ strategy:
33
+ fail-fast: false
34
+ matrix:
35
+ os: [ubuntu-latest]
36
+ python-version: ["3.10", "3.11", "3.12"]
37
+ steps:
38
+ - uses: actions/checkout@v4
39
+ - uses: actions/setup-python@v5
40
+ with:
41
+ python-version: ${{ matrix.python-version }}
42
+ cache: pip
43
+ - name: Install
44
+ run: |
45
+ python -m pip install --upgrade pip
46
+ pip install -e ".[dev,onnx,viz]"
47
+ - name: Run tests
48
+ run: pytest -m "not slow and not gpu" --cov=turboquant --cov-report=xml
49
+ - name: Upload coverage
50
+ if: matrix.python-version == '3.11'
51
+ uses: codecov/codecov-action@v4
52
+ with:
53
+ files: coverage.xml
54
+ fail_ci_if_error: false
55
+
56
+ build:
57
+ runs-on: ubuntu-latest
58
+ needs: [lint, test]
59
+ steps:
60
+ - uses: actions/checkout@v4
61
+ - uses: actions/setup-python@v5
62
+ with:
63
+ python-version: "3.11"
64
+ - name: Build wheel
65
+ run: |
66
+ python -m pip install --upgrade pip build
67
+ python -m build
68
+ - uses: actions/upload-artifact@v4
69
+ with:
70
+ name: dist
71
+ path: dist/
@@ -0,0 +1,48 @@
1
+ name: docs
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ paths:
7
+ - docs/**
8
+ - mkdocs.yml
9
+ - .github/workflows/docs.yml
10
+ workflow_dispatch:
11
+
12
+ permissions:
13
+ contents: read
14
+ pages: write
15
+ id-token: write
16
+
17
+ concurrency:
18
+ group: pages
19
+ cancel-in-progress: false
20
+
21
+ jobs:
22
+ build:
23
+ runs-on: ubuntu-latest
24
+ steps:
25
+ - uses: actions/checkout@v4
26
+ - uses: actions/setup-python@v5
27
+ with:
28
+ python-version: "3.11"
29
+ cache: pip
30
+ - name: Install MkDocs Material
31
+ run: |
32
+ python -m pip install --upgrade pip
33
+ pip install mkdocs-material mkdocs-glightbox pymdown-extensions
34
+ - name: Build site
35
+ run: mkdocs build --strict
36
+ - uses: actions/upload-pages-artifact@v3
37
+ with:
38
+ path: site
39
+
40
+ deploy:
41
+ needs: build
42
+ runs-on: ubuntu-latest
43
+ environment:
44
+ name: github-pages
45
+ url: ${{ steps.deployment.outputs.page_url }}
46
+ steps:
47
+ - id: deployment
48
+ uses: actions/deploy-pages@v4
@@ -0,0 +1,69 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+
7
+ # Distribution / packaging
8
+ .Python
9
+ build/
10
+ dist/
11
+ *.egg-info/
12
+ *.egg
13
+ wheels/
14
+ pip-wheel-metadata/
15
+
16
+ # Virtual environments
17
+ .venv/
18
+ venv/
19
+ env/
20
+ ENV/
21
+
22
+ # IDEs
23
+ .vscode/
24
+ .idea/
25
+ *.swp
26
+ .DS_Store
27
+
28
+ # Tooling
29
+ .pytest_cache/
30
+ .mypy_cache/
31
+ .ruff_cache/
32
+ .coverage
33
+ htmlcov/
34
+
35
+ # Notebooks
36
+ .ipynb_checkpoints/
37
+
38
+ # Models, datasets, weights
39
+ *.bin
40
+ *.safetensors
41
+ *.pt
42
+ *.pth
43
+ *.onnx
44
+ *.engine
45
+ *.gguf
46
+ models_cache/
47
+ hf_cache/
48
+ .cache/
49
+
50
+ # Benchmarks output (gitignored by default; reference results are allowlisted below)
51
+ benchmarks/results/*.json
52
+ benchmarks/results/*.csv
53
+ benchmarks/results/*.png
54
+ !benchmarks/results/.gitkeep
55
+ !benchmarks/results/smollm2_135m.json
56
+ !benchmarks/results/smollm2_135m.png
57
+ !benchmarks/results/gpt2.json
58
+ !benchmarks/results/gpt2.png
59
+
60
+ # MkDocs build output
61
+ site/
62
+
63
+ # Logs
64
+ *.log
65
+ logs/
66
+
67
+ # Env files
68
+ .env
69
+ .env.local
@@ -0,0 +1,16 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.6.9
4
+ hooks:
5
+ - id: ruff
6
+ args: [--fix]
7
+ - id: ruff-format
8
+ - repo: https://github.com/pre-commit/pre-commit-hooks
9
+ rev: v4.6.0
10
+ hooks:
11
+ - id: trailing-whitespace
12
+ - id: end-of-file-fixer
13
+ - id: check-yaml
14
+ - id: check-toml
15
+ - id: check-added-large-files
16
+ args: [--maxkb=512]
@@ -0,0 +1,23 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here. The format loosely
4
+ follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project
5
+ adheres to [Semantic Versioning](https://semver.org).
6
+
7
+ ## [Unreleased]
8
+
9
+ ## [0.1.0] — 2026-06-18 — first release
10
+
11
+ ### Added
12
+ - Unified `quantize(model, method=...)` dispatcher with FP16/BF16, INT8 dynamic
13
+ & static, BitsAndBytes (INT8 / NF4 / FP4), GPTQ, AWQ backends.
14
+ - Unified `prune(model, strategy=...)` dispatcher with magnitude, L1/L2
15
+ structured channel pruning, and 2:4 N:M sparsity.
16
+ - ONNX export with optional `onnxslim` graph optimization and ORT dynamic INT8
17
+ weight quantization.
18
+ - TensorRT engine builder with FP16 / INT8 calibration support.
19
+ - Benchmark helpers: latency (CUDA events + p95/p99), peak GPU/CPU memory,
20
+ serialized model size, sliding-window perplexity, top-k accuracy.
21
+ - Typer CLI with `quantize`, `prune`, `export`, `bench`, `methods` subcommands.
22
+ - pytest suite covering quantization, pruning, benchmark and CLI smoke paths.
23
+ - GitHub Actions CI (lint + tests on Python 3.10–3.12 + wheel build).
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 TurboQuant Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,300 @@
1
+ Metadata-Version: 2.4
2
+ Name: turboquant-ml
3
+ Version: 0.1.0
4
+ Summary: TurboQuant — model quantization and optimization toolkit for edge and resource-constrained deployment.
5
+ Project-URL: Homepage, https://github.com/Ademo93/turboquant
6
+ Project-URL: Repository, https://github.com/Ademo93/turboquant
7
+ Project-URL: Issues, https://github.com/Ademo93/turboquant/issues
8
+ Author: TurboQuant Contributors
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: awq,bitsandbytes,edge-ai,gptq,llm,model-optimization,onnx,pruning,quantization,tensorrt
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Requires-Python: >=3.10
22
+ Requires-Dist: accelerate>=0.30
23
+ Requires-Dist: datasets>=2.18
24
+ Requires-Dist: numpy>=1.24
25
+ Requires-Dist: psutil>=5.9
26
+ Requires-Dist: pyyaml>=6.0
27
+ Requires-Dist: rich>=13.7
28
+ Requires-Dist: torch>=2.2
29
+ Requires-Dist: transformers>=4.40
30
+ Requires-Dist: typer>=0.12
31
+ Provides-Extra: all
32
+ Requires-Dist: auto-gptq>=0.7.1; extra == 'all'
33
+ Requires-Dist: autoawq>=0.2.5; extra == 'all'
34
+ Requires-Dist: bitsandbytes>=0.43; extra == 'all'
35
+ Requires-Dist: evaluate>=0.4; extra == 'all'
36
+ Requires-Dist: lm-eval>=0.4.3; extra == 'all'
37
+ Requires-Dist: matplotlib>=3.8; extra == 'all'
38
+ Requires-Dist: onnx>=1.16; extra == 'all'
39
+ Requires-Dist: onnxruntime>=1.18; extra == 'all'
40
+ Requires-Dist: onnxscript>=0.1; extra == 'all'
41
+ Requires-Dist: onnxslim>=0.1; extra == 'all'
42
+ Requires-Dist: optimum>=1.20; extra == 'all'
43
+ Requires-Dist: pandas>=2.2; extra == 'all'
44
+ Provides-Extra: awq
45
+ Requires-Dist: autoawq>=0.2.5; extra == 'awq'
46
+ Provides-Extra: bnb
47
+ Requires-Dist: bitsandbytes>=0.43; extra == 'bnb'
48
+ Provides-Extra: dev
49
+ Requires-Dist: mypy>=1.10; extra == 'dev'
50
+ Requires-Dist: pre-commit>=3.7; extra == 'dev'
51
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
52
+ Requires-Dist: pytest>=8.0; extra == 'dev'
53
+ Requires-Dist: ruff>=0.5; extra == 'dev'
54
+ Provides-Extra: eval
55
+ Requires-Dist: evaluate>=0.4; extra == 'eval'
56
+ Requires-Dist: lm-eval>=0.4.3; extra == 'eval'
57
+ Provides-Extra: gptq
58
+ Requires-Dist: auto-gptq>=0.7.1; extra == 'gptq'
59
+ Requires-Dist: optimum>=1.20; extra == 'gptq'
60
+ Provides-Extra: onnx
61
+ Requires-Dist: onnx>=1.16; extra == 'onnx'
62
+ Requires-Dist: onnxruntime>=1.18; extra == 'onnx'
63
+ Requires-Dist: onnxscript>=0.1; extra == 'onnx'
64
+ Requires-Dist: onnxslim>=0.1; extra == 'onnx'
65
+ Provides-Extra: viz
66
+ Requires-Dist: matplotlib>=3.8; extra == 'viz'
67
+ Requires-Dist: pandas>=2.2; extra == 'viz'
68
+ Description-Content-Type: text/markdown
69
+
70
+ <h1 align="center">TurboQuant</h1>
71
+
72
+ <p align="center">
73
+ <strong>Model quantization & optimization toolkit for edge and resource-constrained deployment.</strong><br>
74
+ INT4 · INT8 · FP16 · GPTQ · AWQ · BitsAndBytes · Structured pruning · ONNX & TensorRT export
75
+ </p>
76
+
77
+ <p align="center">
78
+ <a href="#"><img alt="Python" src="https://img.shields.io/badge/python-3.10%2B-blue"></a>
79
+ <a href="#"><img alt="PyTorch" src="https://img.shields.io/badge/pytorch-2.2%2B-ee4c2c"></a>
80
+ <a href="#"><img alt="License" src="https://img.shields.io/badge/license-MIT-green"></a>
81
+ <a href="https://github.com/Ademo93/turboquant/actions/workflows/ci.yml"><img alt="CI" src="https://github.com/Ademo93/turboquant/actions/workflows/ci.yml/badge.svg"></a>
82
+ <a href="https://Ademo93.github.io/turboquant/"><img alt="Docs" src="https://img.shields.io/badge/docs-mkdocs--material-blue"></a>
83
+ <a href="#"><img alt="Status" src="https://img.shields.io/badge/status-beta-orange"></a>
84
+ </p>
85
+
86
+ ---
87
+
88
+ ## Why TurboQuant?
89
+
90
+ Modern open-source models are powerful but expensive to serve. Shipping a 7B-parameter LLM in FP16 demands ~14&nbsp;GB of VRAM; a vision transformer that fits comfortably on a workstation may blow up on a Jetson Orin or a phone. **TurboQuant gives you a single, consistent interface to compress, quantize, prune, export, and benchmark models** — so you can ship them on the hardware you actually have.
91
+
92
+ It is built around three principles:
93
+
94
+ 1. **One API, many backends.** Wrap `bitsandbytes`, `auto-gptq`, `autoawq`, native PyTorch quantization, and ONNX/TensorRT export behind a uniform `quantize(model, method=...)` interface.
95
+ 2. **Reproducible benchmarks.** Latency, peak memory, model size, and task accuracy (perplexity, classification top-1, etc.) are first-class citizens — every example ships with a comparable benchmark.
96
+ 3. **No magic.** Each technique is implemented as a small, readable module so it doubles as a reference for how the methods work.
97
+
98
+ ## Features
99
+
100
+ | Category | Techniques |
101
+ |---|---|
102
+ | **Weight quantization** | INT8 dynamic & static PTQ, FP16/BF16 casting, INT4 (bitsandbytes NF4 / FP4), GPTQ, AWQ |
103
+ | **Pruning** | Magnitude (unstructured), L1 structured (channel/filter), N:M sparsity helpers |
104
+ | **Export** | ONNX (with `onnxslim` graph optimization), TensorRT engine builder, ORT quantization |
105
+ | **Calibration** | Per-tensor & per-channel, MinMax / Entropy / Percentile observers |
106
+ | **Benchmark** | Latency (warmup + median + p95/p99), peak GPU/CPU memory, throughput, model size, perplexity, top-k accuracy |
107
+ | **CLI** | `turboquant quantize`, `turboquant prune`, `turboquant export`, `turboquant bench`, `turboquant methods` |
108
+
109
+ ## Installation
110
+
111
+ The PyPI package is named **`turboquant-ml`** (the unsuffixed `turboquant`
112
+ name was taken by an unrelated project). The Python import and CLI are still
113
+ just `turboquant` / `tq`:
114
+
115
+ ```bash
116
+ # Core install
117
+ pip install turboquant-ml
118
+
119
+ # With ONNX export
120
+ pip install "turboquant-ml[onnx]"
121
+
122
+ # Full LLM compression stack (GPTQ + AWQ + bitsandbytes) plus evaluation tooling
123
+ pip install "turboquant-ml[gptq,awq,bnb,eval]"
124
+
125
+ # Everything
126
+ pip install "turboquant-ml[all]"
127
+ ```
128
+
129
+ ```python
130
+ import turboquant # import name unchanged
131
+ from turboquant import quantize # same API
132
+ ```
133
+
134
+ > **Note** — `bitsandbytes`, `auto-gptq`, `autoawq` and `tensorrt` are heavy native dependencies. They are deliberately optional; TurboQuant degrades gracefully when they are missing.
135
+
136
+ ## Quick start
137
+
138
+ ### Python API
139
+
140
+ ```python
141
+ from turboquant import quantize, benchmark
142
+ from transformers import AutoModelForCausalLM, AutoTokenizer
143
+
144
+ model_id = "meta-llama/Llama-3.2-1B"
145
+ tok = AutoTokenizer.from_pretrained(model_id)
146
+ model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
147
+
148
+ # One-line INT4 weight-only quantization via bitsandbytes
149
+ qmodel = quantize(model, method="bnb-nf4")
150
+
151
+ # Benchmark side-by-side
152
+ report = benchmark.compare(
153
+ baseline=model,
154
+ candidate=qmodel,
155
+ tokenizer=tok,
156
+ prompts=["Explain quantization in one sentence."],
157
+ metrics=["latency", "memory", "size", "perplexity"],
158
+ )
159
+ print(report.as_table())
160
+ ```
161
+
162
+ ### CLI
163
+
164
+ ```bash
165
+ # Quantize a HuggingFace model to INT4 with GPTQ (W4A16: 4-bit weights, 16-bit activations)
166
+ tq quantize meta-llama/Llama-3.2-1B \
167
+ --method gptq \
168
+ --bits 4 \
169
+ --group-size 128 \
170
+ --calib-dataset wikitext \
171
+ --out ./outputs/llama-3.2-1b-gptq
172
+
173
+ # Structured prune a vision model and re-evaluate
174
+ tq prune microsoft/resnet-50 \
175
+ --strategy l1-channel \
176
+ --sparsity 0.30 \
177
+ --eval imagenet-val \
178
+ --out ./outputs/resnet50-pruned
179
+
180
+ # Export to ONNX with INT8 dynamic quantization
181
+ tq export ./outputs/resnet50-pruned \
182
+ --format onnx \
183
+ --quant int8-dynamic \
184
+ --opset 17
185
+
186
+ # Benchmark FP16 vs INT8 vs INT4 on a model
187
+ tq bench meta-llama/Llama-3.2-1B --methods fp16,int8-dynamic,bnb-nf4 \
188
+ --report ./benchmarks/results/llama32-1b.json
189
+ ```
190
+
191
+ ## Supported methods at a glance
192
+
193
+ | Method | Bits | Backend | Calibration | Typical use case |
194
+ |---|---|---|---|---|
195
+ | `fp16` / `bf16` | 16 | PyTorch | none | Fast, near-lossless baseline |
196
+ | `int8-dynamic` | 8 | PyTorch | none | CPU inference, transformers |
197
+ | `int8-static` | 8 | PyTorch | required | CNNs, edge CPUs |
198
+ | `bnb-int8` | 8 | bitsandbytes | none | LLM training & serving on GPU |
199
+ | `bnb-nf4` / `bnb-fp4` | 4 | bitsandbytes | none | LLM inference, QLoRA |
200
+ | `gptq` | 2–8 | auto-gptq | required | LLM weight-only, best accuracy/bit |
201
+ | `awq` | 4 | autoawq | required | LLM weight-only, fast inference |
202
+
203
+ ## Reference benchmarks
204
+
205
+ ### SmolLM2-135M on CPU (real measured numbers)
206
+
207
+ `python benchmarks/scripts/sweep_cpu.py --model-id HuggingFaceTB/SmolLM2-135M --methods fp32,fp16,bf16,int8-dynamic`
208
+
209
+ | Method | Size (MB) | Forward latency (ms) | Generation throughput (tok/s) |
210
+ |---|---:|---:|---:|
211
+ | FP32 (baseline) | 513.2 | 31.3 | 32.6 |
212
+ | FP16 | 256.7 | 57.2 | 47.5 |
213
+ | BF16 | 256.7 | 55.4 | **48.9** |
214
+ | INT8 dynamic | **236.6** | **30.7** | 30.0 |
215
+
216
+ Read this carefully — the result is realistic, not flattering:
217
+
218
+ - **FP16/BF16 cut size in half**, and *generation* throughput goes **up ~50%**
219
+ (smaller KV cache wins), but the per-step forward pass is **~1.8× slower**
220
+ because consumer CPUs have no fast FP16 matmul kernel. On a Tensor-Core GPU
221
+ these numbers flip.
222
+ - **INT8 dynamic is the smallest** (≈54 % off) and matches FP32 forward
223
+ latency, but generation throughput is similar to FP32 here — the small
224
+ hidden size of a 135 M model limits how much INT8 GEMM kernels can help.
225
+ - The right baseline matters: comparing INT8 to a poorly-quantizable
226
+ reference (e.g. GPT-2, which uses `transformers.Conv1D` instead of
227
+ `nn.Linear`) makes INT8 look bad. Always check what your method actually
228
+ rewrites — `tq methods` plus `print(model)` will tell you.
229
+
230
+ ![SmolLM2 sweep](benchmarks/results/smollm2_135m.png)
231
+
232
+ ### Reproduce
233
+
234
+ ```bash
235
+ pip install -e ".[viz]" truststore
236
+ python benchmarks/scripts/sweep_cpu.py \
237
+ --model-id HuggingFaceTB/SmolLM2-135M \
238
+ --methods fp32,fp16,bf16,int8-dynamic \
239
+ --out benchmarks/results/smollm2_135m.json \
240
+ --plot benchmarks/results/smollm2_135m.png
241
+ ```
242
+
243
+ GPU sweeps (Llama-class models with GPTQ / AWQ / NF4) will land here once a CUDA
244
+ runner is added to CI — contributions welcome.
245
+
246
+ ## Architecture
247
+
248
+ ```
249
+ turboquant/
250
+ ├── quantization/ # Algorithms: int8, fp16, gptq, awq, bnb, observers
251
+ ├── pruning/ # Magnitude + structured (L1, L2, taylor) + N:M
252
+ ├── export/ # ONNX, TensorRT, ORT quantization
253
+ ├── benchmark/ # Latency, memory, perplexity, classification, plot
254
+ ├── calibration/ # Datasets, dataloaders, observer fitting (not shipped in 0.1.0)
255
+ ├── models/ # Convenience loaders + registry
256
+ └── cli.py # Typer-based CLI
257
+ ```
258
+
259
+ Each algorithm lives in a single, readable file with a `quantize_*` / `prune_*` function and a short docstring referencing the original paper.
260
+
261
+ ## Roadmap
262
+
263
+ - [x] INT8 dynamic & static PTQ (PyTorch native)
264
+ - [x] FP16/BF16 casting
265
+ - [x] BitsAndBytes INT8 / NF4 / FP4 wrappers
266
+ - [x] GPTQ & AWQ integration
267
+ - [x] L1 structured & magnitude pruning
268
+ - [x] ONNX export with `onnxslim`
269
+ - [x] Latency / memory / perplexity benchmarks
270
+ - [ ] TensorRT INT8 calibration cache
271
+ - [ ] SmoothQuant W8A8
272
+ - [ ] HQQ (Half-Quadratic Quantization)
273
+ - [ ] Distillation-aware quantization
274
+ - [ ] Mobile export (CoreML / TFLite)
275
+ - [ ] Web dashboard for benchmark comparison
276
+
277
+ ## Citing & related work
278
+
279
+ TurboQuant stands on the shoulders of giants. If you use it in research, please also cite the underlying algorithms:
280
+
281
+ - **GPTQ** — Frantar et al., 2023 (arXiv:2210.17323)
282
+ - **AWQ** — Lin et al., 2023 (arXiv:2306.00978)
283
+ - **LLM.int8()** / **QLoRA** — Dettmers et al., 2022 / 2023 (arXiv:2208.07339, 2305.14314)
284
+ - **SmoothQuant** — Xiao et al., 2022 (arXiv:2211.10438)
285
+
286
+ ## Contributing
287
+
288
+ Contributions are very welcome — see [`CONTRIBUTING.md`](docs/CONTRIBUTING.md). Good first issues are tagged on the issue tracker.
289
+
290
+ ```bash
291
+ git clone https://github.com/Ademo93/turboquant
292
+ cd turboquant
293
+ pip install -e ".[dev,all]"
294
+ pre-commit install
295
+ pytest
296
+ ```
297
+
298
+ ## License
299
+
300
+ [MIT](LICENSE) — do whatever you like, just keep the copyright notice.