mlenvdoctor 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlenvdoctor-0.1.0/.github/workflows/ci.yml +79 -0
- mlenvdoctor-0.1.0/.gitignore +141 -0
- mlenvdoctor-0.1.0/CHANGELOG.md +37 -0
- mlenvdoctor-0.1.0/CONTRIBUTING.md +104 -0
- mlenvdoctor-0.1.0/LICENSE +22 -0
- mlenvdoctor-0.1.0/PKG-INFO +282 -0
- mlenvdoctor-0.1.0/README.md +250 -0
- mlenvdoctor-0.1.0/docker/README.md +32 -0
- mlenvdoctor-0.1.0/pyproject.toml +65 -0
- mlenvdoctor-0.1.0/src/mlenvdoctor/__init__.py +4 -0
- mlenvdoctor-0.1.0/src/mlenvdoctor/cli.py +153 -0
- mlenvdoctor-0.1.0/src/mlenvdoctor/diagnose.py +493 -0
- mlenvdoctor-0.1.0/src/mlenvdoctor/dockerize.py +204 -0
- mlenvdoctor-0.1.0/src/mlenvdoctor/fix.py +249 -0
- mlenvdoctor-0.1.0/src/mlenvdoctor/gpu.py +184 -0
- mlenvdoctor-0.1.0/src/mlenvdoctor/utils.py +107 -0
- mlenvdoctor-0.1.0/tests/__init__.py +2 -0
- mlenvdoctor-0.1.0/tests/test_diagnose.py +45 -0
- mlenvdoctor-0.1.0/tests/test_dockerize.py +53 -0
- mlenvdoctor-0.1.0/tests/test_fix.py +52 -0
- mlenvdoctor-0.1.0/tests/test_utils.py +41 -0
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main, develop]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main, develop]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ${{ matrix.os }}
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
os: [ubuntu-latest, windows-latest, macos-latest]
|
|
15
|
+
python-version: ["3.8", "3.9", "3.10", "3.11"]
|
|
16
|
+
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
|
|
20
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
21
|
+
uses: actions/setup-python@v5
|
|
22
|
+
with:
|
|
23
|
+
python-version: ${{ matrix.python-version }}
|
|
24
|
+
|
|
25
|
+
- name: Install dependencies
|
|
26
|
+
run: |
|
|
27
|
+
python -m pip install --upgrade pip
|
|
28
|
+
pip install -e ".[dev]"
|
|
29
|
+
|
|
30
|
+
- name: Run linters
|
|
31
|
+
run: |
|
|
32
|
+
pip install black ruff mypy
|
|
33
|
+
black --check src/ tests/
|
|
34
|
+
ruff check src/ tests/
|
|
35
|
+
mypy src/ || true # Allow mypy to fail for now
|
|
36
|
+
|
|
37
|
+
- name: Run tests
|
|
38
|
+
run: |
|
|
39
|
+
pytest --cov=mlenvdoctor --cov-report=xml --cov-report=term-missing
|
|
40
|
+
|
|
41
|
+
- name: Upload coverage
|
|
42
|
+
uses: codecov/codecov-action@v3
|
|
43
|
+
with:
|
|
44
|
+
file: ./coverage.xml
|
|
45
|
+
fail_ci_if_error: false
|
|
46
|
+
|
|
47
|
+
build:
|
|
48
|
+
runs-on: ubuntu-latest
|
|
49
|
+
needs: test
|
|
50
|
+
|
|
51
|
+
steps:
|
|
52
|
+
- uses: actions/checkout@v4
|
|
53
|
+
|
|
54
|
+
- name: Set up Python
|
|
55
|
+
uses: actions/setup-python@v5
|
|
56
|
+
with:
|
|
57
|
+
python-version: "3.10"
|
|
58
|
+
|
|
59
|
+
- name: Install build dependencies
|
|
60
|
+
run: |
|
|
61
|
+
python -m pip install --upgrade pip
|
|
62
|
+
pip install build twine
|
|
63
|
+
|
|
64
|
+
- name: Build package
|
|
65
|
+
run: |
|
|
66
|
+
python -m build
|
|
67
|
+
|
|
68
|
+
- name: Check package
|
|
69
|
+
run: |
|
|
70
|
+
twine check dist/*
|
|
71
|
+
|
|
72
|
+
- name: Publish to PyPI (on release)
|
|
73
|
+
if: startsWith(github.ref, 'refs/tags/')
|
|
74
|
+
env:
|
|
75
|
+
TWINE_USERNAME: __token__
|
|
76
|
+
TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
|
|
77
|
+
run: |
|
|
78
|
+
twine upload dist/*
|
|
79
|
+
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
pip-wheel-metadata/
|
|
24
|
+
share/python-wheels/
|
|
25
|
+
*.egg-info/
|
|
26
|
+
.installed.cfg
|
|
27
|
+
*.egg
|
|
28
|
+
MANIFEST
|
|
29
|
+
|
|
30
|
+
# PyInstaller
|
|
31
|
+
*.manifest
|
|
32
|
+
*.spec
|
|
33
|
+
|
|
34
|
+
# Installer logs
|
|
35
|
+
pip-log.txt
|
|
36
|
+
pip-delete-this-directory.txt
|
|
37
|
+
|
|
38
|
+
# Unit test / coverage reports
|
|
39
|
+
htmlcov/
|
|
40
|
+
.tox/
|
|
41
|
+
.nox/
|
|
42
|
+
.coverage
|
|
43
|
+
.coverage.*
|
|
44
|
+
.cache
|
|
45
|
+
nosetests.xml
|
|
46
|
+
coverage.xml
|
|
47
|
+
*.cover
|
|
48
|
+
*.py,cover
|
|
49
|
+
.hypothesis/
|
|
50
|
+
.pytest_cache/
|
|
51
|
+
|
|
52
|
+
# Translations
|
|
53
|
+
*.mo
|
|
54
|
+
*.pot
|
|
55
|
+
|
|
56
|
+
# Django stuff:
|
|
57
|
+
*.log
|
|
58
|
+
local_settings.py
|
|
59
|
+
db.sqlite3
|
|
60
|
+
db.sqlite3-journal
|
|
61
|
+
|
|
62
|
+
# Flask stuff:
|
|
63
|
+
instance/
|
|
64
|
+
.webassets-cache
|
|
65
|
+
|
|
66
|
+
# Scrapy stuff:
|
|
67
|
+
.scrapy
|
|
68
|
+
|
|
69
|
+
# Sphinx documentation
|
|
70
|
+
docs/_build/
|
|
71
|
+
|
|
72
|
+
# PyBuilder
|
|
73
|
+
target/
|
|
74
|
+
|
|
75
|
+
# Jupyter Notebook
|
|
76
|
+
.ipynb_checkpoints
|
|
77
|
+
|
|
78
|
+
# IPython
|
|
79
|
+
profile_default/
|
|
80
|
+
ipython_config.py
|
|
81
|
+
|
|
82
|
+
# pyenv
|
|
83
|
+
.python-version
|
|
84
|
+
|
|
85
|
+
# pipenv
|
|
86
|
+
Pipfile.lock
|
|
87
|
+
|
|
88
|
+
# PEP 582
|
|
89
|
+
__pypackages__/
|
|
90
|
+
|
|
91
|
+
# Celery stuff
|
|
92
|
+
celerybeat-schedule
|
|
93
|
+
celerybeat.pid
|
|
94
|
+
|
|
95
|
+
# SageMath parsed files
|
|
96
|
+
*.sage.py
|
|
97
|
+
|
|
98
|
+
# Environments
|
|
99
|
+
.env
|
|
100
|
+
.venv
|
|
101
|
+
env/
|
|
102
|
+
venv/
|
|
103
|
+
ENV/
|
|
104
|
+
env.bak/
|
|
105
|
+
venv.bak/
|
|
106
|
+
|
|
107
|
+
# Spyder project settings
|
|
108
|
+
.spyderproject
|
|
109
|
+
.spyproject
|
|
110
|
+
|
|
111
|
+
# Rope project settings
|
|
112
|
+
.ropeproject
|
|
113
|
+
|
|
114
|
+
# mkdocs documentation
|
|
115
|
+
/site
|
|
116
|
+
|
|
117
|
+
# mypy
|
|
118
|
+
.mypy_cache/
|
|
119
|
+
.dmypy.json
|
|
120
|
+
dmypy.json
|
|
121
|
+
|
|
122
|
+
# Pyre type checker
|
|
123
|
+
.pyre/
|
|
124
|
+
|
|
125
|
+
# IDEs
|
|
126
|
+
.vscode/
|
|
127
|
+
.idea/
|
|
128
|
+
*.swp
|
|
129
|
+
*.swo
|
|
130
|
+
*~
|
|
131
|
+
|
|
132
|
+
# OS
|
|
133
|
+
.DS_Store
|
|
134
|
+
Thumbs.db
|
|
135
|
+
|
|
136
|
+
# ML Environment Doctor specific
|
|
137
|
+
requirements-mlenvdoctor.txt
|
|
138
|
+
environment-mlenvdoctor.yml
|
|
139
|
+
Dockerfile.mlenvdoctor
|
|
140
|
+
*.mlenvdoctor
|
|
141
|
+
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to ML Environment Doctor will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.1.0] - 2024-XX-XX
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
- Initial release of ML Environment Doctor
|
|
12
|
+
- `diagnose` command for environment diagnostics
|
|
13
|
+
- CUDA driver detection
|
|
14
|
+
- PyTorch/CUDA compatibility checks
|
|
15
|
+
- ML library version checks (transformers, peft, trl, datasets, accelerate)
|
|
16
|
+
- GPU memory checks (with `--full` flag)
|
|
17
|
+
- Disk space checks
|
|
18
|
+
- Docker GPU support detection
|
|
19
|
+
- Internet connectivity checks
|
|
20
|
+
- `fix` command for auto-fixing environment issues
|
|
21
|
+
- Requirements.txt generation
|
|
22
|
+
- Conda environment file generation
|
|
23
|
+
- Virtual environment creation
|
|
24
|
+
- Automatic dependency installation
|
|
25
|
+
- `dockerize` command for Dockerfile generation
|
|
26
|
+
- Model-specific Dockerfiles (mistral-7b, tinyllama, gpt2)
|
|
27
|
+
- FastAPI service template generation
|
|
28
|
+
- CUDA 12.4 base images
|
|
29
|
+
- `test-model` command for model smoke tests
|
|
30
|
+
- `smoke-test` command for LoRA fine-tuning smoke tests
|
|
31
|
+
- Rich UI with colored output and tables
|
|
32
|
+
- Comprehensive test suite
|
|
33
|
+
- CI/CD workflow with GitHub Actions
|
|
34
|
+
- Documentation (README.md, CONTRIBUTING.md)
|
|
35
|
+
|
|
36
|
+
[0.1.0]: https://github.com/yourusername/ml_env_doctor/releases/tag/v0.1.0
|
|
37
|
+
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# Contributing to ML Environment Doctor
|
|
2
|
+
|
|
3
|
+
Thank you for your interest in contributing to ML Environment Doctor! ๐
|
|
4
|
+
|
|
5
|
+
## Getting Started
|
|
6
|
+
|
|
7
|
+
1. Fork the repository
|
|
8
|
+
2. Clone your fork: `git clone https://github.com/yourusername/ml_env_doctor.git`
|
|
9
|
+
3. Create a branch: `git checkout -b feature/your-feature-name`
|
|
10
|
+
4. Install in development mode: `pip install -e ".[dev]"`
|
|
11
|
+
5. Make your changes
|
|
12
|
+
6. Run tests: `pytest`
|
|
13
|
+
7. Run linters: `black src/ tests/ && ruff check src/ tests/`
|
|
14
|
+
8. Commit your changes: `git commit -m "Add feature: your feature"`
|
|
15
|
+
9. Push to your fork: `git push origin feature/your-feature-name`
|
|
16
|
+
10. Open a Pull Request
|
|
17
|
+
|
|
18
|
+
## Development Setup
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
# Install dependencies
|
|
22
|
+
pip install -e ".[dev]"
|
|
23
|
+
|
|
24
|
+
# Install pre-commit hooks
|
|
25
|
+
pre-commit install
|
|
26
|
+
|
|
27
|
+
# Run tests
|
|
28
|
+
pytest
|
|
29
|
+
|
|
30
|
+
# Run with coverage
|
|
31
|
+
pytest --cov=mlenvdoctor --cov-report=html
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Code Style
|
|
35
|
+
|
|
36
|
+
We use:
|
|
37
|
+
- **Black** for code formatting (line length: 100)
|
|
38
|
+
- **Ruff** for linting
|
|
39
|
+
- **mypy** for type checking (optional for now)
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
# Format code
|
|
43
|
+
black src/ tests/
|
|
44
|
+
|
|
45
|
+
# Check linting
|
|
46
|
+
ruff check src/ tests/
|
|
47
|
+
|
|
48
|
+
# Type check
|
|
49
|
+
mypy src/
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Testing
|
|
53
|
+
|
|
54
|
+
- Write tests for new features
|
|
55
|
+
- Ensure all tests pass: `pytest`
|
|
56
|
+
- Aim for good test coverage
|
|
57
|
+
- Tests are in the `tests/` directory
|
|
58
|
+
|
|
59
|
+
## Commit Messages
|
|
60
|
+
|
|
61
|
+
Use clear, descriptive commit messages:
|
|
62
|
+
- `Add feature: GPU memory check`
|
|
63
|
+
- `Fix: CUDA version detection on Windows`
|
|
64
|
+
- `Update: Documentation for dockerize command`
|
|
65
|
+
|
|
66
|
+
## Pull Request Process
|
|
67
|
+
|
|
68
|
+
1. Update README.md if needed
|
|
69
|
+
2. Update CHANGELOG.md with your changes
|
|
70
|
+
3. Ensure all tests pass
|
|
71
|
+
4. Ensure code is formatted and linted
|
|
72
|
+
5. Request review from maintainers
|
|
73
|
+
|
|
74
|
+
## Project Structure
|
|
75
|
+
|
|
76
|
+
```
|
|
77
|
+
mlenvdoctor/
|
|
78
|
+
โโโ src/mlenvdoctor/ # Source code
|
|
79
|
+
โ โโโ cli.py # CLI entrypoint
|
|
80
|
+
โ โโโ diagnose.py # Diagnostic logic
|
|
81
|
+
โ โโโ fix.py # Auto-fix logic
|
|
82
|
+
โ โโโ dockerize.py # Dockerfile generation
|
|
83
|
+
โ โโโ gpu.py # GPU tests
|
|
84
|
+
โ โโโ utils.py # Utilities
|
|
85
|
+
โโโ tests/ # Test suite
|
|
86
|
+
โโโ docs/ # Documentation (if added)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Areas for Contribution
|
|
90
|
+
|
|
91
|
+
- ๐ Bug fixes
|
|
92
|
+
- โจ New features (diagnostics, fixes, Docker templates)
|
|
93
|
+
- ๐ Documentation improvements
|
|
94
|
+
- ๐งช Additional tests
|
|
95
|
+
- ๐จ UI/UX improvements
|
|
96
|
+
- โก Performance optimizations
|
|
97
|
+
- ๐ Support for additional ML frameworks
|
|
98
|
+
|
|
99
|
+
## Questions?
|
|
100
|
+
|
|
101
|
+
Open an issue for questions or discussions. We're happy to help!
|
|
102
|
+
|
|
103
|
+
Thank you for contributing! ๐
|
|
104
|
+
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 ML Environment Doctor Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mlenvdoctor
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Diagnose & fix ML environments for LLM fine-tuning
|
|
5
|
+
Author: ML Environment Doctor Contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Keywords: cuda,diagnostics,fine-tuning,llm,machine-learning,pytorch
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
+
Requires-Python: >=3.8
|
|
20
|
+
Requires-Dist: packaging>=23.0
|
|
21
|
+
Requires-Dist: psutil>=5.9.0
|
|
22
|
+
Requires-Dist: rich>=13.0.0
|
|
23
|
+
Requires-Dist: typer>=0.9.0
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: black>=23.0.0; extra == 'dev'
|
|
26
|
+
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: pre-commit>=3.0.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# ๐ ML Environment Doctor
|
|
34
|
+
|
|
35
|
+
[](https://www.python.org/downloads/)
|
|
36
|
+
[](https://opensource.org/licenses/MIT)
|
|
37
|
+
[](https://badge.fury.io/py/mlenvdoctor)
|
|
38
|
+
|
|
39
|
+
> **Single command fixes 90% of "my torch.cuda.is_available() is False" issues.**
|
|
40
|
+
|
|
41
|
+
ML Environment Doctor is a production-ready Python CLI that diagnoses, auto-fixes, and Dockerizes ML environments for LLM fine-tuning. It detects CUDA conflicts, generates locked requirements.txt/conda envs, tests GPU readiness with real LLM smoke tests, and outputs production Dockerfiles.
|
|
42
|
+
|
|
43
|
+
## ๐ฏ Why ML Environment Doctor?
|
|
44
|
+
|
|
45
|
+
**Problem**: LLM fine-tuning setup is fragmented across StackOverflow answers, conflicting PyTorch/CUDA versions, and missing dependencies. Hours wasted debugging `torch.cuda.is_available() == False`.
|
|
46
|
+
|
|
47
|
+
**Solution**: ONE TOOL that:
|
|
48
|
+
- โ
Diagnoses your environment in <5 seconds
|
|
49
|
+
- โ
Auto-fixes 80% of common issues
|
|
50
|
+
- โ
Generates production-ready Dockerfiles
|
|
51
|
+
- โ
Tests GPU readiness with real models
|
|
52
|
+
- โ
Supports PyTorch 2.4+ with CUDA 12.1/12.4
|
|
53
|
+
|
|
54
|
+
## ๐ Quick Start
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
# Install
|
|
58
|
+
pip install mlenvdoctor
|
|
59
|
+
|
|
60
|
+
# Diagnose your environment
|
|
61
|
+
mlenvdoctor diagnose
|
|
62
|
+
|
|
63
|
+
# Full diagnostic scan
|
|
64
|
+
mlenvdoctor diagnose --full
|
|
65
|
+
|
|
66
|
+
# Auto-fix issues and generate requirements.txt
|
|
67
|
+
mlenvdoctor fix
|
|
68
|
+
|
|
69
|
+
# Generate Dockerfile for fine-tuning
|
|
70
|
+
mlenvdoctor dockerize mistral-7b
|
|
71
|
+
|
|
72
|
+
# Run smoke test with real model
|
|
73
|
+
mlenvdoctor test-model tinyllama
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## ๐ Features
|
|
77
|
+
|
|
78
|
+
### ๐ Diagnosis
|
|
79
|
+
|
|
80
|
+
- **CUDA Detection**: NVIDIA driver, CUDA version, GPU availability
|
|
81
|
+
- **PyTorch/CUDA Compatibility**: Version matrix matching
|
|
82
|
+
- **Library Checks**: transformers, peft, trl, datasets, accelerate
|
|
83
|
+
- **GPU Memory**: Available memory for fine-tuning
|
|
84
|
+
- **Disk Space**: HF cache space warnings (~50GB)
|
|
85
|
+
- **Docker GPU**: nvidia-docker support detection
|
|
86
|
+
- **Connectivity**: Hugging Face Hub access
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
mlenvdoctor diagnose --full
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Output:
|
|
93
|
+
```
|
|
94
|
+
๐ Running ML Environment Diagnostics...
|
|
95
|
+
|
|
96
|
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
97
|
+
โ ML Environment Doctor - Diagnostic Results โ
|
|
98
|
+
โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ
|
|
99
|
+
โ Issue โ Status โ Severity โ Fix โ
|
|
100
|
+
โโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโค
|
|
101
|
+
โ NVIDIA GPU Driver โ โ
PASS - CUDA 12.4 โ INFO โ โ
|
|
102
|
+
โ PyTorch CUDA โ โ
PASS - CUDA 12.4 (1 GPU) โ INFO โ โ
|
|
103
|
+
โ transformers โ โ
PASS - 4.44.0 โ INFO โ โ
|
|
104
|
+
โ peft โ โ FAIL - Not installed โ CRITICAL โ pip install peft>=0.12 โ
|
|
105
|
+
โโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
106
|
+
|
|
107
|
+
โ
Passed: 3
|
|
108
|
+
โ Critical Issues: 1
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### ๐ง Auto-Fix
|
|
112
|
+
|
|
113
|
+
Generates optimized `requirements.txt` or conda environment files based on detected issues.
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
# Generate requirements.txt
|
|
117
|
+
mlenvdoctor fix
|
|
118
|
+
|
|
119
|
+
# Generate conda environment
|
|
120
|
+
mlenvdoctor fix --conda
|
|
121
|
+
|
|
122
|
+
# Create virtual environment and install
|
|
123
|
+
mlenvdoctor fix --venv
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### ๐ณ Dockerize
|
|
127
|
+
|
|
128
|
+
Generate production-ready Dockerfiles for ML fine-tuning.
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
# Basic Dockerfile
|
|
132
|
+
mlenvdoctor dockerize mistral-7b
|
|
133
|
+
|
|
134
|
+
# FastAPI service template
|
|
135
|
+
mlenvdoctor dockerize --service
|
|
136
|
+
|
|
137
|
+
# Custom output
|
|
138
|
+
mlenvdoctor dockerize tinyllama -o Dockerfile
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
Generated Dockerfile includes:
|
|
142
|
+
- NVIDIA CUDA 12.4 base image
|
|
143
|
+
- PyTorch with CUDA support
|
|
144
|
+
- ML libraries (transformers, peft, trl, accelerate)
|
|
145
|
+
- Optimized layer caching
|
|
146
|
+
- GPU runtime configuration
|
|
147
|
+
|
|
148
|
+
### ๐งช Testing
|
|
149
|
+
|
|
150
|
+
Run smoke tests with real LLM models to verify fine-tuning readiness.
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
# Test with TinyLlama (fast)
|
|
154
|
+
mlenvdoctor test-model tinyllama
|
|
155
|
+
|
|
156
|
+
# Test with Mistral-7B (requires 16GB+ GPU)
|
|
157
|
+
mlenvdoctor test-model mistral-7b
|
|
158
|
+
|
|
159
|
+
# LoRA smoke test
|
|
160
|
+
mlenvdoctor smoke-test
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## ๐ฆ Installation
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
# From PyPI (when published)
|
|
167
|
+
pip install mlenvdoctor
|
|
168
|
+
|
|
169
|
+
# From source
|
|
170
|
+
git clone https://github.com/dheena731/ml_env_doctor.git
|
|
171
|
+
cd ml_env_doctor
|
|
172
|
+
pip install -e .
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## ๐ ๏ธ Development
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
# Clone repository
|
|
179
|
+
git clone https://github.com/dheena731/ml_env_doctor.git
|
|
180
|
+
cd ml_env_doctor
|
|
181
|
+
|
|
182
|
+
# Install with dev dependencies
|
|
183
|
+
pip install -e ".[dev]"
|
|
184
|
+
|
|
185
|
+
# Run tests
|
|
186
|
+
pytest
|
|
187
|
+
|
|
188
|
+
# Run linters
|
|
189
|
+
black src/ tests/
|
|
190
|
+
ruff check src/ tests/
|
|
191
|
+
mypy src/
|
|
192
|
+
|
|
193
|
+
# Pre-commit hooks
|
|
194
|
+
pre-commit install
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
## ๐ CLI Reference
|
|
198
|
+
|
|
199
|
+
### `diagnose`
|
|
200
|
+
|
|
201
|
+
Diagnose ML environment issues.
|
|
202
|
+
|
|
203
|
+
```bash
|
|
204
|
+
mlenvdoctor diagnose # Quick scan
|
|
205
|
+
mlenvdoctor diagnose --full # Full scan with GPU benchmark
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
### `fix`
|
|
209
|
+
|
|
210
|
+
Auto-fix environment issues.
|
|
211
|
+
|
|
212
|
+
```bash
|
|
213
|
+
mlenvdoctor fix # Generate requirements.txt
|
|
214
|
+
mlenvdoctor fix --conda # Generate conda environment
|
|
215
|
+
mlenvdoctor fix --venv # Create virtual environment
|
|
216
|
+
mlenvdoctor fix --stack minimal # Use minimal ML stack
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
### `dockerize`
|
|
220
|
+
|
|
221
|
+
Generate Dockerfile for ML fine-tuning.
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
mlenvdoctor dockerize [model] # Model: mistral-7b, tinyllama, gpt2
|
|
225
|
+
mlenvdoctor dockerize --service # FastAPI service template
|
|
226
|
+
mlenvdoctor dockerize mistral-7b -o Dockerfile
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### `test-model`
|
|
230
|
+
|
|
231
|
+
Test model loading and forward pass.
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
mlenvdoctor test-model tinyllama
|
|
235
|
+
mlenvdoctor test-model mistral-7b
|
|
236
|
+
mlenvdoctor test-model gpt2
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
### `smoke-test`
|
|
240
|
+
|
|
241
|
+
Run LoRA fine-tuning smoke test.
|
|
242
|
+
|
|
243
|
+
```bash
|
|
244
|
+
mlenvdoctor smoke-test
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
## ๐ฏ Use Cases
|
|
248
|
+
|
|
249
|
+
1. **Fresh Setup**: Diagnose and fix a new ML environment in minutes
|
|
250
|
+
2. **CUDA Issues**: Detect and fix PyTorch/CUDA version mismatches
|
|
251
|
+
3. **Production Deployment**: Generate Dockerfiles for containerized training
|
|
252
|
+
4. **CI/CD**: Verify GPU readiness in automated pipelines
|
|
253
|
+
5. **Environment Debugging**: Quick diagnosis of "why is my GPU not working?"
|
|
254
|
+
|
|
255
|
+
## ๐ค Contributing
|
|
256
|
+
|
|
257
|
+
Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
258
|
+
|
|
259
|
+
1. Fork the repository
|
|
260
|
+
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
|
|
261
|
+
3. Commit your changes (`git commit -m 'Add amazing feature'`)
|
|
262
|
+
4. Push to the branch (`git push origin feature/amazing-feature`)
|
|
263
|
+
5. Open a Pull Request
|
|
264
|
+
|
|
265
|
+
## ๐ License
|
|
266
|
+
|
|
267
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
268
|
+
|
|
269
|
+
## ๐ Acknowledgments
|
|
270
|
+
|
|
271
|
+
- Built with [Typer](https://typer.tiangolo.com/) and [Rich](https://rich.readthedocs.io/)
|
|
272
|
+
- Inspired by the LLM fine-tuning community's setup struggles
|
|
273
|
+
- Thanks to Hugging Face for amazing ML libraries
|
|
274
|
+
|
|
275
|
+
## โญ Star History
|
|
276
|
+
|
|
277
|
+
If you find this tool helpful, please star the repository! Our goal: **500 GitHub stars in the first month**.
|
|
278
|
+
|
|
279
|
+
---
|
|
280
|
+
|
|
281
|
+
**Made with โค๏ธ for the ML fine-tuning community**
|
|
282
|
+
|