rocmate-0.1.0-py3-none-any.whl
This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- rocmate/__init__.py +2 -0
- rocmate/_configs/tools/axolotl.yaml +55 -0
- rocmate/_configs/tools/comfyui.yaml +50 -0
- rocmate/_configs/tools/exllamav2.yaml +52 -0
- rocmate/_configs/tools/faster-whisper.yaml +35 -0
- rocmate/_configs/tools/llama-cpp.yaml +50 -0
- rocmate/_configs/tools/ollama.yaml +66 -0
- rocmate/_configs/tools/stable-diffusion-webui.yaml +49 -0
- rocmate/_configs/tools/vllm.yaml +60 -0
- rocmate/cli.py +142 -0
- rocmate/configs.py +98 -0
- rocmate/doctor.py +221 -0
- rocmate/fixer.py +85 -0
- rocmate/gpu.py +172 -0
- rocmate/install.py +147 -0
- rocmate-0.1.0.dist-info/METADATA +99 -0
- rocmate-0.1.0.dist-info/RECORD +20 -0
- rocmate-0.1.0.dist-info/WHEEL +4 -0
- rocmate-0.1.0.dist-info/entry_points.txt +2 -0
- rocmate-0.1.0.dist-info/licenses/LICENSE +21 -0
rocmate/__init__.py
ADDED
rocmate/_configs/tools/axolotl.yaml
ADDED
@@ -0,0 +1,55 @@
+name: Axolotl
+description: Fine-tuning framework for LLMs supporting LoRA, QLoRA, and full fine-tuning.
+homepage: https://github.com/axolotl-ai-cloud/axolotl
+
+chips:
+  gfx1100:
+    status: tested
+    tested_on_rocm: "6.2"
+    notes: >
+      RX 7900 XTX (24 GB) handles QLoRA fine-tuning of 7B–13B models comfortably.
+      Flash-attention 2 works via ROCm CK (install separately). bitsandbytes ROCm fork
+      required for quantized training.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+      PYTORCH_HIP_ALLOC_CONF: "expandable_segments:True"
+    install_hints:
+      - "git clone https://github.com/axolotl-ai-cloud/axolotl && cd axolotl"
+      - "pip install torch --index-url https://download.pytorch.org/whl/rocm6.2"
+      - "pip install packaging ninja && pip install flash-attn --no-build-isolation"
+      - "pip install -e '.[deepspeed]'"
+      - "accelerate launch -m axolotl.cli.train examples/llama-3/qlora.yml"
+
+  gfx1101:
+    status: tested
+    tested_on_rocm: "6.2"
+    notes: RX 7800 XT / RX 7700 XT — QLoRA of 7B models fits in 16 GB. Same setup as gfx1100.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+      PYTORCH_HIP_ALLOC_CONF: "expandable_segments:True"
+    install_hints:
+      - "Same install as gfx1100. Use gradient_checkpointing: true in your YAML for memory savings."
+
+  gfx1030:
+    status: partial
+    tested_on_rocm: "6.2"
+    notes: >
+      RX 6800/6900 — QLoRA of 7B works. Full fine-tuning requires gradient checkpointing.
+      Flash-attention fallback path is slower than RDNA3.
+    env_vars: {}
+    install_hints:
+      - "Same install as gfx1100."
+      - "Add gradient_checkpointing: true and micro_batch_size: 1 to your config."
+
+  gfx1102:
+    status: partial
+    tested_on_rocm: "6.2"
+    notes: >
+      RX 7600 (8 GB) — QLoRA of 7B is tight. Use per_device_train_batch_size: 1 and
+      gradient_checkpointing. Offload optimizer states to CPU if OOM.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+      PYTORCH_HIP_ALLOC_CONF: "expandable_segments:True"
+    install_hints:
+      - "Same install as gfx1100."
+      - "Config: micro_batch_size: 1, gradient_checkpointing: true, optimizer: adamw_8bit"
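The env_vars and the last install hint compose into a single launch recipe. A minimal sketch of that composition (the config path comes from the hint above; everything else is illustrative, not part of the package):

    # Apply the gfx1100 env_vars from axolotl.yaml, then launch training
    # via accelerate, exactly as the final install hint does.
    import os
    import subprocess

    env = {
        **os.environ,
        "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
        "PYTORCH_HIP_ALLOC_CONF": "expandable_segments:True",
    }
    subprocess.run(
        ["accelerate", "launch", "-m", "axolotl.cli.train",
         "examples/llama-3/qlora.yml"],
        env=env,
        check=True,
    )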
rocmate/_configs/tools/comfyui.yaml
ADDED
@@ -0,0 +1,50 @@
+name: ComfyUI
+description: Node-based UI for Stable Diffusion and other image generation models.
+homepage: https://github.com/comfyanonymous/ComfyUI
+
+chips:
+  gfx1100:
+    status: tested
+    tested_on_rocm: "6.2"
+    notes: >
+      Works well on RX 7900 XTX with PyTorch ROCm 6.2+. SDXL runs comfortably in 24 GB VRAM.
+      Flux.1 also works but requires careful memory management.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+      PYTORCH_HIP_ALLOC_CONF: "expandable_segments:True"
+    install_hints:
+      - "Linux: git clone https://github.com/comfyanonymous/ComfyUI && cd ComfyUI"
+      - "python -m venv venv && source venv/bin/activate"
+      - "pip install torch torchvision --index-url https://download.pytorch.org/whl/rocm6.2"
+      - "pip install -r requirements.txt && python main.py --listen"
+      - "Windows (HIP SDK): pip install torch torchvision --index-url https://download.pytorch.org/whl/rocm6.2"
+
+  gfx1101:
+    status: tested
+    tested_on_rocm: "6.2"
+    notes: RX 7800 XT / RX 7700 XT — same setup as gfx1100. 16 GB VRAM limits Flux.1; SDXL and SD 1.5 run well.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+      PYTORCH_HIP_ALLOC_CONF: "expandable_segments:True"
+    install_hints:
+      - "Same install as gfx1100."
+
+  gfx1102:
+    status: partial
+    tested_on_rocm: "6.2"
+    notes: RX 7600 — 8 GB VRAM is tight. SD 1.5 works; SDXL requires --lowvram flag and is slow.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+      PYTORCH_HIP_ALLOC_CONF: "expandable_segments:True"
+    install_hints:
+      - "Same install as gfx1100."
+      - "Launch with: python main.py --lowvram --listen"
+
+  gfx1030:
+    status: partial
+    tested_on_rocm: "6.2"
+    notes: SDXL is slow; SD 1.5 works fine. Memory pressure with larger models.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "10.3.0"
+    install_hints:
+      - "Same as gfx1100 install; expect ~3-4x slower SDXL generation."
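Before launching ComfyUI it is worth confirming that the ROCm wheel from the hints actually sees the card; under ROCm, PyTorch still exposes the device through the torch.cuda API. A quick sanity check, not part of the package:

    import torch

    # True on a working ROCm build; the "cuda" name is reused for HIP devices.
    print(torch.cuda.is_available())
    print(torch.cuda.get_device_name(0))  # e.g. "AMD Radeon RX 7900 XTX"
    print(torch.version.hip)              # HIP runtime version; None on CUDA builds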
rocmate/_configs/tools/exllamav2.yaml
ADDED
@@ -0,0 +1,52 @@
+name: ExLlamaV2
+description: Fast inference library for GPTQ and EXL2 quantized LLMs, optimised for AMD GPUs via ROCm.
+homepage: https://github.com/turboderp/exllamav2
+
+chips:
+  gfx1100:
+    status: tested
+    tested_on_rocm: "6.2"
+    notes: >
+      RX 7900 XTX — excellent performance. ExLlamaV2 is one of the fastest GPTQ/EXL2
+      backends on AMD. Mistral 7B EXL2 4bpw runs at ~80 tok/s. 24 GB allows 34B Q4.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+    install_hints:
+      - "pip install torch --index-url https://download.pytorch.org/whl/rocm6.2"
+      - "pip install exllamav2"
+      - "python test_inference.py -m /path/to/model -p \"Hello world\""
+      - "Or build from source for latest features: git clone https://github.com/turboderp/exllamav2 && pip install -e ."
+      - "cmake -DCMAKE_BUILD_TYPE=Release . && cmake --build . --target exl2 --config Release"
+
+  gfx1101:
+    status: tested
+    tested_on_rocm: "6.2"
+    notes: RX 7800 XT / RX 7700 XT — works well. 16 GB VRAM suits 13B Q4 or 7B Q8 models.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+    install_hints:
+      - "Same install as gfx1100."
+
+  gfx1030:
+    status: partial
+    tested_on_rocm: "6.2"
+    notes: >
+      RX 6800/6900 — functional but kernel tuning may differ from RDNA3.
+      Performance is good for quantized models; throughput ~60-70 % of gfx1100.
+    env_vars: {}
+    install_hints:
+      - "pip install torch --index-url https://download.pytorch.org/whl/rocm6.2"
+      - "pip install exllamav2"
+
+  gfx1102:
+    status: partial
+    tested_on_rocm: "6.2"
+    notes: >
+      RX 7600 (8 GB) — 7B EXL2 4bpw fits with room to spare. 13B Q4 is tight but possible.
+      Monitor VRAM with rocm-smi; context length may need reducing.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+    install_hints:
+      - "pip install torch --index-url https://download.pytorch.org/whl/rocm6.2"
+      - "pip install exllamav2"
+      - "Reduce max_seq_len if VRAM is exhausted."
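The gfx1102 entry suggests reducing max_seq_len when VRAM runs out. One way to decide beforehand is to query free memory from PyTorch, which also works on ROCm builds; a small sketch with an arbitrary threshold:

    import torch

    # mem_get_info() returns (free, total) in bytes for the current device.
    free, total = torch.cuda.mem_get_info()
    print(f"{free / 2**30:.1f} GiB free of {total / 2**30:.1f} GiB")
    if free < 2 * 2**30:  # illustrative 2 GiB headroom threshold
        print("Low headroom: reduce max_seq_len or pick a smaller quant.")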
rocmate/_configs/tools/faster-whisper.yaml
ADDED
@@ -0,0 +1,35 @@
+name: faster-whisper
+description: Fast Whisper inference using CTranslate2. ROCm support via PyTorch backend.
+homepage: https://github.com/SYSTRAN/faster-whisper
+
+chips:
+  gfx1100:
+    status: tested
+    tested_on_rocm: "6.2"
+    notes: >
+      faster-whisper itself targets CUDA; on AMD use the openai-whisper or whisperX
+      route with PyTorch + ROCm, or run faster-whisper on CPU with int8 quantization
+      (still fast for short clips). For GPU on AMD, use openai-whisper-rocm fork.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+    install_hints:
+      - "pip install torch torchaudio --index-url https://download.pytorch.org/whl/rocm6.2"
+      - "Verify torch.cuda.is_available() returns True (yes, 'cuda' under ROCm)"
+      - "For pure faster-whisper: use device='cpu', compute_type='int8' as fallback"
+
+  gfx1101:
+    status: partial
+    tested_on_rocm: "6.2"
+    notes: RX 7800 XT / RX 7700 XT — same PyTorch ROCm approach as gfx1100. Transcription speed similar to gfx1100.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+    install_hints:
+      - "pip install torch torchaudio --index-url https://download.pytorch.org/whl/rocm6.2"
+
+  gfx1030:
+    status: partial
+    tested_on_rocm: "6.2"
+    notes: PyTorch ROCm wheels available, but transcription is slower than gfx1100.
+    env_vars: {}
+    install_hints:
+      - "pip install torch torchaudio --index-url https://download.pytorch.org/whl/rocm6.2"
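The CPU fallback named in the last install hint looks like this in code; the model size and audio path are placeholders:

    from faster_whisper import WhisperModel

    # int8 on CPU is the fallback when no CUDA/HIP device is usable.
    model = WhisperModel("small", device="cpu", compute_type="int8")
    segments, info = model.transcribe("clip.wav")
    print(f"Detected language: {info.language}")
    for segment in segments:
        print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")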
rocmate/_configs/tools/llama-cpp.yaml
ADDED
@@ -0,0 +1,50 @@
+name: llama.cpp
+description: Efficient LLM inference in C++ with HIP backend for AMD GPUs.
+homepage: https://github.com/ggerganov/llama.cpp
+
+chips:
+  gfx1100:
+    status: tested
+    tested_on_rocm: "6.2"
+    notes: >
+      Compile with GGML_HIP=ON. Runs well on RX 7900 XTX; Q4_K_M models up to ~30B fit
+      fully in 24 GB, and 70B runs with partial CPU offload. Pre-built HIP binaries available in GitHub releases.
+    env_vars: {}
+    install_hints:
+      - "Linux: git clone https://github.com/ggerganov/llama.cpp && cd llama.cpp"
+      - "cmake -B build -DGGML_HIP=ON && cmake --build build --config Release -j$(nproc)"
+      - "Windows: download pre-built HIP binary from GitHub Releases (look for 'hip' in filename)"
+      - "Verify GPU is used: ./build/bin/llama-cli -m model.gguf -p 'hello' --n-gpu-layers 99"
+
+  gfx1101:
+    status: tested
+    tested_on_rocm: "6.2"
+    notes: RX 7800 XT / RX 7700 XT — same build as gfx1100. 16 GB VRAM; Q4_K_M models up to 32B.
+    env_vars: {}
+    install_hints:
+      - "Same build as gfx1100."
+
+  gfx1102:
+    status: tested
+    tested_on_rocm: "6.2"
+    notes: RX 7600 — 8 GB VRAM; Q4_K_M models up to 8B. Offload remaining layers to CPU with --n-gpu-layers.
+    env_vars: {}
+    install_hints:
+      - "Same build as gfx1100."
+      - "Use --n-gpu-layers 30 to partially offload larger models."
+
+  gfx1030:
+    status: tested
+    tested_on_rocm: "6.2"
+    notes: RX 6800/6900 — works out of the box with HIP build. Performance slightly below gfx1100.
+    env_vars: {}
+    install_hints:
+      - "Same build as gfx1100."
+
+  gfx1201:
+    status: partial
+    tested_on_rocm: "7.x"
+    notes: RX 9070 — requires ROCm 7.x. HIP build may need HSA_OVERRIDE for gfx120x target.
+    env_vars: {}
+    install_hints:
+      - "Build with ROCm 7.x; report results to the rocmate project."
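The same --n-gpu-layers partial-offload idea is available from Python via the llama-cpp-python binding, an alternative to the C++ CLI the hints use; the model path here is a placeholder:

    from llama_cpp import Llama

    llm = Llama(
        model_path="models/llama-3.1-8b-q4_k_m.gguf",
        n_gpu_layers=30,  # offload 30 layers to the GPU; the rest stay on CPU
        n_ctx=4096,
    )
    out = llm("Q: What is ROCm? A:", max_tokens=64)
    print(out["choices"][0]["text"])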
rocmate/_configs/tools/ollama.yaml
ADDED
@@ -0,0 +1,66 @@
+name: Ollama
+description: Run large language models locally, with built-in ROCm support.
+homepage: https://ollama.com
+
+chips:
+  gfx1100:
+    status: tested
+    tested_on_rocm: "6.3"
+    notes: >
+      Works out of the box on Linux with ROCm 6.x.
+      Tested on RX 7900 XTX (24 GB) running Qwen 2.5 14B and Llama 3.1 8B.
+    env_vars: {}
+    install_hints:
+      - "Linux: curl -fsSL https://ollama.com/install.sh | sh"
+      - "Windows: download the Ollama installer from https://ollama.com/download/windows (ships HIP libs)"
+      - "Verify with: ollama run llama3.1:8b (should hit GPU, not CPU)"
+      - "Watch GPU usage live: watch -n 1 rocm-smi"
+
+  gfx1030:
+    status: tested
+    tested_on_rocm: "6.3"
+    notes: Works on RX 6800/6900 with ROCm 6.x. Some older guides recommend HSA_OVERRIDE — no longer needed.
+    env_vars: {}
+    install_hints:
+      - "curl -fsSL https://ollama.com/install.sh | sh"
+
+  gfx1201:
+    status: partial
+    tested_on_rocm: "7.x"
+    notes: >
+      RX 9070 series needs HIP SDK 7.1+ with gfx120x rocblas libraries.
+      Official Ollama Windows installer ships these as of January 2026.
+    env_vars: {}
+    install_hints:
+      - "On Linux: requires ROCm 7.x — see https://rocm.docs.amd.com"
+      - "On Windows: AMD AI Bundle installer (Adrenalin 26.x+) ships compatible libs."
+
+  gfx1101:
+    status: tested
+    tested_on_rocm: "6.3"
+    notes: RX 7800 XT / RX 7700 XT — works out of the box on Linux with ROCm 6.x, same as gfx1100.
+    env_vars: {}
+    install_hints:
+      - "curl -fsSL https://ollama.com/install.sh | sh"
+
+  gfx1102:
+    status: tested
+    tested_on_rocm: "6.3"
+    notes: RX 7600 — works on Linux with ROCm 6.x. Lower VRAM (8 GB) limits model size; stick to ≤7B Q4.
+    env_vars: {}
+    install_hints:
+      - "curl -fsSL https://ollama.com/install.sh | sh"
+      - "Limit to models ≤7B Q4 due to 8 GB VRAM."
+
+  gfx1034:
+    status: partial
+    tested_on_rocm: "6.3"
+    notes: >
+      RX 6400/6500 XT (Navi 24) — ROCm does not officially support gfx1034.
+      Force compatibility with HSA_OVERRIDE_GFX_VERSION=10.3.0 (treats card as gfx1030).
+      Token generation is slow but functional for small models.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "10.3.0"
+    install_hints:
+      - "Set HSA_OVERRIDE before launching the Ollama service"
+      - "Limit to small models (≤7B Q4) due to lower memory bandwidth"
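For the gfx1034 entry, the override has to be present in the environment of the Ollama server process itself, not just the client shell. A sketch of launching the server that way (systemd users would set an Environment= line in the unit file instead):

    import os
    import subprocess

    # Treat the Navi 24 card as gfx1030, per the gfx1034 notes above.
    env = {**os.environ, "HSA_OVERRIDE_GFX_VERSION": "10.3.0"}
    subprocess.Popen(["ollama", "serve"], env=env)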
rocmate/_configs/tools/stable-diffusion-webui.yaml
ADDED
@@ -0,0 +1,49 @@
+name: Stable Diffusion WebUI
+description: AUTOMATIC1111's browser UI for Stable Diffusion, with ROCm support via PyTorch.
+homepage: https://github.com/AUTOMATIC1111/stable-diffusion-webui
+
+chips:
+  gfx1100:
+    status: tested
+    tested_on_rocm: "6.2"
+    notes: >
+      Works on RX 7900 XTX with PyTorch ROCm wheels. SDXL and SD 1.5 run well.
+      Flux.1 requires additional setup (install flux dependencies separately).
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+      PYTORCH_HIP_ALLOC_CONF: "expandable_segments:True"
+    install_hints:
+      - "git clone https://github.com/AUTOMATIC1111/stable-diffusion-webui && cd stable-diffusion-webui"
+      - "Set TORCH_COMMAND before launch: export TORCH_COMMAND='pip install torch torchvision --index-url https://download.pytorch.org/whl/rocm6.2'"
+      - "Linux launch: ./webui.sh"
+      - "Windows: set TORCH_COMMAND=pip install torch torchvision --index-url https://download.pytorch.org/whl/rocm6.2 && webui-user.bat"
+
+  gfx1101:
+    status: tested
+    tested_on_rocm: "6.2"
+    notes: RX 7800 XT / RX 7700 XT — same setup as gfx1100. 16 GB VRAM; SDXL runs well, Flux.1 tight.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+      PYTORCH_HIP_ALLOC_CONF: "expandable_segments:True"
+    install_hints:
+      - "Same as gfx1100."
+
+  gfx1102:
+    status: partial
+    tested_on_rocm: "6.2"
+    notes: RX 7600 — 8 GB VRAM. SD 1.5 works; SDXL needs --medvram or --lowvram flag and is slow.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+    install_hints:
+      - "Same as gfx1100."
+      - "Launch with: ./webui.sh --medvram"
+
+  gfx1030:
+    status: partial
+    tested_on_rocm: "6.2"
+    notes: RX 6800/6900 — works with HSA_OVERRIDE. SDXL slow; SD 1.5 fine. Use --medvram for stability.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "10.3.0"
+    install_hints:
+      - "Same install as gfx1100."
+      - "Launch with: ./webui.sh --medvram"
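The webui launcher consults TORCH_COMMAND when it creates its venv, which is how the hints swap the default CUDA wheel for the ROCm one. A sketch of the Linux launch with both variables set (flags follow the gfx1102/gfx1030 hints; drop --medvram on 24 GB cards):

    import os
    import subprocess

    env = {
        **os.environ,
        "HSA_OVERRIDE_GFX_VERSION": "11.0.0",
        "TORCH_COMMAND": (
            "pip install torch torchvision "
            "--index-url https://download.pytorch.org/whl/rocm6.2"
        ),
    }
    subprocess.run(["./webui.sh", "--medvram"], env=env, check=True)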
rocmate/_configs/tools/vllm.yaml
ADDED
@@ -0,0 +1,60 @@
+name: vLLM
+description: High-throughput LLM inference server with continuous batching and PagedAttention.
+homepage: https://github.com/vllm-project/vllm
+
+chips:
+  gfx1100:
+    status: tested
+    tested_on_rocm: "6.2"
+    notes: >
+      RX 7900 XTX works well. vLLM pre-allocates GPU memory (90 % by default) so 24 GB lets
+      you run 13–34B models. Use --gpu-memory-utilization to tune. Flash-attention is supported
+      via ROCm's CK library.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+    install_hints:
+      - "python -m venv venv && source venv/bin/activate"
+      - "pip install vllm --extra-index-url https://download.pytorch.org/whl/rocm6.2"
+      - "python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct"
+      - "Verify with: curl http://localhost:8000/v1/models"
+
+  gfx1101:
+    status: tested
+    tested_on_rocm: "6.2"
+    notes: RX 7800 XT / RX 7700 XT — works on Linux. 16 GB VRAM limits to ≤13B models.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+    install_hints:
+      - "Same install as gfx1100. Set --gpu-memory-utilization 0.85 to leave headroom."
+
+  gfx1030:
+    status: partial
+    tested_on_rocm: "6.2"
+    notes: >
+      RX 6800/6900 — works but slower than RDNA3 cards. PagedAttention works.
+      Flash-attention backend may fall back to a slower kernel.
+    env_vars: {}
+    install_hints:
+      - "pip install vllm --extra-index-url https://download.pytorch.org/whl/rocm6.2"
+
+  gfx1102:
+    status: partial
+    tested_on_rocm: "6.2"
+    notes: >
+      RX 7600 (8 GB) — vLLM's default memory pre-allocation requires lowering
+      --gpu-memory-utilization to 0.80 or less. Stick to 7B Q4/Q8 models.
+    env_vars:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+    install_hints:
+      - "pip install vllm --extra-index-url https://download.pytorch.org/whl/rocm6.2"
+      - "python -m vllm.entrypoints.openai.api_server --model <model> --gpu-memory-utilization 0.80"
+
+  gfx1201:
+    status: partial
+    tested_on_rocm: "7.x"
+    notes: >
+      RX 9070 series — requires ROCm 7.x builds of vLLM. Pre-built wheels may not be
+      available; expect to build from source. Flash-attention support still maturing.
+    env_vars: {}
+    install_hints:
+      - "Build from source: git clone https://github.com/vllm-project/vllm && pip install -e . --extra-index-url https://download.pytorch.org/whl/rocm7.x"
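Once the api_server from the hints is up, any OpenAI-style client can talk to it. A dependency-free check against the completions endpoint (the model name matches the launch hint above; prompt and token count are illustrative):

    import json
    import urllib.request

    req = urllib.request.Request(
        "http://localhost:8000/v1/completions",
        data=json.dumps({
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "prompt": "ROCm is",
            "max_tokens": 32,
        }).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        print(json.load(resp)["choices"][0]["text"])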
rocmate/cli.py
ADDED
@@ -0,0 +1,142 @@
+"""rocmate command-line interface."""
+from __future__ import annotations
+
+from typing import Optional
+
+import typer
+from rich.console import Console
+
+from rocmate import configs as configs_module
+from rocmate import doctor as doctor_module
+from rocmate import fixer as fixer_module
+from rocmate import install as install_module
+from rocmate.doctor import Status
+from rocmate.install import InstallError
+
+app = typer.Typer(
+    name="rocmate",
+    help="Curated AMD GPU compatibility index and CLI for AI workloads.",
+    no_args_is_help=True,
+    add_completion=False,
+)
+console = Console()
+
+_STATUS_ICON = {
+    "tested": "[green]✅ tested[/green]",
+    "partial": "[yellow]🟡 partial[/yellow]",
+    "broken": "[red]❌ broken[/red]",
+}
+
+
+@app.command()
+def doctor(
+    tool: Optional[str] = typer.Option(None, "--tool", help="Also show compatibility for a specific tool."),
+    fix: bool = typer.Option(False, "--fix", help="Interactively apply fixes for detected issues."),
+) -> None:
+    """Check whether the system is ready for AI workloads on an AMD GPU."""
+    result = doctor_module.run()
+    doctor_module.render(result, console)
+
+    if tool is not None:
+        try:
+            cfg = configs_module.load_tool(tool)
+        except FileNotFoundError:
+            console.print(f"[red]No config for tool '{tool}'.[/red]")
+            console.print(f"Available tools: {', '.join(configs_module.list_tools())}")
+            raise typer.Exit(code=1)
+
+        console.print(f"\n[bold]Tool compatibility: {cfg.name}[/bold]")
+        if not result.gpu_info:
+            console.print("[yellow] No AMD GPU detected — cannot check tool compatibility.[/yellow]")
+        else:
+            for gpu_info in result.gpu_info:
+                chip = gpu_info.gfx_version
+                if chip in cfg.chips:
+                    support = cfg.chips[chip]
+                    icon = _STATUS_ICON.get(support.status, support.status)
+                    rocm = f" (ROCm {support.tested_on_rocm})" if support.tested_on_rocm else ""
+                    console.print(f" {chip}: {icon}{rocm}")
+                else:
+                    console.print(f" {chip}: [dim]no data[/dim]")
+
+    if fix:
+        fixable = [c for c in result.checks if c.fix and c.status != Status.OK]
+        if not fixable:
+            console.print("[green]Nothing to fix.[/green]")
+        else:
+            for check in fixable:
+                console.print(f"\n[bold]Fix available:[/bold] {check.name}")
+                console.print(f" [dim]{check.fix}[/dim]")
+                if typer.confirm("Apply?", default=False):
+                    fix_result = fixer_module.apply_fix(check)
+                    if fix_result and fix_result.applied:
+                        console.print(f" [green]✓ {fix_result.message}[/green]")
+                    elif fix_result:
+                        console.print(f" [yellow]⚠ {fix_result.message}[/yellow]")
+
+    if result.has_blocking_issues():
+        raise typer.Exit(code=1)
+
+
+@app.command()
+def show(tool: str = typer.Argument(..., help="Tool name, e.g. 'ollama'")) -> None:
+    """Show the tested configuration for a given tool."""
+    try:
+        config = configs_module.load_tool(tool)
+    except FileNotFoundError:
+        console.print(f"[red]No config available for tool '{tool}'.[/red]")
+        console.print(f"Available tools: {', '.join(configs_module.list_tools())}")
+        raise typer.Exit(code=1)
+    configs_module.render(config, console)
+
+
+@app.command(name="list")
+def list_tools() -> None:
+    """List all tools with tested configurations."""
+    tools = configs_module.list_tools()
+    console.print(f"[bold]Available tools ({len(tools)}):[/bold]")
+    for t in tools:
+        console.print(f" • {t}")
+
+
+@app.command()
+def install(
+    tool: str = typer.Argument(..., help="Tool to install, e.g. 'ollama'"),
+    docker: bool = typer.Option(False, "--docker", help="Print a Docker Compose snippet instead."),
+) -> None:
+    """Install a tool with the correct ENV vars and pip indexes for your AMD GPU."""
+    from rocmate import gpu as gpu_module
+
+    gpus = gpu_module.detect_amd_gpus()
+    if not gpus:
+        console.print("[red]No AMD GPU detected — cannot determine install config.[/red]")
+        raise typer.Exit(code=1)
+
+    chip = gpus[0].gfx_version
+    try:
+        plan = install_module.build_plan(tool, chip)
+    except FileNotFoundError:
+        console.print(f"[red]No config for tool '{tool}'.[/red]")
+        console.print(f"Available tools: {', '.join(configs_module.list_tools())}")
+        raise typer.Exit(code=1)
+    except KeyError:
+        console.print(f"[yellow]No install config for {chip} + {tool}.[/yellow]")
+        raise typer.Exit(code=1)
+
+    if docker:
+        console.print(install_module.render_docker_compose(plan))
+        return
+
+    install_module.render_dry_run(plan, console)
+
+    if typer.confirm("Install now?", default=False):
+        try:
+            install_module.execute(plan)
+            console.print("[green]Done.[/green]")
+        except InstallError as e:
+            console.print(f"[red]Install failed:[/red] {e}")
+            raise typer.Exit(code=1)
+
+
+if __name__ == "__main__":
+    app()
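The app object can be exercised in-process with typer's test runner, which is a convenient way to check the exit-code contract (exit 1 on unknown tools or blocking issues) without installing the entry point. A sketch:

    from typer.testing import CliRunner

    from rocmate.cli import app

    runner = CliRunner()
    print(runner.invoke(app, ["list"]).output)                      # bullet list of tools
    print(runner.invoke(app, ["show", "ollama"]).exit_code)         # 0 when a config exists
    print(runner.invoke(app, ["show", "no-such-tool"]).exit_code)   # 1 per the except branch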
rocmate/configs.py
ADDED
@@ -0,0 +1,98 @@
+"""Tool configuration loading and rendering."""
+from __future__ import annotations
+
+from importlib.resources import files
+from pathlib import Path
+from typing import Optional
+
+import yaml
+from pydantic import BaseModel, Field
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+
+
+class ChipSupport(BaseModel):
+    status: str  # "tested" | "partial" | "broken"
+    tested_on_rocm: Optional[str] = None  # e.g. "6.2", "6.3" — None = unknown
+    notes: Optional[str] = None
+    env_vars: dict[str, str] = Field(default_factory=dict)
+    install_hints: list[str] = Field(default_factory=list)
+
+
+class ToolConfig(BaseModel):
+    name: str
+    description: str
+    homepage: Optional[str] = None
+    chips: dict[str, ChipSupport] = Field(default_factory=dict)
+
+
+def _configs_dir() -> Path:
+    """Locate the bundled configs directory."""
+    # Dev mode: configs/ next to src/ (repo checkout)
+    here = Path(__file__).resolve()
+    for parent in here.parents:
+        candidate = parent / "configs" / "tools"
+        if candidate.is_dir():
+            return candidate
+    # Installed: configs live under rocmate/_configs (via hatch force-include)
+    try:
+        installed = Path(str(files("rocmate").joinpath("_configs/tools")))
+        if installed.is_dir():
+            return installed
+    except (ModuleNotFoundError, FileNotFoundError):
+        pass
+    raise FileNotFoundError("Could not locate rocmate configs directory")
+
+
+def list_tools() -> list[str]:
+    return sorted(p.stem for p in _configs_dir().glob("*.yaml"))
+
+
+def load_tool(tool: str) -> ToolConfig:
+    path = _configs_dir() / f"{tool}.yaml"
+    if not path.is_file():
+        raise FileNotFoundError(f"No config for tool '{tool}'")
+    data = yaml.safe_load(path.read_text())
+    return ToolConfig(**data)
+
+
+_STATUS_STYLE = {
+    "tested": "[green]✅ tested[/green]",
+    "partial": "[yellow]🟡 partial[/yellow]",
+    "broken": "[red]❌ broken[/red]",
+}
+
+
+def render(config: ToolConfig, console: Console) -> None:
+    header = f"[bold]{config.name}[/bold]\n{config.description}"
+    if config.homepage:
+        header += f"\n[dim]{config.homepage}[/dim]"
+    console.print(Panel(header, expand=False))
+
+    table = Table(title="Chip support", show_lines=False)
+    table.add_column("Chip")
+    table.add_column("Status")
+    table.add_column("ROCm")
+    table.add_column("Notes")
+    for chip, support in config.chips.items():
+        table.add_row(
+            chip,
+            _STATUS_STYLE.get(support.status, support.status),
+            support.tested_on_rocm or "[dim]?[/dim]",
+            support.notes or "",
+        )
+    console.print(table)
+
+    for chip, support in config.chips.items():
+        if not support.env_vars and not support.install_hints:
+            continue
+        console.print(f"\n[bold]{chip}[/bold]")
+        if support.env_vars:
+            console.print(" Environment:")
+            for k, v in support.env_vars.items():
+                console.print(f" [cyan]export {k}={v}[/cyan]")
+        if support.install_hints:
+            console.print(" Install hints:")
+            for hint in support.install_hints:
+                console.print(f" • {hint}")
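A minimal round trip through these models, assuming the package (or a repo checkout) is importable so _configs_dir() resolves:

    from rich.console import Console

    from rocmate.configs import list_tools, load_tool, render

    print(list_tools())                 # ['axolotl', 'comfyui', 'exllamav2', ...]
    cfg = load_tool("ollama")
    print(cfg.chips["gfx1100"].status)  # "tested"
    render(cfg, Console())              # panel, chip-support table, per-chip hints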