legion-llm 0.9.54 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +72 -0
- data/Gemfile +1 -0
- data/bin/h200-setup-postreboot.sh +266 -0
- data/bin/h200-setup-prereboot.sh +228 -0
- data/bin/h200-setup-remaining-prereboot.sh +187 -0
- data/bin/h200-setup-resume-safe.sh +279 -0
- data/legion-llm.gemspec +1 -0
- data/lib/legion/llm/api/auth.rb +14 -2
- data/lib/legion/llm/api/namespaces/anthropic/files.rb +303 -0
- data/lib/legion/llm/api/namespaces/anthropic/messages/batches.rb +181 -0
- data/lib/legion/llm/api/namespaces/anthropic/messages/count_tokens.rb +49 -0
- data/lib/legion/llm/api/namespaces/anthropic/messages.rb +190 -0
- data/lib/legion/llm/api/namespaces/anthropic/models.rb +48 -0
- data/lib/legion/llm/api/namespaces/helpers.rb +52 -0
- data/lib/legion/llm/api/namespaces/native/chat.rb +133 -0
- data/lib/legion/llm/api/namespaces/native/inference.rb +260 -0
- data/lib/legion/llm/api/namespaces/native/instances.rb +64 -0
- data/lib/legion/llm/api/namespaces/native/models.rb +63 -0
- data/lib/legion/llm/api/namespaces/native/offerings.rb +57 -0
- data/lib/legion/llm/api/namespaces/native/providers.rb +87 -0
- data/lib/legion/llm/api/namespaces/native/routing.rb +59 -0
- data/lib/legion/llm/api/namespaces/native/tiers.rb +194 -0
- data/lib/legion/llm/api/namespaces/openai/audio/speech.rb +208 -0
- data/lib/legion/llm/api/namespaces/openai/audio/transcriptions.rb +210 -0
- data/lib/legion/llm/api/namespaces/openai/audio/translations.rb +205 -0
- data/lib/legion/llm/api/namespaces/openai/batches.rb +311 -0
- data/lib/legion/llm/api/namespaces/openai/chat/completions.rb +287 -0
- data/lib/legion/llm/api/namespaces/openai/chat/messages.rb +39 -0
- data/lib/legion/llm/api/namespaces/openai/completions.rb +107 -0
- data/lib/legion/llm/api/namespaces/openai/conversations/items.rb +184 -0
- data/lib/legion/llm/api/namespaces/openai/conversations.rb +149 -0
- data/lib/legion/llm/api/namespaces/openai/embeddings.rb +68 -0
- data/lib/legion/llm/api/namespaces/openai/files.rb +219 -0
- data/lib/legion/llm/api/namespaces/openai/images.rb +327 -0
- data/lib/legion/llm/api/namespaces/openai/models.rb +114 -0
- data/lib/legion/llm/api/namespaces/openai/moderations.rb +209 -0
- data/lib/legion/llm/api/namespaces/openai/responses.rb +355 -0
- data/lib/legion/llm/api/namespaces/openai/uploads/parts.rb +102 -0
- data/lib/legion/llm/api/namespaces/openai/uploads.rb +175 -0
- data/lib/legion/llm/api/namespaces/openai/vector_stores/file_batches.rb +295 -0
- data/lib/legion/llm/api/namespaces/openai/vector_stores/files.rb +291 -0
- data/lib/legion/llm/api/namespaces/openai/vector_stores.rb +281 -0
- data/lib/legion/llm/api/namespaces/registration.rb +189 -0
- data/lib/legion/llm/api/native/helpers.rb +1 -1
- data/lib/legion/llm/api/native/providers.rb +1 -1
- data/lib/legion/llm/api/openai/chat_completions.rb +157 -26
- data/lib/legion/llm/api/openai/responses.rb +29 -4
- data/lib/legion/llm/api/shared_helpers.rb +422 -0
- data/lib/legion/llm/api/translators/anthropic_request.rb +80 -28
- data/lib/legion/llm/api/translators/anthropic_response.rb +11 -5
- data/lib/legion/llm/api/translators/openai_request.rb +28 -2
- data/lib/legion/llm/api/translators/openai_response.rb +15 -1
- data/lib/legion/llm/api.rb +20 -9
- data/lib/legion/llm/call/dispatch.rb +1 -1
- data/lib/legion/llm/call/lex_llm_adapter.rb +42 -10
- data/lib/legion/llm/context/curator.rb +44 -7
- data/lib/legion/llm/inference/conversation.rb +1 -1
- data/lib/legion/llm/inference/executor.rb +23 -6
- data/lib/legion/llm/inference/native_tool_loop.rb +2 -1
- data/lib/legion/llm/inference/steps/debate.rb +1 -1
- data/lib/legion/llm/inference/steps/knowledge_capture.rb +1 -1
- data/lib/legion/llm/inference/steps/logging.rb +1 -1
- data/lib/legion/llm/inference/steps/rag_context.rb +4 -4
- data/lib/legion/llm/inference/steps/rag_guard.rb +1 -1
- data/lib/legion/llm/inference/steps/span_annotator.rb +1 -1
- data/lib/legion/llm/inference/steps/sticky_persist.rb +1 -1
- data/lib/legion/llm/inference/steps/token_budget.rb +1 -1
- data/lib/legion/llm/inference/steps/tool_calls.rb +3 -3
- data/lib/legion/llm/inference/steps/tool_history.rb +1 -1
- data/lib/legion/llm/inference/steps/trigger_match.rb +12 -2
- data/lib/legion/llm/router.rb +12 -1
- data/lib/legion/llm/settings.rb +6 -5
- data/lib/legion/llm/token_estimation.rb +48 -0
- data/lib/legion/llm/types/message.rb +0 -1
- data/lib/legion/llm/vector_store/storage.rb +141 -0
- data/lib/legion/llm/version.rb +1 -1
- metadata +57 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 94a90f69b3940eb805b6b7bf0b15eb30aa8c616e66562560df01d3f0d5b24ca2
|
|
4
|
+
data.tar.gz: 4a90c910e19cec4b3da0e5e44a00d1076495d41d653ad4f7161b3b38892f8d8b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e5f444b505e4b75937223475300cd4bae1e2f292f93148fbb3f0a06446ae6b90b0bf482a7d308a3e580e55219ddc4eb6f0e9274f6479c6e96619d54946cc0ef6
|
|
7
|
+
data.tar.gz: 6a9dfedae0e30d61f1bf8c4dc825764e4e9174829cc2d1ea6ee65a00ab7dea2cec5299e5988117aacd24adba93d956970124e00c11a1360bacfab04aa9cf6044
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,71 @@
|
|
|
1
1
|
# Legion LLM Changelog
|
|
2
2
|
|
|
3
|
+
|
|
4
|
+
## [0.10.1] - 2026-05-29
|
|
5
|
+
|
|
6
|
+
### Fixed
|
|
7
|
+
- **Anthropic message format translation** — `AnthropicRequest` translator now properly converts `tool_use` content blocks to `tool_calls` arrays and `tool_result` blocks to `role: :tool` messages. This was the root cause of vLLM outputting JSON blobs instead of calling tools — it was receiving malformed messages with raw Anthropic content block hashes in the content field.
|
|
8
|
+
- **Streaming tool_use event ordering** — Rewrote namespace streaming to emit tool_use content blocks inline after stream completes. Text `content_block_start` is deferred until actual text arrives, matching real Anthropic API behavior for tool-only responses.
|
|
9
|
+
- **Curator current-turn preservation** — `curate_turn` now only curates messages older than the current turn. `apply_curation_pipeline` preserves recent turns by counting user messages (`preserve_recent_turns` setting, default 2). Prevents the model from losing context of its recent work.
|
|
10
|
+
- **LexLLMAdapter TypeError flood** — Guarded `text_part_content` and `defined_method_access` against Array inputs. Flattened nested content arrays before iterating.
|
|
11
|
+
- **Rescue log levels** — All rescue blocks in inference steps, executor, dispatch, and adapter now use `:warn` or `:error` (never `:debug`). Caught exceptions should be visible.
|
|
12
|
+
- **Thinking override removed** — No longer forces `thinking: { enabled: false }` for vLLM with tools. Lets the model use its thinking template properly.
|
|
13
|
+
- **Model routing hardcode removed** — Anthropic namespace no longer hardcodes `routing: { model: 'legionio' }`. Uses daemon default_provider/default_model from settings.
|
|
14
|
+
|
|
15
|
+
### Changed
|
|
16
|
+
- **RAG defaults** — `min_confidence` 0.85 → 0.92, `full_limit` 10 → 5, `compact_limit` 5 → 3. Tighter relevance threshold, fewer context injections.
|
|
17
|
+
- **AnthropicResponse translator** — `extract_tool_calls`, `format_stop_reason`, `token_count` are now public methods (needed by streaming handler).
|
|
18
|
+
- **Message.build debug log removed** — Fired dozens of times per request with no diagnostic value.
|
|
19
|
+
- **format_chunk log** — Now includes actual text content and index for debugging stream flow.
|
|
20
|
+
- **Curator debug logs** — Show full before/after content (no truncation).
|
|
21
|
+
- **Request info log** — Namespace handler logs message count, char count, estimated tokens, and tool count at request start.
|
|
22
|
+
- **Stream completion log** — Logs tool_calls count, stop_reason, and text length after streaming.
|
|
23
|
+
|
|
24
|
+
## [0.10.0] - 2026-05-29
|
|
25
|
+
|
|
26
|
+
### Added
|
|
27
|
+
- **Sinatra Namespace API** — complete API refactor using `sinatra-contrib` namespaces with thin route blocks. Enabled via `settings[:llm][:api][:use_namespaces] = true`.
|
|
28
|
+
- **OpenAI-compatible endpoints (full surface):**
|
|
29
|
+
- `POST /v1/responses` — Responses API with typed SSE streaming (Codex CLI drop-in)
|
|
30
|
+
- `GET/DELETE /v1/responses/:id`, `POST /v1/responses/:id/cancel`, `GET /v1/responses/:id/input_items`
|
|
31
|
+
- `POST /v1/chat/completions` — streaming (`data: [DONE]`) and sync
|
|
32
|
+
- `GET/POST/DELETE /v1/chat/completions/:id`, `GET /v1/chat/completions/:id/messages`
|
|
33
|
+
- `GET /v1/models`, `GET /v1/models/:id` — with Anthropic format branching via `detect_client`
|
|
34
|
+
- `POST /v1/embeddings`, `POST /v1/completions` (legacy)
|
|
35
|
+
- `POST /v1/images/generations`, `/edits`, `/variations`
|
|
36
|
+
- `POST /v1/audio/transcriptions`, `/translations`, `/speech`
|
|
37
|
+
- `POST /v1/moderations`
|
|
38
|
+
- `POST/GET /v1/conversations`, CRUD + items
|
|
39
|
+
- `POST/GET /v1/batches`, cancel
|
|
40
|
+
- `POST/GET /v1/files`, content download, delete
|
|
41
|
+
- `POST /v1/uploads`, parts, cancel, complete
|
|
42
|
+
- `POST/GET /v1/vector_stores`, search, files, file batches
|
|
43
|
+
- **Anthropic-compatible endpoints (full surface):**
|
|
44
|
+
- `POST /v1/messages` — Messages API with correct SSE event ordering (Claude Code drop-in)
|
|
45
|
+
- `POST /v1/messages/count_tokens` — token estimation
|
|
46
|
+
- `POST/GET/DELETE /v1/messages/batches` — full batch CRUD + JSONL results
|
|
47
|
+
- `GET /v1/models` — Anthropic format via detect_client header branching
|
|
48
|
+
- Anthropic files namespace (standalone deployment mode)
|
|
49
|
+
- **Native namespace endpoints** — all `/api/llm/*` routes ported to namespace pattern
|
|
50
|
+
- **SharedHelpers module** (`lib/legion/llm/api/shared_helpers.rb`) — extracted from `native/helpers.rb`, shared by both legacy and namespace routes
|
|
51
|
+
- **TokenEstimation module** (`lib/legion/llm/token_estimation.rb`) — character-based token counting for `count_tokens` endpoint
|
|
52
|
+
- **Client detection** — `detect_client(env)` determines OpenAI vs Anthropic client from headers (anthropic-version, x-api-key)
|
|
53
|
+
- **Auth middleware** — now returns format-appropriate error shapes (OpenAI vs Anthropic) based on detected client
|
|
54
|
+
- **VectorStore::Storage** — table DDL, cosine similarity, chunk_text utilities for vector store endpoints
|
|
55
|
+
- **Codex CLI conformance tests** — verifies exact streaming event order for drop-in compatibility
|
|
56
|
+
- **Claude Code conformance tests** — verifies exact SSE event order, tool use, system prompt handling
|
|
57
|
+
|
|
58
|
+
### Changed
|
|
59
|
+
- `sinatra-contrib` added as runtime dependency (>= 2.0)
|
|
60
|
+
- `rack-test` added to development dependencies
|
|
61
|
+
- API registration now conditional: `use_namespaces: true` uses `Namespaces::Registration`, `false` (default) uses legacy flat routes
|
|
62
|
+
- All namespace inference routes go through full 18-step `Inference::Executor` pipeline (Gaia, RAG, metering, audit, escalation, RBAC, tools, etc.)
|
|
63
|
+
|
|
64
|
+
### Fixed
|
|
65
|
+
- Auth before filter returns Anthropic-shaped error for Anthropic clients (was always OpenAI-shaped)
|
|
66
|
+
- Anthropic streaming event order: `message_start` now emitted before any `content_block_delta` events
|
|
67
|
+
- `/v1/models` path conflict resolved — single handler with client detection branching
|
|
68
|
+
|
|
3
69
|
## [0.9.54] - 2026-05-29
|
|
4
70
|
|
|
5
71
|
### Fixed
|
|
@@ -13,6 +79,12 @@
|
|
|
13
79
|
|
|
14
80
|
## [0.9.52] - 2026-05-27
|
|
15
81
|
|
|
82
|
+
### Added
|
|
83
|
+
- API: `/v1/chat/completions` now has full pipeline feature parity with the native `/api/llm/inference` endpoint — routing, escalation, RAG context injection, Gaia advisory, knowledge capture, tool discovery, sticky runners, confidence scoring, metering, and debate all activate when the relevant fields are provided.
|
|
84
|
+
- API: `/v1/chat/completions` accepts extended fields via request body (`conversation_id`, `provider`, `tier`, `instance`, `cwd`, `requested_tools`, `client_tool_passthrough`, `caller`) or via `X-Legion-*` headers (`X-Legion-Conversation-Id`, `X-Legion-Provider`, `X-Legion-Tier`, `X-Legion-Instance`, `X-Legion-Cwd`, `X-Legion-Client-Tool-Passthrough`). Headers take precedence for scalar values.
|
|
85
|
+
- API: `/v1/chat/completions` now performs pre-pipeline Gaia ingest (mirrors native endpoint awareness).
|
|
86
|
+
- API: `/v1/chat/completions` reasoning/thinking token streaming via `include_reasoning: true` (or `include_thinking: true`). Emits `reasoning_content` delta chunks in OpenAI format. Non-streaming responses include `reasoning_content` in the message body.
|
|
87
|
+
|
|
16
88
|
### Fixed
|
|
17
89
|
- Discovery: `verify_embedding` now checks `model_available?` for Ollama instead of blindly returning true — prevents `can_embed?` from reporting true when the embedding model (e.g. `mxbai-embed-large`) hasn't been pulled on the local node
|
|
18
90
|
- Discovery: `detect_embedding_from_registry` now calls `verify_embedding` before setting `@can_embed = true`, closing a gap where registry-declared capability metadata was trusted without verifying the model exists locally
|
data/Gemfile
CHANGED
data/bin/h200-setup-postreboot.sh
ADDED
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
set -euo pipefail
|
|
3
|
+
|
|
4
|
+
# DGX/HGX H200 SXM 8-GPU Setup Script (Post-Reboot)
|
|
5
|
+
# Run as root after reboot: su - then bash h200-setup-postreboot.sh
|
|
6
|
+
# Requires: h200-setup-prereboot.sh completed + reboot done
|
|
7
|
+
|
|
8
|
+
echo "=== Verifying root ==="
|
|
9
|
+
if [ "$(id -u)" -ne 0 ]; then
|
|
10
|
+
echo "ERROR: Must run as root (su -)"
|
|
11
|
+
exit 1
|
|
12
|
+
fi
|
|
13
|
+
|
|
14
|
+
echo "=== Verifying H200 PCI BAR recovery settings ==="
|
|
15
|
+
if ! grep -qw 'pci=realloc=off' /proc/cmdline; then
|
|
16
|
+
echo "ERROR: pci=realloc=off is not active. Refusing to continue because Linux may zero H200 BARs."
|
|
17
|
+
echo "Current cmdline: $(cat /proc/cmdline)"
|
|
18
|
+
echo "Expected grub setting: GRUB_CMDLINE_LINUX_DEFAULT=\"quiet pci=realloc=off\""
|
|
19
|
+
exit 1
|
|
20
|
+
fi
|
|
21
|
+
|
|
22
|
+
for bdf in 04 14 64 77 84 94 e9 f6; do
|
|
23
|
+
resource="/sys/bus/pci/devices/0000:${bdf}:00.0/resource"
|
|
24
|
+
if [ ! -r "$resource" ]; then
|
|
25
|
+
echo "ERROR: GPU 0000:${bdf}:00.0 is missing from sysfs."
|
|
26
|
+
exit 1
|
|
27
|
+
fi
|
|
28
|
+
|
|
29
|
+
if awk 'NR==1||NR==3||NR==5 { if ($1 == "0x0000000000000000") bad = 1 } END { exit bad }' "$resource"; then
|
|
30
|
+
:
|
|
31
|
+
else
|
|
32
|
+
echo "ERROR: GPU 0000:${bdf}:00.0 has an unassigned BAR."
|
|
33
|
+
awk 'NR==1||NR==3||NR==5{print}' "$resource"
|
|
34
|
+
exit 1
|
|
35
|
+
fi
|
|
36
|
+
done
|
|
37
|
+
|
|
38
|
+
echo "=== Verifying NVMe mounts ==="
|
|
39
|
+
if ! mountpoint -q /data; then
|
|
40
|
+
echo "ERROR: /data is not mounted. Check fstab."
|
|
41
|
+
exit 1
|
|
42
|
+
fi
|
|
43
|
+
if ! mountpoint -q /srv/legion; then
|
|
44
|
+
echo "ERROR: /srv/legion is not mounted. Check fstab."
|
|
45
|
+
exit 1
|
|
46
|
+
fi
|
|
47
|
+
|
|
48
|
+
echo "=== Verifying NVIDIA driver ==="
|
|
49
|
+
if ! timeout 60 nvidia-smi; then
|
|
50
|
+
echo "ERROR: nvidia-smi failed. Driver not loaded."
|
|
51
|
+
echo "Check: dmesg | grep -i nvidia"
|
|
52
|
+
echo "Try: apt install -y nvidia-open && reboot"
|
|
53
|
+
exit 1
|
|
54
|
+
fi
|
|
55
|
+
|
|
56
|
+
GPU_COUNT=$(timeout 60 nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
|
|
57
|
+
echo "Detected $GPU_COUNT GPUs"
|
|
58
|
+
if [ "$GPU_COUNT" -ne 8 ]; then
|
|
59
|
+
echo "ERROR: Expected 8 H200 GPUs, found $GPU_COUNT"
|
|
60
|
+
echo "Check: lspci | grep -i nvidia"
|
|
61
|
+
exit 1
|
|
62
|
+
fi
|
|
63
|
+
|
|
64
|
+
echo "=== Starting nvidia-fabricmanager (NVSwitch required) ==="
|
|
65
|
+
# Tolerate a failed start here: with `set -e` active a non-zero exit would abort
# the script before the status check that follows can print its diagnostics.
systemctl start nvidia-fabricmanager || true
|
|
66
|
+
if ! systemctl status nvidia-fabricmanager --no-pager -l; then
|
|
67
|
+
echo "ERROR: fabricmanager failed to start. NVSwitch won't work without it."
|
|
68
|
+
echo "Check: journalctl -u nvidia-fabricmanager"
|
|
69
|
+
exit 1
|
|
70
|
+
fi
|
|
71
|
+
|
|
72
|
+
echo "=== GPU persistence mode ==="
|
|
73
|
+
nvidia-smi -pm 1
|
|
74
|
+
|
|
75
|
+
echo "=== Verifying NVSwitch topology ==="
|
|
76
|
+
timeout 60 nvidia-smi topo -m
|
|
77
|
+
echo ""
|
|
78
|
+
echo "All GPUs should show NV18 (NVSwitch full mesh) connections."
|
|
79
|
+
echo ""
|
|
80
|
+
timeout 60 nvidia-smi nvlink -s
|
|
81
|
+
|
|
82
|
+
echo "=== CUDA library path ==="
|
|
83
|
+
CUDA_PATH=$(find /usr/local -maxdepth 1 -name "cuda-12*" -type d | sort -V | tail -1)
|
|
84
|
+
if [ -z "$CUDA_PATH" ]; then
|
|
85
|
+
echo "WARNING: CUDA toolkit directory not found under /usr/local"
|
|
86
|
+
CUDA_PATH="/usr/local/cuda-12.8"
|
|
87
|
+
fi
|
|
88
|
+
echo "$CUDA_PATH/lib64" > /etc/ld.so.conf.d/cuda.conf
|
|
89
|
+
ldconfig
|
|
90
|
+
ln -sf "$CUDA_PATH" /usr/local/cuda
|
|
91
|
+
|
|
92
|
+
echo "=== CPU performance governor ==="
|
|
93
|
+
for gov in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
|
|
94
|
+
echo performance > "$gov" 2>/dev/null || true
|
|
95
|
+
done
|
|
96
|
+
echo never > /sys/kernel/mm/transparent_hugepage/defrag
|
|
97
|
+
echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
|
|
98
|
+
|
|
99
|
+
echo "=== NUMA topology check ==="
|
|
100
|
+
numactl --hardware
|
|
101
|
+
echo ""
|
|
102
|
+
echo "For optimal GPU-CPU affinity, vLLM will use all NUMA nodes."
|
|
103
|
+
|
|
104
|
+
echo "=== Setting up Ollama ownership ==="
|
|
105
|
+
chown -R ollama:ollama /usr/share/ollama/.ollama
|
|
106
|
+
chown -R ollama:llm /srv/legion/ollama
|
|
107
|
+
chmod -R 775 /srv/legion/ollama
|
|
108
|
+
|
|
109
|
+
echo "=== Restarting Ollama ==="
|
|
110
|
+
systemctl restart ollama
|
|
111
|
+
sleep 2
|
|
112
|
+
systemctl status ollama --no-pager
|
|
113
|
+
|
|
114
|
+
echo "=== Installing vLLM ==="
|
|
115
|
+
if [ ! -d "/data/vllm/env" ]; then
|
|
116
|
+
python3 -m venv /data/vllm/env
|
|
117
|
+
fi
|
|
118
|
+
|
|
119
|
+
mkdir -p /data/tmp /data/cache/pip
|
|
120
|
+
source /data/vllm/env/bin/activate
|
|
121
|
+
TMPDIR=/data/tmp PIP_CACHE_DIR=/data/cache/pip pip install vllm xxhash
|
|
122
|
+
deactivate
|
|
123
|
+
|
|
124
|
+
echo "=== Creating vLLM config ==="
|
|
125
|
+
cat > /data/vllm/config.yaml << 'EOF'
|
|
126
|
+
# Model (rsync from Apollo: rsync -avP root@10.11.164.93:/data/models/qwen3.6-27b/ /data/models/qwen3.6-27b/)
|
|
127
|
+
model: /data/models/qwen3.6-27b
|
|
128
|
+
dtype: bfloat16
|
|
129
|
+
tensor-parallel-size: 8
|
|
130
|
+
served-model-name: qwen3.6-27b
|
|
131
|
+
|
|
132
|
+
# Context & Memory — H200 141GB per card, 1.1TB total
|
|
133
|
+
max-model-len: 131072
|
|
134
|
+
gpu-memory-utilization: 0.95
|
|
135
|
+
enable-prefix-caching: true
|
|
136
|
+
prefix-caching-hash-algo: xxhash
|
|
137
|
+
|
|
138
|
+
# Scheduling & Batching — aggressive for 8x H200
|
|
139
|
+
max-num-seqs: 128
|
|
140
|
+
max-num-batched-tokens: 131072
|
|
141
|
+
enable-chunked-prefill: true
|
|
142
|
+
scheduling-policy: fcfs
|
|
143
|
+
performance-mode: throughput
|
|
144
|
+
async-scheduling: true
|
|
145
|
+
|
|
146
|
+
# Model Loading
|
|
147
|
+
load-format: auto
|
|
148
|
+
safetensors-load-strategy: mmap
|
|
149
|
+
|
|
150
|
+
# Streaming
|
|
151
|
+
stream-interval: 1
|
|
152
|
+
|
|
153
|
+
# Thinking/Reasoning
|
|
154
|
+
reasoning-parser: deepseek_v3
|
|
155
|
+
|
|
156
|
+
# Tool calling
|
|
157
|
+
enable-auto-tool-choice: true
|
|
158
|
+
tool-call-parser: hermes
|
|
159
|
+
|
|
160
|
+
# Server
|
|
161
|
+
host: 0.0.0.0
|
|
162
|
+
port: 8000
|
|
163
|
+
|
|
164
|
+
# Monitoring
|
|
165
|
+
enable-server-load-tracking: true
|
|
166
|
+
enable-prompt-tokens-details: true
|
|
167
|
+
enable-log-requests: true
|
|
168
|
+
kv-cache-metrics: true
|
|
169
|
+
enable-mfu-metrics: true
|
|
170
|
+
enable-logging-iteration-details: true
|
|
171
|
+
EOF
|
|
172
|
+
|
|
173
|
+
echo "=== Creating vLLM systemd service ==="
|
|
174
|
+
cat > /etc/systemd/system/vllm.service << 'EOF'
|
|
175
|
+
[Unit]
|
|
176
|
+
Description=vLLM Inference Server (8x H200 SXM)
|
|
177
|
+
After=network.target nvidia-persistenced.service nvidia-fabricmanager.service
|
|
178
|
+
Wants=nvidia-persistenced.service nvidia-fabricmanager.service
|
|
179
|
+
|
|
180
|
+
[Service]
|
|
181
|
+
Type=simple
|
|
182
|
+
User=vllm
|
|
183
|
+
Group=llm
|
|
184
|
+
Environment="PATH=/data/vllm/env/bin:/usr/local/bin:/usr/bin:/bin"
|
|
185
|
+
Environment="TMPDIR=/data/tmp"
|
|
186
|
+
Environment="TORCHINDUCTOR_CACHE_DIR=/data/cache/torchinductor"
|
|
187
|
+
Environment="TRITON_CACHE_DIR=/data/cache/triton"
|
|
188
|
+
Environment="HF_HOME=/data/cache/huggingface"
|
|
189
|
+
Environment="LD_LIBRARY_PATH=/usr/local/cuda/lib64"
|
|
190
|
+
ExecStart=/data/vllm/env/bin/python -m vllm.entrypoints.openai.api_server --config /data/vllm/config.yaml
|
|
191
|
+
Restart=on-failure
|
|
192
|
+
RestartSec=10
|
|
193
|
+
LimitNOFILE=1048576
|
|
194
|
+
LimitMEMLOCK=infinity
|
|
195
|
+
StandardOutput=append:/srv/legion/logs/vllm/vllm.log
|
|
196
|
+
StandardError=append:/srv/legion/logs/vllm/vllm-error.log
|
|
197
|
+
|
|
198
|
+
[Install]
|
|
199
|
+
WantedBy=multi-user.target
|
|
200
|
+
EOF
|
|
201
|
+
|
|
202
|
+
echo "=== Setting ownership ==="
|
|
203
|
+
chown -R vllm:llm /data/vllm /data/models /data/cache /data/tmp /srv/legion/logs/vllm
|
|
204
|
+
|
|
205
|
+
systemctl daemon-reload
|
|
206
|
+
systemctl enable vllm
|
|
207
|
+
|
|
208
|
+
echo "=== Checking for model files ==="
|
|
209
|
+
if [ -d "/data/models/qwen3.6-27b" ] && [ "$(find /data/models/qwen3.6-27b -name '*.safetensors' 2>/dev/null | wc -l)" -gt 0 ]; then
|
|
210
|
+
echo "Model found at /data/models/qwen3.6-27b"
|
|
211
|
+
echo "=== Starting vLLM ==="
|
|
212
|
+
rm -f /dev/shm/psm_* /dev/shm/sem.mp-* /dev/shm/VLLM_*
|
|
213
|
+
systemctl start vllm
|
|
214
|
+
echo ""
|
|
215
|
+
echo "vLLM is starting. First startup compiles CUDA graphs (~5-8 min on H200)."
|
|
216
|
+
echo "Subsequent restarts load from cache (~20 sec)."
|
|
217
|
+
echo "Monitor: journalctl -u vllm -f"
|
|
218
|
+
echo "Logs: tail -f /srv/legion/logs/vllm/vllm.log"
|
|
219
|
+
else
|
|
220
|
+
echo ""
|
|
221
|
+
echo "WARNING: No model found at /data/models/qwen3.6-27b"
|
|
222
|
+
echo ""
|
|
223
|
+
echo "Rsync from Apollo V100 node:"
|
|
224
|
+
echo " rsync -aHAXx --numeric-ids --info=progress2 --partial --append-verify root@10.11.164.93:/data/models/qwen3.6-27b/ /data/models/qwen3.6-27b/"
|
|
225
|
+
echo " chown -R vllm:llm /data/models"
|
|
226
|
+
echo " systemctl start vllm"
|
|
227
|
+
fi
|
|
228
|
+
|
|
229
|
+
echo ""
|
|
230
|
+
echo "==========================================="
|
|
231
|
+
echo " POST-REBOOT SETUP COMPLETE"
|
|
232
|
+
echo "==========================================="
|
|
233
|
+
echo ""
|
|
234
|
+
echo " Hardware: 8x H200 SXM 141GB, NVSwitch full-mesh"
|
|
235
|
+
echo " VRAM: 1.128 TB total"
|
|
236
|
+
echo " RAM: 2.2 TiB"
|
|
237
|
+
echo ""
|
|
238
|
+
echo " Services:"
|
|
239
|
+
echo " Ollama: http://0.0.0.0:11434 (embeddings)"
|
|
240
|
+
echo " vLLM: http://0.0.0.0:8000 (inference, TP=8, 131K context)"
|
|
241
|
+
echo " fabricmanager: active (NVSwitch)"
|
|
242
|
+
echo ""
|
|
243
|
+
echo " vLLM config highlights:"
|
|
244
|
+
echo " - TP=8 (all GPUs via NVSwitch, no DP needed for 27B)"
|
|
245
|
+
echo " - 131K context (H200 has room to spare with 27B)"
|
|
246
|
+
echo " - max-num-seqs=128, batched-tokens=131K"
|
|
247
|
+
echo " - bfloat16 (native H200 dtype)"
|
|
248
|
+
echo ""
|
|
249
|
+
echo " Test:"
|
|
250
|
+
echo " curl http://localhost:8000/v1/models"
|
|
251
|
+
echo " curl http://localhost:8000/v1/chat/completions \\"
|
|
252
|
+
echo " -H 'Content-Type: application/json' \\"
|
|
253
|
+
echo " -d '{\"model\":\"qwen3.6-27b\",\"messages\":[{\"role\":\"user\",\"content\":\"hello\"}],\"max_tokens\":50,\"chat_template_kwargs\":{\"enable_thinking\":false}}'"
|
|
254
|
+
echo ""
|
|
255
|
+
echo " For 235B later:"
|
|
256
|
+
echo " - Update /data/vllm/config.yaml: model path, max-model-len, and batching limits"
|
|
257
|
+
echo " - 235B at bf16 = ~470GB, fits easily in 1.1TB with room for KV cache"
|
|
258
|
+
echo ""
|
|
259
|
+
echo " Useful commands:"
|
|
260
|
+
echo " nvtop # GPU monitoring"
|
|
261
|
+
echo " nvidia-smi topo -m # NVSwitch topology"
|
|
262
|
+
echo " tail -f /srv/legion/logs/vllm/vllm.log # vLLM logs"
|
|
263
|
+
echo " systemctl status nvidia-fabricmanager # NVSwitch status"
|
|
264
|
+
echo " systemctl status vllm # vLLM status"
|
|
265
|
+
echo " systemctl status ollama # Ollama status"
|
|
266
|
+
echo "==========================================="
|
|
data/bin/h200-setup-prereboot.sh
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
set -euo pipefail
|
|
3
|
+
|
|
4
|
+
# DGX/HGX H200 SXM 8-GPU Setup Script (Pre-Reboot)
|
|
5
|
+
# Hardware: 8x H200 SXM 141GB, 4x NVSwitch, 2x AMD EPYC 9535, 2.2TiB RAM
|
|
6
|
+
# Host: mn011-6labhz1-h200-0001
|
|
7
|
+
# OS: Debian 13 (trixie), kernel 6.12
|
|
8
|
+
# Run as root: su - then bash h200-setup-prereboot.sh
|
|
9
|
+
# After completion: reboot, then run h200-setup-postreboot.sh
|
|
10
|
+
#
|
|
11
|
+
# NOTE: Corporate firewall does TLS inspection. All external HTTPS downloads
|
|
12
|
+
# use --no-check-certificate / -k and apt repos use trusted=yes to bypass
|
|
13
|
+
# certificate verification failures from the MITM proxy.
|
|
14
|
+
|
|
15
|
+
echo "=== Verifying root ==="
|
|
16
|
+
if [ "$(id -u)" -ne 0 ]; then
|
|
17
|
+
echo "ERROR: Must run as root (su -)"
|
|
18
|
+
exit 1
|
|
19
|
+
fi
|
|
20
|
+
|
|
21
|
+
if [ "${H200_ALLOW_DESTRUCTIVE_PREREBOOT:-0}" != "1" ]; then
|
|
22
|
+
echo "ERROR: This legacy prereboot script formats NVMe devices and is no longer the supported rebuild path."
|
|
23
|
+
echo "Use scripts/h200/h200-rebuild.sh for full automation, or set H200_ALLOW_DESTRUCTIVE_PREREBOOT=1 only if you intend to wipe the scripted NVMe devices."
|
|
24
|
+
exit 1
|
|
25
|
+
fi
|
|
26
|
+
|
|
27
|
+
echo "=== Fixing apt sources (remove cdrom if present) ==="
|
|
28
|
+
sed -i '/cdrom/d' /etc/apt/sources.list 2>/dev/null || true
|
|
29
|
+
|
|
30
|
+
echo "=== Installing base packages ==="
|
|
31
|
+
apt update
|
|
32
|
+
apt install -y \
|
|
33
|
+
sudo curl wget gnupg2 lsb-release ca-certificates \
|
|
34
|
+
build-essential "linux-headers-$(uname -r)" \
|
|
35
|
+
git vim htop tmux nvtop \
|
|
36
|
+
numactl hwloc \
|
|
37
|
+
pciutils dkms \
|
|
38
|
+
apt-transport-https \
|
|
39
|
+
rsync parted jq iperf3 \
|
|
40
|
+
python3-pip python3-venv \
|
|
41
|
+
mdadm lvm2
|
|
42
|
+
|
|
43
|
+
echo "=== Setting up sudo for miverso2 ==="
|
|
44
|
+
usermod -aG sudo miverso2
|
|
45
|
+
|
|
46
|
+
echo "=== Bypassing TLS inspection for external repos ==="
|
|
47
|
+
cat > /etc/apt/apt.conf.d/99bypass-tls-inspection << 'EOF'
|
|
48
|
+
Acquire::https::developer.download.nvidia.com::Verify-Peer "false";
|
|
49
|
+
Acquire::https::nvidia.github.io::Verify-Peer "false";
|
|
50
|
+
Acquire::https::apt.releases.hashicorp.com::Verify-Peer "false";
|
|
51
|
+
EOF
|
|
52
|
+
|
|
53
|
+
echo "=== Setting up NVIDIA CUDA repo (debian12 — compatible with trixie) ==="
|
|
54
|
+
wget -qO /usr/share/keyrings/cuda-archive-keyring.gpg --no-check-certificate \
|
|
55
|
+
https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-archive-keyring.gpg
|
|
56
|
+
echo "deb [trusted=yes] https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/ /" \
|
|
57
|
+
> /etc/apt/sources.list.d/cuda-debian12.list
|
|
58
|
+
apt update
|
|
59
|
+
|
|
60
|
+
echo "=== Installing NVIDIA open kernel module + driver ==="
|
|
61
|
+
apt install -y nvidia-kernel-open-dkms
|
|
62
|
+
|
|
63
|
+
echo "=== Installing CUDA toolkit 12.8 ==="
|
|
64
|
+
apt install -y cuda-toolkit-12-8
|
|
65
|
+
|
|
66
|
+
echo "=== Installing nvidia-fabricmanager (REQUIRED for NVSwitch) ==="
|
|
67
|
+
apt install -y nvidia-fabricmanager
|
|
68
|
+
systemctl enable nvidia-fabricmanager
|
|
69
|
+
|
|
70
|
+
echo "=== Installing Docker ==="
|
|
71
|
+
apt install -y docker.io
|
|
72
|
+
systemctl enable docker
|
|
73
|
+
|
|
74
|
+
echo "=== Installing NVIDIA Container Toolkit ==="
|
|
75
|
+
curl -fsSLk https://nvidia.github.io/libnvidia-container/gpgkey | \
|
|
76
|
+
gpg --dearmor > /etc/apt/trusted.gpg.d/nvidia-container-toolkit.gpg
|
|
77
|
+
echo "deb [trusted=yes] https://nvidia.github.io/libnvidia-container/stable/deb/amd64 /" \
|
|
78
|
+
> /etc/apt/sources.list.d/nvidia-container-toolkit.list
|
|
79
|
+
apt update
|
|
80
|
+
apt install -y nvidia-container-toolkit
|
|
81
|
+
nvidia-ctk runtime configure --runtime=docker
|
|
82
|
+
|
|
83
|
+
echo "=== Installing HashiCorp Enterprise ==="
|
|
84
|
+
wget -qO- --no-check-certificate https://apt.releases.hashicorp.com/gpg | \
|
|
85
|
+
gpg --dearmor > /etc/apt/trusted.gpg.d/hashicorp.gpg
|
|
86
|
+
echo "deb [signed-by=/etc/apt/trusted.gpg.d/hashicorp.gpg trusted=yes] https://apt.releases.hashicorp.com bookworm main" \
|
|
87
|
+
> /etc/apt/sources.list.d/hashicorp.list
|
|
88
|
+
apt update
|
|
89
|
+
apt install -y nomad-enterprise consul-enterprise
|
|
90
|
+
systemctl enable nomad
|
|
91
|
+
systemctl enable consul
|
|
92
|
+
|
|
93
|
+
echo "=== Installing Ollama ==="
|
|
94
|
+
curl -fsSLk https://ollama.com/install.sh | sh
|
|
95
|
+
|
|
96
|
+
echo "=== Setting up NVMe drives ==="
|
|
97
|
+
echo " nvme1n1 (7TB) -> /data (models, vllm cache, vllm env)"
|
|
98
|
+
echo " nvme2n1 (7TB) -> /srv/legion (ollama, docker, nomad, consul, logs)"
|
|
99
|
+
echo " nvme3n1 (7TB) -> /reserve (future use)"
|
|
100
|
+
|
|
101
|
+
echo "=== Formatting NVMe drives ==="
|
|
102
|
+
mkfs.ext4 -L data -m 0 /dev/nvme1n1
|
|
103
|
+
mkfs.ext4 -L srv -m 0 /dev/nvme2n1
|
|
104
|
+
mkfs.ext4 -L reserve -m 0 /dev/nvme3n1
|
|
105
|
+
|
|
106
|
+
echo "=== Creating mount points ==="
|
|
107
|
+
mkdir -p /data /srv/legion /reserve
|
|
108
|
+
|
|
109
|
+
echo "=== Adding fstab entries ==="
|
|
110
|
+
cat >> /etc/fstab << 'EOF'
|
|
111
|
+
/dev/nvme1n1 /data ext4 defaults,noatime,discard 0 2
|
|
112
|
+
/dev/nvme2n1 /srv/legion ext4 defaults,noatime,discard 0 2
|
|
113
|
+
/dev/nvme3n1 /reserve ext4 defaults,noatime,discard 0 2
|
|
114
|
+
EOF
|
|
115
|
+
|
|
116
|
+
echo "=== Mounting all ==="
|
|
117
|
+
mount -a
|
|
118
|
+
|
|
119
|
+
echo "=== Creating /data directory structure ==="
|
|
120
|
+
mkdir -p /data/{models,cache,vllm,tmp}
|
|
121
|
+
mkdir -p /data/cache/{pip,triton,torchinductor,huggingface}
|
|
122
|
+
|
|
123
|
+
echo "=== Creating /srv/legion directory structure ==="
|
|
124
|
+
mkdir -p /srv/legion/{ollama,docker,nomad,consul,rabbitmq}
|
|
125
|
+
mkdir -p /srv/legion/logs/{vllm,ollama,nomad}
|
|
126
|
+
|
|
127
|
+
echo "=== Moving docker storage to NVMe ==="
|
|
128
|
+
systemctl stop docker 2>/dev/null || true
|
|
129
|
+
if [ -d "/var/lib/docker" ] && [ ! -L "/var/lib/docker" ]; then
|
|
130
|
+
rsync -a /var/lib/docker/ /srv/legion/docker/ 2>/dev/null || true
|
|
131
|
+
rm -rf /var/lib/docker
|
|
132
|
+
ln -sf /srv/legion/docker /var/lib/docker
|
|
133
|
+
fi
|
|
134
|
+
|
|
135
|
+
echo "=== Setting up Ollama storage on NVMe ==="
|
|
136
|
+
mkdir -p /usr/share/ollama/.ollama
|
|
137
|
+
echo "/srv/legion/ollama /usr/share/ollama/.ollama none bind 0 0" >> /etc/fstab
|
|
138
|
+
mount -a
|
|
139
|
+
|
|
140
|
+
echo "=== Configuring Ollama ==="
|
|
141
|
+
mkdir -p /etc/systemd/system/ollama.service.d
|
|
142
|
+
cat > /etc/systemd/system/ollama.service.d/override.conf << 'OLLEOF'
|
|
143
|
+
[Service]
|
|
144
|
+
Environment="OLLAMA_HOST=0.0.0.0:11434"
|
|
145
|
+
OLLEOF
|
|
146
|
+
|
|
147
|
+
echo "=== Creating users and groups ==="
|
|
148
|
+
groupadd -f llm
|
|
149
|
+
useradd -r -s /bin/false -d /data/vllm vllm 2>/dev/null || true
|
|
150
|
+
usermod -aG llm miverso2
|
|
151
|
+
usermod -aG llm ollama 2>/dev/null || true
|
|
152
|
+
usermod -aG llm vllm
|
|
153
|
+
usermod -aG video vllm
|
|
154
|
+
usermod -aG render vllm
|
|
155
|
+
|
|
156
|
+
echo "=== Setting ownership ==="
|
|
157
|
+
chown -R ollama:ollama /usr/share/ollama/.ollama
|
|
158
|
+
chown -R ollama:llm /srv/legion/ollama
|
|
159
|
+
chmod -R 775 /srv/legion/ollama
|
|
160
|
+
chown -R vllm:llm /data/vllm /data/models /data/cache /data/tmp
|
|
161
|
+
chown -R vllm:llm /srv/legion/logs/vllm
|
|
162
|
+
|
|
163
|
+
echo "=== Network tuning (EPYC + ConnectX-7) ==="
|
|
164
|
+
cat > /etc/sysctl.d/99-h200.conf << 'EOF'
|
|
165
|
+
net.ipv4.tcp_window_scaling = 1
|
|
166
|
+
net.core.rmem_max = 67108864
|
|
167
|
+
net.core.wmem_max = 67108864
|
|
168
|
+
net.ipv4.tcp_rmem = 4096 87380 67108864
|
|
169
|
+
net.ipv4.tcp_wmem = 4096 65536 67108864
|
|
170
|
+
net.ipv4.tcp_congestion_control = bbr
|
|
171
|
+
net.core.somaxconn = 3240000
|
|
172
|
+
net.core.busy_poll = 50
|
|
173
|
+
net.core.busy_read = 50
|
|
174
|
+
net.core.netdev_max_backlog = 65536
|
|
175
|
+
net.ipv4.tcp_max_syn_backlog = 3240000
|
|
176
|
+
net.ipv4.ip_local_port_range = 10000 65535
|
|
177
|
+
net.ipv4.tcp_fin_timeout = 15
|
|
178
|
+
net.ipv4.tcp_max_tw_buckets = 65536
|
|
179
|
+
kernel.numa_balancing = 0
|
|
180
|
+
vm.nr_hugepages = 4096
|
|
181
|
+
vm.swappiness = 1
|
|
182
|
+
EOF
|
|
183
|
+
sysctl -p /etc/sysctl.d/99-h200.conf
|
|
184
|
+
|
|
185
|
+
echo "=== File descriptor limits ==="
|
|
186
|
+
cat >> /etc/security/limits.conf << 'EOF'
|
|
187
|
+
vllm soft nofile 1048576
|
|
188
|
+
vllm hard nofile 1048576
|
|
189
|
+
ollama soft nofile 65536
|
|
190
|
+
ollama hard nofile 65536
|
|
191
|
+
* soft memlock unlimited
|
|
192
|
+
* hard memlock unlimited
|
|
193
|
+
EOF
|
|
194
|
+
|
|
195
|
+
echo "=== THP + CPU governor persistence ==="
|
|
196
|
+
cat > /etc/rc.local << 'RCEOF'
|
|
197
|
+
#!/bin/bash
|
|
198
|
+
echo never > /sys/kernel/mm/transparent_hugepage/defrag
|
|
199
|
+
echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
|
|
200
|
+
for gov in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
|
|
201
|
+
echo performance > "$gov" 2>/dev/null
|
|
202
|
+
done
|
|
203
|
+
exit 0
|
|
204
|
+
RCEOF
|
|
205
|
+
chmod +x /etc/rc.local
|
|
206
|
+
|
|
207
|
+
echo "=== Applying THP settings now ==="
|
|
208
|
+
echo never > /sys/kernel/mm/transparent_hugepage/defrag
|
|
209
|
+
echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
|
|
210
|
+
|
|
211
|
+
systemctl daemon-reload
|
|
212
|
+
|
|
213
|
+
echo ""
|
|
214
|
+
echo "==========================================="
|
|
215
|
+
echo " PRE-REBOOT SETUP COMPLETE"
|
|
216
|
+
echo "==========================================="
|
|
217
|
+
echo ""
|
|
218
|
+
echo " Hardware: 8x H200 SXM 141GB + 4x NVSwitch"
|
|
219
|
+
echo " NVMe: nvme1n1 -> /data (models/vllm)"
|
|
220
|
+
echo " nvme2n1 -> /srv/legion (services/logs)"
|
|
221
|
+
echo " nvme3n1 -> /reserve (future)"
|
|
222
|
+
echo ""
|
|
223
|
+
echo " Next steps:"
|
|
224
|
+
echo " 1. reboot"
|
|
225
|
+
echo " 2. After reboot, run: bash h200-setup-postreboot.sh"
|
|
226
|
+
echo ""
|
|
227
|
+
echo " The reboot is required for the NVIDIA driver to load."
|
|
228
|
+
echo "==========================================="
|