legion-llm 0.9.54 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +72 -0
  3. data/Gemfile +1 -0
  4. data/bin/h200-setup-postreboot.sh +266 -0
  5. data/bin/h200-setup-prereboot.sh +228 -0
  6. data/bin/h200-setup-remaining-prereboot.sh +187 -0
  7. data/bin/h200-setup-resume-safe.sh +279 -0
  8. data/legion-llm.gemspec +1 -0
  9. data/lib/legion/llm/api/auth.rb +14 -2
  10. data/lib/legion/llm/api/namespaces/anthropic/files.rb +303 -0
  11. data/lib/legion/llm/api/namespaces/anthropic/messages/batches.rb +181 -0
  12. data/lib/legion/llm/api/namespaces/anthropic/messages/count_tokens.rb +49 -0
  13. data/lib/legion/llm/api/namespaces/anthropic/messages.rb +190 -0
  14. data/lib/legion/llm/api/namespaces/anthropic/models.rb +48 -0
  15. data/lib/legion/llm/api/namespaces/helpers.rb +52 -0
  16. data/lib/legion/llm/api/namespaces/native/chat.rb +133 -0
  17. data/lib/legion/llm/api/namespaces/native/inference.rb +260 -0
  18. data/lib/legion/llm/api/namespaces/native/instances.rb +64 -0
  19. data/lib/legion/llm/api/namespaces/native/models.rb +63 -0
  20. data/lib/legion/llm/api/namespaces/native/offerings.rb +57 -0
  21. data/lib/legion/llm/api/namespaces/native/providers.rb +87 -0
  22. data/lib/legion/llm/api/namespaces/native/routing.rb +59 -0
  23. data/lib/legion/llm/api/namespaces/native/tiers.rb +194 -0
  24. data/lib/legion/llm/api/namespaces/openai/audio/speech.rb +208 -0
  25. data/lib/legion/llm/api/namespaces/openai/audio/transcriptions.rb +210 -0
  26. data/lib/legion/llm/api/namespaces/openai/audio/translations.rb +205 -0
  27. data/lib/legion/llm/api/namespaces/openai/batches.rb +311 -0
  28. data/lib/legion/llm/api/namespaces/openai/chat/completions.rb +287 -0
  29. data/lib/legion/llm/api/namespaces/openai/chat/messages.rb +39 -0
  30. data/lib/legion/llm/api/namespaces/openai/completions.rb +107 -0
  31. data/lib/legion/llm/api/namespaces/openai/conversations/items.rb +184 -0
  32. data/lib/legion/llm/api/namespaces/openai/conversations.rb +149 -0
  33. data/lib/legion/llm/api/namespaces/openai/embeddings.rb +68 -0
  34. data/lib/legion/llm/api/namespaces/openai/files.rb +219 -0
  35. data/lib/legion/llm/api/namespaces/openai/images.rb +327 -0
  36. data/lib/legion/llm/api/namespaces/openai/models.rb +114 -0
  37. data/lib/legion/llm/api/namespaces/openai/moderations.rb +209 -0
  38. data/lib/legion/llm/api/namespaces/openai/responses.rb +355 -0
  39. data/lib/legion/llm/api/namespaces/openai/uploads/parts.rb +102 -0
  40. data/lib/legion/llm/api/namespaces/openai/uploads.rb +175 -0
  41. data/lib/legion/llm/api/namespaces/openai/vector_stores/file_batches.rb +295 -0
  42. data/lib/legion/llm/api/namespaces/openai/vector_stores/files.rb +291 -0
  43. data/lib/legion/llm/api/namespaces/openai/vector_stores.rb +281 -0
  44. data/lib/legion/llm/api/namespaces/registration.rb +189 -0
  45. data/lib/legion/llm/api/native/helpers.rb +1 -1
  46. data/lib/legion/llm/api/native/providers.rb +1 -1
  47. data/lib/legion/llm/api/openai/chat_completions.rb +157 -26
  48. data/lib/legion/llm/api/openai/responses.rb +29 -4
  49. data/lib/legion/llm/api/shared_helpers.rb +422 -0
  50. data/lib/legion/llm/api/translators/anthropic_request.rb +80 -28
  51. data/lib/legion/llm/api/translators/anthropic_response.rb +11 -5
  52. data/lib/legion/llm/api/translators/openai_request.rb +28 -2
  53. data/lib/legion/llm/api/translators/openai_response.rb +15 -1
  54. data/lib/legion/llm/api.rb +20 -9
  55. data/lib/legion/llm/call/dispatch.rb +1 -1
  56. data/lib/legion/llm/call/lex_llm_adapter.rb +42 -10
  57. data/lib/legion/llm/context/curator.rb +44 -7
  58. data/lib/legion/llm/inference/conversation.rb +1 -1
  59. data/lib/legion/llm/inference/executor.rb +23 -6
  60. data/lib/legion/llm/inference/native_tool_loop.rb +2 -1
  61. data/lib/legion/llm/inference/steps/debate.rb +1 -1
  62. data/lib/legion/llm/inference/steps/knowledge_capture.rb +1 -1
  63. data/lib/legion/llm/inference/steps/logging.rb +1 -1
  64. data/lib/legion/llm/inference/steps/rag_context.rb +4 -4
  65. data/lib/legion/llm/inference/steps/rag_guard.rb +1 -1
  66. data/lib/legion/llm/inference/steps/span_annotator.rb +1 -1
  67. data/lib/legion/llm/inference/steps/sticky_persist.rb +1 -1
  68. data/lib/legion/llm/inference/steps/token_budget.rb +1 -1
  69. data/lib/legion/llm/inference/steps/tool_calls.rb +3 -3
  70. data/lib/legion/llm/inference/steps/tool_history.rb +1 -1
  71. data/lib/legion/llm/inference/steps/trigger_match.rb +12 -2
  72. data/lib/legion/llm/router.rb +12 -1
  73. data/lib/legion/llm/settings.rb +6 -5
  74. data/lib/legion/llm/token_estimation.rb +48 -0
  75. data/lib/legion/llm/types/message.rb +0 -1
  76. data/lib/legion/llm/vector_store/storage.rb +141 -0
  77. data/lib/legion/llm/version.rb +1 -1
  78. metadata +57 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '09bf7eb9fe4c93ccba0bb574864c66a518225327399b47a30dd08aa148ac3b74'
4
- data.tar.gz: a54854addf081e387d94ed9fd3ba67f826df9b64ec3b042b2767a7e5a441d715
3
+ metadata.gz: 94a90f69b3940eb805b6b7bf0b15eb30aa8c616e66562560df01d3f0d5b24ca2
4
+ data.tar.gz: 4a90c910e19cec4b3da0e5e44a00d1076495d41d653ad4f7161b3b38892f8d8b
5
5
  SHA512:
6
- metadata.gz: 69fa173952297d7da6410c101c9b9e13548514db85075188fdef153d0f9340f50c314e3f633432e5481c5a4d979468ea84dc3b24edff84d4d728a6d7df7f94c0
7
- data.tar.gz: 52852f542515ec121bf1d9851506c3866e70cae5a551f034777f15d9c68fc7886888eb6d9c5cb9a9f58e18016d84d92d4dd010a58cc7c312b2c683bfb477815d
6
+ metadata.gz: e5f444b505e4b75937223475300cd4bae1e2f292f93148fbb3f0a06446ae6b90b0bf482a7d308a3e580e55219ddc4eb6f0e9274f6479c6e96619d54946cc0ef6
7
+ data.tar.gz: 6a9dfedae0e30d61f1bf8c4dc825764e4e9174829cc2d1ea6ee65a00ab7dea2cec5299e5988117aacd24adba93d956970124e00c11a1360bacfab04aa9cf6044
data/CHANGELOG.md CHANGED
@@ -1,5 +1,71 @@
1
1
  # Legion LLM Changelog
2
2
 
3
+
4
+ ## [0.10.1] - 2026-05-29
5
+
6
+ ### Fixed
7
+ - **Anthropic message format translation** — `AnthropicRequest` translator now properly converts `tool_use` content blocks to `tool_calls` arrays and `tool_result` blocks to `role: :tool` messages. This was the root cause of vLLM outputting JSON blobs instead of calling tools — it was receiving malformed messages with raw Anthropic content block hashes in the content field.
8
+ - **Streaming tool_use event ordering** — Rewrote namespace streaming to emit tool_use content blocks inline after stream completes. Text `content_block_start` is deferred until actual text arrives, matching real Anthropic API behavior for tool-only responses.
9
+ - **Curator current-turn preservation** — `curate_turn` now only curates messages older than the current turn. `apply_curation_pipeline` preserves recent turns by counting user messages (`preserve_recent_turns` setting, default 2). Prevents the model from losing context of its recent work.
10
+ - **LexLLMAdapter TypeError flood** — Guarded `text_part_content` and `defined_method_access` against Array inputs. Flattened nested content arrays before iterating.
11
+ - **Rescue log levels** — All rescue blocks in inference steps, executor, dispatch, and adapter now use `:warn` or `:error` (never `:debug`). Caught exceptions should be visible.
12
+ - **Thinking override removed** — No longer forces `thinking: { enabled: false }` for vLLM with tools. Lets the model use its thinking template properly.
13
+ - **Model routing hardcode removed** — Anthropic namespace no longer hardcodes `routing: { model: 'legionio' }`. Uses daemon default_provider/default_model from settings.
14
+
15
+ ### Changed
16
+ - **RAG defaults** — `min_confidence` 0.85 → 0.92, `full_limit` 10 → 5, `compact_limit` 5 → 3. Tighter relevance threshold, fewer context injections.
17
+ - **AnthropicResponse translator** — `extract_tool_calls`, `format_stop_reason`, `token_count` are now public methods (needed by streaming handler).
18
+ - **Message.build debug log removed** — Fired dozens of times per request with no diagnostic value.
19
+ - **format_chunk log** — Now includes actual text content and index for debugging stream flow.
20
+ - **Curator debug logs** — Show full before/after content (no truncation).
21
+ - **Request info log** — Namespace handler logs message count, char count, estimated tokens, and tool count at request start.
22
+ - **Stream completion log** — Logs tool_calls count, stop_reason, and text length after streaming.
23
+
24
+ ## [0.10.0] - 2026-05-29
25
+
26
+ ### Added
27
+ - **Sinatra Namespace API** — complete API refactor using `sinatra-contrib` namespaces with thin route blocks. Enabled via `settings[:llm][:api][:use_namespaces] = true`.
28
+ - **OpenAI-compatible endpoints (full surface):**
29
+ - `POST /v1/responses` — Responses API with typed SSE streaming (Codex CLI drop-in)
30
+ - `GET/DELETE /v1/responses/:id`, `POST /:id/cancel`, `GET /:id/input_items`
31
+ - `POST /v1/chat/completions` — streaming (`data: [DONE]`) and sync
32
+ - `GET/POST/DELETE /v1/chat/completions/:id`, `GET /v1/chat/completions/:id/messages`
33
+ - `GET /v1/models`, `GET /v1/models/:id` — with Anthropic format branching via `detect_client`
34
+ - `POST /v1/embeddings`, `POST /v1/completions` (legacy)
35
+ - `POST /v1/images/generations`, `/edits`, `/variations`
36
+ - `POST /v1/audio/transcriptions`, `/translations`, `/speech`
37
+ - `POST /v1/moderations`
38
+ - `POST/GET /v1/conversations`, CRUD + items
39
+ - `POST/GET /v1/batches`, cancel
40
+ - `POST/GET /v1/files`, content download, delete
41
+ - `POST /v1/uploads`, parts, cancel, complete
42
+ - `POST/GET /v1/vector_stores`, search, files, file batches
43
+ - **Anthropic-compatible endpoints (full surface):**
44
+ - `POST /v1/messages` — Messages API with correct SSE event ordering (Claude Code drop-in)
45
+ - `POST /v1/messages/count_tokens` — token estimation
46
+ - `POST/GET/DELETE /v1/messages/batches` — full batch CRUD + JSONL results
47
+ - `GET /v1/models` — Anthropic format via detect_client header branching
48
+ - Anthropic files namespace (standalone deployment mode)
49
+ - **Native namespace endpoints** — all `/api/llm/*` routes ported to namespace pattern
50
+ - **SharedHelpers module** (`lib/legion/llm/api/shared_helpers.rb`) — extracted from `native/helpers.rb`, shared by both legacy and namespace routes
51
+ - **TokenEstimation module** (`lib/legion/llm/token_estimation.rb`) — character-based token counting for `count_tokens` endpoint
52
+ - **Client detection** — `detect_client(env)` determines OpenAI vs Anthropic client from headers (anthropic-version, x-api-key)
53
+ - **Auth middleware** — now returns format-appropriate error shapes (OpenAI vs Anthropic) based on detected client
54
+ - **VectorStore::Storage** — table DDL, cosine similarity, chunk_text utilities for vector store endpoints
55
+ - **Codex CLI conformance tests** — verifies exact streaming event order for drop-in compatibility
56
+ - **Claude Code conformance tests** — verifies exact SSE event order, tool use, system prompt handling
57
+
58
+ ### Changed
59
+ - `sinatra-contrib` added as runtime dependency (>= 2.0)
60
+ - `rack-test` added to development dependencies
61
+ - API registration now conditional: `use_namespaces: true` uses `Namespaces::Registration`, `false` (default) uses legacy flat routes
62
+ - All namespace inference routes go through full 18-step `Inference::Executor` pipeline (Gaia, RAG, metering, audit, escalation, RBAC, tools, etc.)
63
+
64
+ ### Fixed
65
+ - Auth before filter returns Anthropic-shaped error for Anthropic clients (was always OpenAI-shaped)
66
+ - Anthropic streaming event order: `message_start` now emitted before any `content_block_delta` events
67
+ - `/v1/models` path conflict resolved — single handler with client detection branching
68
+
3
69
  ## [0.9.54] - 2026-05-29
4
70
 
5
71
  ### Fixed
@@ -13,6 +79,12 @@
13
79
 
14
80
  ## [0.9.52] - 2026-05-27
15
81
 
82
+ ### Added
83
+ - API: `/v1/chat/completions` now has full pipeline feature parity with the native `/api/llm/inference` endpoint — routing, escalation, RAG context injection, Gaia advisory, knowledge capture, tool discovery, sticky runners, confidence scoring, metering, and debate all activate when the relevant fields are provided.
84
+ - API: `/v1/chat/completions` accepts extended fields via request body (`conversation_id`, `provider`, `tier`, `instance`, `cwd`, `requested_tools`, `client_tool_passthrough`, `caller`) or via `X-Legion-*` headers (`X-Legion-Conversation-Id`, `X-Legion-Provider`, `X-Legion-Tier`, `X-Legion-Instance`, `X-Legion-Cwd`, `X-Legion-Client-Tool-Passthrough`). Headers take precedence for scalar values.
85
+ - API: `/v1/chat/completions` now performs pre-pipeline Gaia ingest (mirrors native endpoint awareness).
86
+ - API: `/v1/chat/completions` reasoning/thinking token streaming via `include_reasoning: true` (or `include_thinking: true`). Emits `reasoning_content` delta chunks in OpenAI format. Non-streaming responses include `reasoning_content` in the message body.
87
+
16
88
  ### Fixed
17
89
  - Discovery: `verify_embedding` now checks `model_available?` for Ollama instead of blindly returning true — prevents `can_embed?` from reporting true when the embedding model (e.g. `mxbai-embed-large`) hasn't been pulled on the local node
18
90
  - Discovery: `detect_embedding_from_registry` now calls `verify_embedding` before setting `@can_embed = true`, closing a gap where registry-declared capability metadata was trusted without verifying the model exists locally
data/Gemfile CHANGED
@@ -30,6 +30,7 @@ group :test do
30
30
  gem provider_gem, path: provider_path if Dir.exist?(provider_path)
31
31
  end
32
32
 
33
+ gem 'rack-test', '~> 2.0'
33
34
  gem 'rake'
34
35
  gem 'rspec'
35
36
  gem 'rspec_junit_formatter'
data/bin/h200-setup-postreboot.sh ADDED
@@ -0,0 +1,266 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+
4
+ # DGX/HGX H200 SXM 8-GPU Setup Script (Post-Reboot)
5
+ # Run as root after reboot: su - then bash h200-setup-postreboot.sh
6
+ # Requires: h200-setup-prereboot.sh completed + reboot done
7
+
8
+ echo "=== Verifying root ==="
9
+ if [ "$(id -u)" -ne 0 ]; then
10
+ echo "ERROR: Must run as root (su -)"
11
+ exit 1
12
+ fi
13
+
14
+ echo "=== Verifying H200 PCI BAR recovery settings ==="
15
+ if ! grep -qw 'pci=realloc=off' /proc/cmdline; then
16
+ echo "ERROR: pci=realloc=off is not active. Refusing to continue because Linux may zero H200 BARs."
17
+ echo "Current cmdline: $(cat /proc/cmdline)"
18
+ echo "Expected grub setting: GRUB_CMDLINE_LINUX_DEFAULT=\"quiet pci=realloc=off\""
19
+ exit 1
20
+ fi
21
+
22
+ for bdf in 04 14 64 77 84 94 e9 f6; do
23
+ resource="/sys/bus/pci/devices/0000:${bdf}:00.0/resource"
24
+ if [ ! -r "$resource" ]; then
25
+ echo "ERROR: GPU 0000:${bdf}:00.0 is missing from sysfs."
26
+ exit 1
27
+ fi
28
+
29
+ if awk 'NR==1||NR==3||NR==5 { if ($1 == "0x0000000000000000") bad = 1 } END { exit bad }' "$resource"; then
30
+ :
31
+ else
32
+ echo "ERROR: GPU 0000:${bdf}:00.0 has an unassigned BAR."
33
+ awk 'NR==1||NR==3||NR==5{print}' "$resource"
34
+ exit 1
35
+ fi
36
+ done
37
+
38
+ echo "=== Verifying NVMe mounts ==="
39
+ if ! mountpoint -q /data; then
40
+ echo "ERROR: /data is not mounted. Check fstab."
41
+ exit 1
42
+ fi
43
+ if ! mountpoint -q /srv/legion; then
44
+ echo "ERROR: /srv/legion is not mounted. Check fstab."
45
+ exit 1
46
+ fi
47
+
48
+ echo "=== Verifying NVIDIA driver ==="
49
+ if ! timeout 60 nvidia-smi; then
50
+ echo "ERROR: nvidia-smi failed. Driver not loaded."
51
+ echo "Check: dmesg | grep -i nvidia"
52
+ echo "Try: apt install -y nvidia-open && reboot"
53
+ exit 1
54
+ fi
55
+
56
+ GPU_COUNT=$(timeout 60 nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
57
+ echo "Detected $GPU_COUNT GPUs"
58
+ if [ "$GPU_COUNT" -ne 8 ]; then
59
+ echo "ERROR: Expected 8 H200 GPUs, found $GPU_COUNT"
60
+ echo "Check: lspci | grep -i nvidia"
61
+ exit 1
62
+ fi
63
+
64
+ echo "=== Starting nvidia-fabricmanager (NVSwitch required) ==="
65
+ if ! systemctl start nvidia-fabricmanager ||  # start inside the test: under set -e a bare failing start would abort before the diagnostics below
66
+ ! systemctl status nvidia-fabricmanager --no-pager -l; then
67
+ echo "ERROR: fabricmanager failed to start. NVSwitch won't work without it."
68
+ echo "Check: journalctl -u nvidia-fabricmanager"
69
+ exit 1
70
+ fi
71
+
72
+ echo "=== GPU persistence mode ==="
73
+ nvidia-smi -pm 1
74
+
75
+ echo "=== Verifying NVSwitch topology ==="
76
+ timeout 60 nvidia-smi topo -m
77
+ echo ""
78
+ echo "All GPUs should show NV18 (NVSwitch full mesh) connections."
79
+ echo ""
80
+ timeout 60 nvidia-smi nvlink -s
81
+
82
+ echo "=== CUDA library path ==="
83
+ CUDA_PATH=$(find /usr/local -maxdepth 1 -name "cuda-12*" -type d | sort -V | tail -1)  # newest installed cuda-12.x toolkit directory
84
+ if [ -z "$CUDA_PATH" ]; then
85
+ echo "WARNING: CUDA toolkit directory not found under /usr/local"
86
+ CUDA_PATH="/usr/local/cuda-12.8"
87
+ fi
88
+ echo "$CUDA_PATH/lib64" > /etc/ld.so.conf.d/cuda.conf
89
+ ldconfig
90
+ ln -sfn "$CUDA_PATH" /usr/local/cuda  # -n: replace an existing /usr/local/cuda symlink instead of creating a link inside the directory it points to
91
+
92
+ echo "=== CPU performance governor ==="
93
+ for gov in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
94
+ echo performance > "$gov" 2>/dev/null || true
95
+ done
96
+ echo never > /sys/kernel/mm/transparent_hugepage/defrag
97
+ echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
98
+
99
+ echo "=== NUMA topology check ==="
100
+ numactl --hardware
101
+ echo ""
102
+ echo "For optimal GPU-CPU affinity, vLLM will use all NUMA nodes."
103
+
104
+ echo "=== Setting up Ollama ownership ==="
105
+ chown -R ollama:ollama /usr/share/ollama/.ollama
106
+ chown -R ollama:llm /srv/legion/ollama
107
+ chmod -R 775 /srv/legion/ollama
108
+
109
+ echo "=== Restarting Ollama ==="
110
+ systemctl restart ollama
111
+ sleep 2
112
+ systemctl status ollama --no-pager
113
+
114
+ echo "=== Installing vLLM ==="
115
+ if [ ! -d "/data/vllm/env" ]; then
116
+ python3 -m venv /data/vllm/env
117
+ fi
118
+
119
+ mkdir -p /data/tmp /data/cache/pip
120
+ source /data/vllm/env/bin/activate
121
+ TMPDIR=/data/tmp PIP_CACHE_DIR=/data/cache/pip pip install vllm xxhash
122
+ deactivate
123
+
124
+ echo "=== Creating vLLM config ==="
125
+ cat > /data/vllm/config.yaml << 'EOF'
126
+ # Model (rsync from Apollo: rsync -avP root@10.11.164.93:/data/models/qwen3.6-27b/ /data/models/qwen3.6-27b/)
127
+ model: /data/models/qwen3.6-27b
128
+ dtype: bfloat16
129
+ tensor-parallel-size: 8
130
+ served-model-name: qwen3.6-27b
131
+
132
+ # Context & Memory — H200 141GB per card, 1.1TB total
133
+ max-model-len: 131072
134
+ gpu-memory-utilization: 0.95
135
+ enable-prefix-caching: true
136
+ prefix-caching-hash-algo: xxhash
137
+
138
+ # Scheduling & Batching — aggressive for 8x H200
139
+ max-num-seqs: 128
140
+ max-num-batched-tokens: 131072
141
+ enable-chunked-prefill: true
142
+ scheduling-policy: fcfs
143
+ performance-mode: throughput
144
+ async-scheduling: true
145
+
146
+ # Model Loading
147
+ load-format: auto
148
+ safetensors-load-strategy: mmap
149
+
150
+ # Streaming
151
+ stream-interval: 1
152
+
153
+ # Thinking/Reasoning
154
+ reasoning-parser: deepseek_v3
155
+
156
+ # Tool calling
157
+ enable-auto-tool-choice: true
158
+ tool-call-parser: hermes
159
+
160
+ # Server
161
+ host: 0.0.0.0
162
+ port: 8000
163
+
164
+ # Monitoring
165
+ enable-server-load-tracking: true
166
+ enable-prompt-tokens-details: true
167
+ enable-log-requests: true
168
+ kv-cache-metrics: true
169
+ enable-mfu-metrics: true
170
+ enable-logging-iteration-details: true
171
+ EOF
172
+
173
+ echo "=== Creating vLLM systemd service ==="
174
+ cat > /etc/systemd/system/vllm.service << 'EOF'
175
+ [Unit]
176
+ Description=vLLM Inference Server (8x H200 SXM)
177
+ After=network.target nvidia-persistenced.service nvidia-fabricmanager.service
178
+ Wants=nvidia-persistenced.service nvidia-fabricmanager.service
179
+
180
+ [Service]
181
+ Type=simple
182
+ User=vllm
183
+ Group=llm
184
+ Environment="PATH=/data/vllm/env/bin:/usr/local/bin:/usr/bin:/bin"
185
+ Environment="TMPDIR=/data/tmp"
186
+ Environment="TORCHINDUCTOR_CACHE_DIR=/data/cache/torchinductor"
187
+ Environment="TRITON_CACHE_DIR=/data/cache/triton"
188
+ Environment="HF_HOME=/data/cache/huggingface"
189
+ Environment="LD_LIBRARY_PATH=/usr/local/cuda/lib64"
190
+ ExecStart=/data/vllm/env/bin/python -m vllm.entrypoints.openai.api_server --config /data/vllm/config.yaml
191
+ Restart=on-failure
192
+ RestartSec=10
193
+ LimitNOFILE=1048576
194
+ LimitMEMLOCK=infinity
195
+ StandardOutput=append:/srv/legion/logs/vllm/vllm.log
196
+ StandardError=append:/srv/legion/logs/vllm/vllm-error.log
197
+
198
+ [Install]
199
+ WantedBy=multi-user.target
200
+ EOF
201
+
202
+ echo "=== Setting ownership ==="
203
+ chown -R vllm:llm /data/vllm /data/models /data/cache /data/tmp /srv/legion/logs/vllm
204
+
205
+ systemctl daemon-reload
206
+ systemctl enable vllm
207
+
208
+ echo "=== Checking for model files ==="
209
+ if [ -d "/data/models/qwen3.6-27b" ] && [ "$(find /data/models/qwen3.6-27b -name '*.safetensors' 2>/dev/null | wc -l)" -gt 0 ]; then
210
+ echo "Model found at /data/models/qwen3.6-27b"
211
+ echo "=== Starting vLLM ==="
212
+ rm -f /dev/shm/psm_* /dev/shm/sem.mp-* /dev/shm/VLLM_*
213
+ systemctl start vllm
214
+ echo ""
215
+ echo "vLLM is starting. First startup compiles CUDA graphs (~5-8 min on H200)."
216
+ echo "Subsequent restarts load from cache (~20 sec)."
217
+ echo "Monitor: journalctl -u vllm -f"
218
+ echo "Logs: tail -f /srv/legion/logs/vllm/vllm.log"
219
+ else
220
+ echo ""
221
+ echo "WARNING: No model found at /data/models/qwen3.6-27b"
222
+ echo ""
223
+ echo "Rsync from Apollo V100 node:"
224
+ echo " rsync -aHAXx --numeric-ids --info=progress2 --partial --append-verify root@10.11.164.93:/data/models/qwen3.6-27b/ /data/models/qwen3.6-27b/"
225
+ echo " chown -R vllm:llm /data/models"
226
+ echo " systemctl start vllm"
227
+ fi
228
+
229
+ echo ""
230
+ echo "==========================================="
231
+ echo " POST-REBOOT SETUP COMPLETE"
232
+ echo "==========================================="
233
+ echo ""
234
+ echo " Hardware: 8x H200 SXM 141GB, NVSwitch full-mesh"
235
+ echo " VRAM: 1.128 TB total"
236
+ echo " RAM: 2.2 TiB"
237
+ echo ""
238
+ echo " Services:"
239
+ echo " Ollama: http://0.0.0.0:11434 (embeddings)"
240
+ echo " vLLM: http://0.0.0.0:8000 (inference, TP=8, 131K context)"
241
+ echo " fabricmanager: active (NVSwitch)"
242
+ echo ""
243
+ echo " vLLM config highlights:"
244
+ echo " - TP=8 (all GPUs via NVSwitch, no DP needed for 27B)"
245
+ echo " - 131K context (H200 has room to spare with 27B)"
246
+ echo " - max-num-seqs=128, batched-tokens=131K"
247
+ echo " - bfloat16 (native H200 dtype)"
248
+ echo ""
249
+ echo " Test:"
250
+ echo " curl http://localhost:8000/v1/models"
251
+ echo " curl http://localhost:8000/v1/chat/completions \\"
252
+ echo " -H 'Content-Type: application/json' \\"
253
+ echo " -d '{\"model\":\"qwen3.6-27b\",\"messages\":[{\"role\":\"user\",\"content\":\"hello\"}],\"max_tokens\":50,\"chat_template_kwargs\":{\"enable_thinking\":false}}'"
254
+ echo ""
255
+ echo " For 235B later:"
256
+ echo " - Update /data/vllm/config.yaml: model path, max-model-len, and batching limits"
257
+ echo " - 235B at bf16 = ~470GB, fits easily in 1.1TB with room for KV cache"
258
+ echo ""
259
+ echo " Useful commands:"
260
+ echo " nvtop # GPU monitoring"
261
+ echo " nvidia-smi topo -m # NVSwitch topology"
262
+ echo " tail -f /srv/legion/logs/vllm/vllm.log # vLLM logs"
263
+ echo " systemctl status nvidia-fabricmanager # NVSwitch status"
264
+ echo " systemctl status vllm # vLLM status"
265
+ echo " systemctl status ollama # Ollama status"
266
+ echo "==========================================="
data/bin/h200-setup-prereboot.sh ADDED
@@ -0,0 +1,228 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+
4
+ # DGX/HGX H200 SXM 8-GPU Setup Script (Pre-Reboot)
5
+ # Hardware: 8x H200 SXM 141GB, 4x NVSwitch, 2x AMD EPYC 9535, 2.2TiB RAM
6
+ # Host: mn011-6labhz1-h200-0001
7
+ # OS: Debian 13 (trixie), kernel 6.12
8
+ # Run as root: su - then bash h200-setup-prereboot.sh
9
+ # After completion: reboot, then run h200-setup-postreboot.sh
10
+ #
11
+ # NOTE: Corporate firewall does TLS inspection. All external HTTPS downloads
12
+ # use --no-check-certificate / -k and apt repos use trusted=yes to bypass
13
+ # certificate verification failures from the MITM proxy.
14
+
15
+ echo "=== Verifying root ==="
16
+ if [ "$(id -u)" -ne 0 ]; then
17
+ echo "ERROR: Must run as root (su -)"
18
+ exit 1
19
+ fi
20
+
21
+ if [ "${H200_ALLOW_DESTRUCTIVE_PREREBOOT:-0}" != "1" ]; then
22
+ echo "ERROR: This legacy prereboot script formats NVMe devices and is no longer the supported rebuild path."
23
+ echo "Use scripts/h200/h200-rebuild.sh for full automation, or set H200_ALLOW_DESTRUCTIVE_PREREBOOT=1 only if you intend to wipe the scripted NVMe devices."
24
+ exit 1
25
+ fi
26
+
27
+ echo "=== Fixing apt sources (remove cdrom if present) ==="
28
+ sed -i '/cdrom/d' /etc/apt/sources.list 2>/dev/null || true
29
+
30
+ echo "=== Installing base packages ==="
31
+ apt update
32
+ apt install -y \
33
+ sudo curl wget gnupg2 lsb-release ca-certificates \
34
+ build-essential linux-headers-$(uname -r) \
35
+ git vim htop tmux nvtop \
36
+ numactl hwloc \
37
+ pciutils dkms \
38
+ apt-transport-https \
39
+ rsync parted jq iperf3 \
40
+ python3-pip python3-venv \
41
+ mdadm lvm2
42
+
43
+ echo "=== Setting up sudo for miverso2 ==="
44
+ usermod -aG sudo miverso2
45
+
46
+ echo "=== Bypassing TLS inspection for external repos ==="
47
+ cat > /etc/apt/apt.conf.d/99bypass-tls-inspection << 'EOF'
48
+ Acquire::https::developer.download.nvidia.com::Verify-Peer "false";
49
+ Acquire::https::nvidia.github.io::Verify-Peer "false";
50
+ Acquire::https::apt.releases.hashicorp.com::Verify-Peer "false";
51
+ EOF
52
+
53
+ echo "=== Setting up NVIDIA CUDA repo (debian12 — compatible with trixie) ==="
54
+ wget -qO /usr/share/keyrings/cuda-archive-keyring.gpg --no-check-certificate \
55
+ https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-archive-keyring.gpg
56
+ echo "deb [trusted=yes] https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/ /" \
57
+ > /etc/apt/sources.list.d/cuda-debian12.list
58
+ apt update
59
+
60
+ echo "=== Installing NVIDIA open kernel module + driver ==="
61
+ apt install -y nvidia-kernel-open-dkms
62
+
63
+ echo "=== Installing CUDA toolkit 12.8 ==="
64
+ apt install -y cuda-toolkit-12-8
65
+
66
+ echo "=== Installing nvidia-fabricmanager (REQUIRED for NVSwitch) ==="
67
+ apt install -y nvidia-fabricmanager
68
+ systemctl enable nvidia-fabricmanager
69
+
70
+ echo "=== Installing Docker ==="
71
+ apt install -y docker.io
72
+ systemctl enable docker
73
+
74
+ echo "=== Installing NVIDIA Container Toolkit ==="
75
+ curl -fsSLk https://nvidia.github.io/libnvidia-container/gpgkey | \
76
+ gpg --dearmor > /etc/apt/trusted.gpg.d/nvidia-container-toolkit.gpg
77
+ echo "deb [trusted=yes] https://nvidia.github.io/libnvidia-container/stable/deb/amd64 /" \
78
+ > /etc/apt/sources.list.d/nvidia-container-toolkit.list
79
+ apt update
80
+ apt install -y nvidia-container-toolkit
81
+ nvidia-ctk runtime configure --runtime=docker
82
+
83
+ echo "=== Installing HashiCorp Enterprise ==="
84
+ wget -qO- --no-check-certificate https://apt.releases.hashicorp.com/gpg | \
85
+ gpg --dearmor > /etc/apt/trusted.gpg.d/hashicorp.gpg
86
+ echo "deb [signed-by=/etc/apt/trusted.gpg.d/hashicorp.gpg trusted=yes] https://apt.releases.hashicorp.com bookworm main" \
87
+ > /etc/apt/sources.list.d/hashicorp.list
88
+ apt update
89
+ apt install -y nomad-enterprise consul-enterprise
90
+ systemctl enable nomad
91
+ systemctl enable consul
92
+
93
+ echo "=== Installing Ollama ==="
94
+ curl -fsSLk https://ollama.com/install.sh | sh
95
+
96
+ echo "=== Setting up NVMe drives ==="
97
+ echo " nvme1n1 (7TB) -> /data (models, vllm cache, vllm env)"
98
+ echo " nvme2n1 (7TB) -> /srv/legion (ollama, docker, nomad, consul, logs)"
99
+ echo " nvme3n1 (7TB) -> /reserve (future use)"
100
+
101
+ echo "=== Formatting NVMe drives ==="
102
+ mkfs.ext4 -L data -m 0 /dev/nvme1n1
103
+ mkfs.ext4 -L srv -m 0 /dev/nvme2n1
104
+ mkfs.ext4 -L reserve -m 0 /dev/nvme3n1
105
+
106
+ echo "=== Creating mount points ==="
107
+ mkdir -p /data /srv/legion /reserve
108
+
109
+ echo "=== Adding fstab entries ==="
110
+ cat >> /etc/fstab << 'EOF'  # mount by the labels set by mkfs.ext4 -L above; /dev/nvmeXn1 names are not guaranteed stable across reboots
111
+ LABEL=data /data ext4 defaults,noatime,discard 0 2
112
+ LABEL=srv /srv/legion ext4 defaults,noatime,discard 0 2
113
+ LABEL=reserve /reserve ext4 defaults,noatime,discard 0 2
114
+ EOF
115
+
116
+ echo "=== Mounting all ==="
117
+ mount -a
118
+
119
+ echo "=== Creating /data directory structure ==="
120
+ mkdir -p /data/{models,cache,vllm,tmp}
121
+ mkdir -p /data/cache/{pip,triton,torchinductor,huggingface}
122
+
123
+ echo "=== Creating /srv/legion directory structure ==="
124
+ mkdir -p /srv/legion/{ollama,docker,nomad,consul,rabbitmq}
125
+ mkdir -p /srv/legion/logs/{vllm,ollama,nomad}
126
+
127
+ echo "=== Moving docker storage to NVMe ==="
128
+ systemctl stop docker 2>/dev/null || true  # best effort: docker may not be running yet
129
+ if [ -d "/var/lib/docker" ] && [ ! -L "/var/lib/docker" ]; then
130
+ rsync -a /var/lib/docker/ /srv/legion/docker/  # must succeed (set -e aborts otherwise) before the source is deleted below
131
+ rm -rf /var/lib/docker
132
+ ln -sf /srv/legion/docker /var/lib/docker
133
+ fi
134
+
135
+ echo "=== Setting up Ollama storage on NVMe ==="
136
+ mkdir -p /usr/share/ollama/.ollama
137
+ echo "/srv/legion/ollama /usr/share/ollama/.ollama none bind 0 0" >> /etc/fstab
138
+ mount -a
139
+
140
+ echo "=== Configuring Ollama ==="
141
+ mkdir -p /etc/systemd/system/ollama.service.d
142
+ cat > /etc/systemd/system/ollama.service.d/override.conf << 'OLLEOF'
143
+ [Service]
144
+ Environment="OLLAMA_HOST=0.0.0.0:11434"
145
+ OLLEOF
146
+
147
+ echo "=== Creating users and groups ==="
148
+ groupadd -f llm
149
+ useradd -r -s /bin/false -d /data/vllm vllm 2>/dev/null || true
150
+ usermod -aG llm miverso2
151
+ usermod -aG llm ollama 2>/dev/null || true
152
+ usermod -aG llm vllm
153
+ usermod -aG video vllm
154
+ usermod -aG render vllm
155
+
156
+ echo "=== Setting ownership ==="
157
+ chown -R ollama:ollama /usr/share/ollama/.ollama
158
+ chown -R ollama:llm /srv/legion/ollama
159
+ chmod -R 775 /srv/legion/ollama
160
+ chown -R vllm:llm /data/vllm /data/models /data/cache /data/tmp
161
+ chown -R vllm:llm /srv/legion/logs/vllm
162
+
163
+ echo "=== Network tuning (EPYC + ConnectX-7) ==="
164
+ cat > /etc/sysctl.d/99-h200.conf << 'EOF'
165
+ net.ipv4.tcp_window_scaling = 1
166
+ net.core.rmem_max = 67108864
167
+ net.core.wmem_max = 67108864
168
+ net.ipv4.tcp_rmem = 4096 87380 67108864
169
+ net.ipv4.tcp_wmem = 4096 65536 67108864
170
+ net.ipv4.tcp_congestion_control = bbr
171
+ net.core.somaxconn = 3240000
172
+ net.core.busy_poll = 50
173
+ net.core.busy_read = 50
174
+ net.core.netdev_max_backlog = 65536
175
+ net.ipv4.tcp_max_syn_backlog = 3240000
176
+ net.ipv4.ip_local_port_range = 10000 65535
177
+ net.ipv4.tcp_fin_timeout = 15
178
+ net.ipv4.tcp_max_tw_buckets = 65536
179
+ kernel.numa_balancing = 0
180
+ vm.nr_hugepages = 4096
181
+ vm.swappiness = 1
182
+ EOF
183
+ sysctl -p /etc/sysctl.d/99-h200.conf
184
+
185
+ echo "=== File descriptor limits ==="
186
+ cat >> /etc/security/limits.conf << 'EOF'
187
+ vllm soft nofile 1048576
188
+ vllm hard nofile 1048576
189
+ ollama soft nofile 65536
190
+ ollama hard nofile 65536
191
+ * soft memlock unlimited
192
+ * hard memlock unlimited
193
+ EOF
194
+
195
+ echo "=== THP + CPU governor persistence ==="
196
+ cat > /etc/rc.local << 'RCEOF'
197
+ #!/bin/bash
198
+ echo never > /sys/kernel/mm/transparent_hugepage/defrag
199
+ echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
200
+ for gov in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
201
+ echo performance > "$gov" 2>/dev/null  # quoted redirect target; cores without cpufreq are skipped silently
202
+ done
203
+ exit 0
204
+ RCEOF
205
+ chmod +x /etc/rc.local
206
+
207
+ echo "=== Applying THP settings now ==="
208
+ echo never > /sys/kernel/mm/transparent_hugepage/defrag
209
+ echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
210
+
211
+ systemctl daemon-reload
212
+
213
+ echo ""
214
+ echo "==========================================="
215
+ echo " PRE-REBOOT SETUP COMPLETE"
216
+ echo "==========================================="
217
+ echo ""
218
+ echo " Hardware: 8x H200 SXM 141GB + 4x NVSwitch"
219
+ echo " NVMe: nvme1n1 -> /data (models/vllm)"
220
+ echo " nvme2n1 -> /srv/legion (services/logs)"
221
+ echo " nvme3n1 -> /reserve (future)"
222
+ echo ""
223
+ echo " Next steps:"
224
+ echo " 1. reboot"
225
+ echo " 2. After reboot, run: bash h200-setup-postreboot.sh"
226
+ echo ""
227
+ echo " The reboot is required for the NVIDIA driver to load."
228
+ echo "==========================================="