sglang 0.2.12.tar.gz → 0.2.14.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.2.12/sglang.egg-info → sglang-0.2.14}/PKG-INFO +102 -27
- {sglang-0.2.12 → sglang-0.2.14}/README.md +97 -25
- {sglang-0.2.12 → sglang-0.2.14}/pyproject.toml +4 -4
- {sglang-0.2.12 → sglang-0.2.14}/sglang/api.py +13 -1
- {sglang-0.2.12 → sglang-0.2.14}/sglang/bench_latency.py +10 -5
- {sglang-0.2.12 → sglang-0.2.14}/sglang/bench_serving.py +50 -26
- {sglang-0.2.12 → sglang-0.2.14}/sglang/check_env.py +15 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/global_config.py +1 -1
- {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/backend/runtime_endpoint.py +60 -49
- {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/chat_template.py +10 -5
- {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/compiler.py +4 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/interpreter.py +5 -2
- {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/ir.py +22 -4
- {sglang-0.2.12 → sglang-0.2.14}/sglang/launch_server.py +8 -1
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/constrained/jump_forward.py +13 -2
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/conversation.py +50 -1
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/hf_transformers_utils.py +22 -23
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/layers/activation.py +24 -2
- sglang-0.2.14/sglang/srt/layers/decode_attention.py +627 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/layers/extend_attention.py +3 -1
- sglang-0.2.14/sglang/srt/layers/fused_moe/__init__.py +1 -0
- {sglang-0.2.12/sglang/srt/layers → sglang-0.2.14/sglang/srt/layers/fused_moe}/fused_moe.py +165 -108
- sglang-0.2.14/sglang/srt/layers/fused_moe/layer.py +587 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/layers/layernorm.py +3 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/layers/logits_processor.py +64 -27
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/layers/radix_attention.py +41 -18
- sglang-0.2.14/sglang/srt/layers/sampler.py +154 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/managers/controller_multi.py +2 -8
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/managers/controller_single.py +7 -10
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/managers/detokenizer_manager.py +20 -9
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/managers/io_struct.py +44 -11
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/managers/policy_scheduler.py +5 -2
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/managers/schedule_batch.py +59 -179
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/managers/tokenizer_manager.py +193 -84
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/managers/tp_worker.py +131 -50
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/mem_cache/memory_pool.py +82 -8
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/mm_utils.py +79 -7
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/model_executor/cuda_graph_runner.py +97 -28
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/model_executor/forward_batch_info.py +188 -82
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/model_executor/model_runner.py +269 -87
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/chatglm.py +6 -14
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/commandr.py +6 -2
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/dbrx.py +5 -1
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/deepseek.py +7 -3
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/deepseek_v2.py +12 -7
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/gemma.py +6 -2
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/gemma2.py +22 -8
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/gpt_bigcode.py +5 -1
- sglang-0.2.14/sglang/srt/models/grok.py +422 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/internlm2.py +5 -1
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/llama2.py +7 -3
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/llama_classification.py +2 -2
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/llama_embedding.py +4 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/llava.py +176 -59
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/minicpm.py +7 -3
- sglang-0.2.14/sglang/srt/models/mixtral.py +384 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/mixtral_quant.py +6 -5
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/qwen.py +7 -4
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/qwen2.py +15 -5
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/qwen2_moe.py +7 -16
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/stablelm.py +6 -2
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/openai_api/adapter.py +149 -58
- sglang-0.2.14/sglang/srt/sampling/sampling_batch_info.py +209 -0
- {sglang-0.2.12/sglang/srt → sglang-0.2.14/sglang/srt/sampling}/sampling_params.py +18 -4
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/server.py +107 -71
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/server_args.py +49 -15
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/utils.py +27 -18
- {sglang-0.2.12 → sglang-0.2.14}/sglang/test/runners.py +38 -38
- {sglang-0.2.12 → sglang-0.2.14}/sglang/test/simple_eval_common.py +9 -10
- {sglang-0.2.12 → sglang-0.2.14}/sglang/test/simple_eval_gpqa.py +2 -1
- {sglang-0.2.12 → sglang-0.2.14}/sglang/test/simple_eval_humaneval.py +2 -2
- {sglang-0.2.12 → sglang-0.2.14}/sglang/test/simple_eval_math.py +2 -1
- {sglang-0.2.12 → sglang-0.2.14}/sglang/test/simple_eval_mmlu.py +2 -1
- sglang-0.2.14/sglang/test/test_activation.py +55 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/test/test_programs.py +32 -5
- {sglang-0.2.12 → sglang-0.2.14}/sglang/test/test_utils.py +37 -50
- sglang-0.2.14/sglang/version.py +1 -0
- {sglang-0.2.12 → sglang-0.2.14/sglang.egg-info}/PKG-INFO +102 -27
- {sglang-0.2.12 → sglang-0.2.14}/sglang.egg-info/SOURCES.txt +7 -5
- {sglang-0.2.12 → sglang-0.2.14}/sglang.egg-info/requires.txt +4 -1
- sglang-0.2.12/sglang/launch_server_llavavid.py +0 -29
- sglang-0.2.12/sglang/srt/layers/decode_attention.py +0 -339
- sglang-0.2.12/sglang/srt/model_loader/model_loader.py +0 -292
- sglang-0.2.12/sglang/srt/model_loader/utils.py +0 -275
- sglang-0.2.12/sglang/srt/models/grok.py +0 -754
- sglang-0.2.12/sglang/srt/models/mixtral.py +0 -578
- sglang-0.2.12/sglang/version.py +0 -1
- {sglang-0.2.12 → sglang-0.2.14}/LICENSE +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/setup.cfg +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/__init__.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/__init__.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/choices.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/lang/tracer.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/layers/prefill_attention.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/model_config.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/test/run_eval.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang/utils.py +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.2.12 → sglang-0.2.14}/sglang.egg-info/top_level.txt +0 -0
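One structural change visible in the listing is that `sampling_params.py` moved into the new `sglang/srt/sampling` package (and `fused_moe.py` into `sglang/srt/layers/fused_moe`), so code that imports these modules directly needs updating. A minimal sketch of the import-path change, based on the listing and the `sglang/bench_latency.py` hunk later in this diff; the old path is shown only as a comment, and the constructor arguments are assumptions for illustration:

```python
# Import-path change implied by the file moves listed above (illustrative sketch).

# sglang 0.2.12:
# from sglang.srt.sampling_params import SamplingParams

# sglang 0.2.14:
from sglang.srt.sampling.sampling_params import SamplingParams

# Keyword arguments here are assumptions for illustration.
params = SamplingParams(max_new_tokens=16, temperature=0.0)
```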
{sglang-0.2.12/sglang.egg-info → sglang-0.2.14}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.12
+Version: 0.2.14
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
     Version 2.0, January 2004
@@ -216,6 +216,7 @@ Requires-Dist: tqdm
 Requires-Dist: numpy
 Provides-Extra: srt
 Requires-Dist: aiohttp; extra == "srt"
+Requires-Dist: decord; extra == "srt"
 Requires-Dist: fastapi; extra == "srt"
 Requires-Dist: hf_transfer; extra == "srt"
 Requires-Dist: huggingface_hub; extra == "srt"
@@ -229,7 +230,7 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.5.
+Requires-Dist: vllm==0.5.5; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
@@ -242,6 +243,8 @@ Provides-Extra: test
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
+Requires-Dist: sentence_transformers; extra == "test"
+Requires-Dist: accelerate; extra == "test"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
@@ -270,17 +273,18 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

 The core features include:
-- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism,
+- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.

 ## News
 - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/
+- [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

 <details>
 <summary>More</summary>

+- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).

@@ -308,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
+git clone -b v0.2.14 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
@@ -329,11 +333,63 @@ docker run --gpus all \
     --env "HF_TOKEN=<secret>" \
     --ipc=host \
     lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
+    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 ```

+### Method 4: Using docker compose
+
+<details>
+
+> This method is recommended if you plan to serve it as a service.
+> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
+
+1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
+2. Execute the command `docker compose up -d` in your terminal.
+</details>
+
+### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+<details>
+
+To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+<details>
+<summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+```yaml
+# sglang.yaml
+envs:
+  HF_TOKEN: null
+
+resources:
+  image_id: docker:lmsysorg/sglang:latest
+  accelerators: A100
+  ports: 30000
+
+run: |
+  conda deactivate
+  python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --host 0.0.0.0 \
+    --port 30000
+```
+</details>
+
+```bash
+# Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+# Get the HTTP API endpoint
+sky status --endpoint 30000 sglang
+```
+3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+</details>
+
+
 ### Common Notes
--
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

 ## Backend: SGLang Runtime (SRT)
@@ -387,6 +443,13 @@ response = client.chat.completions.create(
     max_tokens=64,
 )
 print(response)
+
+# Text embedding
+response = client.embeddings.create(
+    model="default",
+    input="How are you today",
+)
+print(response)
 ```

 It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
@@ -423,19 +486,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct

 ### Supported Models

+**Generative Models**
+
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
-- LLaVA
-  - `
-  -
-
-  -
-  -
+- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+- LLaVA 1.5 / 1.6 / NeXT
+  - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+  - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - Yi-VL
-  - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
 - Command-R
 - DBRX
@@ -443,34 +508,45 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - ChatGLM
 - InternLM 2

+**Embedding Models**
+
+- e5-mistral
+- gte-Qwen2
+  - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

 #### Use Models From ModelScope
-
+<details>
+
+To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
 ```
 export SGLANG_USE_MODELSCOPE=true
 ```
 Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
 ```
 SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
-```
+```
+
+</details>

 #### Run Llama 3.1 405B
+<details>

 ```bash
-
+# Run 405B (fp8) on a single node
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8

-
-
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+# Run 405B (fp16) on two nodes
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph

-
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
 ```

+</details>
+

 ### Benchmark Performance

 - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
|
@@ -606,7 +682,7 @@ def tip_suggestion(s):
|
|
606
682
|
s += "In summary" + sgl.gen("summary")
|
607
683
|
```
|
608
684
|
|
609
|
-
#### Multi
|
685
|
+
#### Multi-Modality
|
610
686
|
Use `sgl.image` to pass an image as input.
|
611
687
|
|
612
688
|
```python
|
@@ -660,7 +736,7 @@ def character_gen(s, name):
|
|
660
736
|
s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
|
661
737
|
```
|
662
738
|
|
663
|
-
See also [json_decode.py](examples/usage/json_decode.py) for an additional example
|
739
|
+
See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
|
664
740
|
|
665
741
|
#### Batching
|
666
742
|
Use `run_batch` to run a batch of requests with continuous batching.
|
@@ -722,7 +798,6 @@ def chat_example(s):
|
|
722
798
|
- The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
|
723
799
|
- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
|
724
800
|
|
725
|
-
|
726
801
|
## Benchmark And Performance
|
727
802
|

|
728
803
|

|
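The packaging metadata changes above introduce embedding models and an OpenAI-compatible embeddings call. A small end-to-end sketch combining the launch command and the client snippet from the diff; the server address, port, and client usage mirror the README example and are assumptions here:

```python
import openai

# Assumes a server launched as shown in the diff above, e.g.:
#   python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \
#       --is-embedding --port 30000
client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

# Text embedding request against the OpenAI-compatible endpoint.
response = client.embeddings.create(model="default", input="How are you today")
print(len(response.data[0].embedding))  # dimensionality of the returned vector
```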
{sglang-0.2.12 → sglang-0.2.14}/README.md

@@ -17,17 +17,18 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

 The core features include:
-- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism,
+- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.

 ## News
 - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/
+- [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

 <details>
 <summary>More</summary>

+- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).

@@ -55,7 +56,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
+git clone -b v0.2.14 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
@@ -76,11 +77,63 @@ docker run --gpus all \
     --env "HF_TOKEN=<secret>" \
     --ipc=host \
     lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
+    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 ```

+### Method 4: Using docker compose
+
+<details>
+
+> This method is recommended if you plan to serve it as a service.
+> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
+
+1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
+2. Execute the command `docker compose up -d` in your terminal.
+</details>
+
+### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+<details>
+
+To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+<details>
+<summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+```yaml
+# sglang.yaml
+envs:
+  HF_TOKEN: null
+
+resources:
+  image_id: docker:lmsysorg/sglang:latest
+  accelerators: A100
+  ports: 30000
+
+run: |
+  conda deactivate
+  python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --host 0.0.0.0 \
+    --port 30000
+```
+</details>
+
+```bash
+# Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+# Get the HTTP API endpoint
+sky status --endpoint 30000 sglang
+```
+3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+</details>
+
+
 ### Common Notes
--
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

 ## Backend: SGLang Runtime (SRT)
@@ -134,6 +187,13 @@ response = client.chat.completions.create(
     max_tokens=64,
 )
 print(response)
+
+# Text embedding
+response = client.embeddings.create(
+    model="default",
+    input="How are you today",
+)
+print(response)
 ```

 It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
@@ -170,19 +230,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct

 ### Supported Models

+**Generative Models**
+
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
-- LLaVA
-  - `
-  -
-
-  -
-  -
+- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+- LLaVA 1.5 / 1.6 / NeXT
+  - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+  - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - Yi-VL
-  - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
 - Command-R
 - DBRX
@@ -190,34 +252,45 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - ChatGLM
 - InternLM 2

+**Embedding Models**
+
+- e5-mistral
+- gte-Qwen2
+  - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

 #### Use Models From ModelScope
-
+<details>
+
+To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
 ```
 export SGLANG_USE_MODELSCOPE=true
 ```
 Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
 ```
 SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
-```
+```
+
+</details>

 #### Run Llama 3.1 405B
+<details>

 ```bash
-
+# Run 405B (fp8) on a single node
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8

-
-
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+# Run 405B (fp16) on two nodes
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph

-
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
 ```

+</details>
+

 ### Benchmark Performance

 - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
@@ -353,7 +426,7 @@ def tip_suggestion(s):
     s += "In summary" + sgl.gen("summary")
 ```

-#### Multi
+#### Multi-Modality
 Use `sgl.image` to pass an image as input.

 ```python
@@ -407,7 +480,7 @@ def character_gen(s, name):
     s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
 ```

-See also [json_decode.py](examples/usage/json_decode.py) for an additional example
+See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

 #### Batching
 Use `run_batch` to run a batch of requests with continuous batching.
@@ -469,7 +542,6 @@ def chat_example(s):
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

-
 ## Benchmark And Performance
 
 
{sglang-0.2.12 → sglang-0.2.14}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.2.12"
+version = "0.2.14"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -20,14 +20,14 @@ dependencies = [
 ]

 [project.optional-dependencies]
-srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
+srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
        "packaging", "pillow", "psutil", "pydantic", "python-multipart",
        "torch", "uvicorn", "uvloop", "zmq",
-       "vllm==0.5.
+       "vllm==0.5.5", "outlines>=0.0.44"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
-test = ["jsonlines", "matplotlib", "pandas"]
+test = ["jsonlines", "matplotlib", "pandas", "sentence_transformers", "accelerate"]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]

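The `pyproject.toml` hunks pin `vllm==0.5.5`, add `decord` to the `srt` extra, and extend the `test` extra with `sentence_transformers` and `accelerate`. A quick sketch for checking that an installed environment matches the new pins; package names are taken from the diff and `importlib.metadata` is standard library:

```python
# Sanity-check installed versions against the 0.2.14 dependency pins above.
from importlib.metadata import PackageNotFoundError, version

for pkg, expected in [("sglang", "0.2.14"), ("vllm", "0.5.5")]:
    try:
        print(pkg, version(pkg), "expected", expected)
    except PackageNotFoundError:
        print(pkg, "not installed")
```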
{sglang-0.2.12 → sglang-0.2.14}/sglang/api.py

@@ -62,9 +62,11 @@ def gen(
     name: Optional[str] = None,
     max_tokens: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
+    stop_token_ids: Optional[List[int]] = None,
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
+    min_p: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
@@ -72,7 +74,7 @@ def gen(
     logprob_start_len: Optional[int] = None,
     top_logprobs_num: Optional[int] = None,
     return_text_in_logprobs: Optional[bool] = None,
-    dtype: Optional[type] = None,
+    dtype: Optional[Union[type, str]] = None,
     choices: Optional[List[str]] = None,
     choices_method: Optional[ChoicesSamplingMethod] = None,
     regex: Optional[str] = None,
@@ -98,9 +100,11 @@ def gen(
         name,
         max_tokens,
         stop,
+        stop_token_ids,
         temperature,
         top_p,
         top_k,
+        min_p,
         frequency_penalty,
         presence_penalty,
         ignore_eos,
@@ -117,9 +121,11 @@ def gen_int(
     name: Optional[str] = None,
     max_tokens: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
+    stop_token_ids: Optional[List[int]] = None,
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
+    min_p: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
@@ -132,9 +138,11 @@ def gen_int(
         name,
         max_tokens,
         stop,
+        stop_token_ids,
         temperature,
         top_p,
         top_k,
+        min_p,
         frequency_penalty,
         presence_penalty,
         ignore_eos,
@@ -151,9 +159,11 @@ def gen_string(
     name: Optional[str] = None,
     max_tokens: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
+    stop_token_ids: Optional[List[int]] = None,
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
+    min_p: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
@@ -166,9 +176,11 @@ def gen_string(
         name,
         max_tokens,
         stop,
+        stop_token_ids,
         temperature,
         top_p,
         top_k,
+        min_p,
         frequency_penalty,
         presence_penalty,
         ignore_eos,
{sglang-0.2.12 → sglang-0.2.14}/sglang/bench_latency.py

@@ -54,7 +54,7 @@ from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.model_config import ModelConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.model_executor.model_runner import ModelRunner
-from sglang.srt.sampling_params import SamplingParams
+from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import suppress_other_loggers

@@ -64,7 +64,7 @@ class BenchArgs:
     run_name: str = "before"
     batch_size: Tuple[int] = (1,)
     input_len: Tuple[int] = (1024,)
-    output_len: Tuple[int] = (
+    output_len: Tuple[int] = (16,)
     result_filename: str = ""
     correctness_test: bool = False
     # This is only used for correctness test
@@ -111,7 +111,11 @@ def load_model(server_args, tp_rank):
     suppress_other_loggers()
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

-    model_config = ModelConfig(
+    model_config = ModelConfig(
+        server_args.model_path,
+        server_args.trust_remote_code,
+        context_length=server_args.context_length,
+    )
     model_runner = ModelRunner(
         model_config=model_config,
         mem_fraction_static=server_args.mem_fraction_static,
@@ -195,7 +199,7 @@ def extend(reqs, model_runner):
         token_to_kv_pool=model_runner.token_to_kv_pool,
         tree_cache=None,
     )
-    batch.prepare_for_extend(model_runner.model_config.vocab_size
+    batch.prepare_for_extend(model_runner.model_config.vocab_size)
     output = model_runner.forward(batch, ForwardMode.EXTEND)
     next_token_ids = batch.sample(output.next_token_logits)
     return next_token_ids, output.next_token_logits, batch
@@ -221,6 +225,7 @@ def correctness_test(

     # Prepare inputs
     input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
+    rank_print(f"{input_ids=}")

     if bench_args.cut_len > 0:
         # Prefill
@@ -349,7 +354,7 @@ def latency_test(
     for bs, il, ol in itertools.product(
         bench_args.batch_size, bench_args.input_len, bench_args.output_len
     ):
-
+        reqs = prepare_synthetic_inputs_for_latency_test(bs, il)
         ret = latency_test_run_once(
             bench_args.run_name, model_runner, rank_print, reqs, bs, il, ol
         )
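The `bench_latency.py` hunks change how `ModelConfig` is constructed, log the prepared `input_ids`, and generate synthetic requests per (batch size, input length) combination while the default `output_len` becomes 16. A sketch of invoking the static-batch benchmark with those defaults; the CLI flag names are derived from the `BenchArgs`/`ServerArgs` field names and are assumptions, not verified against the new release:

```python
# Illustrative invocation of the static-batch latency benchmark (no server needed).
import subprocess
import sys

subprocess.run(
    [
        sys.executable, "-m", "sglang.bench_latency",
        "--model-path", "meta-llama/Meta-Llama-3.1-8B-Instruct",  # assumed model
        "--batch-size", "1",
        "--input-len", "1024",
        "--output-len", "16",  # matches the new BenchArgs default above
    ],
    check=True,
)
```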