sglang 0.2.13__tar.gz → 0.2.14.post1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.2.13/sglang.egg-info → sglang-0.2.14.post1}/PKG-INFO +100 -27
- {sglang-0.2.13 → sglang-0.2.14.post1}/README.md +95 -25
- {sglang-0.2.13 → sglang-0.2.14.post1}/pyproject.toml +4 -4
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/api.py +6 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/bench_latency.py +7 -3
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/bench_serving.py +50 -26
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/check_env.py +15 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/chat_template.py +10 -5
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/compiler.py +4 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/interpreter.py +1 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/ir.py +9 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/launch_server.py +8 -1
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/constrained/fsm_cache.py +11 -2
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/constrained/jump_forward.py +1 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/conversation.py +50 -1
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/hf_transformers_utils.py +22 -23
- sglang-0.2.14.post1/sglang/srt/layers/activation.py +131 -0
- sglang-0.2.14.post1/sglang/srt/layers/decode_attention.py +627 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/fused_moe/layer.py +2 -2
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/logits_processor.py +56 -19
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/radix_attention.py +3 -4
- sglang-0.2.14.post1/sglang/srt/layers/sampler.py +101 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/managers/controller_multi.py +2 -8
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/managers/controller_single.py +7 -10
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/managers/detokenizer_manager.py +20 -9
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/managers/io_struct.py +44 -11
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/managers/policy_scheduler.py +5 -2
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/managers/schedule_batch.py +46 -166
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/managers/tokenizer_manager.py +192 -83
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/managers/tp_worker.py +118 -24
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/mem_cache/memory_pool.py +82 -8
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/mm_utils.py +79 -7
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/model_executor/cuda_graph_runner.py +32 -8
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/model_executor/forward_batch_info.py +51 -26
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/model_executor/model_runner.py +201 -58
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/gemma2.py +10 -6
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/gpt_bigcode.py +1 -1
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/grok.py +11 -1
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/llama_embedding.py +4 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/llava.py +176 -59
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/qwen2.py +9 -3
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/openai_api/adapter.py +200 -39
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/openai_api/protocol.py +2 -0
- sglang-0.2.14.post1/sglang/srt/sampling/sampling_batch_info.py +136 -0
- {sglang-0.2.13/sglang/srt → sglang-0.2.14.post1/sglang/srt/sampling}/sampling_params.py +22 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/server.py +92 -57
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/server_args.py +43 -15
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/utils.py +26 -16
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/runners.py +22 -30
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/simple_eval_common.py +9 -10
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/simple_eval_gpqa.py +2 -1
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/simple_eval_humaneval.py +2 -2
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/simple_eval_math.py +2 -1
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/simple_eval_mmlu.py +2 -1
- sglang-0.2.14.post1/sglang/test/test_activation.py +55 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/test_utils.py +36 -53
- sglang-0.2.14.post1/sglang/version.py +1 -0
- {sglang-0.2.13 → sglang-0.2.14.post1/sglang.egg-info}/PKG-INFO +100 -27
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang.egg-info/SOURCES.txt +4 -2
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang.egg-info/requires.txt +4 -1
- sglang-0.2.13/sglang/launch_server_llavavid.py +0 -29
- sglang-0.2.13/sglang/srt/layers/activation.py +0 -32
- sglang-0.2.13/sglang/srt/layers/decode_attention.py +0 -339
- sglang-0.2.13/sglang/version.py +0 -1
- {sglang-0.2.13 → sglang-0.2.14.post1}/LICENSE +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/setup.cfg +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/__init__.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/global_config.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/__init__.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/choices.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/lang/tracer.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/extend_attention.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/fused_moe/__init__.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/layers/prefill_attention.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/model_config.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/deepseek_v2.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/llama2.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/run_eval.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/test/test_programs.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang/utils.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.2.13 → sglang-0.2.14.post1}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.2.13/sglang.egg-info → sglang-0.2.14.post1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.13
+Version: 0.2.14.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004

@@ -216,6 +216,7 @@ Requires-Dist: tqdm
 Requires-Dist: numpy
 Provides-Extra: srt
 Requires-Dist: aiohttp; extra == "srt"
+Requires-Dist: decord; extra == "srt"
 Requires-Dist: fastapi; extra == "srt"
 Requires-Dist: hf_transfer; extra == "srt"
 Requires-Dist: huggingface_hub; extra == "srt"

@@ -229,7 +230,7 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.5.
+Requires-Dist: vllm==0.5.5; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"

@@ -242,6 +243,8 @@ Provides-Extra: test
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
+Requires-Dist: sentence_transformers; extra == "test"
+Requires-Dist: accelerate; extra == "test"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"

@@ -270,17 +273,18 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

 The core features include:
-- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism,
+- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.

 ## News
 - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/
+- [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

 <details>
 <summary>More</summary>

+- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).

@@ -308,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.
+git clone -b v0.2.14.post1 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip

@@ -334,14 +338,60 @@ docker run --gpus all \

 ### Method 4: Using docker compose

+<details>
+<summary>More</summary>
+
 > This method is recommended if you plan to serve it as a service.
 > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).

 1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
 2. Execute the command `docker compose up -d` in your terminal.
+</details>
+
+### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+<details>
+<summary>More</summary>
+
+To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+<details>
+<summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+```yaml
+# sglang.yaml
+envs:
+  HF_TOKEN: null
+
+resources:
+  image_id: docker:lmsysorg/sglang:latest
+  accelerators: A100
+  ports: 30000
+
+run: |
+  conda deactivate
+  python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --host 0.0.0.0 \
+    --port 30000
+```
+</details>
+
+```bash
+# Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+# Get the HTTP API endpoint
+sky status --endpoint 30000 sglang
+```
+3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+</details>
+

 ### Common Notes
-- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang.
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise an issue.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

 ## Backend: SGLang Runtime (SRT)

@@ -395,6 +445,13 @@ response = client.chat.completions.create(
     max_tokens=64,
 )
 print(response)
+
+# Text embedding
+response = client.embeddings.create(
+    model="default",
+    input="How are you today",
+)
+print(response)
 ```

 It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

@@ -431,19 +488,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct

 ### Supported Models

+**Generative Models**
+
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
-- LLaVA
-  - `
-  -
-
-  -
-  -
+- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+- LLaVA 1.5 / 1.6 / NeXT
+  - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+  - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - Yi-VL
-  - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
 - Command-R
 - DBRX

@@ -451,37 +510,52 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - ChatGLM
 - InternLM 2

+**Embedding Models**
+
+- e5-mistral
+- gte-Qwen2
+  - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

 #### Use Models From ModelScope
-
+<details>
+<summary>More</summary>
+
+To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
 ```
 export SGLANG_USE_MODELSCOPE=true
 ```
 Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
 ```
 SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
-```
+```
+
+</details>

 #### Run Llama 3.1 405B
+<details>
+<summary>More</summary>

 ```bash
-
+# Run 405B (fp8) on a single node
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8

-
-
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+# Run 405B (fp16) on two nodes
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph

-
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
 ```
+
+</details>

 ### Benchmark Performance

-- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+  Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle.
+  A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, please use `sglang.bench_serving` instead.
 ```
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
 ```

@@ -614,7 +688,7 @@ def tip_suggestion(s):
     s += "In summary" + sgl.gen("summary")
 ```

-#### Multi
+#### Multi-Modality
 Use `sgl.image` to pass an image as input.

 ```python

@@ -668,7 +742,7 @@ def character_gen(s, name):
     s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
 ```

-See also [json_decode.py](examples/usage/json_decode.py) for an additional example
+See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

 #### Batching
 Use `run_batch` to run a batch of requests with continuous batching.

@@ -730,7 +804,6 @@ def chat_example(s):
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

-
 ## Benchmark And Performance
 
 
{sglang-0.2.13 → sglang-0.2.14.post1}/README.md

@@ -17,17 +17,18 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

 The core features include:
-- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism,
+- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.

 ## News
 - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/
+- [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

 <details>
 <summary>More</summary>

+- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).

@@ -55,7 +56,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.
+git clone -b v0.2.14.post1 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip

@@ -81,14 +82,60 @@ docker run --gpus all \

 ### Method 4: Using docker compose

+<details>
+<summary>More</summary>
+
 > This method is recommended if you plan to serve it as a service.
 > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).

 1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
 2. Execute the command `docker compose up -d` in your terminal.
+</details>
+
+### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+<details>
+<summary>More</summary>
+
+To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+<details>
+<summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+```yaml
+# sglang.yaml
+envs:
+  HF_TOKEN: null
+
+resources:
+  image_id: docker:lmsysorg/sglang:latest
+  accelerators: A100
+  ports: 30000
+
+run: |
+  conda deactivate
+  python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --host 0.0.0.0 \
+    --port 30000
+```
+</details>
+
+```bash
+# Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+# Get the HTTP API endpoint
+sky status --endpoint 30000 sglang
+```
+3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+</details>
+

 ### Common Notes
-- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang.
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise an issue.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

 ## Backend: SGLang Runtime (SRT)

@@ -142,6 +189,13 @@ response = client.chat.completions.create(
     max_tokens=64,
 )
 print(response)
+
+# Text embedding
+response = client.embeddings.create(
+    model="default",
+    input="How are you today",
+)
+print(response)
 ```

 It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

@@ -178,19 +232,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct

 ### Supported Models

+**Generative Models**
+
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
-- LLaVA
-  - `
-  -
-
-  -
-  -
+- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+- LLaVA 1.5 / 1.6 / NeXT
+  - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+  - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - Yi-VL
-  - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
 - Command-R
 - DBRX

@@ -198,37 +254,52 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - ChatGLM
 - InternLM 2

+**Embedding Models**
+
+- e5-mistral
+- gte-Qwen2
+  - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

 #### Use Models From ModelScope
-
+<details>
+<summary>More</summary>
+
+To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
 ```
 export SGLANG_USE_MODELSCOPE=true
 ```
 Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
 ```
 SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
-```
+```
+
+</details>

 #### Run Llama 3.1 405B
+<details>
+<summary>More</summary>

 ```bash
-
+# Run 405B (fp8) on a single node
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8

-
-
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+# Run 405B (fp16) on two nodes
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph

-
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
 ```
+
+</details>

 ### Benchmark Performance

-- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+  Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle.
+  A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, please use `sglang.bench_serving` instead.
 ```
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
 ```

@@ -361,7 +432,7 @@ def tip_suggestion(s):
     s += "In summary" + sgl.gen("summary")
 ```

-#### Multi
+#### Multi-Modality
 Use `sgl.image` to pass an image as input.

 ```python

@@ -415,7 +486,7 @@ def character_gen(s, name):
     s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
 ```

-See also [json_decode.py](examples/usage/json_decode.py) for an additional example
+See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

 #### Batching
 Use `run_batch` to run a batch of requests with continuous batching.

@@ -477,7 +548,6 @@ def chat_example(s):
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

-
 ## Benchmark And Performance
 
 
{sglang-0.2.13 → sglang-0.2.14.post1}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.2.13"
+version = "0.2.14.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"

@@ -20,14 +20,14 @@ dependencies = [
 ]

 [project.optional-dependencies]
-srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
+srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
        "packaging", "pillow", "psutil", "pydantic", "python-multipart",
        "torch", "uvicorn", "uvloop", "zmq",
-       "vllm==0.5.
+       "vllm==0.5.5", "outlines>=0.0.44"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
-test = ["jsonlines", "matplotlib", "pandas"]
+test = ["jsonlines", "matplotlib", "pandas", "sentence_transformers", "accelerate"]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]
 
{sglang-0.2.13 → sglang-0.2.14.post1}/sglang/api.py

@@ -66,6 +66,7 @@ def gen(
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
+    min_p: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,

@@ -103,6 +104,7 @@ def gen(
         temperature,
         top_p,
         top_k,
+        min_p,
         frequency_penalty,
         presence_penalty,
         ignore_eos,

@@ -123,6 +125,7 @@ def gen_int(
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
+    min_p: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,

@@ -139,6 +142,7 @@ def gen_int(
         temperature,
         top_p,
         top_k,
+        min_p,
         frequency_penalty,
         presence_penalty,
         ignore_eos,

@@ -159,6 +163,7 @@ def gen_string(
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
+    min_p: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,

@@ -175,6 +180,7 @@ def gen_string(
         temperature,
         top_p,
         top_k,
+        min_p,
         frequency_penalty,
         presence_penalty,
         ignore_eos,
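The api.py hunks above thread a new `min_p` argument through `sgl.gen`, `sgl.gen_int`, and `sgl.gen_string`. Below is a minimal frontend sketch of passing it; the endpoint, model, and parameter values are illustrative assumptions, and the comment describes generic min-p sampling rather than anything stated in this diff.

```python
# Sketch only: `min_p` is taken from the sglang/api.py hunks above; the endpoint
# and chosen values are placeholders, not part of the released diff.
import sglang as sgl

@sgl.function
def short_answer(s, question):
    s += "Q: " + question + "\n"
    # min_p rides alongside temperature / top_p / top_k; under typical min-p sampling it
    # keeps only tokens whose probability is at least min_p times the top token's probability.
    s += "A: " + sgl.gen("answer", max_tokens=64, temperature=0.7, top_p=0.95, min_p=0.05)

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = short_answer.run(question="What does RadixAttention cache?")
print(state["answer"])
```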
{sglang-0.2.13 → sglang-0.2.14.post1}/sglang/bench_latency.py

@@ -54,7 +54,7 @@ from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.model_config import ModelConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.model_executor.model_runner import ModelRunner
-from sglang.srt.sampling_params import SamplingParams
+from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import suppress_other_loggers

@@ -111,7 +111,11 @@ def load_model(server_args, tp_rank):
     suppress_other_loggers()
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

-    model_config = ModelConfig(
+    model_config = ModelConfig(
+        server_args.model_path,
+        server_args.trust_remote_code,
+        context_length=server_args.context_length,
+    )
     model_runner = ModelRunner(
         model_config=model_config,
         mem_fraction_static=server_args.mem_fraction_static,

@@ -350,7 +354,7 @@ def latency_test(
     for bs, il, ol in itertools.product(
         bench_args.batch_size, bench_args.input_len, bench_args.output_len
     ):
-
+        reqs = prepare_synthetic_inputs_for_latency_test(bs, il)
         ret = latency_test_run_once(
             bench_args.run_name, model_runner, rank_print, reqs, bs, il, ol
         )
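Two of the bench_latency.py changes matter for code that imports SGLang internals directly: `SamplingParams` now lives under `sglang.srt.sampling.sampling_params`, and `ModelConfig` is constructed with an explicit `context_length` keyword. A small sketch of the updated call sites follows; the model path and values are placeholders, and the behavior of `context_length=None` is an assumption rather than something stated in the diff.

```python
# Sketch only: mirrors the import path and ModelConfig call shown in the
# bench_latency.py hunks above; the model path and values are placeholders.
from sglang.srt.model_config import ModelConfig
from sglang.srt.sampling.sampling_params import SamplingParams  # was sglang.srt.sampling_params in 0.2.13

model_config = ModelConfig(
    "meta-llama/Meta-Llama-3-8B-Instruct",   # model path (placeholder)
    False,                                   # trust_remote_code, as in bench_latency.py
    context_length=None,                     # assumption: None keeps the model's own context window
)

# temperature and max_new_tokens are existing SamplingParams arguments; note the moved
# module also gained lines in this release (sampling_params.py +22 in the file list above).
sampling_params = SamplingParams(temperature=0.0, max_new_tokens=16)
```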