sglang 0.2.13__tar.gz → 0.2.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.2.13/sglang.egg-info → sglang-0.2.14}/PKG-INFO +92 -25
- {sglang-0.2.13 → sglang-0.2.14}/README.md +87 -23
- {sglang-0.2.13 → sglang-0.2.14}/pyproject.toml +4 -4
- {sglang-0.2.13 → sglang-0.2.14}/sglang/api.py +6 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/bench_latency.py +7 -3
- {sglang-0.2.13 → sglang-0.2.14}/sglang/bench_serving.py +50 -26
- {sglang-0.2.13 → sglang-0.2.14}/sglang/check_env.py +15 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/chat_template.py +10 -5
- {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/compiler.py +4 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/interpreter.py +1 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/ir.py +9 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/launch_server.py +8 -1
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/conversation.py +50 -1
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/hf_transformers_utils.py +22 -23
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/activation.py +24 -1
- sglang-0.2.14/sglang/srt/layers/decode_attention.py +627 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/fused_moe/layer.py +2 -2
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/layernorm.py +3 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/logits_processor.py +60 -23
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/radix_attention.py +3 -4
- sglang-0.2.14/sglang/srt/layers/sampler.py +154 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/managers/controller_multi.py +2 -8
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/managers/controller_single.py +7 -10
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/managers/detokenizer_manager.py +20 -9
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/managers/io_struct.py +44 -11
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/managers/policy_scheduler.py +5 -2
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/managers/schedule_batch.py +52 -167
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/managers/tokenizer_manager.py +192 -83
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/managers/tp_worker.py +130 -43
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/mem_cache/memory_pool.py +82 -8
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/mm_utils.py +79 -7
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/model_executor/cuda_graph_runner.py +49 -11
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/model_executor/forward_batch_info.py +59 -27
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/model_executor/model_runner.py +210 -61
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/chatglm.py +4 -12
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/commandr.py +5 -1
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/dbrx.py +5 -1
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/deepseek.py +5 -1
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/deepseek_v2.py +5 -1
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/gemma.py +5 -1
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/gemma2.py +15 -7
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/gpt_bigcode.py +5 -1
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/grok.py +16 -2
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/internlm2.py +5 -1
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/llama2.py +7 -3
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/llama_classification.py +2 -2
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/llama_embedding.py +4 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/llava.py +176 -59
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/minicpm.py +5 -1
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/mixtral.py +5 -1
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/mixtral_quant.py +5 -1
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/qwen.py +5 -2
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/qwen2.py +13 -3
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/qwen2_moe.py +5 -14
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/stablelm.py +5 -1
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/openai_api/adapter.py +117 -37
- sglang-0.2.14/sglang/srt/sampling/sampling_batch_info.py +209 -0
- {sglang-0.2.13/sglang/srt → sglang-0.2.14/sglang/srt/sampling}/sampling_params.py +18 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/server.py +84 -56
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/server_args.py +43 -15
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/utils.py +26 -16
- {sglang-0.2.13 → sglang-0.2.14}/sglang/test/runners.py +23 -31
- {sglang-0.2.13 → sglang-0.2.14}/sglang/test/simple_eval_common.py +9 -10
- {sglang-0.2.13 → sglang-0.2.14}/sglang/test/simple_eval_gpqa.py +2 -1
- {sglang-0.2.13 → sglang-0.2.14}/sglang/test/simple_eval_humaneval.py +2 -2
- {sglang-0.2.13 → sglang-0.2.14}/sglang/test/simple_eval_math.py +2 -1
- {sglang-0.2.13 → sglang-0.2.14}/sglang/test/simple_eval_mmlu.py +2 -1
- sglang-0.2.14/sglang/test/test_activation.py +55 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/test/test_utils.py +36 -53
- sglang-0.2.14/sglang/version.py +1 -0
- {sglang-0.2.13 → sglang-0.2.14/sglang.egg-info}/PKG-INFO +92 -25
- {sglang-0.2.13 → sglang-0.2.14}/sglang.egg-info/SOURCES.txt +4 -2
- {sglang-0.2.13 → sglang-0.2.14}/sglang.egg-info/requires.txt +4 -1
- sglang-0.2.13/sglang/launch_server_llavavid.py +0 -29
- sglang-0.2.13/sglang/srt/layers/decode_attention.py +0 -339
- sglang-0.2.13/sglang/version.py +0 -1
- {sglang-0.2.13 → sglang-0.2.14}/LICENSE +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/setup.cfg +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/__init__.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/global_config.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/__init__.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/choices.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/lang/tracer.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/extend_attention.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/fused_moe/__init__.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/layers/prefill_attention.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/model_config.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/test/run_eval.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/test/test_programs.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang/utils.py +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.2.13 → sglang-0.2.14}/sglang.egg-info/top_level.txt +0 -0

PKG-INFO (+92 -25)

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.13
+Version: 0.2.14
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                        Version 2.0, January 2004
@@ -216,6 +216,7 @@ Requires-Dist: tqdm
 Requires-Dist: numpy
 Provides-Extra: srt
 Requires-Dist: aiohttp; extra == "srt"
+Requires-Dist: decord; extra == "srt"
 Requires-Dist: fastapi; extra == "srt"
 Requires-Dist: hf_transfer; extra == "srt"
 Requires-Dist: huggingface_hub; extra == "srt"
@@ -229,7 +230,7 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.5.
+Requires-Dist: vllm==0.5.5; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
@@ -242,6 +243,8 @@ Provides-Extra: test
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
+Requires-Dist: sentence_transformers; extra == "test"
+Requires-Dist: accelerate; extra == "test"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
@@ -270,17 +273,18 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 
 The core features include:
-- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism,
+- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
 
 ## News
 - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/
+- [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
 
@@ -308,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.13 https://github.com/sgl-project/sglang.git
+git clone -b v0.2.14 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -334,11 +338,55 @@ docker run --gpus all \
 
 ### Method 4: Using docker compose
 
+<details>
+
 > This method is recommended if you plan to serve it as a service.
 > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
 
 1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
 2. Execute the command `docker compose up -d` in your terminal.
+</details>
+
+### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+<details>
+
+To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+<details>
+<summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+```yaml
+# sglang.yaml
+envs:
+  HF_TOKEN: null
+
+resources:
+  image_id: docker:lmsysorg/sglang:latest
+  accelerators: A100
+  ports: 30000
+
+run: |
+  conda deactivate
+  python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --host 0.0.0.0 \
+    --port 30000
+```
+</details>
+
+```bash
+# Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+# Get the HTTP API endpoint
+sky status --endpoint 30000 sglang
+```
+3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+</details>
+
 
 ### Common Notes
 - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
@@ -395,6 +443,13 @@ response = client.chat.completions.create(
     max_tokens=64,
 )
 print(response)
+
+# Text embedding
+response = client.embeddings.create(
+    model="default",
+    input="How are you today",
+)
+print(response)
 ```
 
 It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
@@ -431,19 +486,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 
 ### Supported Models
 
+**Generative Models**
+
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
-- LLaVA
-  - `
-  -
-
-  -
-  -
+- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+- LLaVA 1.5 / 1.6 / NeXT
+  - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+  - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - Yi-VL
-  - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
 - Command-R
 - DBRX
@@ -451,34 +508,45 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - ChatGLM
 - InternLM 2
 
+**Embedding Models**
+
+- e5-mistral
+- gte-Qwen2
+  - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
 #### Use Models From ModelScope
-
+<details>
+
+To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
 ```
 export SGLANG_USE_MODELSCOPE=true
 ```
 Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
 ```
 SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
-```
+```
+
+</details>
 
 #### Run Llama 3.1 405B
+<details>
 
 ```bash
-
+# Run 405B (fp8) on a single node
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
 
-
-
+# Run 405B (fp16) on two nodes
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph
 
-
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank
-
-# on the second
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
 ```
+
+</details>
 
 ### Benchmark Performance
 
 - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
@@ -614,7 +682,7 @@ def tip_suggestion(s):
     s += "In summary" + sgl.gen("summary")
 ```
 
-#### Multi
+#### Multi-Modality
 Use `sgl.image` to pass an image as input.
 
 ```python
@@ -668,7 +736,7 @@ def character_gen(s, name):
     s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
 ```
 
-See also [json_decode.py](examples/usage/json_decode.py) for an additional example
+See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
 
 #### Batching
 Use `run_batch` to run a batch of requests with continuous batching.
@@ -730,7 +798,6 @@ def chat_example(s):
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
-
 ## Benchmark And Performance
 
 
````

README.md (+87 -23)

````diff
@@ -17,17 +17,18 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 
 The core features include:
-- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism,
+- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
 
 ## News
 - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/
+- [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
 
@@ -55,7 +56,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.13 https://github.com/sgl-project/sglang.git
+git clone -b v0.2.14 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -81,11 +82,55 @@ docker run --gpus all \
 
 ### Method 4: Using docker compose
 
+<details>
+
 > This method is recommended if you plan to serve it as a service.
 > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
 
 1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
 2. Execute the command `docker compose up -d` in your terminal.
+</details>
+
+### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+<details>
+
+To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+<details>
+<summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+```yaml
+# sglang.yaml
+envs:
+  HF_TOKEN: null
+
+resources:
+  image_id: docker:lmsysorg/sglang:latest
+  accelerators: A100
+  ports: 30000
+
+run: |
+  conda deactivate
+  python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --host 0.0.0.0 \
+    --port 30000
+```
+</details>
+
+```bash
+# Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+# Get the HTTP API endpoint
+sky status --endpoint 30000 sglang
+```
+3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+</details>
+
 
 ### Common Notes
 - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
@@ -142,6 +187,13 @@ response = client.chat.completions.create(
     max_tokens=64,
 )
 print(response)
+
+# Text embedding
+response = client.embeddings.create(
+    model="default",
+    input="How are you today",
+)
+print(response)
 ```
 
 It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
@@ -178,19 +230,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 
 ### Supported Models
 
+**Generative Models**
+
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
-- LLaVA
-  - `
-  -
-
-  -
-  -
+- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+- LLaVA 1.5 / 1.6 / NeXT
+  - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+  - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - Yi-VL
-  - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
 - Command-R
 - DBRX
@@ -198,34 +252,45 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - ChatGLM
 - InternLM 2
 
+**Embedding Models**
+
+- e5-mistral
+- gte-Qwen2
+  - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
 #### Use Models From ModelScope
-
+<details>
+
+To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
 ```
 export SGLANG_USE_MODELSCOPE=true
 ```
 Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
 ```
 SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
-```
+```
+
+</details>
 
 #### Run Llama 3.1 405B
+<details>
 
 ```bash
-
+# Run 405B (fp8) on a single node
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
 
-
-
+# Run 405B (fp16) on two nodes
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph
 
-
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank
-
-# on the second
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
 ```
+
+</details>
 
 ### Benchmark Performance
 
 - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
@@ -361,7 +426,7 @@ def tip_suggestion(s):
     s += "In summary" + sgl.gen("summary")
 ```
 
-#### Multi
+#### Multi-Modality
 Use `sgl.image` to pass an image as input.
 
 ```python
@@ -415,7 +480,7 @@ def character_gen(s, name):
     s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
 ```
 
-See also [json_decode.py](examples/usage/json_decode.py) for an additional example
+See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
 
 #### Batching
 Use `run_batch` to run a batch of requests with continuous batching.
@@ -477,7 +542,6 @@ def chat_example(s):
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
-
 ## Benchmark And Performance
 
 
````

pyproject.toml (+4 -4)

````diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.2.13"
+version = "0.2.14"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -20,14 +20,14 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
+srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
 "packaging", "pillow", "psutil", "pydantic", "python-multipart",
 "torch", "uvicorn", "uvloop", "zmq",
-"vllm==0.5.
+"vllm==0.5.5", "outlines>=0.0.44"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
-test = ["jsonlines", "matplotlib", "pandas"]
+test = ["jsonlines", "matplotlib", "pandas", "sentence_transformers", "accelerate"]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]
 
````

sglang/api.py (+6 -0)

````diff
@@ -66,6 +66,7 @@ def gen(
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
+    min_p: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
@@ -103,6 +104,7 @@ def gen(
         temperature,
         top_p,
         top_k,
+        min_p,
         frequency_penalty,
         presence_penalty,
         ignore_eos,
@@ -123,6 +125,7 @@ def gen_int(
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
+    min_p: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
@@ -139,6 +142,7 @@ def gen_int(
         temperature,
         top_p,
         top_k,
+        min_p,
         frequency_penalty,
         presence_penalty,
         ignore_eos,
@@ -159,6 +163,7 @@ def gen_string(
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
+    min_p: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
@@ -175,6 +180,7 @@ def gen_string(
         temperature,
         top_p,
         top_k,
+        min_p,
         frequency_penalty,
         presence_penalty,
         ignore_eos,
````
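The `sglang/api.py` hunks above thread a new `min_p` sampling argument through `gen`, `gen_int`, and `gen_string`. A minimal frontend sketch of passing it, assuming an SRT server is already running locally on port 30000; the model, prompt, and the `min_p=0.05` threshold are illustrative values, not taken from the diff:

```python
import sglang as sgl

@sgl.function
def short_answer(s, question):
    s += "Q: " + question + "\n"
    # min_p is newly accepted by sgl.gen in 0.2.14; 0.05 is an arbitrary example threshold.
    s += "A: " + sgl.gen("answer", max_tokens=32, temperature=0.7, top_p=0.95, min_p=0.05)

# Assumes a server launched separately, e.g.:
#   python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

state = short_answer.run(question="What does RadixAttention cache?")
print(state["answer"])
```

Whether the threshold actually filters tokens end to end depends on the backend; the new `sglang/srt/sampling/sampling_batch_info.py` in the file list above suggests the server-side sampling state was reworked in this release.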

sglang/bench_latency.py (+7 -3)

````diff
@@ -54,7 +54,7 @@ from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.model_config import ModelConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.model_executor.model_runner import ModelRunner
-from sglang.srt.sampling_params import SamplingParams
+from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import suppress_other_loggers
 
@@ -111,7 +111,11 @@ def load_model(server_args, tp_rank):
     suppress_other_loggers()
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
-    model_config = ModelConfig(
+    model_config = ModelConfig(
+        server_args.model_path,
+        server_args.trust_remote_code,
+        context_length=server_args.context_length,
+    )
     model_runner = ModelRunner(
         model_config=model_config,
         mem_fraction_static=server_args.mem_fraction_static,
@@ -350,7 +354,7 @@ def latency_test(
     for bs, il, ol in itertools.product(
         bench_args.batch_size, bench_args.input_len, bench_args.output_len
     ):
-
+        reqs = prepare_synthetic_inputs_for_latency_test(bs, il)
         ret = latency_test_run_once(
             bench_args.run_name, model_runner, rank_print, reqs, bs, il, ol
         )
````