sglang 0.3.1.post3__tar.gz → 0.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.3.1.post3/sglang.egg-info → sglang-0.3.3}/PKG-INFO +42 -23
- {sglang-0.3.1.post3 → sglang-0.3.3}/README.md +40 -22
- {sglang-0.3.1.post3 → sglang-0.3.3}/pyproject.toml +2 -2
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/__init__.py +2 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/api.py +23 -1
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/bench_latency.py +48 -33
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/bench_server_latency.py +0 -6
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/bench_serving.py +2 -2
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/lang/backend/runtime_endpoint.py +14 -1
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/lang/interpreter.py +16 -6
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/lang/ir.py +20 -4
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/configs/model_config.py +11 -9
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/constrained/fsm_cache.py +9 -1
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/constrained/jump_forward.py +15 -2
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/hf_transformers_utils.py +1 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/layers/activation.py +4 -4
- sglang-0.3.3/sglang/srt/layers/attention/__init__.py +49 -0
- sglang-0.3.3/sglang/srt/layers/attention/flashinfer_backend.py +277 -0
- {sglang-0.3.1.post3/sglang/srt/layers → sglang-0.3.3/sglang/srt/layers/attention}/flashinfer_utils.py +82 -80
- sglang-0.3.3/sglang/srt/layers/attention/triton_backend.py +161 -0
- {sglang-0.3.1.post3/sglang/srt/layers/triton_attention → sglang-0.3.3/sglang/srt/layers/attention/triton_ops}/extend_attention.py +3 -1
- sglang-0.3.3/sglang/srt/layers/fused_moe/patch.py +117 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/layers/layernorm.py +4 -4
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/layers/logits_processor.py +19 -15
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/layers/pooler.py +3 -3
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/layers/quantization/__init__.py +0 -2
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/layers/radix_attention.py +6 -4
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/layers/sampler.py +6 -4
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/layers/torchao_utils.py +18 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/lora/lora.py +20 -21
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/lora/lora_manager.py +97 -25
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/managers/detokenizer_manager.py +31 -18
- sglang-0.3.3/sglang/srt/managers/image_processor.py +187 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/managers/io_struct.py +99 -75
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/managers/schedule_batch.py +187 -68
- sglang-0.3.1.post3/sglang/srt/managers/policy_scheduler.py → sglang-0.3.3/sglang/srt/managers/schedule_policy.py +31 -21
- sglang-0.3.1.post3/sglang/srt/managers/tp_worker.py → sglang-0.3.3/sglang/srt/managers/scheduler.py +380 -384
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/managers/tokenizer_manager.py +120 -247
- sglang-0.3.3/sglang/srt/managers/tp_worker.py +128 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/mem_cache/memory_pool.py +34 -52
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/mem_cache/radix_cache.py +5 -5
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/model_executor/cuda_graph_runner.py +25 -25
- sglang-0.3.3/sglang/srt/model_executor/forward_batch_info.py +173 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/model_executor/model_runner.py +76 -78
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/baichuan.py +10 -10
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/chatglm.py +12 -12
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/commandr.py +10 -10
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/dbrx.py +12 -12
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/deepseek.py +10 -10
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/deepseek_v2.py +14 -15
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/exaone.py +10 -10
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/gemma.py +10 -10
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/gemma2.py +11 -11
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/gpt_bigcode.py +10 -10
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/grok.py +10 -10
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/internlm2.py +10 -10
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/llama.py +22 -10
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/llama_classification.py +5 -5
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/llama_embedding.py +4 -4
- sglang-0.3.3/sglang/srt/models/llama_reward.py +142 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/llava.py +39 -33
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/llavavid.py +31 -28
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/minicpm.py +10 -10
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/minicpm3.py +14 -15
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/mixtral.py +10 -10
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/mixtral_quant.py +10 -10
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/olmoe.py +10 -10
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/qwen.py +10 -10
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/qwen2.py +11 -11
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/qwen2_moe.py +10 -10
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/stablelm.py +10 -10
- sglang-0.3.3/sglang/srt/models/torch_native_llama.py +506 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/xverse.py +10 -10
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/xverse_moe.py +10 -10
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/openai_api/adapter.py +7 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/sampling/sampling_batch_info.py +36 -27
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/sampling/sampling_params.py +3 -1
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/server.py +170 -119
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/server_args.py +54 -27
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/utils.py +101 -128
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/test/runners.py +76 -33
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/test/test_programs.py +38 -5
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/test/test_utils.py +53 -9
- sglang-0.3.3/sglang/version.py +1 -0
- {sglang-0.3.1.post3 → sglang-0.3.3/sglang.egg-info}/PKG-INFO +42 -23
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang.egg-info/SOURCES.txt +13 -8
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang.egg-info/requires.txt +1 -0
- sglang-0.3.1.post3/sglang/srt/layers/attention_backend.py +0 -482
- sglang-0.3.1.post3/sglang/srt/managers/controller_multi.py +0 -207
- sglang-0.3.1.post3/sglang/srt/managers/controller_single.py +0 -164
- sglang-0.3.1.post3/sglang/srt/model_executor/forward_batch_info.py +0 -176
- sglang-0.3.1.post3/sglang/version.py +0 -1
- {sglang-0.3.1.post3 → sglang-0.3.3}/LICENSE +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/setup.cfg +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/check_env.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/global_config.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/lang/chat_template.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/lang/choices.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/lang/tracer.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/launch_server.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/conversation.py +0 -0
- {sglang-0.3.1.post3/sglang/srt/layers/triton_attention → sglang-0.3.3/sglang/srt/layers/attention/triton_ops}/decode_attention.py +0 -0
- {sglang-0.3.1.post3/sglang/srt/layers/triton_attention → sglang-0.3.3/sglang/srt/layers/attention/triton_ops}/prefill_attention.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/layers/fused_moe/__init__.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/layers/fused_moe/layer.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang/utils.py +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.1.post3 → sglang-0.3.3}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.3.1.post3/sglang.egg-info → sglang-0.3.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.1.post3
+Version: 0.3.3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -233,6 +233,7 @@ Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
 Requires-Dist: vllm==0.5.5; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
+Requires-Dist: modelscope; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -269,16 +270,11 @@ Requires-Dist: sglang[test]; extra == "dev"

 --------------------------------------------------------------------------------

-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |

-
-
-
-
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
-- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+## Upcoming Events
+- [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
+- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.

 ## News
 - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -294,6 +290,16 @@ The core features include:

 </details>

+## About
+SGLang is a fast serving framework for large language models and vision language models.
+It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
+The core features include:
+
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+
 ## Contents
 - [Install](#install)
 - [Backend: SGLang Runtime (SRT)](#backend-sglang-runtime-srt)
@@ -318,7 +324,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.1.post3 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
@@ -339,7 +345,7 @@ docker run --gpus all \
     --env "HF_TOKEN=<secret>" \
     --ipc=host \
     lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path meta-llama/
+    python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 ```

 ### Method 4: Using docker compose
@@ -348,9 +354,9 @@ docker run --gpus all \
 <summary>More</summary>

 > This method is recommended if you plan to serve it as a service.
-> A better approach is to use the [k8s-sglang-service.yaml](
+> A better approach is to use the [k8s-sglang-service.yaml](docker/k8s-sglang-service.yaml).

-1. Copy the [compose.yml](
+1. Copy the [compose.yml](docker/compose.yaml) to your local machine
 2. Execute the command `docker compose up -d` in your terminal.
 </details>

@@ -379,7 +385,7 @@ resources:
 run: |
   conda deactivate
   python3 -m sglang.launch_server \
-    --model-path meta-llama/
+    --model-path meta-llama/Llama-3.1-8B-Instruct \
     --host 0.0.0.0 \
     --port 30000
 ```
@@ -421,7 +427,8 @@ curl http://localhost:30000/generate \
   }
 }'
 ```
-
+
+Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).

 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.
@@ -460,7 +467,7 @@ response = client.embeddings.create(
 print(response)
 ```

-It supports streaming, vision, and
+It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

 ### Additional Server Arguments
 - To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
@@ -481,10 +488,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
 - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
 - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
-- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
+- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph`
 ```
 # Node 0
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
@@ -499,9 +507,9 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- OLMoE
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
+- OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
@@ -521,7 +529,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - BaiChuan2
 - MiniCPM / MiniCPM 3
 - XVERSE / XVERSE MoE
-
+- SmolLM

 **Embedding Models**

@@ -529,7 +537,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - gte-Qwen2
   - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`

-Instructions for supporting a new model are [here](
+Instructions for supporting a new model are [here](docs/en/model_support.md).

 #### Use Models From ModelScope
 <details>
@@ -543,6 +551,17 @@ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instru
 ```
 SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
 ```
+
+Or start it by docker.
+```bash
+docker run --gpus all \
+    -p 30000:30000 \
+    -v ~/.cache/modelscope:/root/.cache/modelscope \
+    --env "SGLANG_USE_MODELSCOPE=true" \
+    --ipc=host \
+    lmsysorg/sglang:latest \
+    python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
+```

 </details>

@@ -581,7 +600,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.

 ### Quick Start
-The example below shows how to use sglang to answer a
+The example below shows how to use sglang to answer a multi-turn question.

 #### Using Local Models
 First, launch a server with
@@ -824,7 +843,7 @@ def chat_example(s):
 Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).

 ## Roadmap
-[Development Roadmap (2024
+[Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

 ## Citation And Acknowledgment
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
{sglang-0.3.1.post3 → sglang-0.3.3}/README.md

@@ -11,16 +11,11 @@

 --------------------------------------------------------------------------------

-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |

-
-
-
-
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
-- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+## Upcoming Events
+- [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
+- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.

 ## News
 - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -36,6 +31,16 @@ The core features include:

 </details>

+## About
+SGLang is a fast serving framework for large language models and vision language models.
+It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
+The core features include:
+
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+
 ## Contents
 - [Install](#install)
 - [Backend: SGLang Runtime (SRT)](#backend-sglang-runtime-srt)
@@ -60,7 +65,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.1.post3 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
@@ -81,7 +86,7 @@ docker run --gpus all \
     --env "HF_TOKEN=<secret>" \
     --ipc=host \
     lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path meta-llama/
+    python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 ```

 ### Method 4: Using docker compose
@@ -90,9 +95,9 @@ docker run --gpus all \
 <summary>More</summary>

 > This method is recommended if you plan to serve it as a service.
-> A better approach is to use the [k8s-sglang-service.yaml](
+> A better approach is to use the [k8s-sglang-service.yaml](docker/k8s-sglang-service.yaml).

-1. Copy the [compose.yml](
+1. Copy the [compose.yml](docker/compose.yaml) to your local machine
 2. Execute the command `docker compose up -d` in your terminal.
 </details>

@@ -121,7 +126,7 @@ resources:
 run: |
   conda deactivate
   python3 -m sglang.launch_server \
-    --model-path meta-llama/
+    --model-path meta-llama/Llama-3.1-8B-Instruct \
     --host 0.0.0.0 \
     --port 30000
 ```
@@ -163,7 +168,8 @@ curl http://localhost:30000/generate \
   }
 }'
 ```
-
+
+Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).

 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.
@@ -202,7 +208,7 @@ response = client.embeddings.create(
 print(response)
 ```

-It supports streaming, vision, and
+It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

 ### Additional Server Arguments
 - To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
@@ -223,10 +229,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
 - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
 - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
-- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
+- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph`
 ```
 # Node 0
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
@@ -241,9 +248,9 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- OLMoE
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
+- OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
@@ -263,7 +270,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - BaiChuan2
 - MiniCPM / MiniCPM 3
 - XVERSE / XVERSE MoE
-
+- SmolLM

 **Embedding Models**

@@ -271,7 +278,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - gte-Qwen2
   - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`

-Instructions for supporting a new model are [here](
+Instructions for supporting a new model are [here](docs/en/model_support.md).

 #### Use Models From ModelScope
 <details>
@@ -285,6 +292,17 @@ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instru
 ```
 SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
 ```
+
+Or start it by docker.
+```bash
+docker run --gpus all \
+    -p 30000:30000 \
+    -v ~/.cache/modelscope:/root/.cache/modelscope \
+    --env "SGLANG_USE_MODELSCOPE=true" \
+    --ipc=host \
+    lmsysorg/sglang:latest \
+    python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
+```

 </details>

@@ -323,7 +341,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.

 ### Quick Start
-The example below shows how to use sglang to answer a
+The example below shows how to use sglang to answer a multi-turn question.

 #### Using Local Models
 First, launch a server with
@@ -566,7 +584,7 @@ def chat_example(s):
 Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).

 ## Roadmap
-[Development Roadmap (2024
+[Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

 ## Citation And Acknowledgment
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
{sglang-0.3.1.post3 → sglang-0.3.3}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.3.1.post3"
+version = "0.3.3"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -23,7 +23,7 @@ dependencies = [
 srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
        "packaging", "pillow", "psutil", "pydantic", "python-multipart",
        "torch", "torchao", "uvicorn", "uvloop", "zmq",
-       "vllm==0.5.5", "outlines>=0.0.44"]
+       "vllm==0.5.5", "outlines>=0.0.44", "modelscope"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
{sglang-0.3.1.post3 → sglang-0.3.3}/sglang/__init__.py

@@ -1,6 +1,7 @@
 # SGL API Components

 from sglang.api import (
+    Engine,
     Runtime,
     assistant,
     assistant_begin,
@@ -31,6 +32,7 @@ from sglang.lang.choices import (
 # SGLang DSL APIs
 __all__ = [
     "Runtime",
+    "Engine",
     "assistant",
     "assistant_begin",
     "assistant_end",
{sglang-0.3.1.post3 → sglang-0.3.3}/sglang/api.py

@@ -33,13 +33,23 @@ def function(


 def Runtime(*args, **kwargs):
-    # Avoid importing unnecessary dependency
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
+    # Avoid importing unnecessary dependency
     from sglang.srt.server import Runtime

     return Runtime(*args, **kwargs)


+def Engine(*args, **kwargs):
+    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
+    # Avoid importing unnecessary dependency
+    from sglang.srt.server import Engine
+
+    return Engine(*args, **kwargs)
+
+
 def set_default_backend(backend: BaseBackend):
     global_config.default_backend = backend

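The new `Engine` wrapper mirrors the existing `Runtime` wrapper: it silences TensorFlow logging and defers the heavy `sglang.srt.server` import until the call. A minimal usage sketch follows, assuming `Engine` accepts the same `model_path` keyword that `Runtime` does; the forwarded signature is not part of this excerpt, so treat the keyword as an assumption.

```python
import sglang as sgl

# sgl.Engine is new in 0.3.3; it forwards *args/**kwargs to sglang.srt.server.Engine.
# model_path is an assumed keyword, mirroring sgl.Runtime; verify against
# sglang/srt/server.py in 0.3.3 before relying on it.
engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")
```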
@@ -48,6 +58,10 @@ def flush_cache(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return False
+
+    # If backend is Runtime
+    if hasattr(backend, "endpoint"):
+        backend = backend.endpoint
     return backend.flush_cache()


@@ -55,12 +69,17 @@ def get_server_args(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return None
+
+    # If backend is Runtime
+    if hasattr(backend, "endpoint"):
+        backend = backend.endpoint
     return backend.get_server_args()


 def gen(
     name: Optional[str] = None,
     max_tokens: Optional[int] = None,
+    min_tokens: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
     stop_token_ids: Optional[List[int]] = None,
     temperature: Optional[float] = None,
@@ -100,6 +119,7 @@ def gen(
     return SglGen(
         name,
         max_tokens,
+        min_tokens,
         stop,
         stop_token_ids,
         temperature,
@@ -139,6 +159,7 @@ def gen_int(
     return SglGen(
         name,
         max_tokens,
+        None,
         stop,
         stop_token_ids,
         temperature,
@@ -177,6 +198,7 @@ def gen_string(
     return SglGen(
         name,
         max_tokens,
+        None,
         stop,
         stop_token_ids,
         temperature,