sglang 0.3.1.post3__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +23 -1
- sglang/bench_latency.py +48 -33
- sglang/bench_server_latency.py +0 -6
- sglang/bench_serving.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +14 -1
- sglang/lang/interpreter.py +16 -6
- sglang/lang/ir.py +20 -4
- sglang/srt/configs/model_config.py +11 -9
- sglang/srt/constrained/fsm_cache.py +9 -1
- sglang/srt/constrained/jump_forward.py +15 -2
- sglang/srt/hf_transformers_utils.py +1 -0
- sglang/srt/layers/activation.py +4 -4
- sglang/srt/layers/attention/__init__.py +49 -0
- sglang/srt/layers/attention/flashinfer_backend.py +277 -0
- sglang/srt/layers/{flashinfer_utils.py → attention/flashinfer_utils.py} +82 -80
- sglang/srt/layers/attention/triton_backend.py +161 -0
- sglang/srt/layers/{triton_attention → attention/triton_ops}/extend_attention.py +3 -1
- sglang/srt/layers/fused_moe/patch.py +117 -0
- sglang/srt/layers/layernorm.py +4 -4
- sglang/srt/layers/logits_processor.py +19 -15
- sglang/srt/layers/pooler.py +3 -3
- sglang/srt/layers/quantization/__init__.py +0 -2
- sglang/srt/layers/radix_attention.py +6 -4
- sglang/srt/layers/sampler.py +6 -4
- sglang/srt/layers/torchao_utils.py +18 -0
- sglang/srt/lora/lora.py +20 -21
- sglang/srt/lora/lora_manager.py +97 -25
- sglang/srt/managers/detokenizer_manager.py +31 -18
- sglang/srt/managers/image_processor.py +187 -0
- sglang/srt/managers/io_struct.py +99 -75
- sglang/srt/managers/schedule_batch.py +187 -68
- sglang/srt/managers/{policy_scheduler.py → schedule_policy.py} +31 -21
- sglang/srt/managers/scheduler.py +1021 -0
- sglang/srt/managers/tokenizer_manager.py +120 -247
- sglang/srt/managers/tp_worker.py +28 -925
- sglang/srt/mem_cache/memory_pool.py +34 -52
- sglang/srt/mem_cache/radix_cache.py +5 -5
- sglang/srt/model_executor/cuda_graph_runner.py +25 -25
- sglang/srt/model_executor/forward_batch_info.py +94 -97
- sglang/srt/model_executor/model_runner.py +76 -78
- sglang/srt/models/baichuan.py +10 -10
- sglang/srt/models/chatglm.py +12 -12
- sglang/srt/models/commandr.py +10 -10
- sglang/srt/models/dbrx.py +12 -12
- sglang/srt/models/deepseek.py +10 -10
- sglang/srt/models/deepseek_v2.py +14 -15
- sglang/srt/models/exaone.py +10 -10
- sglang/srt/models/gemma.py +10 -10
- sglang/srt/models/gemma2.py +11 -11
- sglang/srt/models/gpt_bigcode.py +10 -10
- sglang/srt/models/grok.py +10 -10
- sglang/srt/models/internlm2.py +10 -10
- sglang/srt/models/llama.py +22 -10
- sglang/srt/models/llama_classification.py +5 -5
- sglang/srt/models/llama_embedding.py +4 -4
- sglang/srt/models/llama_reward.py +142 -0
- sglang/srt/models/llava.py +39 -33
- sglang/srt/models/llavavid.py +31 -28
- sglang/srt/models/minicpm.py +10 -10
- sglang/srt/models/minicpm3.py +14 -15
- sglang/srt/models/mixtral.py +10 -10
- sglang/srt/models/mixtral_quant.py +10 -10
- sglang/srt/models/olmoe.py +10 -10
- sglang/srt/models/qwen.py +10 -10
- sglang/srt/models/qwen2.py +11 -11
- sglang/srt/models/qwen2_moe.py +10 -10
- sglang/srt/models/stablelm.py +10 -10
- sglang/srt/models/torch_native_llama.py +506 -0
- sglang/srt/models/xverse.py +10 -10
- sglang/srt/models/xverse_moe.py +10 -10
- sglang/srt/openai_api/adapter.py +7 -0
- sglang/srt/sampling/sampling_batch_info.py +36 -27
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +170 -119
- sglang/srt/server_args.py +54 -27
- sglang/srt/utils.py +101 -128
- sglang/test/runners.py +76 -33
- sglang/test/test_programs.py +38 -5
- sglang/test/test_utils.py +53 -9
- sglang/version.py +1 -1
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/METADATA +42 -23
- sglang-0.3.3.dist-info/RECORD +139 -0
- sglang/srt/layers/attention_backend.py +0 -482
- sglang/srt/managers/controller_multi.py +0 -207
- sglang/srt/managers/controller_single.py +0 -164
- sglang-0.3.1.post3.dist-info/RECORD +0 -134
- /sglang/srt/layers/{triton_attention → attention/triton_ops}/decode_attention.py +0 -0
- /sglang/srt/layers/{triton_attention → attention/triton_ops}/prefill_attention.py +0 -0
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/LICENSE +0 -0
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/WHEEL +0 -0
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/top_level.txt +0 -0
sglang/test/test_programs.py
CHANGED
@@ -72,7 +72,7 @@ def test_select(check_answer):
|
|
72
72
|
statement="The capital of Germany is Berlin.",
|
73
73
|
)
|
74
74
|
if check_answer:
|
75
|
-
assert ret["answer"] == "True", ret.text
|
75
|
+
assert ret["answer"] == "True", ret.text()
|
76
76
|
else:
|
77
77
|
assert ret["answer"] in ["True", "False", "Unknown"]
|
78
78
|
|
@@ -80,7 +80,7 @@ def test_select(check_answer):
|
|
80
80
|
statement="The capital of Canada is Tokyo.",
|
81
81
|
)
|
82
82
|
if check_answer:
|
83
|
-
assert ret["answer"] == "False", ret.text
|
83
|
+
assert ret["answer"] == "False", ret.text()
|
84
84
|
else:
|
85
85
|
assert ret["answer"] in ["True", "False", "Unknown"]
|
86
86
|
|
@@ -88,7 +88,7 @@ def test_select(check_answer):
|
|
88
88
|
statement="Purple is a better color than green.",
|
89
89
|
)
|
90
90
|
if check_answer:
|
91
|
-
assert ret["answer"] == "Unknown", ret.text
|
91
|
+
assert ret["answer"] == "Unknown", ret.text()
|
92
92
|
else:
|
93
93
|
assert ret["answer"] in ["True", "False", "Unknown"]
|
94
94
|
|
@@ -100,8 +100,8 @@ def test_decode_int():
|
|
100
100
|
s += "The number of days in a year is " + sgl.gen_int("days") + "\n"
|
101
101
|
|
102
102
|
ret = decode_int.run(temperature=0.1)
|
103
|
-
assert int(ret["hours"]) == 24, ret.text
|
104
|
-
assert int(ret["days"]) == 365, ret.text
|
103
|
+
assert int(ret["hours"]) == 24, ret.text()
|
104
|
+
assert int(ret["days"]) == 365, ret.text()
|
105
105
|
|
106
106
|
|
107
107
|
def test_decode_json_regex():
|
@@ -517,3 +517,36 @@ def test_hellaswag_select():
|
|
517
517
|
accuracy = np.mean(np.array(preds) == np.array(labels))
|
518
518
|
|
519
519
|
return accuracy, latency
|
520
|
+
|
521
|
+
|
522
|
+
def test_gen_min_new_tokens():
|
523
|
+
"""
|
524
|
+
Validate sgl.gen(min_tokens) functionality.
|
525
|
+
|
526
|
+
The test asks a question where, without a min_tokens constraint, the generated answer is expected to be short.
|
527
|
+
By enforcing the min_tokens parameter, we ensure the generated answer has at least the specified number of tokens.
|
528
|
+
We verify that the number of tokens in the answer is >= the min_tokens threshold.
|
529
|
+
"""
|
530
|
+
import sglang as sgl
|
531
|
+
from sglang.srt.hf_transformers_utils import get_tokenizer
|
532
|
+
|
533
|
+
model_path = sgl.global_config.default_backend.endpoint.get_model_name()
|
534
|
+
MIN_TOKENS, MAX_TOKENS = 64, 128
|
535
|
+
|
536
|
+
@sgl.function
|
537
|
+
def convo_1(s):
|
538
|
+
s += sgl.user("What is the capital of the United States?")
|
539
|
+
s += sgl.assistant(
|
540
|
+
sgl.gen("answer", min_tokens=MIN_TOKENS, max_tokens=MAX_TOKENS)
|
541
|
+
)
|
542
|
+
|
543
|
+
def assert_min_tokens(tokenizer, text):
|
544
|
+
token_ids = tokenizer.encode(text)
|
545
|
+
assert (
|
546
|
+
len(token_ids) >= MIN_TOKENS
|
547
|
+
), f"Generated {len(token_ids)} tokens, min required: {MIN_TOKENS}. Text: {text}"
|
548
|
+
|
549
|
+
tokenizer = get_tokenizer(model_path)
|
550
|
+
|
551
|
+
state = convo_1.run()
|
552
|
+
assert_min_tokens(tokenizer, state["answer"])
|
sglang/test/test_utils.py
CHANGED
@@ -23,12 +23,13 @@ from sglang.srt.utils import kill_child_process
|
|
23
23
|
from sglang.utils import get_exception_traceback
|
24
24
|
|
25
25
|
DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
|
26
|
-
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/
|
26
|
+
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
|
27
27
|
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
28
28
|
DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
|
29
|
+
DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
|
29
30
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
|
30
|
-
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/
|
31
|
-
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/
|
31
|
+
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
|
32
|
+
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
|
32
33
|
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
|
33
34
|
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
|
34
35
|
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
|
@@ -84,7 +85,7 @@ def call_generate_vllm(prompt, temperature, max_tokens, stop=None, n=1, url=None
|
|
84
85
|
|
85
86
|
|
86
87
|
def call_generate_outlines(
|
87
|
-
prompt, temperature, max_tokens, stop=
|
88
|
+
prompt, temperature, max_tokens, stop=None, regex=None, n=1, url=None
|
88
89
|
):
|
89
90
|
assert url is not None
|
90
91
|
|
@@ -513,7 +514,16 @@ def get_similarities(vec1, vec2):
|
|
513
514
|
return F.cosine_similarity(torch.tensor(vec1), torch.tensor(vec2), dim=0)
|
514
515
|
|
515
516
|
|
516
|
-
def run_bench_serving(
|
517
|
+
def run_bench_serving(
|
518
|
+
model,
|
519
|
+
num_prompts,
|
520
|
+
request_rate,
|
521
|
+
other_server_args,
|
522
|
+
dataset_name="random",
|
523
|
+
random_input_len=4096,
|
524
|
+
random_output_len=2048,
|
525
|
+
disable_stream=False,
|
526
|
+
):
|
517
527
|
# Launch the server
|
518
528
|
base_url = DEFAULT_URL_FOR_TEST
|
519
529
|
process = popen_launch_server(
|
@@ -529,21 +539,21 @@ def run_bench_serving(model, num_prompts, request_rate, other_server_args):
|
|
529
539
|
base_url=base_url,
|
530
540
|
host=None,
|
531
541
|
port=None,
|
532
|
-
dataset_name=
|
542
|
+
dataset_name=dataset_name,
|
533
543
|
dataset_path="",
|
534
544
|
model=None,
|
535
545
|
tokenizer=None,
|
536
546
|
num_prompts=num_prompts,
|
537
547
|
sharegpt_output_len=None,
|
538
|
-
random_input_len=
|
539
|
-
random_output_len=
|
548
|
+
random_input_len=random_input_len,
|
549
|
+
random_output_len=random_output_len,
|
540
550
|
random_range_ratio=0.0,
|
541
551
|
request_rate=request_rate,
|
542
552
|
multi=None,
|
543
553
|
seed=0,
|
544
554
|
output_file=None,
|
545
555
|
disable_tqdm=False,
|
546
|
-
disable_stream=
|
556
|
+
disable_stream=disable_stream,
|
547
557
|
disable_ignore_eos=False,
|
548
558
|
extra_request_body=None,
|
549
559
|
)
|
@@ -587,3 +597,37 @@ def run_bench_latency(model, other_args):
|
|
587
597
|
kill_child_process(process.pid)
|
588
598
|
|
589
599
|
return output_throughput
|
600
|
+
|
601
|
+
|
602
|
+
def lcs(X, Y):
|
603
|
+
m = len(X)
|
604
|
+
n = len(Y)
|
605
|
+
L = [[0] * (n + 1) for _ in range(m + 1)]
|
606
|
+
|
607
|
+
for i in range(m + 1):
|
608
|
+
for j in range(n + 1):
|
609
|
+
if i == 0 or j == 0:
|
610
|
+
L[i][j] = 0
|
611
|
+
elif X[i - 1] == Y[j - 1]:
|
612
|
+
L[i][j] = L[i - 1][j - 1] + 1
|
613
|
+
else:
|
614
|
+
L[i][j] = max(L[i - 1][j], L[i][j - 1])
|
615
|
+
|
616
|
+
return L[m][n]
|
617
|
+
|
618
|
+
|
619
|
+
def calculate_rouge_l(output_strs_list1, output_strs_list2):
|
620
|
+
"""calculate the ROUGE-L score"""
|
621
|
+
rouge_l_scores = []
|
622
|
+
|
623
|
+
for s1, s2 in zip(output_strs_list1, output_strs_list2):
|
624
|
+
lcs_len = lcs(s1, s2)
|
625
|
+
precision = lcs_len / len(s1) if len(s1) > 0 else 0
|
626
|
+
recall = lcs_len / len(s2) if len(s2) > 0 else 0
|
627
|
+
if precision + recall > 0:
|
628
|
+
fmeasure = (2 * precision * recall) / (precision + recall)
|
629
|
+
else:
|
630
|
+
fmeasure = 0.0
|
631
|
+
rouge_l_scores.append(fmeasure)
|
632
|
+
|
633
|
+
return rouge_l_scores
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.3.
|
1
|
+
__version__ = "0.3.3"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.3.
|
3
|
+
Version: 0.3.3
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -248,6 +248,7 @@ Requires-Dist: uvloop; extra == "srt"
|
|
248
248
|
Requires-Dist: zmq; extra == "srt"
|
249
249
|
Requires-Dist: vllm==0.5.5; extra == "srt"
|
250
250
|
Requires-Dist: outlines>=0.0.44; extra == "srt"
|
251
|
+
Requires-Dist: modelscope; extra == "srt"
|
251
252
|
Provides-Extra: test
|
252
253
|
Requires-Dist: jsonlines; extra == "test"
|
253
254
|
Requires-Dist: matplotlib; extra == "test"
|
@@ -269,16 +270,11 @@ Requires-Dist: peft; extra == "test"
|
|
269
270
|
|
270
271
|
--------------------------------------------------------------------------------
|
271
272
|
|
272
|
-
| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/
|
273
|
+
| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
|
273
274
|
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
|
279
|
-
- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
|
280
|
-
- **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
|
281
|
-
- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
|
275
|
+
## Upcoming Events
|
276
|
+
- [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
|
277
|
+
- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
|
282
278
|
|
283
279
|
## News
|
284
280
|
- [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
|
@@ -294,6 +290,16 @@ The core features include:
|
|
294
290
|
|
295
291
|
</details>
|
296
292
|
|
293
|
+
## About
|
294
|
+
SGLang is a fast serving framework for large language models and vision language models.
|
295
|
+
It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
|
296
|
+
The core features include:
|
297
|
+
|
298
|
+
- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
|
299
|
+
- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
|
300
|
+
- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
|
301
|
+
- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
|
302
|
+
|
297
303
|
## Contents
|
298
304
|
- [Install](#install)
|
299
305
|
- [Backend: SGLang Runtime (SRT)](#backend-sglang-runtime-srt)
|
@@ -318,7 +324,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
|
|
318
324
|
### Method 2: From source
|
319
325
|
```
|
320
326
|
# Use the last release branch
|
321
|
-
git clone -b v0.3.
|
327
|
+
git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
|
322
328
|
cd sglang
|
323
329
|
|
324
330
|
pip install --upgrade pip
|
@@ -339,7 +345,7 @@ docker run --gpus all \
|
|
339
345
|
--env "HF_TOKEN=<secret>" \
|
340
346
|
--ipc=host \
|
341
347
|
lmsysorg/sglang:latest \
|
342
|
-
python3 -m sglang.launch_server --model-path meta-llama/
|
348
|
+
python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
|
343
349
|
```
|
344
350
|
|
345
351
|
### Method 4: Using docker compose
|
@@ -348,9 +354,9 @@ docker run --gpus all \
|
|
348
354
|
<summary>More</summary>
|
349
355
|
|
350
356
|
> This method is recommended if you plan to serve it as a service.
|
351
|
-
> A better approach is to use the [k8s-sglang-service.yaml](
|
357
|
+
> A better approach is to use the [k8s-sglang-service.yaml](docker/k8s-sglang-service.yaml).
|
352
358
|
|
353
|
-
1. Copy the [compose.yml](
|
359
|
+
1. Copy the [compose.yml](docker/compose.yaml) to your local machine
|
354
360
|
2. Execute the command `docker compose up -d` in your terminal.
|
355
361
|
</details>
|
356
362
|
|
@@ -379,7 +385,7 @@ resources:
|
|
379
385
|
run: |
|
380
386
|
conda deactivate
|
381
387
|
python3 -m sglang.launch_server \
|
382
|
-
--model-path meta-llama/
|
388
|
+
--model-path meta-llama/Llama-3.1-8B-Instruct \
|
383
389
|
--host 0.0.0.0 \
|
384
390
|
--port 30000
|
385
391
|
```
|
@@ -421,7 +427,8 @@ curl http://localhost:30000/generate \
|
|
421
427
|
}
|
422
428
|
}'
|
423
429
|
```
|
424
|
-
|
430
|
+
|
431
|
+
Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).
|
425
432
|
|
426
433
|
### OpenAI Compatible API
|
427
434
|
In addition, the server supports OpenAI-compatible APIs.
|
@@ -460,7 +467,7 @@ response = client.embeddings.create(
|
|
460
467
|
print(response)
|
461
468
|
```
|
462
469
|
|
463
|
-
It supports streaming, vision, and
|
470
|
+
It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
|
464
471
|
|
465
472
|
### Additional Server Arguments
|
466
473
|
- To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
|
@@ -481,10 +488,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
481
488
|
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
|
482
489
|
```
|
483
490
|
- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
|
491
|
+
- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
|
484
492
|
- To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
|
485
493
|
- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
|
486
494
|
- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
|
487
|
-
- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
|
495
|
+
- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph`
|
488
496
|
```
|
489
497
|
# Node 0
|
490
498
|
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
|
@@ -499,9 +507,9 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
499
507
|
- Llama / Llama 2 / Llama 3 / Llama 3.1
|
500
508
|
- Mistral / Mixtral / Mistral NeMo
|
501
509
|
- Gemma / Gemma 2
|
502
|
-
- OLMoE
|
503
510
|
- Qwen / Qwen 2 / Qwen 2 MoE
|
504
511
|
- DeepSeek / DeepSeek 2
|
512
|
+
- OLMoE
|
505
513
|
- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
|
506
514
|
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
|
507
515
|
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
|
@@ -521,7 +529,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
521
529
|
- BaiChuan2
|
522
530
|
- MiniCPM / MiniCPM 3
|
523
531
|
- XVERSE / XVERSE MoE
|
524
|
-
|
532
|
+
- SmolLM
|
525
533
|
|
526
534
|
**Embedding Models**
|
527
535
|
|
@@ -529,7 +537,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
529
537
|
- gte-Qwen2
|
530
538
|
- `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
|
531
539
|
|
532
|
-
Instructions for supporting a new model are [here](
|
540
|
+
Instructions for supporting a new model are [here](docs/en/model_support.md).
|
533
541
|
|
534
542
|
#### Use Models From ModelScope
|
535
543
|
<details>
|
@@ -543,6 +551,17 @@ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instru
|
|
543
551
|
```
|
544
552
|
SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
|
545
553
|
```
|
554
|
+
|
555
|
+
Or start it by docker.
|
556
|
+
```bash
|
557
|
+
docker run --gpus all \
|
558
|
+
-p 30000:30000 \
|
559
|
+
-v ~/.cache/modelscope:/root/.cache/modelscope \
|
560
|
+
--env "SGLANG_USE_MODELSCOPE=true" \
|
561
|
+
--ipc=host \
|
562
|
+
lmsysorg/sglang:latest \
|
563
|
+
python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
|
564
|
+
```
|
546
565
|
|
547
566
|
</details>
|
548
567
|
|
@@ -581,7 +600,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
|
|
581
600
|
The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.
|
582
601
|
|
583
602
|
### Quick Start
|
584
|
-
The example below shows how to use sglang to answer a
|
603
|
+
The example below shows how to use sglang to answer a multi-turn question.
|
585
604
|
|
586
605
|
#### Using Local Models
|
587
606
|
First, launch a server with
|
@@ -824,7 +843,7 @@ def chat_example(s):
|
|
824
843
|
Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
|
825
844
|
|
826
845
|
## Roadmap
|
827
|
-
[Development Roadmap (2024
|
846
|
+
[Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
|
828
847
|
|
829
848
|
## Citation And Acknowledgment
|
830
849
|
Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
|
@@ -0,0 +1,139 @@
|
|
1
|
+
sglang/__init__.py,sha256=b_pqO9bR2fjK9En_tigfzKTiQzE8b_hUizY0DAKVk1M,1616
|
2
|
+
sglang/api.py,sha256=5x591S4rLbmNPs75qPwGKVu1sonVGDyjPAJlHTyWw50,6956
|
3
|
+
sglang/bench_latency.py,sha256=NkaL4YFWqDnochwaLd8o2pyZGqu6TeURbFB3TGyZHr4,17893
|
4
|
+
sglang/bench_server_latency.py,sha256=rRSDqjJ5jan9AzppOGx75KRUjZCU2dUG2h06CQOdJgk,5377
|
5
|
+
sglang/bench_serving.py,sha256=1AQzkQ8ci9-rMZEM7wap8I09oPP4AZd93RfXMQRgVro,36386
|
6
|
+
sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
|
7
|
+
sglang/global_config.py,sha256=38id86i3tRGCSOFZlN1LM01a3xt-V98xuNgKGG9boCk,1058
|
8
|
+
sglang/launch_server.py,sha256=UnjNjYuZ8TtvmRtgYEsFImkbvCwvn_tQjk0V7cHy67E,450
|
9
|
+
sglang/launch_server_llavavid.py,sha256=olPKyhozi1coCwoRMwBRYWsTFByrgus9CwPSeNmskgc,1002
|
10
|
+
sglang/utils.py,sha256=NA_4xUrTI7KICQ3PEACfNWKE3nxSA5QvQZJNd4TQrDc,9395
|
11
|
+
sglang/version.py,sha256=8KcCYTXH99C2-gCLuPILJvtT9YftRWJsartIx6TQ2ZY,22
|
12
|
+
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
|
+
sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
|
14
|
+
sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
|
15
|
+
sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
|
16
|
+
sglang/lang/interpreter.py,sha256=zakc6IkzATaMqVDWKWvqDRrqnRykxFawajA7aUHUDbI,30640
|
17
|
+
sglang/lang/ir.py,sha256=F_9ac10OjktxR7KhOV07wiJXV20s79cRfh9d4koExJc,18262
|
18
|
+
sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
|
19
|
+
sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
|
+
sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
|
21
|
+
sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
|
22
|
+
sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
|
23
|
+
sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
|
24
|
+
sglang/lang/backend/runtime_endpoint.py,sha256=iVb7SlrpJ1ic92QG5kQUphZUb2EaVWY43dkmAO5pju4,10514
|
25
|
+
sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
|
26
|
+
sglang/srt/conversation.py,sha256=S5w5V6G1xigNxa3UQoSxRcMpQLWWDT9EPBoHBvHkSAk,19663
|
27
|
+
sglang/srt/hf_transformers_utils.py,sha256=rt6flb6BoYTO8fw7AKCXmQLJx5XuSUuRmZX-VJHmuLQ,6064
|
28
|
+
sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
|
29
|
+
sglang/srt/server.py,sha256=SKV6IxR8w0AmuwgHSEOfag_t-f6hAEq9Xg49iBioi2U,22224
|
30
|
+
sglang/srt/server_args.py,sha256=LI8ehxs0sfI0EDhON-OhNGbDx0-oo9QhfnpYjYwnH54,24405
|
31
|
+
sglang/srt/utils.py,sha256=amDWXIu1syU-kvdV8bUkNfYaMfpcN22BKZm_2xp59jI,22202
|
32
|
+
sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
|
33
|
+
sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
|
34
|
+
sglang/srt/configs/model_config.py,sha256=36My-o44trhWY3KYDeSFMGvv9XuUtIVI5e7F8VlOTWo,6723
|
35
|
+
sglang/srt/constrained/__init__.py,sha256=ze8awDPvwAzdeMwzJ-25kXOQ4nVWoaP55jBDt5UOS_4,2070
|
36
|
+
sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
|
37
|
+
sglang/srt/constrained/fsm_cache.py,sha256=9GtliIN55Ov8Q9MSFfQC5rKrz3qTsB7Cm5OkhivKngY,3271
|
38
|
+
sglang/srt/constrained/jump_forward.py,sha256=o-CzJu3DEs0eFKlLzsQVYMSo4vBKpffs25sXLOJd6jc,6997
|
39
|
+
sglang/srt/layers/activation.py,sha256=7VEkCrx2dvl629Lz0fkJcJfVoZA-ykEdkpTzKEc_drQ,5225
|
40
|
+
sglang/srt/layers/layernorm.py,sha256=HCj8Y_X6MNNdtQU2sWKgyjIqVERxl9dqrmjbBbyJjpE,3796
|
41
|
+
sglang/srt/layers/linear.py,sha256=9rjCiSb_QOn5RgpVjIhEKdReRvSYVfcTSjbWBEbApLI,45173
|
42
|
+
sglang/srt/layers/logits_processor.py,sha256=Fq7VHwjP4iSzl_OBLo8qw_HVbIDbYB-0MGmfiD3Jk_E,12521
|
43
|
+
sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
|
44
|
+
sglang/srt/layers/radix_attention.py,sha256=i07VRXPDHj-zJ1TSrXEqCxumQwYSHwAvc8DoIg-Irtg,1964
|
45
|
+
sglang/srt/layers/sampler.py,sha256=J5vd0CcLpLfgtLniCoe2VF6hjM_ld76hbDG4p1qoAMc,4010
|
46
|
+
sglang/srt/layers/torchao_utils.py,sha256=1nzZkSzbF4qCAMeBKAeeDpMl_mK8imiY2RL3xFEgvAw,3340
|
47
|
+
sglang/srt/layers/attention/__init__.py,sha256=zLLwinbYLAQHfVEz0jZiVa_cYNgSYoy4wYD_0y-ErHQ,1798
|
48
|
+
sglang/srt/layers/attention/flashinfer_backend.py,sha256=DOvm-d3XLjE6XJDD3a8aCnlpuAJZZ946YFDH_Ec4lqc,10150
|
49
|
+
sglang/srt/layers/attention/flashinfer_utils.py,sha256=9YMt7ab6F0gEVkxdVm8vDB0LVBRYRL0XIKVrmndp4n8,7571
|
50
|
+
sglang/srt/layers/attention/triton_backend.py,sha256=I_kw0LXdgziHAFC8Qv5n5PDFJRLvZyzVsXwjmFZ0KSc,6041
|
51
|
+
sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=XCQTX0kUttT1AG5FRMgfQbiXgvoempYD0UR2r6D_vJg,16711
|
52
|
+
sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=oyqon1KG5-ICHcCANAbrglXLYKvWHFML-4tIQI9M5VI,11063
|
53
|
+
sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=QkXPcT02c13zha2M4mBm2S5dh_sS-Gc4FkkrcywRqvc,5377
|
54
|
+
sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
|
55
|
+
sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
|
56
|
+
sglang/srt/layers/fused_moe/layer.py,sha256=raFyvPzjYz-Fv8B3IcOxQYKKCWqXis5mXwg1GFE61y4,22243
|
57
|
+
sglang/srt/layers/fused_moe/patch.py,sha256=B9cDtHqHfnWE0QqZAffvUi6cVRKcMBMKDGJWGIaKh3U,3898
|
58
|
+
sglang/srt/layers/quantization/__init__.py,sha256=QilMNqgu3eOFUkEjXLSDa1NvoNdi_CAvC8a1hprOgN8,2979
|
59
|
+
sglang/srt/layers/quantization/base_config.py,sha256=vlpSPvSrFmUe65ETg4SoPocQ9bVNY6As3QuHdr_3Dr4,4023
|
60
|
+
sglang/srt/lora/lora.py,sha256=a5j_Yy0s95msVPFgOuH5PCe7sMu0AyZFQ5wL0H-YIg8,14913
|
61
|
+
sglang/srt/lora/lora_config.py,sha256=paVB7F7SIuxr_vodvKf8zzAlH2fdVYHhXxcXV62D0Vo,1411
|
62
|
+
sglang/srt/lora/lora_manager.py,sha256=gzBwYXZEPYj56PkGTshTbWRfl_370wb6uTcRhDaLiF8,12801
|
63
|
+
sglang/srt/managers/detokenizer_manager.py,sha256=iCLPdHkL6lAp_-Qew1u4Tyt3jYRkJ8i-Bj3l8TC-uaA,7278
|
64
|
+
sglang/srt/managers/image_processor.py,sha256=9Y9RqyLdbt4uOK7pnJCJIhY77791klskSrEg8U6pyS4,6910
|
65
|
+
sglang/srt/managers/io_struct.py,sha256=rPyQk5y-jJu4eyoqUVh4M8B14PifjkE8B3K5yI0NX24,12185
|
66
|
+
sglang/srt/managers/schedule_batch.py,sha256=mqdMg1QB6PNLbBjxkXoP_Ld82R1w34g_13YH82DGMh8,31216
|
67
|
+
sglang/srt/managers/schedule_policy.py,sha256=PiTKvsAFwoNWNsv_SFkghIHCL452MdboRc2cmN6ITcU,11935
|
68
|
+
sglang/srt/managers/scheduler.py,sha256=N9GQnp2SXd8-uN49KmQO-144N27M6h3dxRZuFZ-9AmY,39132
|
69
|
+
sglang/srt/managers/tokenizer_manager.py,sha256=BAvLW_cRtIgjL0_cwrvDAb7g740fgEddyqaT3JtofR4,24548
|
70
|
+
sglang/srt/managers/tp_worker.py,sha256=fcaW-u7AAX49kQCNn_AEtdRPykRdT6Z6lx1O9LHA15E,4833
|
71
|
+
sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
|
72
|
+
sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
|
73
|
+
sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
|
74
|
+
sglang/srt/mem_cache/memory_pool.py,sha256=L-5drUt7vlyvple4OcjH1jJRzt2qhVrpc9klZn-bQfE,7125
|
75
|
+
sglang/srt/mem_cache/radix_cache.py,sha256=00bghOihUm7lA1i4gxxMYQLept9LaHg2ZSXZryuFZZI,10121
|
76
|
+
sglang/srt/model_executor/cuda_graph_runner.py,sha256=iheZYErwFT_W4kJUE1dgbGoQQx7hyOSKa-Yv8guq0DI,10479
|
77
|
+
sglang/srt/model_executor/forward_batch_info.py,sha256=FIQ8XIIP724mIL2l7w7mSEFH452qw-TPpqm43J4YeHM,5822
|
78
|
+
sglang/srt/model_executor/model_runner.py,sha256=KyglHFIMb5TC-NszN2D85_k7oVQLhbwhUYa7u3RFkoc,22874
|
79
|
+
sglang/srt/models/baichuan.py,sha256=50m43kIVo-YamHFwxyiLGG_pCbF7mzUJfhEyuuSmVC8,15100
|
80
|
+
sglang/srt/models/chatglm.py,sha256=XaS_6-ZvRw7X-56sk9xQogqT0NzGEMVpiAdQnC5qbBY,13333
|
81
|
+
sglang/srt/models/commandr.py,sha256=2urK7u2FiwPBl60hMmt-wfaJ8V-ilv6l1B37MUlvSxk,14121
|
82
|
+
sglang/srt/models/dbrx.py,sha256=qTpyA1Iv56VI-ksPKt4JryX2Pn7T5FXAa0n0ZoT4qbw,14615
|
83
|
+
sglang/srt/models/deepseek.py,sha256=4sl4YYoxqe-vif7KJKcMjMA3KgvzYHqpQBgM58lzLHc,15973
|
84
|
+
sglang/srt/models/deepseek_v2.py,sha256=dt0FGAgW3jd7OJJnKfH-LIU13U0I9b7R9shYmAEins4,28390
|
85
|
+
sglang/srt/models/exaone.py,sha256=9JfFhYbpcHMXIaBNn8rc_GOlkItkIgbGNslNyFD7gvU,13054
|
86
|
+
sglang/srt/models/gemma.py,sha256=gui46inEJsrmppEMTUIQuzMxGPEBx_TjiZ5-PacjuSk,12240
|
87
|
+
sglang/srt/models/gemma2.py,sha256=V0GjEdTqxyXvBqjgyiyONipohjOqw0pLITmZZRb2kIE,14890
|
88
|
+
sglang/srt/models/gpt_bigcode.py,sha256=LgSm-8oxBfnzMAC4Jqqg-RJGge4E_wgJ1br7ylbTPZ0,10162
|
89
|
+
sglang/srt/models/grok.py,sha256=lUR_SmD_KhIiZx5OVUPZp8VVdrAga6WWTdMKJ5PCFbw,14896
|
90
|
+
sglang/srt/models/internlm2.py,sha256=4SUaeJl2dZlUowahfv7kLbz3jLXtmvdBPGURmhAeX6Q,12169
|
91
|
+
sglang/srt/models/llama.py,sha256=5j66LmvFhOKgFZiE75mJ80XBjZ2dNx7e8Yea5lsD0P0,15828
|
92
|
+
sglang/srt/models/llama_classification.py,sha256=Yhabu9FuBxjNo74crMsK0FqpD53ehOx_zcHgIXjvlvQ,3379
|
93
|
+
sglang/srt/models/llama_embedding.py,sha256=4j3WNLB-x7XQnJvohdRs7VSSEabbhiE2BRHmnG5IZRU,3453
|
94
|
+
sglang/srt/models/llama_reward.py,sha256=qQOPfn-9oqhsD0EaffXtk-EXKRdSZL1X7CYAGCDoG9A,5383
|
95
|
+
sglang/srt/models/llava.py,sha256=zbJs1P4_Bjh2_dSbyoheJZ1wGXuKHGz6BpV766G7ZUY,25094
|
96
|
+
sglang/srt/models/llavavid.py,sha256=qhBGHTxzGAOMgqMiwOc3mUbaK6qeXsEYSlNmlEEIdeM,12198
|
97
|
+
sglang/srt/models/minicpm.py,sha256=5vc-Lq7ggHrRxxkciVMdZ5Vq6ThLwnhFS62UCokFC2g,13792
|
98
|
+
sglang/srt/models/minicpm3.py,sha256=hhhgZTKQApUZpH_MYQZTk3K1Ox-xpJRxGCemoUw8x4U,25184
|
99
|
+
sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
|
100
|
+
sglang/srt/models/mixtral.py,sha256=BonqX_rSB_UuBDQe3uy8-NOxB4Q4s2mTxTQItvFB9ZQ,13864
|
101
|
+
sglang/srt/models/mixtral_quant.py,sha256=SAHBIiD5O1TnojCpqTLcPy3TEvfSCKeOe3GC47fdFSg,14039
|
102
|
+
sglang/srt/models/olmoe.py,sha256=ghhNpZe4SzaZEpw0APYBbAmLb3LBagRC2N724RkOkH4,15312
|
103
|
+
sglang/srt/models/qwen.py,sha256=IrOKHS7b4SL2fnJegq811eeHnAQDya2PujIgKQ9URVY,9921
|
104
|
+
sglang/srt/models/qwen2.py,sha256=B7hXnW5uYPmpMgSN7tI3tTvMEmmQLpddsw_iNTiaHJI,12398
|
105
|
+
sglang/srt/models/qwen2_moe.py,sha256=MK-9W6FJhXoQYayg_jpXjKKq4n5j3s2b2ZaoCBfVJ2I,17120
|
106
|
+
sglang/srt/models/stablelm.py,sha256=ldtlRG1XGdYcjwqb48dpMTfbdh8KHUjcWrrUYNJ0MEk,11326
|
107
|
+
sglang/srt/models/torch_native_llama.py,sha256=c5GJ_k9zbSOk0PjLCXAK8YebGEy0RUVYZ9_h6_19A3M,19215
|
108
|
+
sglang/srt/models/xverse.py,sha256=i11wEKqqVCoVtH7yo9jfpNyGHxhw7NvTPid3ojmg79s,13634
|
109
|
+
sglang/srt/models/xverse_moe.py,sha256=JwkBhsyusP7e_hAMnomkP8cEmKNCLJPRtwaTERQ0D0M,15818
|
110
|
+
sglang/srt/models/yivl.py,sha256=N3noJ5M-FiZS-E_zfaJs4prQOu_ineRt11MWloYgOR8,4826
|
111
|
+
sglang/srt/openai_api/adapter.py,sha256=ULX1lo23r6semogKcbUOXGSgPJi8NJ7IuC0WVvEbVbs,51458
|
112
|
+
sglang/srt/openai_api/protocol.py,sha256=rdSwUAoO5-KLemJOE50xwSUagxY4T1QIiNyCYsTtCi0,9868
|
113
|
+
sglang/srt/sampling/sampling_batch_info.py,sha256=mtE_kLC6U-X6Q20BVjPWyDOoGc4kcTdIPpcsNeZcRYo,6462
|
114
|
+
sglang/srt/sampling/sampling_params.py,sha256=Xwh4_M6PP4SWyGV-zNyIhp4XbRKbeU4251ao8UOlZlI,5704
|
115
|
+
sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
|
116
|
+
sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
|
117
|
+
sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq_ji-0Zhcz_r5mUa3T3GaIydVS6K4FhWfE,2557
|
118
|
+
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
|
119
|
+
sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
|
120
|
+
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
|
121
|
+
sglang/test/few_shot_gsm8k.py,sha256=To7Sdg-DLF8poIQLwiOBYKbkz-1C_gn6H79vIbyPR-o,3860
|
122
|
+
sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
|
123
|
+
sglang/test/runners.py,sha256=VCmtH08FsAq_JTAKfKo0zB4o-osNMAxxwe4aKcSxr4c,13515
|
124
|
+
sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
|
125
|
+
sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
|
126
|
+
sglang/test/simple_eval_humaneval.py,sha256=7lTi841NT58smNOtRwCedrdX9IWWypdLkOtaQOBy-GI,5687
|
127
|
+
sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWUpk,2550
|
128
|
+
sglang/test/simple_eval_mgsm.py,sha256=wfbqJW9Rkc66vzq2fEMF6jchmoA8mw1OUiGU55cZ2B0,10261
|
129
|
+
sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
|
130
|
+
sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
|
131
|
+
sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
|
132
|
+
sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
|
133
|
+
sglang/test/test_utils.py,sha256=NkJuezjmonjgC3_i_CTBd8KSqWh6W9CLcgoaqvTNK2U,18684
|
134
|
+
sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
|
135
|
+
sglang-0.3.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
136
|
+
sglang-0.3.3.dist-info/METADATA,sha256=zeY2pmiGPJb52zaHqiRHY4OcZqAHPvG_zPyve5KfANc,39063
|
137
|
+
sglang-0.3.3.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
138
|
+
sglang-0.3.3.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
139
|
+
sglang-0.3.3.dist-info/RECORD,,
|