sglang 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +23 -1
- sglang/bench_latency.py +46 -25
- sglang/bench_serving.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +14 -1
- sglang/lang/interpreter.py +16 -6
- sglang/lang/ir.py +20 -4
- sglang/srt/configs/model_config.py +11 -9
- sglang/srt/constrained/fsm_cache.py +9 -1
- sglang/srt/constrained/jump_forward.py +15 -2
- sglang/srt/layers/activation.py +4 -4
- sglang/srt/layers/attention/__init__.py +49 -0
- sglang/srt/layers/attention/flashinfer_backend.py +277 -0
- sglang/srt/layers/{flashinfer_utils.py → attention/flashinfer_utils.py} +82 -80
- sglang/srt/layers/attention/triton_backend.py +161 -0
- sglang/srt/layers/{triton_attention → attention/triton_ops}/extend_attention.py +3 -1
- sglang/srt/layers/layernorm.py +4 -4
- sglang/srt/layers/logits_processor.py +19 -15
- sglang/srt/layers/pooler.py +3 -3
- sglang/srt/layers/quantization/__init__.py +0 -2
- sglang/srt/layers/radix_attention.py +6 -4
- sglang/srt/layers/sampler.py +6 -4
- sglang/srt/layers/torchao_utils.py +18 -0
- sglang/srt/lora/lora.py +20 -21
- sglang/srt/lora/lora_manager.py +97 -25
- sglang/srt/managers/detokenizer_manager.py +31 -18
- sglang/srt/managers/image_processor.py +187 -0
- sglang/srt/managers/io_struct.py +99 -75
- sglang/srt/managers/schedule_batch.py +184 -63
- sglang/srt/managers/{policy_scheduler.py → schedule_policy.py} +31 -21
- sglang/srt/managers/scheduler.py +1021 -0
- sglang/srt/managers/tokenizer_manager.py +120 -248
- sglang/srt/managers/tp_worker.py +28 -925
- sglang/srt/mem_cache/memory_pool.py +34 -52
- sglang/srt/model_executor/cuda_graph_runner.py +15 -19
- sglang/srt/model_executor/forward_batch_info.py +94 -95
- sglang/srt/model_executor/model_runner.py +76 -75
- sglang/srt/models/baichuan.py +10 -10
- sglang/srt/models/chatglm.py +12 -12
- sglang/srt/models/commandr.py +10 -10
- sglang/srt/models/dbrx.py +12 -12
- sglang/srt/models/deepseek.py +10 -10
- sglang/srt/models/deepseek_v2.py +14 -15
- sglang/srt/models/exaone.py +10 -10
- sglang/srt/models/gemma.py +10 -10
- sglang/srt/models/gemma2.py +11 -11
- sglang/srt/models/gpt_bigcode.py +10 -10
- sglang/srt/models/grok.py +10 -10
- sglang/srt/models/internlm2.py +10 -10
- sglang/srt/models/llama.py +14 -10
- sglang/srt/models/llama_classification.py +5 -5
- sglang/srt/models/llama_embedding.py +4 -4
- sglang/srt/models/llama_reward.py +142 -0
- sglang/srt/models/llava.py +39 -33
- sglang/srt/models/llavavid.py +31 -28
- sglang/srt/models/minicpm.py +10 -10
- sglang/srt/models/minicpm3.py +14 -15
- sglang/srt/models/mixtral.py +10 -10
- sglang/srt/models/mixtral_quant.py +10 -10
- sglang/srt/models/olmoe.py +10 -10
- sglang/srt/models/qwen.py +10 -10
- sglang/srt/models/qwen2.py +11 -11
- sglang/srt/models/qwen2_moe.py +10 -10
- sglang/srt/models/stablelm.py +10 -10
- sglang/srt/models/torch_native_llama.py +506 -0
- sglang/srt/models/xverse.py +10 -10
- sglang/srt/models/xverse_moe.py +10 -10
- sglang/srt/sampling/sampling_batch_info.py +36 -27
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +170 -119
- sglang/srt/server_args.py +54 -27
- sglang/srt/utils.py +101 -128
- sglang/test/runners.py +71 -26
- sglang/test/test_programs.py +38 -5
- sglang/test/test_utils.py +18 -9
- sglang/version.py +1 -1
- {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/METADATA +37 -19
- sglang-0.3.3.dist-info/RECORD +139 -0
- sglang/srt/layers/attention_backend.py +0 -474
- sglang/srt/managers/controller_multi.py +0 -207
- sglang/srt/managers/controller_single.py +0 -164
- sglang-0.3.2.dist-info/RECORD +0 -135
- /sglang/srt/layers/{triton_attention → attention/triton_ops}/decode_attention.py +0 -0
- /sglang/srt/layers/{triton_attention → attention/triton_ops}/prefill_attention.py +0 -0
- {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/LICENSE +0 -0
- {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/WHEEL +0 -0
- {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/top_level.txt +0 -0
sglang/test/test_programs.py
CHANGED
@@ -72,7 +72,7 @@ def test_select(check_answer):
|
|
72
72
|
statement="The capital of Germany is Berlin.",
|
73
73
|
)
|
74
74
|
if check_answer:
|
75
|
-
assert ret["answer"] == "True", ret.text
|
75
|
+
assert ret["answer"] == "True", ret.text()
|
76
76
|
else:
|
77
77
|
assert ret["answer"] in ["True", "False", "Unknown"]
|
78
78
|
|
@@ -80,7 +80,7 @@ def test_select(check_answer):
|
|
80
80
|
statement="The capital of Canada is Tokyo.",
|
81
81
|
)
|
82
82
|
if check_answer:
|
83
|
-
assert ret["answer"] == "False", ret.text
|
83
|
+
assert ret["answer"] == "False", ret.text()
|
84
84
|
else:
|
85
85
|
assert ret["answer"] in ["True", "False", "Unknown"]
|
86
86
|
|
@@ -88,7 +88,7 @@ def test_select(check_answer):
|
|
88
88
|
statement="Purple is a better color than green.",
|
89
89
|
)
|
90
90
|
if check_answer:
|
91
|
-
assert ret["answer"] == "Unknown", ret.text
|
91
|
+
assert ret["answer"] == "Unknown", ret.text()
|
92
92
|
else:
|
93
93
|
assert ret["answer"] in ["True", "False", "Unknown"]
|
94
94
|
|
@@ -100,8 +100,8 @@ def test_decode_int():
|
|
100
100
|
s += "The number of days in a year is " + sgl.gen_int("days") + "\n"
|
101
101
|
|
102
102
|
ret = decode_int.run(temperature=0.1)
|
103
|
-
assert int(ret["hours"]) == 24, ret.text
|
104
|
-
assert int(ret["days"]) == 365, ret.text
|
103
|
+
assert int(ret["hours"]) == 24, ret.text()
|
104
|
+
assert int(ret["days"]) == 365, ret.text()
|
105
105
|
|
106
106
|
|
107
107
|
def test_decode_json_regex():
|
@@ -517,3 +517,36 @@ def test_hellaswag_select():
|
|
517
517
|
accuracy = np.mean(np.array(preds) == np.array(labels))
|
518
518
|
|
519
519
|
return accuracy, latency
|
520
|
+
|
521
|
+
|
522
|
+
def test_gen_min_new_tokens():
|
523
|
+
"""
|
524
|
+
Validate sgl.gen(min_tokens) functionality.
|
525
|
+
|
526
|
+
The test asks a question where, without a min_tokens constraint, the generated answer is expected to be short.
|
527
|
+
By enforcing the min_tokens parameter, we ensure the generated answer has at least the specified number of tokens.
|
528
|
+
We verify that the number of tokens in the answer is >= the min_tokens threshold.
|
529
|
+
"""
|
530
|
+
import sglang as sgl
|
531
|
+
from sglang.srt.hf_transformers_utils import get_tokenizer
|
532
|
+
|
533
|
+
model_path = sgl.global_config.default_backend.endpoint.get_model_name()
|
534
|
+
MIN_TOKENS, MAX_TOKENS = 64, 128
|
535
|
+
|
536
|
+
@sgl.function
|
537
|
+
def convo_1(s):
|
538
|
+
s += sgl.user("What is the capital of the United States?")
|
539
|
+
s += sgl.assistant(
|
540
|
+
sgl.gen("answer", min_tokens=MIN_TOKENS, max_tokens=MAX_TOKENS)
|
541
|
+
)
|
542
|
+
|
543
|
+
def assert_min_tokens(tokenizer, text):
|
544
|
+
token_ids = tokenizer.encode(text)
|
545
|
+
assert (
|
546
|
+
len(token_ids) >= MIN_TOKENS
|
547
|
+
), f"Generated {len(token_ids)} tokens, min required: {MIN_TOKENS}. Text: {text}"
|
548
|
+
|
549
|
+
tokenizer = get_tokenizer(model_path)
|
550
|
+
|
551
|
+
state = convo_1.run()
|
552
|
+
assert_min_tokens(tokenizer, state["answer"])
|
sglang/test/test_utils.py
CHANGED
@@ -23,13 +23,13 @@ from sglang.srt.utils import kill_child_process
|
|
23
23
|
from sglang.utils import get_exception_traceback
|
24
24
|
|
25
25
|
DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
|
26
|
-
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/
|
26
|
+
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
|
27
27
|
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
28
28
|
DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
|
29
29
|
DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
|
30
30
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
|
31
|
-
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/
|
32
|
-
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/
|
31
|
+
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
|
32
|
+
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
|
33
33
|
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
|
34
34
|
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
|
35
35
|
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
|
@@ -85,7 +85,7 @@ def call_generate_vllm(prompt, temperature, max_tokens, stop=None, n=1, url=None
|
|
85
85
|
|
86
86
|
|
87
87
|
def call_generate_outlines(
|
88
|
-
prompt, temperature, max_tokens, stop=
|
88
|
+
prompt, temperature, max_tokens, stop=None, regex=None, n=1, url=None
|
89
89
|
):
|
90
90
|
assert url is not None
|
91
91
|
|
@@ -514,7 +514,16 @@ def get_similarities(vec1, vec2):
|
|
514
514
|
return F.cosine_similarity(torch.tensor(vec1), torch.tensor(vec2), dim=0)
|
515
515
|
|
516
516
|
|
517
|
-
def run_bench_serving(
|
517
|
+
def run_bench_serving(
|
518
|
+
model,
|
519
|
+
num_prompts,
|
520
|
+
request_rate,
|
521
|
+
other_server_args,
|
522
|
+
dataset_name="random",
|
523
|
+
random_input_len=4096,
|
524
|
+
random_output_len=2048,
|
525
|
+
disable_stream=False,
|
526
|
+
):
|
518
527
|
# Launch the server
|
519
528
|
base_url = DEFAULT_URL_FOR_TEST
|
520
529
|
process = popen_launch_server(
|
@@ -530,21 +539,21 @@ def run_bench_serving(model, num_prompts, request_rate, other_server_args):
|
|
530
539
|
base_url=base_url,
|
531
540
|
host=None,
|
532
541
|
port=None,
|
533
|
-
dataset_name=
|
542
|
+
dataset_name=dataset_name,
|
534
543
|
dataset_path="",
|
535
544
|
model=None,
|
536
545
|
tokenizer=None,
|
537
546
|
num_prompts=num_prompts,
|
538
547
|
sharegpt_output_len=None,
|
539
|
-
random_input_len=
|
540
|
-
random_output_len=
|
548
|
+
random_input_len=random_input_len,
|
549
|
+
random_output_len=random_output_len,
|
541
550
|
random_range_ratio=0.0,
|
542
551
|
request_rate=request_rate,
|
543
552
|
multi=None,
|
544
553
|
seed=0,
|
545
554
|
output_file=None,
|
546
555
|
disable_tqdm=False,
|
547
|
-
disable_stream=
|
556
|
+
disable_stream=disable_stream,
|
548
557
|
disable_ignore_eos=False,
|
549
558
|
extra_request_body=None,
|
550
559
|
)
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.3.
|
1
|
+
__version__ = "0.3.3"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.3.
|
3
|
+
Version: 0.3.3
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -248,6 +248,7 @@ Requires-Dist: uvloop; extra == "srt"
|
|
248
248
|
Requires-Dist: zmq; extra == "srt"
|
249
249
|
Requires-Dist: vllm==0.5.5; extra == "srt"
|
250
250
|
Requires-Dist: outlines>=0.0.44; extra == "srt"
|
251
|
+
Requires-Dist: modelscope; extra == "srt"
|
251
252
|
Provides-Extra: test
|
252
253
|
Requires-Dist: jsonlines; extra == "test"
|
253
254
|
Requires-Dist: matplotlib; extra == "test"
|
@@ -269,16 +270,11 @@ Requires-Dist: peft; extra == "test"
|
|
269
270
|
|
270
271
|
--------------------------------------------------------------------------------
|
271
272
|
|
272
|
-
| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/
|
273
|
+
| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
|
273
274
|
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
|
279
|
-
- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
|
280
|
-
- **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
|
281
|
-
- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
|
275
|
+
## Upcoming Events
|
276
|
+
- [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
|
277
|
+
- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
|
282
278
|
|
283
279
|
## News
|
284
280
|
- [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
|
@@ -294,6 +290,16 @@ The core features include:
|
|
294
290
|
|
295
291
|
</details>
|
296
292
|
|
293
|
+
## About
|
294
|
+
SGLang is a fast serving framework for large language models and vision language models.
|
295
|
+
It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
|
296
|
+
The core features include:
|
297
|
+
|
298
|
+
- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
|
299
|
+
- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
|
300
|
+
- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
|
301
|
+
- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
|
302
|
+
|
297
303
|
## Contents
|
298
304
|
- [Install](#install)
|
299
305
|
- [Backend: SGLang Runtime (SRT)](#backend-sglang-runtime-srt)
|
@@ -318,7 +324,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
|
|
318
324
|
### Method 2: From source
|
319
325
|
```
|
320
326
|
# Use the last release branch
|
321
|
-
git clone -b v0.3.
|
327
|
+
git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
|
322
328
|
cd sglang
|
323
329
|
|
324
330
|
pip install --upgrade pip
|
@@ -339,7 +345,7 @@ docker run --gpus all \
|
|
339
345
|
--env "HF_TOKEN=<secret>" \
|
340
346
|
--ipc=host \
|
341
347
|
lmsysorg/sglang:latest \
|
342
|
-
python3 -m sglang.launch_server --model-path meta-llama/
|
348
|
+
python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
|
343
349
|
```
|
344
350
|
|
345
351
|
### Method 4: Using docker compose
|
@@ -379,7 +385,7 @@ resources:
|
|
379
385
|
run: |
|
380
386
|
conda deactivate
|
381
387
|
python3 -m sglang.launch_server \
|
382
|
-
--model-path meta-llama/
|
388
|
+
--model-path meta-llama/Llama-3.1-8B-Instruct \
|
383
389
|
--host 0.0.0.0 \
|
384
390
|
--port 30000
|
385
391
|
```
|
@@ -421,7 +427,8 @@ curl http://localhost:30000/generate \
|
|
421
427
|
}
|
422
428
|
}'
|
423
429
|
```
|
424
|
-
|
430
|
+
|
431
|
+
Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).
|
425
432
|
|
426
433
|
### OpenAI Compatible API
|
427
434
|
In addition, the server supports OpenAI-compatible APIs.
|
@@ -460,7 +467,7 @@ response = client.embeddings.create(
|
|
460
467
|
print(response)
|
461
468
|
```
|
462
469
|
|
463
|
-
It supports streaming, vision, and
|
470
|
+
It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
|
464
471
|
|
465
472
|
### Additional Server Arguments
|
466
473
|
- To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
|
@@ -481,10 +488,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
481
488
|
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
|
482
489
|
```
|
483
490
|
- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
|
491
|
+
- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
|
484
492
|
- To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
|
485
493
|
- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
|
486
494
|
- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
|
487
|
-
- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
|
495
|
+
- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port; you can then use the following commands. If you encounter a deadlock, try adding `--disable-cuda-graph`.
|
488
496
|
```
|
489
497
|
# Node 0
|
490
498
|
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
|
@@ -499,9 +507,9 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
499
507
|
- Llama / Llama 2 / Llama 3 / Llama 3.1
|
500
508
|
- Mistral / Mixtral / Mistral NeMo
|
501
509
|
- Gemma / Gemma 2
|
502
|
-
- OLMoE
|
503
510
|
- Qwen / Qwen 2 / Qwen 2 MoE
|
504
511
|
- DeepSeek / DeepSeek 2
|
512
|
+
- OLMoE
|
505
513
|
- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
|
506
514
|
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
|
507
515
|
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
|
@@ -523,7 +531,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
523
531
|
- XVERSE / XVERSE MoE
|
524
532
|
- SmolLM
|
525
533
|
|
526
|
-
|
527
534
|
**Embedding Models**
|
528
535
|
|
529
536
|
- e5-mistral
|
@@ -544,6 +551,17 @@ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instru
|
|
544
551
|
```
|
545
552
|
SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
|
546
553
|
```
|
554
|
+
|
555
|
+
Or start it with Docker.
|
556
|
+
```bash
|
557
|
+
docker run --gpus all \
|
558
|
+
-p 30000:30000 \
|
559
|
+
-v ~/.cache/modelscope:/root/.cache/modelscope \
|
560
|
+
--env "SGLANG_USE_MODELSCOPE=true" \
|
561
|
+
--ipc=host \
|
562
|
+
lmsysorg/sglang:latest \
|
563
|
+
python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
|
564
|
+
```
|
547
565
|
|
548
566
|
</details>
|
549
567
|
|
@@ -582,7 +600,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
|
|
582
600
|
The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may find it easier to use for complex prompting workflows.
|
583
601
|
|
584
602
|
### Quick Start
|
585
|
-
The example below shows how to use sglang to answer a
|
603
|
+
The example below shows how to use sglang to answer a multi-turn question.
|
586
604
|
|
587
605
|
#### Using Local Models
|
588
606
|
First, launch a server with
|
@@ -0,0 +1,139 @@
|
|
1
|
+
sglang/__init__.py,sha256=b_pqO9bR2fjK9En_tigfzKTiQzE8b_hUizY0DAKVk1M,1616
|
2
|
+
sglang/api.py,sha256=5x591S4rLbmNPs75qPwGKVu1sonVGDyjPAJlHTyWw50,6956
|
3
|
+
sglang/bench_latency.py,sha256=NkaL4YFWqDnochwaLd8o2pyZGqu6TeURbFB3TGyZHr4,17893
|
4
|
+
sglang/bench_server_latency.py,sha256=rRSDqjJ5jan9AzppOGx75KRUjZCU2dUG2h06CQOdJgk,5377
|
5
|
+
sglang/bench_serving.py,sha256=1AQzkQ8ci9-rMZEM7wap8I09oPP4AZd93RfXMQRgVro,36386
|
6
|
+
sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
|
7
|
+
sglang/global_config.py,sha256=38id86i3tRGCSOFZlN1LM01a3xt-V98xuNgKGG9boCk,1058
|
8
|
+
sglang/launch_server.py,sha256=UnjNjYuZ8TtvmRtgYEsFImkbvCwvn_tQjk0V7cHy67E,450
|
9
|
+
sglang/launch_server_llavavid.py,sha256=olPKyhozi1coCwoRMwBRYWsTFByrgus9CwPSeNmskgc,1002
|
10
|
+
sglang/utils.py,sha256=NA_4xUrTI7KICQ3PEACfNWKE3nxSA5QvQZJNd4TQrDc,9395
|
11
|
+
sglang/version.py,sha256=8KcCYTXH99C2-gCLuPILJvtT9YftRWJsartIx6TQ2ZY,22
|
12
|
+
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
|
+
sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
|
14
|
+
sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
|
15
|
+
sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
|
16
|
+
sglang/lang/interpreter.py,sha256=zakc6IkzATaMqVDWKWvqDRrqnRykxFawajA7aUHUDbI,30640
|
17
|
+
sglang/lang/ir.py,sha256=F_9ac10OjktxR7KhOV07wiJXV20s79cRfh9d4koExJc,18262
|
18
|
+
sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
|
19
|
+
sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
|
+
sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
|
21
|
+
sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
|
22
|
+
sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
|
23
|
+
sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
|
24
|
+
sglang/lang/backend/runtime_endpoint.py,sha256=iVb7SlrpJ1ic92QG5kQUphZUb2EaVWY43dkmAO5pju4,10514
|
25
|
+
sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
|
26
|
+
sglang/srt/conversation.py,sha256=S5w5V6G1xigNxa3UQoSxRcMpQLWWDT9EPBoHBvHkSAk,19663
|
27
|
+
sglang/srt/hf_transformers_utils.py,sha256=rt6flb6BoYTO8fw7AKCXmQLJx5XuSUuRmZX-VJHmuLQ,6064
|
28
|
+
sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
|
29
|
+
sglang/srt/server.py,sha256=SKV6IxR8w0AmuwgHSEOfag_t-f6hAEq9Xg49iBioi2U,22224
|
30
|
+
sglang/srt/server_args.py,sha256=LI8ehxs0sfI0EDhON-OhNGbDx0-oo9QhfnpYjYwnH54,24405
|
31
|
+
sglang/srt/utils.py,sha256=amDWXIu1syU-kvdV8bUkNfYaMfpcN22BKZm_2xp59jI,22202
|
32
|
+
sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
|
33
|
+
sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
|
34
|
+
sglang/srt/configs/model_config.py,sha256=36My-o44trhWY3KYDeSFMGvv9XuUtIVI5e7F8VlOTWo,6723
|
35
|
+
sglang/srt/constrained/__init__.py,sha256=ze8awDPvwAzdeMwzJ-25kXOQ4nVWoaP55jBDt5UOS_4,2070
|
36
|
+
sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
|
37
|
+
sglang/srt/constrained/fsm_cache.py,sha256=9GtliIN55Ov8Q9MSFfQC5rKrz3qTsB7Cm5OkhivKngY,3271
|
38
|
+
sglang/srt/constrained/jump_forward.py,sha256=o-CzJu3DEs0eFKlLzsQVYMSo4vBKpffs25sXLOJd6jc,6997
|
39
|
+
sglang/srt/layers/activation.py,sha256=7VEkCrx2dvl629Lz0fkJcJfVoZA-ykEdkpTzKEc_drQ,5225
|
40
|
+
sglang/srt/layers/layernorm.py,sha256=HCj8Y_X6MNNdtQU2sWKgyjIqVERxl9dqrmjbBbyJjpE,3796
|
41
|
+
sglang/srt/layers/linear.py,sha256=9rjCiSb_QOn5RgpVjIhEKdReRvSYVfcTSjbWBEbApLI,45173
|
42
|
+
sglang/srt/layers/logits_processor.py,sha256=Fq7VHwjP4iSzl_OBLo8qw_HVbIDbYB-0MGmfiD3Jk_E,12521
|
43
|
+
sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
|
44
|
+
sglang/srt/layers/radix_attention.py,sha256=i07VRXPDHj-zJ1TSrXEqCxumQwYSHwAvc8DoIg-Irtg,1964
|
45
|
+
sglang/srt/layers/sampler.py,sha256=J5vd0CcLpLfgtLniCoe2VF6hjM_ld76hbDG4p1qoAMc,4010
|
46
|
+
sglang/srt/layers/torchao_utils.py,sha256=1nzZkSzbF4qCAMeBKAeeDpMl_mK8imiY2RL3xFEgvAw,3340
|
47
|
+
sglang/srt/layers/attention/__init__.py,sha256=zLLwinbYLAQHfVEz0jZiVa_cYNgSYoy4wYD_0y-ErHQ,1798
|
48
|
+
sglang/srt/layers/attention/flashinfer_backend.py,sha256=DOvm-d3XLjE6XJDD3a8aCnlpuAJZZ946YFDH_Ec4lqc,10150
|
49
|
+
sglang/srt/layers/attention/flashinfer_utils.py,sha256=9YMt7ab6F0gEVkxdVm8vDB0LVBRYRL0XIKVrmndp4n8,7571
|
50
|
+
sglang/srt/layers/attention/triton_backend.py,sha256=I_kw0LXdgziHAFC8Qv5n5PDFJRLvZyzVsXwjmFZ0KSc,6041
|
51
|
+
sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=XCQTX0kUttT1AG5FRMgfQbiXgvoempYD0UR2r6D_vJg,16711
|
52
|
+
sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=oyqon1KG5-ICHcCANAbrglXLYKvWHFML-4tIQI9M5VI,11063
|
53
|
+
sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=QkXPcT02c13zha2M4mBm2S5dh_sS-Gc4FkkrcywRqvc,5377
|
54
|
+
sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
|
55
|
+
sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
|
56
|
+
sglang/srt/layers/fused_moe/layer.py,sha256=raFyvPzjYz-Fv8B3IcOxQYKKCWqXis5mXwg1GFE61y4,22243
|
57
|
+
sglang/srt/layers/fused_moe/patch.py,sha256=B9cDtHqHfnWE0QqZAffvUi6cVRKcMBMKDGJWGIaKh3U,3898
|
58
|
+
sglang/srt/layers/quantization/__init__.py,sha256=QilMNqgu3eOFUkEjXLSDa1NvoNdi_CAvC8a1hprOgN8,2979
|
59
|
+
sglang/srt/layers/quantization/base_config.py,sha256=vlpSPvSrFmUe65ETg4SoPocQ9bVNY6As3QuHdr_3Dr4,4023
|
60
|
+
sglang/srt/lora/lora.py,sha256=a5j_Yy0s95msVPFgOuH5PCe7sMu0AyZFQ5wL0H-YIg8,14913
|
61
|
+
sglang/srt/lora/lora_config.py,sha256=paVB7F7SIuxr_vodvKf8zzAlH2fdVYHhXxcXV62D0Vo,1411
|
62
|
+
sglang/srt/lora/lora_manager.py,sha256=gzBwYXZEPYj56PkGTshTbWRfl_370wb6uTcRhDaLiF8,12801
|
63
|
+
sglang/srt/managers/detokenizer_manager.py,sha256=iCLPdHkL6lAp_-Qew1u4Tyt3jYRkJ8i-Bj3l8TC-uaA,7278
|
64
|
+
sglang/srt/managers/image_processor.py,sha256=9Y9RqyLdbt4uOK7pnJCJIhY77791klskSrEg8U6pyS4,6910
|
65
|
+
sglang/srt/managers/io_struct.py,sha256=rPyQk5y-jJu4eyoqUVh4M8B14PifjkE8B3K5yI0NX24,12185
|
66
|
+
sglang/srt/managers/schedule_batch.py,sha256=mqdMg1QB6PNLbBjxkXoP_Ld82R1w34g_13YH82DGMh8,31216
|
67
|
+
sglang/srt/managers/schedule_policy.py,sha256=PiTKvsAFwoNWNsv_SFkghIHCL452MdboRc2cmN6ITcU,11935
|
68
|
+
sglang/srt/managers/scheduler.py,sha256=N9GQnp2SXd8-uN49KmQO-144N27M6h3dxRZuFZ-9AmY,39132
|
69
|
+
sglang/srt/managers/tokenizer_manager.py,sha256=BAvLW_cRtIgjL0_cwrvDAb7g740fgEddyqaT3JtofR4,24548
|
70
|
+
sglang/srt/managers/tp_worker.py,sha256=fcaW-u7AAX49kQCNn_AEtdRPykRdT6Z6lx1O9LHA15E,4833
|
71
|
+
sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
|
72
|
+
sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
|
73
|
+
sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
|
74
|
+
sglang/srt/mem_cache/memory_pool.py,sha256=L-5drUt7vlyvple4OcjH1jJRzt2qhVrpc9klZn-bQfE,7125
|
75
|
+
sglang/srt/mem_cache/radix_cache.py,sha256=00bghOihUm7lA1i4gxxMYQLept9LaHg2ZSXZryuFZZI,10121
|
76
|
+
sglang/srt/model_executor/cuda_graph_runner.py,sha256=iheZYErwFT_W4kJUE1dgbGoQQx7hyOSKa-Yv8guq0DI,10479
|
77
|
+
sglang/srt/model_executor/forward_batch_info.py,sha256=FIQ8XIIP724mIL2l7w7mSEFH452qw-TPpqm43J4YeHM,5822
|
78
|
+
sglang/srt/model_executor/model_runner.py,sha256=KyglHFIMb5TC-NszN2D85_k7oVQLhbwhUYa7u3RFkoc,22874
|
79
|
+
sglang/srt/models/baichuan.py,sha256=50m43kIVo-YamHFwxyiLGG_pCbF7mzUJfhEyuuSmVC8,15100
|
80
|
+
sglang/srt/models/chatglm.py,sha256=XaS_6-ZvRw7X-56sk9xQogqT0NzGEMVpiAdQnC5qbBY,13333
|
81
|
+
sglang/srt/models/commandr.py,sha256=2urK7u2FiwPBl60hMmt-wfaJ8V-ilv6l1B37MUlvSxk,14121
|
82
|
+
sglang/srt/models/dbrx.py,sha256=qTpyA1Iv56VI-ksPKt4JryX2Pn7T5FXAa0n0ZoT4qbw,14615
|
83
|
+
sglang/srt/models/deepseek.py,sha256=4sl4YYoxqe-vif7KJKcMjMA3KgvzYHqpQBgM58lzLHc,15973
|
84
|
+
sglang/srt/models/deepseek_v2.py,sha256=dt0FGAgW3jd7OJJnKfH-LIU13U0I9b7R9shYmAEins4,28390
|
85
|
+
sglang/srt/models/exaone.py,sha256=9JfFhYbpcHMXIaBNn8rc_GOlkItkIgbGNslNyFD7gvU,13054
|
86
|
+
sglang/srt/models/gemma.py,sha256=gui46inEJsrmppEMTUIQuzMxGPEBx_TjiZ5-PacjuSk,12240
|
87
|
+
sglang/srt/models/gemma2.py,sha256=V0GjEdTqxyXvBqjgyiyONipohjOqw0pLITmZZRb2kIE,14890
|
88
|
+
sglang/srt/models/gpt_bigcode.py,sha256=LgSm-8oxBfnzMAC4Jqqg-RJGge4E_wgJ1br7ylbTPZ0,10162
|
89
|
+
sglang/srt/models/grok.py,sha256=lUR_SmD_KhIiZx5OVUPZp8VVdrAga6WWTdMKJ5PCFbw,14896
|
90
|
+
sglang/srt/models/internlm2.py,sha256=4SUaeJl2dZlUowahfv7kLbz3jLXtmvdBPGURmhAeX6Q,12169
|
91
|
+
sglang/srt/models/llama.py,sha256=5j66LmvFhOKgFZiE75mJ80XBjZ2dNx7e8Yea5lsD0P0,15828
|
92
|
+
sglang/srt/models/llama_classification.py,sha256=Yhabu9FuBxjNo74crMsK0FqpD53ehOx_zcHgIXjvlvQ,3379
|
93
|
+
sglang/srt/models/llama_embedding.py,sha256=4j3WNLB-x7XQnJvohdRs7VSSEabbhiE2BRHmnG5IZRU,3453
|
94
|
+
sglang/srt/models/llama_reward.py,sha256=qQOPfn-9oqhsD0EaffXtk-EXKRdSZL1X7CYAGCDoG9A,5383
|
95
|
+
sglang/srt/models/llava.py,sha256=zbJs1P4_Bjh2_dSbyoheJZ1wGXuKHGz6BpV766G7ZUY,25094
|
96
|
+
sglang/srt/models/llavavid.py,sha256=qhBGHTxzGAOMgqMiwOc3mUbaK6qeXsEYSlNmlEEIdeM,12198
|
97
|
+
sglang/srt/models/minicpm.py,sha256=5vc-Lq7ggHrRxxkciVMdZ5Vq6ThLwnhFS62UCokFC2g,13792
|
98
|
+
sglang/srt/models/minicpm3.py,sha256=hhhgZTKQApUZpH_MYQZTk3K1Ox-xpJRxGCemoUw8x4U,25184
|
99
|
+
sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
|
100
|
+
sglang/srt/models/mixtral.py,sha256=BonqX_rSB_UuBDQe3uy8-NOxB4Q4s2mTxTQItvFB9ZQ,13864
|
101
|
+
sglang/srt/models/mixtral_quant.py,sha256=SAHBIiD5O1TnojCpqTLcPy3TEvfSCKeOe3GC47fdFSg,14039
|
102
|
+
sglang/srt/models/olmoe.py,sha256=ghhNpZe4SzaZEpw0APYBbAmLb3LBagRC2N724RkOkH4,15312
|
103
|
+
sglang/srt/models/qwen.py,sha256=IrOKHS7b4SL2fnJegq811eeHnAQDya2PujIgKQ9URVY,9921
|
104
|
+
sglang/srt/models/qwen2.py,sha256=B7hXnW5uYPmpMgSN7tI3tTvMEmmQLpddsw_iNTiaHJI,12398
|
105
|
+
sglang/srt/models/qwen2_moe.py,sha256=MK-9W6FJhXoQYayg_jpXjKKq4n5j3s2b2ZaoCBfVJ2I,17120
|
106
|
+
sglang/srt/models/stablelm.py,sha256=ldtlRG1XGdYcjwqb48dpMTfbdh8KHUjcWrrUYNJ0MEk,11326
|
107
|
+
sglang/srt/models/torch_native_llama.py,sha256=c5GJ_k9zbSOk0PjLCXAK8YebGEy0RUVYZ9_h6_19A3M,19215
|
108
|
+
sglang/srt/models/xverse.py,sha256=i11wEKqqVCoVtH7yo9jfpNyGHxhw7NvTPid3ojmg79s,13634
|
109
|
+
sglang/srt/models/xverse_moe.py,sha256=JwkBhsyusP7e_hAMnomkP8cEmKNCLJPRtwaTERQ0D0M,15818
|
110
|
+
sglang/srt/models/yivl.py,sha256=N3noJ5M-FiZS-E_zfaJs4prQOu_ineRt11MWloYgOR8,4826
|
111
|
+
sglang/srt/openai_api/adapter.py,sha256=ULX1lo23r6semogKcbUOXGSgPJi8NJ7IuC0WVvEbVbs,51458
|
112
|
+
sglang/srt/openai_api/protocol.py,sha256=rdSwUAoO5-KLemJOE50xwSUagxY4T1QIiNyCYsTtCi0,9868
|
113
|
+
sglang/srt/sampling/sampling_batch_info.py,sha256=mtE_kLC6U-X6Q20BVjPWyDOoGc4kcTdIPpcsNeZcRYo,6462
|
114
|
+
sglang/srt/sampling/sampling_params.py,sha256=Xwh4_M6PP4SWyGV-zNyIhp4XbRKbeU4251ao8UOlZlI,5704
|
115
|
+
sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
|
116
|
+
sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
|
117
|
+
sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq_ji-0Zhcz_r5mUa3T3GaIydVS6K4FhWfE,2557
|
118
|
+
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
|
119
|
+
sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
|
120
|
+
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
|
121
|
+
sglang/test/few_shot_gsm8k.py,sha256=To7Sdg-DLF8poIQLwiOBYKbkz-1C_gn6H79vIbyPR-o,3860
|
122
|
+
sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
|
123
|
+
sglang/test/runners.py,sha256=VCmtH08FsAq_JTAKfKo0zB4o-osNMAxxwe4aKcSxr4c,13515
|
124
|
+
sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
|
125
|
+
sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
|
126
|
+
sglang/test/simple_eval_humaneval.py,sha256=7lTi841NT58smNOtRwCedrdX9IWWypdLkOtaQOBy-GI,5687
|
127
|
+
sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWUpk,2550
|
128
|
+
sglang/test/simple_eval_mgsm.py,sha256=wfbqJW9Rkc66vzq2fEMF6jchmoA8mw1OUiGU55cZ2B0,10261
|
129
|
+
sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
|
130
|
+
sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
|
131
|
+
sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
|
132
|
+
sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
|
133
|
+
sglang/test/test_utils.py,sha256=NkJuezjmonjgC3_i_CTBd8KSqWh6W9CLcgoaqvTNK2U,18684
|
134
|
+
sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
|
135
|
+
sglang-0.3.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
136
|
+
sglang-0.3.3.dist-info/METADATA,sha256=zeY2pmiGPJb52zaHqiRHY4OcZqAHPvG_zPyve5KfANc,39063
|
137
|
+
sglang-0.3.3.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
138
|
+
sglang-0.3.3.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
139
|
+
sglang-0.3.3.dist-info/RECORD,,
|