sglang 0.2.10__py3-none-any.whl → 0.2.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -0
- sglang/api.py +10 -2
- sglang/bench_latency.py +145 -36
- sglang/check_env.py +24 -2
- sglang/global_config.py +0 -1
- sglang/lang/backend/base_backend.py +3 -1
- sglang/lang/backend/openai.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +46 -29
- sglang/lang/choices.py +164 -0
- sglang/lang/interpreter.py +6 -13
- sglang/lang/ir.py +11 -2
- sglang/srt/layers/logits_processor.py +1 -1
- sglang/srt/layers/radix_attention.py +2 -5
- sglang/srt/managers/schedule_batch.py +95 -324
- sglang/srt/managers/tokenizer_manager.py +6 -3
- sglang/srt/managers/tp_worker.py +20 -22
- sglang/srt/mem_cache/memory_pool.py +9 -14
- sglang/srt/model_executor/cuda_graph_runner.py +3 -3
- sglang/srt/model_executor/forward_batch_info.py +256 -0
- sglang/srt/model_executor/model_runner.py +6 -10
- sglang/srt/models/chatglm.py +1 -1
- sglang/srt/models/commandr.py +1 -1
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/deepseek.py +1 -1
- sglang/srt/models/deepseek_v2.py +1 -1
- sglang/srt/models/gemma.py +1 -1
- sglang/srt/models/gemma2.py +1 -1
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/grok.py +1 -1
- sglang/srt/models/internlm2.py +1 -1
- sglang/srt/models/llama2.py +1 -1
- sglang/srt/models/llama_classification.py +1 -1
- sglang/srt/models/llava.py +1 -2
- sglang/srt/models/llavavid.py +1 -2
- sglang/srt/models/minicpm.py +1 -1
- sglang/srt/models/mixtral.py +1 -1
- sglang/srt/models/mixtral_quant.py +1 -1
- sglang/srt/models/qwen.py +1 -1
- sglang/srt/models/qwen2.py +1 -1
- sglang/srt/models/qwen2_moe.py +1 -1
- sglang/srt/models/stablelm.py +1 -1
- sglang/srt/openai_api/adapter.py +34 -12
- sglang/srt/openai_api/protocol.py +6 -0
- sglang/srt/server.py +24 -6
- sglang/srt/server_args.py +4 -0
- sglang/test/test_utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/METADATA +34 -24
- {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/RECORD +52 -50
- {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/LICENSE +0 -0
- {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/WHEEL +0 -0
- {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/top_level.txt +0 -0
sglang/srt/models/llavavid.py
CHANGED
@@ -26,13 +26,12 @@ from vllm.config import CacheConfig
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
-from sglang.srt.managers.schedule_batch import ForwardMode
 from sglang.srt.mm_utils import (
     get_anyres_image_grid_shape,
     unpad_image,
     unpad_image_shape,
 )
-from sglang.srt.model_executor.
+from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
 from sglang.srt.models.llama2 import LlamaForCausalLM
 
 
sglang/srt/models/minicpm.py
CHANGED
@@ -39,7 +39,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.model_executor.
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
 class MiniCPMMLP(nn.Module):
sglang/srt/models/mixtral.py
CHANGED
@@ -50,7 +50,7 @@ from vllm.utils import print_warning_once
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.model_executor.
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
 class MixtralMoE(nn.Module):
sglang/srt/models/mixtral_quant.py
CHANGED
@@ -45,7 +45,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.model_executor.
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
 class MixtralMLP(nn.Module):
sglang/srt/models/qwen.py
CHANGED
@@ -39,7 +39,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.model_executor.
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
 class QWenMLP(nn.Module):
sglang/srt/models/qwen2.py
CHANGED
@@ -39,7 +39,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.model_executor.
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 Qwen2Config = None
 
sglang/srt/models/qwen2_moe.py
CHANGED
@@ -51,7 +51,7 @@ from vllm.sequence import IntermediateTensors, SamplerOutput
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.model_executor.
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
 class Qwen2MoeMLP(nn.Module):
sglang/srt/models/stablelm.py
CHANGED
@@ -40,7 +40,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.model_executor.
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
 class StablelmMLP(nn.Module):
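Every model-file hunk above makes the same one-line change: `InputMetadata` (plus `ForwardMode` in llavavid.py) now lives in the new `sglang/srt/model_executor/forward_batch_info.py` module introduced in this release. A minimal sketch of the import downstream code would use after upgrading, taken directly from the plus lines above:

```python
# New import location as of sglang 0.2.11 (per the plus lines in the hunks above).
from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
```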
sglang/srt/openai_api/adapter.py
CHANGED
@@ -53,6 +53,7 @@ from sglang.srt.openai_api.protocol import (
     CompletionStreamResponse,
     DeltaMessage,
     ErrorResponse,
+    FileDeleteResponse,
     FileRequest,
     FileResponse,
     LogProbs,
@@ -174,6 +175,20 @@ async def v1_files_create(file: UploadFile, purpose: str, file_storage_pth: str
         return {"error": "Invalid input", "details": e.errors()}
 
 
+async def v1_delete_file(file_id: str):
+    # Retrieve the file job from the in-memory storage
+    file_response = file_id_response.get(file_id)
+    if file_response is None:
+        raise HTTPException(status_code=404, detail="File not found")
+    file_path = file_id_storage.get(file_id)
+    if file_path is None:
+        raise HTTPException(status_code=404, detail="File not found")
+    os.remove(file_path)
+    del file_id_response[file_id]
+    del file_id_storage[file_id]
+    return FileDeleteResponse(id=file_id, deleted=True)
+
+
 async def v1_batches(tokenizer_manager, raw_request: Request):
     try:
         body = await raw_request.json()
@@ -287,6 +302,13 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
         retrieve_batch = batch_storage[batch_id]
         retrieve_batch.output_file_id = output_file_id
         file_id_storage[output_file_id] = output_file_path
+        file_id_response[output_file_id] = FileResponse(
+            id=output_file_id,
+            bytes=os.path.getsize(output_file_path),
+            created_at=int(time.time()),
+            filename=f"{output_file_id}.jsonl",
+            purpose="batch_result",
+        )
         # Update batch status to "completed"
         retrieve_batch.status = "completed"
         retrieve_batch.completed_at = int(time.time())
@@ -380,7 +402,7 @@ def v1_generate_request(all_requests):
         else:
             prompt_kwargs = {"input_ids": prompt}
     else:
-        if isinstance(prompts[0], str)
+        if isinstance(prompts[0], str):
             prompt_kwargs = {"text": prompts}
         else:
             prompt_kwargs = {"input_ids": prompts}
@@ -500,7 +522,9 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
         responses.append(response)
         return responses
     else:
-        prompt_tokens = sum(
+        prompt_tokens = sum(
+            ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request.n)
+        )
         completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)
         response = CompletionResponse(
             id=ret[0]["meta_info"]["id"],
@@ -707,8 +731,6 @@ def v1_chat_generate_request(all_requests, tokenizer_manager):
 
 
 def v1_chat_generate_response(request, ret, to_file=False):
     choices = []
-    total_prompt_tokens = 0
-    total_completion_tokens = 0
 
     for idx, ret_item in enumerate(ret):
         logprobs = False
@@ -747,8 +769,6 @@
             choice_logprobs = ChoiceLogprobs(content=token_logprobs)
         else:
             choice_logprobs = None
-        prompt_tokens = ret_item["meta_info"]["prompt_tokens"]
-        completion_tokens = ret_item["meta_info"]["completion_tokens"]
 
         if to_file:
             # to make the choice data json serializable
@@ -767,8 +787,7 @@
         )
 
         choices.append(choice_data)
-
-        total_completion_tokens += completion_tokens
+
     if to_file:
         responses = []
 
@@ -795,14 +814,18 @@
         responses.append(response)
         return responses
     else:
+        prompt_tokens = sum(
+            ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request.n)
+        )
+        completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)
         response = ChatCompletionResponse(
             id=ret[0]["meta_info"]["id"],
             model=request.model,
             choices=choices,
             usage=UsageInfo(
-                prompt_tokens=
-                completion_tokens=
-                total_tokens=
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=prompt_tokens + completion_tokens,
             ),
         )
         return response
@@ -930,7 +953,6 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
         ).__anext__()
     except ValueError as e:
         return create_error_response(str(e))
-
     if not isinstance(ret, list):
         ret = [ret]
 
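The usage hunks above replace the old per-item counters with a single aggregation. When a request asks for `n` completions per prompt, `ret` holds `n` entries per prompt and each entry repeats that prompt's `prompt_tokens`, so striding by `request.n` counts every prompt exactly once, while completion tokens are summed over all entries. A self-contained sketch with made-up numbers (the dict shape mirrors the `meta_info` entries used in the code above):

```python
# Hypothetical: 2 prompts, n = 3 completions each -> len(ret) == 6 entries,
# with the prompt's token count repeated on every one of its n entries.
request_n = 3
ret = [
    {"meta_info": {"prompt_tokens": 10, "completion_tokens": c}} for c in (4, 5, 6)
] + [
    {"meta_info": {"prompt_tokens": 20, "completion_tokens": c}} for c in (7, 8, 9)
]
prompt_tokens = sum(
    ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request_n)
)  # strides over indices 0 and 3: 10 + 20 = 30, each prompt counted once
completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)  # 39
assert (prompt_tokens, completion_tokens) == (30, 39)
```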
sglang/srt/openai_api/protocol.py
CHANGED
@@ -95,6 +95,12 @@ class FileResponse(BaseModel):
     purpose: str
 
 
+class FileDeleteResponse(BaseModel):
+    id: str
+    object: str = "file"
+    deleted: bool
+
+
 class BatchRequest(BaseModel):
     input_file_id: (
         str  # The ID of an uploaded file that contains requests for the new batch
sglang/srt/server.py
CHANGED
@@ -59,6 +59,7 @@ from sglang.srt.openai_api.adapter import (
     v1_batches,
     v1_chat_completions,
     v1_completions,
+    v1_delete_file,
     v1_files_create,
     v1_retrieve_batch,
     v1_retrieve_file,
@@ -175,6 +176,12 @@ async def openai_v1_files(file: UploadFile = File(...), purpose: str = Form("bat
     )
 
 
+@app.delete("/v1/files/{file_id}")
+async def delete_file(file_id: str):
+    # https://platform.openai.com/docs/api-reference/files/delete
+    return await v1_delete_file(file_id)
+
+
 @app.post("/v1/batches")
 async def openai_v1_batches(raw_request: Request):
     return await v1_batches(tokenizer_manager, raw_request)
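Together with `v1_delete_file` in adapter.py and `FileDeleteResponse` in protocol.py, this route completes an OpenAI-style file-deletion flow. A hedged usage sketch, assuming a server already running on localhost:30000 and that the upload endpoint returns the `FileResponse` JSON (including an `id` field); the filename is illustrative:

```python
import requests

BASE = "http://localhost:30000"

# Upload a batch input file first (multipart form, as the files route expects).
with open("batch_input.jsonl", "rb") as f:
    created = requests.post(
        f"{BASE}/v1/files", files={"file": f}, data={"purpose": "batch"}
    ).json()

# Delete it again through the new route, mirroring OpenAI's files/delete API.
deleted = requests.delete(f"{BASE}/v1/files/{created['id']}").json()
print(deleted)  # expected shape: {"id": ..., "object": "file", "deleted": true}
```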
@@ -367,14 +374,24 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
         headers["Authorization"] = f"Bearer {server_args.api_key}"
 
     # Wait until the server is launched
+    success = False
     for _ in range(120):
         time.sleep(1)
         try:
-            requests.get(url + "/get_model_info", timeout=5, headers=headers)
+            res = requests.get(url + "/get_model_info", timeout=5, headers=headers)
+            assert res.status_code == 200, f"{res}"
+            success = True
             break
-        except requests.exceptions.RequestException:
+        except (AssertionError, requests.exceptions.RequestException) as e:
+            last_traceback = get_exception_traceback()
             pass
 
+    if not success:
+        if pipe_finish_writer is not None:
+            pipe_finish_writer.send(last_traceback)
+        print(f"Initialization failed. warmup error: {last_traceback}", flush=True)
+        sys.exit(1)
+
     # Send a warmup request
     try:
         for _ in range(server_args.dp_size):
@@ -390,12 +407,13 @@
                 headers=headers,
                 timeout=600,
             )
-            assert res.status_code == 200
+            assert res.status_code == 200, f"{res}"
     except Exception as e:
+        last_traceback = get_exception_traceback()
         if pipe_finish_writer is not None:
-            pipe_finish_writer.send(
-            print(f"Initialization failed. warmup error: {
-
+            pipe_finish_writer.send(last_traceback)
+        print(f"Initialization failed. warmup error: {last_traceback}", flush=True)
+        sys.exit(1)
 
     logger.info("The server is fired up and ready to roll!")
     if pipe_finish_writer is not None:
sglang/srt/server_args.py
CHANGED
@@ -264,6 +264,7 @@ class ServerArgs:
         help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
     )
     parser.add_argument(
+        "--tensor-parallel-size",
         "--tp-size",
         type=int,
         default=ServerArgs.tp_size,
@@ -318,6 +319,7 @@ class ServerArgs:
 
     # Data parallelism
     parser.add_argument(
+        "--data-parallel-size",
         "--dp-size",
         type=int,
         default=ServerArgs.dp_size,
@@ -413,6 +415,8 @@ class ServerArgs:
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
+        args.tp_size = args.tensor_parallel_size
+        args.dp_size = args.data_parallel_size
         attrs = [attr.name for attr in dataclasses.fields(cls)]
         return cls(**{attr: getattr(args, attr) for attr in attrs})
 
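The three hunks above add long-form aliases without renaming the dataclass fields. When `argparse` receives two option strings, it registers one option whose `dest` derives from the first long flag (`tensor_parallel_size`, `data_parallel_size`), which is why `from_cli_args` copies the parsed values back onto `tp_size` and `dp_size` before instantiating the dataclass. A standalone sketch of that argparse behavior (the flag names come from the diff; the parser itself is illustrative):

```python
import argparse

# Both spellings feed a single destination; dest comes from the first option string.
parser = argparse.ArgumentParser()
parser.add_argument("--tensor-parallel-size", "--tp-size", type=int, default=1)

args = parser.parse_args(["--tp-size", "2"])
print(args.tensor_parallel_size)  # 2: the short alias sets the same attribute,
# hence from_cli_args copies it onto the tp_size field the dataclass expects.
```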
sglang/test/test_utils.py
CHANGED
@@ -18,7 +18,7 @@ from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import get_exception_traceback
 
-
+DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 
 
 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.10"
+__version__ = "0.2.11"
{sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.10
+Version: 0.2.11
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -221,6 +221,9 @@ Requires-Dist: sglang[anthropic]; extra == "all"
 Requires-Dist: sglang[litellm]; extra == "all"
 Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
+Provides-Extra: dev
+Requires-Dist: sglang[all]; extra == "dev"
+Requires-Dist: sglang[test]; extra == "dev"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: openai
@@ -232,7 +235,6 @@ Requires-Dist: fastapi; extra == "srt"
 Requires-Dist: hf-transfer; extra == "srt"
 Requires-Dist: huggingface-hub; extra == "srt"
 Requires-Dist: interegular; extra == "srt"
-Requires-Dist: jsonlines; extra == "srt"
 Requires-Dist: packaging; extra == "srt"
 Requires-Dist: pillow; extra == "srt"
 Requires-Dist: psutil; extra == "srt"
@@ -242,8 +244,12 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.5.
+Requires-Dist: vllm==0.5.4; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
+Provides-Extra: test
+Requires-Dist: jsonlines; extra == "test"
+Requires-Dist: matplotlib; extra == "test"
+Requires-Dist: pandas; extra == "test"
 
 <div align="center">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -296,20 +302,20 @@ pip install --upgrade pip
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.10 https://github.com/sgl-project/sglang.git
+git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 3: Using docker
@@ -383,7 +389,7 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
 - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
@@ -394,10 +400,14 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
 ```
-- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9
+- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
+- If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
+```
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
+```
 - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
@@ -411,22 +421,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 
-### Run Llama 3.1 405B
-
-```bash
-## Run 405B (fp8) on a single node
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
-## Run 405B (fp16) on two nodes
-# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
-# on the second
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
-```
-
 ### Supported Models
 
 - Llama / Llama 2 / Llama 3 / Llama 3.1
@@ -452,6 +446,22 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
+### Run Llama 3.1 405B
+
+```bash
+## Run 405B (fp8) on a single node
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+## Run 405B (fp16) on two nodes
+# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+# on the first node
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+# on the second
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+```
+
 ### Benchmark Performance
 
 - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
{sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/RECORD
CHANGED
@@ -1,33 +1,34 @@
-sglang/__init__.py,sha256=
-sglang/api.py,sha256=
-sglang/bench_latency.py,sha256=
+sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
+sglang/api.py,sha256=gAY9JhqWXjrYoWnMvR-iiuuY1YSN94We-lc1LH0z3cw,6030
+sglang/bench_latency.py,sha256=CXvukEW0IeoH2IwN2vuriC0eHBdJsz3lgT7OwwNo_7A,16146
 sglang/bench_serving.py,sha256=M0YQT6xElpkx-FtmyUe6lhX1DZfVLGh54qd6qfFYquc,34801
-sglang/check_env.py,sha256=
-sglang/global_config.py,sha256=
+sglang/check_env.py,sha256=oU8VmjjPK2SviRhr41cF1953soBu-eTT5E0Hf04zMzo,4974
+sglang/global_config.py,sha256=9JxaFkBKSgep6BVeEl_kx9tuW9PqdijYELyBGTryl6o,1704
 sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
 sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
 sglang/utils.py,sha256=C50xm06WWKpKB8kSNs9vO4egJ2QTk_OAA6M13S2cB_A,8369
-sglang/version.py,sha256=
+sglang/version.py,sha256=_MLx4ac1juJPWEEiC9kMQISX3x3jFBr507jM2P_hxMg,23
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
+sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
 sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
-sglang/lang/interpreter.py,sha256=
-sglang/lang/ir.py,sha256=
+sglang/lang/interpreter.py,sha256=3RIeSGdKlKTq2Ixg_Tyo0fGEDTvBKS2f9FaJYODBHzA,30102
+sglang/lang/ir.py,sha256=FGWghAfVW9IcxcrVqHiqpf7vmWzuNYoVTMSbBZkYVRk,16839
 sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
 sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
-sglang/lang/backend/base_backend.py,sha256=
+sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
 sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
-sglang/lang/backend/openai.py,sha256=
-sglang/lang/backend/runtime_endpoint.py,sha256=
+sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
+sglang/lang/backend/runtime_endpoint.py,sha256=AaBc5yczchX7mkwiKDMyjLjBkJsh2Lubrfd9lvCOlDo,9544
 sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
 sglang/srt/conversation.py,sha256=V5YuoeO6-aLqGv0p3J2qx8TnBJbN1oTopYFutNul3GQ,16491
 sglang/srt/hf_transformers_utils.py,sha256=Tf_RplcW7llVXsigRvSGqmeAUxBeAL8rPCkzuqWfZ8U,11925
 sglang/srt/mm_utils.py,sha256=n7_GmbOM_0IWVXovpM34rKIBw0Py9yb_NXSQw27u4OA,9454
 sglang/srt/model_config.py,sha256=k4OfRV-szWkFaJMIC40JoJGJ75AfYQ2hf4M1dS1aQ-o,6366
 sglang/srt/sampling_params.py,sha256=uZFDlTUPnNR5_3IDH-INDeN-tm6LlRkC2KT-B3njxJs,3687
-sglang/srt/server.py,sha256=
-sglang/srt/server_args.py,sha256=
+sglang/srt/server.py,sha256=hUNnTvH4c1AI2JJzoBUf9TQuTelx-vulcqwkEplw7Gk,16699
+sglang/srt/server_args.py,sha256=SmvnebtDTsvPNDyW6lltuJKC7h8eVdYmurY1ieIMySA,16475
 sglang/srt/utils.py,sha256=GcRFf3pb5l-Q5TJU4gF-Wp7Ct46l3BO0aMpjlyHXp3I,23766
 sglang/srt/constrained/__init__.py,sha256=NLpZGj9RIx83ejDrM_pfaRtqGgaPq_ggJszPQENUJ2E,2037
 sglang/srt/constrained/base_tool_cache.py,sha256=1_m-AivPtWRwUgGiEZBafCrSFUGahK4UM4vgAd8TkMg,2004
@@ -37,8 +38,8 @@ sglang/srt/layers/context_flashattention_nopad.py,sha256=r_TpHuYAVgq1pN81PiWe1be
 sglang/srt/layers/extend_attention.py,sha256=V5pm7toSDlzByaV4lGRgXVGWFUPf68chvvahlT2h4mk,14092
 sglang/srt/layers/fused_moe.py,sha256=KmyXwau2OOZpQimGIQrHptzGNs1trIud5AKEEKXdzPU,20823
 sglang/srt/layers/linear.py,sha256=3Se2FRXyqXcd-uvNx2b7s-jolsUTEVeYBMYHmV82wPw,34518
-sglang/srt/layers/logits_processor.py,sha256=
-sglang/srt/layers/radix_attention.py,sha256=
+sglang/srt/layers/logits_processor.py,sha256=wHKB1FjbfY0a7KGw5dCsEhmO4sc7VMy3gYtSPv4oQYM,11097
+sglang/srt/layers/radix_attention.py,sha256=lXwm-qs7hPy_EFV1Zf2pPQ0-drAdrO8V5J4eX0LwLtU,7505
 sglang/srt/layers/token_attention.py,sha256=pdBORaWQGvDy_Aitcq0XDHk2Rravol-jZZkrsgkXeng,8849
 sglang/srt/layers/quantization/__init__.py,sha256=JMlgE-FWS759lfQ9Uc6mGFqBbTFLlvKeVEFpZLATe14,2536
 sglang/srt/layers/quantization/fp8.py,sha256=GQOLeGbrcUfwO-7oClzDda0RXGPHR70ZXUHArZsa174,25511
@@ -47,43 +48,44 @@ sglang/srt/managers/controller_single.py,sha256=CdQ9_XPZdcWF5jArDmVR8K-WZ9_8Gpgk
 sglang/srt/managers/detokenizer_manager.py,sha256=GXWdW4n2N-otL3zcgdr0t1PcEe2EmQJA8AElntiNV1o,5606
 sglang/srt/managers/io_struct.py,sha256=VK61d6zfnBz5a3IMmwYsa5PNa9jUXPPmED1TdDRQGDs,7345
 sglang/srt/managers/policy_scheduler.py,sha256=ajSB-gCC6VJkXvnKU8FYU3Kgcigozp2pMTwF84Wp14o,3138
-sglang/srt/managers/schedule_batch.py,sha256=
-sglang/srt/managers/tokenizer_manager.py,sha256=
-sglang/srt/managers/tp_worker.py,sha256=
+sglang/srt/managers/schedule_batch.py,sha256=sKQAHRL6VoapGiO7yQV796gW4sVGAgVVBMtmENbKtvg,29641
+sglang/srt/managers/tokenizer_manager.py,sha256=wqb6zQbkHYcSNU14Auuh5519CVMmfbKGBQvn_IwDSAo,21408
+sglang/srt/managers/tp_worker.py,sha256=3sHlN4hxksF22lkOJ8i3X6WSH4_5POy74BfbIAzIDtM,35216
 sglang/srt/mem_cache/base_cache.py,sha256=czyN8IumXcMQskYOZDV3DzjfD4kdR-qwLVxceDqnOmE,788
 sglang/srt/mem_cache/chunk_cache.py,sha256=u1mkGoTI7_31H0i0mhKT7S57StYSsdmsSPqyGubE7lY,1560
 sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
-sglang/srt/mem_cache/memory_pool.py,sha256=
+sglang/srt/mem_cache/memory_pool.py,sha256=oOKtPTgzujo9gHXykSuER7VKqQRuwNKlXyXlaK-3dxo,5280
 sglang/srt/mem_cache/radix_cache.py,sha256=pa5RD4xNKPSuvL55BnC4mimoca5oJRXr4Rg91-sbTcs,8881
-sglang/srt/model_executor/cuda_graph_runner.py,sha256=
-sglang/srt/model_executor/
+sglang/srt/model_executor/cuda_graph_runner.py,sha256=EyI8sMMoVlOjdTT2Y3cfwo1-uQ43QCQ1skx5BNgchjE,9433
+sglang/srt/model_executor/forward_batch_info.py,sha256=P5bGeLsnFbEqgWLI5X5Eg0XFCG1j2oWZOsIAMZNkZW4,9022
+sglang/srt/model_executor/model_runner.py,sha256=yzkJLIM41mhbfgfq87ToskAaA1PS67YzhmoSMbflkZI,17479
 sglang/srt/model_loader/model_loader.py,sha256=QmZUhHh1nmWrfYlunfnxMcTsIvip1l6aMIlrXoCED4I,10697
 sglang/srt/model_loader/utils.py,sha256=0AoWXX9uV5rKRYXJ4HduSnvdeerytI4ONCLCH6X4XFQ,10675
-sglang/srt/models/chatglm.py,sha256=
-sglang/srt/models/commandr.py,sha256=
-sglang/srt/models/dbrx.py,sha256=
-sglang/srt/models/deepseek.py,sha256=
-sglang/srt/models/deepseek_v2.py,sha256=
-sglang/srt/models/gemma.py,sha256=
-sglang/srt/models/gemma2.py,sha256=
-sglang/srt/models/gpt_bigcode.py,sha256=
-sglang/srt/models/grok.py,sha256=
-sglang/srt/models/internlm2.py,sha256=
-sglang/srt/models/llama2.py,sha256=
-sglang/srt/models/llama_classification.py,sha256=
-sglang/srt/models/llava.py,sha256
-sglang/srt/models/llavavid.py,sha256
-sglang/srt/models/minicpm.py,sha256=
+sglang/srt/models/chatglm.py,sha256=7bHU2AFoppINDZm0EdxgtAJe7rwr9OPkhOCfq2qNrIA,13862
+sglang/srt/models/commandr.py,sha256=5BEtIS2uUQJANkkY-6ZeDqlrpUK5yXVYHiztU3vsTKY,14172
+sglang/srt/models/dbrx.py,sha256=N_0Ku_p1NCsc29NktUBNqPv7Z33XhYxOZK5xN7nzW4s,14661
+sglang/srt/models/deepseek.py,sha256=E5W4nkH-Ne449rAIwQZgz-FAH2Qqp2r1vNfboyk5wEg,16024
+sglang/srt/models/deepseek_v2.py,sha256=NMcckZb48kVUwAmDA2l8wO19T6DNkJOkKAhHa6utBZM,26968
+sglang/srt/models/gemma.py,sha256=ilfN_NOcz7hpwEJ2y7NW3fBFmFO7YfjhdFDbfzl2qww,12285
+sglang/srt/models/gemma2.py,sha256=D8GZOI1tAbEV9PaBmJSsJRzCmvaK3tGXttIbrMb5yiQ,16426
+sglang/srt/models/gpt_bigcode.py,sha256=OKk9UP67as3T5bePlTRGHTCD-1wqaUEk92AowXPm6dg,10204
+sglang/srt/models/grok.py,sha256=M9rtdXslqYBle5VyZqFVHiJUXq_q_aHbza63xa03zqI,27861
+sglang/srt/models/internlm2.py,sha256=CKWBL0dBvLdaEUeJOUvLUNPb8BLrAZ8_BSf2mfFQhfU,12225
+sglang/srt/models/llama2.py,sha256=3ZEWi0PVCDNjTrVNvLs1ESdyTcZhJlZjaH5uyS46JyM,14288
+sglang/srt/models/llama_classification.py,sha256=Dvzy3PfETiJtnKFOk8qDDLUoZECf_cpSrNeA60PaDo4,4932
+sglang/srt/models/llava.py,sha256=-ysi192vpBDxNaMS8qaLOhC34lXQyRtbG_0niVaceSo,18436
+sglang/srt/models/llavavid.py,sha256=MX7YpqYh5J4BoOnV7vVAIfoOlBFQXYpp8Kpe7WK0ejk,13562
+sglang/srt/models/minicpm.py,sha256=ea_OyiwVTo6Tg9jNRAwqxETnA6FFeAqlIbiUS-xViEI,13843
 sglang/srt/models/mistral.py,sha256=jlrWBVNXbAUziAaIdHAjFcOJnKtn9Bl8rBd65ypJM-I,819
-sglang/srt/models/mixtral.py,sha256=
-sglang/srt/models/mixtral_quant.py,sha256=
-sglang/srt/models/qwen.py,sha256=
-sglang/srt/models/qwen2.py,sha256=
-sglang/srt/models/qwen2_moe.py,sha256=
-sglang/srt/models/stablelm.py,sha256=
+sglang/srt/models/mixtral.py,sha256=raSLbp6AfWg5_u-f-lYeRejE9koAjbHt8iIHXd3nURM,21397
+sglang/srt/models/mixtral_quant.py,sha256=xYeeatZ9OfwCTas_KbH9nl6lnUT4YqSY7NAxpgLp5LE,14222
+sglang/srt/models/qwen.py,sha256=43ea6gn4wHzAaI3JTDLtl08aEm0vIqgzbVH9M8oeuY0,10006
+sglang/srt/models/qwen2.py,sha256=Hyhks2r4KHpKeb9iHZpnvEVc5klmnrPwcLohqg8j1kw,12284
+sglang/srt/models/qwen2_moe.py,sha256=PZdhEf0DUuGWsld3TyDWlIqSbrrOdqvCD4lAtCPWXeg,18147
+sglang/srt/models/stablelm.py,sha256=yPrdzPEoUD2s_Q3RgOq7BBC7z-UtEaACzabqbDRs2tA,11368
 sglang/srt/models/yivl.py,sha256=p4s_D_m4H2exP4b91Y-CTkq8T-eIG3DJsFy9pB0e7TM,4932
-sglang/srt/openai_api/adapter.py,sha256=
-sglang/srt/openai_api/protocol.py,sha256=
+sglang/srt/openai_api/adapter.py,sha256=Eq44_hGwHcglCKOc6WqWDxBsgyRqtuC6VR4HB4GLfUY,38193
+sglang/srt/openai_api/protocol.py,sha256=pcRgmDM3Kozh74Aj-qEo8q64BI6hEjrdhYDU4m9srdI,8294
 sglang/test/run_eval.py,sha256=kbM6SiosfXj-1uYTFXPWMd7hZDvJZwV-AmdHi_WfP3A,3559
 sglang/test/runners.py,sha256=APXXbrqmUGUqnX7T1Aq8X2NJQkIqtv6B42a2ybdlPjA,7459
 sglang/test/simple_eval_common.py,sha256=HL1bfgkTAKP7sk-kShg73WTeADhuBD6xSsuLbV_9C3s,12359
@@ -92,9 +94,9 @@ sglang/test/simple_eval_humaneval.py,sha256=k50DKoAbXiw-ubrFXHet9B-7tboHU2dQJf5G
 sglang/test/simple_eval_math.py,sha256=EQblQmtUt-kl558drzhP7c6KhpDNgr1EJhhKx5eeHM4,2519
 sglang/test/simple_eval_mmlu.py,sha256=KqSSdSu2qfoKQ870ttxev1NJ7c90xv2mvKOQsSODtAw,4326
 sglang/test/test_programs.py,sha256=e9_ifoIvuI1Ctkbkz3wfdZLBBSRikby8ywcodBIkf9M,13826
-sglang/test/test_utils.py,sha256=
-sglang-0.2.
-sglang-0.2.
-sglang-0.2.
-sglang-0.2.
-sglang-0.2.
+sglang/test/test_utils.py,sha256=ITQcY3WGV4kLGWEkfU-AeuFX8yGLmq9LEK5jHiuW7Sw,13991
+sglang-0.2.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.2.11.dist-info/METADATA,sha256=gSQA5-Hf9y41ulOKiMeHRu4Nf-c9Nbt6xhmlCGzvhNY,33783
+sglang-0.2.11.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+sglang-0.2.11.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.2.11.dist-info/RECORD,,
{sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/LICENSE
File without changes
{sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/WHEEL
File without changes
{sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/top_level.txt
File without changes