sglang 0.2.10__py3-none-any.whl → 0.2.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. sglang/__init__.py +8 -0
  2. sglang/api.py +10 -2
  3. sglang/bench_latency.py +145 -36
  4. sglang/check_env.py +24 -2
  5. sglang/global_config.py +0 -1
  6. sglang/lang/backend/base_backend.py +3 -1
  7. sglang/lang/backend/openai.py +8 -3
  8. sglang/lang/backend/runtime_endpoint.py +46 -29
  9. sglang/lang/choices.py +164 -0
  10. sglang/lang/interpreter.py +6 -13
  11. sglang/lang/ir.py +11 -2
  12. sglang/srt/layers/logits_processor.py +1 -1
  13. sglang/srt/layers/radix_attention.py +2 -5
  14. sglang/srt/managers/schedule_batch.py +95 -324
  15. sglang/srt/managers/tokenizer_manager.py +6 -3
  16. sglang/srt/managers/tp_worker.py +20 -22
  17. sglang/srt/mem_cache/memory_pool.py +9 -14
  18. sglang/srt/model_executor/cuda_graph_runner.py +3 -3
  19. sglang/srt/model_executor/forward_batch_info.py +256 -0
  20. sglang/srt/model_executor/model_runner.py +6 -10
  21. sglang/srt/models/chatglm.py +1 -1
  22. sglang/srt/models/commandr.py +1 -1
  23. sglang/srt/models/dbrx.py +1 -1
  24. sglang/srt/models/deepseek.py +1 -1
  25. sglang/srt/models/deepseek_v2.py +1 -1
  26. sglang/srt/models/gemma.py +1 -1
  27. sglang/srt/models/gemma2.py +1 -1
  28. sglang/srt/models/gpt_bigcode.py +1 -1
  29. sglang/srt/models/grok.py +1 -1
  30. sglang/srt/models/internlm2.py +1 -1
  31. sglang/srt/models/llama2.py +1 -1
  32. sglang/srt/models/llama_classification.py +1 -1
  33. sglang/srt/models/llava.py +1 -2
  34. sglang/srt/models/llavavid.py +1 -2
  35. sglang/srt/models/minicpm.py +1 -1
  36. sglang/srt/models/mixtral.py +1 -1
  37. sglang/srt/models/mixtral_quant.py +1 -1
  38. sglang/srt/models/qwen.py +1 -1
  39. sglang/srt/models/qwen2.py +1 -1
  40. sglang/srt/models/qwen2_moe.py +1 -1
  41. sglang/srt/models/stablelm.py +1 -1
  42. sglang/srt/openai_api/adapter.py +34 -12
  43. sglang/srt/openai_api/protocol.py +6 -0
  44. sglang/srt/server.py +24 -6
  45. sglang/srt/server_args.py +4 -0
  46. sglang/test/test_utils.py +1 -1
  47. sglang/version.py +1 -1
  48. {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/METADATA +34 -24
  49. {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/RECORD +52 -50
  50. {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/LICENSE +0 -0
  51. {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/WHEEL +0 -0
  52. {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/top_level.txt +0 -0
@@ -26,13 +26,12 @@ from vllm.config import CacheConfig
  from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
- from sglang.srt.managers.schedule_batch import ForwardMode
  from sglang.srt.mm_utils import (
      get_anyres_image_grid_shape,
      unpad_image,
      unpad_image_shape,
  )
- from sglang.srt.model_executor.model_runner import InputMetadata
+ from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
  from sglang.srt.models.llama2 import LlamaForCausalLM
 
 
@@ -39,7 +39,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
- from sglang.srt.model_executor.model_runner import InputMetadata
+ from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
  class MiniCPMMLP(nn.Module):
@@ -50,7 +50,7 @@ from vllm.utils import print_warning_once
 
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
- from sglang.srt.model_executor.model_runner import InputMetadata
+ from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
  class MixtralMoE(nn.Module):
@@ -45,7 +45,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
- from sglang.srt.model_executor.model_runner import InputMetadata
+ from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
  class MixtralMLP(nn.Module):
sglang/srt/models/qwen.py CHANGED
@@ -39,7 +39,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
- from sglang.srt.model_executor.model_runner import InputMetadata
+ from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
  class QWenMLP(nn.Module):
@@ -39,7 +39,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
- from sglang.srt.model_executor.model_runner import InputMetadata
+ from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
  Qwen2Config = None
 
@@ -51,7 +51,7 @@ from vllm.sequence import IntermediateTensors, SamplerOutput
 
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
- from sglang.srt.model_executor.model_runner import InputMetadata
+ from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
  class Qwen2MoeMLP(nn.Module):
@@ -40,7 +40,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
- from sglang.srt.model_executor.model_runner import InputMetadata
+ from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
  class StablelmMLP(nn.Module):
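
The one-line change repeated across the model files above comes from a refactor that moves `ForwardMode` and `InputMetadata` out of `schedule_batch`/`model_runner` into the new `sglang/srt/model_executor/forward_batch_info.py` module. Below is a minimal sketch of how a model forward pass consumes the two names after the move; only the import path and the names come from this diff, the surrounding class is a hypothetical illustration.

```python
# Sketch only: the import path is the one introduced in 0.2.11; the attention
# module below is a hypothetical stand-in, not code from the sglang package.
import torch
from torch import nn

from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata


class SketchAttention(nn.Module):
    def forward(self, hidden_states: torch.Tensor, input_metadata: InputMetadata):
        # InputMetadata now lives in forward_batch_info and carries the
        # per-batch forward mode alongside the KV-cache bookkeeping.
        if input_metadata.forward_mode == ForwardMode.DECODE:
            ...  # single-token decode path
        else:
            ...  # prefill / extend path
        return hidden_states
```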
@@ -53,6 +53,7 @@ from sglang.srt.openai_api.protocol import (
      CompletionStreamResponse,
      DeltaMessage,
      ErrorResponse,
+     FileDeleteResponse,
      FileRequest,
      FileResponse,
      LogProbs,
@@ -174,6 +175,20 @@ async def v1_files_create(file: UploadFile, purpose: str, file_storage_pth: str
          return {"error": "Invalid input", "details": e.errors()}
 
 
+ async def v1_delete_file(file_id: str):
+     # Retrieve the file job from the in-memory storage
+     file_response = file_id_response.get(file_id)
+     if file_response is None:
+         raise HTTPException(status_code=404, detail="File not found")
+     file_path = file_id_storage.get(file_id)
+     if file_path is None:
+         raise HTTPException(status_code=404, detail="File not found")
+     os.remove(file_path)
+     del file_id_response[file_id]
+     del file_id_storage[file_id]
+     return FileDeleteResponse(id=file_id, deleted=True)
+
+
  async def v1_batches(tokenizer_manager, raw_request: Request):
      try:
          body = await raw_request.json()
@@ -287,6 +302,13 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
          retrieve_batch = batch_storage[batch_id]
          retrieve_batch.output_file_id = output_file_id
          file_id_storage[output_file_id] = output_file_path
+         file_id_response[output_file_id] = FileResponse(
+             id=output_file_id,
+             bytes=os.path.getsize(output_file_path),
+             created_at=int(time.time()),
+             filename=f"{output_file_id}.jsonl",
+             purpose="batch_result",
+         )
          # Update batch status to "completed"
          retrieve_batch.status = "completed"
          retrieve_batch.completed_at = int(time.time())
@@ -380,7 +402,7 @@ def v1_generate_request(all_requests):
          else:
              prompt_kwargs = {"input_ids": prompt}
      else:
-         if isinstance(prompts[0], str) or isinstance(propmt[0][0], str):
+         if isinstance(prompts[0], str):
              prompt_kwargs = {"text": prompts}
          else:
              prompt_kwargs = {"input_ids": prompts}
@@ -500,7 +522,9 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
          responses.append(response)
          return responses
      else:
-         prompt_tokens = sum(item["meta_info"]["prompt_tokens"] for item in ret)
+         prompt_tokens = sum(
+             ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request.n)
+         )
          completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)
          response = CompletionResponse(
              id=ret[0]["meta_info"]["id"],
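
The rewritten `prompt_tokens` sum above fixes double counting when a request asks for `n > 1` completions: `ret` then contains `n` entries per prompt, each repeating the same `prompt_tokens` value, so only every `n`-th entry should contribute, while `completion_tokens` is still summed over all entries. A small self-contained illustration of the counting logic (the sample `ret` data is invented for the example):

```python
# Hypothetical return list for one request with n=2 completions of a 5-token prompt.
request_n = 2
ret = [
    {"meta_info": {"prompt_tokens": 5, "completion_tokens": 7}},
    {"meta_info": {"prompt_tokens": 5, "completion_tokens": 9}},
]

# Old behavior: counts the shared prompt once per completion.
old_prompt_tokens = sum(item["meta_info"]["prompt_tokens"] for item in ret)  # 10

# New behavior: counts the prompt once per request by striding over n.
new_prompt_tokens = sum(
    ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request_n)
)  # 5

completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)  # 16
print(old_prompt_tokens, new_prompt_tokens, completion_tokens)
```

The same stride-`n` summation is applied to the chat completions usage accounting in the hunks that follow.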
@@ -707,8 +731,6 @@ def v1_chat_generate_request(all_requests, tokenizer_manager):
 
  def v1_chat_generate_response(request, ret, to_file=False):
      choices = []
-     total_prompt_tokens = 0
-     total_completion_tokens = 0
 
      for idx, ret_item in enumerate(ret):
          logprobs = False
@@ -747,8 +769,6 @@ def v1_chat_generate_response(request, ret, to_file=False):
              choice_logprobs = ChoiceLogprobs(content=token_logprobs)
          else:
              choice_logprobs = None
-         prompt_tokens = ret_item["meta_info"]["prompt_tokens"]
-         completion_tokens = ret_item["meta_info"]["completion_tokens"]
 
          if to_file:
              # to make the choice data json serializable
@@ -767,8 +787,7 @@ def v1_chat_generate_response(request, ret, to_file=False):
          )
 
          choices.append(choice_data)
-         total_prompt_tokens += prompt_tokens
-         total_completion_tokens += completion_tokens
+
      if to_file:
          responses = []
 
@@ -795,14 +814,18 @@ def v1_chat_generate_response(request, ret, to_file=False):
          responses.append(response)
          return responses
      else:
+         prompt_tokens = sum(
+             ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request.n)
+         )
+         completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)
          response = ChatCompletionResponse(
              id=ret[0]["meta_info"]["id"],
              model=request.model,
              choices=choices,
              usage=UsageInfo(
-                 prompt_tokens=total_prompt_tokens,
-                 completion_tokens=total_completion_tokens,
-                 total_tokens=total_prompt_tokens + total_completion_tokens,
+                 prompt_tokens=prompt_tokens,
+                 completion_tokens=completion_tokens,
+                 total_tokens=prompt_tokens + completion_tokens,
              ),
          )
          return response
@@ -930,7 +953,6 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
          ).__anext__()
      except ValueError as e:
          return create_error_response(str(e))
-
      if not isinstance(ret, list):
          ret = [ret]
 
@@ -95,6 +95,12 @@ class FileResponse(BaseModel):
      purpose: str
 
 
+ class FileDeleteResponse(BaseModel):
+     id: str
+     object: str = "file"
+     deleted: bool
+
+
  class BatchRequest(BaseModel):
      input_file_id: (
          str  # The ID of an uploaded file that contains requests for the new batch
sglang/srt/server.py CHANGED
@@ -59,6 +59,7 @@ from sglang.srt.openai_api.adapter import (
      v1_batches,
      v1_chat_completions,
      v1_completions,
+     v1_delete_file,
      v1_files_create,
      v1_retrieve_batch,
      v1_retrieve_file,
@@ -175,6 +176,12 @@ async def openai_v1_files(file: UploadFile = File(...), purpose: str = Form("bat
      )
 
 
+ @app.delete("/v1/files/{file_id}")
+ async def delete_file(file_id: str):
+     # https://platform.openai.com/docs/api-reference/files/delete
+     return await v1_delete_file(file_id)
+
+
  @app.post("/v1/batches")
  async def openai_v1_batches(raw_request: Request):
      return await v1_batches(tokenizer_manager, raw_request)
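
With the route registered, an uploaded file can be removed over HTTP in the same shape as the OpenAI Files API. A minimal client-side sketch using `requests`; the base URL and the input file name are placeholders, and the response fields mirror the new `FileDeleteResponse` model:

```python
import requests

base_url = "http://localhost:30000"  # placeholder: wherever the sglang server listens

# Upload a batch input file first (multipart form, as the files endpoint expects).
with open("batch_input.jsonl", "rb") as f:
    created = requests.post(
        f"{base_url}/v1/files",
        files={"file": f},
        data={"purpose": "batch"},
    ).json()

# Delete it again via the endpoint added in this release.
deleted = requests.delete(f"{base_url}/v1/files/{created['id']}").json()
print(deleted)  # e.g. {"id": "...", "object": "file", "deleted": true}
```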
@@ -367,14 +374,24 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
          headers["Authorization"] = f"Bearer {server_args.api_key}"
 
      # Wait until the server is launched
+     success = False
      for _ in range(120):
          time.sleep(1)
          try:
-             requests.get(url + "/get_model_info", timeout=5, headers=headers)
+             res = requests.get(url + "/get_model_info", timeout=5, headers=headers)
+             assert res.status_code == 200, f"{res}"
+             success = True
              break
-         except requests.exceptions.RequestException:
+         except (AssertionError, requests.exceptions.RequestException) as e:
+             last_traceback = get_exception_traceback()
              pass
 
+     if not success:
+         if pipe_finish_writer is not None:
+             pipe_finish_writer.send(last_traceback)
+         print(f"Initialization failed. warmup error: {last_traceback}", flush=True)
+         sys.exit(1)
+
      # Send a warmup request
      try:
          for _ in range(server_args.dp_size):
@@ -390,12 +407,13 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
                  headers=headers,
                  timeout=600,
              )
-             assert res.status_code == 200
+             assert res.status_code == 200, f"{res}"
      except Exception as e:
+         last_traceback = get_exception_traceback()
          if pipe_finish_writer is not None:
-             pipe_finish_writer.send(get_exception_traceback())
-         print(f"Initialization failed. warmup error: {e}", flush=True)
-         raise e
+             pipe_finish_writer.send(last_traceback)
+         print(f"Initialization failed. warmup error: {last_traceback}", flush=True)
+         sys.exit(1)
 
      logger.info("The server is fired up and ready to roll!")
      if pipe_finish_writer is not None:
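
The warmup path now records the traceback, reports it through `pipe_finish_writer`, and exits with a non-zero status instead of raising, so the process that launched the server can surface the failure. A rough sketch of that launcher/child pattern, not sglang's exact code (the function name and messages here are invented for illustration):

```python
# Sketch of the pipe-plus-exit pattern: the child sends either a readiness
# message or a traceback string, then exits instead of raising into nowhere.
import multiprocessing as mp
import traceback


def wait_and_warmup_stub(pipe_writer):
    try:
        ...  # poll the health endpoint, then send a warmup request
        pipe_writer.send("ready")
    except Exception:
        pipe_writer.send(traceback.format_exc())
        raise SystemExit(1)


if __name__ == "__main__":
    reader, writer = mp.Pipe(duplex=False)
    proc = mp.Process(target=wait_and_warmup_stub, args=(writer,))
    proc.start()
    print(reader.recv())  # "ready" on success, a traceback string on failure
    proc.join()
```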
sglang/srt/server_args.py CHANGED
@@ -264,6 +264,7 @@ class ServerArgs:
              help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
          )
          parser.add_argument(
+             "--tensor-parallel-size",
              "--tp-size",
              type=int,
              default=ServerArgs.tp_size,
@@ -318,6 +319,7 @@ class ServerArgs:
 
          # Data parallelism
          parser.add_argument(
+             "--data-parallel-size",
              "--dp-size",
              type=int,
              default=ServerArgs.dp_size,
@@ -413,6 +415,8 @@ class ServerArgs:
 
      @classmethod
      def from_cli_args(cls, args: argparse.Namespace):
+         args.tp_size = args.tensor_parallel_size
+         args.dp_size = args.data_parallel_size
          attrs = [attr.name for attr in dataclasses.fields(cls)]
          return cls(**{attr: getattr(args, attr) for attr in attrs})
 
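The two new flags are plain aliases: argparse derives `dest` from the first long option string, so both `--tensor-parallel-size` and `--tp-size` land in `args.tensor_parallel_size` (likewise for data parallelism), which is why `from_cli_args` copies the values back onto the `tp_size`/`dp_size` dataclass fields. A standalone sketch of that mechanic, separate from the sglang parser itself:

```python
import argparse

parser = argparse.ArgumentParser()
# dest defaults to the first long option: "tensor_parallel_size" / "data_parallel_size".
parser.add_argument("--tensor-parallel-size", "--tp-size", type=int, default=1)
parser.add_argument("--data-parallel-size", "--dp-size", type=int, default=1)

args = parser.parse_args(["--tp-size", "4", "--data-parallel-size", "2"])
print(args.tensor_parallel_size, args.data_parallel_size)  # 4 2

# Mirror the parsed values onto the short field names the dataclass uses.
args.tp_size = args.tensor_parallel_size
args.dp_size = args.data_parallel_size
```
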
sglang/test/test_utils.py CHANGED
@@ -18,7 +18,7 @@ from sglang.lang.backend.openai import OpenAI
  from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
  from sglang.utils import get_exception_traceback
 
- MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+ DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 
 
  def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.10"
+ __version__ = "0.2.11"
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.10
+ Version: 0.2.11
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
                          Version 2.0, January 2004
@@ -221,6 +221,9 @@ Requires-Dist: sglang[anthropic]; extra == "all"
  Requires-Dist: sglang[litellm]; extra == "all"
  Provides-Extra: anthropic
  Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
+ Provides-Extra: dev
+ Requires-Dist: sglang[all]; extra == "dev"
+ Requires-Dist: sglang[test]; extra == "dev"
  Provides-Extra: litellm
  Requires-Dist: litellm>=1.0.0; extra == "litellm"
  Provides-Extra: openai
@@ -232,7 +235,6 @@ Requires-Dist: fastapi; extra == "srt"
  Requires-Dist: hf-transfer; extra == "srt"
  Requires-Dist: huggingface-hub; extra == "srt"
  Requires-Dist: interegular; extra == "srt"
- Requires-Dist: jsonlines; extra == "srt"
  Requires-Dist: packaging; extra == "srt"
  Requires-Dist: pillow; extra == "srt"
  Requires-Dist: psutil; extra == "srt"
@@ -242,8 +244,12 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: uvicorn; extra == "srt"
  Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
- Requires-Dist: vllm==0.5.3.post1; extra == "srt"
+ Requires-Dist: vllm==0.5.4; extra == "srt"
  Requires-Dist: outlines>=0.0.44; extra == "srt"
+ Provides-Extra: test
+ Requires-Dist: jsonlines; extra == "test"
+ Requires-Dist: matplotlib; extra == "test"
+ Requires-Dist: pandas; extra == "test"
 
  <div align="center">
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -296,20 +302,20 @@ pip install --upgrade pip
  pip install "sglang[all]"
 
  # Install FlashInfer CUDA kernels
- pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ```
 
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.10 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
  cd sglang
 
  pip install --upgrade pip
  pip install -e "python[all]"
 
  # Install FlashInfer CUDA kernels
- pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ```
 
  ### Method 3: Using docker
@@ -383,7 +389,7 @@ response = client.chat.completions.create(
  print(response)
  ```
 
- It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+ It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
  ### Additional Server Arguments
  - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
@@ -394,10 +400,14 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
  ```
- - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
+ - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
+ - If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
+ ```
  - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
  - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
@@ -411,22 +421,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 
- ### Run Llama 3.1 405B
-
- ```bash
- ## Run 405B (fp8) on a single node
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
- ## Run 405B (fp16) on two nodes
- # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
- # on the first node
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
- # on the second
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
- ```
-
  ### Supported Models
 
  - Llama / Llama 2 / Llama 3 / Llama 3.1
@@ -452,6 +446,22 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 
  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
+ ### Run Llama 3.1 405B
+
+ ```bash
+ ## Run 405B (fp8) on a single node
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+ ## Run 405B (fp16) on two nodes
+ # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+ # on the first node
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+ # on the second
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+ ```
+
  ### Benchmark Performance
 
  - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
@@ -1,33 +1,34 @@
- sglang/__init__.py,sha256=ECjvAWlxIwKtUIXGchfkoCIbF-iqLjH-Q0o8xHTlVNY,1352
- sglang/api.py,sha256=s_P8BvGDCQ0PiqOapr2TLFge1NA7QmKqUx6bFQ8Q5GQ,5676
- sglang/bench_latency.py,sha256=lHk9C3XM1e-UQd6HY2qn-njr2rG5AFQ_sNVD5hcF5Vc,12162
+ sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
+ sglang/api.py,sha256=gAY9JhqWXjrYoWnMvR-iiuuY1YSN94We-lc1LH0z3cw,6030
+ sglang/bench_latency.py,sha256=CXvukEW0IeoH2IwN2vuriC0eHBdJsz3lgT7OwwNo_7A,16146
  sglang/bench_serving.py,sha256=M0YQT6xElpkx-FtmyUe6lhX1DZfVLGh54qd6qfFYquc,34801
- sglang/check_env.py,sha256=XlVou81XC20tPFVTuKDSKqDqLQJoO2QvlnReWMf-Ho4,4152
- sglang/global_config.py,sha256=CyhGL7PE-KlMcg7IHWykzImU1y4NQlpeIlh9lHA77uo,1749
+ sglang/check_env.py,sha256=oU8VmjjPK2SviRhr41cF1953soBu-eTT5E0Hf04zMzo,4974
+ sglang/global_config.py,sha256=9JxaFkBKSgep6BVeEl_kx9tuW9PqdijYELyBGTryl6o,1704
  sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
  sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
  sglang/utils.py,sha256=C50xm06WWKpKB8kSNs9vO4egJ2QTk_OAA6M13S2cB_A,8369
- sglang/version.py,sha256=waXgc7p-jgGCsUjdVfO_KjlVZblnCvrzf4A0dsBj_lg,23
+ sglang/version.py,sha256=_MLx4ac1juJPWEEiC9kMQISX3x3jFBr507jM2P_hxMg,23
  sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
+ sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
  sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
- sglang/lang/interpreter.py,sha256=_MbvYB0vweCgALklpM2DlofiCXuITCmX_fl8rPPcp5U,30340
- sglang/lang/ir.py,sha256=0r-mhA4aO-uuS97Dvkw99ERTcJXfzuV6jJQMmuCwHEg,16615
+ sglang/lang/interpreter.py,sha256=3RIeSGdKlKTq2Ixg_Tyo0fGEDTvBKS2f9FaJYODBHzA,30102
+ sglang/lang/ir.py,sha256=FGWghAfVW9IcxcrVqHiqpf7vmWzuNYoVTMSbBZkYVRk,16839
  sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
  sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
- sglang/lang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
+ sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
  sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
- sglang/lang/backend/openai.py,sha256=6ww2rwKouWgtmjaCf4hk-kXXJ6bY6n9Xnbm3UTFZvl4,14808
- sglang/lang/backend/runtime_endpoint.py,sha256=n78pyBWTCMYmDAS-0yZVFvzQYCiACz8Usj7FTDfdVKE,8763
+ sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
+ sglang/lang/backend/runtime_endpoint.py,sha256=AaBc5yczchX7mkwiKDMyjLjBkJsh2Lubrfd9lvCOlDo,9544
  sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
  sglang/srt/conversation.py,sha256=V5YuoeO6-aLqGv0p3J2qx8TnBJbN1oTopYFutNul3GQ,16491
  sglang/srt/hf_transformers_utils.py,sha256=Tf_RplcW7llVXsigRvSGqmeAUxBeAL8rPCkzuqWfZ8U,11925
  sglang/srt/mm_utils.py,sha256=n7_GmbOM_0IWVXovpM34rKIBw0Py9yb_NXSQw27u4OA,9454
  sglang/srt/model_config.py,sha256=k4OfRV-szWkFaJMIC40JoJGJ75AfYQ2hf4M1dS1aQ-o,6366
  sglang/srt/sampling_params.py,sha256=uZFDlTUPnNR5_3IDH-INDeN-tm6LlRkC2KT-B3njxJs,3687
- sglang/srt/server.py,sha256=ur_fDb-nEmlzz1mSKwWa87XFJdQM1gxFz4cahMcMatA,16028
- sglang/srt/server_args.py,sha256=oUMzSSBrJ5_g0yeBapABUv2MlhDNWEfWLdLVROgqZOU,16305
+ sglang/srt/server.py,sha256=hUNnTvH4c1AI2JJzoBUf9TQuTelx-vulcqwkEplw7Gk,16699
+ sglang/srt/server_args.py,sha256=SmvnebtDTsvPNDyW6lltuJKC7h8eVdYmurY1ieIMySA,16475
  sglang/srt/utils.py,sha256=GcRFf3pb5l-Q5TJU4gF-Wp7Ct46l3BO0aMpjlyHXp3I,23766
  sglang/srt/constrained/__init__.py,sha256=NLpZGj9RIx83ejDrM_pfaRtqGgaPq_ggJszPQENUJ2E,2037
  sglang/srt/constrained/base_tool_cache.py,sha256=1_m-AivPtWRwUgGiEZBafCrSFUGahK4UM4vgAd8TkMg,2004
@@ -37,8 +38,8 @@ sglang/srt/layers/context_flashattention_nopad.py,sha256=r_TpHuYAVgq1pN81PiWe1be
  sglang/srt/layers/extend_attention.py,sha256=V5pm7toSDlzByaV4lGRgXVGWFUPf68chvvahlT2h4mk,14092
  sglang/srt/layers/fused_moe.py,sha256=KmyXwau2OOZpQimGIQrHptzGNs1trIud5AKEEKXdzPU,20823
  sglang/srt/layers/linear.py,sha256=3Se2FRXyqXcd-uvNx2b7s-jolsUTEVeYBMYHmV82wPw,34518
- sglang/srt/layers/logits_processor.py,sha256=5Cg3h5b4H0EUeOJRst3IOMWL5dniP63A5s15BRkAMmk,11091
- sglang/srt/layers/radix_attention.py,sha256=cNSQWO74DcXgpAMKSMaHzfpy5IcLORUnWe5gOwATLrw,7466
+ sglang/srt/layers/logits_processor.py,sha256=wHKB1FjbfY0a7KGw5dCsEhmO4sc7VMy3gYtSPv4oQYM,11097
+ sglang/srt/layers/radix_attention.py,sha256=lXwm-qs7hPy_EFV1Zf2pPQ0-drAdrO8V5J4eX0LwLtU,7505
  sglang/srt/layers/token_attention.py,sha256=pdBORaWQGvDy_Aitcq0XDHk2Rravol-jZZkrsgkXeng,8849
  sglang/srt/layers/quantization/__init__.py,sha256=JMlgE-FWS759lfQ9Uc6mGFqBbTFLlvKeVEFpZLATe14,2536
  sglang/srt/layers/quantization/fp8.py,sha256=GQOLeGbrcUfwO-7oClzDda0RXGPHR70ZXUHArZsa174,25511
@@ -47,43 +48,44 @@ sglang/srt/managers/controller_single.py,sha256=CdQ9_XPZdcWF5jArDmVR8K-WZ9_8Gpgk
  sglang/srt/managers/detokenizer_manager.py,sha256=GXWdW4n2N-otL3zcgdr0t1PcEe2EmQJA8AElntiNV1o,5606
  sglang/srt/managers/io_struct.py,sha256=VK61d6zfnBz5a3IMmwYsa5PNa9jUXPPmED1TdDRQGDs,7345
  sglang/srt/managers/policy_scheduler.py,sha256=ajSB-gCC6VJkXvnKU8FYU3Kgcigozp2pMTwF84Wp14o,3138
- sglang/srt/managers/schedule_batch.py,sha256=yIjiiMcaYYN9iaEOGQZoPUpFviDptMVh9hMwRRnDAco,37896
- sglang/srt/managers/tokenizer_manager.py,sha256=kxkoAa8VbQt9FJPX1fN-7IzAD8RIcIvz3AGR8uEMYjk,21202
- sglang/srt/managers/tp_worker.py,sha256=JPLneFwcPlmPXZX1QxZHWgcdau8FC8wNuVqfCqsgOkU,35234
+ sglang/srt/managers/schedule_batch.py,sha256=sKQAHRL6VoapGiO7yQV796gW4sVGAgVVBMtmENbKtvg,29641
+ sglang/srt/managers/tokenizer_manager.py,sha256=wqb6zQbkHYcSNU14Auuh5519CVMmfbKGBQvn_IwDSAo,21408
+ sglang/srt/managers/tp_worker.py,sha256=3sHlN4hxksF22lkOJ8i3X6WSH4_5POy74BfbIAzIDtM,35216
  sglang/srt/mem_cache/base_cache.py,sha256=czyN8IumXcMQskYOZDV3DzjfD4kdR-qwLVxceDqnOmE,788
  sglang/srt/mem_cache/chunk_cache.py,sha256=u1mkGoTI7_31H0i0mhKT7S57StYSsdmsSPqyGubE7lY,1560
  sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
- sglang/srt/mem_cache/memory_pool.py,sha256=8N4eHybhtBuwIwYyeNSvrZI90LGgMG8sA3OrXdXZAZs,5496
+ sglang/srt/mem_cache/memory_pool.py,sha256=oOKtPTgzujo9gHXykSuER7VKqQRuwNKlXyXlaK-3dxo,5280
  sglang/srt/mem_cache/radix_cache.py,sha256=pa5RD4xNKPSuvL55BnC4mimoca5oJRXr4Rg91-sbTcs,8881
- sglang/srt/model_executor/cuda_graph_runner.py,sha256=OdmO6R7nHWrRJCtZOxYkt0KNdGoX7Md4knsypwPYjaQ,9365
- sglang/srt/model_executor/model_runner.py,sha256=tJHlqk_JH3RJDaPAiSljaDI951LUu9AYW679eCKMJXs,17404
+ sglang/srt/model_executor/cuda_graph_runner.py,sha256=EyI8sMMoVlOjdTT2Y3cfwo1-uQ43QCQ1skx5BNgchjE,9433
+ sglang/srt/model_executor/forward_batch_info.py,sha256=P5bGeLsnFbEqgWLI5X5Eg0XFCG1j2oWZOsIAMZNkZW4,9022
+ sglang/srt/model_executor/model_runner.py,sha256=yzkJLIM41mhbfgfq87ToskAaA1PS67YzhmoSMbflkZI,17479
  sglang/srt/model_loader/model_loader.py,sha256=QmZUhHh1nmWrfYlunfnxMcTsIvip1l6aMIlrXoCED4I,10697
  sglang/srt/model_loader/utils.py,sha256=0AoWXX9uV5rKRYXJ4HduSnvdeerytI4ONCLCH6X4XFQ,10675
- sglang/srt/models/chatglm.py,sha256=vYWooqyPmcSFZNjxj_g5I_FgHJlDytbEiz6vyv3JBNM,13856
- sglang/srt/models/commandr.py,sha256=gaTI77hgitPlcUNyxMEdGu_XZQj2DuAMnh3KbZQ9HFg,14166
- sglang/srt/models/dbrx.py,sha256=LQu7I2KH-XzY9iBlaK7IQsM1o3kzsuI1vTCspK2C19o,14655
- sglang/srt/models/deepseek.py,sha256=adr57ZX6aPOBOpmvm7YIvoqo6u0jdrKJPZ8SGcVXAh8,16014
- sglang/srt/models/deepseek_v2.py,sha256=jaVaQlL1aPCTu8nLcvtAW_rmtvHe6y2CviIOjXzh4q4,26962
- sglang/srt/models/gemma.py,sha256=PMPI1-WLuLdk6e7u6I9d_LoCkauLkWY3aOP8MFEZ-sI,12279
- sglang/srt/models/gemma2.py,sha256=kTjZcsptgtYaO8BL_NlygjVSMSloq2Mc4Rf3FKvEhbs,16420
- sglang/srt/models/gpt_bigcode.py,sha256=U7GmHKywSu12D-EwvuWv3RwHkx6bPawaRIjlFIpQkfs,10194
- sglang/srt/models/grok.py,sha256=NfZdsRVErDIUWFqjhtNf2pqC9G4cRdYHBFpgDq1IZ2A,27855
- sglang/srt/models/internlm2.py,sha256=Ld2GUxZeqqqJ2vd4QiX2s1y2AceJLA1nVnUYY88GMQk,12219
- sglang/srt/models/llama2.py,sha256=zfOk3OK1_B6s6yuXsZFmNCf07RsfytVD72GunLBt8Cc,14282
- sglang/srt/models/llama_classification.py,sha256=4r_orFZqBR3U_yC4bus1K3Z3-ADscYGSzgA82_VDN0g,4926
- sglang/srt/models/llava.py,sha256=BJphgyQGdo7uTpJcKGEfWwdpH9GTMDnyiznLSSgmvm8,18476
- sglang/srt/models/llavavid.py,sha256=-7vaVqaIfukCvMkNakEPblpwjIHC6ezrAvmpE5RzlUY,13602
- sglang/srt/models/minicpm.py,sha256=Mj-dbhfN7li7cTEP-0sV7i5PSYkMGIaYCqRU7eDc-BY,13837
+ sglang/srt/models/chatglm.py,sha256=7bHU2AFoppINDZm0EdxgtAJe7rwr9OPkhOCfq2qNrIA,13862
+ sglang/srt/models/commandr.py,sha256=5BEtIS2uUQJANkkY-6ZeDqlrpUK5yXVYHiztU3vsTKY,14172
+ sglang/srt/models/dbrx.py,sha256=N_0Ku_p1NCsc29NktUBNqPv7Z33XhYxOZK5xN7nzW4s,14661
+ sglang/srt/models/deepseek.py,sha256=E5W4nkH-Ne449rAIwQZgz-FAH2Qqp2r1vNfboyk5wEg,16024
+ sglang/srt/models/deepseek_v2.py,sha256=NMcckZb48kVUwAmDA2l8wO19T6DNkJOkKAhHa6utBZM,26968
+ sglang/srt/models/gemma.py,sha256=ilfN_NOcz7hpwEJ2y7NW3fBFmFO7YfjhdFDbfzl2qww,12285
+ sglang/srt/models/gemma2.py,sha256=D8GZOI1tAbEV9PaBmJSsJRzCmvaK3tGXttIbrMb5yiQ,16426
+ sglang/srt/models/gpt_bigcode.py,sha256=OKk9UP67as3T5bePlTRGHTCD-1wqaUEk92AowXPm6dg,10204
+ sglang/srt/models/grok.py,sha256=M9rtdXslqYBle5VyZqFVHiJUXq_q_aHbza63xa03zqI,27861
+ sglang/srt/models/internlm2.py,sha256=CKWBL0dBvLdaEUeJOUvLUNPb8BLrAZ8_BSf2mfFQhfU,12225
+ sglang/srt/models/llama2.py,sha256=3ZEWi0PVCDNjTrVNvLs1ESdyTcZhJlZjaH5uyS46JyM,14288
+ sglang/srt/models/llama_classification.py,sha256=Dvzy3PfETiJtnKFOk8qDDLUoZECf_cpSrNeA60PaDo4,4932
+ sglang/srt/models/llava.py,sha256=-ysi192vpBDxNaMS8qaLOhC34lXQyRtbG_0niVaceSo,18436
+ sglang/srt/models/llavavid.py,sha256=MX7YpqYh5J4BoOnV7vVAIfoOlBFQXYpp8Kpe7WK0ejk,13562
+ sglang/srt/models/minicpm.py,sha256=ea_OyiwVTo6Tg9jNRAwqxETnA6FFeAqlIbiUS-xViEI,13843
  sglang/srt/models/mistral.py,sha256=jlrWBVNXbAUziAaIdHAjFcOJnKtn9Bl8rBd65ypJM-I,819
- sglang/srt/models/mixtral.py,sha256=QiswCUdZ4VwMghtrr_vGP_dkzxSCrcUIcBgjlOZh_Ao,21391
- sglang/srt/models/mixtral_quant.py,sha256=I1sIdistZHw7GO35qvlteA16DGVtME5rvEVV86v0-7Y,14216
- sglang/srt/models/qwen.py,sha256=xAtlWyhMkcfwocRqzZoH01qKbkohXxAf4tnkPh0xtpM,10000
- sglang/srt/models/qwen2.py,sha256=mXlVd6UTCXY3VdgodFpQnlaY-NYLIbA-SknxdA9R13w,12278
- sglang/srt/models/qwen2_moe.py,sha256=YYdJEezic7GyW-_bXlNIaqBa0C4IHQpz_vuRBLxms4k,18141
- sglang/srt/models/stablelm.py,sha256=b3d-ZwLQoLjZ6CupnkIq7d-z9tzGSxAyIcgSmZiZxZw,11362
+ sglang/srt/models/mixtral.py,sha256=raSLbp6AfWg5_u-f-lYeRejE9koAjbHt8iIHXd3nURM,21397
+ sglang/srt/models/mixtral_quant.py,sha256=xYeeatZ9OfwCTas_KbH9nl6lnUT4YqSY7NAxpgLp5LE,14222
+ sglang/srt/models/qwen.py,sha256=43ea6gn4wHzAaI3JTDLtl08aEm0vIqgzbVH9M8oeuY0,10006
+ sglang/srt/models/qwen2.py,sha256=Hyhks2r4KHpKeb9iHZpnvEVc5klmnrPwcLohqg8j1kw,12284
+ sglang/srt/models/qwen2_moe.py,sha256=PZdhEf0DUuGWsld3TyDWlIqSbrrOdqvCD4lAtCPWXeg,18147
+ sglang/srt/models/stablelm.py,sha256=yPrdzPEoUD2s_Q3RgOq7BBC7z-UtEaACzabqbDRs2tA,11368
  sglang/srt/models/yivl.py,sha256=p4s_D_m4H2exP4b91Y-CTkq8T-eIG3DJsFy9pB0e7TM,4932
- sglang/srt/openai_api/adapter.py,sha256=p2HeYO9Qgl7EERXutwpsQ659NvZhFnkQmTZX5s-x-oI,37444
- sglang/srt/openai_api/protocol.py,sha256=q1MuDUhwSM-8G2uGnWUMeEk87aZxei8lCcaP6VuA8So,8200
+ sglang/srt/openai_api/adapter.py,sha256=Eq44_hGwHcglCKOc6WqWDxBsgyRqtuC6VR4HB4GLfUY,38193
+ sglang/srt/openai_api/protocol.py,sha256=pcRgmDM3Kozh74Aj-qEo8q64BI6hEjrdhYDU4m9srdI,8294
  sglang/test/run_eval.py,sha256=kbM6SiosfXj-1uYTFXPWMd7hZDvJZwV-AmdHi_WfP3A,3559
  sglang/test/runners.py,sha256=APXXbrqmUGUqnX7T1Aq8X2NJQkIqtv6B42a2ybdlPjA,7459
  sglang/test/simple_eval_common.py,sha256=HL1bfgkTAKP7sk-kShg73WTeADhuBD6xSsuLbV_9C3s,12359
@@ -92,9 +94,9 @@ sglang/test/simple_eval_humaneval.py,sha256=k50DKoAbXiw-ubrFXHet9B-7tboHU2dQJf5G
  sglang/test/simple_eval_math.py,sha256=EQblQmtUt-kl558drzhP7c6KhpDNgr1EJhhKx5eeHM4,2519
  sglang/test/simple_eval_mmlu.py,sha256=KqSSdSu2qfoKQ870ttxev1NJ7c90xv2mvKOQsSODtAw,4326
  sglang/test/test_programs.py,sha256=e9_ifoIvuI1Ctkbkz3wfdZLBBSRikby8ywcodBIkf9M,13826
- sglang/test/test_utils.py,sha256=p-G6iiT5-Vkg6LMYgvDheomLJ6IYMLsYHCp3tkatiy8,13983
- sglang-0.2.10.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- sglang-0.2.10.dist-info/METADATA,sha256=Lt9wnP2134unvF88fDj2PfQIf2YaeYJ6xZdfmMAJkoM,33303
- sglang-0.2.10.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
- sglang-0.2.10.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
- sglang-0.2.10.dist-info/RECORD,,
+ sglang/test/test_utils.py,sha256=ITQcY3WGV4kLGWEkfU-AeuFX8yGLmq9LEK5jHiuW7Sw,13991
+ sglang-0.2.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sglang-0.2.11.dist-info/METADATA,sha256=gSQA5-Hf9y41ulOKiMeHRu4Nf-c9Nbt6xhmlCGzvhNY,33783
+ sglang-0.2.11.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+ sglang-0.2.11.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+ sglang-0.2.11.dist-info/RECORD,,